[RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
@ 2008-07-30  0:19 Karen Xie
  2008-07-30 18:15 ` Shyam_Iyer
                   ` (3 more replies)
  0 siblings, 4 replies; 71+ messages in thread
From: Karen Xie @ 2008-07-30  0:19 UTC (permalink / raw)
  To: netdev, open-iscsi
  Cc: jgarzik, davem, michaelc, swise, rdreier, daisyc, wenxiong, bhua,
	divy, dm, leedom

Cxgb3i iSCSI driver

Signed-off-by: Karen Xie <kxie@chelsio.com>
---

 drivers/scsi/cxgb3i/Kconfig          |    6 
 drivers/scsi/cxgb3i/Makefile         |    5 
 drivers/scsi/cxgb3i/cxgb3i.h         |  155 +++
 drivers/scsi/cxgb3i/cxgb3i_init.c    |  109 ++
 drivers/scsi/cxgb3i/cxgb3i_iscsi.c   |  800 ++++++++++++++
 drivers/scsi/cxgb3i/cxgb3i_offload.c | 2001 ++++++++++++++++++++++++++++++++++
 drivers/scsi/cxgb3i/cxgb3i_offload.h |  242 ++++
 drivers/scsi/cxgb3i/cxgb3i_ulp2.c    |  692 ++++++++++++
 drivers/scsi/cxgb3i/cxgb3i_ulp2.h    |  106 ++
 9 files changed, 4116 insertions(+), 0 deletions(-)
 create mode 100644 drivers/scsi/cxgb3i/Kconfig
 create mode 100644 drivers/scsi/cxgb3i/Makefile
 create mode 100644 drivers/scsi/cxgb3i/cxgb3i.h
 create mode 100644 drivers/scsi/cxgb3i/cxgb3i_init.c
 create mode 100644 drivers/scsi/cxgb3i/cxgb3i_iscsi.c
 create mode 100644 drivers/scsi/cxgb3i/cxgb3i_offload.c
 create mode 100644 drivers/scsi/cxgb3i/cxgb3i_offload.h
 create mode 100644 drivers/scsi/cxgb3i/cxgb3i_ulp2.c
 create mode 100644 drivers/scsi/cxgb3i/cxgb3i_ulp2.h


diff --git a/drivers/scsi/cxgb3i/Kconfig b/drivers/scsi/cxgb3i/Kconfig
new file mode 100644
index 0000000..2762814
--- /dev/null
+++ b/drivers/scsi/cxgb3i/Kconfig
@@ -0,0 +1,6 @@
+config SCSI_CXGB3_ISCSI
+	tristate "Chelsio S3xx iSCSI support"
+	select CHELSIO_T3
+	select SCSI_ISCSI_ATTRS
+	---help---
+	This driver supports iSCSI offload for the Chelsio S3 series devices.
diff --git a/drivers/scsi/cxgb3i/Makefile b/drivers/scsi/cxgb3i/Makefile
new file mode 100644
index 0000000..8c8a894
--- /dev/null
+++ b/drivers/scsi/cxgb3i/Makefile
@@ -0,0 +1,5 @@
+EXTRA_CFLAGS += -I$(TOPDIR)/drivers/net/cxgb3
+
+cxgb3i-y := cxgb3i_init.o cxgb3i_iscsi.o cxgb3i_ulp2.o cxgb3i_offload.o
+
+obj-$(CONFIG_SCSI_CXGB3_ISCSI) += cxgb3i.o
diff --git a/drivers/scsi/cxgb3i/cxgb3i.h b/drivers/scsi/cxgb3i/cxgb3i.h
new file mode 100644
index 0000000..3c44c3c
--- /dev/null
+++ b/drivers/scsi/cxgb3i/cxgb3i.h
@@ -0,0 +1,155 @@
+/*
+ * cxgb3i.h: Chelsio S3xx iSCSI driver.
+ *
+ * Copyright (c) 2008 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie@chelsio.com)
+ */
+
+#ifndef __CXGB3I_H__
+#define __CXGB3I_H__
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/scatterlist.h>
+
+/* from cxgb3 LLD */
+#include "common.h"
+#include "t3_cpl.h"
+#include "t3cdev.h"
+#include "cxgb3_ctl_defs.h"
+#include "cxgb3_offload.h"
+#include "firmware_exports.h"
+#include "cxgb3i_offload.h"
+
+#define CXGB3I_SCSI_QDEPTH_DFLT	128
+#define ISCSI_PDU_HEADER_MAX	(56 + 256) /* bhs + digests + ahs */
+
+struct cxgb3i_adapter;
+struct cxgb3i_hba;
+struct cxgb3i_endpoint;
+
+/**
+ * struct cxgb3i_tag_format - cxgb3i ulp tag for steering pdu payload
+ *
+ * @rsvd_bits:	# of bits used by h/w
+ * @rsvd_shift:	shift left
+ * @rsvd_mask:  bit mask
+ *
+ */
+struct cxgb3i_tag_format {
+	unsigned char idx_bits;
+	unsigned char age_bits;
+	unsigned char rsvd_bits;
+	unsigned char rsvd_shift;
+	u32 rsvd_mask;
+};
+
+/**
+ * struct cxgb3i_ddp_info - cxgb3i direct data placement for pdu payload
+ *
+ * @llimit:	lower bound of the page pod memory
+ * @ulimit:	upper bound of the page pod memory
+ * @nppods:	# of page pod entries
+ * @idx_last:	page pod entry last used
+ * @map_lock:	lock to synchonize access to the page pod map
+ * @map:	page pod map
+ */
+struct cxgb3i_ddp_info {
+	unsigned int llimit;
+	unsigned int ulimit;
+	unsigned int nppods;
+	unsigned int idx_last;
+	spinlock_t map_lock;
+	u8 *map;
+};
+
+struct cxgb3i_hba {
+	struct cxgb3i_adapter *snic;
+	struct net_device *ndev;
+	struct Scsi_Host *shost;
+
+	rwlock_t cconn_rwlock;
+	struct list_head cconn_list;
+};
+
+struct cxgb3i_adapter {
+	struct list_head list_head;
+	spinlock_t lock;
+	struct t3cdev *tdev;
+	struct pci_dev *pdev;
+	unsigned char hba_cnt;
+	struct cxgb3i_hba *hba[MAX_NPORTS];
+
+	unsigned int tx_max_size;
+	unsigned int rx_max_size;
+
+	struct cxgb3i_tag_format tag_format;
+	struct cxgb3i_ddp_info ddp;
+};
+
+struct cxgb3i_conn {
+	struct list_head list_head;
+
+	struct cxgb3i_endpoint *cep;
+	struct iscsi_conn *conn;
+	struct cxgb3i_hba *hba;
+};
+
+struct cxgb3i_endpoint {
+	struct s3_conn *c3cn;
+	struct cxgb3i_hba *hba;
+	struct cxgb3i_conn *cconn;
+};
+
+int cxgb3i_iscsi_init(void);
+void cxgb3i_iscsi_cleanup(void);
+
+struct cxgb3i_adapter *cxgb3i_adapter_find_by_tdev(struct t3cdev *);
+struct cxgb3i_adapter *cxgb3i_adapter_add(struct t3cdev *);
+void cxgb3i_adapter_remove(struct cxgb3i_adapter *);
+int cxgb3i_adapter_ulp_init(struct cxgb3i_adapter *);
+void cxgb3i_adapter_ulp_cleanup(struct cxgb3i_adapter *);
+
+struct cxgb3i_hba *cxgb3i_hba_find_by_netdev(struct net_device *);
+struct cxgb3i_hba *cxgb3i_hba_host_add(struct cxgb3i_adapter *,
+				       struct net_device *);
+void cxgb3i_hba_host_remove(struct cxgb3i_hba *);
+
+void cxgb3i_hba_conn_add(struct cxgb3i_conn *, struct cxgb3i_hba *);
+void cxgb3i_hba_conn_remove(struct cxgb3i_conn *);
+
+int cxgb3i_ulp2_init(void);
+void cxgb3i_ulp2_cleanup(void);
+int cxgb3i_conn_ulp_setup(struct cxgb3i_conn *, int, int);
+
+void cxgb3i_ddp_tag_release(struct cxgb3i_adapter *, u32,
+			    struct scatterlist *, unsigned int);
+u32 cxgb3i_ddp_tag_reserve(struct cxgb3i_adapter *, unsigned int,
+			   u32, unsigned int, struct scatterlist *,
+			   unsigned int);
+static inline void cxgb3i_parse_tag(struct cxgb3i_tag_format *format,
+				    u32 tag, u32 *rsvd_bits, u32 *sw_bits)
+{
+	if (rsvd_bits)
+		*rsvd_bits = (tag >> format->rsvd_shift) & format->rsvd_mask;
+	if (sw_bits) {
+		*sw_bits = (tag >> (format->rsvd_shift + format->rsvd_bits))
+		    << format->rsvd_shift;
+		*sw_bits |= tag & ((1 << format->rsvd_shift) - 1);
+	}
+}
+
+int cxgb3i_conn_ulp2_xmit(struct iscsi_conn *);
+
+void cxgb3i_display_byte_string(char *, unsigned char *, int, int);
+
+#endif
diff --git a/drivers/scsi/cxgb3i/cxgb3i_init.c b/drivers/scsi/cxgb3i/cxgb3i_init.c
new file mode 100644
index 0000000..1c91bb0
--- /dev/null
+++ b/drivers/scsi/cxgb3i/cxgb3i_init.c
@@ -0,0 +1,109 @@
+/* cxgb3i_init.c: Chelsio S3xx iSCSI driver.
+ *
+ * Copyright (c) 2008 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie@chelsio.com)
+ */
+
+#include "cxgb3i.h"
+
+#define DRV_MODULE_NAME         "cxgb3i"
+#define DRV_MODULE_VERSION      "1.0.0"
+#define DRV_MODULE_RELDATE      "May 1, 2008"
+
+static char version[] =
+    "Chelsio S3xx iSCSI Driver " DRV_MODULE_NAME
+    " v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+
+MODULE_AUTHOR("Karen Xie <kxie@chelsio.com>");
+MODULE_DESCRIPTION("Chelsio S3xx iSCSI Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_MODULE_VERSION);
+
+static void open_s3_dev(struct t3cdev *);
+static void close_s3_dev(struct t3cdev *);
+cxgb3_cpl_handler_func cxgb3i_cpl_handlers[NUM_CPL_CMDS];
+struct cxgb3_client t3c_client = {
+	.name = "iscsi_cxgb3",
+	.handlers = cxgb3i_cpl_handlers,
+	.add = open_s3_dev,
+	.remove = close_s3_dev,
+};
+
+/**
+ * open_s3_dev - register with cxgb3 LLD
+ * @t3dev	cxgb3 adapter instance
+ */
+static void open_s3_dev(struct t3cdev *t3dev)
+{
+	static int vers_printed;
+
+	if (!vers_printed) {
+		printk(KERN_INFO "%s", version);
+		vers_printed = 1;
+	}
+
+	cxgb3i_log_debug("open cxgb3 %s.\n", t3dev->name);
+
+	cxgb3i_sdev_add(t3dev, &t3c_client);
+	cxgb3i_adapter_add(t3dev);
+}
+
+/**
+ * close_s3_dev - de-register with cxgb3 LLD
+ * @t3dev	cxgb3 adapter instance
+ */
+static void close_s3_dev(struct t3cdev *t3dev)
+{
+	struct cxgb3i_adapter *snic = cxgb3i_adapter_find_by_tdev(t3dev);
+	cxgb3i_log_debug("close cxgb3 %s.\n", t3dev->name);
+	if (snic)
+		cxgb3i_adapter_remove(snic);
+	cxgb3i_sdev_remove(t3dev);
+}
+
+/**
+ * cxgb3i_init_module - module init entry point
+ *
+ * initialize any driver wide global data structures and register itself
+ *	with the cxgb3 module
+ */
+static int __init cxgb3i_init_module(void)
+{
+	int err;
+
+	err = cxgb3i_sdev_init(cxgb3i_cpl_handlers);
+	if (err < 0)
+		return err;
+
+	err = cxgb3i_iscsi_init();
+	if (err < 0)
+		return err;
+
+	err = cxgb3i_ulp2_init();
+	if (err < 0)
+		return err;
+
+	cxgb3_register_client(&t3c_client);
+	return 0;
+}
+
+/**
+ * cxgb3i_exit_module - module cleanup/exit entry point
+ *
+ * go through the driver hba list and for each hba, release any resource held.
+ *	and unregisters iscsi transport and the cxgb3 module
+ */
+static void __exit cxgb3i_exit_module(void)
+{
+	cxgb3_unregister_client(&t3c_client);
+	cxgb3i_ulp2_cleanup();
+	cxgb3i_iscsi_cleanup();
+}
+
+module_init(cxgb3i_init_module);
+module_exit(cxgb3i_exit_module);
diff --git a/drivers/scsi/cxgb3i/cxgb3i_iscsi.c b/drivers/scsi/cxgb3i/cxgb3i_iscsi.c
new file mode 100644
index 0000000..ed3d340
--- /dev/null
+++ b/drivers/scsi/cxgb3i/cxgb3i_iscsi.c
@@ -0,0 +1,800 @@
+/* cxgb3i_iscsi.c: Chelsio S3xx iSCSI driver.
+ *
+ * Copyright (c) 2008 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie@chelsio.com)
+ */
+
+#include <net/tcp.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_eh.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi.h>
+#include <scsi/iscsi_proto.h>
+#include <scsi/libiscsi.h>
+#include <scsi/scsi_transport_iscsi.h>
+#include <linux/crypto.h>
+#include "../iscsi_tcp.h"
+
+#include "cxgb3i.h"
+
+static struct scsi_transport_template *cxgb3i_scsi_transport;
+static struct scsi_host_template cxgb3i_host_template;
+static struct iscsi_transport cxgb3i_iscsi_transport;
+
+static LIST_HEAD(cxgb3i_snic_list);
+static DEFINE_RWLOCK(cxgb3i_snic_rwlock);
+
+/**
+ * cxgb3i_adapter_add - initialize a s3 adapter structure and any h/w settings
+ *	necessary
+ * @snic:	pointer to adapter instance
+ */
+struct cxgb3i_adapter *cxgb3i_adapter_add(struct t3cdev *t3dev)
+{
+	struct cxgb3i_adapter *snic;
+	struct adapter *adapter = tdev2adap(t3dev);
+	int i;
+
+	snic = kzalloc(sizeof(*snic), GFP_KERNEL);
+	if (!snic) {
+		cxgb3i_log_debug("cxgb3 %s, OOM.\n", t3dev->name);
+		return NULL;
+	}
+
+	spin_lock_init(&snic->lock);
+	snic->tdev = t3dev;
+	snic->pdev = adapter->pdev;
+
+	if (cxgb3i_adapter_ulp_init(snic))
+		goto free_snic;
+
+	for_each_port(adapter, i) {
+		snic->hba[i] = cxgb3i_hba_host_add(snic, adapter->port[i]);
+		if (!snic->hba[i])
+			goto ulp_cleanup;
+	}
+	snic->hba_cnt = adapter->params.nports;
+
+	/* add to the list */
+	write_lock(&cxgb3i_snic_rwlock);
+	list_add_tail(&snic->list_head, &cxgb3i_snic_list);
+	write_unlock(&cxgb3i_snic_rwlock);
+
+	return snic;
+
+ulp_cleanup:
+	cxgb3i_adapter_ulp_cleanup(snic);
+free_snic:
+	kfree(snic);
+	return NULL;
+}
+
+/**
+ * cxgb3i_snic_cleanup - release all the resources held and cleanup any h/w
+ *	settings necessary
+ * @snic:	pointer to adapter instance
+ */
+void cxgb3i_adapter_remove(struct cxgb3i_adapter *snic)
+{
+	int i;
+
+	/* remove from the list */
+	write_lock(&cxgb3i_snic_rwlock);
+	list_del(&snic->list_head);
+	write_unlock(&cxgb3i_snic_rwlock);
+
+	for (i = 0; i < snic->hba_cnt; i++) {
+		if (snic->hba[i]) {
+			cxgb3i_hba_host_remove(snic->hba[i]);
+			snic->hba[i] = NULL;
+		}
+	}
+
+	/* release ddp resources */
+	cxgb3i_adapter_ulp_cleanup(snic);
+	kfree(snic);
+}
+
+struct cxgb3i_adapter *cxgb3i_adapter_find_by_tdev(struct t3cdev *t3dev)
+{
+	struct cxgb3i_adapter *snic;
+
+	read_lock(&cxgb3i_snic_rwlock);
+	list_for_each_entry(snic, &cxgb3i_snic_list, list_head) {
+		if (snic->tdev == t3dev) {
+			read_unlock(&cxgb3i_snic_rwlock);
+			return snic;
+		}
+	}
+	read_unlock(&cxgb3i_snic_rwlock);
+
+	return NULL;
+}
+
+struct cxgb3i_hba *cxgb3i_hba_find_by_netdev(struct net_device *ndev)
+{
+	struct cxgb3i_adapter *snic;
+	int i;
+
+	read_lock(&cxgb3i_snic_rwlock);
+	list_for_each_entry(snic, &cxgb3i_snic_list, list_head) {
+		for (i = 0; i < snic->hba_cnt; i++) {
+			if (snic->hba[i]->ndev == ndev) {
+				read_unlock(&cxgb3i_snic_rwlock);
+				return (snic->hba[i]);
+			}
+		}
+	}
+	read_unlock(&cxgb3i_snic_rwlock);
+	return NULL;
+}
+
+void cxgb3i_hba_conn_add(struct cxgb3i_conn *cconn, struct cxgb3i_hba *hba)
+{
+	cconn->hba = hba;
+	write_lock(&hba->cconn_rwlock);
+	list_add_tail(&cconn->list_head, &hba->cconn_list);
+	write_unlock(&hba->cconn_rwlock);
+}
+
+void cxgb3i_hba_conn_remove(struct cxgb3i_conn *cconn)
+{
+	struct cxgb3i_hba *hba = cconn->hba;
+
+	if (hba) {
+		write_lock(&hba->cconn_rwlock);
+		list_del(&cconn->list_head);
+		write_unlock(&hba->cconn_rwlock);
+	}
+}
+
+struct cxgb3i_hba *cxgb3i_hba_host_add(struct cxgb3i_adapter *snic,
+				       struct net_device *ndev)
+{
+	struct cxgb3i_hba *hba;
+	struct Scsi_Host *shost;
+	int err;
+
+	shost = iscsi_host_alloc(&cxgb3i_host_template,
+				 sizeof(struct cxgb3i_hba),
+				 CXGB3I_SCSI_QDEPTH_DFLT);
+	if (!shost) {
+		cxgb3i_log_info("iscsi_host_alloc failed.\n");
+		return NULL;
+	}
+
+	shost->transportt = cxgb3i_scsi_transport;
+	shost->max_lun = 512;
+	shost->max_id = 0;
+	shost->max_channel = 0;
+	shost->max_cmd_len = 16;
+
+	hba = iscsi_host_priv(shost);
+	INIT_LIST_HEAD(&hba->cconn_list);
+	rwlock_init(&hba->cconn_rwlock);
+	hba->snic = snic;
+	hba->ndev = ndev;
+	hba->shost = shost;
+
+	pci_dev_get(snic->pdev);
+	err = iscsi_host_add(shost, &snic->pdev->dev);
+	if (err) {
+		cxgb3i_log_info("iscsi_host_add failed.\n");
+		goto pci_dev_put;
+	}
+
+	cxgb3i_log_debug("shost 0x%p, hba 0x%p, no %u.\n",
+			 shost, hba, shost->host_no);
+
+	return hba;
+
+pci_dev_put:
+	pci_dev_put(snic->pdev);
+	scsi_host_put(shost);
+	return NULL;
+}
+
+void cxgb3i_hba_host_remove(struct cxgb3i_hba *hba)
+{
+	if (hba->shost) {
+		cxgb3i_log_debug("shost 0x%p, hba 0x%p, no %u.\n",
+				 hba->shost, hba, hba->shost->host_no);
+		iscsi_host_remove(hba->shost);
+		pci_dev_put(hba->snic->pdev);
+		/* cleanup connections ? */
+		iscsi_host_free(hba->shost);
+	}
+}
+
+/**
+ * cxgb3i_ep_connect - establish TCP connection to target portal
+ * @dst_addr:		target IP address
+ * @non_blocking:	blocking or non-blocking call
+ *
+ * Initiates a TCP/IP connection to the dst_addr
+ */
+static struct iscsi_endpoint *cxgb3i_ep_connect(struct sockaddr *dst_addr,
+						int non_blocking)
+{
+	struct iscsi_endpoint *ep;
+	struct cxgb3i_endpoint *cep;
+	struct cxgb3i_hba *hba;
+	struct s3_conn *c3cn;
+	int err;
+
+	c3cn = cxgb3i_c3cn_create();
+	if (!c3cn) {
+		cxgb3i_log_info("ep connect OOM.\n");
+		return NULL;
+	}
+
+	err = cxgb3i_c3cn_connect(c3cn, (struct sockaddr_in *)dst_addr);
+	if (err < 0) {
+		cxgb3i_log_info("ep connect failed.\n");
+		goto release_conn;
+	}
+	hba = cxgb3i_hba_find_by_netdev(c3cn->dst_cache->dev);
+	if (!hba) {
+		cxgb3i_log_info("NOT going through cxgbi device.\n");
+		goto release_conn;
+	}
+
+	ep = iscsi_create_endpoint(sizeof(*cep));
+	if (!ep) {
+		cxgb3i_log_info("iscsi alloc ep, OOM.\n");
+		goto release_conn;
+	}
+	cep = ep->dd_data;
+	cep->c3cn = c3cn;
+	cep->hba = hba;
+
+	cxgb3i_log_debug("iscsi_ep 0x%p, cxgb_ep 0x%p, hba 0x%p, c3cn 0x%p.\n",
+			  ep, cep, hba, c3cn);
+	return ep;
+
+release_conn:
+	c3cn_release(c3cn);
+	return NULL;
+}
+
+/**
+ * cxgb3i_ep_poll - polls for TCP connection establishement
+ * @ep:		TCP connection (endpoint) handle
+ * @timeout_ms:	timeout value in milli secs
+ *
+ * polls for TCP connect request to complete
+ */
+static int cxgb3i_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
+{
+	cxgb3i_log_debug("iscsi_ep 0x%p, timeout_ms %d.\n", ep, timeout_ms);
+	return 1;
+}
+
+/**
+ * cxgb3i_ep_disconnect - teardown TCP connection
+ * @ep:		TCP connection (endpoint) handle
+ *
+ * teardown TCP connection
+ */
+static void cxgb3i_ep_disconnect(struct iscsi_endpoint *ep)
+{
+	struct cxgb3i_endpoint *cep = (struct cxgb3i_endpoint *)ep->dd_data;
+	struct cxgb3i_conn *cconn = cep->cconn;
+
+	cxgb3i_log_debug("ep 0x%p, cep 0x%p.\n", ep, cep);
+
+	if (cconn && cconn->conn) {
+		struct iscsi_conn *conn = cconn->conn;
+		struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+		write_lock_bh(&cep->c3cn->callback_lock);
+		cep->c3cn->user_data = NULL;
+		set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx);
+		cconn->cep = NULL;
+		tcp_conn->sock = NULL;
+		write_unlock_bh(&cep->c3cn->callback_lock);
+	}
+
+	c3cn_release(cep->c3cn);
+	iscsi_destroy_endpoint(ep);
+}
+
+/**
+ * cxgb3i_session_create - create a new iscsi session
+ * @cmds_max:		max # of commands
+ * @qdepth:		scsi queue depth
+ * @initial_cmdsn:	initial iscsi CMDSN for this session
+ * @host_no:		pointer to return host no
+ *
+ * Creates a new iSCSI session
+ */
+static struct iscsi_cls_session *cxgb3i_session_create(struct iscsi_endpoint
+						       *ep, uint16_t cmds_max,
+						       uint16_t qdepth,
+						       uint32_t initial_cmdsn,
+						       uint32_t *host_no)
+{
+	struct cxgb3i_endpoint *cep;
+	struct cxgb3i_hba *hba;
+	struct Scsi_Host *shost;
+	struct iscsi_cls_session *cls_session;
+	struct iscsi_session *session;
+	int i;
+
+	if (!ep) {
+		cxgb3i_log_error("%s, missing endpoint.\n", __func__);
+		return NULL;
+	}
+
+	cep = (struct cxgb3i_endpoint *)ep->dd_data;
+	hba = cep->hba;
+	shost = hba->shost;
+	cxgb3i_log_debug("ep 0x%p, cep 0x%p, hba 0x%p.\n", ep, cep, hba);
+	BUG_ON(hba != iscsi_host_priv(shost));
+
+	*host_no = shost->host_no;
+
+	cls_session = iscsi_session_setup(&cxgb3i_iscsi_transport, shost,
+					  cmds_max,
+					  sizeof(struct iscsi_tcp_task),
+					  initial_cmdsn, ISCSI_MAX_TARGET);
+	if (!cls_session)
+		return NULL;
+
+	session = cls_session->dd_data;
+
+	for (i = 0; i < session->cmds_max; i++) {
+		struct iscsi_task *task = session->cmds[i];
+		struct iscsi_tcp_task *tcp_task = task->dd_data;
+
+		task->hdr = &tcp_task->hdr.cmd_hdr;
+		task->hdr_max = sizeof(tcp_task->hdr) - ISCSI_DIGEST_SIZE;
+	}
+
+	if (iscsi_r2tpool_alloc(session))
+		goto remove_session;
+
+	return cls_session;
+
+remove_session:
+	iscsi_session_teardown(cls_session);
+	return NULL;
+}
+
+/**
+ * cxgb3i_session_destroy - destroys iscsi session
+ * @cls_session:	pointer to iscsi cls session
+ *
+ * Destroys an iSCSI session instance and releases its all resources held
+ */
+static void cxgb3i_session_destroy(struct iscsi_cls_session *cls_session)
+{
+	cxgb3i_log_debug("sess 0x%p.\n", cls_session);
+	iscsi_r2tpool_free(cls_session->dd_data);
+	iscsi_session_teardown(cls_session);
+}
+
+/**
+ * cxgb3i_conn_create - create iscsi connection instance
+ * @cls_session:	pointer to iscsi cls session
+ * @cid:		iscsi cid
+ *
+ * Creates a new iSCSI connection instance for a given session
+ */
+static struct iscsi_cls_conn *cxgb3i_conn_create(struct iscsi_cls_session
+						 *cls_session, uint32_t cid)
+{
+	struct iscsi_cls_conn *cls_conn;
+	struct iscsi_conn *conn;
+	struct iscsi_tcp_conn *tcp_conn;
+	struct cxgb3i_conn *cconn;
+
+	cxgb3i_log_debug("sess 0x%p, cid %u.\n", cls_session, cid);
+
+	cls_conn = iscsi_conn_setup(cls_session,
+				    sizeof(*tcp_conn) + sizeof(*cconn), cid);
+	if (!cls_conn)
+		return NULL;
+	conn = cls_conn->dd_data;
+
+	conn->max_xmit_dlength = conn->max_recv_dlength = 16224 - 56 - 256;
+
+	tcp_conn = conn->dd_data;
+	tcp_conn->iscsi_conn = conn;
+
+	cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
+	cconn->conn = conn;
+
+	return cls_conn;
+}
+
+/**
+ * cxgb3i_conn_bind - binds iscsi sess, conn and endpoint together
+ * @cls_session:	pointer to iscsi cls session
+ * @cls_conn:		pointer to iscsi cls conn
+ * @transport_eph:	64-bit EP handle
+ * @is_leading:		leading connection on this session?
+ *
+ * Binds together an iSCSI session, an iSCSI connection and a
+ *	TCP connection. This routine returns error code if the TCP
+ *	connection does not belong on the device iSCSI sess/conn is bound
+ */
+
+static int cxgb3i_conn_bind(struct iscsi_cls_session *cls_session,
+			    struct iscsi_cls_conn *cls_conn,
+			    uint64_t transport_eph, int is_leading)
+{
+	struct iscsi_conn *conn = cls_conn->dd_data;
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
+	struct iscsi_endpoint *ep;
+	struct cxgb3i_endpoint *cep;
+	struct s3_conn *c3cn;
+	int err;
+
+	ep = iscsi_lookup_endpoint(transport_eph);
+	if (!ep)
+		return -EINVAL;
+
+	cxgb3i_log_debug("ep 0x%p, cls sess 0x%p, cls conn 0x%p.\n",
+			 ep, cls_session, cls_conn);
+
+	err = iscsi_conn_bind(cls_session, cls_conn, is_leading);
+	if (err)
+		return -EINVAL;
+
+	cep = (struct cxgb3i_endpoint *)ep->dd_data;
+	c3cn = cep->c3cn;
+
+	read_lock(&c3cn->callback_lock);
+	tcp_conn->sock = (struct socket *)c3cn;
+	c3cn->user_data = conn;
+	read_unlock(&c3cn->callback_lock);
+
+	cconn->hba = cep->hba;
+	cconn->cep = cep;
+	cep->cconn = cconn;
+
+	conn->max_recv_dlength = cconn->hba->snic->rx_max_size - ISCSI_PDU_HEADER_MAX;
+	conn->max_xmit_dlength = cconn->hba->snic->tx_max_size - ISCSI_PDU_HEADER_MAX;
+
+	spin_lock_bh(&conn->session->lock);
+	sprintf(conn->portal_address, NIPQUAD_FMT,
+		NIPQUAD(c3cn->daddr.sin_addr.s_addr));
+	conn->portal_port = ntohs(c3cn->daddr.sin_port);
+	spin_unlock_bh(&conn->session->lock);
+
+	iscsi_tcp_hdr_recv_prep(tcp_conn);
+
+	return 0;
+}
+
+/**
+ * cxgb3i_conn_flush - flush tx
+ * @conn:	pointer to iscsi conn
+ */
+static int cxgb3i_conn_flush(struct iscsi_conn *conn)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct iscsi_segment *segment = &tcp_conn->out.segment;
+
+	if (segment->total_copied < segment->total_size)
+		return cxgb3i_conn_ulp2_xmit(conn);
+	return 0;
+}
+
+/**
+ * cxgb3i_conn_get_param - return iscsi connection parameter to caller
+ * @cls_conn:	pointer to iscsi cls conn
+ * @param:	parameter type identifier
+ * @buf:	buffer pointer
+ *
+ * returns iSCSI connection parameters
+ */
+static int cxgb3i_conn_get_param(struct iscsi_cls_conn *cls_conn,
+				 enum iscsi_param param, char *buf)
+{
+	struct iscsi_conn *conn = cls_conn->dd_data;
+	int len;
+
+	cxgb3i_log_debug("cls_conn 0x%p, param %d.\n", cls_conn, param);
+
+	switch (param) {
+	case ISCSI_PARAM_CONN_PORT:
+		spin_lock_bh(&conn->session->lock);
+		len = sprintf(buf, "%hu\n", conn->portal_port);
+		spin_unlock_bh(&conn->session->lock);
+		break;
+	case ISCSI_PARAM_CONN_ADDRESS:
+		spin_lock_bh(&conn->session->lock);
+		len = sprintf(buf, "%s\n", conn->portal_address);
+		spin_unlock_bh(&conn->session->lock);
+		break;
+	default:
+		return iscsi_conn_get_param(cls_conn, param, buf);
+	}
+
+	return len;
+}
+
+static int cxgb3i_conn_set_param(struct iscsi_cls_conn *cls_conn,
+				 enum iscsi_param param, char *buf, int buflen)
+{
+	struct iscsi_conn *conn = cls_conn->dd_data;
+	struct iscsi_session *session = conn->session;
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
+	int value, err = 0;
+
+	switch (param) {
+	case ISCSI_PARAM_HDRDGST_EN:
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		if (!err && conn->hdrdgst_en)
+			cxgb3i_conn_ulp_setup(cconn, conn->hdrdgst_en,
+					      conn->datadgst_en);
+		break;
+	case ISCSI_PARAM_DATADGST_EN:
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		if (!err && conn->datadgst_en)
+			cxgb3i_conn_ulp_setup(cconn, conn->hdrdgst_en,
+					      conn->datadgst_en);
+		break;
+	case ISCSI_PARAM_MAX_R2T:
+		sscanf(buf, "%d", &value);
+		if (value <= 0 || !is_power_of_2(value))
+			return -EINVAL;
+		if (session->max_r2t == value)
+			break;
+		iscsi_r2tpool_free(session);
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		if (!err && iscsi_r2tpool_alloc(session))
+			return -ENOMEM;
+	case ISCSI_PARAM_MAX_RECV_DLENGTH:
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		cxgb3i_log_debug("MAX_RECV %u.\n", conn->max_recv_dlength);
+		break;
+	case ISCSI_PARAM_MAX_XMIT_DLENGTH:
+		err = iscsi_set_param(cls_conn, param, buf, buflen);
+		cxgb3i_log_debug("MAX_XMIT %u.\n", conn->max_xmit_dlength);
+		break;
+	default:
+		return iscsi_set_param(cls_conn, param, buf, buflen);
+	}
+	return err;
+}
+
+/**
+ * cxgb3i_host_get_param - returns host (adapter) related parameters
+ * @shost:	scsi host pointer
+ * @param:	parameter type identifier
+ * @buf:	buffer pointer
+ */
+static int cxgb3i_host_get_param(struct Scsi_Host *shost,
+				 enum iscsi_host_param param, char *buf)
+{
+	struct cxgb3i_hba *hba = iscsi_host_priv(shost);
+	int i;
+	int len = 0;
+
+	switch (param) {
+	case ISCSI_HOST_PARAM_HWADDRESS:
+		for (i = 0; i < 6; i++)
+			len +=
+			    sprintf(buf + len, "%02x.",
+				    hba->ndev->dev_addr[i]);
+		len--;
+		buf[len] = '\0';
+		break;
+	case ISCSI_HOST_PARAM_NETDEV_NAME:
+		len = sprintf(buf, "%s\n", hba->ndev->name);
+		break;
+	default:
+		return iscsi_host_get_param(shost, param, buf);
+	}
+	return len;
+}
+
+/**
+ * cxgb3i_conn_get_stats - returns iSCSI stats
+ * @cls_conn:	pointer to iscsi cls conn
+ * @stats:	pointer to iscsi statistic struct
+ */
+static void cxgb3i_conn_get_stats(struct iscsi_cls_conn *cls_conn,
+				  struct iscsi_stats *stats)
+{
+	struct iscsi_conn *conn = cls_conn->dd_data;
+
+	stats->txdata_octets = conn->txdata_octets;
+	stats->rxdata_octets = conn->rxdata_octets;
+	stats->scsicmd_pdus = conn->scsicmd_pdus_cnt;
+	stats->dataout_pdus = conn->dataout_pdus_cnt;
+	stats->scsirsp_pdus = conn->scsirsp_pdus_cnt;
+	stats->datain_pdus = conn->datain_pdus_cnt;
+	stats->r2t_pdus = conn->r2t_pdus_cnt;
+	stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt;
+	stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt;
+	stats->digest_err = 0;
+	stats->timeout_err = 0;
+	stats->custom_length = 1;
+	strcpy(stats->custom[0].desc, "eh_abort_cnt");
+	stats->custom[0].value = conn->eh_abort_cnt;
+}
+
+static inline u32 tag_base(struct cxgb3i_tag_format *format,
+			   unsigned int idx, unsigned int age)
+{
+	u32 sw_bits = idx | (age << format->idx_bits);
+	u32 tag = sw_bits >> format->rsvd_shift;
+	tag <<= format->rsvd_bits + format->rsvd_shift;
+	tag |= sw_bits & ((1 << format->rsvd_shift) - 1);
+	return tag;
+}
+
+static void cxgb3i_parse_itt(struct iscsi_conn *conn, itt_t itt,
+			     int *idx, int *age)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
+	struct cxgb3i_adapter *snic = cconn->hba->snic;
+	u32 sw_bits;
+
+	cxgb3i_parse_tag(&snic->tag_format, itt, NULL, &sw_bits);
+	if (idx)
+		*idx = sw_bits & ISCSI_ITT_MASK;
+	if (age)
+		*age = (sw_bits >> snic->tag_format.idx_bits) & ISCSI_AGE_MASK;
+}
+
+static int cxgb3i_reserve_itt(struct iscsi_task *task, itt_t *hdr_itt)
+{
+	struct scsi_cmnd *sc = task->sc;
+	struct iscsi_conn *conn = task->conn;
+	struct iscsi_session *sess = conn->session;
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
+	struct cxgb3i_adapter *snic = cconn->hba->snic;
+	u32 sw_tag = tag_base(&snic->tag_format, task->itt, sess->age);
+	u32 tag = RESERVED_ITT;
+
+	if (sc && (sc->sc_data_direction == DMA_FROM_DEVICE)) {
+		struct s3_conn *c3cn =
+			(struct s3_conn *)(tcp_conn->sock);
+		tag =
+		    cxgb3i_ddp_tag_reserve(snic, c3cn->tid, sw_tag,
+					   scsi_out(sc)->length,
+					   scsi_out(sc)->table.sgl,
+					   scsi_out(sc)->table.nents);
+	}
+	if (tag == RESERVED_ITT)
+		tag = sw_tag | (snic->tag_format.rsvd_mask <<
+				snic->tag_format.rsvd_shift);
+	*hdr_itt = htonl(tag);
+	return 0;
+}
+
+static void cxgb3i_release_itt(struct iscsi_task *task, itt_t hdr_itt)
+{
+	struct scsi_cmnd *sc = task->sc;
+	struct iscsi_conn *conn = task->conn;
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
+	struct cxgb3i_adapter *snic = cconn->hba->snic;
+
+	hdr_itt = ntohl(hdr_itt);
+	if (sc && (sc->sc_data_direction == DMA_FROM_DEVICE))
+		cxgb3i_ddp_tag_release(snic, hdr_itt,
+				       scsi_out(sc)->table.sgl,
+				       scsi_out(sc)->table.nents);
+}
+
+/**
+ * cxgb3i_host_template -- Scsi_Host_Template structure
+ *	used when registering with the scsi mid layer
+ */
+static struct scsi_host_template cxgb3i_host_template = {
+	.module = THIS_MODULE,
+	.name = "Chelsio S3xx iSCSI Initiator",
+	.proc_name = "cxgb3i",
+	.queuecommand = iscsi_queuecommand,
+	.change_queue_depth = iscsi_change_queue_depth,
+	.can_queue = 128 * (ISCSI_DEF_XMIT_CMDS_MAX - 1),
+	.sg_tablesize = SG_ALL,
+	.max_sectors = 0xFFFF,
+	.cmd_per_lun = ISCSI_DEF_CMD_PER_LUN,
+	.eh_abort_handler = iscsi_eh_abort,
+	.eh_device_reset_handler = iscsi_eh_device_reset,
+	.eh_target_reset_handler = iscsi_eh_target_reset,
+	.use_clustering = DISABLE_CLUSTERING,
+	.slave_alloc = iscsi_slave_alloc,
+	.this_id = -1,
+};
+
+static struct iscsi_transport cxgb3i_iscsi_transport = {
+	.owner = THIS_MODULE,
+	.name = "cxgb3i",
+	.caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST
+	    | CAP_DATADGST | CAP_DIGEST_OFFLOAD,
+	.param_mask = ISCSI_MAX_RECV_DLENGTH |
+	    ISCSI_MAX_XMIT_DLENGTH |
+	    ISCSI_HDRDGST_EN |
+	    ISCSI_DATADGST_EN |
+	    ISCSI_INITIAL_R2T_EN |
+	    ISCSI_MAX_R2T |
+	    ISCSI_IMM_DATA_EN |
+	    ISCSI_FIRST_BURST |
+	    ISCSI_MAX_BURST |
+	    ISCSI_PDU_INORDER_EN |
+	    ISCSI_DATASEQ_INORDER_EN |
+	    ISCSI_ERL |
+	    ISCSI_CONN_PORT |
+	    ISCSI_CONN_ADDRESS |
+	    ISCSI_EXP_STATSN |
+	    ISCSI_PERSISTENT_PORT |
+	    ISCSI_PERSISTENT_ADDRESS |
+	    ISCSI_TARGET_NAME | ISCSI_TPGT |
+	    ISCSI_USERNAME | ISCSI_PASSWORD |
+	    ISCSI_USERNAME_IN | ISCSI_PASSWORD_IN |
+	    ISCSI_FAST_ABORT | ISCSI_ABORT_TMO |
+	    ISCSI_LU_RESET_TMO |
+	    ISCSI_PING_TMO | ISCSI_RECV_TMO |
+	    ISCSI_IFACE_NAME | ISCSI_INITIATOR_NAME,
+	.host_param_mask = ISCSI_HOST_HWADDRESS | ISCSI_HOST_IPADDRESS |
+	    ISCSI_HOST_INITIATOR_NAME | ISCSI_HOST_NETDEV_NAME,
+	.get_host_param = cxgb3i_host_get_param,
+	/* session management */
+	.create_session = cxgb3i_session_create,
+	.destroy_session = cxgb3i_session_destroy,
+	.get_session_param = iscsi_session_get_param,
+	/* connection management */
+	.create_conn = cxgb3i_conn_create,
+	.bind_conn = cxgb3i_conn_bind,
+	.destroy_conn = iscsi_conn_teardown,
+	.start_conn = iscsi_conn_start,
+	.stop_conn = iscsi_conn_stop,
+	.flush_conn = cxgb3i_conn_flush,
+	.get_conn_param = cxgb3i_conn_get_param,
+	.set_param = cxgb3i_conn_set_param,
+	.get_stats = cxgb3i_conn_get_stats,
+	/* pdu xmit req. from user space */
+	.send_pdu = iscsi_conn_send_pdu,
+	/* task */
+	.init_task = iscsi_tcp_task_init,
+	.xmit_task = iscsi_tcp_task_xmit,
+	.cleanup_task = iscsi_tcp_cleanup_task,
+	.parse_itt = cxgb3i_parse_itt,
+	.reserve_itt = cxgb3i_reserve_itt,
+	.release_itt = cxgb3i_release_itt,
+	/* TCP connect/disconnect */
+	.ep_connect = cxgb3i_ep_connect,
+	.ep_poll = cxgb3i_ep_poll,
+	.ep_disconnect = cxgb3i_ep_disconnect,
+	/* Error recovery timeout call */
+	.session_recovery_timedout = iscsi_session_recovery_timedout,
+};
+
+int cxgb3i_iscsi_init(void)
+{
+	cxgb3i_scsi_transport =
+	    iscsi_register_transport(&cxgb3i_iscsi_transport);
+	if (!cxgb3i_scsi_transport) {
+		cxgb3i_log_error("Could not register cxgb3i transport.\n");
+		return -ENODEV;
+	}
+	cxgb3i_log_debug("cxgb3i transport 0x%p.\n", cxgb3i_scsi_transport);
+	return 0;
+}
+
+void cxgb3i_iscsi_cleanup(void)
+{
+	if (cxgb3i_scsi_transport) {
+		cxgb3i_log_debug("cxgb3i transport 0x%p.\n",
+				 cxgb3i_scsi_transport);
+		iscsi_unregister_transport(&cxgb3i_iscsi_transport);
+		cxgb3i_scsi_transport = NULL;
+	}
+}
diff --git a/drivers/scsi/cxgb3i/cxgb3i_offload.c b/drivers/scsi/cxgb3i/cxgb3i_offload.c
new file mode 100644
index 0000000..d4d8b85
--- /dev/null
+++ b/drivers/scsi/cxgb3i/cxgb3i_offload.c
@@ -0,0 +1,2001 @@
+/*
+ * Copyright (C) 2003-2008 Chelsio Communications.  All rights reserved.
+ *
+ * Written by Dimitris Michailidis (dm@chelsio.com)
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the LICENSE file included in this
+ * release for licensing terms and conditions.
+ */
+
+#include <linux/if_vlan.h>
+#include <linux/version.h>
+
+#include "cxgb3_defs.h"
+#include "cxgb3_ctl_defs.h"
+#include "firmware_exports.h"
+#include "cxgb3i_offload.h"
+#include "cxgb3i_ulp2.h"
+
+static int rcv_win = 256 * 1024;
+module_param(rcv_win, int, 0644);
+MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256KB)");
+
+static int snd_win = 32 * 1024;
+module_param(snd_win, int, 0644);
+MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)");
+
+static int rx_credit_thres = 10 * 1024;
+module_param(rx_credit_thres, int, 0644);
+MODULE_PARM_DESC(snd_win, "RX credits return threshold in bytes (default=10KB)");
+
+static unsigned int max_connect = 8 * 1024;
+module_param(max_connect, uint, 0644);
+MODULE_PARM_DESC(max_connect, "Max. # of connections (default=8092)");
+
+static unsigned int sport_base = 20000;
+module_param(sport_base, uint, 0644);
+MODULE_PARM_DESC(sport_start, "starting port number (default=20000)");
+
+#ifdef __DEBUG_C3CN_CONN__
+#define c3cn_conn_debug         cxgb3i_log_debug
+#else
+#define c3cn_conn_debug(fmt...)
+#endif
+
+#ifdef __DEBUG_C3CN_TX__
+#define c3cn_tx_debug         cxgb3i_log_debug
+#else
+#define c3cn_tx_debug(fmt...)
+#endif
+
+/* minimal port allocation management scheme */
+spinlock_t sport_map_lock;
+unsigned int sport_map_next = 0;
+unsigned char *sport_map = NULL;
+
+/*
+ * Find a free source port in our allocation map.  We use a very simple rotor
+ * scheme to look for the next free port.
+ *
+ * If a source port has been specified make sure that it doesn't collide with
+ * our normal source port allocation map.  If it's outside the range of our
+ * allocation scheme just let them use it.
+ */
+static int c3cn_get_port(struct s3_conn *c3cn)
+{
+	unsigned int start;
+
+	if (!sport_map)
+		goto error_out;
+
+	if (c3cn->saddr.sin_port != 0) {
+		int sport = ntohs(c3cn->saddr.sin_port) - sport_base;
+		int err = 0;
+
+		if (sport < 0 || sport >= max_connect)
+			return 0;
+		spin_lock(&sport_map_lock);
+		err = __test_and_set_bit(sport, sport_map);
+		spin_unlock(&sport_map_lock);
+		return (err ? -EADDRINUSE : 0);
+	}
+
+	spin_lock(&sport_map_lock);
+	start = sport_map_next;
+	do {
+		unsigned int new = sport_map_next;
+		if (++sport_map_next >= max_connect)
+			sport_map_next = 0;
+		if (!(__test_and_set_bit(new, sport_map))) {
+			spin_unlock(&sport_map_lock);
+			c3cn->saddr.sin_port = htons(sport_base + new);
+			return 0;
+		}
+	} while (sport_map_next != start);
+	spin_unlock(&sport_map_lock);
+
+error_out:
+	return -EADDRNOTAVAIL;
+}
+
+/*
+ * Deallocate a source port from the allocation map.  If the source port is
+ * outside our allocation range just return -- the caller is responsible for
+ * keeping track of their port usage outside of our allocation map.
+ */
+static void c3cn_put_port(struct s3_conn *c3cn)
+{
+	int old = ntohs(c3cn->saddr.sin_port) - sport_base;
+	c3cn->saddr.sin_port = 0;
+
+	if (old < 0 || old >= max_connect)
+		return;
+
+	spin_lock(&sport_map_lock);
+	__clear_bit(old, sport_map);
+	spin_unlock(&sport_map_lock);
+}
+
+static inline unsigned int c3cn_in_state(const struct s3_conn *c3cn,
+                                         unsigned int states)
+{
+	return (states & c3cn->state);
+}
+
+static void c3cn_set_state(struct s3_conn *c3cn, int state)
+{
+	c3cn_conn_debug("c3cn 0x%p state -> 0x%x.\n", c3cn, state);
+	if (state == C3CN_STATE_CLOSE)
+		c3cn_put_port(c3cn);
+	c3cn->state = state;
+}
+
+
+void c3cn_reset_timer(struct s3_conn *c3cn, struct timer_list* timer,
+		      unsigned long expires)
+{
+	if (!mod_timer(timer, expires))
+		c3cn_hold(c3cn);
+}
+
+typedef int (cxgb3_cpl_handler_decl) (struct t3cdev *,
+				      struct sk_buff *, void *);
+
+static cxgb3_cpl_handler_decl do_act_establish;
+static cxgb3_cpl_handler_decl do_act_open_rpl;
+static cxgb3_cpl_handler_decl do_wr_ack;
+static cxgb3_cpl_handler_decl do_peer_close;
+static cxgb3_cpl_handler_decl do_abort_req;
+static cxgb3_cpl_handler_decl do_abort_rpl;
+static cxgb3_cpl_handler_decl do_close_con_rpl;
+static cxgb3_cpl_handler_decl do_iscsi_hdr;
+
+/*
+ * Protocol functions for our connections.
+ */
+static int c3cn_destroy(struct s3_conn *);
+static void process_deferq(struct work_struct *);
+
+static LIST_HEAD(cxgb3_list);
+static DEFINE_MUTEX(cxgb3_list_lock);
+
+/*
+ * For ULP connections HW may inserts digest bytes into the pdu. This array
+ * contains the compensating extra lengths for ULP packets.  It is indexed by
+ * a packet's ULP submode.
+ */
+static const unsigned int cxgb3_ulp_extra_len[] = { 0, 4, 4, 8 };
+
+/*
+ * Return the length of any HW additions that will be made to a Tx packet.
+ * Such additions can happen for some types of ULP packets.
+ */
+static inline unsigned int ulp_extra_len(const struct sk_buff *skb)
+{
+	return cxgb3_ulp_extra_len[skb_ulp_mode(skb) & 3];
+}
+
+/*
+ * Size of WRs in bytes.  Note that we assume all devices we are handling have
+ * the same WR size.
+ */
+static unsigned int wrlen __read_mostly;
+
+/*
+ * The number of WRs needed for an skb depends on the number of page fragments
+ * in the skb and whether it has any payload in its main body.  This maps the
+ * length of the gather list represented by an skb into the # of necessary WRs.
+ */
+static unsigned int skb_wrs[MAX_SKB_FRAGS + 2] __read_mostly;
+
+static void s3_init_wr_tab(unsigned int wr_len)
+{
+	int i;
+
+	if (skb_wrs[1])		/* already initialized */
+		return;
+
+	for (i = 1; i < ARRAY_SIZE(skb_wrs); i++) {
+		int sgl_len = (3 * i) / 2 + (i & 1);
+
+		sgl_len += 3;
+		skb_wrs[i] = (sgl_len <= wr_len
+			      ? 1 : 1 + (sgl_len - 2) / (wr_len - 1));
+	}
+
+	wrlen = wr_len * 8;
+}
+
+/*
+ * Initialization/cleanup cxgb3 API operations.
+ */
+/*
+ * large memory chunk allocation/release
+ */
+void *cxgb3i_alloc_big_mem(unsigned int size)
+{
+	void *p = kmalloc(size, GFP_KERNEL);
+	if (!p)
+		p = vmalloc(size);
+	if (p)
+		memset(p, 0, size);
+	return p;
+}
+
+void cxgb3i_free_big_mem(void *addr)
+{
+	unsigned long p = (unsigned long)addr;
+	if (p >= VMALLOC_START && p < VMALLOC_END)
+		vfree(addr);
+	else
+		kfree(addr);
+}
+
+void cxgb3i_sdev_cleanup(cxgb3_cpl_handler_func *cpl_handlers)
+{
+	memset(cpl_handlers, 0, NUM_CPL_CMDS*(sizeof(*cpl_handlers)));
+	if (sport_map)
+		cxgb3i_free_big_mem(sport_map);
+}
+
+int cxgb3i_sdev_init(cxgb3_cpl_handler_func *cpl_handlers)
+{
+	cpl_handlers[CPL_ACT_ESTABLISH] = do_act_establish;
+	cpl_handlers[CPL_ACT_OPEN_RPL] = do_act_open_rpl;
+	cpl_handlers[CPL_PEER_CLOSE] = do_peer_close;
+	cpl_handlers[CPL_ABORT_REQ_RSS] = do_abort_req;
+	cpl_handlers[CPL_ABORT_RPL_RSS] = do_abort_rpl;
+	cpl_handlers[CPL_CLOSE_CON_RPL] = do_close_con_rpl;
+	cpl_handlers[CPL_TX_DMA_ACK] = do_wr_ack;
+	cpl_handlers[CPL_ISCSI_HDR] = do_iscsi_hdr;
+
+	sport_map = cxgb3i_alloc_big_mem((max_connect + 7)/8);
+	if (!sport_map)
+		return -ENOMEM;
+	return 0;
+}
+
+void cxgb3i_sdev_add(struct t3cdev *cdev, struct cxgb3_client *client)
+{
+	struct cxgb3i_sdev_data *cdata;
+	struct adap_ports *ports;
+	struct ofld_page_info rx_page_info;
+	unsigned int wr_len;
+	int i;
+
+	cdata = kzalloc(sizeof *cdata, GFP_KERNEL);
+	if (!cdata)
+		return;
+	ports = kzalloc(sizeof *ports, GFP_KERNEL);
+	if (!ports)
+		goto free_ports;
+	cdata->ports = ports;
+
+	if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0 ||
+	    cdev->ctl(cdev, GET_PORTS, cdata->ports) < 0 ||
+	    cdev->ctl(cdev, GET_RX_PAGE_INFO, &rx_page_info) < 0)
+		goto free_ports;
+
+	s3_init_wr_tab(wr_len);
+
+	INIT_LIST_HEAD(&cdata->list);
+	cdata->cdev = cdev;
+	cdata->client = client;
+	cdata->rx_page_size = rx_page_info.page_size;
+	skb_queue_head_init(&cdata->deferq);
+	INIT_WORK(&cdata->deferq_task, process_deferq);
+
+	for (i = 0; i < ports->nports; i++)
+		NDEV2CDATA(ports->lldevs[i]) = cdata;
+
+	mutex_lock(&cxgb3_list_lock);
+	list_add_tail(&cdata->list, &cxgb3_list);
+	mutex_unlock(&cxgb3_list_lock);
+
+	return;
+
+free_ports:
+	kfree(ports);
+	kfree(cdata);
+}
+
+void cxgb3i_sdev_remove(struct t3cdev *cdev)
+{
+	struct cxgb3i_sdev_data *cdata = CXGB3_SDEV_DATA(cdev);
+	struct adap_ports *ports = cdata->ports;
+	int i;
+
+	for (i = 0; i < ports->nports; i++)
+		NDEV2CDATA(ports->lldevs[i]) = NULL;
+
+	mutex_lock(&cxgb3_list_lock);
+	list_del(&cdata->list);
+	mutex_unlock(&cxgb3_list_lock);
+
+	kfree(ports);
+	kfree(cdata);
+}
+
+/*
+ * Return TRUE if the specified net device is for a port on one of our
+ * registered adapters.
+ */
+static int is_cxgb3_dev(struct net_device *dev)
+{
+	struct cxgb3i_sdev_data *cdata;
+
+	mutex_lock(&cxgb3_list_lock);
+	list_for_each_entry(cdata, &cxgb3_list, list) {
+		struct adap_ports *ports = cdata->ports;
+		int i;
+
+		for (i = 0; i < ports->nports; i++)
+			if (dev == ports->lldevs[i]) {
+				mutex_unlock(&cxgb3_list_lock);
+				return 1;
+			}
+	}
+	mutex_unlock(&cxgb3_list_lock);
+	return 0;
+}
+
+/*
+ * Primary cxgb3 API operations.
+ * =============================
+ */
+
+static int s3_push_frames(struct s3_conn *, int);
+static int s3_send_reset(struct s3_conn *, int, struct sk_buff *);
+
+struct s3_conn * cxgb3i_c3cn_create(void)
+{
+	struct s3_conn *c3cn;
+
+	c3cn = kzalloc(sizeof(*c3cn), GFP_KERNEL);
+	if (c3cn == NULL)
+		return NULL;
+
+	c3cn->flags = 0;
+	spin_lock_init(&c3cn->lock);
+	atomic_set(&c3cn->refcnt, 1);
+	skb_queue_head_init(&c3cn->receive_queue);
+	skb_queue_head_init(&c3cn->write_queue);
+	setup_timer(&c3cn->retry_timer, NULL, (unsigned long)c3cn);
+	rwlock_init(&c3cn->callback_lock);
+
+	return c3cn;
+}
+
+static void mk_close_req(struct s3_conn *);
+static inline void s3_purge_write_queue(struct s3_conn *);
+
+/*
+ * Release a connection's local port if the connection is bound.
+ */
+static inline void release_port(struct s3_conn *c3cn)
+{
+	c3cn_conn_debug("c3cn 0x%p, port %u.\n", c3cn, c3cn->saddr.sin_port);
+	if (c3cn->saddr.sin_port)
+		c3cn_put_port(c3cn);
+}
+
+static void c3cn_done(struct s3_conn *c3cn)
+{
+	c3cn_conn_debug("c3cn 0x%p.\n", c3cn);
+
+	c3cn_set_state(c3cn, C3CN_STATE_CLOSE);
+	c3cn->shutdown = C3CN_SHUTDOWN_MASK;
+
+	cxgb3i_conn_closing(c3cn);
+}
+
+void c3cn_close(struct s3_conn *c3cn)
+{
+	int data_lost, old_state;
+
+	c3cn_conn_debug("c3cn 0x%p, state 0x%x, flag 0x%lx.\n",
+			 c3cn, c3cn->state, c3cn->flags);
+
+	dst_confirm(c3cn->dst_cache);
+
+	spin_lock_bh(&c3cn->lock);
+	c3cn->shutdown |= C3CN_SHUTDOWN_MASK;
+
+	/*
+	 * We need to flush the receive buffs.  We do this only on the
+	 * descriptor close, not protocol-sourced closes, because the
+	 * reader process may not have drained the data yet!  Make a note
+	 * of whether any received data will be lost so we can decide whether
+	 * to FIN or RST.
+	 */
+	data_lost = skb_queue_len(&c3cn->receive_queue);
+	__skb_queue_purge(&c3cn->receive_queue);
+
+	if (c3cn->state == C3CN_STATE_CLOSE)	/* Nothing if we are already closed */
+		;
+	else if (data_lost || c3cn->state == C3CN_STATE_SYN_SENT) {
+		/* Unread data was tossed, zap the connection. */
+		s3_send_reset(c3cn, CPL_ABORT_SEND_RST, NULL);
+		release_port(c3cn);
+		goto unlock;
+	} else if (c3cn->state == C3CN_STATE_ESTABLISHED) {
+		c3cn_set_state(c3cn, C3CN_STATE_CLOSING);
+		mk_close_req(c3cn);
+	}
+
+unlock:
+	old_state = c3cn->state;
+	c3cn_hold(c3cn); /* must last past the potential destroy() */
+
+	spin_unlock_bh(&c3cn->lock); /* Final release in connection's lifetime. */
+
+	/*
+	 * There are no more user references at this point.  Grab the
+	 * connection lock and finish the close.
+	 */
+	local_bh_disable();
+	spin_lock(&c3cn->lock);
+
+	/*
+	 * Because the connection was orphaned before the spin_lock()
+	 * either the backlog or a BH may have already destroyed it.
+	 * Bail out if so.
+	 */
+	if (old_state != C3CN_STATE_CLOSE && c3cn->state == C3CN_STATE_CLOSE)
+		goto out;
+
+	if (c3cn->state == C3CN_STATE_CLOSE)
+		c3cn_destroy(c3cn);
+
+out:
+	spin_unlock(&c3cn->lock);
+	local_bh_enable();
+	c3cn_put(c3cn);
+}
+
+/*
+ * Destroy connection.  Purge the write queue and drop a reference on the
+ * connection.
+ */
+static int c3cn_destroy(struct s3_conn *c3cn)
+{
+	c3cn_conn_debug("c3cn 0x%p.\n", c3cn);
+
+	s3_purge_write_queue(c3cn);
+	c3cn_put(c3cn);
+	return 0;
+}
+
+/*
+ * Local utility routines used to implement primary cxgb3 API operations.
+ * ======================================================================
+ */
+
+static int s3_connect(struct s3_conn *);
+static u32 s3_send_rx_credits(struct s3_conn *, u32, u32, int);
+static void mk_act_open_req(struct s3_conn *, struct sk_buff *,
+			    unsigned int, const struct l2t_entry *);
+static void skb_entail(struct s3_conn *, struct sk_buff *, int);
+
+static inline void reset_wr_list(struct s3_conn *c3cn)
+{
+	c3cn->wr_pending_head = NULL;
+}
+
+/*
+ * Add a WR to a connections's list of pending WRs.  This is a singly-linked
+ * list of sk_buffs operating as a FIFO.  The head is kept in wr_pending_head
+ * and the tail in wr_pending_tail.
+ */
+static inline void enqueue_wr(struct s3_conn *c3cn,
+			      struct sk_buff *skb)
+{
+	skb->sp = NULL;
+
+	/*
+	 * We want to take an extra reference since both us and the driver
+	 * need to free the packet before it's really freed.  We know there's
+	 * just one user currently so we use atomic_set rather than skb_get
+	 * to avoid the atomic op.
+	 */
+	atomic_set(&skb->users, 2);
+
+	if (!c3cn->wr_pending_head)
+		c3cn->wr_pending_head = skb;
+	else
+		c3cn->wr_pending_tail->sp = (void *)skb;
+	c3cn->wr_pending_tail = skb;
+}
+
+/*
+ * The next two functions calculate the option 0 value for a connection.
+ */
+static inline int compute_wscale(int win)
+{
+	int wscale = 0; 
+	while (wscale < 14 && (65535<<wscale) < win)
+		wscale++;
+	return wscale;
+}
+
+static inline unsigned int calc_opt0h(struct s3_conn *c3cn)
+{
+	int wscale = compute_wscale(rcv_win);
+	return (V_KEEP_ALIVE(1) |
+		F_TCAM_BYPASS |
+		V_WND_SCALE(wscale) |
+		V_MSS_IDX(c3cn->mss_idx));
+}
+
+static inline unsigned int calc_opt0l(struct s3_conn *c3cn)
+{
+	return (V_ULP_MODE(ULP_MODE_ISCSI) |
+		V_RCV_BUFSIZ(rcv_win>>10));	
+}
+
+static inline void make_tx_data_wr(struct s3_conn *c3cn,
+				   struct sk_buff *skb, int len)
+{
+	struct tx_data_wr *req;
+
+	skb_reset_transport_header(skb);
+	req = (struct tx_data_wr *)__skb_push(skb, sizeof(*req));
+	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
+	req->wr_lo = htonl(V_WR_TID(c3cn->tid));
+	req->sndseq = htonl(c3cn->snd_nxt);
+	/* len includes the length of any HW ULP additions */
+	req->len = htonl(len);
+	req->param = htonl(V_TX_PORT(c3cn->l2t->smt_idx));
+	/* V_TX_ULP_SUBMODE sets both the mode and submode */
+	req->flags = htonl(V_TX_ULP_SUBMODE(skb_ulp_mode(skb)) |
+			   V_TX_SHOVE((skb_peek(&c3cn->write_queue) ? 0 : 1)));
+
+	if (!c3cn_flag(c3cn, C3CN_TX_DATA_SENT)) {
+
+		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
+				    V_TX_CPU_IDX(c3cn->qset));
+
+		/* Sendbuffer is in units of 32KB.
+		 */
+		req->param |= htonl(V_TX_SNDBUF(snd_win >> 15));
+		c3cn_set_flag(c3cn, C3CN_TX_DATA_SENT);
+	}
+}
+
+static struct rtable *find_route(__be32 saddr, __be32 daddr,
+				 __be16 sport, __be16 dport)
+{
+	struct rtable *rt;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			 .ip4_u = {
+				   .daddr = daddr,
+				   .saddr = saddr,
+				   .tos = 0 } },
+		.proto = IPPROTO_TCP,
+		.uli_u = {
+			  .ports = {
+				    .sport = sport,
+				    .dport = dport } } };
+
+	if (ip_route_output_flow(&init_net, &rt, &fl, NULL, 0))
+		return NULL;
+	return rt;
+}
+
+int cxgb3i_c3cn_connect(struct s3_conn *c3cn, struct sockaddr_in *usin)
+{
+	struct rtable *rt;
+	int err;
+
+	if (usin->sin_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	/* get a source port if one hasn't been provided */
+	err = c3cn_get_port(c3cn);
+	if (err)
+		return err;
+	c3cn_conn_debug("c3cn 0x%p get port %u.\n", c3cn, ntohs(c3cn->saddr.sin_port));
+
+	c3cn->daddr.sin_port = usin->sin_port;
+	c3cn->daddr.sin_addr.s_addr = usin->sin_addr.s_addr;
+
+	rt = find_route(c3cn->saddr.sin_addr.s_addr,
+			c3cn->daddr.sin_addr.s_addr,
+			c3cn->saddr.sin_port,
+			c3cn->daddr.sin_port);
+	if (rt == NULL) {
+		c3cn_conn_debug("NO route to 0x%x, port %u.\n", 
+				c3cn->daddr.sin_addr.s_addr,
+				ntohs(c3cn->daddr.sin_port));
+		return -ENETUNREACH;
+	}
+
+	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+		c3cn_conn_debug("multi-cast route to 0x%x, port %u.\n", 
+				c3cn->daddr.sin_addr.s_addr,
+				ntohs(c3cn->daddr.sin_port));
+		ip_rt_put(rt);
+		return -ENETUNREACH;
+	}
+
+	if (!c3cn->saddr.sin_addr.s_addr)
+		c3cn->saddr.sin_addr.s_addr = rt->rt_src;
+
+	c3cn_conn_debug("c3cn 0x%p -> SYN_SENT.\n", c3cn);
+	c3cn_set_state(c3cn, C3CN_STATE_SYN_SENT);
+
+	/* now commit destination to connection */
+	c3cn->dst_cache = &rt->u.dst;
+
+	if (s3_connect(c3cn))
+		return 0;
+	/*
+	 * If we get here, we don't have an offload connection so simply
+	 * return a failure.
+	 */
+	err = -ENOTSUPP;
+
+	/*
+	 * This trashes the connection and releases the local port,
+	 * if necessary.
+	 */
+	c3cn_conn_debug("c3cn 0x%p -> CLOSE.\n", c3cn);
+	c3cn_set_state(c3cn, C3CN_STATE_CLOSE);
+	ip_rt_put(rt);
+	c3cn_put_port(c3cn);
+	c3cn->daddr.sin_port = 0;
+	return err;
+}
+
+/*
+ * Set of states for which we should return RX credits.
+ */
+#define CREDIT_RETURN_STATE (C3CN_STATE_ESTABLISHED)
+
+/*
+ * Called after some received data has been read.  It returns RX credits
+ * to the HW for the amount of data processed.
+ */
+void cxgb3i_c3cn_rx_credits(struct s3_conn *c3cn, int copied)
+{
+	struct t3cdev *cdev;
+	int must_send;
+	u32 credits, dack = 0;
+
+	if (!c3cn_in_state(c3cn, CREDIT_RETURN_STATE))
+		return;
+
+	credits = c3cn->copied_seq - c3cn->rcv_wup;
+	if (unlikely(!credits))
+		return;
+
+	cdev = c3cn->cdev;
+
+	if (unlikely(rx_credit_thres == 0))
+		return;
+
+	dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
+
+	/*
+	 * For coalescing to work effectively ensure the receive window has
+	 * at least 16KB left.
+	 */
+	must_send = credits + 16384 >= rcv_win;
+
+	if (must_send || credits >= rx_credit_thres)
+		c3cn->rcv_wup += s3_send_rx_credits(c3cn, credits, dack, must_send);
+}
+
+/*
+ * Generic ARP failure handler that discards the buffer.
+ */
+static void arp_failure_discard(struct t3cdev *cdev, struct sk_buff *skb)
+{
+	kfree_skb(skb);
+}
+
+/*
+ * Prepends TX_DATA_WR or CPL_CLOSE_CON_REQ headers to buffers waiting in a
+ * connection's send queue and sends them on to T3.  Must be called with the
+ * connection's lock held.  Returns the amount of send buffer space that was
+ * freed as a result of sending queued data to T3.
+ */
+static int s3_push_frames(struct s3_conn *c3cn, int req_completion)
+{
+	int total_size = 0;
+	struct sk_buff *skb;
+	struct t3cdev *cdev;
+	struct cxgb3i_sdev_data *cdata;
+
+	if (unlikely(c3cn_in_state(c3cn, C3CN_STATE_SYN_SENT | C3CN_STATE_CLOSE)))
+		return 0;
+
+	/*
+	 * We shouldn't really be called at all after an abort but check just
+	 * in case.
+	 */
+	if (unlikely(c3cn_flag(c3cn, C3CN_ABORT_SHUTDOWN)))
+		return 0;
+
+	cdev = c3cn->cdev;
+	cdata = CXGB3_SDEV_DATA(cdev);
+
+	while (c3cn->wr_avail
+	       && (skb = skb_peek(&c3cn->write_queue)) != NULL
+	       && !c3cn_flag(c3cn, C3CN_TX_WAIT_IDLE)) {
+
+		int len = skb->len;	/* length before skb_push */
+		int frags = skb_shinfo(skb)->nr_frags + (len != skb->data_len);
+		int wrs_needed = skb_wrs[frags];
+
+		if (wrs_needed > 1 && len + sizeof(struct tx_data_wr) <= wrlen)
+			wrs_needed = 1;
+
+		WARN_ON(frags >= ARRAY_SIZE(skb_wrs) || wrs_needed < 1);
+		if (c3cn->wr_avail < wrs_needed)
+			break;
+
+		__skb_unlink(skb, &c3cn->write_queue);
+		skb->priority = CPL_PRIORITY_DATA;
+		skb->csum = wrs_needed;	/* remember this until the WR_ACK */
+		c3cn->wr_avail -= wrs_needed;
+		c3cn->wr_unacked += wrs_needed;
+		enqueue_wr(c3cn, skb);
+
+		if (likely(CXGB3_SKB_CB(skb)->flags & C3CB_FLAG_NEED_HDR)) {
+			len += ulp_extra_len(skb);
+			make_tx_data_wr(c3cn, skb, len);
+			c3cn->snd_nxt += len;
+			if ((req_completion
+			     && c3cn->wr_unacked == wrs_needed)
+			    || (CXGB3_SKB_CB(skb)->flags & C3CB_FLAG_COMPL)
+			    || c3cn->wr_unacked >= c3cn->wr_max / 2) {
+				struct work_request_hdr *wr = cplhdr(skb);
+
+				wr->wr_hi |= htonl(F_WR_COMPL);
+				c3cn->wr_unacked = 0;
+			}
+			CXGB3_SKB_CB(skb)->flags &= ~C3CB_FLAG_NEED_HDR;
+		} else if (skb->data[0] == FW_WROPCODE_OFLD_CLOSE_CON)
+			c3cn_set_flag(c3cn, C3CN_CLOSE_CON_REQUESTED);
+
+		total_size += skb->truesize;
+		set_arp_failure_handler(skb, arp_failure_discard);
+		l2t_send(cdev, skb, c3cn->l2t);
+	}
+	return total_size;
+}
+
+/*
+ * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
+ * and send it along.
+ */
+static void abort_arp_failure(struct t3cdev *cdev, struct sk_buff *skb)
+{
+	struct cpl_abort_req *req = cplhdr(skb);
+
+	req->cmd = CPL_ABORT_NO_RST;
+	cxgb3_ofld_send(cdev, skb);
+}
+
+/*
+ * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
+ * not send multiple ABORT_REQs for the same connection and also that we do
+ * not try to send a message after the connection has closed.  Returns 1 if
+ * an ABORT_REQ wasn't generated after all, 0 otherwise.
+ */
+static int s3_send_reset(struct s3_conn *c3cn, int mode,
+			 struct sk_buff *skb)
+{
+	struct cpl_abort_req *req;
+	unsigned int tid = c3cn->tid;
+
+	if (unlikely(c3cn_flag(c3cn, C3CN_ABORT_SHUTDOWN) || !c3cn->cdev)) {
+		if (skb)
+			__kfree_skb(skb);
+		return 1;
+	}
+
+	c3cn_conn_debug("c3cn 0x%p, mode %d.\n", c3cn, mode);
+
+	c3cn_set_flag(c3cn, C3CN_ABORT_RPL_PENDING);
+	c3cn_set_flag(c3cn, C3CN_ABORT_SHUTDOWN);
+
+	/* Purge the send queue so we don't send anything after an abort. */
+	s3_purge_write_queue(c3cn);
+
+	if (!skb)
+		skb = alloc_skb(sizeof(*req), GFP_KERNEL | __GFP_NOFAIL);
+	skb->priority = CPL_PRIORITY_DATA;
+	set_arp_failure_handler(skb, abort_arp_failure);
+
+	req = (struct cpl_abort_req *)skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
+	req->wr.wr_lo = htonl(V_WR_TID(tid));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
+	req->rsvd0 = htonl(c3cn->snd_nxt);
+	req->rsvd1 = !c3cn_flag(c3cn, C3CN_TX_DATA_SENT);
+	req->cmd = mode;
+
+	l2t_send(c3cn->cdev, skb, c3cn->l2t);
+	return 0;
+}
+
+/*
+ * Add a list of skbs to a connection send queue.  This interface is intended
+ * for use by in-kernel ULPs.  The skbs must comply with the max size limit of
+ * the device and have a headroom of at least TX_HEADER_LEN bytes.
+ */
+int cxgb3i_c3cn_send_pdus(struct s3_conn *c3cn, struct sk_buff *skb, int flags)
+{
+	struct sk_buff *next;
+	int err, copied = 0;
+
+	spin_lock_bh(&c3cn->lock);
+
+	if (!c3cn_in_state(c3cn, C3CN_STATE_ESTABLISHED)) {
+		err = -EAGAIN;
+		goto out_err;
+	}
+
+	err = -EPIPE;
+	if (c3cn->err || (c3cn->shutdown & C3CN_SEND_SHUTDOWN))
+		goto out_err;
+
+	while (skb) {
+		if (unlikely(skb_headroom(skb) < TX_HEADER_LEN)) {
+			c3cn_tx_debug("c3cn 0x%p, skb head.\n", c3cn);
+			err = -EINVAL;
+			goto out_err;
+		}
+
+		next = skb->next;
+		skb->next = NULL;
+		skb_entail(c3cn, skb, C3CB_FLAG_NO_APPEND | C3CB_FLAG_NEED_HDR);
+		copied += skb->len;
+		c3cn->write_seq += skb->len + ulp_extra_len(skb);
+		skb = next;
+	}
+done:
+	if (likely(skb_queue_len(&c3cn->write_queue)))
+		s3_push_frames(c3cn, 1);
+	spin_unlock_bh(&c3cn->lock);
+	return copied;
+
+out_err:
+	if (copied == 0 && err == -EPIPE)
+		copied = c3cn->err ? c3cn->err : -EPIPE;
+	goto done;
+}
+
+/*
+ * Low-level utility routines for primary API functions.
+ * =====================================================
+ */
+/* routines to implement CPL message processing */
+static void c3cn_act_establish(struct s3_conn *, struct sk_buff *);
+static void active_open_failed(struct s3_conn *, struct sk_buff *);
+static void wr_ack(struct s3_conn *, struct sk_buff *);
+static void do_peer_fin(struct s3_conn *, struct sk_buff *);
+static void process_abort_req(struct s3_conn *, struct sk_buff *);
+static void process_abort_rpl(struct s3_conn *, struct sk_buff *);
+static void process_close_con_rpl(struct s3_conn *, struct sk_buff *);
+static void process_rx_iscsi_hdr(struct s3_conn *, struct sk_buff *);
+
+static struct sk_buff *__get_cpl_reply_skb(struct sk_buff *, size_t, gfp_t);
+
+static int act_open(struct s3_conn *, struct net_device *);
+static void fail_act_open(struct s3_conn *, int);
+static void init_offload_conn(struct s3_conn *, struct t3cdev *,
+			      struct dst_entry *);
+
+/*
+ * Insert a connection into the TID table and take an extra reference.
+ */
+static inline void c3cn_insert_tid(struct cxgb3i_sdev_data *cdata,
+				   struct s3_conn *c3cn,
+				   unsigned int tid)
+{
+	c3cn_hold(c3cn);
+	cxgb3_insert_tid(cdata->cdev, cdata->client, c3cn, tid);
+}
+
+static inline void free_atid(struct t3cdev *cdev, unsigned int tid)
+{
+	struct s3_conn *c3cn = cxgb3_free_atid(cdev, tid);
+	if (c3cn)
+		c3cn_put(c3cn);
+}
+
+/*
+ * This function is intended for allocations of small control messages.
+ * Such messages go as immediate data and usually the pakets are freed
+ * immediately.  We maintain a cache of one small sk_buff and use it whenever
+ * it is available (has a user count of 1).  Otherwise we get a fresh buffer.
+ */
+#define CTRL_SKB_LEN 120
+
+static struct sk_buff *alloc_ctrl_skb(const struct s3_conn *c3cn,
+				      int len)
+{
+	struct sk_buff *skb = c3cn->ctrl_skb_cache;
+
+	if (likely(skb && !skb_shared(skb) && !skb_cloned(skb))) {
+		__skb_trim(skb, 0);
+		atomic_set(&skb->users, 2);
+	} else if (likely(!in_atomic()))
+		skb = alloc_skb(len, GFP_ATOMIC | __GFP_NOFAIL);
+	else
+		skb = alloc_skb(len, GFP_ATOMIC);
+	return skb;
+}
+
+/**
+ * cxgb3_egress_dev - return the cxgb3 egress device or NULL if the egress
+ *     device isn't one of our ports.
+ *
+ * @root_dev: the root device anchoring the search
+ * @c3cn: the connection used to determine egress port in bonding mode
+ * @context: in bonding mode, indicates a connection set up or failover
+ *
+ * Given a root network device it returns the physical egress device that is a
+ * descendant of the root device.  The root device may be either a physical
+ * device, in which case it is the device returned, or a virtual device, such
+ * as a VLAN or bonding device.  In case of a bonding device the search
+ * considers the decisions of the bonding device given its mode to locate the
+ * correct egress device.
+ */
+static struct net_device *cxgb3_egress_dev(struct net_device *root_dev,
+					   struct s3_conn *c3cn,
+					   int context)
+{
+	while (root_dev) {
+		if (root_dev->priv_flags & IFF_802_1Q_VLAN)
+			root_dev = vlan_dev_info(root_dev)->real_dev;
+		else if (is_cxgb3_dev(root_dev))
+			return root_dev;
+		else
+			return NULL;
+	}
+	return NULL;
+}
+
+/*
+ * Return TRUE if we're able to establish an offload connection; otherwise
+ * return FALSE.
+ */
+static int s3_connect(struct s3_conn *c3cn)
+{
+	struct net_device *dev = cxgb3_egress_dev(c3cn->dst_cache->dev,
+						  c3cn, 0);
+	if (dev == NULL) {
+		c3cn_conn_debug("c3cn 0x%p, egress dev NULL.\n", c3cn);
+		return 0;
+	}
+	return act_open(c3cn, dev) == 0;
+}
+
+/*
+ * Handle an ARP failure for an active open.
+ */
+static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
+{
+	struct s3_conn *c3cn = (struct s3_conn *)skb->sk;
+
+	c3cn_hold(c3cn);
+	spin_lock(&c3cn->lock);
+	if (c3cn->state == C3CN_STATE_SYN_SENT) {
+		fail_act_open(c3cn, EHOSTUNREACH);
+		__kfree_skb(skb);
+	}
+	spin_unlock(&c3cn->lock);
+	c3cn_put(c3cn);
+}
+
+/*
+ * Send an active open request.
+ */
+static int act_open(struct s3_conn *c3cn, struct net_device *dev)
+{
+	struct cxgb3i_sdev_data *cdata = NDEV2CDATA(dev);
+	struct t3cdev *cdev = cdata->cdev;
+	struct dst_entry *dst = c3cn->dst_cache;
+	struct sk_buff *skb;
+
+	c3cn_conn_debug("c3cn 0x%p.\n", c3cn);
+	/*
+	 * Initialize connection data.  Note that the flags and ULP mode are
+	 * initialized higher up ...
+	 */
+	c3cn->dev = dev;
+	c3cn->cdev = cdev;
+	c3cn->tid = cxgb3_alloc_atid(cdev, cdata->client, c3cn);
+	if (c3cn->tid < 0)
+		goto out_err;
+	c3cn->qset = 0;
+	c3cn->l2t = t3_l2t_get(cdev, dst->neighbour, dev);
+	if (!c3cn->l2t)
+		goto free_tid;
+
+	skb = alloc_skb(sizeof(struct cpl_act_open_req),
+			GFP_KERNEL | __GFP_NOFAIL);
+	skb->sk = (struct sock *)c3cn;
+	set_arp_failure_handler(skb, act_open_req_arp_failure);
+
+	c3cn_hold(c3cn);
+
+	init_offload_conn(c3cn, cdev, dst);
+	c3cn->err = 0;
+	c3cn_reset_flag(c3cn, C3CN_DONE);
+
+	mk_act_open_req(c3cn, skb, c3cn->tid, c3cn->l2t);
+	l2t_send(cdev, skb, c3cn->l2t);
+	return 0;
+
+free_tid:
+	free_atid(cdev, c3cn->tid);
+	c3cn->tid = 0;
+out_err:
+	return -1;
+}
+
+/*
+ * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
+ * under any circumstances.  We take the easy way out and always queue the
+ * message to the write_queue.  We can optimize the case where the queue is
+ * already empty though the optimization is probably not worth it.
+ */
+static void mk_close_req(struct s3_conn *c3cn)
+{
+	struct sk_buff *skb;
+	struct cpl_close_con_req *req;
+	unsigned int tid = c3cn->tid;
+
+	c3cn_conn_debug("c3cn 0x%p.\n", c3cn);
+
+	skb = alloc_skb(sizeof(struct cpl_close_con_req),
+			GFP_KERNEL | __GFP_NOFAIL);
+	req = (struct cpl_close_con_req *)__skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
+	req->wr.wr_lo = htonl(V_WR_TID(tid));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
+	req->rsvd = htonl(c3cn->write_seq);
+
+	skb_entail(c3cn, skb, C3CB_FLAG_NO_APPEND);
+	if (c3cn->state != C3CN_STATE_SYN_SENT)
+		s3_push_frames(c3cn, 1);
+}
+
+static void skb_entail(struct s3_conn *c3cn, struct sk_buff *skb,
+		       int flags)
+{
+	CXGB3_SKB_CB(skb)->seq = c3cn->write_seq;
+	CXGB3_SKB_CB(skb)->flags = flags;
+	__skb_queue_tail(&c3cn->write_queue, skb);
+}
+
+/*
+ * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
+ * permitted to return without sending the message in case we cannot allocate
+ * an sk_buff.  Returns the number of credits sent.
+ */
+static u32 s3_send_rx_credits(struct s3_conn *c3cn, u32 credits, u32 dack,
+			      int nofail)
+{
+	struct sk_buff *skb;
+	struct cpl_rx_data_ack *req;
+
+	skb = (nofail ? alloc_ctrl_skb(c3cn, sizeof(*req))
+	       : alloc_skb(sizeof(*req), GFP_ATOMIC));
+	if (!skb)
+		return 0;
+
+	req = (struct cpl_rx_data_ack *)__skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, c3cn->tid));
+	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
+	skb->priority = CPL_PRIORITY_ACK;
+	cxgb3_ofld_send(c3cn->cdev, skb);
+	return credits;
+}
+
+static void mk_act_open_req(struct s3_conn *c3cn, struct sk_buff *skb,
+			    unsigned int atid, const struct l2t_entry *e)
+{
+	struct cpl_act_open_req *req;
+
+	c3cn_conn_debug("c3cn 0x%p, atid 0x%x.\n", c3cn, atid);
+
+	skb->priority = CPL_PRIORITY_SETUP;
+	req = (struct cpl_act_open_req *)__skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
+	req->local_port = c3cn->saddr.sin_port;
+	req->peer_port = c3cn->daddr.sin_port;
+	req->local_ip = c3cn->saddr.sin_addr.s_addr;
+	req->peer_ip = c3cn->daddr.sin_addr.s_addr;
+	req->opt0h = htonl(calc_opt0h(c3cn) | V_L2T_IDX(e->idx) |
+			   V_TX_CHANNEL(e->smt_idx));
+	req->opt0l = htonl(calc_opt0l(c3cn));
+	req->params = 0;
+}
+
+static inline void s3_purge_write_queue(struct s3_conn *c3cn)
+{
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue(&c3cn->write_queue))) {
+		__kfree_skb(skb);
+	}
+}
+
+/*
+ * Definitions and declarations for CPL handler functions.
+ * =======================================================
+ */
+
+/*
+ * Similar to process_cpl_msg() but takes an extra connection reference around
+ * the call to the handler.  Should be used if the handler may drop a
+ * connection reference.
+ */
+static inline void process_cpl_msg_ref(void (*fn) (struct s3_conn *,
+						   struct sk_buff *),
+				       struct s3_conn *c3cn,
+				       struct sk_buff *skb)
+{
+	c3cn_hold(c3cn);
+	process_cpl_msg(fn, c3cn, skb);
+	c3cn_put(c3cn);
+}
+
+/*
+ * Return whether a failed active open has allocated a TID
+ */
+static inline int act_open_has_tid(int status)
+{
+	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
+	    status != CPL_ERR_ARP_MISS;
+}
+
+/*
+ * Returns true if a connection cannot accept new Rx data.
+ */
+static inline int c3cn_no_receive(const struct s3_conn *c3cn)
+{
+	return (c3cn->shutdown & C3CN_RCV_SHUTDOWN);
+}
+
+/*
+ * A helper function that aborts a connection and increments the given MIB
+ * counter.  The supplied skb is used to generate the ABORT_REQ message if
+ * possible.  Must be called with softirqs disabled.
+ */
+static inline void abort_conn(struct s3_conn *c3cn,
+			      struct sk_buff *skb)
+{
+	struct sk_buff *abort_skb;
+
+	abort_skb = __get_cpl_reply_skb(skb, sizeof(struct cpl_abort_req),
+					GFP_ATOMIC);
+	if (abort_skb)
+		s3_send_reset(c3cn, CPL_ABORT_SEND_RST, abort_skb);
+}
+
+/*
+ * Returns whether an ABORT_REQ_RSS message is a negative advice.
+ */
+static inline int is_neg_adv_abort(unsigned int status)
+{
+	return (status == CPL_ERR_RTX_NEG_ADVICE
+		|| status == CPL_ERR_PERSIST_NEG_ADVICE);
+}
+
+/*
+ * CPL handler functions.
+ * ======================
+ */
+
+/*
+ * Process a CPL_ACT_ESTABLISH message.
+ */
+static int do_act_establish(struct t3cdev *cdev, struct sk_buff *skb,
+			    void *ctx)
+{
+	struct cpl_act_establish *req = cplhdr(skb);
+	unsigned int tid = GET_TID(req);
+	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
+	struct s3_conn *c3cn = (struct s3_conn *)ctx;
+	struct cxgb3i_sdev_data *cdata = CXGB3_SDEV_DATA(cdev);
+
+	c3cn_conn_debug("c3cn 0x%p, tid 0x%x.\n", c3cn, tid);
+	/*
+	 * It's OK if the TID is currently in use, the owning connection may
+	 * have backlogged its last CPL message(s).  Just take it away.
+	 */
+	c3cn->tid = tid;
+	c3cn_insert_tid(cdata, c3cn, tid);
+	free_atid(cdev, atid);
+
+	c3cn->qset = G_QNUM(ntohl(skb->csum));
+
+	process_cpl_msg(c3cn_act_establish, c3cn, skb);
+	return 0;
+}
+
+/*
+ * Process an ACT_OPEN_RPL CPL message.
+ */
+static int do_act_open_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
+{
+	struct s3_conn *c3cn = (struct s3_conn *)ctx;
+	struct cpl_act_open_rpl *rpl = cplhdr(skb);
+
+	c3cn_conn_debug("c3cn 0x%p, status 0x%x.\n", c3cn, rpl->status);
+
+	if (act_open_has_tid(rpl->status))
+		cxgb3_queue_tid_release(cdev, GET_TID(rpl));
+
+	process_cpl_msg_ref(active_open_failed, c3cn, skb);
+	return 0;
+}
+
+/*
+ * Handler RX_ISCSI_HDR CPL messages.
+ */
+static int do_iscsi_hdr(struct t3cdev *t3dev, struct sk_buff *skb, void *ctx)
+{
+	struct s3_conn *c3cn = (struct s3_conn *)ctx;
+	process_cpl_msg(process_rx_iscsi_hdr, c3cn, skb);
+	return 0;
+}
+
+/*
+ * Handler for TX_DATA_ACK CPL messages.
+ */
+static int do_wr_ack(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
+{
+	struct s3_conn *c3cn = (struct s3_conn *)ctx;
+
+	process_cpl_msg(wr_ack, c3cn, skb);
+	return 0;
+}
+
+/*
+ * Handler for PEER_CLOSE CPL messages.
+ */
+static int do_peer_close(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
+{
+	struct s3_conn *c3cn = (struct s3_conn *)ctx;
+
+	c3cn_conn_debug("c3cn 0x%p, state 0x%x, flag 0x%lx.\n",
+			 c3cn, c3cn->state, c3cn->flags);
+	process_cpl_msg_ref(do_peer_fin, c3cn, skb);
+	return 0;
+}
+
+/*
+ * Handle an ABORT_REQ_RSS CPL message.
+ */
+static int do_abort_req(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
+{
+	const struct cpl_abort_req_rss *req = cplhdr(skb);
+	struct s3_conn *c3cn = (struct s3_conn *)ctx;
+
+	if (is_neg_adv_abort(req->status)) {
+		__kfree_skb(skb);
+		return 0;
+	}
+
+	c3cn_conn_debug("c3cn 0x%p, state 0x%x, flag 0x%lx.\n",
+			 c3cn, c3cn->state, c3cn->flags);
+
+	process_cpl_msg_ref(process_abort_req, c3cn, skb);
+	return 0;
+}
+
+/*
+ * Handle an ABORT_RPL_RSS CPL message.
+ */
+static int do_abort_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
+{
+	struct cpl_abort_rpl_rss *rpl = cplhdr(skb);
+	struct s3_conn *c3cn;
+
+	/*
+	 * Ignore replies to post-close aborts indicating that the abort was
+	 * requested too late.  These connections are terminated when we get
+	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
+	 * arrives the TID is either no longer used or it has been recycled.
+	 */
+	if (rpl->status == CPL_ERR_ABORT_FAILED) {
+discard:
+		__kfree_skb(skb);
+		return 0;
+	}
+
+	c3cn = (struct s3_conn *)ctx;
+	c3cn_conn_debug("c3cn 0x%p, state 0x%x, flag 0x%lx.\n",
+			 c3cn, c3cn->state, c3cn->flags);
+
+	/*
+	 * Sometimes we've already closed the connection, e.g., a post-close
+	 * abort races with ABORT_REQ_RSS, the latter frees the connection
+	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
+	 * but FW turns the ABORT_REQ into a regular one and so we get
+	 * ABORT_RPL_RSS with status 0 and no connection.  Only on T3A.
+	 */
+	if (!c3cn)
+		goto discard;
+
+	process_cpl_msg_ref(process_abort_rpl, c3cn, skb);
+	return 0;
+}
+
+/*
+ * Handler for CLOSE_CON_RPL CPL messages.
+ */
+static int do_close_con_rpl(struct t3cdev *cdev, struct sk_buff *skb,
+			    void *ctx)
+{
+	struct s3_conn *c3cn = (struct s3_conn *)ctx;
+
+	c3cn_conn_debug("c3cn 0x%p, state 0x%x, flag 0x%lx.\n",
+			 c3cn, c3cn->state, c3cn->flags);
+
+	process_cpl_msg_ref(process_close_con_rpl, c3cn, skb);
+	return 0;
+}
+
+/*
+ * Definitions and declarations for CPL message processing.
+ * ========================================================
+ */
+
+static void make_established(struct s3_conn *, u32, unsigned int);
+static void t3_release_offload_resources(struct s3_conn *);
+static void act_open_retry_timer(unsigned long);
+static void mk_act_open_req(struct s3_conn *, struct sk_buff *,
+			    unsigned int, const struct l2t_entry *);
+static int act_open_rpl_status_to_errno(int);
+static void handle_excess_rx(struct s3_conn *, struct sk_buff *);
+static int abort_status_to_errno(struct s3_conn *, int, int *);
+static void send_abort_rpl(struct sk_buff *, struct t3cdev *, int);
+static struct sk_buff *get_cpl_reply_skb(struct sk_buff *, size_t, gfp_t);
+static void t3_defer_reply(struct sk_buff *, struct t3cdev *, defer_handler_t);
+static void send_deferred_abort_rpl(struct t3cdev *, struct sk_buff *);
+
+/*
+ * Dequeue and return the first unacknowledged's WR on a connections's pending
+ * list.
+ */
+static inline struct sk_buff *dequeue_wr(struct s3_conn *c3cn)
+{
+	struct sk_buff *skb = c3cn->wr_pending_head;
+
+	if (likely(skb)) {
+		/* Don't bother clearing the tail */
+		c3cn->wr_pending_head = (struct sk_buff *)skb->sp;
+		skb->sp = NULL;
+	}
+	return skb;
+}
+
+/*
+ * Return the first pending WR without removing it from the list.
+ */
+static inline struct sk_buff *peek_wr(const struct s3_conn *c3cn)
+{
+	return c3cn->wr_pending_head;
+}
+
+static inline void free_wr_skb(struct sk_buff *skb)
+{
+	kfree_skb(skb);
+}
+
+static void purge_wr_queue(struct s3_conn *c3cn)
+{
+	struct sk_buff *skb;
+	while ((skb = dequeue_wr(c3cn)) != NULL)
+		free_wr_skb(skb);
+}
+
+static inline void set_abort_rpl_wr(struct sk_buff *skb, unsigned int tid,
+				    int cmd)
+{
+	struct cpl_abort_rpl *rpl = cplhdr(skb);
+
+	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
+	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
+	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
+	rpl->cmd = cmd;
+}
+
+/*
+ * CPL message processing ...
+ * ==========================
+ */
+
+/*
+ * Updates connection state from an active establish CPL message.  Runs with
+ * the connection lock held.
+ */
+static void c3cn_act_establish(struct s3_conn *c3cn,
+			       struct sk_buff *skb)
+{
+	struct cpl_act_establish *req = cplhdr(skb);
+	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
+
+	if (unlikely(c3cn->state != C3CN_STATE_SYN_SENT))
+		printk(KERN_ERR "TID %u expected SYN_SENT, found %d\n",
+		       c3cn->tid, c3cn->state);
+
+	c3cn->copied_seq = c3cn->rcv_wup = c3cn->rcv_nxt = rcv_isn;
+	make_established(c3cn, ntohl(req->snd_isn), ntohs(req->tcp_opt));
+
+	__kfree_skb(skb);
+
+	if (s3_push_frames(c3cn, 1))
+		cxgb3i_conn_tx_open(c3cn);
+}
+
+/*
+ * Handle active open failures.
+ */
+static void active_open_failed(struct s3_conn *c3cn,
+			       struct sk_buff *skb)
+{
+	struct cpl_act_open_rpl *rpl = cplhdr(skb);
+
+	if (rpl->status == CPL_ERR_CONN_EXIST &&
+	    c3cn->retry_timer.function != act_open_retry_timer) {
+		c3cn->retry_timer.function = act_open_retry_timer;
+		c3cn_reset_timer(c3cn, &c3cn->retry_timer,
+				 jiffies + HZ / 2);
+	} else
+		fail_act_open(c3cn, act_open_rpl_status_to_errno(rpl->status));
+	__kfree_skb(skb);
+}
+
+/*
+ * Process received pdu for a connection.
+ */
+static void process_rx_iscsi_hdr(struct s3_conn *c3cn,
+				 struct sk_buff *skb)
+{
+	struct cpl_iscsi_hdr *hdr_cpl = cplhdr(skb);
+	struct cpl_iscsi_hdr_norss data_cpl;
+	struct cpl_rx_data_ddp_norss ddp_cpl;
+	unsigned int hdr_len, data_len, status;
+	unsigned int len;
+	int err;
+
+	if (unlikely(c3cn_no_receive(c3cn))) {
+		handle_excess_rx(c3cn, skb);
+		return;
+	}
+
+	CXGB3_SKB_CB(skb)->seq = ntohl(hdr_cpl->seq);
+	CXGB3_SKB_CB(skb)->flags = 0;
+
+	skb_reset_transport_header(skb);
+	__skb_pull(skb, sizeof(struct cpl_iscsi_hdr));
+
+	len = hdr_len = ntohs(hdr_cpl->len);
+	/* msg coalesce is off or not enough data received */
+	if (skb->len <= hdr_len) {
+		printk(KERN_ERR "%s: TID %u, ISCSI_HDR, skb len %u < %u.\n",
+		       c3cn->cdev->name, c3cn->tid, skb->len, hdr_len);
+		goto abort_conn;
+	}
+
+	err = skb_copy_bits(skb, skb->len - sizeof(ddp_cpl), &ddp_cpl,
+			    sizeof(ddp_cpl));
+	if (err < 0)
+		goto abort_conn;
+
+	skb_ulp_mode(skb) = ULP2_FLAG_DATA_READY;
+	skb_ulp_pdulen(skb) = ntohs(ddp_cpl.len);
+	skb_ulp_ddigest(skb) = ntohl(ddp_cpl.ulp_crc);
+	status = ntohl(ddp_cpl.ddp_status);
+
+	if (status & (1 << RX_DDP_STATUS_HCRC_SHIFT))
+		skb_ulp_mode(skb) |= ULP2_FLAG_HCRC_ERROR;
+	if (status & (1 << RX_DDP_STATUS_DCRC_SHIFT))
+		skb_ulp_mode(skb) |= ULP2_FLAG_DCRC_ERROR;
+	if (status & (1 << RX_DDP_STATUS_PAD_SHIFT))
+		skb_ulp_mode(skb) |= ULP2_FLAG_PAD_ERROR;
+
+	if (skb->len > (hdr_len + sizeof(ddp_cpl))) {
+		err = skb_copy_bits(skb, hdr_len, &data_cpl, sizeof(data_cpl));
+		if (err < 0)
+			goto abort_conn;
+		data_len = ntohs(data_cpl.len);
+		len += sizeof(data_cpl) + data_len;
+	} else if (status & (1 << RX_DDP_STATUS_DDP_SHIFT))
+		skb_ulp_mode(skb) |= ULP2_FLAG_DATA_DDPED;
+
+	c3cn->rcv_nxt = ntohl(ddp_cpl.seq) + skb_ulp_pdulen(skb);
+	__pskb_trim(skb, len);
+	__skb_queue_tail(&c3cn->receive_queue, skb);
+	cxgb3i_conn_pdu_ready(c3cn);
+
+	return;
+
+abort_conn:
+	s3_send_reset(c3cn, CPL_ABORT_SEND_RST, NULL);
+	__kfree_skb(skb);
+}
+
+/*
+ * Process an acknowledgment of WR completion.  Advance snd_una and send the
+ * next batch of work requests from the write queue.
+ */
+static void wr_ack(struct s3_conn *c3cn, struct sk_buff *skb)
+{
+	struct cpl_wr_ack *hdr = cplhdr(skb);
+	unsigned int credits = ntohs(hdr->credits);
+	u32 snd_una = ntohl(hdr->snd_una);
+
+	c3cn->wr_avail += credits;
+	if (c3cn->wr_unacked > c3cn->wr_max - c3cn->wr_avail)
+		c3cn->wr_unacked = c3cn->wr_max - c3cn->wr_avail;
+
+	while (credits) {
+		struct sk_buff *p = peek_wr(c3cn);
+
+		if (unlikely(!p)) {
+			printk(KERN_ERR "%u WR_ACK credits for TID %u with "
+			       "nothing pending, state %u\n",
+			       credits, c3cn->tid, c3cn->state);
+			break;
+		}
+		if (unlikely(credits < p->csum)) {
+			p->csum -= credits;
+			break;
+		} else {
+			dequeue_wr(c3cn);
+			credits -= p->csum;
+			free_wr_skb(p);
+		}
+	}
+
+	if (unlikely(before(snd_una, c3cn->snd_una))) {
+		goto out_free;
+	}
+
+	if (c3cn->snd_una != snd_una) {
+		c3cn->snd_una = snd_una;
+		dst_confirm(c3cn->dst_cache);
+		if (c3cn->snd_una == c3cn->snd_nxt)
+			c3cn_reset_flag(c3cn, C3CN_TX_WAIT_IDLE);
+	}
+
+	if (skb_queue_len(&c3cn->write_queue) && s3_push_frames(c3cn, 0))
+		cxgb3i_conn_tx_open(c3cn);
+out_free:
+	__kfree_skb(skb);
+}
+
+/*
+ * Handle a peer FIN.
+ */
+static void do_peer_fin(struct s3_conn *c3cn, struct sk_buff *skb)
+{
+	int keep = 0;
+
+	if (c3cn_flag(c3cn, C3CN_ABORT_RPL_PENDING))
+		goto out;
+
+	c3cn->shutdown |= C3CN_RCV_SHUTDOWN;
+	c3cn_set_flag(c3cn, C3CN_DONE);
+
+	switch (c3cn->state) {
+	case C3CN_STATE_ESTABLISHED:
+		break;
+	case C3CN_STATE_CLOSING:
+		t3_release_offload_resources(c3cn);
+		c3cn_done(c3cn);
+		break;
+	default:
+		printk(KERN_ERR
+		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
+		       c3cn->cdev->name, c3cn->tid, c3cn->state);
+	}
+	
+	cxgb3i_conn_closing(c3cn);
+out:
+	if (!keep)
+		__kfree_skb(skb);
+}
+
+/*
+ * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
+ * request except that we need to reply to it.
+ */
+static void process_abort_req(struct s3_conn *c3cn,
+			      struct sk_buff *skb)
+{
+	int rst_status = CPL_ABORT_NO_RST;
+	const struct cpl_abort_req_rss *req = cplhdr(skb);
+
+	if (!c3cn_flag(c3cn, C3CN_ABORT_REQ_RCVD)) {
+		c3cn_set_flag(c3cn, C3CN_ABORT_REQ_RCVD);
+		c3cn_set_flag(c3cn, C3CN_ABORT_SHUTDOWN);
+		__kfree_skb(skb);
+		return;
+	}
+	c3cn_reset_flag(c3cn, C3CN_ABORT_REQ_RCVD);
+
+	/*
+	 * Three cases to consider:
+	 * a) We haven't sent an abort_req; close the connection.
+	 * b) We have sent a post-close abort_req that will get to TP too late
+	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
+	 *    be ignored and the connection should be closed now.
+	 * c) We have sent a regular abort_req that will get to TP too late.
+	 *    That will generate an abort_rpl with status 0, wait for it.
+	 */
+	send_abort_rpl(skb, c3cn->cdev, rst_status);
+
+	if (!c3cn_flag(c3cn, C3CN_ABORT_RPL_PENDING)) {
+		c3cn->err =
+		    abort_status_to_errno(c3cn, req->status, &rst_status);
+
+		t3_release_offload_resources(c3cn);
+		c3cn_done(c3cn);
+	}
+}
+
+/*
+ * Process abort replies.  We only process these messages if we anticipate
+ * them as the coordination between SW and HW in this area is somewhat lacking
+ * and sometimes we get ABORT_RPLs after we are done with the connection that
+ * originated the ABORT_REQ.
+ */
+static void process_abort_rpl(struct s3_conn *c3cn,
+			      struct sk_buff *skb)
+{
+	if (c3cn_flag(c3cn, C3CN_ABORT_RPL_PENDING)) {
+		if (!c3cn_flag(c3cn, C3CN_ABORT_RPL_RCVD))
+			c3cn_set_flag(c3cn, C3CN_ABORT_RPL_RCVD);
+		else {
+			c3cn_reset_flag(c3cn, C3CN_ABORT_RPL_RCVD);
+			c3cn_reset_flag(c3cn, C3CN_ABORT_RPL_PENDING);
+			BUG_ON(c3cn_flag(c3cn, C3CN_ABORT_REQ_RCVD));
+			t3_release_offload_resources(c3cn);
+			c3cn_done(c3cn);
+		}
+	}
+	__kfree_skb(skb);
+}
+
+/*
+ * Process a peer ACK to our FIN.
+ */
+static void process_close_con_rpl(struct s3_conn *c3cn,
+				  struct sk_buff *skb)
+{
+	struct cpl_close_con_rpl *rpl = cplhdr(skb);
+
+	c3cn->snd_una = ntohl(rpl->snd_nxt) - 1;	/* exclude FIN */
+
+	if (c3cn_flag(c3cn, C3CN_ABORT_RPL_PENDING))
+		goto out;
+
+	if (c3cn->state == C3CN_STATE_CLOSING) {
+		t3_release_offload_resources(c3cn);
+		c3cn_done(c3cn);
+	} else 
+		printk(KERN_ERR
+		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
+		       c3cn->cdev->name, c3cn->tid, c3cn->state);
+out:
+	kfree_skb(skb);
+}
+
+/*
+ * Random utility functions for CPL message processing ...
+ * =======================================================
+ */
+
+/**
+ *	find_best_mtu - find the entry in the MTU table closest to an MTU
+ *	@d: TOM state
+ *	@mtu: the target MTU
+ *
+ *	Returns the index of the value in the MTU table that is closest to but
+ *	does not exceed the target MTU.
+ */
+static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu)
+{
+	int i = 0;
+
+	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
+		++i;
+	return i;
+}
+
+static unsigned int select_mss(struct s3_conn *c3cn, unsigned int pmtu)
+{
+	unsigned int idx;
+	struct dst_entry *dst = c3cn->dst_cache;
+	struct t3cdev *cdev = c3cn->cdev;
+	const struct t3c_data *td = T3C_DATA(cdev);
+	u16 advmss = dst_metric(dst, RTAX_ADVMSS);
+
+	if (advmss > pmtu - 40)
+		advmss = pmtu - 40;
+	if (advmss < td->mtus[0] - 40)
+		advmss = td->mtus[0] - 40;
+	idx = find_best_mtu(td, advmss + 40);
+	return idx;
+}
+
+static void fail_act_open(struct s3_conn *c3cn, int errno)
+{
+	c3cn->err = errno;
+	t3_release_offload_resources(c3cn);
+	c3cn_done(c3cn);
+}
+
+/*
+ * Assign offload parameters to some connection fields.
+ */
+static void init_offload_conn(struct s3_conn *c3cn,
+			      struct t3cdev *cdev,
+			      struct dst_entry *dst)
+{
+	BUG_ON(c3cn->cdev != cdev);
+	c3cn->wr_max = c3cn->wr_avail = T3C_DATA(cdev)->max_wrs;
+	c3cn->wr_unacked = 0;
+	c3cn->mss_idx = select_mss(c3cn, dst_mtu(dst));
+
+	c3cn->ctrl_skb_cache = alloc_skb(CTRL_SKB_LEN, gfp_any());
+	reset_wr_list(c3cn);
+}
+
+static void act_open_retry_timer(unsigned long data)
+{
+	struct sk_buff *skb;
+	struct s3_conn *c3cn = (struct s3_conn *)data;
+
+	spin_lock(&c3cn->lock);
+	skb = alloc_skb(sizeof(struct cpl_act_open_req), GFP_ATOMIC);
+	if (!skb)
+		fail_act_open(c3cn, ENOMEM);
+	else {
+		skb->sk = (struct sock *)c3cn;
+		set_arp_failure_handler(skb, act_open_req_arp_failure);
+		mk_act_open_req(c3cn, skb, c3cn->tid, c3cn->l2t);
+		l2t_send(c3cn->cdev, skb, c3cn->l2t);
+	}
+	spin_unlock(&c3cn->lock);
+	c3cn_put(c3cn);
+}
+
+/*
+ * Convert an ACT_OPEN_RPL status to a Linux errno.
+ */
+static int act_open_rpl_status_to_errno(int status)
+{
+	switch (status) {
+	case CPL_ERR_CONN_RESET:
+		return ECONNREFUSED;
+	case CPL_ERR_ARP_MISS:
+		return EHOSTUNREACH;
+	case CPL_ERR_CONN_TIMEDOUT:
+		return ETIMEDOUT;
+	case CPL_ERR_TCAM_FULL:
+		return ENOMEM;
+	case CPL_ERR_CONN_EXIST:
+		printk(KERN_ERR "ACTIVE_OPEN_RPL: 4-tuple in use\n");
+		return EADDRINUSE;
+	default:
+		return EIO;
+	}
+}
+
+/*
+ * Convert the status code of an ABORT_REQ into a Linux error code.  Also
+ * indicate whether RST should be sent in response.
+ */
+static int abort_status_to_errno(struct s3_conn *c3cn,
+				 int abort_reason, int *need_rst)
+{
+	switch (abort_reason) {
+	case CPL_ERR_BAD_SYN: /* fall through */
+	case CPL_ERR_CONN_RESET:
+		return c3cn->state == C3CN_STATE_CLOSING ? EPIPE : ECONNRESET;
+	case CPL_ERR_XMIT_TIMEDOUT:
+	case CPL_ERR_PERSIST_TIMEDOUT:
+	case CPL_ERR_FINWAIT2_TIMEDOUT:
+	case CPL_ERR_KEEPALIVE_TIMEDOUT:
+		return ETIMEDOUT;
+	default:
+		return EIO;
+	}
+}
+
+static void send_abort_rpl(struct sk_buff *skb, struct t3cdev *cdev,
+			   int rst_status)
+{
+	struct sk_buff *reply_skb;
+	struct cpl_abort_req_rss *req = cplhdr(skb);
+
+	reply_skb = get_cpl_reply_skb(skb, sizeof(struct cpl_abort_rpl),
+				      gfp_any());
+	if (!reply_skb) {
+		/* Defer the reply.  Stick rst_status into req->cmd. */
+		req->status = rst_status;
+		t3_defer_reply(skb, cdev, send_deferred_abort_rpl);
+		return;
+	}
+
+	reply_skb->priority = CPL_PRIORITY_DATA;
+	set_abort_rpl_wr(reply_skb, GET_TID(req), rst_status);
+	kfree_skb(skb);
+	cxgb3_ofld_send(cdev, reply_skb);
+}
+
+/*
+ * Returns an sk_buff for a reply CPL message of size len.  If the input
+ * sk_buff has no other users it is trimmed and reused, otherwise a new buffer
+ * is allocated.  The input skb must be of size at least len.  Note that this
+ * operation does not destroy the original skb data even if it decides to reuse
+ * the buffer.
+ */
+static struct sk_buff *get_cpl_reply_skb(struct sk_buff *skb, size_t len,
+					 gfp_t gfp)
+{
+	if (likely(!skb_cloned(skb))) {
+		BUG_ON(skb->len < len);
+		__skb_trim(skb, len);
+		skb_get(skb);
+	} else {
+		skb = alloc_skb(len, gfp);
+		if (skb)
+			__skb_put(skb, len);
+	}
+	return skb;
+}
+
+/*
+ * Add an skb to the deferred skb queue for processing from process context.
+ */
+static void t3_defer_reply(struct sk_buff *skb, struct t3cdev *cdev,
+			   defer_handler_t handler)
+{
+	struct cxgb3i_sdev_data *cdata = CXGB3_SDEV_DATA(cdev);
+
+	DEFERRED_SKB_CB(skb)->handler = handler;
+	spin_lock_bh(&cdata->deferq.lock);
+	__skb_queue_tail(&cdata->deferq, skb);
+	if (skb_queue_len(&cdata->deferq) == 1)
+		schedule_work(&cdata->deferq_task);
+	spin_unlock_bh(&cdata->deferq.lock);
+}
+
+/*
+ * Process the defer queue.
+ */
+static void process_deferq(struct work_struct *task_param)
+{
+	struct sk_buff *skb;
+	struct cxgb3i_sdev_data *cdata = container_of(task_param,
+						     struct cxgb3i_sdev_data,
+						     deferq_task);
+
+	spin_lock_bh(&cdata->deferq.lock);
+	while ((skb = __skb_dequeue(&cdata->deferq)) != NULL) {
+		spin_unlock_bh(&cdata->deferq.lock);
+		DEFERRED_SKB_CB(skb)->handler(cdata->cdev, skb);
+		spin_lock_bh(&cdata->deferq.lock);
+	}
+	spin_unlock_bh(&cdata->deferq.lock);
+}
+
+static void send_deferred_abort_rpl(struct t3cdev *cdev, struct sk_buff *skb)
+{
+	struct sk_buff *reply_skb;
+	struct cpl_abort_req_rss *req = cplhdr(skb);
+
+	reply_skb = alloc_skb(sizeof(struct cpl_abort_rpl),
+			      GFP_KERNEL | __GFP_NOFAIL);
+	reply_skb->priority = CPL_PRIORITY_DATA;
+	__skb_put(reply_skb, sizeof(struct cpl_abort_rpl));
+	set_abort_rpl_wr(reply_skb, GET_TID(req), req->status);
+	cxgb3_ofld_send(cdev, reply_skb);
+	kfree_skb(skb);
+}
+
+/*
+ * Release resources held by an offload connection (TID, L2T entry, etc.)
+ */
+static void t3_release_offload_resources(struct s3_conn *c3cn)
+{
+	struct t3cdev *cdev = c3cn->cdev;
+	unsigned int tid = c3cn->tid;
+
+	if (!cdev)
+		return;
+
+	c3cn->qset = 0;
+
+	kfree_skb(c3cn->ctrl_skb_cache);
+	c3cn->ctrl_skb_cache = NULL;
+
+	if (c3cn->wr_avail != c3cn->wr_max) {
+		purge_wr_queue(c3cn);
+		reset_wr_list(c3cn);
+	}
+
+	if (c3cn->l2t) {
+		l2t_release(L2DATA(cdev), c3cn->l2t);
+		c3cn->l2t = NULL;
+	}
+
+	if (c3cn->state == C3CN_STATE_SYN_SENT) /* we have ATID */
+		free_atid(cdev, tid);
+	else {		/* we have TID */
+		cxgb3_remove_tid(cdev, (void *)c3cn, tid);
+		c3cn_put(c3cn);
+	}
+
+	c3cn->cdev = NULL;
+}
+
+/*
+ * Handles Rx data that arrives in a state where the connection isn't
+ * accepting new data.
+ */
+static void handle_excess_rx(struct s3_conn *c3cn, struct sk_buff *skb)
+{
+	if (!c3cn_flag(c3cn, C3CN_ABORT_SHUTDOWN))
+		abort_conn(c3cn, skb);
+
+	kfree_skb(skb);
+}
+
+/*
+ * Like get_cpl_reply_skb() but the returned buffer starts out empty.
+ */
+static struct sk_buff *__get_cpl_reply_skb(struct sk_buff *skb, size_t len,
+					   gfp_t gfp)
+{
+	if (likely(!skb_cloned(skb) && !skb->data_len)) {
+		__skb_trim(skb, 0);
+		skb_get(skb);
+	} else
+		skb = alloc_skb(len, gfp);
+	return skb;
+}
+
+/*
+ * Completes some final bits of initialization for just established connections
+ * and changes their state to C3CN_STATE_ESTABLISHED.
+ *
+ * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
+ */
+static void make_established(struct s3_conn *c3cn, u32 snd_isn,
+			     unsigned int opt)
+{
+	c3cn->write_seq = c3cn->snd_nxt = c3cn->snd_una = snd_isn;
+
+	/*
+	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
+	 * pass through opt0.
+	 */
+	if (rcv_win > (M_RCV_BUFSIZ << 10))
+		c3cn->rcv_wup -= rcv_win - (M_RCV_BUFSIZ << 10);
+
+	dst_confirm(c3cn->dst_cache);
+
+	smp_mb();
+	c3cn_set_state(c3cn, C3CN_STATE_ESTABLISHED);
+}
diff --git a/drivers/scsi/cxgb3i/cxgb3i_offload.h b/drivers/scsi/cxgb3i/cxgb3i_offload.h
new file mode 100644
index 0000000..98d5c7d
--- /dev/null
+++ b/drivers/scsi/cxgb3i/cxgb3i_offload.h
@@ -0,0 +1,242 @@
+/*
+ * Copyright (C) 2003-2008 Chelsio Communications.  All rights reserved.
+ *
+ * Written by Dimitris Michailidis (dm@chelsio.com)
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the LICENSE file included in this
+ * release for licensing terms and conditions.
+ */
+
+#ifndef _CXGB3I_OFFLOAD_H
+#define _CXGB3I_OFFLOAD_H
+
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include "t3cdev.h"
+#include "cxgb3_offload.h"
+
+#define cxgb3i_log_error(fmt...) printk(KERN_ERR "cxgb3i: ERR! " fmt)
+#define cxgb3i_log_warn(fmt...)	 printk(KERN_WARNING "cxgb3i: WARN! " fmt)
+#define cxgb3i_log_info(fmt...)  printk(KERN_INFO "cxgb3i: " fmt)
+
+#ifdef __DEBUG_CXGB3I__
+#define cxgb3i_log_debug(fmt, args...) \
+        printk(KERN_ERR "cxgb3i: %s - " fmt, __func__ , ## args)
+#else
+#define cxgb3i_log_debug(fmt...)
+#endif
+
+/*
+ * Data structure to keep track of cxgb3 connection.
+ */
+struct s3_conn {
+	struct net_device *dev;
+	struct t3cdev *cdev;
+	unsigned long flags;
+	int tid;
+	int qset;
+	int mss_idx;
+	struct l2t_entry *l2t;
+	int wr_max;
+	int wr_avail;
+	int wr_unacked;
+	struct sk_buff *wr_pending_head;
+	struct sk_buff *wr_pending_tail;
+	struct sk_buff *ctrl_skb_cache;
+
+	spinlock_t lock;
+	atomic_t refcnt;
+	volatile unsigned int state;
+	struct sockaddr_in saddr;
+	struct sockaddr_in daddr;
+	struct dst_entry *dst_cache;
+	unsigned char shutdown;
+	struct sk_buff_head receive_queue;
+	struct sk_buff_head write_queue;
+	struct timer_list retry_timer;
+	int err;
+	rwlock_t callback_lock;
+	void *user_data;
+
+	u32 rcv_nxt;		/* What we want to receive next		*/
+	u32 copied_seq;		/* Head of yet unread data		*/
+	u32 rcv_wup;		/* rcv_nxt on last window update sent	*/
+	u32 snd_nxt;		/* Next sequence we send		*/
+	u32 snd_una;		/* First byte we want an ack for	*/
+
+	u32 write_seq;		/* Tail(+1) of data held in send buffer */
+};
+
+/* Flags in c3cn->shutdown */
+#define C3CN_SHUTDOWN_MASK	3
+#define C3CN_RCV_SHUTDOWN	1
+#define C3CN_SEND_SHUTDOWN	2
+
+/*
+ * connection state bitmap
+ */
+#define C3CN_STATE_CLOSE	0x1
+#define C3CN_STATE_SYN_SENT	0x2
+#define C3CN_STATE_ESTABLISHED	0x4
+#define C3CN_STATE_CLOSING	0x8
+#define C3CN_STATE_ABORING	0x10
+
+#define C3CN_STATE_MASK		0xFF
+#define C3CN_NEED_CLOSE		0x100
+
+/*
+ * Connection flags -- many to track some close related events.
+ */
+enum c3cn_flags {
+	C3CN_ABORT_RPL_RCVD,	/* received one ABORT_RPL_RSS message */
+	C3CN_ABORT_REQ_RCVD,	/* received one ABORT_REQ_RSS message */
+	C3CN_TX_WAIT_IDLE,	/* suspend Tx until in-flight data is ACKed */
+	C3CN_ABORT_SHUTDOWN,	/* shouldn't send more abort requests */
+	C3CN_ABORT_RPL_PENDING,	/* expecting an abort reply */
+	C3CN_CLOSE_CON_REQUESTED,	/* we've sent a close_conn_req */
+	C3CN_TX_DATA_SENT,	/* already sent a TX_DATA WR */
+
+	C3CN_DONE,
+};
+
+static inline void c3cn_set_flag(struct s3_conn *c3cn,
+				 enum c3cn_flags flag)
+{
+	__set_bit(flag, &c3cn->flags);
+}
+
+static inline void c3cn_reset_flag(struct s3_conn *c3cn,
+				   enum c3cn_flags flag)
+{
+	__clear_bit(flag, &c3cn->flags);
+}
+
+static inline int c3cn_flag(struct s3_conn *c3cn, enum c3cn_flags flag)
+{
+	if (c3cn == NULL)
+		return 0;
+	return test_bit(flag, &c3cn->flags);
+}
+
+/*
+ * Per adapter data.  Linked off of each Ethernet device port on the adapter.
+ * Also available via the t3cdev structure since we have pointers to our port
+ * net_device's there ...
+ */
+struct cxgb3i_sdev_data {
+	struct list_head list;
+	struct t3cdev *cdev;
+	struct cxgb3_client *client;
+	struct adap_ports *ports;
+	unsigned int rx_page_size;
+	struct sk_buff_head deferq;
+	struct work_struct deferq_task;
+};
+#define NDEV2CDATA(ndev) (*(struct cxgb3i_sdev_data **)&(ndev)->ec_ptr)
+#define CXGB3_SDEV_DATA(cdev) NDEV2CDATA((cdev)->lldev)
+
+static inline void c3cn_hold(struct s3_conn *c3cn)
+{
+	atomic_inc(&c3cn->refcnt);
+}
+
+static inline void c3cn_put(struct s3_conn *c3cn)
+{
+	if (atomic_dec_and_test(&c3cn->refcnt))
+		kfree(c3cn);
+}
+
+void c3cn_close(struct s3_conn *);
+static inline void c3cn_release(struct s3_conn *c3cn)
+{
+	c3cn_close(c3cn);
+	c3cn_put(c3cn);
+}
+
+/*
+ * Primary API routines.
+ */
+
+int cxgb3i_sdev_init(cxgb3_cpl_handler_func *);
+void cxgb3i_sdev_add(struct t3cdev *, struct cxgb3_client *);
+void cxgb3i_sdev_remove(struct t3cdev *);
+
+struct s3_conn *cxgb3i_c3cn_create(void);
+int cxgb3i_c3cn_connect(struct s3_conn *, struct sockaddr_in *);
+void cxgb3i_c3cn_rx_credits(struct s3_conn *, int);
+int cxgb3i_c3cn_send_pdus(struct s3_conn *, struct sk_buff *, int);
+
+/*
+ * Definitions for sk_buff state and ULP mode management.
+ */
+
+struct cxgb3_skb_cb {
+	__u8 flags;		
+	__u8 ulp_mode;		/* ULP mode/submode of sk_buff */
+	__u32 seq;		/* sequence number */
+	__u32 ddigest;		/* ULP rx_data_ddp selected field */
+	__u32 pdulen;		/* ULP rx_data_ddp selected field */
+	__u8 ulp_data[16];	/* scratch area for ULP */
+};
+
+#define CXGB3_SKB_CB(skb)	((struct cxgb3_skb_cb *)&((skb)->cb[0]))
+
+#define skb_ulp_mode(skb)	(CXGB3_SKB_CB(skb)->ulp_mode)
+#define skb_ulp_ddigest(skb)	(CXGB3_SKB_CB(skb)->ddigest)
+#define skb_ulp_pdulen(skb)	(CXGB3_SKB_CB(skb)->pdulen)
+#define skb_ulp_data(skb)	(CXGB3_SKB_CB(skb)->ulp_data)
+
+enum {
+	C3CB_FLAG_NEED_HDR = 1 << 0,	/* packet needs a TX_DATA_WR header */
+	C3CB_FLAG_NO_APPEND = 1 << 1,	/* don't grow this skb */
+	C3CB_FLAG_BARRIER = 1 << 2,	/* set TX_WAIT_IDLE after sending */
+	C3CB_FLAG_COMPL = 1 << 4,	/* request WR completion */
+};
+
+/*
+ * Definitions for managing deferred CPL replies from process context.
+ */
+
+typedef void (*defer_handler_t) (struct t3cdev *, struct sk_buff *);
+
+struct deferred_skb_cb {
+	defer_handler_t handler;
+	struct t3cdev *cdev;
+};
+
+#define DEFERRED_SKB_CB(skb) ((struct deferred_skb_cb *)(skb)->cb)
+
+/*
+ * Top-level CPL message processing used by most CPL messages that
+ * pertain to connections.
+ */
+static inline void process_cpl_msg(void (*fn)(struct s3_conn *,
+					      struct sk_buff *),
+				   struct s3_conn *c3cn,
+				   struct sk_buff *skb)
+{
+	spin_lock(&c3cn->lock);
+	fn(c3cn, skb);
+	spin_unlock(&c3cn->lock);
+}
+
+/*
+ * Opaque version of structure the SGE stores at skb->head of TX_DATA packets
+ * and for which we must reserve space.
+ */
+struct sge_opaque_hdr {
+	void *dev;
+	dma_addr_t addr[MAX_SKB_FRAGS + 1];
+};
+
+/* for TX: a skb must have a headroom of at least TX_HEADER_LEN bytes */
+#define TX_HEADER_LEN \
+		(sizeof(struct tx_data_wr) + sizeof(struct sge_opaque_hdr))
+
+void *cxgb3i_alloc_big_mem(unsigned int);
+void cxgb3i_free_big_mem(void *);
+
+#endif /* _CXGB3_OFFLOAD_H */
diff --git a/drivers/scsi/cxgb3i/cxgb3i_ulp2.c b/drivers/scsi/cxgb3i/cxgb3i_ulp2.c
new file mode 100644
index 0000000..313bb90
--- /dev/null
+++ b/drivers/scsi/cxgb3i/cxgb3i_ulp2.c
@@ -0,0 +1,692 @@
+/*
+ * cxgb3i_ddp.c: Chelsio S3xx iSCSI driver.
+ *
+ * Copyright (c) 2008 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie@chelsio.com)
+ */
+
+#include <linux/skbuff.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_host.h>
+#include <linux/crypto.h>
+#include "../iscsi_tcp.h"
+
+#include "cxgb3i.h"
+#include "cxgb3i_ulp2.h"
+
+#ifdef __DEBUG_CXGB3I_RX__
+#define cxgb3i_rx_debug		cxgb3i_log_debug
+#else
+#define cxgb3i_rx_debug(fmt...)
+#endif
+
+#ifdef __DEBUG_CXGB3I_TX__
+#define cxgb3i_tx_debug		cxgb3i_log_debug
+#else
+#define cxgb3i_tx_debug(fmt...)
+#endif
+
+#ifdef __DEBUG_CXGB3I_TAG__
+#define cxgb3i_tag_debug	cxgb3i_log_debug
+#else
+#define cxgb3i_tag_debug(fmt...)
+#endif
+
+#ifdef __DEBUG_CXGB3I_DDP__
+#define cxgb3i_ddp_debug	cxgb3i_log_debug
+#else
+#define cxgb3i_ddp_debug(fmt...)
+#endif
+
+static struct page *pad_page;
+
+#define ULP2_PGIDX_MAX		4
+#define ULP2_4K_PAGE_SHIFT	12
+#define ULP2_4K_PAGE_MASK	(~((1UL << ULP2_4K_PAGE_SHIFT) - 1))
+static unsigned char ddp_page_order[ULP2_PGIDX_MAX];
+static unsigned long ddp_page_size[ULP2_PGIDX_MAX];
+static unsigned char ddp_page_shift[ULP2_PGIDX_MAX];
+static unsigned char sw_tag_idx_bits;
+static unsigned char sw_tag_age_bits;
+
+static void cxgb3i_ddp_page_init(void)
+{
+	int i;
+	unsigned long n = PAGE_SIZE >> ULP2_4K_PAGE_SHIFT;
+
+	if (PAGE_SIZE & (~ULP2_4K_PAGE_MASK)) {
+		cxgb3i_log_warn("PAGE_SIZE 0x%lx is not multiple of 4K, "
+				"ddp disabled.\n", PAGE_SIZE);
+		return;
+	}
+	n = __ilog2_u32(n);
+	for (i = 0; i < ULP2_PGIDX_MAX; i++, n++) {
+		ddp_page_order[i] = n;
+		ddp_page_shift[i] = ULP2_4K_PAGE_SHIFT + n;
+		ddp_page_size[i] = 1 << ddp_page_shift[i];
+		cxgb3i_log_debug("%d, order %u, shift %u, size 0x%lx.\n", i,
+				 ddp_page_order[i], ddp_page_shift[i],
+				 ddp_page_size[i]);
+	}
+
+	sw_tag_idx_bits = (__ilog2_u32(ISCSI_ITT_MASK)) + 1;
+	sw_tag_age_bits = (__ilog2_u32(ISCSI_AGE_MASK)) + 1;
+}
+
+static inline void ulp_mem_io_set_hdr(struct sk_buff *skb, unsigned int addr)
+{
+	struct ulp_mem_io *req = (struct ulp_mem_io *)skb->head;
+	req->wr.wr_lo = 0;
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
+	req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(addr >> 5) |
+				   V_ULPTX_CMD(ULP_MEM_WRITE));
+	req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE >> 5) |
+			 V_ULPTX_NFLITS((PPOD_SIZE >> 3) + 1));
+}
+
+static int set_ddp_map(struct cxgb3i_adapter *snic, struct pagepod_hdr *hdr,
+		       unsigned int idx, unsigned int npods,
+		       struct scatterlist *sgl, unsigned int sgcnt)
+{
+	struct cxgb3i_ddp_info *ddp = &snic->ddp;
+	struct scatterlist *sg = sgl;
+	unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ddp->llimit;
+	int i;
+
+	for (i = 0; i < npods; i++, pm_addr += PPOD_SIZE) {
+		struct sk_buff *skb;
+		struct pagepod *ppod;
+		int j, k;
+		skb =
+		    alloc_skb(sizeof(struct ulp_mem_io) + PPOD_SIZE,
+			      GFP_ATOMIC);
+		if (!skb)
+			return -ENOMEM;
+		skb_put(skb, sizeof(struct ulp_mem_io) + PPOD_SIZE);
+
+		ulp_mem_io_set_hdr(skb, pm_addr);
+		ppod =
+		    (struct pagepod *)(skb->head + sizeof(struct ulp_mem_io));
+		memcpy(&(ppod->hdr), hdr, sizeof(struct pagepod));
+		for (j = 0, k = i * 4; j < 5; j++, k++) {
+			if (k < sgcnt) {
+				ppod->addr[j] = cpu_to_be64(sg_dma_address(sg));
+				if (j < 4)
+					sg = sg_next(sg);
+			} else
+				ppod->addr[j] = 0UL;
+		}
+
+		skb->priority = CPL_PRIORITY_CONTROL;
+		cxgb3_ofld_send(snic->tdev, skb);
+	}
+	return 0;
+}
+
+static int clear_ddp_map(struct cxgb3i_adapter *snic, unsigned int idx,
+			 unsigned int npods)
+{
+	struct cxgb3i_ddp_info *ddp = &snic->ddp;
+	unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ddp->llimit;
+	int i;
+
+	for (i = 0; i < npods; i++, pm_addr += PPOD_SIZE) {
+		struct sk_buff *skb;
+		skb =
+		    alloc_skb(sizeof(struct ulp_mem_io) + PPOD_SIZE,
+			      GFP_ATOMIC);
+		if (!skb)
+			return -ENOMEM;
+		skb_put(skb, sizeof(struct ulp_mem_io) + PPOD_SIZE);
+		memset((skb->head + sizeof(struct ulp_mem_io)), 0, PPOD_SIZE);
+		ulp_mem_io_set_hdr(skb, pm_addr);
+		skb->priority = CPL_PRIORITY_CONTROL;
+		cxgb3_ofld_send(snic->tdev, skb);
+	}
+	return 0;
+}
+
+static int cxgb3i_ddp_sgl_check(struct scatterlist *sgl, unsigned int sgcnt)
+{
+	struct scatterlist *sg;
+	int i;
+
+	/* make sure the sgl is fit for ddp:
+	 *      each has the same page size, and
+	 *      first & last page do not need to be used completely, and
+	 *      the rest of page must be used completely
+	 */
+	for_each_sg(sgl, sg, sgcnt, i) {
+		if ((i && sg->offset) ||
+		    ((i != sgcnt - 1) &&
+		     (sg->length + sg->offset) != PAGE_SIZE))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static inline int ddp_find_unused_entries(struct cxgb3i_ddp_info *ddp,
+					  int start, int max, int count)
+{
+	unsigned int i, j;
+
+	spin_lock(&ddp->map_lock);
+	for (i = start; i <= max;) {
+		for (j = 0; j < count; j++) {
+			if (ddp->map[i + j])
+				break;
+		}
+		if (j == count) {
+			memset(&ddp->map[i], 1, count);
+			spin_unlock(&ddp->map_lock);
+			return i;
+		}
+		i += j + 1;
+	}
+	spin_unlock(&ddp->map_lock);
+	return -EBUSY;
+}
+
+static inline void ddp_unmark_entries(struct cxgb3i_ddp_info *ddp,
+				      int start, int count)
+{
+	spin_lock(&ddp->map_lock);
+	memset(&ddp->map[start], 0, count);
+	spin_unlock(&ddp->map_lock);
+}
+
+u32 cxgb3i_ddp_tag_reserve(struct cxgb3i_adapter *snic, unsigned int tid,
+			   u32 sw_tag, unsigned int xferlen,
+			   struct scatterlist *sgl, unsigned int sgcnt)
+{
+	struct cxgb3i_ddp_info *ddp = &snic->ddp;
+	struct pagepod_hdr hdr;
+	unsigned int npods;
+	int idx = -1, idx_max;
+	u32 tag;
+	int err;
+
+	if (!ddp || !sgcnt || xferlen < PAGE_SIZE)
+		return RESERVED_ITT;
+
+	err = cxgb3i_ddp_sgl_check(sgl, sgcnt);
+	if (err < 0)
+		return RESERVED_ITT;
+
+	npods = (sgcnt + PPOD_PAGES_MAX - 1) >> PPOD_PAGES_SHIFT;
+	idx_max = ddp->nppods - npods + 1;
+
+	if (ddp->idx_last == ddp->nppods)
+		idx = ddp_find_unused_entries(ddp, 0, idx_max, npods);
+	else {
+		idx = ddp_find_unused_entries(ddp, ddp->idx_last + 1, idx_max,
+					      npods);
+		if ((idx < 0) && (ddp->idx_last >= npods))
+			idx = ddp_find_unused_entries(ddp, 0,
+						      ddp->idx_last - npods + 1,
+						      npods);
+	}
+	if (idx < 0)
+		return RESERVED_ITT;
+
+	if (pci_map_sg(snic->pdev, sgl, sgcnt, PCI_DMA_FROMDEVICE) <= 0)
+		goto unmark_entries;
+
+	tag = sw_tag | (idx << snic->tag_format.rsvd_shift);
+
+	hdr.rsvd = 0;
+	hdr.vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
+	hdr.pgsz_tag_clr = htonl(tag);
+	hdr.maxoffset = htonl(xferlen);
+	hdr.pgoffset = htonl(sgl->offset);
+
+	if (set_ddp_map(snic, &hdr, idx, npods, sgl, sgcnt) < 0)
+		goto unmap_sgl;
+
+	ddp->idx_last = idx;
+	cxgb3i_tag_debug("tid 0x%x, xfer %u, 0x%x -> ddp tag 0x%x (%u, %u).\n",
+			 tid, xferlen, sw_tag, tag, idx, npods);
+	return tag;
+
+unmap_sgl:
+	pci_unmap_sg(snic->pdev, sgl, sgcnt, PCI_DMA_FROMDEVICE);
+
+unmark_entries:
+	ddp_unmark_entries(ddp, idx, npods);
+	return RESERVED_ITT;
+}
+
+void cxgb3i_ddp_tag_release(struct cxgb3i_adapter *snic, u32 tag,
+			    struct scatterlist *sgl, unsigned int sgcnt)
+{
+	u32 idx = (tag >> snic->tag_format.rsvd_shift) &
+	    snic->tag_format.rsvd_mask;
+	unsigned int npods = (sgcnt + PPOD_PAGES_MAX - 1) >> PPOD_PAGES_SHIFT;
+
+	if (idx < snic->tag_format.rsvd_mask) {
+		cxgb3i_tag_debug("ddp tag 0x%x, release idx 0x%x, npods %u.\n",
+				 tag, idx, npods);
+		clear_ddp_map(snic, idx, npods);
+		ddp_unmark_entries(&snic->ddp, idx, npods);
+		pci_unmap_sg(snic->pdev, sgl, sgcnt, PCI_DMA_FROMDEVICE);
+	}
+}
+
+int cxgb3i_conn_ulp_setup(struct cxgb3i_conn *cconn, int hcrc, int dcrc)
+{
+	struct iscsi_tcp_conn *tcp_conn = cconn->conn->dd_data;
+	struct s3_conn *c3cn = (struct s3_conn *)(tcp_conn->sock);
+	struct sk_buff *skb = alloc_skb(sizeof(struct cpl_set_tcb_field),
+					GFP_KERNEL | __GFP_NOFAIL);
+	struct cpl_set_tcb_field *req;
+	u32 submode = (hcrc ? 1 : 0) | (dcrc ? 2 : 0);
+
+	/* set up ulp submode and page size */
+	req = (struct cpl_set_tcb_field *)skb_put(skb, sizeof(*req));
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, c3cn->tid));
+	req->reply = V_NO_REPLY(1);
+	req->cpu_idx = 0;
+	req->word = htons(31);
+	req->mask = cpu_to_be64(0xFF000000);
+	/* the connection page size is always the same as ddp-pgsz0 */
+	req->val = cpu_to_be64(submode << 24);
+	skb->priority = CPL_PRIORITY_CONTROL;
+
+	cxgb3_ofld_send(c3cn->cdev, skb);
+	return 0;
+}
+
+static int cxgb3i_conn_read_pdu_skb(struct iscsi_conn *conn,
+				    struct sk_buff *skb)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct iscsi_segment *segment = &tcp_conn->in.segment;
+	struct iscsi_hdr *hdr = (struct iscsi_hdr *)tcp_conn->in.hdr_buf;
+	unsigned char *buf = (unsigned char *)hdr;
+	unsigned int offset = sizeof(struct iscsi_hdr);
+	int err;
+
+	cxgb3i_rx_debug("conn 0x%p, skb 0x%p, len %u, flag 0x%x.\n",
+			conn, skb, skb->len, skb_ulp_mode(skb));
+
+	/* read bhs */
+	err = skb_copy_bits(skb, 0, buf, sizeof(struct iscsi_hdr));
+	if (err < 0)
+		return err;
+	segment->copied = sizeof(struct iscsi_hdr);
+	/* read ahs */
+	if (hdr->hlength) {
+		unsigned int ahslen = hdr->hlength << 2;
+		/* Make sure we don't overflow */
+		if (sizeof(*hdr) + ahslen > sizeof(tcp_conn->in.hdr_buf))
+			return -ISCSI_ERR_AHSLEN;
+		err = skb_copy_bits(skb, offset, buf + offset, ahslen);
+		if (err < 0)
+			return err;
+		offset += ahslen;
+	}
+	/* header digest */
+	if (conn->hdrdgst_en)
+		offset += ISCSI_DIGEST_SIZE;
+
+	/* check header digest */
+	segment->status = (conn->hdrdgst_en &&
+			   (skb_ulp_mode(skb) & ULP2_FLAG_HCRC_ERROR)) ?
+	    ISCSI_SEGMENT_DGST_ERR : 0;
+
+	hdr->itt = ntohl(hdr->itt);
+	segment->total_copied = segment->total_size;
+	tcp_conn->in.hdr = hdr;
+	err = iscsi_tcp_hdr_dissect(conn, hdr);
+	if (err)
+		return err;
+
+	if (tcp_conn->in.datalen) {
+		segment = &tcp_conn->in.segment;
+		segment->status = (conn->datadgst_en &&
+				   (skb_ulp_mode(skb) & ULP2_FLAG_DCRC_ERROR)) ?
+		    ISCSI_SEGMENT_DGST_ERR : 0;
+		if (skb_ulp_mode(skb) & ULP2_FLAG_DATA_DDPED) {
+			cxgb3i_ddp_debug("opcode 0x%x, data %u, ddp'ed.\n",
+					 hdr->opcode & ISCSI_OPCODE_MASK,
+					 tcp_conn->in.datalen);
+			segment->total_copied = segment->total_size;
+		} else {
+			cxgb3i_ddp_debug("opcode 0x%x, data %u, not ddp'ed.\n",
+					 hdr->opcode & ISCSI_OPCODE_MASK,
+					 tcp_conn->in.datalen);
+			offset += sizeof(struct cpl_iscsi_hdr_norss);
+		}
+		while (segment->total_copied < segment->total_size) {
+			iscsi_tcp_segment_map(segment, 1);
+			err = skb_copy_bits(skb, offset, segment->data,
+					    segment->size);
+			iscsi_tcp_segment_unmap(segment);
+			if (err)
+				return err;
+			segment->total_copied += segment->size;
+			offset += segment->size;
+
+			if (segment->total_copied < segment->total_size)
+				iscsi_tcp_segment_init_sg(segment,
+							  sg_next(segment->sg),
+							  0);
+		}
+		err = segment->done(tcp_conn, segment);
+	}
+	return err;
+}
+
+static inline void tx_skb_setmode(struct sk_buff *skb, int hcrc, int dcrc)
+{
+	u8 submode = 0;
+	if (hcrc)
+		submode |= 1;
+	if (dcrc)
+		submode |= 2;
+	skb_ulp_mode(skb) = (ULP_MODE_ISCSI << 4) | submode;
+}
+
+int cxgb3i_conn_ulp2_xmit(struct iscsi_conn *conn)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct iscsi_segment *hdr_seg = &tcp_conn->out.segment;
+	struct iscsi_segment *data_seg = &tcp_conn->out.data_segment;
+	unsigned int hdrlen = hdr_seg->total_size;
+	unsigned int datalen = data_seg->total_size;
+	unsigned int padlen = iscsi_padding(datalen);
+	unsigned int copymax = SKB_MAX_HEAD(TX_HEADER_LEN);
+	unsigned int copylen;
+	struct sk_buff *skb;
+	unsigned char *dst;
+	int err = -EAGAIN;
+
+	if (conn->suspend_tx)
+		return 0;
+
+	if (data_seg->data && ((datalen + padlen) < copymax))
+		copylen = hdrlen + datalen + padlen;
+	else
+		copylen = hdrlen;
+
+	/* supports max. 16K pdus, so one skb is enough to hold all the data */
+	skb = alloc_skb(TX_HEADER_LEN + copylen, GFP_ATOMIC);
+	if (!skb)
+		return -EAGAIN;
+
+	skb_reserve(skb, TX_HEADER_LEN);
+	skb_put(skb, copylen);
+	dst = skb->data;
+
+	tx_skb_setmode(skb, conn->hdrdgst_en, datalen ? conn->datadgst_en : 0);
+
+	memcpy(dst, hdr_seg->data, hdrlen);
+	dst += hdrlen;
+
+	if (!datalen)
+		goto send_pdu;
+
+	if (data_seg->data) {
+		/* data is in a linear buffer */
+		if (copylen > hdrlen) {
+			/* data fits in the skb's headroom */
+			memcpy(dst, data_seg->data, datalen);
+			dst += datalen;
+			if (padlen)
+				memset(dst, 0, padlen);
+		} else {
+			unsigned int offset = 0;
+			while (datalen) {
+				struct page *page =
+				    alloc_pages(GFP_ATOMIC, 0);
+				int idx = skb_shinfo(skb)->nr_frags;
+				skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
+
+				if (!page)
+					goto free_skb;
+
+				frag->page = page;
+				frag->page_offset = 0;
+				if (datalen > PAGE_SIZE)
+					frag->size = PAGE_SIZE;
+				else
+					frag->size = datalen;
+				memcpy(page_address(page),
+				       data_seg->data + offset, frag->size);
+
+				skb_shinfo(skb)->nr_frags++;
+				datalen -= frag->size;
+				offset += frag->size;
+			}
+		}
+	} else {
+		struct scatterlist *sg = data_seg->sg;
+		unsigned int offset = data_seg->sg_offset;
+		while (datalen) {
+			int idx = skb_shinfo(skb)->nr_frags;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
+			struct page *pg = sg_page(sg);
+
+			get_page(pg);
+			frag->page = pg;
+			frag->page_offset = offset + sg->offset;
+			frag->size = min(sg->length, datalen);
+
+			offset = 0;
+			skb_shinfo(skb)->nr_frags++;
+			datalen -= frag->size;
+			sg = sg_next(sg);
+		}
+	}
+
+	if (skb_shinfo(skb)->nr_frags) {
+		if (padlen) {
+			int idx = skb_shinfo(skb)->nr_frags;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
+			frag->page = pad_page;
+			frag->page_offset = 0;
+			frag->size = padlen;
+			skb_shinfo(skb)->nr_frags++;
+		}
+		datalen = data_seg->total_size + padlen;
+		skb->data_len += datalen;
+		skb->truesize += datalen;
+		skb->len += datalen;
+	}
+
+send_pdu:
+	err = cxgb3i_c3cn_send_pdus((struct s3_conn *)tcp_conn->sock,
+				    skb, MSG_DONTWAIT | MSG_NOSIGNAL);
+	if (err > 0) {
+		int pdulen = hdrlen + datalen + padlen;
+		if (conn->hdrdgst_en)
+			pdulen += ISCSI_DIGEST_SIZE;
+		if (datalen && conn->datadgst_en)
+			pdulen += ISCSI_DIGEST_SIZE;
+
+		hdr_seg->total_copied = hdr_seg->total_size;
+		if (datalen)
+			data_seg->total_copied = data_seg->total_size;
+		conn->txdata_octets += pdulen;
+		return pdulen;
+	}
+
+free_skb:
+	kfree_skb(skb);
+	if (err < 0 && err != -EAGAIN) {
+		cxgb3i_log_error("conn 0x%p, xmit err %d.\n", conn, err);
+		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
+		return err;
+	}
+	return -EAGAIN;
+}
+
+int cxgb3i_ulp2_init(void)
+{
+	pad_page = alloc_page(GFP_KERNEL);
+	if (!pad_page)
+		return -ENOMEM;
+	memset(page_address(pad_page), 0, PAGE_SIZE);
+	cxgb3i_ddp_page_init();
+	return 0;
+}
+
+void cxgb3i_ulp2_cleanup(void)
+{
+	if (pad_page) {
+		__free_page(pad_page);
+		pad_page = NULL;
+	}
+}
+
+void cxgb3i_conn_pdu_ready(struct s3_conn *c3cn)
+{
+	struct sk_buff *skb;
+	unsigned int read = 0;
+	struct iscsi_conn *conn = c3cn->user_data;
+	int err = 0;
+
+	cxgb3i_rx_debug("cn 0x%p.\n", c3cn);
+
+	read_lock(&c3cn->callback_lock);
+	if (unlikely(!conn || conn->suspend_rx)) {
+		cxgb3i_rx_debug("conn 0x%p, id %d, suspend_rx %d!\n", 
+				conn, conn ? conn->id : 0xFF,
+				conn ? conn->suspend_rx : 0xFF);
+		read_unlock(&c3cn->callback_lock);
+		return;
+	}
+	skb = skb_peek(&c3cn->receive_queue);
+	while (!err && skb) {
+		__skb_unlink(skb, &c3cn->receive_queue);
+		read += skb_ulp_pdulen(skb);
+		err = cxgb3i_conn_read_pdu_skb(conn, skb);
+		__kfree_skb(skb);
+		skb = skb_peek(&c3cn->receive_queue);
+	}
+	read_unlock(&c3cn->callback_lock);
+	if (c3cn) {
+		c3cn->copied_seq += read;
+		cxgb3i_c3cn_rx_credits(c3cn, read);
+	}
+	conn->rxdata_octets += read;
+
+	if (err) {
+		cxgb3i_log_info("conn 0x%p rx failed err %d.\n", conn, err);
+		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
+	}
+}
+
+void cxgb3i_conn_tx_open(struct s3_conn *c3cn)
+{
+	struct iscsi_conn *conn = (struct iscsi_conn *)c3cn->user_data;
+	struct iscsi_tcp_conn *tcp_conn;
+	cxgb3i_tx_debug("cn 0x%p.\n", c3cn);
+	if (conn) {
+		cxgb3i_tx_debug("cn 0x%p, cid %d.\n", c3cn, conn->id);
+		tcp_conn = conn->dd_data;
+		scsi_queue_work(conn->session->host, &conn->xmitwork);
+	}
+}
+
+void cxgb3i_conn_closing(struct s3_conn *c3cn)
+{
+	struct iscsi_conn *conn;
+	read_lock(&c3cn->callback_lock);
+	conn = (struct iscsi_conn *)c3cn->user_data;
+	if (conn && c3cn->state != C3CN_STATE_ESTABLISHED)
+		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
+	read_unlock(&c3cn->callback_lock);
+}
+
+int cxgb3i_adapter_ulp_init(struct cxgb3i_adapter *snic)
+{
+	struct t3cdev *tdev = snic->tdev;
+	struct cxgb3i_ddp_info *ddp = &snic->ddp;
+	struct ulp_iscsi_info uinfo;
+	unsigned int ppmax, bits, max_bits;
+	int i, err;
+
+	spin_lock_init(&ddp->map_lock);
+
+	err = tdev->ctl(tdev, ULP_ISCSI_GET_PARAMS, &uinfo);
+	if (err < 0) {
+		cxgb3i_log_error("%s, failed to get iscsi param err=%d.\n",
+				 tdev->name, err);
+		return err;
+	}
+
+	ppmax = (uinfo.ulimit - uinfo.llimit + 1) >> PPOD_SIZE_SHIFT;
+	max_bits = min(PPOD_IDX_MAX_SIZE,
+		       (32 - sw_tag_idx_bits - sw_tag_age_bits));
+	bits = __ilog2_u32(ppmax) + 1;
+	if (bits > max_bits)
+		bits = max_bits;
+	ppmax = (1 << bits) - 1;
+
+	snic->tx_max_size = uinfo.max_txsz;
+	snic->rx_max_size = uinfo.max_rxsz;
+	cxgb3i_log_debug("snic tx %u, rx %u.\n", snic->tx_max_size,
+			 snic->rx_max_size);
+	snic->tag_format.idx_bits = sw_tag_idx_bits;
+	snic->tag_format.age_bits = sw_tag_age_bits;
+	snic->tag_format.rsvd_bits = bits;
+	snic->tag_format.rsvd_shift = PPOD_IDX_SHIFT;
+	snic->tag_format.rsvd_mask = (1 << snic->tag_format.rsvd_bits) - 1;
+
+	cxgb3i_log_debug("snic nppods %u, rsvd shift %u, bits %u, mask 0x%x.\n",
+			 ppmax, snic->tag_format.rsvd_shift,
+			 snic->tag_format.rsvd_bits,
+			 snic->tag_format.rsvd_mask);
+
+	ddp->map = cxgb3i_alloc_big_mem(ppmax);
+	if (!ddp->map) {
+		cxgb3i_log_warn("snic unable to alloc ddp ppod 0x%u, "
+				"ddp disabled.\n", ppmax);
+		return 0;
+	}
+	ddp->llimit = uinfo.llimit;
+	ddp->ulimit = uinfo.ulimit;
+
+	uinfo.tagmask =
+	    snic->tag_format.rsvd_mask << snic->tag_format.rsvd_shift;
+	for (i = 0; i < ULP2_PGIDX_MAX; i++)
+		uinfo.pgsz_factor[i] = ddp_page_order[i];
+
+	err = tdev->ctl(tdev, ULP_ISCSI_SET_PARAMS, &uinfo);
+	if (err < 0) {
+		cxgb3i_log_warn("snic unable to set iscsi param err=%d, "
+				"ddp disabled.\n", err);
+		goto free_ppod_map;
+	}
+
+	ddp->nppods = ppmax;
+	ddp->idx_last = ppmax;
+
+	tdev->ulp_iscsi = ddp;
+
+	return 0;
+
+free_ppod_map:
+	cxgb3i_free_big_mem(ddp->map);
+	return 0;
+}
+
+void cxgb3i_adapter_ulp_cleanup(struct cxgb3i_adapter *snic)
+{
+	u8 *map = snic->ddp.map;
+	if (map) {
+		snic->tdev->ulp_iscsi = NULL;
+		spin_lock(&snic->lock);
+		snic->ddp.map = NULL;
+		spin_unlock(&snic->lock);
+		cxgb3i_free_big_mem(map);
+	}
+}
diff --git a/drivers/scsi/cxgb3i/cxgb3i_ulp2.h b/drivers/scsi/cxgb3i/cxgb3i_ulp2.h
new file mode 100644
index 0000000..e3f46dc
--- /dev/null
+++ b/drivers/scsi/cxgb3i/cxgb3i_ulp2.h
@@ -0,0 +1,106 @@
+/*
+ * cxgb3i_ulp2.h: Chelsio S3xx iSCSI driver.
+ *
+ * Copyright (c) 2008 Chelsio Communications, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.
+ *
+ * Written by: Karen Xie (kxie@chelsio.com)
+ */
+
+#ifndef __CXGB3I_ULP2_H__
+#define __CXGB3I_ULP2_H__
+
+#define ULP2_PDU_PAYLOAD_DFLT	(16224 - ISCSI_PDU_HEADER_MAX)
+#define PPOD_PAGES_MAX		4
+#define PPOD_PAGES_SHIFT	2	/* 4 pages per pod */
+
+struct pagepod_hdr {
+	u32 vld_tid;
+	u32 pgsz_tag_clr;
+	u32 maxoffset;
+	u32 pgoffset;
+	u64 rsvd;
+};
+
+struct pagepod {
+	struct pagepod_hdr hdr;
+	u64 addr[PPOD_PAGES_MAX + 1];
+};
+
+#define PPOD_SIZE		sizeof(struct pagepod)	/* 64 */
+#define PPOD_SIZE_SHIFT		6
+
+#define PPOD_COLOR_SHIFT	0
+#define PPOD_COLOR_SIZE		6
+#define PPOD_COLOR_MASK		((1 << PPOD_COLOR_SIZE) - 1)
+
+#define PPOD_IDX_SHIFT		PPOD_COLOR_SIZE
+#define PPOD_IDX_MAX_SIZE	24
+
+#define S_PPOD_TID    0
+#define M_PPOD_TID    0xFFFFFF
+#define V_PPOD_TID(x) ((x) << S_PPOD_TID)
+
+#define S_PPOD_VALID    24
+#define V_PPOD_VALID(x) ((x) << S_PPOD_VALID)
+#define F_PPOD_VALID    V_PPOD_VALID(1U)
+
+#define S_PPOD_COLOR    0
+#define M_PPOD_COLOR    0x3F
+#define V_PPOD_COLOR(x) ((x) << S_PPOD_COLOR)
+
+#define S_PPOD_TAG    6
+#define M_PPOD_TAG    0xFFFFFF
+#define V_PPOD_TAG(x) ((x) << S_PPOD_TAG)
+
+#define S_PPOD_PGSZ    30
+#define M_PPOD_PGSZ    0x3
+#define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ)
+
+struct cpl_iscsi_hdr_norss {
+	union opcode_tid ot;
+	u16 pdu_len_ddp;
+	u16 len;
+	u32 seq;
+	u16 urg;
+	u8 rsvd;
+	u8 status;
+};
+
+struct cpl_rx_data_ddp_norss {
+	union opcode_tid ot;
+	u16 urg;
+	u16 len;
+	u32 seq;
+	u32 nxt_seq;
+	u32 ulp_crc;
+	u32 ddp_status;
+};
+
+#define RX_DDP_STATUS_IPP_SHIFT		27	/* invalid pagepod */
+#define RX_DDP_STATUS_TID_SHIFT		26	/* tid mismatch */
+#define RX_DDP_STATUS_COLOR_SHIFT	25	/* color mismatch */
+#define RX_DDP_STATUS_OFFSET_SHIFT	24	/* offset mismatch */
+#define RX_DDP_STATUS_ULIMIT_SHIFT	23	/* ulimit error */
+#define RX_DDP_STATUS_TAG_SHIFT		22	/* tag mismatch */
+#define RX_DDP_STATUS_DCRC_SHIFT	21	/* dcrc error */
+#define RX_DDP_STATUS_HCRC_SHIFT	20	/* hcrc error */
+#define RX_DDP_STATUS_PAD_SHIFT		19	/* pad error */
+#define RX_DDP_STATUS_PPP_SHIFT		18	/* pagepod parity error */
+#define RX_DDP_STATUS_LLIMIT_SHIFT	17	/* llimit error */
+#define RX_DDP_STATUS_DDP_SHIFT		16	/* ddp'able */
+#define RX_DDP_STATUS_PMM_SHIFT		15	/* pagepod mismatch */
+
+#define ULP2_FLAG_DATA_READY		0x1
+#define ULP2_FLAG_DATA_DDPED		0x2
+#define ULP2_FLAG_HCRC_ERROR		0x10
+#define ULP2_FLAG_DCRC_ERROR		0x20
+#define ULP2_FLAG_PAD_ERROR		0x40
+
+void cxgb3i_conn_closing(struct s3_conn *);
+void cxgb3i_conn_pdu_ready(struct s3_conn *c3cn);
+void cxgb3i_conn_tx_open(struct s3_conn *c3cn);
+#endif

^ permalink raw reply related	[flat|nested] 71+ messages in thread

* RE: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-07-30  0:19 [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator Karen Xie
@ 2008-07-30 18:15 ` Shyam_Iyer
  2008-07-30 18:37   ` Karen Xie
  2008-07-30 19:21 ` Roland Dreier
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 71+ messages in thread
From: Shyam_Iyer @ 2008-07-30 18:15 UTC (permalink / raw)
  To: open-iscsi, netdev
  Cc: jgarzik, davem, michaelc, swise, rdreier, daisyc, wenxiong, bhua,
	divy, dm, leedom

+
+static struct iscsi_transport cxgb3i_iscsi_transport = {
+	.owner = THIS_MODULE,
+	.name = "cxgb3i",
+	.caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST
+	    | CAP_DATADGST | CAP_DIGEST_OFFLOAD,

Correct me if I am wrong but possibly CAP_DATA_OFFLOAD needs to be added
here for including PDU Offload capability into the adapter. I am trying
to correlate with the open-iscsi userspace code currently upstream.

Thanks,
Shyam Iyer

^ permalink raw reply	[flat|nested] 71+ messages in thread

* RE: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-07-30 18:15 ` Shyam_Iyer
@ 2008-07-30 18:37   ` Karen Xie
  0 siblings, 0 replies; 71+ messages in thread
From: Karen Xie @ 2008-07-30 18:37 UTC (permalink / raw)
  To: open-iscsi, netdev
  Cc: jgarzik, davem, michaelc, Steve Wise, rdreier, daisyc, wenxiong,
	bhua, Divy Le Ray, Dimitrios Michailidis, Casey Leedom

Hi, Shyam,

The define is already in the cxgb3i branch of the iscsi git tree, so I
did not include it here. 
This particular patch contains the connection offload part which we'd
like to get reviewed by the netdev list.

Thanks.
Karen

-----Original Message-----
From: open-iscsi@googlegroups.com [mailto:open-iscsi@googlegroups.com]
On Behalf Of Shyam_Iyer@Dell.com
Sent: Wednesday, July 30, 2008 11:16 AM
To: open-iscsi@googlegroups.com; netdev@vger.kernel.org
Cc: jgarzik@pobox.com; davem@davemloft.net; michaelc@cs.wisc.edu; Steve
Wise; rdreier@cisco.com; daisyc@us.ibm.com; wenxiong@us.ibm.com;
bhua@us.ibm.com; Divy Le Ray; Dimitrios Michailidis; Casey Leedom
Subject: RE: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator


+
+static struct iscsi_transport cxgb3i_iscsi_transport = {
+	.owner = THIS_MODULE,
+	.name = "cxgb3i",
+	.caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST
+	    | CAP_DATADGST | CAP_DIGEST_OFFLOAD,

Correct me if I am wrong but possibly CAP_DATA_OFFLOAD needs to be added
here for including PDU Offload capability into the adapter. I am trying
to correlate with the open-iscsi userspace code currently upstream.

Thanks,
Shyam Iyer

--~--~---------~--~----~------------~-------~--~----~
You received this message because you are subscribed to the Google
Groups "open-iscsi" group.
To post to this group, send email to open-iscsi@googlegroups.com
To unsubscribe from this group, send email to
open-iscsi+unsubscribe@googlegroups.com
For more options, visit this group at
http://groups.google.com/group/open-iscsi
-~----------~----~----~----~------~----~------~--~---


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-07-30  0:19 [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator Karen Xie
  2008-07-30 18:15 ` Shyam_Iyer
@ 2008-07-30 19:21 ` Roland Dreier
  2008-07-30 19:35 ` Jeff Garzik
  2008-07-31 12:33 ` Boaz Harrosh
  3 siblings, 0 replies; 71+ messages in thread
From: Roland Dreier @ 2008-07-30 19:21 UTC (permalink / raw)
  To: Karen Xie
  Cc: netdev, open-iscsi, jgarzik, davem, michaelc, swise, daisyc,
	wenxiong, bhua, divy, dm, leedom

 > Cxgb3i iSCSI driver

I think a little more detail in the changelog would be
helpful... something like what you had in the 0/1 email (no reason to
put that nice info in an email that will be discarded rather than in the
patch that will be merged).  Maybe:

  The cxgb3i driver provides iscsi acceleration (PDU digest offload and
  payload direct-placement) to the open-iscsi initiator.  It accesses
  the hardware through the cxgb3 module.

I didn't read in detail yet but one quick comments:

 > --- /dev/null
 > +++ b/drivers/scsi/cxgb3i/Kconfig
 > @@ -0,0 +1,6 @@
 > +config SCSI_CXGB3_ISCSI
 > +	tristate "Chelsio S3xx iSCSI support"
 > +	select CHELSIO_T3
 > +	select SCSI_ISCSI_ATTRS
 > +	---help---
 > +	This driver supports iSCSI offload for the Chelsio S3 series devices.

I don't see any other Kconfig changes that hook this Kconfig file into
the build?  Also are there sufficient dependencies to avoid broken
configs here?

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-07-30  0:19 [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator Karen Xie
  2008-07-30 18:15 ` Shyam_Iyer
  2008-07-30 19:21 ` Roland Dreier
@ 2008-07-30 19:35 ` Jeff Garzik
  2008-07-30 21:35   ` Roland Dreier
  2008-07-31  1:24   ` Karen Xie
  2008-07-31 12:33 ` Boaz Harrosh
  3 siblings, 2 replies; 71+ messages in thread
From: Jeff Garzik @ 2008-07-30 19:35 UTC (permalink / raw)
  To: Karen Xie
  Cc: netdev, open-iscsi, davem, michaelc, swise, rdreier, daisyc,
	wenxiong, bhua, divy, dm, leedom, linux-scsi, LKML

Karen Xie wrote:
> Cxgb3i iSCSI driver
> 
> Signed-off-by: Karen Xie <kxie@chelsio.com>
> ---
> 
>  drivers/scsi/cxgb3i/Kconfig          |    6 
>  drivers/scsi/cxgb3i/Makefile         |    5 
>  drivers/scsi/cxgb3i/cxgb3i.h         |  155 +++
>  drivers/scsi/cxgb3i/cxgb3i_init.c    |  109 ++
>  drivers/scsi/cxgb3i/cxgb3i_iscsi.c   |  800 ++++++++++++++
>  drivers/scsi/cxgb3i/cxgb3i_offload.c | 2001 ++++++++++++++++++++++++++++++++++
>  drivers/scsi/cxgb3i/cxgb3i_offload.h |  242 ++++
>  drivers/scsi/cxgb3i/cxgb3i_ulp2.c    |  692 ++++++++++++
>  drivers/scsi/cxgb3i/cxgb3i_ulp2.h    |  106 ++
>  9 files changed, 4116 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/scsi/cxgb3i/Kconfig
>  create mode 100644 drivers/scsi/cxgb3i/Makefile
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i.h
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i_init.c
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i_iscsi.c
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i_offload.c
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i_offload.h
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i_ulp2.c
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i_ulp2.h

Comments:

* SCSI drivers should be submitted via the linux-scsi@vger.kernel.org 
mailing list.

* The driver is clean and readable, well done

* From a networking standpoint, our main concern becomes how this 
interacts with the networking stack.  In particular, I'm concerned based 
on reading the source that this driver uses "TCP port stealing" rather 
than using a totally separate MAC address (and IP).

Stealing a TCP port on an IP/interface already assigned is a common 
solution in this space, but also a flawed one.  Precisely because the 
kernel and applications are unaware of this "special, magic TCP port" 
you open the potential for application problems that are very difficult 
for an admin to diagnose based on observed behavior.

So, additional information on your TCP port usage would be greatly 
appreciated.  Also, how does this interact with IPv6?  Clearly it 
interacts with IPv4...

	Jeff

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-07-30 19:35 ` Jeff Garzik
@ 2008-07-30 21:35   ` Roland Dreier
  2008-08-01  0:51     ` Divy Le Ray
  2008-07-31  1:24   ` Karen Xie
  1 sibling, 1 reply; 71+ messages in thread
From: Roland Dreier @ 2008-07-30 21:35 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Karen Xie, netdev, open-iscsi, davem, michaelc, swise, daisyc,
	wenxiong, bhua, divy, dm, leedom, linux-scsi, LKML

 > * From a networking standpoint, our main concern becomes how this
 > interacts with the networking stack.  In particular, I'm concerned
 > based on reading the source that this driver uses "TCP port stealing"
 > rather than using a totally separate MAC address (and IP).
 > 
 > Stealing a TCP port on an IP/interface already assigned is a common
 > solution in this space, but also a flawed one.  Precisely because the
 > kernel and applications are unaware of this "special, magic TCP port"
 > you open the potential for application problems that are very
 > difficult for an admin to diagnose based on observed behavior.

That's true, but using a separate MAC and IP opens up a bunch of other
operational problems.  I don't think the right answer for iSCSI offload
is clear yet.

 - R.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* RE: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-07-30 19:35 ` Jeff Garzik
  2008-07-30 21:35   ` Roland Dreier
@ 2008-07-31  1:24   ` Karen Xie
  2008-07-31 12:45     ` Boaz Harrosh
  1 sibling, 1 reply; 71+ messages in thread
From: Karen Xie @ 2008-07-31  1:24 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: netdev, open-iscsi, davem, michaelc, Steve Wise, rdreier, daisyc,
	wenxiong, bhua, Divy Le Ray, Dimitrios Michailidis, Casey Leedom,
	linux-scsi, LKML

>Comments:
>
>* SCSI drivers should be submitted via the linux-scsi@vger.kernel.org 
>mailing list.

Will do that. Thanks.

>
>* The driver is clean and readable, well done
>
>* From a networking standpoint, our main concern becomes how this 
>interacts with the networking stack.  In particular, I'm concerned
based 
>on reading the source that this driver uses "TCP port stealing" rather 
>than using a totally separate MAC address (and IP).
>
>Stealing a TCP port on an IP/interface already assigned is a common 
>solution in this space, but also a flawed one.  Precisely because the 
>kernel and applications are unaware of this "special, magic TCP port" 
>you open the potential for application problems that are very difficult

>for an admin to diagnose based on observed behavior.

The collisions between the host stack and iSCSI offload are unlikely
because the iSCSI target server's port is unique (nailed down as 3260).
If an offload card is plugged in, all iSCSI connections to a given
target (i.e., destination/port) are offloaded. There is precedence for
this approach such as RDMA/iWarp.

>
>So, additional information on your TCP port usage would be greatly 
>appreciated.  Also, how does this interact with IPv6?  Clearly it 
>interacts with IPv4...

Currently, IPv6 connection request will not be honored, I will make sure
the checking is added in the resubmission.


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-07-30  0:19 [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator Karen Xie
                   ` (2 preceding siblings ...)
  2008-07-30 19:35 ` Jeff Garzik
@ 2008-07-31 12:33 ` Boaz Harrosh
  3 siblings, 0 replies; 71+ messages in thread
From: Boaz Harrosh @ 2008-07-31 12:33 UTC (permalink / raw)
  To: open-iscsi
  Cc: netdev, jgarzik, davem, michaelc, swise, rdreier, daisyc,
	wenxiong, bhua, divy, dm, leedom, linux-scsi

Karen Xie wrote:
> Cxgb3i iSCSI driver
> 

Sorry for not following the cxgb3i thread , but I would like a little
description about this HW. What are it's capabilities over a regular
NIC, and in what way it is special.

Also I would like some documentation like:

  This card does ....
  It is setup the same as iscsi_tcp ....

  Additional parameters not available with iscsi_tcp are:
    ...

  iscsi_tcp Parameters not available:
    ...

  Special System considerations ...

Also is there a website that has more Document ion/information

(More comments in code)

> Signed-off-by: Karen Xie <kxie@chelsio.com>
> ---
> 
>  drivers/scsi/cxgb3i/Kconfig          |    6 
>  drivers/scsi/cxgb3i/Makefile         |    5 
>  drivers/scsi/cxgb3i/cxgb3i.h         |  155 +++
>  drivers/scsi/cxgb3i/cxgb3i_init.c    |  109 ++
>  drivers/scsi/cxgb3i/cxgb3i_iscsi.c   |  800 ++++++++++++++
>  drivers/scsi/cxgb3i/cxgb3i_offload.c | 2001 ++++++++++++++++++++++++++++++++++
>  drivers/scsi/cxgb3i/cxgb3i_offload.h |  242 ++++
>  drivers/scsi/cxgb3i/cxgb3i_ulp2.c    |  692 ++++++++++++
>  drivers/scsi/cxgb3i/cxgb3i_ulp2.h    |  106 ++
>  9 files changed, 4116 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/scsi/cxgb3i/Kconfig
>  create mode 100644 drivers/scsi/cxgb3i/Makefile
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i.h
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i_init.c
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i_iscsi.c
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i_offload.c
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i_offload.h
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i_ulp2.c
>  create mode 100644 drivers/scsi/cxgb3i/cxgb3i_ulp2.h
> 
> 
> diff --git a/drivers/scsi/cxgb3i/Kconfig b/drivers/scsi/cxgb3i/Kconfig
> new file mode 100644
> index 0000000..2762814
> --- /dev/null
> +++ b/drivers/scsi/cxgb3i/Kconfig
> @@ -0,0 +1,6 @@
> +config SCSI_CXGB3_ISCSI
> +	tristate "Chelsio S3xx iSCSI support"
> +	select CHELSIO_T3
> +	select SCSI_ISCSI_ATTRS
> +	---help---
> +	This driver supports iSCSI offload for the Chelsio S3 series devices.
> diff --git a/drivers/scsi/cxgb3i/Makefile b/drivers/scsi/cxgb3i/Makefile
> new file mode 100644
> index 0000000..8c8a894
> --- /dev/null
> +++ b/drivers/scsi/cxgb3i/Makefile
> @@ -0,0 +1,5 @@
> +EXTRA_CFLAGS += -I$(TOPDIR)/drivers/net/cxgb3
> +
> +cxgb3i-y := cxgb3i_init.o cxgb3i_iscsi.o cxgb3i_ulp2.o cxgb3i_offload.o
> +
> +obj-$(CONFIG_SCSI_CXGB3_ISCSI) += cxgb3i.o
> diff --git a/drivers/scsi/cxgb3i/cxgb3i.h b/drivers/scsi/cxgb3i/cxgb3i.h
> new file mode 100644
> index 0000000..3c44c3c
> --- /dev/null
> +++ b/drivers/scsi/cxgb3i/cxgb3i.h
> @@ -0,0 +1,155 @@
> +/*
> + * cxgb3i.h: Chelsio S3xx iSCSI driver.
> + *
> + * Copyright (c) 2008 Chelsio Communications, Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation.
> + *
> + * Written by: Karen Xie (kxie@chelsio.com)
> + */
> +
> +#ifndef __CXGB3I_H__
> +#define __CXGB3I_H__
> +
> +#include <linux/module.h>
> +#include <linux/moduleparam.h>
> +#include <linux/errno.h>
> +#include <linux/types.h>
> +#include <linux/list.h>
> +#include <linux/netdevice.h>
> +#include <linux/scatterlist.h>
> +
> +/* from cxgb3 LLD */
> +#include "common.h"
> +#include "t3_cpl.h"
> +#include "t3cdev.h"
> +#include "cxgb3_ctl_defs.h"
> +#include "cxgb3_offload.h"
> +#include "firmware_exports.h"
> +#include "cxgb3i_offload.h"
> +

#include "../iscsi_tcp.h"

> +#define CXGB3I_SCSI_QDEPTH_DFLT	128
> +#define ISCSI_PDU_HEADER_MAX	(56 + 256) /* bhs + digests + ahs */

#define ISCSI_PDU_HEADER_MAX sizeof(struct iscsi_hdr_buff)

> +
> +struct cxgb3i_adapter;
> +struct cxgb3i_hba;
> +struct cxgb3i_endpoint;
> +
> +/**
> + * struct cxgb3i_tag_format - cxgb3i ulp tag for steering pdu payload
> + *
> + * @rsvd_bits:	# of bits used by h/w
> + * @rsvd_shift:	shift left
> + * @rsvd_mask:  bit mask
> + *
> + */
> +struct cxgb3i_tag_format {
> +	unsigned char idx_bits;
> +	unsigned char age_bits;
> +	unsigned char rsvd_bits;
> +	unsigned char rsvd_shift;
> +	u32 rsvd_mask;
> +};
> +
> +/**
> + * struct cxgb3i_ddp_info - cxgb3i direct data placement for pdu payload
> + *
> + * @llimit:	lower bound of the page pod memory
> + * @ulimit:	upper bound of the page pod memory
> + * @nppods:	# of page pod entries
> + * @idx_last:	page pod entry last used
> + * @map_lock:	lock to synchonize access to the page pod map
> + * @map:	page pod map
> + */
> +struct cxgb3i_ddp_info {
> +	unsigned int llimit;
> +	unsigned int ulimit;
> +	unsigned int nppods;
> +	unsigned int idx_last;
> +	spinlock_t map_lock;
> +	u8 *map;
> +};
> +
> +struct cxgb3i_hba {
> +	struct cxgb3i_adapter *snic;
> +	struct net_device *ndev;
> +	struct Scsi_Host *shost;
> +
> +	rwlock_t cconn_rwlock;
> +	struct list_head cconn_list;
> +};
> +
> +struct cxgb3i_adapter {
> +	struct list_head list_head;
> +	spinlock_t lock;
> +	struct t3cdev *tdev;
> +	struct pci_dev *pdev;
> +	unsigned char hba_cnt;
> +	struct cxgb3i_hba *hba[MAX_NPORTS];
> +
> +	unsigned int tx_max_size;
> +	unsigned int rx_max_size;
> +
> +	struct cxgb3i_tag_format tag_format;
> +	struct cxgb3i_ddp_info ddp;
> +};
> +
> +struct cxgb3i_conn {
> +	struct list_head list_head;
> +
> +	struct cxgb3i_endpoint *cep;
> +	struct iscsi_conn *conn;
> +	struct cxgb3i_hba *hba;
> +};
> +
> +struct cxgb3i_endpoint {
> +	struct s3_conn *c3cn;
> +	struct cxgb3i_hba *hba;
> +	struct cxgb3i_conn *cconn;
> +};
> +
> +int cxgb3i_iscsi_init(void);
> +void cxgb3i_iscsi_cleanup(void);
> +
> +struct cxgb3i_adapter *cxgb3i_adapter_find_by_tdev(struct t3cdev *);
> +struct cxgb3i_adapter *cxgb3i_adapter_add(struct t3cdev *);
> +void cxgb3i_adapter_remove(struct cxgb3i_adapter *);
> +int cxgb3i_adapter_ulp_init(struct cxgb3i_adapter *);
> +void cxgb3i_adapter_ulp_cleanup(struct cxgb3i_adapter *);
> +
> +struct cxgb3i_hba *cxgb3i_hba_find_by_netdev(struct net_device *);
> +struct cxgb3i_hba *cxgb3i_hba_host_add(struct cxgb3i_adapter *,
> +				       struct net_device *);
> +void cxgb3i_hba_host_remove(struct cxgb3i_hba *);
> +
> +void cxgb3i_hba_conn_add(struct cxgb3i_conn *, struct cxgb3i_hba *);
> +void cxgb3i_hba_conn_remove(struct cxgb3i_conn *);
> +
> +int cxgb3i_ulp2_init(void);
> +void cxgb3i_ulp2_cleanup(void);
> +int cxgb3i_conn_ulp_setup(struct cxgb3i_conn *, int, int);
> +
> +void cxgb3i_ddp_tag_release(struct cxgb3i_adapter *, u32,
> +			    struct scatterlist *, unsigned int);
> +u32 cxgb3i_ddp_tag_reserve(struct cxgb3i_adapter *, unsigned int,
> +			   u32, unsigned int, struct scatterlist *,
> +			   unsigned int);
> +static inline void cxgb3i_parse_tag(struct cxgb3i_tag_format *format,
> +				    u32 tag, u32 *rsvd_bits, u32 *sw_bits)
> +{
> +	if (rsvd_bits)
> +		*rsvd_bits = (tag >> format->rsvd_shift) & format->rsvd_mask;
> +	if (sw_bits) {
> +		*sw_bits = (tag >> (format->rsvd_shift + format->rsvd_bits))
> +		    << format->rsvd_shift;
> +		*sw_bits |= tag & ((1 << format->rsvd_shift) - 1);
> +	}
> +}
> +
> +int cxgb3i_conn_ulp2_xmit(struct iscsi_conn *);
> +
> +void cxgb3i_display_byte_string(char *, unsigned char *, int, int);
> +
> +#endif
> diff --git a/drivers/scsi/cxgb3i/cxgb3i_init.c b/drivers/scsi/cxgb3i/cxgb3i_init.c
> new file mode 100644
> index 0000000..1c91bb0
> --- /dev/null
> +++ b/drivers/scsi/cxgb3i/cxgb3i_init.c
> @@ -0,0 +1,109 @@
> +/* cxgb3i_init.c: Chelsio S3xx iSCSI driver.
> + *
> + * Copyright (c) 2008 Chelsio Communications, Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation.
> + *
> + * Written by: Karen Xie (kxie@chelsio.com)
> + */
> +
> +#include "cxgb3i.h"
> +
> +#define DRV_MODULE_NAME         "cxgb3i"
> +#define DRV_MODULE_VERSION      "1.0.0"
> +#define DRV_MODULE_RELDATE      "May 1, 2008"
> +
> +static char version[] =
> +    "Chelsio S3xx iSCSI Driver " DRV_MODULE_NAME
> +    " v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
> +
> +MODULE_AUTHOR("Karen Xie <kxie@chelsio.com>");
> +MODULE_DESCRIPTION("Chelsio S3xx iSCSI Driver");
> +MODULE_LICENSE("GPL");
> +MODULE_VERSION(DRV_MODULE_VERSION);
> +
> +static void open_s3_dev(struct t3cdev *);
> +static void close_s3_dev(struct t3cdev *);
> +cxgb3_cpl_handler_func cxgb3i_cpl_handlers[NUM_CPL_CMDS];
> +struct cxgb3_client t3c_client = {
> +	.name = "iscsi_cxgb3",
> +	.handlers = cxgb3i_cpl_handlers,
> +	.add = open_s3_dev,
> +	.remove = close_s3_dev,
> +};
> +
> +/**
> + * open_s3_dev - register with cxgb3 LLD
> + * @t3dev	cxgb3 adapter instance
> + */
> +static void open_s3_dev(struct t3cdev *t3dev)
> +{
> +	static int vers_printed;
> +
> +	if (!vers_printed) {
> +		printk(KERN_INFO "%s", version);
> +		vers_printed = 1;
> +	}
> +
> +	cxgb3i_log_debug("open cxgb3 %s.\n", t3dev->name);
> +
> +	cxgb3i_sdev_add(t3dev, &t3c_client);
> +	cxgb3i_adapter_add(t3dev);
> +}
> +
> +/**
> + * close_s3_dev - de-register with cxgb3 LLD
> + * @t3dev	cxgb3 adapter instance
> + */
> +static void close_s3_dev(struct t3cdev *t3dev)
> +{
> +	struct cxgb3i_adapter *snic = cxgb3i_adapter_find_by_tdev(t3dev);
> +	cxgb3i_log_debug("close cxgb3 %s.\n", t3dev->name);
> +	if (snic)
> +		cxgb3i_adapter_remove(snic);
> +	cxgb3i_sdev_remove(t3dev);
> +}
> +
> +/**
> + * cxgb3i_init_module - module init entry point
> + *
> + * initialize any driver wide global data structures and register itself
> + *	with the cxgb3 module
> + */
> +static int __init cxgb3i_init_module(void)
> +{
> +	int err;
> +
> +	err = cxgb3i_sdev_init(cxgb3i_cpl_handlers);
> +	if (err < 0)
> +		return err;
> +
> +	err = cxgb3i_iscsi_init();
> +	if (err < 0)
> +		return err;
> +
> +	err = cxgb3i_ulp2_init();
> +	if (err < 0)
> +		return err;
> +
> +	cxgb3_register_client(&t3c_client);
> +	return 0;
> +}
> +
> +/**
> + * cxgb3i_exit_module - module cleanup/exit entry point
> + *
> + * go through the driver hba list and for each hba, release any resource held.
> + *	and unregisters iscsi transport and the cxgb3 module
> + */
> +static void __exit cxgb3i_exit_module(void)
> +{
> +	cxgb3_unregister_client(&t3c_client);
> +	cxgb3i_ulp2_cleanup();
> +	cxgb3i_iscsi_cleanup();
> +}
> +
> +module_init(cxgb3i_init_module);
> +module_exit(cxgb3i_exit_module);
> diff --git a/drivers/scsi/cxgb3i/cxgb3i_iscsi.c b/drivers/scsi/cxgb3i/cxgb3i_iscsi.c
> new file mode 100644
> index 0000000..ed3d340
> --- /dev/null
> +++ b/drivers/scsi/cxgb3i/cxgb3i_iscsi.c
> @@ -0,0 +1,800 @@
> +/* cxgb3i_iscsi.c: Chelsio S3xx iSCSI driver.
> + *
> + * Copyright (c) 2008 Chelsio Communications, Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation.
> + *
> + * Written by: Karen Xie (kxie@chelsio.com)
> + */
> +
> +#include <net/tcp.h>
> +#include <scsi/scsi_cmnd.h>
> +#include <scsi/scsi_device.h>
> +#include <scsi/scsi_eh.h>
> +#include <scsi/scsi_host.h>
> +#include <scsi/scsi.h>
> +#include <scsi/iscsi_proto.h>
> +#include <scsi/libiscsi.h>
> +#include <scsi/scsi_transport_iscsi.h>
> +#include <linux/crypto.h>
> +#include "../iscsi_tcp.h"
> +
> +#include "cxgb3i.h"
> +
> +static struct scsi_transport_template *cxgb3i_scsi_transport;
> +static struct scsi_host_template cxgb3i_host_template;
> +static struct iscsi_transport cxgb3i_iscsi_transport;
> +
> +static LIST_HEAD(cxgb3i_snic_list);
> +static DEFINE_RWLOCK(cxgb3i_snic_rwlock);
> +
> +/**
> + * cxgb3i_adapter_add - initialize a s3 adapter structure and any h/w settings
> + *	necessary
> + * @snic:	pointer to adapter instance
> + */
> +struct cxgb3i_adapter *cxgb3i_adapter_add(struct t3cdev *t3dev)
> +{
> +	struct cxgb3i_adapter *snic;
> +	struct adapter *adapter = tdev2adap(t3dev);
> +	int i;
> +
> +	snic = kzalloc(sizeof(*snic), GFP_KERNEL);
> +	if (!snic) {
> +		cxgb3i_log_debug("cxgb3 %s, OOM.\n", t3dev->name);
> +		return NULL;
> +	}
> +
> +	spin_lock_init(&snic->lock);
> +	snic->tdev = t3dev;
> +	snic->pdev = adapter->pdev;
> +
> +	if (cxgb3i_adapter_ulp_init(snic))
> +		goto free_snic;
> +
> +	for_each_port(adapter, i) {
> +		snic->hba[i] = cxgb3i_hba_host_add(snic, adapter->port[i]);
> +		if (!snic->hba[i])
> +			goto ulp_cleanup;
> +	}
> +	snic->hba_cnt = adapter->params.nports;
> +
> +	/* add to the list */
> +	write_lock(&cxgb3i_snic_rwlock);
> +	list_add_tail(&snic->list_head, &cxgb3i_snic_list);
> +	write_unlock(&cxgb3i_snic_rwlock);
> +
> +	return snic;
> +
> +ulp_cleanup:
> +	cxgb3i_adapter_ulp_cleanup(snic);
> +free_snic:
> +	kfree(snic);
> +	return NULL;
> +}
> +
> +/**
> + * cxgb3i_snic_cleanup - release all the resources held and cleanup any h/w
> + *	settings necessary
> + * @snic:	pointer to adapter instance
> + */
> +void cxgb3i_adapter_remove(struct cxgb3i_adapter *snic)
> +{
> +	int i;
> +
> +	/* remove from the list */
> +	write_lock(&cxgb3i_snic_rwlock);
> +	list_del(&snic->list_head);
> +	write_unlock(&cxgb3i_snic_rwlock);
> +
> +	for (i = 0; i < snic->hba_cnt; i++) {
> +		if (snic->hba[i]) {
> +			cxgb3i_hba_host_remove(snic->hba[i]);
> +			snic->hba[i] = NULL;
> +		}
> +	}
> +
> +	/* release ddp resources */
> +	cxgb3i_adapter_ulp_cleanup(snic);
> +	kfree(snic);
> +}
> +
> +struct cxgb3i_adapter *cxgb3i_adapter_find_by_tdev(struct t3cdev *t3dev)
> +{
> +	struct cxgb3i_adapter *snic;
> +
> +	read_lock(&cxgb3i_snic_rwlock);
> +	list_for_each_entry(snic, &cxgb3i_snic_list, list_head) {
> +		if (snic->tdev == t3dev) {
> +			read_unlock(&cxgb3i_snic_rwlock);
> +			return snic;
> +		}
> +	}
> +	read_unlock(&cxgb3i_snic_rwlock);
> +
> +	return NULL;
> +}
> +
> +struct cxgb3i_hba *cxgb3i_hba_find_by_netdev(struct net_device *ndev)
> +{
> +	struct cxgb3i_adapter *snic;
> +	int i;
> +
> +	read_lock(&cxgb3i_snic_rwlock);
> +	list_for_each_entry(snic, &cxgb3i_snic_list, list_head) {
> +		for (i = 0; i < snic->hba_cnt; i++) {
> +			if (snic->hba[i]->ndev == ndev) {
> +				read_unlock(&cxgb3i_snic_rwlock);
> +				return (snic->hba[i]);
> +			}
> +		}
> +	}
> +	read_unlock(&cxgb3i_snic_rwlock);
> +	return NULL;
> +}
> +
> +void cxgb3i_hba_conn_add(struct cxgb3i_conn *cconn, struct cxgb3i_hba *hba)
> +{
> +	cconn->hba = hba;
> +	write_lock(&hba->cconn_rwlock);
> +	list_add_tail(&cconn->list_head, &hba->cconn_list);
> +	write_unlock(&hba->cconn_rwlock);
> +}
> +
> +void cxgb3i_hba_conn_remove(struct cxgb3i_conn *cconn)
> +{
> +	struct cxgb3i_hba *hba = cconn->hba;
> +
> +	if (hba) {
> +		write_lock(&hba->cconn_rwlock);
> +		list_del(&cconn->list_head);
> +		write_unlock(&hba->cconn_rwlock);
> +	}
> +}
> +
> +struct cxgb3i_hba *cxgb3i_hba_host_add(struct cxgb3i_adapter *snic,
> +				       struct net_device *ndev)
> +{
> +	struct cxgb3i_hba *hba;
> +	struct Scsi_Host *shost;
> +	int err;
> +
> +	shost = iscsi_host_alloc(&cxgb3i_host_template,
> +				 sizeof(struct cxgb3i_hba),
> +				 CXGB3I_SCSI_QDEPTH_DFLT);
> +	if (!shost) {
> +		cxgb3i_log_info("iscsi_host_alloc failed.\n");
> +		return NULL;
> +	}
> +
> +	shost->transportt = cxgb3i_scsi_transport;
> +	shost->max_lun = 512;
> +	shost->max_id = 0;
> +	shost->max_channel = 0;
> +	shost->max_cmd_len = 16;

It looks like code supports AHS, is that some hardware
limitation? I could not find this limitation in submitted
code. Maybe = SCSI_MAX_VARLEN_CDB_SIZE.

> +
> +	hba = iscsi_host_priv(shost);
> +	INIT_LIST_HEAD(&hba->cconn_list);
> +	rwlock_init(&hba->cconn_rwlock);
> +	hba->snic = snic;
> +	hba->ndev = ndev;
> +	hba->shost = shost;
> +
> +	pci_dev_get(snic->pdev);
> +	err = iscsi_host_add(shost, &snic->pdev->dev);
> +	if (err) {
> +		cxgb3i_log_info("iscsi_host_add failed.\n");
> +		goto pci_dev_put;
> +	}
> +
> +	cxgb3i_log_debug("shost 0x%p, hba 0x%p, no %u.\n",
> +			 shost, hba, shost->host_no);
> +
> +	return hba;
> +
> +pci_dev_put:
> +	pci_dev_put(snic->pdev);
> +	scsi_host_put(shost);
> +	return NULL;
> +}
> +
> +void cxgb3i_hba_host_remove(struct cxgb3i_hba *hba)
> +{
> +	if (hba->shost) {
> +		cxgb3i_log_debug("shost 0x%p, hba 0x%p, no %u.\n",
> +				 hba->shost, hba, hba->shost->host_no);
> +		iscsi_host_remove(hba->shost);
> +		pci_dev_put(hba->snic->pdev);
> +		/* cleanup connections ? */
> +		iscsi_host_free(hba->shost);
> +	}
> +}
> +
> +/**
> + * cxgb3i_ep_connect - establish TCP connection to target portal
> + * @dst_addr:		target IP address
> + * @non_blocking:	blocking or non-blocking call
> + *
> + * Initiates a TCP/IP connection to the dst_addr
> + */
> +static struct iscsi_endpoint *cxgb3i_ep_connect(struct sockaddr *dst_addr,
> +						int non_blocking)
> +{
> +	struct iscsi_endpoint *ep;
> +	struct cxgb3i_endpoint *cep;
> +	struct cxgb3i_hba *hba;
> +	struct s3_conn *c3cn;
> +	int err;
> +
> +	c3cn = cxgb3i_c3cn_create();
> +	if (!c3cn) {
> +		cxgb3i_log_info("ep connect OOM.\n");
> +		return NULL;
> +	}
> +
> +	err = cxgb3i_c3cn_connect(c3cn, (struct sockaddr_in *)dst_addr);
> +	if (err < 0) {
> +		cxgb3i_log_info("ep connect failed.\n");
> +		goto release_conn;
> +	}
> +	hba = cxgb3i_hba_find_by_netdev(c3cn->dst_cache->dev);
> +	if (!hba) {
> +		cxgb3i_log_info("NOT going through cxgbi device.\n");
> +		goto release_conn;
> +	}
> +
> +	ep = iscsi_create_endpoint(sizeof(*cep));
> +	if (!ep) {
> +		cxgb3i_log_info("iscsi alloc ep, OOM.\n");
> +		goto release_conn;
> +	}
> +	cep = ep->dd_data;
> +	cep->c3cn = c3cn;
> +	cep->hba = hba;
> +
> +	cxgb3i_log_debug("iscsi_ep 0x%p, cxgb_ep 0x%p, hba 0x%p, c3cn 0x%p.\n",
> +			  ep, cep, hba, c3cn);
> +	return ep;
> +
> +release_conn:
> +	c3cn_release(c3cn);
> +	return NULL;
> +}
> +
> +/**
> + * cxgb3i_ep_poll - polls for TCP connection establishement
> + * @ep:		TCP connection (endpoint) handle
> + * @timeout_ms:	timeout value in milli secs
> + *
> + * polls for TCP connect request to complete
> + */
> +static int cxgb3i_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
> +{
> +	cxgb3i_log_debug("iscsi_ep 0x%p, timeout_ms %d.\n", ep, timeout_ms);
> +	return 1;
> +}
> +
> +/**
> + * cxgb3i_ep_disconnect - teardown TCP connection
> + * @ep:		TCP connection (endpoint) handle
> + *
> + * teardown TCP connection
> + */
> +static void cxgb3i_ep_disconnect(struct iscsi_endpoint *ep)
> +{
> +	struct cxgb3i_endpoint *cep = (struct cxgb3i_endpoint *)ep->dd_data;
> +	struct cxgb3i_conn *cconn = cep->cconn;
> +
> +	cxgb3i_log_debug("ep 0x%p, cep 0x%p.\n", ep, cep);
> +
> +	if (cconn && cconn->conn) {
> +		struct iscsi_conn *conn = cconn->conn;
> +		struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
> +		write_lock_bh(&cep->c3cn->callback_lock);
> +		cep->c3cn->user_data = NULL;
> +		set_bit(ISCSI_SUSPEND_BIT, &conn->suspend_rx);
> +		cconn->cep = NULL;
> +		tcp_conn->sock = NULL;
> +		write_unlock_bh(&cep->c3cn->callback_lock);
> +	}
> +
> +	c3cn_release(cep->c3cn);
> +	iscsi_destroy_endpoint(ep);
> +}
> +
> +/**
> + * cxgb3i_session_create - create a new iscsi session
> + * @cmds_max:		max # of commands
> + * @qdepth:		scsi queue depth
> + * @initial_cmdsn:	initial iscsi CMDSN for this session
> + * @host_no:		pointer to return host no
> + *
> + * Creates a new iSCSI session
> + */
> +static struct iscsi_cls_session *cxgb3i_session_create(struct iscsi_endpoint
> +						       *ep, uint16_t cmds_max,
> +						       uint16_t qdepth,
> +						       uint32_t initial_cmdsn,
> +						       uint32_t *host_no)
> +{
> +	struct cxgb3i_endpoint *cep;
> +	struct cxgb3i_hba *hba;
> +	struct Scsi_Host *shost;
> +	struct iscsi_cls_session *cls_session;
> +	struct iscsi_session *session;
> +	int i;
> +
> +	if (!ep) {
> +		cxgb3i_log_error("%s, missing endpoint.\n", __func__);
> +		return NULL;
> +	}
> +
> +	cep = (struct cxgb3i_endpoint *)ep->dd_data;
> +	hba = cep->hba;
> +	shost = hba->shost;
> +	cxgb3i_log_debug("ep 0x%p, cep 0x%p, hba 0x%p.\n", ep, cep, hba);
> +	BUG_ON(hba != iscsi_host_priv(shost));
> +
> +	*host_no = shost->host_no;
> +
> +	cls_session = iscsi_session_setup(&cxgb3i_iscsi_transport, shost,
> +					  cmds_max,
> +					  sizeof(struct iscsi_tcp_task),
> +					  initial_cmdsn, ISCSI_MAX_TARGET);
> +	if (!cls_session)
> +		return NULL;
> +
> +	session = cls_session->dd_data;
> +
> +	for (i = 0; i < session->cmds_max; i++) {
> +		struct iscsi_task *task = session->cmds[i];
> +		struct iscsi_tcp_task *tcp_task = task->dd_data;
> +
> +		task->hdr = &tcp_task->hdr.cmd_hdr;
> +		task->hdr_max = sizeof(tcp_task->hdr) - ISCSI_DIGEST_SIZE;

This little code will signal libiscsi to support AHS. By making
room at hdr_max

> +	}
> +
> +	if (iscsi_r2tpool_alloc(session))
> +		goto remove_session;
> +
> +	return cls_session;
> +
> +remove_session:
> +	iscsi_session_teardown(cls_session);
> +	return NULL;
> +}
> +
> +/**
> + * cxgb3i_session_destroy - destroys iscsi session
> + * @cls_session:	pointer to iscsi cls session
> + *
> + * Destroys an iSCSI session instance and releases its all resources held
> + */
> +static void cxgb3i_session_destroy(struct iscsi_cls_session *cls_session)
> +{
> +	cxgb3i_log_debug("sess 0x%p.\n", cls_session);
> +	iscsi_r2tpool_free(cls_session->dd_data);
> +	iscsi_session_teardown(cls_session);
> +}
> +
> +/**
> + * cxgb3i_conn_create - create iscsi connection instance
> + * @cls_session:	pointer to iscsi cls session
> + * @cid:		iscsi cid
> + *
> + * Creates a new iSCSI connection instance for a given session
> + */
> +static struct iscsi_cls_conn *cxgb3i_conn_create(struct iscsi_cls_session
> +						 *cls_session, uint32_t cid)
> +{
> +	struct iscsi_cls_conn *cls_conn;
> +	struct iscsi_conn *conn;
> +	struct iscsi_tcp_conn *tcp_conn;
> +	struct cxgb3i_conn *cconn;
> +
> +	cxgb3i_log_debug("sess 0x%p, cid %u.\n", cls_session, cid);
> +
> +	cls_conn = iscsi_conn_setup(cls_session,
> +				    sizeof(*tcp_conn) + sizeof(*cconn), cid);
> +	if (!cls_conn)
> +		return NULL;
> +	conn = cls_conn->dd_data;
> +
> +	conn->max_xmit_dlength = conn->max_recv_dlength = 16224 - 56 - 256;

Why not ULP2_PDU_PAYLOAD_DFLT (Which is defined but never used)

> +
> +	tcp_conn = conn->dd_data;
> +	tcp_conn->iscsi_conn = conn;
> +
> +	cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
> +	cconn->conn = conn;
> +
> +	return cls_conn;
> +}
> +
> +/**
> + * cxgb3i_conn_bind - binds iscsi sess, conn and endpoint together
> + * @cls_session:	pointer to iscsi cls session
> + * @cls_conn:		pointer to iscsi cls conn
> + * @transport_eph:	64-bit EP handle
> + * @is_leading:		leading connection on this session?
> + *
> + * Binds together an iSCSI session, an iSCSI connection and a
> + *	TCP connection. This routine returns error code if the TCP
> + *	connection does not belong on the device iSCSI sess/conn is bound
> + */
> +
> +static int cxgb3i_conn_bind(struct iscsi_cls_session *cls_session,
> +			    struct iscsi_cls_conn *cls_conn,
> +			    uint64_t transport_eph, int is_leading)
> +{
> +	struct iscsi_conn *conn = cls_conn->dd_data;
> +	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
> +	struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
> +	struct iscsi_endpoint *ep;
> +	struct cxgb3i_endpoint *cep;
> +	struct s3_conn *c3cn;
> +	int err;
> +
> +	ep = iscsi_lookup_endpoint(transport_eph);
> +	if (!ep)
> +		return -EINVAL;
> +
> +	cxgb3i_log_debug("ep 0x%p, cls sess 0x%p, cls conn 0x%p.\n",
> +			 ep, cls_session, cls_conn);
> +
> +	err = iscsi_conn_bind(cls_session, cls_conn, is_leading);
> +	if (err)
> +		return -EINVAL;
> +
> +	cep = (struct cxgb3i_endpoint *)ep->dd_data;
> +	c3cn = cep->c3cn;
> +
> +	read_lock(&c3cn->callback_lock);
> +	tcp_conn->sock = (struct socket *)c3cn;
> +	c3cn->user_data = conn;
> +	read_unlock(&c3cn->callback_lock);
> +
> +	cconn->hba = cep->hba;
> +	cconn->cep = cep;
> +	cep->cconn = cconn;
> +
> +	conn->max_recv_dlength = cconn->hba->snic->rx_max_size - ISCSI_PDU_HEADER_MAX;
> +	conn->max_xmit_dlength = cconn->hba->snic->tx_max_size - ISCSI_PDU_HEADER_MAX;
> +
> +	spin_lock_bh(&conn->session->lock);
> +	sprintf(conn->portal_address, NIPQUAD_FMT,
> +		NIPQUAD(c3cn->daddr.sin_addr.s_addr));
> +	conn->portal_port = ntohs(c3cn->daddr.sin_port);
> +	spin_unlock_bh(&conn->session->lock);
> +
> +	iscsi_tcp_hdr_recv_prep(tcp_conn);
> +
> +	return 0;
> +}
> +
> +/**
> + * cxgb3i_conn_flush - flush tx
> + * @conn:	pointer to iscsi conn
> + */
> +static int cxgb3i_conn_flush(struct iscsi_conn *conn)
> +{
> +	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
> +	struct iscsi_segment *segment = &tcp_conn->out.segment;
> +
> +	if (segment->total_copied < segment->total_size)
> +		return cxgb3i_conn_ulp2_xmit(conn);
> +	return 0;
> +}
> +
> +/**
> + * cxgb3i_conn_get_param - return iscsi connection parameter to caller
> + * @cls_conn:	pointer to iscsi cls conn
> + * @param:	parameter type identifier
> + * @buf:	buffer pointer
> + *
> + * returns iSCSI connection parameters
> + */
> +static int cxgb3i_conn_get_param(struct iscsi_cls_conn *cls_conn,
> +				 enum iscsi_param param, char *buf)
> +{
> +	struct iscsi_conn *conn = cls_conn->dd_data;
> +	int len;
> +
> +	cxgb3i_log_debug("cls_conn 0x%p, param %d.\n", cls_conn, param);
> +
> +	switch (param) {
> +	case ISCSI_PARAM_CONN_PORT:
> +		spin_lock_bh(&conn->session->lock);
> +		len = sprintf(buf, "%hu\n", conn->portal_port);
> +		spin_unlock_bh(&conn->session->lock);
> +		break;
> +	case ISCSI_PARAM_CONN_ADDRESS:
> +		spin_lock_bh(&conn->session->lock);
> +		len = sprintf(buf, "%s\n", conn->portal_address);
> +		spin_unlock_bh(&conn->session->lock);
> +		break;
> +	default:
> +		return iscsi_conn_get_param(cls_conn, param, buf);
> +	}
> +
> +	return len;
> +}
> +
> +static int cxgb3i_conn_set_param(struct iscsi_cls_conn *cls_conn,
> +				 enum iscsi_param param, char *buf, int buflen)
> +{
> +	struct iscsi_conn *conn = cls_conn->dd_data;
> +	struct iscsi_session *session = conn->session;
> +	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
> +	struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
> +	int value, err = 0;
> +
> +	switch (param) {
> +	case ISCSI_PARAM_HDRDGST_EN:
> +		err = iscsi_set_param(cls_conn, param, buf, buflen);
> +		if (!err && conn->hdrdgst_en)
> +			cxgb3i_conn_ulp_setup(cconn, conn->hdrdgst_en,
> +					      conn->datadgst_en);
> +		break;
> +	case ISCSI_PARAM_DATADGST_EN:
> +		err = iscsi_set_param(cls_conn, param, buf, buflen);
> +		if (!err && conn->datadgst_en)
> +			cxgb3i_conn_ulp_setup(cconn, conn->hdrdgst_en,
> +					      conn->datadgst_en);
> +		break;
> +	case ISCSI_PARAM_MAX_R2T:
> +		sscanf(buf, "%d", &value);
> +		if (value <= 0 || !is_power_of_2(value))
> +			return -EINVAL;
> +		if (session->max_r2t == value)
> +			break;
> +		iscsi_r2tpool_free(session);
> +		err = iscsi_set_param(cls_conn, param, buf, buflen);
> +		if (!err && iscsi_r2tpool_alloc(session))
> +			return -ENOMEM;
> +	case ISCSI_PARAM_MAX_RECV_DLENGTH:
> +		err = iscsi_set_param(cls_conn, param, buf, buflen);
> +		cxgb3i_log_debug("MAX_RECV %u.\n", conn->max_recv_dlength);
> +		break;
> +	case ISCSI_PARAM_MAX_XMIT_DLENGTH:
> +		err = iscsi_set_param(cls_conn, param, buf, buflen);
> +		cxgb3i_log_debug("MAX_XMIT %u.\n", conn->max_xmit_dlength);
> +		break;
> +	default:
> +		return iscsi_set_param(cls_conn, param, buf, buflen);
> +	}
> +	return err;
> +}
> +
> +/**
> + * cxgb3i_host_get_param - returns host (adapter) related parameters
> + * @shost:	scsi host pointer
> + * @param:	parameter type identifier
> + * @buf:	buffer pointer
> + */
> +static int cxgb3i_host_get_param(struct Scsi_Host *shost,
> +				 enum iscsi_host_param param, char *buf)
> +{
> +	struct cxgb3i_hba *hba = iscsi_host_priv(shost);
> +	int i;
> +	int len = 0;
> +
> +	switch (param) {
> +	case ISCSI_HOST_PARAM_HWADDRESS:
> +		for (i = 0; i < 6; i++)
> +			len +=
> +			    sprintf(buf + len, "%02x.",
> +				    hba->ndev->dev_addr[i]);
> +		len--;
> +		buf[len] = '\0';
> +		break;
> +	case ISCSI_HOST_PARAM_NETDEV_NAME:
> +		len = sprintf(buf, "%s\n", hba->ndev->name);
> +		break;
> +	default:
> +		return iscsi_host_get_param(shost, param, buf);
> +	}
> +	return len;
> +}
> +
> +/**
> + * cxgb3i_conn_get_stats - returns iSCSI stats
> + * @cls_conn:	pointer to iscsi cls conn
> + * @stats:	pointer to iscsi statistic struct
> + */
> +static void cxgb3i_conn_get_stats(struct iscsi_cls_conn *cls_conn,
> +				  struct iscsi_stats *stats)
> +{
> +	struct iscsi_conn *conn = cls_conn->dd_data;
> +
> +	stats->txdata_octets = conn->txdata_octets;
> +	stats->rxdata_octets = conn->rxdata_octets;
> +	stats->scsicmd_pdus = conn->scsicmd_pdus_cnt;
> +	stats->dataout_pdus = conn->dataout_pdus_cnt;
> +	stats->scsirsp_pdus = conn->scsirsp_pdus_cnt;
> +	stats->datain_pdus = conn->datain_pdus_cnt;
> +	stats->r2t_pdus = conn->r2t_pdus_cnt;
> +	stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt;
> +	stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt;
> +	stats->digest_err = 0;
> +	stats->timeout_err = 0;
> +	stats->custom_length = 1;
> +	strcpy(stats->custom[0].desc, "eh_abort_cnt");
> +	stats->custom[0].value = conn->eh_abort_cnt;
> +}
> +
> +static inline u32 tag_base(struct cxgb3i_tag_format *format,
> +			   unsigned int idx, unsigned int age)
> +{
> +	u32 sw_bits = idx | (age << format->idx_bits);
> +	u32 tag = sw_bits >> format->rsvd_shift;
> +	tag <<= format->rsvd_bits + format->rsvd_shift;
> +	tag |= sw_bits & ((1 << format->rsvd_shift) - 1);
> +	return tag;
> +}
> +
> +static void cxgb3i_parse_itt(struct iscsi_conn *conn, itt_t itt,
> +			     int *idx, int *age)
> +{
> +	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
> +	struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
> +	struct cxgb3i_adapter *snic = cconn->hba->snic;
> +	u32 sw_bits;
> +
> +	cxgb3i_parse_tag(&snic->tag_format, itt, NULL, &sw_bits);
> +	if (idx)
> +		*idx = sw_bits & ISCSI_ITT_MASK;
> +	if (age)
> +		*age = (sw_bits >> snic->tag_format.idx_bits) & ISCSI_AGE_MASK;
> +}
> +
> +static int cxgb3i_reserve_itt(struct iscsi_task *task, itt_t *hdr_itt)
> +{
> +	struct scsi_cmnd *sc = task->sc;
> +	struct iscsi_conn *conn = task->conn;
> +	struct iscsi_session *sess = conn->session;
> +	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
> +	struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
> +	struct cxgb3i_adapter *snic = cconn->hba->snic;
> +	u32 sw_tag = tag_base(&snic->tag_format, task->itt, sess->age);
> +	u32 tag = RESERVED_ITT;
> +
> +	if (sc && (sc->sc_data_direction == DMA_FROM_DEVICE)) {
> +		struct s3_conn *c3cn =
> +			(struct s3_conn *)(tcp_conn->sock);
> +		tag =
> +		    cxgb3i_ddp_tag_reserve(snic, c3cn->tid, sw_tag,
> +					   scsi_out(sc)->length,
> +					   scsi_out(sc)->table.sgl,
> +					   scsi_out(sc)->table.nents);

sc->sc_data_direction == DMA_FROM_DEVICE should use scsi_in(sc)
you had no problems because for uni-direction commands the scsi_out() and
scsi_in() are the same

> +	}
> +	if (tag == RESERVED_ITT)
> +		tag = sw_tag | (snic->tag_format.rsvd_mask <<
> +				snic->tag_format.rsvd_shift);
> +	*hdr_itt = htonl(tag);
> +	return 0;
> +}
> +
> +static void cxgb3i_release_itt(struct iscsi_task *task, itt_t hdr_itt)
> +{
> +	struct scsi_cmnd *sc = task->sc;
> +	struct iscsi_conn *conn = task->conn;
> +	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
> +	struct cxgb3i_conn *cconn = (struct cxgb3i_conn *)(tcp_conn + 1);
> +	struct cxgb3i_adapter *snic = cconn->hba->snic;
> +
> +	hdr_itt = ntohl(hdr_itt);
> +	if (sc && (sc->sc_data_direction == DMA_FROM_DEVICE))
> +		cxgb3i_ddp_tag_release(snic, hdr_itt,
> +				       scsi_out(sc)->table.sgl,
> +				       scsi_out(sc)->table.nents);

here also scsi_in()

> +}
> +
> +/**
> + * cxgb3i_host_template -- Scsi_Host_Template structure
> + *	used when registering with the scsi mid layer
> + */
> +static struct scsi_host_template cxgb3i_host_template = {
> +	.module = THIS_MODULE,
> +	.name = "Chelsio S3xx iSCSI Initiator",
> +	.proc_name = "cxgb3i",
> +	.queuecommand = iscsi_queuecommand,
> +	.change_queue_depth = iscsi_change_queue_depth,
> +	.can_queue = 128 * (ISCSI_DEF_XMIT_CMDS_MAX - 1),
> +	.sg_tablesize = SG_ALL,

iscsi_tcp supports sg_chaining here. It looks like submitted
code is sg-safe you can easily put:
	.sg_tablesize = ~0,

> +	.max_sectors = 0xFFFF,
> +	.cmd_per_lun = ISCSI_DEF_CMD_PER_LUN,
> +	.eh_abort_handler = iscsi_eh_abort,
> +	.eh_device_reset_handler = iscsi_eh_device_reset,
> +	.eh_target_reset_handler = iscsi_eh_target_reset,
> +	.use_clustering = DISABLE_CLUSTERING,
> +	.slave_alloc = iscsi_slave_alloc,
> +	.this_id = -1,
> +};
> +
> +static struct iscsi_transport cxgb3i_iscsi_transport = {
> +	.owner = THIS_MODULE,
> +	.name = "cxgb3i",
> +	.caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_HDRDGST
> +	    | CAP_DATADGST | CAP_DIGEST_OFFLOAD,
> +	.param_mask = ISCSI_MAX_RECV_DLENGTH |
> +	    ISCSI_MAX_XMIT_DLENGTH |
> +	    ISCSI_HDRDGST_EN |
> +	    ISCSI_DATADGST_EN |
> +	    ISCSI_INITIAL_R2T_EN |
> +	    ISCSI_MAX_R2T |
> +	    ISCSI_IMM_DATA_EN |
> +	    ISCSI_FIRST_BURST |
> +	    ISCSI_MAX_BURST |
> +	    ISCSI_PDU_INORDER_EN |
> +	    ISCSI_DATASEQ_INORDER_EN |
> +	    ISCSI_ERL |
> +	    ISCSI_CONN_PORT |
> +	    ISCSI_CONN_ADDRESS |
> +	    ISCSI_EXP_STATSN |
> +	    ISCSI_PERSISTENT_PORT |
> +	    ISCSI_PERSISTENT_ADDRESS |
> +	    ISCSI_TARGET_NAME | ISCSI_TPGT |
> +	    ISCSI_USERNAME | ISCSI_PASSWORD |
> +	    ISCSI_USERNAME_IN | ISCSI_PASSWORD_IN |
> +	    ISCSI_FAST_ABORT | ISCSI_ABORT_TMO |
> +	    ISCSI_LU_RESET_TMO |
> +	    ISCSI_PING_TMO | ISCSI_RECV_TMO |
> +	    ISCSI_IFACE_NAME | ISCSI_INITIATOR_NAME,
> +	.host_param_mask = ISCSI_HOST_HWADDRESS | ISCSI_HOST_IPADDRESS |
> +	    ISCSI_HOST_INITIATOR_NAME | ISCSI_HOST_NETDEV_NAME,
> +	.get_host_param = cxgb3i_host_get_param,
> +	/* session management */
> +	.create_session = cxgb3i_session_create,
> +	.destroy_session = cxgb3i_session_destroy,
> +	.get_session_param = iscsi_session_get_param,
> +	/* connection management */
> +	.create_conn = cxgb3i_conn_create,
> +	.bind_conn = cxgb3i_conn_bind,
> +	.destroy_conn = iscsi_conn_teardown,
> +	.start_conn = iscsi_conn_start,
> +	.stop_conn = iscsi_conn_stop,
> +	.flush_conn = cxgb3i_conn_flush,
> +	.get_conn_param = cxgb3i_conn_get_param,
> +	.set_param = cxgb3i_conn_set_param,
> +	.get_stats = cxgb3i_conn_get_stats,
> +	/* pdu xmit req. from user space */
> +	.send_pdu = iscsi_conn_send_pdu,
> +	/* task */
> +	.init_task = iscsi_tcp_task_init,
> +	.xmit_task = iscsi_tcp_task_xmit,
> +	.cleanup_task = iscsi_tcp_cleanup_task,
> +	.parse_itt = cxgb3i_parse_itt,
> +	.reserve_itt = cxgb3i_reserve_itt,
> +	.release_itt = cxgb3i_release_itt,
> +	/* TCP connect/disconnect */
> +	.ep_connect = cxgb3i_ep_connect,
> +	.ep_poll = cxgb3i_ep_poll,
> +	.ep_disconnect = cxgb3i_ep_disconnect,
> +	/* Error recovery timeout call */
> +	.session_recovery_timedout = iscsi_session_recovery_timedout,
> +};
> +
> +int cxgb3i_iscsi_init(void)
> +{
> +	cxgb3i_scsi_transport =
> +	    iscsi_register_transport(&cxgb3i_iscsi_transport);
> +	if (!cxgb3i_scsi_transport) {
> +		cxgb3i_log_error("Could not register cxgb3i transport.\n");
> +		return -ENODEV;
> +	}
> +	cxgb3i_log_debug("cxgb3i transport 0x%p.\n", cxgb3i_scsi_transport);
> +	return 0;
> +}
> +
> +void cxgb3i_iscsi_cleanup(void)
> +{
> +	if (cxgb3i_scsi_transport) {
> +		cxgb3i_log_debug("cxgb3i transport 0x%p.\n",
> +				 cxgb3i_scsi_transport);
> +		iscsi_unregister_transport(&cxgb3i_iscsi_transport);
> +		cxgb3i_scsi_transport = NULL;
> +	}
> +}
> diff --git a/drivers/scsi/cxgb3i/cxgb3i_offload.c b/drivers/scsi/cxgb3i/cxgb3i_offload.c
> new file mode 100644
> index 0000000..d4d8b85
> --- /dev/null
> +++ b/drivers/scsi/cxgb3i/cxgb3i_offload.c
> @@ -0,0 +1,2001 @@
> +/*
> + * Copyright (C) 2003-2008 Chelsio Communications.  All rights reserved.
> + *
> + * Written by Dimitris Michailidis (dm@chelsio.com)
> + *
> + * This program is distributed in the hope that it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the LICENSE file included in this
> + * release for licensing terms and conditions.
> + */
> +
> +#include <linux/if_vlan.h>
> +#include <linux/version.h>
> +
> +#include "cxgb3_defs.h"
> +#include "cxgb3_ctl_defs.h"
> +#include "firmware_exports.h"
> +#include "cxgb3i_offload.h"
> +#include "cxgb3i_ulp2.h"
> +
> +static int rcv_win = 256 * 1024;
> +module_param(rcv_win, int, 0644);
> +MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256KB)");
> +
> +static int snd_win = 32 * 1024;
> +module_param(snd_win, int, 0644);
> +MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)");
> +
> +static int rx_credit_thres = 10 * 1024;
> +module_param(rx_credit_thres, int, 0644);
> +MODULE_PARM_DESC(snd_win, "RX credits return threshold in bytes (default=10KB)");
> +
> +static unsigned int max_connect = 8 * 1024;
> +module_param(max_connect, uint, 0644);
> +MODULE_PARM_DESC(max_connect, "Max. # of connections (default=8092)");
> +
> +static unsigned int sport_base = 20000;
> +module_param(sport_base, uint, 0644);
> +MODULE_PARM_DESC(sport_start, "starting port number (default=20000)");
> +
> +#ifdef __DEBUG_C3CN_CONN__
> +#define c3cn_conn_debug         cxgb3i_log_debug
> +#else
> +#define c3cn_conn_debug(fmt...)
> +#endif
> +
> +#ifdef __DEBUG_C3CN_TX__
> +#define c3cn_tx_debug         cxgb3i_log_debug
> +#else
> +#define c3cn_tx_debug(fmt...)
> +#endif
> +
> +/* minimal port allocation management scheme */
> +spinlock_t sport_map_lock;
> +unsigned int sport_map_next = 0;
> +unsigned char *sport_map = NULL;
> +
> +/*
> + * Find a free source port in our allocation map.  We use a very simple rotor
> + * scheme to look for the next free port.
> + *
> + * If a source port has been specified make sure that it doesn't collide with
> + * our normal source port allocation map.  If it's outside the range of our
> + * allocation scheme just let them use it.
> + */
> +static int c3cn_get_port(struct s3_conn *c3cn)
> +{
> +	unsigned int start;
> +
> +	if (!sport_map)
> +		goto error_out;
> +
> +	if (c3cn->saddr.sin_port != 0) {
> +		int sport = ntohs(c3cn->saddr.sin_port) - sport_base;
> +		int err = 0;
> +
> +		if (sport < 0 || sport >= max_connect)
> +			return 0;
> +		spin_lock(&sport_map_lock);
> +		err = __test_and_set_bit(sport, sport_map);
> +		spin_unlock(&sport_map_lock);
> +		return (err ? -EADDRINUSE : 0);
> +	}
> +
> +	spin_lock(&sport_map_lock);
> +	start = sport_map_next;
> +	do {
> +		unsigned int new = sport_map_next;
> +		if (++sport_map_next >= max_connect)
> +			sport_map_next = 0;
> +		if (!(__test_and_set_bit(new, sport_map))) {
> +			spin_unlock(&sport_map_lock);
> +			c3cn->saddr.sin_port = htons(sport_base + new);
> +			return 0;
> +		}
> +	} while (sport_map_next != start);
> +	spin_unlock(&sport_map_lock);
> +
> +error_out:
> +	return -EADDRNOTAVAIL;
> +}
> +
> +/*
> + * Deallocate a source port from the allocation map.  If the source port is
> + * outside our allocation range just return -- the caller is responsible for
> + * keeping track of their port usage outside of our allocation map.
> + */
> +static void c3cn_put_port(struct s3_conn *c3cn)
> +{
> +	int old = ntohs(c3cn->saddr.sin_port) - sport_base;
> +	c3cn->saddr.sin_port = 0;
> +
> +	if (old < 0 || old >= max_connect)
> +		return;
> +
> +	spin_lock(&sport_map_lock);
> +	__clear_bit(old, sport_map);
> +	spin_unlock(&sport_map_lock);
> +}
> +
> +static inline unsigned int c3cn_in_state(const struct s3_conn *c3cn,
> +                                         unsigned int states)
> +{
> +	return (states & c3cn->state);
> +}
> +
> +static void c3cn_set_state(struct s3_conn *c3cn, int state)
> +{
> +	c3cn_conn_debug("c3cn 0x%p state -> 0x%x.\n", c3cn, state);
> +	if (state == C3CN_STATE_CLOSE)
> +		c3cn_put_port(c3cn);
> +	c3cn->state = state;
> +}
> +
> +
> +void c3cn_reset_timer(struct s3_conn *c3cn, struct timer_list* timer,
> +		      unsigned long expires)
> +{
> +	if (!mod_timer(timer, expires))
> +		c3cn_hold(c3cn);
> +}
> +
> +typedef int (cxgb3_cpl_handler_decl) (struct t3cdev *,
> +				      struct sk_buff *, void *);
> +
> +static cxgb3_cpl_handler_decl do_act_establish;
> +static cxgb3_cpl_handler_decl do_act_open_rpl;
> +static cxgb3_cpl_handler_decl do_wr_ack;
> +static cxgb3_cpl_handler_decl do_peer_close;
> +static cxgb3_cpl_handler_decl do_abort_req;
> +static cxgb3_cpl_handler_decl do_abort_rpl;
> +static cxgb3_cpl_handler_decl do_close_con_rpl;
> +static cxgb3_cpl_handler_decl do_iscsi_hdr;
> +
> +/*
> + * Protocol functions for our connections.
> + */
> +static int c3cn_destroy(struct s3_conn *);
> +static void process_deferq(struct work_struct *);
> +
> +static LIST_HEAD(cxgb3_list);
> +static DEFINE_MUTEX(cxgb3_list_lock);
> +
> +/*
> + * For ULP connections HW may inserts digest bytes into the pdu. This array
> + * contains the compensating extra lengths for ULP packets.  It is indexed by
> + * a packet's ULP submode.
> + */
> +static const unsigned int cxgb3_ulp_extra_len[] = { 0, 4, 4, 8 };
> +
> +/*
> + * Return the length of any HW additions that will be made to a Tx packet.
> + * Such additions can happen for some types of ULP packets.
> + */
> +static inline unsigned int ulp_extra_len(const struct sk_buff *skb)
> +{
> +	return cxgb3_ulp_extra_len[skb_ulp_mode(skb) & 3];
> +}
> +
> +/*
> + * Size of WRs in bytes.  Note that we assume all devices we are handling have
> + * the same WR size.
> + */
> +static unsigned int wrlen __read_mostly;
> +
> +/*
> + * The number of WRs needed for an skb depends on the number of page fragments
> + * in the skb and whether it has any payload in its main body.  This maps the
> + * length of the gather list represented by an skb into the # of necessary WRs.
> + */
> +static unsigned int skb_wrs[MAX_SKB_FRAGS + 2] __read_mostly;
> +
> +static void s3_init_wr_tab(unsigned int wr_len)
> +{
> +	int i;
> +
> +	if (skb_wrs[1])		/* already initialized */
> +		return;
> +
> +	for (i = 1; i < ARRAY_SIZE(skb_wrs); i++) {
> +		int sgl_len = (3 * i) / 2 + (i & 1);
> +
> +		sgl_len += 3;
> +		skb_wrs[i] = (sgl_len <= wr_len
> +			      ? 1 : 1 + (sgl_len - 2) / (wr_len - 1));
> +	}
> +
> +	wrlen = wr_len * 8;
> +}
> +
> +/*
> + * Initialization/cleanup cxgb3 API operations.
> + */
> +/*
> + * large memory chunk allocation/release
> + */
> +void *cxgb3i_alloc_big_mem(unsigned int size)
> +{
> +	void *p = kmalloc(size, GFP_KERNEL);
> +	if (!p)
> +		p = vmalloc(size);
> +	if (p)
> +		memset(p, 0, size);
> +	return p;
> +}
> +
> +void cxgb3i_free_big_mem(void *addr)
> +{
> +	unsigned long p = (unsigned long)addr;
> +	if (p >= VMALLOC_START && p < VMALLOC_END)
> +		vfree(addr);
> +	else
> +		kfree(addr);
> +}
> +
> +void cxgb3i_sdev_cleanup(cxgb3_cpl_handler_func *cpl_handlers)
> +{
> +	memset(cpl_handlers, 0, NUM_CPL_CMDS*(sizeof(*cpl_handlers)));
> +	if (sport_map)
> +		cxgb3i_free_big_mem(sport_map);
> +}
> +
> +int cxgb3i_sdev_init(cxgb3_cpl_handler_func *cpl_handlers)
> +{
> +	cpl_handlers[CPL_ACT_ESTABLISH] = do_act_establish;
> +	cpl_handlers[CPL_ACT_OPEN_RPL] = do_act_open_rpl;
> +	cpl_handlers[CPL_PEER_CLOSE] = do_peer_close;
> +	cpl_handlers[CPL_ABORT_REQ_RSS] = do_abort_req;
> +	cpl_handlers[CPL_ABORT_RPL_RSS] = do_abort_rpl;
> +	cpl_handlers[CPL_CLOSE_CON_RPL] = do_close_con_rpl;
> +	cpl_handlers[CPL_TX_DMA_ACK] = do_wr_ack;
> +	cpl_handlers[CPL_ISCSI_HDR] = do_iscsi_hdr;
> +
> +	sport_map = cxgb3i_alloc_big_mem((max_connect + 7)/8);
> +	if (!sport_map)
> +		return -ENOMEM;
> +	return 0;
> +}
> +
> +void cxgb3i_sdev_add(struct t3cdev *cdev, struct cxgb3_client *client)
> +{
> +	struct cxgb3i_sdev_data *cdata;
> +	struct adap_ports *ports;
> +	struct ofld_page_info rx_page_info;
> +	unsigned int wr_len;
> +	int i;
> +
> +	cdata = kzalloc(sizeof *cdata, GFP_KERNEL);
> +	if (!cdata)
> +		return;
> +	ports = kzalloc(sizeof *ports, GFP_KERNEL);
> +	if (!ports)
> +		goto free_ports;
> +	cdata->ports = ports;
> +
> +	if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0 ||
> +	    cdev->ctl(cdev, GET_PORTS, cdata->ports) < 0 ||
> +	    cdev->ctl(cdev, GET_RX_PAGE_INFO, &rx_page_info) < 0)
> +		goto free_ports;
> +
> +	s3_init_wr_tab(wr_len);
> +
> +	INIT_LIST_HEAD(&cdata->list);
> +	cdata->cdev = cdev;
> +	cdata->client = client;
> +	cdata->rx_page_size = rx_page_info.page_size;
> +	skb_queue_head_init(&cdata->deferq);
> +	INIT_WORK(&cdata->deferq_task, process_deferq);
> +
> +	for (i = 0; i < ports->nports; i++)
> +		NDEV2CDATA(ports->lldevs[i]) = cdata;
> +
> +	mutex_lock(&cxgb3_list_lock);
> +	list_add_tail(&cdata->list, &cxgb3_list);
> +	mutex_unlock(&cxgb3_list_lock);
> +
> +	return;
> +
> +free_ports:
> +	kfree(ports);
> +	kfree(cdata);
> +}
> +
> +void cxgb3i_sdev_remove(struct t3cdev *cdev)
> +{
> +	struct cxgb3i_sdev_data *cdata = CXGB3_SDEV_DATA(cdev);
> +	struct adap_ports *ports = cdata->ports;
> +	int i;
> +
> +	for (i = 0; i < ports->nports; i++)
> +		NDEV2CDATA(ports->lldevs[i]) = NULL;
> +
> +	mutex_lock(&cxgb3_list_lock);
> +	list_del(&cdata->list);
> +	mutex_unlock(&cxgb3_list_lock);
> +
> +	kfree(ports);
> +	kfree(cdata);
> +}
> +
> +/*
> + * Return TRUE if the specified net device is for a port on one of our
> + * registered adapters.
> + */
> +static int is_cxgb3_dev(struct net_device *dev)
> +{
> +	struct cxgb3i_sdev_data *cdata;
> +
> +	mutex_lock(&cxgb3_list_lock);
> +	list_for_each_entry(cdata, &cxgb3_list, list) {
> +		struct adap_ports *ports = cdata->ports;
> +		int i;
> +
> +		for (i = 0; i < ports->nports; i++)
> +			if (dev == ports->lldevs[i]) {
> +				mutex_unlock(&cxgb3_list_lock);
> +				return 1;
> +			}
> +	}
> +	mutex_unlock(&cxgb3_list_lock);
> +	return 0;
> +}
> +
> +/*
> + * Primary cxgb3 API operations.
> + * =============================
> + */
> +
> +static int s3_push_frames(struct s3_conn *, int);
> +static int s3_send_reset(struct s3_conn *, int, struct sk_buff *);
> +
> +struct s3_conn * cxgb3i_c3cn_create(void)
> +{
> +	struct s3_conn *c3cn;
> +
> +	c3cn = kzalloc(sizeof(*c3cn), GFP_KERNEL);
> +	if (c3cn == NULL)
> +		return NULL;
> +
> +	c3cn->flags = 0;
> +	spin_lock_init(&c3cn->lock);
> +	atomic_set(&c3cn->refcnt, 1);
> +	skb_queue_head_init(&c3cn->receive_queue);
> +	skb_queue_head_init(&c3cn->write_queue);
> +	setup_timer(&c3cn->retry_timer, NULL, (unsigned long)c3cn);
> +	rwlock_init(&c3cn->callback_lock);
> +
> +	return c3cn;
> +}
> +
> +static void mk_close_req(struct s3_conn *);
> +static inline void s3_purge_write_queue(struct s3_conn *);
> +
> +/*
> + * Release a connection's local port if the connection is bound.
> + */
> +static inline void release_port(struct s3_conn *c3cn)
> +{
> +	c3cn_conn_debug("c3cn 0x%p, port %u.\n", c3cn, c3cn->saddr.sin_port);
> +	if (c3cn->saddr.sin_port)
> +		c3cn_put_port(c3cn);
> +}
> +
> +static void c3cn_done(struct s3_conn *c3cn)
> +{
> +	c3cn_conn_debug("c3cn 0x%p.\n", c3cn);
> +
> +	c3cn_set_state(c3cn, C3CN_STATE_CLOSE);
> +	c3cn->shutdown = C3CN_SHUTDOWN_MASK;
> +
> +	cxgb3i_conn_closing(c3cn);
> +}
> +
> +void c3cn_close(struct s3_conn *c3cn)
> +{
> +	int data_lost, old_state;
> +
> +	c3cn_conn_debug("c3cn 0x%p, state 0x%x, flag 0x%lx.\n",
> +			 c3cn, c3cn->state, c3cn->flags);
> +
> +	dst_confirm(c3cn->dst_cache);
> +
> +	spin_lock_bh(&c3cn->lock);
> +	c3cn->shutdown |= C3CN_SHUTDOWN_MASK;
> +
> +	/*
> +	 * We need to flush the receive buffs.  We do this only on the
> +	 * descriptor close, not protocol-sourced closes, because the
> +	 * reader process may not have drained the data yet!  Make a note
> +	 * of whether any received data will be lost so we can decide whether
> +	 * to FIN or RST.
> +	 */
> +	data_lost = skb_queue_len(&c3cn->receive_queue);
> +	__skb_queue_purge(&c3cn->receive_queue);
> +
> +	if (c3cn->state == C3CN_STATE_CLOSE)	/* Nothing if we are already closed */
> +		;
> +	else if (data_lost || c3cn->state == C3CN_STATE_SYN_SENT) {
> +		/* Unread data was tossed, zap the connection. */
> +		s3_send_reset(c3cn, CPL_ABORT_SEND_RST, NULL);
> +		release_port(c3cn);
> +		goto unlock;
> +	} else if (c3cn->state == C3CN_STATE_ESTABLISHED) {
> +		c3cn_set_state(c3cn, C3CN_STATE_CLOSING);
> +		mk_close_req(c3cn);
> +	}
> +
> +unlock:
> +	old_state = c3cn->state;
> +	c3cn_hold(c3cn); /* must last past the potential destroy() */
> +
> +	spin_unlock_bh(&c3cn->lock); /* Final release in connection's lifetime. */
> +
> +	/*
> +	 * There are no more user references at this point.  Grab the
> +	 * connection lock and finish the close.
> +	 */
> +	local_bh_disable();
> +	spin_lock(&c3cn->lock);
> +
> +	/*
> +	 * Because the connection was orphaned before the spin_lock()
> +	 * either the backlog or a BH may have already destroyed it.
> +	 * Bail out if so.
> +	 */
> +	if (old_state != C3CN_STATE_CLOSE && c3cn->state == C3CN_STATE_CLOSE)
> +		goto out;
> +
> +	if (c3cn->state == C3CN_STATE_CLOSE)
> +		c3cn_destroy(c3cn);
> +
> +out:
> +	spin_unlock(&c3cn->lock);
> +	local_bh_enable();
> +	c3cn_put(c3cn);
> +}
> +
> +/*
> + * Destroy connection.  Purge the write queue and drop a reference on the
> + * connection.
> + */
> +static int c3cn_destroy(struct s3_conn *c3cn)
> +{
> +	c3cn_conn_debug("c3cn 0x%p.\n", c3cn);
> +
> +	s3_purge_write_queue(c3cn);
> +	c3cn_put(c3cn);
> +	return 0;
> +}
> +
> +/*
> + * Local utility routines used to implement primary cxgb3 API operations.
> + * ======================================================================
> + */
> +
> +static int s3_connect(struct s3_conn *);
> +static u32 s3_send_rx_credits(struct s3_conn *, u32, u32, int);
> +static void mk_act_open_req(struct s3_conn *, struct sk_buff *,
> +			    unsigned int, const struct l2t_entry *);
> +static void skb_entail(struct s3_conn *, struct sk_buff *, int);
> +
> +static inline void reset_wr_list(struct s3_conn *c3cn)
> +{
> +	c3cn->wr_pending_head = NULL;
> +}
> +
> +/*
> + * Add a WR to a connections's list of pending WRs.  This is a singly-linked
> + * list of sk_buffs operating as a FIFO.  The head is kept in wr_pending_head
> + * and the tail in wr_pending_tail.
> + */
> +static inline void enqueue_wr(struct s3_conn *c3cn,
> +			      struct sk_buff *skb)
> +{
> +	skb->sp = NULL;
> +
> +	/*
> +	 * We want to take an extra reference since both us and the driver
> +	 * need to free the packet before it's really freed.  We know there's
> +	 * just one user currently so we use atomic_set rather than skb_get
> +	 * to avoid the atomic op.
> +	 */
> +	atomic_set(&skb->users, 2);
> +
> +	if (!c3cn->wr_pending_head)
> +		c3cn->wr_pending_head = skb;
> +	else
> +		c3cn->wr_pending_tail->sp = (void *)skb;
> +	c3cn->wr_pending_tail = skb;
> +}
> +
> +/*
> + * The next two functions calculate the option 0 value for a connection.
> + */
> +static inline int compute_wscale(int win)
> +{
> +	int wscale = 0; 
> +	while (wscale < 14 && (65535<<wscale) < win)
> +		wscale++;
> +	return wscale;
> +}
> +
> +static inline unsigned int calc_opt0h(struct s3_conn *c3cn)
> +{
> +	int wscale = compute_wscale(rcv_win);
> +	return (V_KEEP_ALIVE(1) |
> +		F_TCAM_BYPASS |
> +		V_WND_SCALE(wscale) |
> +		V_MSS_IDX(c3cn->mss_idx));
> +}
> +
> +static inline unsigned int calc_opt0l(struct s3_conn *c3cn)
> +{
> +	return (V_ULP_MODE(ULP_MODE_ISCSI) |
> +		V_RCV_BUFSIZ(rcv_win>>10));	
> +}
> +
> +static inline void make_tx_data_wr(struct s3_conn *c3cn,
> +				   struct sk_buff *skb, int len)
> +{
> +	struct tx_data_wr *req;
> +
> +	skb_reset_transport_header(skb);
> +	req = (struct tx_data_wr *)__skb_push(skb, sizeof(*req));
> +	req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA));
> +	req->wr_lo = htonl(V_WR_TID(c3cn->tid));
> +	req->sndseq = htonl(c3cn->snd_nxt);
> +	/* len includes the length of any HW ULP additions */
> +	req->len = htonl(len);
> +	req->param = htonl(V_TX_PORT(c3cn->l2t->smt_idx));
> +	/* V_TX_ULP_SUBMODE sets both the mode and submode */
> +	req->flags = htonl(V_TX_ULP_SUBMODE(skb_ulp_mode(skb)) |
> +			   V_TX_SHOVE((skb_peek(&c3cn->write_queue) ? 0 : 1)));
> +
> +	if (!c3cn_flag(c3cn, C3CN_TX_DATA_SENT)) {
> +
> +		req->flags |= htonl(V_TX_ACK_PAGES(2) | F_TX_INIT |
> +				    V_TX_CPU_IDX(c3cn->qset));
> +
> +		/* Sendbuffer is in units of 32KB.
> +		 */
> +		req->param |= htonl(V_TX_SNDBUF(snd_win >> 15));
> +		c3cn_set_flag(c3cn, C3CN_TX_DATA_SENT);
> +	}
> +}
> +
> +static struct rtable *find_route(__be32 saddr, __be32 daddr,
> +				 __be16 sport, __be16 dport)
> +{
> +	struct rtable *rt;
> +	struct flowi fl = {
> +		.oif = 0,
> +		.nl_u = {
> +			 .ip4_u = {
> +				   .daddr = daddr,
> +				   .saddr = saddr,
> +				   .tos = 0 } },
> +		.proto = IPPROTO_TCP,
> +		.uli_u = {
> +			  .ports = {
> +				    .sport = sport,
> +				    .dport = dport } } };
> +
> +	if (ip_route_output_flow(&init_net, &rt, &fl, NULL, 0))
> +		return NULL;
> +	return rt;
> +}
> +
> +int cxgb3i_c3cn_connect(struct s3_conn *c3cn, struct sockaddr_in *usin)
> +{
> +	struct rtable *rt;
> +	int err;
> +
> +	if (usin->sin_family != AF_INET)
> +		return -EAFNOSUPPORT;
> +
> +	/* get a source port if one hasn't been provided */
> +	err = c3cn_get_port(c3cn);
> +	if (err)
> +		return err;
> +	c3cn_conn_debug("c3cn 0x%p get port %u.\n", c3cn, ntohs(c3cn->saddr.sin_port));
> +
> +	c3cn->daddr.sin_port = usin->sin_port;
> +	c3cn->daddr.sin_addr.s_addr = usin->sin_addr.s_addr;
> +
> +	rt = find_route(c3cn->saddr.sin_addr.s_addr,
> +			c3cn->daddr.sin_addr.s_addr,
> +			c3cn->saddr.sin_port,
> +			c3cn->daddr.sin_port);
> +	if (rt == NULL) {
> +		c3cn_conn_debug("NO route to 0x%x, port %u.\n", 
> +				c3cn->daddr.sin_addr.s_addr,
> +				ntohs(c3cn->daddr.sin_port));
> +		return -ENETUNREACH;
> +	}
> +
> +	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
> +		c3cn_conn_debug("multi-cast route to 0x%x, port %u.\n", 
> +				c3cn->daddr.sin_addr.s_addr,
> +				ntohs(c3cn->daddr.sin_port));
> +		ip_rt_put(rt);
> +		return -ENETUNREACH;
> +	}
> +
> +	if (!c3cn->saddr.sin_addr.s_addr)
> +		c3cn->saddr.sin_addr.s_addr = rt->rt_src;
> +
> +	c3cn_conn_debug("c3cn 0x%p -> SYN_SENT.\n", c3cn);
> +	c3cn_set_state(c3cn, C3CN_STATE_SYN_SENT);
> +
> +	/* now commit destination to connection */
> +	c3cn->dst_cache = &rt->u.dst;
> +
> +	if (s3_connect(c3cn))
> +		return 0;
> +	/*
> +	 * If we get here, we don't have an offload connection so simply
> +	 * return a failure.
> +	 */
> +	err = -ENOTSUPP;
> +
> +	/*
> +	 * This trashes the connection and releases the local port,
> +	 * if necessary.
> +	 */
> +	c3cn_conn_debug("c3cn 0x%p -> CLOSE.\n", c3cn);
> +	c3cn_set_state(c3cn, C3CN_STATE_CLOSE);
> +	ip_rt_put(rt);
> +	c3cn_put_port(c3cn);
> +	c3cn->daddr.sin_port = 0;
> +	return err;
> +}
> +
> +/*
> + * Set of states for which we should return RX credits.
> + */
> +#define CREDIT_RETURN_STATE (C3CN_STATE_ESTABLISHED)
> +
> +/*
> + * Called after some received data has been read.  It returns RX credits
> + * to the HW for the amount of data processed.
> + */
> +void cxgb3i_c3cn_rx_credits(struct s3_conn *c3cn, int copied)
> +{
> +	struct t3cdev *cdev;
> +	int must_send;
> +	u32 credits, dack = 0;
> +
> +	if (!c3cn_in_state(c3cn, CREDIT_RETURN_STATE))
> +		return;
> +
> +	credits = c3cn->copied_seq - c3cn->rcv_wup;
> +	if (unlikely(!credits))
> +		return;
> +
> +	cdev = c3cn->cdev;
> +
> +	if (unlikely(rx_credit_thres == 0))
> +		return;
> +
> +	dack = F_RX_DACK_CHANGE | V_RX_DACK_MODE(1);
> +
> +	/*
> +	 * For coalescing to work effectively ensure the receive window has
> +	 * at least 16KB left.
> +	 */
> +	must_send = credits + 16384 >= rcv_win;
> +
> +	if (must_send || credits >= rx_credit_thres)
> +		c3cn->rcv_wup += s3_send_rx_credits(c3cn, credits, dack, must_send);
> +}
> +
> +/*
> + * Generic ARP failure handler that discards the buffer.
> + */
> +static void arp_failure_discard(struct t3cdev *cdev, struct sk_buff *skb)
> +{
> +	kfree_skb(skb);
> +}
> +
> +/*
> + * Prepends TX_DATA_WR or CPL_CLOSE_CON_REQ headers to buffers waiting in a
> + * connection's send queue and sends them on to T3.  Must be called with the
> + * connection's lock held.  Returns the amount of send buffer space that was
> + * freed as a result of sending queued data to T3.
> + */
> +static int s3_push_frames(struct s3_conn *c3cn, int req_completion)
> +{
> +	int total_size = 0;
> +	struct sk_buff *skb;
> +	struct t3cdev *cdev;
> +	struct cxgb3i_sdev_data *cdata;
> +
> +	if (unlikely(c3cn_in_state(c3cn, C3CN_STATE_SYN_SENT | C3CN_STATE_CLOSE)))
> +		return 0;
> +
> +	/*
> +	 * We shouldn't really be called at all after an abort but check just
> +	 * in case.
> +	 */
> +	if (unlikely(c3cn_flag(c3cn, C3CN_ABORT_SHUTDOWN)))
> +		return 0;
> +
> +	cdev = c3cn->cdev;
> +	cdata = CXGB3_SDEV_DATA(cdev);
> +
> +	while (c3cn->wr_avail
> +	       && (skb = skb_peek(&c3cn->write_queue)) != NULL
> +	       && !c3cn_flag(c3cn, C3CN_TX_WAIT_IDLE)) {
> +
> +		int len = skb->len;	/* length before skb_push */
> +		int frags = skb_shinfo(skb)->nr_frags + (len != skb->data_len);
> +		int wrs_needed = skb_wrs[frags];
> +
> +		if (wrs_needed > 1 && len + sizeof(struct tx_data_wr) <= wrlen)
> +			wrs_needed = 1;
> +
> +		WARN_ON(frags >= ARRAY_SIZE(skb_wrs) || wrs_needed < 1);
> +		if (c3cn->wr_avail < wrs_needed)
> +			break;
> +
> +		__skb_unlink(skb, &c3cn->write_queue);
> +		skb->priority = CPL_PRIORITY_DATA;
> +		skb->csum = wrs_needed;	/* remember this until the WR_ACK */
> +		c3cn->wr_avail -= wrs_needed;
> +		c3cn->wr_unacked += wrs_needed;
> +		enqueue_wr(c3cn, skb);
> +
> +		if (likely(CXGB3_SKB_CB(skb)->flags & C3CB_FLAG_NEED_HDR)) {
> +			len += ulp_extra_len(skb);
> +			make_tx_data_wr(c3cn, skb, len);
> +			c3cn->snd_nxt += len;
> +			if ((req_completion
> +			     && c3cn->wr_unacked == wrs_needed)
> +			    || (CXGB3_SKB_CB(skb)->flags & C3CB_FLAG_COMPL)
> +			    || c3cn->wr_unacked >= c3cn->wr_max / 2) {
> +				struct work_request_hdr *wr = cplhdr(skb);
> +
> +				wr->wr_hi |= htonl(F_WR_COMPL);
> +				c3cn->wr_unacked = 0;
> +			}
> +			CXGB3_SKB_CB(skb)->flags &= ~C3CB_FLAG_NEED_HDR;
> +		} else if (skb->data[0] == FW_WROPCODE_OFLD_CLOSE_CON)
> +			c3cn_set_flag(c3cn, C3CN_CLOSE_CON_REQUESTED);
> +
> +		total_size += skb->truesize;
> +		set_arp_failure_handler(skb, arp_failure_discard);
> +		l2t_send(cdev, skb, c3cn->l2t);
> +	}
> +	return total_size;
> +}
> +
> +/*
> + * Handle an ARP failure for a CPL_ABORT_REQ.  Change it into a no RST variant
> + * and send it along.
> + */
> +static void abort_arp_failure(struct t3cdev *cdev, struct sk_buff *skb)
> +{
> +	struct cpl_abort_req *req = cplhdr(skb);
> +
> +	req->cmd = CPL_ABORT_NO_RST;
> +	cxgb3_ofld_send(cdev, skb);
> +}
> +
> +/*
> + * Send an ABORT_REQ message.  Cannot fail.  This routine makes sure we do
> + * not send multiple ABORT_REQs for the same connection and also that we do
> + * not try to send a message after the connection has closed.  Returns 1 if
> + * an ABORT_REQ wasn't generated after all, 0 otherwise.
> + */
> +static int s3_send_reset(struct s3_conn *c3cn, int mode,
> +			 struct sk_buff *skb)
> +{
> +	struct cpl_abort_req *req;
> +	unsigned int tid = c3cn->tid;
> +
> +	if (unlikely(c3cn_flag(c3cn, C3CN_ABORT_SHUTDOWN) || !c3cn->cdev)) {
> +		if (skb)
> +			__kfree_skb(skb);
> +		return 1;
> +	}
> +
> +	c3cn_conn_debug("c3cn 0x%p, mode %d.\n", c3cn, mode);
> +
> +	c3cn_set_flag(c3cn, C3CN_ABORT_RPL_PENDING);
> +	c3cn_set_flag(c3cn, C3CN_ABORT_SHUTDOWN);
> +
> +	/* Purge the send queue so we don't send anything after an abort. */
> +	s3_purge_write_queue(c3cn);
> +
> +	if (!skb)
> +		skb = alloc_skb(sizeof(*req), GFP_KERNEL | __GFP_NOFAIL);
> +	skb->priority = CPL_PRIORITY_DATA;
> +	set_arp_failure_handler(skb, abort_arp_failure);
> +
> +	req = (struct cpl_abort_req *)skb_put(skb, sizeof(*req));
> +	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
> +	req->wr.wr_lo = htonl(V_WR_TID(tid));
> +	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, tid));
> +	req->rsvd0 = htonl(c3cn->snd_nxt);
> +	req->rsvd1 = !c3cn_flag(c3cn, C3CN_TX_DATA_SENT);
> +	req->cmd = mode;
> +
> +	l2t_send(c3cn->cdev, skb, c3cn->l2t);
> +	return 0;
> +}
> +
> +/*
> + * Add a list of skbs to a connection send queue.  This interface is intended
> + * for use by in-kernel ULPs.  The skbs must comply with the max size limit of
> + * the device and have a headroom of at least TX_HEADER_LEN bytes.
> + */
> +int cxgb3i_c3cn_send_pdus(struct s3_conn *c3cn, struct sk_buff *skb, int flags)
> +{
> +	struct sk_buff *next;
> +	int err, copied = 0;
> +
> +	spin_lock_bh(&c3cn->lock);
> +
> +	if (!c3cn_in_state(c3cn, C3CN_STATE_ESTABLISHED)) {
> +		err = -EAGAIN;
> +		goto out_err;
> +	}
> +
> +	err = -EPIPE;
> +	if (c3cn->err || (c3cn->shutdown & C3CN_SEND_SHUTDOWN))
> +		goto out_err;
> +
> +	while (skb) {
> +		if (unlikely(skb_headroom(skb) < TX_HEADER_LEN)) {
> +			c3cn_tx_debug("c3cn 0x%p, skb head.\n", c3cn);
> +			err = -EINVAL;
> +			goto out_err;
> +		}
> +
> +		next = skb->next;
> +		skb->next = NULL;
> +		skb_entail(c3cn, skb, C3CB_FLAG_NO_APPEND | C3CB_FLAG_NEED_HDR);
> +		copied += skb->len;
> +		c3cn->write_seq += skb->len + ulp_extra_len(skb);
> +		skb = next;
> +	}
> +done:
> +	if (likely(skb_queue_len(&c3cn->write_queue)))
> +		s3_push_frames(c3cn, 1);
> +	spin_unlock_bh(&c3cn->lock);
> +	return copied;
> +
> +out_err:
> +	if (copied == 0 && err == -EPIPE)
> +		copied = c3cn->err ? c3cn->err : -EPIPE;
> +	goto done;
> +}
> +
> +/*
> + * Low-level utility routines for primary API functions.
> + * =====================================================
> + */
> +/* routines to implement CPL message processing */
> +static void c3cn_act_establish(struct s3_conn *, struct sk_buff *);
> +static void active_open_failed(struct s3_conn *, struct sk_buff *);
> +static void wr_ack(struct s3_conn *, struct sk_buff *);
> +static void do_peer_fin(struct s3_conn *, struct sk_buff *);
> +static void process_abort_req(struct s3_conn *, struct sk_buff *);
> +static void process_abort_rpl(struct s3_conn *, struct sk_buff *);
> +static void process_close_con_rpl(struct s3_conn *, struct sk_buff *);
> +static void process_rx_iscsi_hdr(struct s3_conn *, struct sk_buff *);
> +
> +static struct sk_buff *__get_cpl_reply_skb(struct sk_buff *, size_t, gfp_t);
> +
> +static int act_open(struct s3_conn *, struct net_device *);
> +static void fail_act_open(struct s3_conn *, int);
> +static void init_offload_conn(struct s3_conn *, struct t3cdev *,
> +			      struct dst_entry *);
> +
> +/*
> + * Insert a connection into the TID table and take an extra reference.
> + */
> +static inline void c3cn_insert_tid(struct cxgb3i_sdev_data *cdata,
> +				   struct s3_conn *c3cn,
> +				   unsigned int tid)
> +{
> +	c3cn_hold(c3cn);
> +	cxgb3_insert_tid(cdata->cdev, cdata->client, c3cn, tid);
> +}
> +
> +static inline void free_atid(struct t3cdev *cdev, unsigned int tid)
> +{
> +	struct s3_conn *c3cn = cxgb3_free_atid(cdev, tid);
> +	if (c3cn)
> +		c3cn_put(c3cn);
> +}
> +
> +/*
> + * This function is intended for allocations of small control messages.
> + * Such messages go as immediate data and usually the pakets are freed
> + * immediately.  We maintain a cache of one small sk_buff and use it whenever
> + * it is available (has a user count of 1).  Otherwise we get a fresh buffer.
> + */
> +#define CTRL_SKB_LEN 120
> +
> +static struct sk_buff *alloc_ctrl_skb(const struct s3_conn *c3cn,
> +				      int len)
> +{
> +	struct sk_buff *skb = c3cn->ctrl_skb_cache;
> +
> +	if (likely(skb && !skb_shared(skb) && !skb_cloned(skb))) {
> +		__skb_trim(skb, 0);
> +		atomic_set(&skb->users, 2);
> +	} else if (likely(!in_atomic()))
> +		skb = alloc_skb(len, GFP_ATOMIC | __GFP_NOFAIL);
> +	else
> +		skb = alloc_skb(len, GFP_ATOMIC);
> +	return skb;
> +}
> +
> +/**
> + * cxgb3_egress_dev - return the cxgb3 egress device or NULL if the egress
> + *     device isn't one of our ports.
> + *
> + * @root_dev: the root device anchoring the search
> + * @c3cn: the connection used to determine egress port in bonding mode
> + * @context: in bonding mode, indicates a connection set up or failover
> + *
> + * Given a root network device it returns the physical egress device that is a
> + * descendant of the root device.  The root device may be either a physical
> + * device, in which case it is the device returned, or a virtual device, such
> + * as a VLAN or bonding device.  In case of a bonding device the search
> + * considers the decisions of the bonding device given its mode to locate the
> + * correct egress device.
> + */
> +static struct net_device *cxgb3_egress_dev(struct net_device *root_dev,
> +					   struct s3_conn *c3cn,
> +					   int context)
> +{
> +	while (root_dev) {
> +		if (root_dev->priv_flags & IFF_802_1Q_VLAN)
> +			root_dev = vlan_dev_info(root_dev)->real_dev;
> +		else if (is_cxgb3_dev(root_dev))
> +			return root_dev;
> +		else
> +			return NULL;
> +	}
> +	return NULL;
> +}
> +
> +/*
> + * Return TRUE if we're able to establish an offload connection; otherwise
> + * return FALSE.
> + */
> +static int s3_connect(struct s3_conn *c3cn)
> +{
> +	struct net_device *dev = cxgb3_egress_dev(c3cn->dst_cache->dev,
> +						  c3cn, 0);
> +	if (dev == NULL) {
> +		c3cn_conn_debug("c3cn 0x%p, egress dev NULL.\n", c3cn);
> +		return 0;
> +	}
> +	return act_open(c3cn, dev) == 0;
> +}
> +
> +/*
> + * Handle an ARP failure for an active open.
> + */
> +static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
> +{
> +	struct s3_conn *c3cn = (struct s3_conn *)skb->sk;
> +
> +	c3cn_hold(c3cn);
> +	spin_lock(&c3cn->lock);
> +	if (c3cn->state == C3CN_STATE_SYN_SENT) {
> +		fail_act_open(c3cn, EHOSTUNREACH);
> +		__kfree_skb(skb);
> +	}
> +	spin_unlock(&c3cn->lock);
> +	c3cn_put(c3cn);
> +}
> +
> +/*
> + * Send an active open request.
> + */
> +static int act_open(struct s3_conn *c3cn, struct net_device *dev)
> +{
> +	struct cxgb3i_sdev_data *cdata = NDEV2CDATA(dev);
> +	struct t3cdev *cdev = cdata->cdev;
> +	struct dst_entry *dst = c3cn->dst_cache;
> +	struct sk_buff *skb;
> +
> +	c3cn_conn_debug("c3cn 0x%p.\n", c3cn);
> +	/*
> +	 * Initialize connection data.  Note that the flags and ULP mode are
> +	 * initialized higher up ...
> +	 */
> +	c3cn->dev = dev;
> +	c3cn->cdev = cdev;
> +	c3cn->tid = cxgb3_alloc_atid(cdev, cdata->client, c3cn);
> +	if (c3cn->tid < 0)
> +		goto out_err;
> +	c3cn->qset = 0;
> +	c3cn->l2t = t3_l2t_get(cdev, dst->neighbour, dev);
> +	if (!c3cn->l2t)
> +		goto free_tid;
> +
> +	skb = alloc_skb(sizeof(struct cpl_act_open_req),
> +			GFP_KERNEL | __GFP_NOFAIL);
> +	skb->sk = (struct sock *)c3cn;
> +	set_arp_failure_handler(skb, act_open_req_arp_failure);
> +
> +	c3cn_hold(c3cn);
> +
> +	init_offload_conn(c3cn, cdev, dst);
> +	c3cn->err = 0;
> +	c3cn_reset_flag(c3cn, C3CN_DONE);
> +
> +	mk_act_open_req(c3cn, skb, c3cn->tid, c3cn->l2t);
> +	l2t_send(cdev, skb, c3cn->l2t);
> +	return 0;
> +
> +free_tid:
> +	free_atid(cdev, c3cn->tid);
> +	c3cn->tid = 0;
> +out_err:
> +	return -1;
> +}
> +
> +/*
> + * Close a connection by sending a CPL_CLOSE_CON_REQ message.  Cannot fail
> + * under any circumstances.  We take the easy way out and always queue the
> + * message to the write_queue.  We can optimize the case where the queue is
> + * already empty though the optimization is probably not worth it.
> + */
> +static void mk_close_req(struct s3_conn *c3cn)
> +{
> +	struct sk_buff *skb;
> +	struct cpl_close_con_req *req;
> +	unsigned int tid = c3cn->tid;
> +
> +	c3cn_conn_debug("c3cn 0x%p.\n", c3cn);
> +
> +	skb = alloc_skb(sizeof(struct cpl_close_con_req),
> +			GFP_KERNEL | __GFP_NOFAIL);
> +	req = (struct cpl_close_con_req *)__skb_put(skb, sizeof(*req));
> +	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
> +	req->wr.wr_lo = htonl(V_WR_TID(tid));
> +	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, tid));
> +	req->rsvd = htonl(c3cn->write_seq);
> +
> +	skb_entail(c3cn, skb, C3CB_FLAG_NO_APPEND);
> +	if (c3cn->state != C3CN_STATE_SYN_SENT)
> +		s3_push_frames(c3cn, 1);
> +}
> +
> +static void skb_entail(struct s3_conn *c3cn, struct sk_buff *skb,
> +		       int flags)
> +{
> +	CXGB3_SKB_CB(skb)->seq = c3cn->write_seq;
> +	CXGB3_SKB_CB(skb)->flags = flags;
> +	__skb_queue_tail(&c3cn->write_queue, skb);
> +}
> +
> +/*
> + * Send RX credits through an RX_DATA_ACK CPL message.  If nofail is 0 we are
> + * permitted to return without sending the message in case we cannot allocate
> + * an sk_buff.  Returns the number of credits sent.
> + */
> +static u32 s3_send_rx_credits(struct s3_conn *c3cn, u32 credits, u32 dack,
> +			      int nofail)
> +{
> +	struct sk_buff *skb;
> +	struct cpl_rx_data_ack *req;
> +
> +	skb = (nofail ? alloc_ctrl_skb(c3cn, sizeof(*req))
> +	       : alloc_skb(sizeof(*req), GFP_ATOMIC));
> +	if (!skb)
> +		return 0;
> +
> +	req = (struct cpl_rx_data_ack *)__skb_put(skb, sizeof(*req));
> +	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
> +	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, c3cn->tid));
> +	req->credit_dack = htonl(dack | V_RX_CREDITS(credits));
> +	skb->priority = CPL_PRIORITY_ACK;
> +	cxgb3_ofld_send(c3cn->cdev, skb);
> +	return credits;
> +}
> +
> +static void mk_act_open_req(struct s3_conn *c3cn, struct sk_buff *skb,
> +			    unsigned int atid, const struct l2t_entry *e)
> +{
> +	struct cpl_act_open_req *req;
> +
> +	c3cn_conn_debug("c3cn 0x%p, atid 0x%x.\n", c3cn, atid);
> +
> +	skb->priority = CPL_PRIORITY_SETUP;
> +	req = (struct cpl_act_open_req *)__skb_put(skb, sizeof(*req));
> +	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
> +	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, atid));
> +	req->local_port = c3cn->saddr.sin_port;
> +	req->peer_port = c3cn->daddr.sin_port;
> +	req->local_ip = c3cn->saddr.sin_addr.s_addr;
> +	req->peer_ip = c3cn->daddr.sin_addr.s_addr;
> +	req->opt0h = htonl(calc_opt0h(c3cn) | V_L2T_IDX(e->idx) |
> +			   V_TX_CHANNEL(e->smt_idx));
> +	req->opt0l = htonl(calc_opt0l(c3cn));
> +	req->params = 0;
> +}
> +
> +static inline void s3_purge_write_queue(struct s3_conn *c3cn)
> +{
> +	struct sk_buff *skb;
> +
> +	while ((skb = __skb_dequeue(&c3cn->write_queue))) {
> +		__kfree_skb(skb);
> +	}
> +}
> +
> +/*
> + * Definitions and declarations for CPL handler functions.
> + * =======================================================
> + */
> +
> +/*
> + * Similar to process_cpl_msg() but takes an extra connection reference around
> + * the call to the handler.  Should be used if the handler may drop a
> + * connection reference.
> + */
> +static inline void process_cpl_msg_ref(void (*fn) (struct s3_conn *,
> +						   struct sk_buff *),
> +				       struct s3_conn *c3cn,
> +				       struct sk_buff *skb)
> +{
> +	c3cn_hold(c3cn);
> +	process_cpl_msg(fn, c3cn, skb);
> +	c3cn_put(c3cn);
> +}
> +
> +/*
> + * Return whether a failed active open has allocated a TID
> + */
> +static inline int act_open_has_tid(int status)
> +{
> +	return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
> +	    status != CPL_ERR_ARP_MISS;
> +}
> +
> +/*
> + * Returns true if a connection cannot accept new Rx data.
> + */
> +static inline int c3cn_no_receive(const struct s3_conn *c3cn)
> +{
> +	return (c3cn->shutdown & C3CN_RCV_SHUTDOWN);
> +}
> +
> +/*
> + * A helper function that aborts a connection and increments the given MIB
> + * counter.  The supplied skb is used to generate the ABORT_REQ message if
> + * possible.  Must be called with softirqs disabled.
> + */
> +static inline void abort_conn(struct s3_conn *c3cn,
> +			      struct sk_buff *skb)
> +{
> +	struct sk_buff *abort_skb;
> +
> +	abort_skb = __get_cpl_reply_skb(skb, sizeof(struct cpl_abort_req),
> +					GFP_ATOMIC);
> +	if (abort_skb)
> +		s3_send_reset(c3cn, CPL_ABORT_SEND_RST, abort_skb);
> +}
> +
> +/*
> + * Returns whether an ABORT_REQ_RSS message is a negative advice.
> + */
> +static inline int is_neg_adv_abort(unsigned int status)
> +{
> +	return (status == CPL_ERR_RTX_NEG_ADVICE
> +		|| status == CPL_ERR_PERSIST_NEG_ADVICE);
> +}
> +
> +/*
> + * CPL handler functions.
> + * ======================
> + */
> +
> +/*
> + * Process a CPL_ACT_ESTABLISH message.
> + */
> +static int do_act_establish(struct t3cdev *cdev, struct sk_buff *skb,
> +			    void *ctx)
> +{
> +	struct cpl_act_establish *req = cplhdr(skb);
> +	unsigned int tid = GET_TID(req);
> +	unsigned int atid = G_PASS_OPEN_TID(ntohl(req->tos_tid));
> +	struct s3_conn *c3cn = (struct s3_conn *)ctx;
> +	struct cxgb3i_sdev_data *cdata = CXGB3_SDEV_DATA(cdev);
> +
> +	c3cn_conn_debug("c3cn 0x%p, tid 0x%x.\n", c3cn, tid);
> +	/*
> +	 * It's OK if the TID is currently in use, the owning connection may
> +	 * have backlogged its last CPL message(s).  Just take it away.
> +	 */
> +	c3cn->tid = tid;
> +	c3cn_insert_tid(cdata, c3cn, tid);
> +	free_atid(cdev, atid);
> +
> +	c3cn->qset = G_QNUM(ntohl(skb->csum));
> +
> +	process_cpl_msg(c3cn_act_establish, c3cn, skb);
> +	return 0;
> +}
> +
> +/*
> + * Process an ACT_OPEN_RPL CPL message.
> + */
> +static int do_act_open_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
> +{
> +	struct s3_conn *c3cn = (struct s3_conn *)ctx;
> +	struct cpl_act_open_rpl *rpl = cplhdr(skb);
> +
> +	c3cn_conn_debug("c3cn 0x%p, status 0x%x.\n", c3cn, rpl->status);
> +
> +	if (act_open_has_tid(rpl->status))
> +		cxgb3_queue_tid_release(cdev, GET_TID(rpl));
> +
> +	process_cpl_msg_ref(active_open_failed, c3cn, skb);
> +	return 0;
> +}
> +
> +/*
> + * Handler RX_ISCSI_HDR CPL messages.
> + */
> +static int do_iscsi_hdr(struct t3cdev *t3dev, struct sk_buff *skb, void *ctx)
> +{
> +	struct s3_conn *c3cn = (struct s3_conn *)ctx;
> +	process_cpl_msg(process_rx_iscsi_hdr, c3cn, skb);
> +	return 0;
> +}
> +
> +/*
> + * Handler for TX_DATA_ACK CPL messages.
> + */
> +static int do_wr_ack(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
> +{
> +	struct s3_conn *c3cn = (struct s3_conn *)ctx;
> +
> +	process_cpl_msg(wr_ack, c3cn, skb);
> +	return 0;
> +}
> +
> +/*
> + * Handler for PEER_CLOSE CPL messages.
> + */
> +static int do_peer_close(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
> +{
> +	struct s3_conn *c3cn = (struct s3_conn *)ctx;
> +
> +	c3cn_conn_debug("c3cn 0x%p, state 0x%x, flag 0x%lx.\n",
> +			 c3cn, c3cn->state, c3cn->flags);
> +	process_cpl_msg_ref(do_peer_fin, c3cn, skb);
> +	return 0;
> +}
> +
> +/*
> + * Handle an ABORT_REQ_RSS CPL message.
> + */
> +static int do_abort_req(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
> +{
> +	const struct cpl_abort_req_rss *req = cplhdr(skb);
> +	struct s3_conn *c3cn = (struct s3_conn *)ctx;
> +
> +	if (is_neg_adv_abort(req->status)) {
> +		__kfree_skb(skb);
> +		return 0;
> +	}
> +
> +	c3cn_conn_debug("c3cn 0x%p, state 0x%x, flag 0x%lx.\n",
> +			 c3cn, c3cn->state, c3cn->flags);
> +
> +	process_cpl_msg_ref(process_abort_req, c3cn, skb);
> +	return 0;
> +}
> +
> +/*
> + * Handle an ABORT_RPL_RSS CPL message.
> + */
> +static int do_abort_rpl(struct t3cdev *cdev, struct sk_buff *skb, void *ctx)
> +{
> +	struct cpl_abort_rpl_rss *rpl = cplhdr(skb);
> +	struct s3_conn *c3cn;
> +
> +	/*
> +	 * Ignore replies to post-close aborts indicating that the abort was
> +	 * requested too late.  These connections are terminated when we get
> +	 * PEER_CLOSE or CLOSE_CON_RPL and by the time the abort_rpl_rss
> +	 * arrives the TID is either no longer used or it has been recycled.
> +	 */
> +	if (rpl->status == CPL_ERR_ABORT_FAILED) {
> +discard:
> +		__kfree_skb(skb);
> +		return 0;
> +	}
> +
> +	c3cn = (struct s3_conn *)ctx;
> +	c3cn_conn_debug("c3cn 0x%p, state 0x%x, flag 0x%lx.\n",
> +			 c3cn, c3cn->state, c3cn->flags);
> +
> +	/*
> +	 * Sometimes we've already closed the connection, e.g., a post-close
> +	 * abort races with ABORT_REQ_RSS, the latter frees the connection
> +	 * expecting the ABORT_REQ will fail with CPL_ERR_ABORT_FAILED,
> +	 * but FW turns the ABORT_REQ into a regular one and so we get
> +	 * ABORT_RPL_RSS with status 0 and no connection.  Only on T3A.
> +	 */
> +	if (!c3cn)
> +		goto discard;
> +
> +	process_cpl_msg_ref(process_abort_rpl, c3cn, skb);
> +	return 0;
> +}
> +
> +/*
> + * Handler for CLOSE_CON_RPL CPL messages.
> + */
> +static int do_close_con_rpl(struct t3cdev *cdev, struct sk_buff *skb,
> +			    void *ctx)
> +{
> +	struct s3_conn *c3cn = (struct s3_conn *)ctx;
> +
> +	c3cn_conn_debug("c3cn 0x%p, state 0x%x, flag 0x%lx.\n",
> +			 c3cn, c3cn->state, c3cn->flags);
> +
> +	process_cpl_msg_ref(process_close_con_rpl, c3cn, skb);
> +	return 0;
> +}
> +
> +/*
> + * Definitions and declarations for CPL message processing.
> + * ========================================================
> + */
> +
> +static void make_established(struct s3_conn *, u32, unsigned int);
> +static void t3_release_offload_resources(struct s3_conn *);
> +static void act_open_retry_timer(unsigned long);
> +static void mk_act_open_req(struct s3_conn *, struct sk_buff *,
> +			    unsigned int, const struct l2t_entry *);
> +static int act_open_rpl_status_to_errno(int);
> +static void handle_excess_rx(struct s3_conn *, struct sk_buff *);
> +static int abort_status_to_errno(struct s3_conn *, int, int *);
> +static void send_abort_rpl(struct sk_buff *, struct t3cdev *, int);
> +static struct sk_buff *get_cpl_reply_skb(struct sk_buff *, size_t, gfp_t);
> +static void t3_defer_reply(struct sk_buff *, struct t3cdev *, defer_handler_t);
> +static void send_deferred_abort_rpl(struct t3cdev *, struct sk_buff *);
> +
> +/*
> + * Dequeue and return the first unacknowledged's WR on a connections's pending
> + * list.
> + */
> +static inline struct sk_buff *dequeue_wr(struct s3_conn *c3cn)
> +{
> +	struct sk_buff *skb = c3cn->wr_pending_head;
> +
> +	if (likely(skb)) {
> +		/* Don't bother clearing the tail */
> +		c3cn->wr_pending_head = (struct sk_buff *)skb->sp;
> +		skb->sp = NULL;
> +	}
> +	return skb;
> +}
> +
> +/*
> + * Return the first pending WR without removing it from the list.
> + */
> +static inline struct sk_buff *peek_wr(const struct s3_conn *c3cn)
> +{
> +	return c3cn->wr_pending_head;
> +}
> +
> +static inline void free_wr_skb(struct sk_buff *skb)
> +{
> +	kfree_skb(skb);
> +}
> +
> +static void purge_wr_queue(struct s3_conn *c3cn)
> +{
> +	struct sk_buff *skb;
> +	while ((skb = dequeue_wr(c3cn)) != NULL)
> +		free_wr_skb(skb);
> +}
> +
> +static inline void set_abort_rpl_wr(struct sk_buff *skb, unsigned int tid,
> +				    int cmd)
> +{
> +	struct cpl_abort_rpl *rpl = cplhdr(skb);
> +
> +	rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
> +	rpl->wr.wr_lo = htonl(V_WR_TID(tid));
> +	OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, tid));
> +	rpl->cmd = cmd;
> +}
> +
> +/*
> + * CPL message processing ...
> + * ==========================
> + */
> +
> +/*
> + * Updates connection state from an active establish CPL message.  Runs with
> + * the connection lock held.
> + */
> +static void c3cn_act_establish(struct s3_conn *c3cn,
> +			       struct sk_buff *skb)
> +{
> +	struct cpl_act_establish *req = cplhdr(skb);
> +	u32 rcv_isn = ntohl(req->rcv_isn);	/* real RCV_ISN + 1 */
> +
> +	if (unlikely(c3cn->state != C3CN_STATE_SYN_SENT))
> +		printk(KERN_ERR "TID %u expected SYN_SENT, found %d\n",
> +		       c3cn->tid, c3cn->state);
> +
> +	c3cn->copied_seq = c3cn->rcv_wup = c3cn->rcv_nxt = rcv_isn;
> +	make_established(c3cn, ntohl(req->snd_isn), ntohs(req->tcp_opt));
> +
> +	__kfree_skb(skb);
> +
> +	if (s3_push_frames(c3cn, 1))
> +		cxgb3i_conn_tx_open(c3cn);
> +}
> +
> +/*
> + * Handle active open failures.
> + */
> +static void active_open_failed(struct s3_conn *c3cn,
> +			       struct sk_buff *skb)
> +{
> +	struct cpl_act_open_rpl *rpl = cplhdr(skb);
> +
> +	if (rpl->status == CPL_ERR_CONN_EXIST &&
> +	    c3cn->retry_timer.function != act_open_retry_timer) {
> +		c3cn->retry_timer.function = act_open_retry_timer;
> +		c3cn_reset_timer(c3cn, &c3cn->retry_timer,
> +				 jiffies + HZ / 2);
> +	} else
> +		fail_act_open(c3cn, act_open_rpl_status_to_errno(rpl->status));
> +	__kfree_skb(skb);
> +}
> +
> +/*
> + * Process received pdu for a connection.
> + */
> +static void process_rx_iscsi_hdr(struct s3_conn *c3cn,
> +				 struct sk_buff *skb)
> +{
> +	struct cpl_iscsi_hdr *hdr_cpl = cplhdr(skb);
> +	struct cpl_iscsi_hdr_norss data_cpl;
> +	struct cpl_rx_data_ddp_norss ddp_cpl;
> +	unsigned int hdr_len, data_len, status;
> +	unsigned int len;
> +	int err;
> +
> +	if (unlikely(c3cn_no_receive(c3cn))) {
> +		handle_excess_rx(c3cn, skb);
> +		return;
> +	}
> +
> +	CXGB3_SKB_CB(skb)->seq = ntohl(hdr_cpl->seq);
> +	CXGB3_SKB_CB(skb)->flags = 0;
> +
> +	skb_reset_transport_header(skb);
> +	__skb_pull(skb, sizeof(struct cpl_iscsi_hdr));
> +
> +	len = hdr_len = ntohs(hdr_cpl->len);
> +	/* msg coalesce is off or not enough data received */
> +	if (skb->len <= hdr_len) {
> +		printk(KERN_ERR "%s: TID %u, ISCSI_HDR, skb len %u < %u.\n",
> +		       c3cn->cdev->name, c3cn->tid, skb->len, hdr_len);
> +		goto abort_conn;
> +	}
> +
> +	err = skb_copy_bits(skb, skb->len - sizeof(ddp_cpl), &ddp_cpl,
> +			    sizeof(ddp_cpl));
> +	if (err < 0)
> +		goto abort_conn;
> +
> +	skb_ulp_mode(skb) = ULP2_FLAG_DATA_READY;
> +	skb_ulp_pdulen(skb) = ntohs(ddp_cpl.len);
> +	skb_ulp_ddigest(skb) = ntohl(ddp_cpl.ulp_crc);
> +	status = ntohl(ddp_cpl.ddp_status);
> +
> +	if (status & (1 << RX_DDP_STATUS_HCRC_SHIFT))
> +		skb_ulp_mode(skb) |= ULP2_FLAG_HCRC_ERROR;
> +	if (status & (1 << RX_DDP_STATUS_DCRC_SHIFT))
> +		skb_ulp_mode(skb) |= ULP2_FLAG_DCRC_ERROR;
> +	if (status & (1 << RX_DDP_STATUS_PAD_SHIFT))
> +		skb_ulp_mode(skb) |= ULP2_FLAG_PAD_ERROR;
> +
> +	if (skb->len > (hdr_len + sizeof(ddp_cpl))) {
> +		err = skb_copy_bits(skb, hdr_len, &data_cpl, sizeof(data_cpl));
> +		if (err < 0)
> +			goto abort_conn;
> +		data_len = ntohs(data_cpl.len);
> +		len += sizeof(data_cpl) + data_len;
> +	} else if (status & (1 << RX_DDP_STATUS_DDP_SHIFT))
> +		skb_ulp_mode(skb) |= ULP2_FLAG_DATA_DDPED;
> +
> +	c3cn->rcv_nxt = ntohl(ddp_cpl.seq) + skb_ulp_pdulen(skb);
> +	__pskb_trim(skb, len);
> +	__skb_queue_tail(&c3cn->receive_queue, skb);
> +	cxgb3i_conn_pdu_ready(c3cn);
> +
> +	return;
> +
> +abort_conn:
> +	s3_send_reset(c3cn, CPL_ABORT_SEND_RST, NULL);
> +	__kfree_skb(skb);
> +}
> +
> +/*
> + * Process an acknowledgment of WR completion.  Advance snd_una and send the
> + * next batch of work requests from the write queue.
> + */
> +static void wr_ack(struct s3_conn *c3cn, struct sk_buff *skb)
> +{
> +	struct cpl_wr_ack *hdr = cplhdr(skb);
> +	unsigned int credits = ntohs(hdr->credits);
> +	u32 snd_una = ntohl(hdr->snd_una);
> +
> +	c3cn->wr_avail += credits;
> +	if (c3cn->wr_unacked > c3cn->wr_max - c3cn->wr_avail)
> +		c3cn->wr_unacked = c3cn->wr_max - c3cn->wr_avail;
> +
> +	while (credits) {
> +		struct sk_buff *p = peek_wr(c3cn);
> +
> +		if (unlikely(!p)) {
> +			printk(KERN_ERR "%u WR_ACK credits for TID %u with "
> +			       "nothing pending, state %u\n",
> +			       credits, c3cn->tid, c3cn->state);
> +			break;
> +		}
> +		if (unlikely(credits < p->csum)) {
> +			p->csum -= credits;
> +			break;
> +		} else {
> +			dequeue_wr(c3cn);
> +			credits -= p->csum;
> +			free_wr_skb(p);
> +		}
> +	}
> +
> +	if (unlikely(before(snd_una, c3cn->snd_una))) {
> +		goto out_free;
> +	}
> +
> +	if (c3cn->snd_una != snd_una) {
> +		c3cn->snd_una = snd_una;
> +		dst_confirm(c3cn->dst_cache);
> +		if (c3cn->snd_una == c3cn->snd_nxt)
> +			c3cn_reset_flag(c3cn, C3CN_TX_WAIT_IDLE);
> +	}
> +
> +	if (skb_queue_len(&c3cn->write_queue) && s3_push_frames(c3cn, 0))
> +		cxgb3i_conn_tx_open(c3cn);
> +out_free:
> +	__kfree_skb(skb);
> +}
> +
> +/*
> + * Handle a peer FIN.
> + */
> +static void do_peer_fin(struct s3_conn *c3cn, struct sk_buff *skb)
> +{
> +	int keep = 0;
> +
> +	if (c3cn_flag(c3cn, C3CN_ABORT_RPL_PENDING))
> +		goto out;
> +
> +	c3cn->shutdown |= C3CN_RCV_SHUTDOWN;
> +	c3cn_set_flag(c3cn, C3CN_DONE);
> +
> +	switch (c3cn->state) {
> +	case C3CN_STATE_ESTABLISHED:
> +		break;
> +	case C3CN_STATE_CLOSING:
> +		t3_release_offload_resources(c3cn);
> +		c3cn_done(c3cn);
> +		break;
> +	default:
> +		printk(KERN_ERR
> +		       "%s: TID %u received PEER_CLOSE in bad state %d\n",
> +		       c3cn->cdev->name, c3cn->tid, c3cn->state);
> +	}
> +	
> +	cxgb3i_conn_closing(c3cn);
> +out:
> +	if (!keep)
> +		__kfree_skb(skb);
> +}
> +
> +/*
> + * Process abort requests.  If we are waiting for an ABORT_RPL we ignore this
> + * request except that we need to reply to it.
> + */
> +static void process_abort_req(struct s3_conn *c3cn,
> +			      struct sk_buff *skb)
> +{
> +	int rst_status = CPL_ABORT_NO_RST;
> +	const struct cpl_abort_req_rss *req = cplhdr(skb);
> +
> +	if (!c3cn_flag(c3cn, C3CN_ABORT_REQ_RCVD)) {
> +		c3cn_set_flag(c3cn, C3CN_ABORT_REQ_RCVD);
> +		c3cn_set_flag(c3cn, C3CN_ABORT_SHUTDOWN);
> +		__kfree_skb(skb);
> +		return;
> +	}
> +	c3cn_reset_flag(c3cn, C3CN_ABORT_REQ_RCVD);
> +
> +	/*
> +	 * Three cases to consider:
> +	 * a) We haven't sent an abort_req; close the connection.
> +	 * b) We have sent a post-close abort_req that will get to TP too late
> +	 *    and will generate a CPL_ERR_ABORT_FAILED reply.  The reply will
> +	 *    be ignored and the connection should be closed now.
> +	 * c) We have sent a regular abort_req that will get to TP too late.
> +	 *    That will generate an abort_rpl with status 0, wait for it.
> +	 */
> +	send_abort_rpl(skb, c3cn->cdev, rst_status);
> +
> +	if (!c3cn_flag(c3cn, C3CN_ABORT_RPL_PENDING)) {
> +		c3cn->err =
> +		    abort_status_to_errno(c3cn, req->status, &rst_status);
> +
> +		t3_release_offload_resources(c3cn);
> +		c3cn_done(c3cn);
> +	}
> +}
> +
> +/*
> + * Process abort replies.  We only process these messages if we anticipate
> + * them as the coordination between SW and HW in this area is somewhat lacking
> + * and sometimes we get ABORT_RPLs after we are done with the connection that
> + * originated the ABORT_REQ.
> + */
> +static void process_abort_rpl(struct s3_conn *c3cn,
> +			      struct sk_buff *skb)
> +{
> +	if (c3cn_flag(c3cn, C3CN_ABORT_RPL_PENDING)) {
> +		if (!c3cn_flag(c3cn, C3CN_ABORT_RPL_RCVD))
> +			c3cn_set_flag(c3cn, C3CN_ABORT_RPL_RCVD);
> +		else {
> +			c3cn_reset_flag(c3cn, C3CN_ABORT_RPL_RCVD);
> +			c3cn_reset_flag(c3cn, C3CN_ABORT_RPL_PENDING);
> +			BUG_ON(c3cn_flag(c3cn, C3CN_ABORT_REQ_RCVD));
> +			t3_release_offload_resources(c3cn);
> +			c3cn_done(c3cn);
> +		}
> +	}
> +	__kfree_skb(skb);
> +}
> +
> +/*
> + * Process a peer ACK to our FIN.
> + */
> +static void process_close_con_rpl(struct s3_conn *c3cn,
> +				  struct sk_buff *skb)
> +{
> +	struct cpl_close_con_rpl *rpl = cplhdr(skb);
> +
> +	c3cn->snd_una = ntohl(rpl->snd_nxt) - 1;	/* exclude FIN */
> +
> +	if (c3cn_flag(c3cn, C3CN_ABORT_RPL_PENDING))
> +		goto out;
> +
> +	if (c3cn->state == C3CN_STATE_CLOSING) {
> +		t3_release_offload_resources(c3cn);
> +		c3cn_done(c3cn);
> +	} else 
> +		printk(KERN_ERR
> +		       "%s: TID %u received CLOSE_CON_RPL in bad state %d\n",
> +		       c3cn->cdev->name, c3cn->tid, c3cn->state);
> +out:
> +	kfree_skb(skb);
> +}
> +
> +/*
> + * Random utility functions for CPL message processing ...
> + * =======================================================
> + */
> +
> +/**
> + *	find_best_mtu - find the entry in the MTU table closest to an MTU
> + *	@d: TOM state
> + *	@mtu: the target MTU
> + *
> + *	Returns the index of the value in the MTU table that is closest to but
> + *	does not exceed the target MTU.
> + */
> +static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu)
> +{
> +	int i = 0;
> +
> +	while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
> +		++i;
> +	return i;
> +}
> +
> +static unsigned int select_mss(struct s3_conn *c3cn, unsigned int pmtu)
> +{
> +	unsigned int idx;
> +	struct dst_entry *dst = c3cn->dst_cache;
> +	struct t3cdev *cdev = c3cn->cdev;
> +	const struct t3c_data *td = T3C_DATA(cdev);
> +	u16 advmss = dst_metric(dst, RTAX_ADVMSS);
> +
> +	if (advmss > pmtu - 40)
> +		advmss = pmtu - 40;
> +	if (advmss < td->mtus[0] - 40)
> +		advmss = td->mtus[0] - 40;
> +	idx = find_best_mtu(td, advmss + 40);
> +	return idx;
> +}
> +
> +static void fail_act_open(struct s3_conn *c3cn, int errno)
> +{
> +	c3cn->err = errno;
> +	t3_release_offload_resources(c3cn);
> +	c3cn_done(c3cn);
> +}
> +
> +/*
> + * Assign offload parameters to some connection fields.
> + */
> +static void init_offload_conn(struct s3_conn *c3cn,
> +			      struct t3cdev *cdev,
> +			      struct dst_entry *dst)
> +{
> +	BUG_ON(c3cn->cdev != cdev);
> +	c3cn->wr_max = c3cn->wr_avail = T3C_DATA(cdev)->max_wrs;
> +	c3cn->wr_unacked = 0;
> +	c3cn->mss_idx = select_mss(c3cn, dst_mtu(dst));
> +
> +	c3cn->ctrl_skb_cache = alloc_skb(CTRL_SKB_LEN, gfp_any());
> +	reset_wr_list(c3cn);
> +}
> +
> +static void act_open_retry_timer(unsigned long data)
> +{
> +	struct sk_buff *skb;
> +	struct s3_conn *c3cn = (struct s3_conn *)data;
> +
> +	spin_lock(&c3cn->lock);
> +	skb = alloc_skb(sizeof(struct cpl_act_open_req), GFP_ATOMIC);
> +	if (!skb)
> +		fail_act_open(c3cn, ENOMEM);
> +	else {
> +		skb->sk = (struct sock *)c3cn;
> +		set_arp_failure_handler(skb, act_open_req_arp_failure);
> +		mk_act_open_req(c3cn, skb, c3cn->tid, c3cn->l2t);
> +		l2t_send(c3cn->cdev, skb, c3cn->l2t);
> +	}
> +	spin_unlock(&c3cn->lock);
> +	c3cn_put(c3cn);
> +}
> +
> +/*
> + * Convert an ACT_OPEN_RPL status to a Linux errno.
> + */
> +static int act_open_rpl_status_to_errno(int status)
> +{
> +	switch (status) {
> +	case CPL_ERR_CONN_RESET:
> +		return ECONNREFUSED;
> +	case CPL_ERR_ARP_MISS:
> +		return EHOSTUNREACH;
> +	case CPL_ERR_CONN_TIMEDOUT:
> +		return ETIMEDOUT;
> +	case CPL_ERR_TCAM_FULL:
> +		return ENOMEM;
> +	case CPL_ERR_CONN_EXIST:
> +		printk(KERN_ERR "ACTIVE_OPEN_RPL: 4-tuple in use\n");
> +		return EADDRINUSE;
> +	default:
> +		return EIO;
> +	}
> +}
> +
> +/*
> + * Convert the status code of an ABORT_REQ into a Linux error code.  Also
> + * indicate whether RST should be sent in response.
> + */
> +static int abort_status_to_errno(struct s3_conn *c3cn,
> +				 int abort_reason, int *need_rst)
> +{
> +	switch (abort_reason) {
> +	case CPL_ERR_BAD_SYN: /* fall through */
> +	case CPL_ERR_CONN_RESET:
> +		return c3cn->state == C3CN_STATE_CLOSING ? EPIPE : ECONNRESET;
> +	case CPL_ERR_XMIT_TIMEDOUT:
> +	case CPL_ERR_PERSIST_TIMEDOUT:
> +	case CPL_ERR_FINWAIT2_TIMEDOUT:
> +	case CPL_ERR_KEEPALIVE_TIMEDOUT:
> +		return ETIMEDOUT;
> +	default:
> +		return EIO;
> +	}
> +}
> +
> +static void send_abort_rpl(struct sk_buff *skb, struct t3cdev *cdev,
> +			   int rst_status)
> +{
> +	struct sk_buff *reply_skb;
> +	struct cpl_abort_req_rss *req = cplhdr(skb);
> +
> +	reply_skb = get_cpl_reply_skb(skb, sizeof(struct cpl_abort_rpl),
> +				      gfp_any());
> +	if (!reply_skb) {
> +		/* Defer the reply.  Stick rst_status into req->cmd. */
> +		req->status = rst_status;
> +		t3_defer_reply(skb, cdev, send_deferred_abort_rpl);
> +		return;
> +	}
> +
> +	reply_skb->priority = CPL_PRIORITY_DATA;
> +	set_abort_rpl_wr(reply_skb, GET_TID(req), rst_status);
> +	kfree_skb(skb);
> +	cxgb3_ofld_send(cdev, reply_skb);
> +}
> +
> +/*
> + * Returns an sk_buff for a reply CPL message of size len.  If the input
> + * sk_buff has no other users it is trimmed and reused, otherwise a new buffer
> + * is allocated.  The input skb must be of size at least len.  Note that this
> + * operation does not destroy the original skb data even if it decides to reuse
> + * the buffer.
> + */
> +static struct sk_buff *get_cpl_reply_skb(struct sk_buff *skb, size_t len,
> +					 gfp_t gfp)
> +{
> +	if (likely(!skb_cloned(skb))) {
> +		BUG_ON(skb->len < len);
> +		__skb_trim(skb, len);
> +		skb_get(skb);
> +	} else {
> +		skb = alloc_skb(len, gfp);
> +		if (skb)
> +			__skb_put(skb, len);
> +	}
> +	return skb;
> +}
> +
> +/*
> + * Add an skb to the deferred skb queue for processing from process context.
> + */
> +static void t3_defer_reply(struct sk_buff *skb, struct t3cdev *cdev,
> +			   defer_handler_t handler)
> +{
> +	struct cxgb3i_sdev_data *cdata = CXGB3_SDEV_DATA(cdev);
> +
> +	DEFERRED_SKB_CB(skb)->handler = handler;
> +	spin_lock_bh(&cdata->deferq.lock);
> +	__skb_queue_tail(&cdata->deferq, skb);
> +	if (skb_queue_len(&cdata->deferq) == 1)
> +		schedule_work(&cdata->deferq_task);
> +	spin_unlock_bh(&cdata->deferq.lock);
> +}
> +
> +/*
> + * Process the defer queue.
> + */
> +static void process_deferq(struct work_struct *task_param)
> +{
> +	struct sk_buff *skb;
> +	struct cxgb3i_sdev_data *cdata = container_of(task_param,
> +						     struct cxgb3i_sdev_data,
> +						     deferq_task);
> +
> +	spin_lock_bh(&cdata->deferq.lock);
> +	while ((skb = __skb_dequeue(&cdata->deferq)) != NULL) {
> +		spin_unlock_bh(&cdata->deferq.lock);
> +		DEFERRED_SKB_CB(skb)->handler(cdata->cdev, skb);
> +		spin_lock_bh(&cdata->deferq.lock);
> +	}
> +	spin_unlock_bh(&cdata->deferq.lock);
> +}
> +
> +static void send_deferred_abort_rpl(struct t3cdev *cdev, struct sk_buff *skb)
> +{
> +	struct sk_buff *reply_skb;
> +	struct cpl_abort_req_rss *req = cplhdr(skb);
> +
> +	reply_skb = alloc_skb(sizeof(struct cpl_abort_rpl),
> +			      GFP_KERNEL | __GFP_NOFAIL);
> +	reply_skb->priority = CPL_PRIORITY_DATA;
> +	__skb_put(reply_skb, sizeof(struct cpl_abort_rpl));
> +	set_abort_rpl_wr(reply_skb, GET_TID(req), req->status);
> +	cxgb3_ofld_send(cdev, reply_skb);
> +	kfree_skb(skb);
> +}
> +
> +/*
> + * Release resources held by an offload connection (TID, L2T entry, etc.)
> + */
> +static void t3_release_offload_resources(struct s3_conn *c3cn)
> +{
> +	struct t3cdev *cdev = c3cn->cdev;
> +	unsigned int tid = c3cn->tid;
> +
> +	if (!cdev)
> +		return;
> +
> +	c3cn->qset = 0;
> +
> +	kfree_skb(c3cn->ctrl_skb_cache);
> +	c3cn->ctrl_skb_cache = NULL;
> +
> +	if (c3cn->wr_avail != c3cn->wr_max) {
> +		purge_wr_queue(c3cn);
> +		reset_wr_list(c3cn);
> +	}
> +
> +	if (c3cn->l2t) {
> +		l2t_release(L2DATA(cdev), c3cn->l2t);
> +		c3cn->l2t = NULL;
> +	}
> +
> +	if (c3cn->state == C3CN_STATE_SYN_SENT) /* we have ATID */
> +		free_atid(cdev, tid);
> +	else {		/* we have TID */
> +		cxgb3_remove_tid(cdev, (void *)c3cn, tid);
> +		c3cn_put(c3cn);
> +	}
> +
> +	c3cn->cdev = NULL;
> +}
> +
> +/*
> + * Handles Rx data that arrives in a state where the connection isn't
> + * accepting new data.
> + */
> +static void handle_excess_rx(struct s3_conn *c3cn, struct sk_buff *skb)
> +{
> +	if (!c3cn_flag(c3cn, C3CN_ABORT_SHUTDOWN))
> +		abort_conn(c3cn, skb);
> +
> +	kfree_skb(skb);
> +}
> +
> +/*
> + * Like get_cpl_reply_skb() but the returned buffer starts out empty.
> + */
> +static struct sk_buff *__get_cpl_reply_skb(struct sk_buff *skb, size_t len,
> +					   gfp_t gfp)
> +{
> +	if (likely(!skb_cloned(skb) && !skb->data_len)) {
> +		__skb_trim(skb, 0);
> +		skb_get(skb);
> +	} else
> +		skb = alloc_skb(len, gfp);
> +	return skb;
> +}
> +
> +/*
> + * Completes some final bits of initialization for just established connections
> + * and changes their state to C3CN_STATE_ESTABLISHED.
> + *
> + * snd_isn here is the ISN after the SYN, i.e., the true ISN + 1.
> + */
> +static void make_established(struct s3_conn *c3cn, u32 snd_isn,
> +			     unsigned int opt)
> +{
> +	c3cn->write_seq = c3cn->snd_nxt = c3cn->snd_una = snd_isn;
> +
> +	/*
> +	 * Causes the first RX_DATA_ACK to supply any Rx credits we couldn't
> +	 * pass through opt0.
> +	 */
> +	if (rcv_win > (M_RCV_BUFSIZ << 10))
> +		c3cn->rcv_wup -= rcv_win - (M_RCV_BUFSIZ << 10);
> +
> +	dst_confirm(c3cn->dst_cache);
> +
> +	smp_mb();
> +	c3cn_set_state(c3cn, C3CN_STATE_ESTABLISHED);
> +}
> diff --git a/drivers/scsi/cxgb3i/cxgb3i_offload.h b/drivers/scsi/cxgb3i/cxgb3i_offload.h
> new file mode 100644
> index 0000000..98d5c7d
> --- /dev/null
> +++ b/drivers/scsi/cxgb3i/cxgb3i_offload.h
> @@ -0,0 +1,242 @@
> +/*
> + * Copyright (C) 2003-2008 Chelsio Communications.  All rights reserved.
> + *
> + * Written by Dimitris Michailidis (dm@chelsio.com)
> + *
> + * This program is distributed in the hope that it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the LICENSE file included in this
> + * release for licensing terms and conditions.
> + */
> +
> +#ifndef _CXGB3I_OFFLOAD_H
> +#define _CXGB3I_OFFLOAD_H
> +
> +#include <linux/skbuff.h>
> +#include <net/tcp.h>
> +
> +#include "t3cdev.h"
> +#include "cxgb3_offload.h"
> +
> +#define cxgb3i_log_error(fmt...) printk(KERN_ERR "cxgb3i: ERR! " fmt)
> +#define cxgb3i_log_warn(fmt...)	 printk(KERN_WARNING "cxgb3i: WARN! " fmt)
> +#define cxgb3i_log_info(fmt...)  printk(KERN_INFO "cxgb3i: " fmt)
> +
> +#ifdef __DEBUG_CXGB3I__
> +#define cxgb3i_log_debug(fmt, args...) \
> +        printk(KERN_ERR "cxgb3i: %s - " fmt, __func__ , ## args)
> +#else
> +#define cxgb3i_log_debug(fmt...)
> +#endif
> +
> +/*
> + * Data structure to keep track of cxgb3 connection.
> + */
> +struct s3_conn {
> +	struct net_device *dev;
> +	struct t3cdev *cdev;
> +	unsigned long flags;
> +	int tid;
> +	int qset;
> +	int mss_idx;
> +	struct l2t_entry *l2t;
> +	int wr_max;
> +	int wr_avail;
> +	int wr_unacked;
> +	struct sk_buff *wr_pending_head;
> +	struct sk_buff *wr_pending_tail;
> +	struct sk_buff *ctrl_skb_cache;
> +
> +	spinlock_t lock;
> +	atomic_t refcnt;
> +	volatile unsigned int state;
> +	struct sockaddr_in saddr;
> +	struct sockaddr_in daddr;
> +	struct dst_entry *dst_cache;
> +	unsigned char shutdown;
> +	struct sk_buff_head receive_queue;
> +	struct sk_buff_head write_queue;
> +	struct timer_list retry_timer;
> +	int err;
> +	rwlock_t callback_lock;
> +	void *user_data;
> +
> +	u32 rcv_nxt;		/* What we want to receive next		*/
> +	u32 copied_seq;		/* Head of yet unread data		*/
> +	u32 rcv_wup;		/* rcv_nxt on last window update sent	*/
> +	u32 snd_nxt;		/* Next sequence we send		*/
> +	u32 snd_una;		/* First byte we want an ack for	*/
> +
> +	u32 write_seq;		/* Tail(+1) of data held in send buffer */
> +};
> +
> +/* Flags in c3cn->shutdown */
> +#define C3CN_SHUTDOWN_MASK	3
> +#define C3CN_RCV_SHUTDOWN	1
> +#define C3CN_SEND_SHUTDOWN	2
> +
> +/*
> + * connection state bitmap
> + */
> +#define C3CN_STATE_CLOSE	0x1
> +#define C3CN_STATE_SYN_SENT	0x2
> +#define C3CN_STATE_ESTABLISHED	0x4
> +#define C3CN_STATE_CLOSING	0x8
> +#define C3CN_STATE_ABORING	0x10
> +
> +#define C3CN_STATE_MASK		0xFF
> +#define C3CN_NEED_CLOSE		0x100
> +
> +/*
> + * Connection flags -- many to track some close related events.
> + */
> +enum c3cn_flags {
> +	C3CN_ABORT_RPL_RCVD,	/* received one ABORT_RPL_RSS message */
> +	C3CN_ABORT_REQ_RCVD,	/* received one ABORT_REQ_RSS message */
> +	C3CN_TX_WAIT_IDLE,	/* suspend Tx until in-flight data is ACKed */
> +	C3CN_ABORT_SHUTDOWN,	/* shouldn't send more abort requests */
> +	C3CN_ABORT_RPL_PENDING,	/* expecting an abort reply */
> +	C3CN_CLOSE_CON_REQUESTED,	/* we've sent a close_conn_req */
> +	C3CN_TX_DATA_SENT,	/* already sent a TX_DATA WR */
> +
> +	C3CN_DONE,
> +};
> +
> +static inline void c3cn_set_flag(struct s3_conn *c3cn,
> +				 enum c3cn_flags flag)
> +{
> +	__set_bit(flag, &c3cn->flags);
> +}
> +
> +static inline void c3cn_reset_flag(struct s3_conn *c3cn,
> +				   enum c3cn_flags flag)
> +{
> +	__clear_bit(flag, &c3cn->flags);
> +}
> +
> +static inline int c3cn_flag(struct s3_conn *c3cn, enum c3cn_flags flag)
> +{
> +	if (c3cn == NULL)
> +		return 0;
> +	return test_bit(flag, &c3cn->flags);
> +}
> +
> +/*
> + * Per adapter data.  Linked off of each Ethernet device port on the adapter.
> + * Also available via the t3cdev structure since we have pointers to our port
> + * net_device's there ...
> + */
> +struct cxgb3i_sdev_data {
> +	struct list_head list;
> +	struct t3cdev *cdev;
> +	struct cxgb3_client *client;
> +	struct adap_ports *ports;
> +	unsigned int rx_page_size;
> +	struct sk_buff_head deferq;
> +	struct work_struct deferq_task;
> +};
> +#define NDEV2CDATA(ndev) (*(struct cxgb3i_sdev_data **)&(ndev)->ec_ptr)
> +#define CXGB3_SDEV_DATA(cdev) NDEV2CDATA((cdev)->lldev)
> +
> +static inline void c3cn_hold(struct s3_conn *c3cn)
> +{
> +	atomic_inc(&c3cn->refcnt);
> +}
> +
> +static inline void c3cn_put(struct s3_conn *c3cn)
> +{
> +	if (atomic_dec_and_test(&c3cn->refcnt))
> +		kfree(c3cn);
> +}
> +
> +void c3cn_close(struct s3_conn *);
> +static inline void c3cn_release(struct s3_conn *c3cn)
> +{
> +	c3cn_close(c3cn);
> +	c3cn_put(c3cn);
> +}
> +
> +/*
> + * Primary API routines.
> + */
> +
> +int cxgb3i_sdev_init(cxgb3_cpl_handler_func *);
> +void cxgb3i_sdev_add(struct t3cdev *, struct cxgb3_client *);
> +void cxgb3i_sdev_remove(struct t3cdev *);
> +
> +struct s3_conn *cxgb3i_c3cn_create(void);
> +int cxgb3i_c3cn_connect(struct s3_conn *, struct sockaddr_in *);
> +void cxgb3i_c3cn_rx_credits(struct s3_conn *, int);
> +int cxgb3i_c3cn_send_pdus(struct s3_conn *, struct sk_buff *, int);
> +
> +/*
> + * Definitions for sk_buff state and ULP mode management.
> + */
> +
> +struct cxgb3_skb_cb {
> +	__u8 flags;		
> +	__u8 ulp_mode;		/* ULP mode/submode of sk_buff */
> +	__u32 seq;		/* sequence number */
> +	__u32 ddigest;		/* ULP rx_data_ddp selected field */
> +	__u32 pdulen;		/* ULP rx_data_ddp selected field */
> +	__u8 ulp_data[16];	/* scratch area for ULP */
> +};
> +
> +#define CXGB3_SKB_CB(skb)	((struct cxgb3_skb_cb *)&((skb)->cb[0]))
> +
> +#define skb_ulp_mode(skb)	(CXGB3_SKB_CB(skb)->ulp_mode)
> +#define skb_ulp_ddigest(skb)	(CXGB3_SKB_CB(skb)->ddigest)
> +#define skb_ulp_pdulen(skb)	(CXGB3_SKB_CB(skb)->pdulen)
> +#define skb_ulp_data(skb)	(CXGB3_SKB_CB(skb)->ulp_data)
> +
> +enum {
> +	C3CB_FLAG_NEED_HDR = 1 << 0,	/* packet needs a TX_DATA_WR header */
> +	C3CB_FLAG_NO_APPEND = 1 << 1,	/* don't grow this skb */
> +	C3CB_FLAG_BARRIER = 1 << 2,	/* set TX_WAIT_IDLE after sending */
> +	C3CB_FLAG_COMPL = 1 << 4,	/* request WR completion */
> +};
> +
> +/*
> + * Definitions for managing deferred CPL replies from process context.
> + */
> +
> +typedef void (*defer_handler_t) (struct t3cdev *, struct sk_buff *);
> +
> +struct deferred_skb_cb {
> +	defer_handler_t handler;
> +	struct t3cdev *cdev;
> +};
> +
> +#define DEFERRED_SKB_CB(skb) ((struct deferred_skb_cb *)(skb)->cb)
> +
> +/*
> + * Top-level CPL message processing used by most CPL messages that
> + * pertain to connections.
> + */
> +static inline void process_cpl_msg(void (*fn)(struct s3_conn *,
> +					      struct sk_buff *),
> +				   struct s3_conn *c3cn,
> +				   struct sk_buff *skb)
> +{
> +	spin_lock(&c3cn->lock);
> +	fn(c3cn, skb);
> +	spin_unlock(&c3cn->lock);
> +}
> +
> +/*
> + * Opaque version of structure the SGE stores at skb->head of TX_DATA packets
> + * and for which we must reserve space.
> + */
> +struct sge_opaque_hdr {
> +	void *dev;
> +	dma_addr_t addr[MAX_SKB_FRAGS + 1];
> +};
> +
> +/* for TX: a skb must have a headroom of at least TX_HEADER_LEN bytes */
> +#define TX_HEADER_LEN \
> +		(sizeof(struct tx_data_wr) + sizeof(struct sge_opaque_hdr))
> +
> +void *cxgb3i_alloc_big_mem(unsigned int);
> +void cxgb3i_free_big_mem(void *);
> +
> +#endif /* _CXGB3_OFFLOAD_H */
> diff --git a/drivers/scsi/cxgb3i/cxgb3i_ulp2.c b/drivers/scsi/cxgb3i/cxgb3i_ulp2.c
> new file mode 100644
> index 0000000..313bb90
> --- /dev/null
> +++ b/drivers/scsi/cxgb3i/cxgb3i_ulp2.c
> @@ -0,0 +1,692 @@
> +/*
> + * cxgb3i_ddp.c: Chelsio S3xx iSCSI driver.
> + *
> + * Copyright (c) 2008 Chelsio Communications, Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation.
> + *
> + * Written by: Karen Xie (kxie@chelsio.com)
> + */
> +
> +#include <linux/skbuff.h>
> +#include <scsi/scsi_cmnd.h>
> +#include <scsi/scsi_host.h>
> +#include <linux/crypto.h>
> +#include "../iscsi_tcp.h"
> +
> +#include "cxgb3i.h"
> +#include "cxgb3i_ulp2.h"
> +
> +#ifdef __DEBUG_CXGB3I_RX__
> +#define cxgb3i_rx_debug		cxgb3i_log_debug
> +#else
> +#define cxgb3i_rx_debug(fmt...)
> +#endif
> +
> +#ifdef __DEBUG_CXGB3I_TX__
> +#define cxgb3i_tx_debug		cxgb3i_log_debug
> +#else
> +#define cxgb3i_tx_debug(fmt...)
> +#endif
> +
> +#ifdef __DEBUG_CXGB3I_TAG__
> +#define cxgb3i_tag_debug	cxgb3i_log_debug
> +#else
> +#define cxgb3i_tag_debug(fmt...)
> +#endif
> +
> +#ifdef __DEBUG_CXGB3I_DDP__
> +#define cxgb3i_ddp_debug	cxgb3i_log_debug
> +#else
> +#define cxgb3i_ddp_debug(fmt...)
> +#endif
> +
> +static struct page *pad_page;
> +
> +#define ULP2_PGIDX_MAX		4
> +#define ULP2_4K_PAGE_SHIFT	12
> +#define ULP2_4K_PAGE_MASK	(~((1UL << ULP2_4K_PAGE_SHIFT) - 1))
> +static unsigned char ddp_page_order[ULP2_PGIDX_MAX];
> +static unsigned long ddp_page_size[ULP2_PGIDX_MAX];
> +static unsigned char ddp_page_shift[ULP2_PGIDX_MAX];
> +static unsigned char sw_tag_idx_bits;
> +static unsigned char sw_tag_age_bits;
> +
> +static void cxgb3i_ddp_page_init(void)
> +{
> +	int i;
> +	unsigned long n = PAGE_SIZE >> ULP2_4K_PAGE_SHIFT;
> +
> +	if (PAGE_SIZE & (~ULP2_4K_PAGE_MASK)) {
> +		cxgb3i_log_warn("PAGE_SIZE 0x%lx is not multiple of 4K, "
> +				"ddp disabled.\n", PAGE_SIZE);
> +		return;
> +	}
> +	n = __ilog2_u32(n);
> +	for (i = 0; i < ULP2_PGIDX_MAX; i++, n++) {
> +		ddp_page_order[i] = n;
> +		ddp_page_shift[i] = ULP2_4K_PAGE_SHIFT + n;
> +		ddp_page_size[i] = 1 << ddp_page_shift[i];
> +		cxgb3i_log_debug("%d, order %u, shift %u, size 0x%lx.\n", i,
> +				 ddp_page_order[i], ddp_page_shift[i],
> +				 ddp_page_size[i]);
> +	}
> +
> +	sw_tag_idx_bits = (__ilog2_u32(ISCSI_ITT_MASK)) + 1;
> +	sw_tag_age_bits = (__ilog2_u32(ISCSI_AGE_MASK)) + 1;
> +}
> +
> +static inline void ulp_mem_io_set_hdr(struct sk_buff *skb, unsigned int addr)
> +{
> +	struct ulp_mem_io *req = (struct ulp_mem_io *)skb->head;
> +	req->wr.wr_lo = 0;
> +	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
> +	req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(addr >> 5) |
> +				   V_ULPTX_CMD(ULP_MEM_WRITE));
> +	req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE >> 5) |
> +			 V_ULPTX_NFLITS((PPOD_SIZE >> 3) + 1));
> +}
> +
> +static int set_ddp_map(struct cxgb3i_adapter *snic, struct pagepod_hdr *hdr,
> +		       unsigned int idx, unsigned int npods,
> +		       struct scatterlist *sgl, unsigned int sgcnt)
> +{
> +	struct cxgb3i_ddp_info *ddp = &snic->ddp;
> +	struct scatterlist *sg = sgl;
> +	unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ddp->llimit;
> +	int i;
> +
> +	for (i = 0; i < npods; i++, pm_addr += PPOD_SIZE) {
> +		struct sk_buff *skb;
> +		struct pagepod *ppod;
> +		int j, k;
> +		skb =
> +		    alloc_skb(sizeof(struct ulp_mem_io) + PPOD_SIZE,
> +			      GFP_ATOMIC);
> +		if (!skb)
> +			return -ENOMEM;
> +		skb_put(skb, sizeof(struct ulp_mem_io) + PPOD_SIZE);
> +
> +		ulp_mem_io_set_hdr(skb, pm_addr);
> +		ppod =
> +		    (struct pagepod *)(skb->head + sizeof(struct ulp_mem_io));
> +		memcpy(&(ppod->hdr), hdr, sizeof(struct pagepod));
> +		for (j = 0, k = i * 4; j < 5; j++, k++) {
> +			if (k < sgcnt) {
> +				ppod->addr[j] = cpu_to_be64(sg_dma_address(sg));
> +				if (j < 4)
> +					sg = sg_next(sg);
> +			} else
> +				ppod->addr[j] = 0UL;
> +		}
> +
> +		skb->priority = CPL_PRIORITY_CONTROL;
> +		cxgb3_ofld_send(snic->tdev, skb);
> +	}
> +	return 0;
> +}
> +
> +static int clear_ddp_map(struct cxgb3i_adapter *snic, unsigned int idx,
> +			 unsigned int npods)
> +{
> +	struct cxgb3i_ddp_info *ddp = &snic->ddp;
> +	unsigned int pm_addr = (idx << PPOD_SIZE_SHIFT) + ddp->llimit;
> +	int i;
> +
> +	for (i = 0; i < npods; i++, pm_addr += PPOD_SIZE) {
> +		struct sk_buff *skb;
> +		skb =
> +		    alloc_skb(sizeof(struct ulp_mem_io) + PPOD_SIZE,
> +			      GFP_ATOMIC);
> +		if (!skb)
> +			return -ENOMEM;
> +		skb_put(skb, sizeof(struct ulp_mem_io) + PPOD_SIZE);
> +		memset((skb->head + sizeof(struct ulp_mem_io)), 0, PPOD_SIZE);
> +		ulp_mem_io_set_hdr(skb, pm_addr);
> +		skb->priority = CPL_PRIORITY_CONTROL;
> +		cxgb3_ofld_send(snic->tdev, skb);
> +	}
> +	return 0;
> +}
> +
> +static int cxgb3i_ddp_sgl_check(struct scatterlist *sgl, unsigned int sgcnt)
> +{
> +	struct scatterlist *sg;
> +	int i;
> +
> +	/* make sure the sgl is fit for ddp:
> +	 *      each has the same page size, and
> +	 *      first & last page do not need to be used completely, and
> +	 *      the rest of page must be used completely
> +	 */
> +	for_each_sg(sgl, sg, sgcnt, i) {
> +		if ((i && sg->offset) ||
> +		    ((i != sgcnt - 1) &&
> +		     (sg->length + sg->offset) != PAGE_SIZE))
> +			return -EINVAL;

I just want to see if I understood correctly?
If we fail here then it means the request will go through the
regular SW stack, with out HW offloading. But will not fail completely.

This is because in OSD we chose to append disjoint memory segments,
which works well with regular iscsi_tcp.

> +	}
> +
> +	return 0;
> +}
> +
> +static inline int ddp_find_unused_entries(struct cxgb3i_ddp_info *ddp,
> +					  int start, int max, int count)
> +{
> +	unsigned int i, j;
> +
> +	spin_lock(&ddp->map_lock);
> +	for (i = start; i <= max;) {
> +		for (j = 0; j < count; j++) {
> +			if (ddp->map[i + j])
> +				break;
> +		}
> +		if (j == count) {
> +			memset(&ddp->map[i], 1, count);
> +			spin_unlock(&ddp->map_lock);
> +			return i;
> +		}
> +		i += j + 1;
> +	}
> +	spin_unlock(&ddp->map_lock);
> +	return -EBUSY;
> +}
> +
> +static inline void ddp_unmark_entries(struct cxgb3i_ddp_info *ddp,
> +				      int start, int count)
> +{
> +	spin_lock(&ddp->map_lock);
> +	memset(&ddp->map[start], 0, count);
> +	spin_unlock(&ddp->map_lock);
> +}
> +
> +u32 cxgb3i_ddp_tag_reserve(struct cxgb3i_adapter *snic, unsigned int tid,
> +			   u32 sw_tag, unsigned int xferlen,
> +			   struct scatterlist *sgl, unsigned int sgcnt)
> +{
> +	struct cxgb3i_ddp_info *ddp = &snic->ddp;
> +	struct pagepod_hdr hdr;
> +	unsigned int npods;
> +	int idx = -1, idx_max;
> +	u32 tag;
> +	int err;
> +
> +	if (!ddp || !sgcnt || xferlen < PAGE_SIZE)
> +		return RESERVED_ITT;
> +
> +	err = cxgb3i_ddp_sgl_check(sgl, sgcnt);
> +	if (err < 0)
> +		return RESERVED_ITT;
> +
> +	npods = (sgcnt + PPOD_PAGES_MAX - 1) >> PPOD_PAGES_SHIFT;
> +	idx_max = ddp->nppods - npods + 1;
> +
> +	if (ddp->idx_last == ddp->nppods)
> +		idx = ddp_find_unused_entries(ddp, 0, idx_max, npods);
> +	else {
> +		idx = ddp_find_unused_entries(ddp, ddp->idx_last + 1, idx_max,
> +					      npods);
> +		if ((idx < 0) && (ddp->idx_last >= npods))
> +			idx = ddp_find_unused_entries(ddp, 0,
> +						      ddp->idx_last - npods + 1,
> +						      npods);
> +	}
> +	if (idx < 0)
> +		return RESERVED_ITT;
> +
> +	if (pci_map_sg(snic->pdev, sgl, sgcnt, PCI_DMA_FROMDEVICE) <= 0)
> +		goto unmark_entries;
> +
> +	tag = sw_tag | (idx << snic->tag_format.rsvd_shift);
> +
> +	hdr.rsvd = 0;
> +	hdr.vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
> +	hdr.pgsz_tag_clr = htonl(tag);
> +	hdr.maxoffset = htonl(xferlen);
> +	hdr.pgoffset = htonl(sgl->offset);
> +
> +	if (set_ddp_map(snic, &hdr, idx, npods, sgl, sgcnt) < 0)
> +		goto unmap_sgl;
> +
> +	ddp->idx_last = idx;
> +	cxgb3i_tag_debug("tid 0x%x, xfer %u, 0x%x -> ddp tag 0x%x (%u, %u).\n",
> +			 tid, xferlen, sw_tag, tag, idx, npods);
> +	return tag;
> +
> +unmap_sgl:
> +	pci_unmap_sg(snic->pdev, sgl, sgcnt, PCI_DMA_FROMDEVICE);
> +
> +unmark_entries:
> +	ddp_unmark_entries(ddp, idx, npods);
> +	return RESERVED_ITT;
> +}
> +
> +void cxgb3i_ddp_tag_release(struct cxgb3i_adapter *snic, u32 tag,
> +			    struct scatterlist *sgl, unsigned int sgcnt)
> +{
> +	u32 idx = (tag >> snic->tag_format.rsvd_shift) &
> +	    snic->tag_format.rsvd_mask;
> +	unsigned int npods = (sgcnt + PPOD_PAGES_MAX - 1) >> PPOD_PAGES_SHIFT;
> +
> +	if (idx < snic->tag_format.rsvd_mask) {
> +		cxgb3i_tag_debug("ddp tag 0x%x, release idx 0x%x, npods %u.\n",
> +				 tag, idx, npods);
> +		clear_ddp_map(snic, idx, npods);
> +		ddp_unmark_entries(&snic->ddp, idx, npods);
> +		pci_unmap_sg(snic->pdev, sgl, sgcnt, PCI_DMA_FROMDEVICE);
> +	}
> +}
> +
> +int cxgb3i_conn_ulp_setup(struct cxgb3i_conn *cconn, int hcrc, int dcrc)
> +{
> +	struct iscsi_tcp_conn *tcp_conn = cconn->conn->dd_data;
> +	struct s3_conn *c3cn = (struct s3_conn *)(tcp_conn->sock);
> +	struct sk_buff *skb = alloc_skb(sizeof(struct cpl_set_tcb_field),
> +					GFP_KERNEL | __GFP_NOFAIL);
> +	struct cpl_set_tcb_field *req;
> +	u32 submode = (hcrc ? 1 : 0) | (dcrc ? 2 : 0);
> +
> +	/* set up ulp submode and page size */
> +	req = (struct cpl_set_tcb_field *)skb_put(skb, sizeof(*req));
> +	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
> +	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, c3cn->tid));
> +	req->reply = V_NO_REPLY(1);
> +	req->cpu_idx = 0;
> +	req->word = htons(31);
> +	req->mask = cpu_to_be64(0xFF000000);
> +	/* the connection page size is always the same as ddp-pgsz0 */
> +	req->val = cpu_to_be64(submode << 24);
> +	skb->priority = CPL_PRIORITY_CONTROL;
> +
> +	cxgb3_ofld_send(c3cn->cdev, skb);
> +	return 0;
> +}
> +
> +static int cxgb3i_conn_read_pdu_skb(struct iscsi_conn *conn,
> +				    struct sk_buff *skb)
> +{
> +	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
> +	struct iscsi_segment *segment = &tcp_conn->in.segment;
> +	struct iscsi_hdr *hdr = (struct iscsi_hdr *)tcp_conn->in.hdr_buf;
> +	unsigned char *buf = (unsigned char *)hdr;
> +	unsigned int offset = sizeof(struct iscsi_hdr);
> +	int err;
> +
> +	cxgb3i_rx_debug("conn 0x%p, skb 0x%p, len %u, flag 0x%x.\n",
> +			conn, skb, skb->len, skb_ulp_mode(skb));
> +
> +	/* read bhs */
> +	err = skb_copy_bits(skb, 0, buf, sizeof(struct iscsi_hdr));
> +	if (err < 0)
> +		return err;
> +	segment->copied = sizeof(struct iscsi_hdr);
> +	/* read ahs */
> +	if (hdr->hlength) {
> +		unsigned int ahslen = hdr->hlength << 2;
> +		/* Make sure we don't overflow */
> +		if (sizeof(*hdr) + ahslen > sizeof(tcp_conn->in.hdr_buf))
> +			return -ISCSI_ERR_AHSLEN;
> +		err = skb_copy_bits(skb, offset, buf + offset, ahslen);
> +		if (err < 0)
> +			return err;
> +		offset += ahslen;
> +	}
> +	/* header digest */
> +	if (conn->hdrdgst_en)
> +		offset += ISCSI_DIGEST_SIZE;
> +
> +	/* check header digest */
> +	segment->status = (conn->hdrdgst_en &&
> +			   (skb_ulp_mode(skb) & ULP2_FLAG_HCRC_ERROR)) ?
> +	    ISCSI_SEGMENT_DGST_ERR : 0;
> +
> +	hdr->itt = ntohl(hdr->itt);
> +	segment->total_copied = segment->total_size;
> +	tcp_conn->in.hdr = hdr;
> +	err = iscsi_tcp_hdr_dissect(conn, hdr);
> +	if (err)
> +		return err;
> +
> +	if (tcp_conn->in.datalen) {
> +		segment = &tcp_conn->in.segment;
> +		segment->status = (conn->datadgst_en &&
> +				   (skb_ulp_mode(skb) & ULP2_FLAG_DCRC_ERROR)) ?
> +		    ISCSI_SEGMENT_DGST_ERR : 0;
> +		if (skb_ulp_mode(skb) & ULP2_FLAG_DATA_DDPED) {
> +			cxgb3i_ddp_debug("opcode 0x%x, data %u, ddp'ed.\n",
> +					 hdr->opcode & ISCSI_OPCODE_MASK,
> +					 tcp_conn->in.datalen);
> +			segment->total_copied = segment->total_size;
> +		} else {
> +			cxgb3i_ddp_debug("opcode 0x%x, data %u, not ddp'ed.\n",
> +					 hdr->opcode & ISCSI_OPCODE_MASK,
> +					 tcp_conn->in.datalen);
> +			offset += sizeof(struct cpl_iscsi_hdr_norss);
> +		}
> +		while (segment->total_copied < segment->total_size) {
> +			iscsi_tcp_segment_map(segment, 1);
> +			err = skb_copy_bits(skb, offset, segment->data,
> +					    segment->size);
> +			iscsi_tcp_segment_unmap(segment);
> +			if (err)
> +				return err;
> +			segment->total_copied += segment->size;
> +			offset += segment->size;
> +
> +			if (segment->total_copied < segment->total_size)
> +				iscsi_tcp_segment_init_sg(segment,
> +							  sg_next(segment->sg),
> +							  0);
> +		}
> +		err = segment->done(tcp_conn, segment);
> +	}
> +	return err;
> +}
> +
> +static inline void tx_skb_setmode(struct sk_buff *skb, int hcrc, int dcrc)
> +{
> +	u8 submode = 0;
> +	if (hcrc)
> +		submode |= 1;
> +	if (dcrc)
> +		submode |= 2;
> +	skb_ulp_mode(skb) = (ULP_MODE_ISCSI << 4) | submode;
> +}
> +
> +int cxgb3i_conn_ulp2_xmit(struct iscsi_conn *conn)
> +{
> +	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
> +	struct iscsi_segment *hdr_seg = &tcp_conn->out.segment;
> +	struct iscsi_segment *data_seg = &tcp_conn->out.data_segment;
> +	unsigned int hdrlen = hdr_seg->total_size;
> +	unsigned int datalen = data_seg->total_size;
> +	unsigned int padlen = iscsi_padding(datalen);
> +	unsigned int copymax = SKB_MAX_HEAD(TX_HEADER_LEN);
> +	unsigned int copylen;
> +	struct sk_buff *skb;
> +	unsigned char *dst;
> +	int err = -EAGAIN;
> +
> +	if (conn->suspend_tx)
> +		return 0;
> +
> +	if (data_seg->data && ((datalen + padlen) < copymax))
> +		copylen = hdrlen + datalen + padlen;
> +	else
> +		copylen = hdrlen;
> +
> +	/* supports max. 16K pdus, so one skb is enough to hold all the data */
> +	skb = alloc_skb(TX_HEADER_LEN + copylen, GFP_ATOMIC);
> +	if (!skb)
> +		return -EAGAIN;
> +
> +	skb_reserve(skb, TX_HEADER_LEN);
> +	skb_put(skb, copylen);
> +	dst = skb->data;
> +
> +	tx_skb_setmode(skb, conn->hdrdgst_en, datalen ? conn->datadgst_en : 0);
> +
> +	memcpy(dst, hdr_seg->data, hdrlen);
> +	dst += hdrlen;
> +
> +	if (!datalen)
> +		goto send_pdu;
> +
> +	if (data_seg->data) {
> +		/* data is in a linear buffer */
> +		if (copylen > hdrlen) {
> +			/* data fits in the skb's headroom */
> +			memcpy(dst, data_seg->data, datalen);
> +			dst += datalen;
> +			if (padlen)
> +				memset(dst, 0, padlen);
> +		} else {
> +			unsigned int offset = 0;
> +			while (datalen) {
> +				struct page *page =
> +				    alloc_pages(GFP_ATOMIC, 0);
> +				int idx = skb_shinfo(skb)->nr_frags;
> +				skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
> +
> +				if (!page)
> +					goto free_skb;
> +
> +				frag->page = page;
> +				frag->page_offset = 0;
> +				if (datalen > PAGE_SIZE)
> +					frag->size = PAGE_SIZE;
> +				else
> +					frag->size = datalen;
> +				memcpy(page_address(page),
> +				       data_seg->data + offset, frag->size);
> +
> +				skb_shinfo(skb)->nr_frags++;
> +				datalen -= frag->size;
> +				offset += frag->size;
> +			}
> +		}
> +	} else {
> +		struct scatterlist *sg = data_seg->sg;
> +		unsigned int offset = data_seg->sg_offset;
> +		while (datalen) {
> +			int idx = skb_shinfo(skb)->nr_frags;
> +			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
> +			struct page *pg = sg_page(sg);
> +
> +			get_page(pg);
> +			frag->page = pg;
> +			frag->page_offset = offset + sg->offset;
> +			frag->size = min(sg->length, datalen);
> +
> +			offset = 0;
> +			skb_shinfo(skb)->nr_frags++;
> +			datalen -= frag->size;
> +			sg = sg_next(sg);
> +		}
> +	}
> +
> +	if (skb_shinfo(skb)->nr_frags) {
> +		if (padlen) {
> +			int idx = skb_shinfo(skb)->nr_frags;
> +			skb_frag_t *frag = &skb_shinfo(skb)->frags[idx];
> +			frag->page = pad_page;
> +			frag->page_offset = 0;
> +			frag->size = padlen;
> +			skb_shinfo(skb)->nr_frags++;
> +		}
> +		datalen = data_seg->total_size + padlen;
> +		skb->data_len += datalen;
> +		skb->truesize += datalen;
> +		skb->len += datalen;
> +	}
> +
> +send_pdu:
> +	err = cxgb3i_c3cn_send_pdus((struct s3_conn *)tcp_conn->sock,
> +				    skb, MSG_DONTWAIT | MSG_NOSIGNAL);
> +	if (err > 0) {
> +		int pdulen = hdrlen + datalen + padlen;
> +		if (conn->hdrdgst_en)
> +			pdulen += ISCSI_DIGEST_SIZE;
> +		if (datalen && conn->datadgst_en)
> +			pdulen += ISCSI_DIGEST_SIZE;
> +
> +		hdr_seg->total_copied = hdr_seg->total_size;
> +		if (datalen)
> +			data_seg->total_copied = data_seg->total_size;
> +		conn->txdata_octets += pdulen;
> +		return pdulen;
> +	}
> +
> +free_skb:
> +	kfree_skb(skb);
> +	if (err < 0 && err != -EAGAIN) {
> +		cxgb3i_log_error("conn 0x%p, xmit err %d.\n", conn, err);
> +		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
> +		return err;
> +	}
> +	return -EAGAIN;
> +}
> +
> +int cxgb3i_ulp2_init(void)
> +{
> +	pad_page = alloc_page(GFP_KERNEL);
> +	if (!pad_page)
> +		return -ENOMEM;
> +	memset(page_address(pad_page), 0, PAGE_SIZE);
> +	cxgb3i_ddp_page_init();
> +	return 0;
> +}
> +
> +void cxgb3i_ulp2_cleanup(void)
> +{
> +	if (pad_page) {
> +		__free_page(pad_page);
> +		pad_page = NULL;
> +	}
> +}
> +
> +void cxgb3i_conn_pdu_ready(struct s3_conn *c3cn)
> +{
> +	struct sk_buff *skb;
> +	unsigned int read = 0;
> +	struct iscsi_conn *conn = c3cn->user_data;
> +	int err = 0;
> +
> +	cxgb3i_rx_debug("cn 0x%p.\n", c3cn);
> +
> +	read_lock(&c3cn->callback_lock);
> +	if (unlikely(!conn || conn->suspend_rx)) {
> +		cxgb3i_rx_debug("conn 0x%p, id %d, suspend_rx %d!\n", 
> +				conn, conn ? conn->id : 0xFF,
> +				conn ? conn->suspend_rx : 0xFF);
> +		read_unlock(&c3cn->callback_lock);
> +		return;
> +	}
> +	skb = skb_peek(&c3cn->receive_queue);
> +	while (!err && skb) {
> +		__skb_unlink(skb, &c3cn->receive_queue);
> +		read += skb_ulp_pdulen(skb);
> +		err = cxgb3i_conn_read_pdu_skb(conn, skb);
> +		__kfree_skb(skb);
> +		skb = skb_peek(&c3cn->receive_queue);
> +	}
> +	read_unlock(&c3cn->callback_lock);
> +	if (c3cn) {
> +		c3cn->copied_seq += read;
> +		cxgb3i_c3cn_rx_credits(c3cn, read);
> +	}
> +	conn->rxdata_octets += read;
> +
> +	if (err) {
> +		cxgb3i_log_info("conn 0x%p rx failed err %d.\n", conn, err);
> +		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
> +	}
> +}
> +
> +void cxgb3i_conn_tx_open(struct s3_conn *c3cn)
> +{
> +	struct iscsi_conn *conn = (struct iscsi_conn *)c3cn->user_data;
> +	struct iscsi_tcp_conn *tcp_conn;
> +	cxgb3i_tx_debug("cn 0x%p.\n", c3cn);
> +	if (conn) {
> +		cxgb3i_tx_debug("cn 0x%p, cid %d.\n", c3cn, conn->id);
> +		tcp_conn = conn->dd_data;
> +		scsi_queue_work(conn->session->host, &conn->xmitwork);
> +	}
> +}
> +
> +void cxgb3i_conn_closing(struct s3_conn *c3cn)
> +{
> +	struct iscsi_conn *conn;
> +	read_lock(&c3cn->callback_lock);
> +	conn = (struct iscsi_conn *)c3cn->user_data;
> +	if (conn && c3cn->state != C3CN_STATE_ESTABLISHED)
> +		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
> +	read_unlock(&c3cn->callback_lock);
> +}
> +
> +int cxgb3i_adapter_ulp_init(struct cxgb3i_adapter *snic)
> +{
> +	struct t3cdev *tdev = snic->tdev;
> +	struct cxgb3i_ddp_info *ddp = &snic->ddp;
> +	struct ulp_iscsi_info uinfo;
> +	unsigned int ppmax, bits, max_bits;
> +	int i, err;
> +
> +	spin_lock_init(&ddp->map_lock);
> +
> +	err = tdev->ctl(tdev, ULP_ISCSI_GET_PARAMS, &uinfo);
> +	if (err < 0) {
> +		cxgb3i_log_error("%s, failed to get iscsi param err=%d.\n",
> +				 tdev->name, err);
> +		return err;
> +	}
> +
> +	ppmax = (uinfo.ulimit - uinfo.llimit + 1) >> PPOD_SIZE_SHIFT;
> +	max_bits = min(PPOD_IDX_MAX_SIZE,
> +		       (32 - sw_tag_idx_bits - sw_tag_age_bits));
> +	bits = __ilog2_u32(ppmax) + 1;
> +	if (bits > max_bits)
> +		bits = max_bits;
> +	ppmax = (1 << bits) - 1;
> +
> +	snic->tx_max_size = uinfo.max_txsz;
> +	snic->rx_max_size = uinfo.max_rxsz;
> +	cxgb3i_log_debug("snic tx %u, rx %u.\n", snic->tx_max_size,
> +			 snic->rx_max_size);
> +	snic->tag_format.idx_bits = sw_tag_idx_bits;
> +	snic->tag_format.age_bits = sw_tag_age_bits;
> +	snic->tag_format.rsvd_bits = bits;
> +	snic->tag_format.rsvd_shift = PPOD_IDX_SHIFT;
> +	snic->tag_format.rsvd_mask = (1 << snic->tag_format.rsvd_bits) - 1;
> +
> +	cxgb3i_log_debug("snic nppods %u, rsvd shift %u, bits %u, mask 0x%x.\n",
> +			 ppmax, snic->tag_format.rsvd_shift,
> +			 snic->tag_format.rsvd_bits,
> +			 snic->tag_format.rsvd_mask);
> +
> +	ddp->map = cxgb3i_alloc_big_mem(ppmax);
> +	if (!ddp->map) {
> +		cxgb3i_log_warn("snic unable to alloc ddp ppod 0x%u, "
> +				"ddp disabled.\n", ppmax);
> +		return 0;
> +	}
> +	ddp->llimit = uinfo.llimit;
> +	ddp->ulimit = uinfo.ulimit;
> +
> +	uinfo.tagmask =
> +	    snic->tag_format.rsvd_mask << snic->tag_format.rsvd_shift;
> +	for (i = 0; i < ULP2_PGIDX_MAX; i++)
> +		uinfo.pgsz_factor[i] = ddp_page_order[i];
> +
> +	err = tdev->ctl(tdev, ULP_ISCSI_SET_PARAMS, &uinfo);
> +	if (err < 0) {
> +		cxgb3i_log_warn("snic unable to set iscsi param err=%d, "
> +				"ddp disabled.\n", err);
> +		goto free_ppod_map;
> +	}
> +
> +	ddp->nppods = ppmax;
> +	ddp->idx_last = ppmax;
> +
> +	tdev->ulp_iscsi = ddp;
> +
> +	return 0;
> +
> +free_ppod_map:
> +	cxgb3i_free_big_mem(ddp->map);
> +	return 0;
> +}
> +
> +void cxgb3i_adapter_ulp_cleanup(struct cxgb3i_adapter *snic)
> +{
> +	u8 *map = snic->ddp.map;
> +	if (map) {
> +		snic->tdev->ulp_iscsi = NULL;
> +		spin_lock(&snic->lock);
> +		snic->ddp.map = NULL;
> +		spin_unlock(&snic->lock);
> +		cxgb3i_free_big_mem(map);
> +	}
> +}
> diff --git a/drivers/scsi/cxgb3i/cxgb3i_ulp2.h b/drivers/scsi/cxgb3i/cxgb3i_ulp2.h
> new file mode 100644
> index 0000000..e3f46dc
> --- /dev/null
> +++ b/drivers/scsi/cxgb3i/cxgb3i_ulp2.h
> @@ -0,0 +1,106 @@
> +/*
> + * cxgb3i_ulp2.h: Chelsio S3xx iSCSI driver.
> + *
> + * Copyright (c) 2008 Chelsio Communications, Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation.
> + *
> + * Written by: Karen Xie (kxie@chelsio.com)
> + */
> +
> +#ifndef __CXGB3I_ULP2_H__
> +#define __CXGB3I_ULP2_H__
> +
> +#define ULP2_PDU_PAYLOAD_DFLT	(16224 - ISCSI_PDU_HEADER_MAX)
> +#define PPOD_PAGES_MAX		4
> +#define PPOD_PAGES_SHIFT	2	/* 4 pages per pod */
> +
> +struct pagepod_hdr {
> +	u32 vld_tid;
> +	u32 pgsz_tag_clr;
> +	u32 maxoffset;
> +	u32 pgoffset;
> +	u64 rsvd;
> +};
> +
> +struct pagepod {
> +	struct pagepod_hdr hdr;
> +	u64 addr[PPOD_PAGES_MAX + 1];
> +};
> +
> +#define PPOD_SIZE		sizeof(struct pagepod)	/* 64 */
> +#define PPOD_SIZE_SHIFT		6
> +
> +#define PPOD_COLOR_SHIFT	0
> +#define PPOD_COLOR_SIZE		6
> +#define PPOD_COLOR_MASK		((1 << PPOD_COLOR_SIZE) - 1)
> +
> +#define PPOD_IDX_SHIFT		PPOD_COLOR_SIZE
> +#define PPOD_IDX_MAX_SIZE	24
> +
> +#define S_PPOD_TID    0
> +#define M_PPOD_TID    0xFFFFFF
> +#define V_PPOD_TID(x) ((x) << S_PPOD_TID)
> +
> +#define S_PPOD_VALID    24
> +#define V_PPOD_VALID(x) ((x) << S_PPOD_VALID)
> +#define F_PPOD_VALID    V_PPOD_VALID(1U)
> +
> +#define S_PPOD_COLOR    0
> +#define M_PPOD_COLOR    0x3F
> +#define V_PPOD_COLOR(x) ((x) << S_PPOD_COLOR)
> +
> +#define S_PPOD_TAG    6
> +#define M_PPOD_TAG    0xFFFFFF
> +#define V_PPOD_TAG(x) ((x) << S_PPOD_TAG)
> +
> +#define S_PPOD_PGSZ    30
> +#define M_PPOD_PGSZ    0x3
> +#define V_PPOD_PGSZ(x) ((x) << S_PPOD_PGSZ)
> +
> +struct cpl_iscsi_hdr_norss {
> +	union opcode_tid ot;
> +	u16 pdu_len_ddp;
> +	u16 len;
> +	u32 seq;
> +	u16 urg;
> +	u8 rsvd;
> +	u8 status;
> +};
> +
> +struct cpl_rx_data_ddp_norss {
> +	union opcode_tid ot;
> +	u16 urg;
> +	u16 len;
> +	u32 seq;
> +	u32 nxt_seq;
> +	u32 ulp_crc;
> +	u32 ddp_status;
> +};
> +
> +#define RX_DDP_STATUS_IPP_SHIFT		27	/* invalid pagepod */
> +#define RX_DDP_STATUS_TID_SHIFT		26	/* tid mismatch */
> +#define RX_DDP_STATUS_COLOR_SHIFT	25	/* color mismatch */
> +#define RX_DDP_STATUS_OFFSET_SHIFT	24	/* offset mismatch */
> +#define RX_DDP_STATUS_ULIMIT_SHIFT	23	/* ulimit error */
> +#define RX_DDP_STATUS_TAG_SHIFT		22	/* tag mismatch */
> +#define RX_DDP_STATUS_DCRC_SHIFT	21	/* dcrc error */
> +#define RX_DDP_STATUS_HCRC_SHIFT	20	/* hcrc error */
> +#define RX_DDP_STATUS_PAD_SHIFT		19	/* pad error */
> +#define RX_DDP_STATUS_PPP_SHIFT		18	/* pagepod parity error */
> +#define RX_DDP_STATUS_LLIMIT_SHIFT	17	/* llimit error */
> +#define RX_DDP_STATUS_DDP_SHIFT		16	/* ddp'able */
> +#define RX_DDP_STATUS_PMM_SHIFT		15	/* pagepod mismatch */
> +
> +#define ULP2_FLAG_DATA_READY		0x1
> +#define ULP2_FLAG_DATA_DDPED		0x2
> +#define ULP2_FLAG_HCRC_ERROR		0x10
> +#define ULP2_FLAG_DCRC_ERROR		0x20
> +#define ULP2_FLAG_PAD_ERROR		0x40
> +
> +void cxgb3i_conn_closing(struct s3_conn *);
> +void cxgb3i_conn_pdu_ready(struct s3_conn *c3cn);
> +void cxgb3i_conn_tx_open(struct s3_conn *c3cn);
> +#endif
> 

Is the HW bidi ready in theory. I mean, does the firmware
do it's own iscsi parsing and will confuse on iscsi-bidi commands?

Will you, please, be willing to test an OSD stack on trough this card
to test if BIDI and VARLEN works.
What you will need is to set up an OSD target on one machine and set up
my OSD Initiator on this machine. (I'll send you all the instructions)

Or alternatively you could send me a card and some documentation and I
can set it up here and do the testing and debugging.

And one last personal request. Please don't send the RFC as one big patch. 
Send it as small reviewable patches, just for the practical reason of when
editing/reading the reply, it is too long and hard to find.

Thanks
Boaz

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-07-31  1:24   ` Karen Xie
@ 2008-07-31 12:45     ` Boaz Harrosh
  0 siblings, 0 replies; 71+ messages in thread
From: Boaz Harrosh @ 2008-07-31 12:45 UTC (permalink / raw)
  To: open-iscsi
  Cc: Jeff Garzik, netdev, davem, michaelc, Steve Wise, rdreier, daisyc,
	wenxiong, bhua, Divy Le Ray, Dimitrios Michailidis, Casey Leedom,
	linux-scsi, LKML

Karen Xie wrote:
>> Comments:
>>
>> * SCSI drivers should be submitted via the linux-scsi@vger.kernel.org 
>> mailing list.
> 
> Will do that. Thanks.
> 
>> * The driver is clean and readable, well done
>>
>> * From a networking standpoint, our main concern becomes how this 
>> interacts with the networking stack.  In particular, I'm concerned
> based 
>> on reading the source that this driver uses "TCP port stealing" rather 
>> than using a totally separate MAC address (and IP).
>>
>> Stealing a TCP port on an IP/interface already assigned is a common 
>> solution in this space, but also a flawed one.  Precisely because the 
>> kernel and applications are unaware of this "special, magic TCP port" 
>> you open the potential for application problems that are very difficult
> 
>> for an admin to diagnose based on observed behavior.
> 
> The collisions between the host stack and iSCSI offload are unlikely
> because the iSCSI target server's port is unique (nailed down as 3260).
> If an offload card is plugged in, all iSCSI connections to a given
> target (i.e., destination/port) are offloaded. There is precedence for
> this approach such as RDMA/iWarp.
> 

Please note that all SW iscsi targets I know, let you change the default
3260 port to whatever you want. Is that supported?

Jeff is there a way for the user-mode demon to reserve the port beforehand
so it will appear to be taken.

>> So, additional information on your TCP port usage would be greatly 
>> appreciated.  Also, how does this interact with IPv6?  Clearly it 
>> interacts with IPv4...
> 
> Currently, IPv6 connection request will not be honored, I will make sure
> the checking is added in the resubmission.
> 
> 
Boaz


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-07-30 21:35   ` Roland Dreier
@ 2008-08-01  0:51     ` Divy Le Ray
  2008-08-07 18:45       ` Divy Le Ray
  0 siblings, 1 reply; 71+ messages in thread
From: Divy Le Ray @ 2008-08-01  0:51 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Jeff Garzik, Karen Xie, netdev, open-iscsi, davem, michaelc,
	Steve Wise, daisyc, wenxiong, bhua, Dimitrios Michailidis,
	Casey Leedom, linux-scsi, LKML

On Wednesday 30 July 2008 02:35:51 pm Roland Dreier wrote:
>  > * From a networking standpoint, our main concern becomes how this
>  > interacts with the networking stack.  In particular, I'm concerned
>  > based on reading the source that this driver uses "TCP port stealing"
>  > rather than using a totally separate MAC address (and IP).
>  >
>  > Stealing a TCP port on an IP/interface already assigned is a common
>  > solution in this space, but also a flawed one.  Precisely because the
>  > kernel and applications are unaware of this "special, magic TCP port"
>  > you open the potential for application problems that are very
>  > difficult for an admin to diagnose based on observed behavior.
>
> That's true, but using a separate MAC and IP opens up a bunch of other
> operational problems.  I don't think the right answer for iSCSI offload
> is clear yet.
>
>  - R.

Hi Jeff,

We've considered the approach of having a separate IP/MAC addresses to manage
iSCSI connections. In such a context, the stack would have to be unaware of
this iSCSI specific IP address. The iSCSI driver would then have to implement
at least its own ARP reply mechanism. DHCP too would have to be managed
separately. Most network setting/monitoring tools would also be unavailable.

The open-iscsi initiator is not a huge consumer of TCP connections, allocating
a TCP port from the stack would be reasonable in terms of resources in this
context. It is however unclear if it is an acceptable approach.

Our current implementation was designed to be the most tolerable one
within the constraints - real or expected - aforementioned.

Cheers,
Divy

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-01  0:51     ` Divy Le Ray
@ 2008-08-07 18:45       ` Divy Le Ray
  2008-08-07 20:07         ` Mike Christie
  2008-08-08 18:09         ` Steve Wise
  0 siblings, 2 replies; 71+ messages in thread
From: Divy Le Ray @ 2008-08-07 18:45 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Jeff Garzik, Karen Xie, netdev, open-iscsi, davem, michaelc,
	Steve Wise, daisyc, wenxiong, bhua, Dimitrios Michailidis,
	Casey Leedom, linux-scsi, LKML

On Thursday 31 July 2008 05:51:59 pm Divy Le Ray wrote:
> On Wednesday 30 July 2008 02:35:51 pm Roland Dreier wrote:
> >  > * From a networking standpoint, our main concern becomes how this
> >  > interacts with the networking stack.  In particular, I'm concerned
> >  > based on reading the source that this driver uses "TCP port stealing"
> >  > rather than using a totally separate MAC address (and IP).
> >  >
> >  > Stealing a TCP port on an IP/interface already assigned is a common
> >  > solution in this space, but also a flawed one.  Precisely because the
> >  > kernel and applications are unaware of this "special, magic TCP port"
> >  > you open the potential for application problems that are very
> >  > difficult for an admin to diagnose based on observed behavior.
> >
> > That's true, but using a separate MAC and IP opens up a bunch of other
> > operational problems.  I don't think the right answer for iSCSI offload
> > is clear yet.
> >
> >  - R.
>
> Hi Jeff,
>
> We've considered the approach of having a separate IP/MAC addresses to
> manage iSCSI connections. In such a context, the stack would have to be
> unaware of this iSCSI specific IP address. The iSCSI driver would then have
> to implement at least its own ARP reply mechanism. DHCP too would have to
> be managed separately. Most network setting/monitoring tools would also be
> unavailable.
>
> The open-iscsi initiator is not a huge consumer of TCP connections,
> allocating a TCP port from the stack would be reasonable in terms of
> resources in this context. It is however unclear if it is an acceptable
> approach.
>
> Our current implementation was designed to be the most tolerable one
> within the constraints - real or expected - aforementioned.
>

Hi Jeff,

Mike Christie will not merge this code until he has an explicit 
acknowledgement from netdev.

As you mentioned, the port stealing approach we've taken has its issues.
We consequently analyzed your suggestion to use a different IP/MAC address for 
iSCSI and it raises other tough issues (separate ARP and DHCP management, 
unavailability of common networking tools).
On these grounds, we believe our current approach is the most tolerable.
Would the stack provide a TCP port allocation service, we'd be glad to use it 
to solve the current concerns.
The cxgb3i driver is up and running here, its merge is pending our decision.

Cheers,
Divy

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-07 18:45       ` Divy Le Ray
@ 2008-08-07 20:07         ` Mike Christie
  2008-08-08 18:09         ` Steve Wise
  1 sibling, 0 replies; 71+ messages in thread
From: Mike Christie @ 2008-08-07 20:07 UTC (permalink / raw)
  To: Divy Le Ray
  Cc: Roland Dreier, Jeff Garzik, Karen Xie, netdev, open-iscsi, davem,
	Steve Wise, daisyc, wenxiong, bhua, Dimitrios Michailidis,
	Casey Leedom, linux-scsi, LKML

Divy Le Ray wrote:
> On Thursday 31 July 2008 05:51:59 pm Divy Le Ray wrote:
>> On Wednesday 30 July 2008 02:35:51 pm Roland Dreier wrote:
>>>  > * From a networking standpoint, our main concern becomes how this
>>>  > interacts with the networking stack.  In particular, I'm concerned
>>>  > based on reading the source that this driver uses "TCP port stealing"
>>>  > rather than using a totally separate MAC address (and IP).
>>>  >
>>>  > Stealing a TCP port on an IP/interface already assigned is a common
>>>  > solution in this space, but also a flawed one.  Precisely because the
>>>  > kernel and applications are unaware of this "special, magic TCP port"
>>>  > you open the potential for application problems that are very
>>>  > difficult for an admin to diagnose based on observed behavior.
>>>
>>> That's true, but using a separate MAC and IP opens up a bunch of other
>>> operational problems.  I don't think the right answer for iSCSI offload
>>> is clear yet.
>>>
>>>  - R.
>> Hi Jeff,
>>
>> We've considered the approach of having a separate IP/MAC addresses to
>> manage iSCSI connections. In such a context, the stack would have to be
>> unaware of this iSCSI specific IP address. The iSCSI driver would then have
>> to implement at least its own ARP reply mechanism. DHCP too would have to
>> be managed separately. Most network setting/monitoring tools would also be
>> unavailable.
>>
>> The open-iscsi initiator is not a huge consumer of TCP connections,
>> allocating a TCP port from the stack would be reasonable in terms of
>> resources in this context. It is however unclear if it is an acceptable
>> approach.
>>
>> Our current implementation was designed to be the most tolerable one
>> within the constraints - real or expected - aforementioned.
>>
> 
> Hi Jeff,
> 
> Mike Christie will not merge this code until he has an explicit 
> acknowledgement from netdev.
> 
> As you mentioned, the port stealing approach we've taken has its issues.
> We consequently analyzed your suggestion to use a different IP/MAC address for 
> iSCSI and it raises other tough issues (separate ARP and DHCP management, 
> unavailability of common networking tools).

If the iscsi tools could not have to deal with networking issues that 
are already handled by other networking tools it would great for the 
iscsi users so they do not have to learn new tools. Maybe we could 
somehow hook into the existing network tools so they support these iscsi 
hbas as well as normal NICs. Would it be possible to have the iscsi hbas 
export the necessary network interfaces so that existing network tools 
can manage them?

If it comes down to it and your port stealing implementation is not 
acceptable like how broadcom's was not, I will be ok with doing some 
special iscsi network tools. Or instead of special iscsi tools, is there 
something that the RDMA/iWarp guys are using that we can share?

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-07 18:45       ` Divy Le Ray
  2008-08-07 20:07         ` Mike Christie
@ 2008-08-08 18:09         ` Steve Wise
  2008-08-08 22:15           ` Jeff Garzik
  1 sibling, 1 reply; 71+ messages in thread
From: Steve Wise @ 2008-08-08 18:09 UTC (permalink / raw)
  To: Divy Le Ray, Jeff Garzik, davem
  Cc: Roland Dreier, Karen Xie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, Dimitrios Michailidis, Casey Leedom, linux-scsi,
	LKML

> Hi Jeff,
>
> Mike Christie will not merge this code until he has an explicit 
> acknowledgement from netdev.
>
> As you mentioned, the port stealing approach we've taken has its issues.
> We consequently analyzed your suggestion to use a different IP/MAC address for 
> iSCSI and it raises other tough issues (separate ARP and DHCP management, 
> unavailability of common networking tools).
> On these grounds, we believe our current approach is the most tolerable.
> Would the stack provide a TCP port allocation service, we'd be glad to use it 
> to solve the current concerns.
> The cxgb3i driver is up and running here, its merge is pending our decision.
>
> Cheers,
> Divy
>   
Hey Dave/Jeff,

I think we need some guidance here on how to proceed.   Is the approach 
currently being reviewed ACKable?  Or is it DOA? If its DOA, then what 
approach do you recommend?  I believe Jeff's opinion is a separate 
ipaddr.  But Dave, what do you think?  Lets get some agreement on a high 
level design here. 

Possible solutions seen to date include:

1) reserving a socket to allocate the port.  This has been NAK'd in the 
past and I assume is still a no go.

2) creating a 4-tuple allocation service so the host stack, the rdma 
stack, and the iscsi stack can share the same TCP 4-tuple space.  This 
also has been NAK'd in the past and I assume is still a no go.

3) the iscsi device allocates its own local ephemeral posts (port 
stealing) and use the host's ip address for the iscsi offload device.  
This is the current proposal and you can review the thread for the pros 
and cons.  IMO it is the least objectionable (and I think we really 
should be doing #2).

4) the iscsi device will manage its own ip address thus ensuring 4-tuple 
uniqueness.

Unless you all want to re-open considering #1 or #2, then we're left 
with 3 or 4.  Which one?

Steve.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-08 18:09         ` Steve Wise
@ 2008-08-08 22:15           ` Jeff Garzik
  2008-08-08 22:20             ` Jeff Garzik
                               ` (2 more replies)
  0 siblings, 3 replies; 71+ messages in thread
From: Jeff Garzik @ 2008-08-08 22:15 UTC (permalink / raw)
  To: Steve Wise, davem
  Cc: Divy Le Ray, Roland Dreier, Karen Xie, netdev, open-iscsi,
	michaelc, daisyc, wenxiong, bhua, Dimitrios Michailidis,
	Casey Leedom, linux-scsi, LKML, Mike Christie

Steve Wise wrote:
> 
>> Hi Jeff,
>>
>> Mike Christie will not merge this code until he has an explicit 
>> acknowledgement from netdev.
>>
>> As you mentioned, the port stealing approach we've taken has its issues.
>> We consequently analyzed your suggestion to use a different IP/MAC 
>> address for iSCSI and it raises other tough issues (separate ARP and 
>> DHCP management, unavailability of common networking tools).
>> On these grounds, we believe our current approach is the most tolerable.
>> Would the stack provide a TCP port allocation service, we'd be glad to 
>> use it to solve the current concerns.
>> The cxgb3i driver is up and running here, its merge is pending our 
>> decision.
>>
>> Cheers,
>> Divy
>>   
> Hey Dave/Jeff,
> 
> I think we need some guidance here on how to proceed.   Is the approach 
> currently being reviewed ACKable?  Or is it DOA? If its DOA, then what 
> approach do you recommend?  I believe Jeff's opinion is a separate 
> ipaddr.  But Dave, what do you think?  Lets get some agreement on a high 
> level design here.
> Possible solutions seen to date include:
> 
> 1) reserving a socket to allocate the port.  This has been NAK'd in the 
> past and I assume is still a no go.
> 
> 2) creating a 4-tuple allocation service so the host stack, the rdma 
> stack, and the iscsi stack can share the same TCP 4-tuple space.  This 
> also has been NAK'd in the past and I assume is still a no go.
> 
> 3) the iscsi device allocates its own local ephemeral posts (port 
> stealing) and use the host's ip address for the iscsi offload device.  
> This is the current proposal and you can review the thread for the pros 
> and cons.  IMO it is the least objectionable (and I think we really 
> should be doing #2).
> 
> 4) the iscsi device will manage its own ip address thus ensuring 4-tuple 
> uniqueness.

Conceptually, it is a nasty business for the OS kernel to be forced to 
co-manage an IP address in conjunction with a remote, independent entity.

Hardware designers make the mistake of assuming that firmware management 
of a TCP port ("port stealing") successfully provides the illusion to 
the OS that that port is simply inactive, and the OS happily continues 
internetworking its merry way through life.

This is certainly not true, because of current netfilter and userland 
application behavior, which often depends on being able to allocate 
(bind) to random TCP ports.  Allocating a TCP port successfully within 
the OS, that then behaves different from all other TCP ports (because it 
is the magic iSCSI port) creates a cascading functional disconnect.  On 
that magic iSCSI port, strange errors will be returned instead of proper 
behavior.  Which, in turn, cascades through new (and inevitably 
under-utilized) error handling paths in the app.

So, of course, one must work around problems like this, which leads to 
one of two broad choices:

1) implement co-management (sharing) of IP address/port space, between 
the OS kernel and a remote entity.

2) come up with a solution in hardware that does not require the OS to 
co-manage the data it has so far been managing exclusively in software.

It should be obvious that we prefer path #2.

For, trudging down path #1 means

* one must give the user the ability to manage shared IP addresses IN A 
NON-HARDWARE-SPECIFIC manner.  Currently most vendors of "TCP port 
stealing" solutions seem to expect each user to learn a vendor-specific 
method of identifying and managing the "magic port".

Excuse my language, but, what a fucking security and management 
nightmare in a cross-vendor environment.  It is already a pain, with 
some [unnamed system/chipset vendors] management stealing TCP ports -- 
and admins only discover this fact when applications behave strangely on 
new hardware.

But...  its tough to notice because stumbling upon the magic TCP port 
won't happen often unless the server is heavily loaded.  Thus you have a 
security/application problem once in a blue moon, due to this magic TCP 
port mentioned in some obscure online documentation nobody has read.

* however, giving the user the ability to co-manage IP addresses means 
hacking up the kernel TCP code and userland tools for this new concept, 
something that I think DaveM would rightly be a bit reluctant to do? 
You are essentially adding a bunch of special case code whenever TCP 
ports are used:

	if (port in list of "magic" TCP ports with special,
	    hardware-specific behavior)
		...
	else
		do what we've been doing for decades

ISTR Roland(?) pointing out code that already does a bit of this in the 
IB space...  but the point is

Finally, this shared IP address/port co-management thing has several 
problems listed on the TOE page: http://www.linuxfoundation.org/en/Net:TOE

such as,

* security updates for TCP problems mean that a single IP address can be 
PARTIALLY SECURE, because security updates for kernel TCP stack and 
h/w's firmware are inevitably updated separately (even if distributed 
and compiled together).  Yay, we are introducing a wonderful new 
security problem here.

* from a security, network scanner and packet classifier point of view, 
a single IP address no longer behaves like Linux.  It behaves like 
Linux... sometime.  Depending on whether it is a magic TCP port or not.

Talk about security audit hell.

This should be plenty, so I'm stopping now.  But looking down the TOE 
wiki page I could easily come up with more reasons why "IP address 
remote co-management" is more complicated and costly than you think.

	Jeff

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-08 22:15           ` Jeff Garzik
@ 2008-08-08 22:20             ` Jeff Garzik
  2008-08-09  7:28             ` David Miller
  2008-08-10  5:12             ` Roland Dreier
  2 siblings, 0 replies; 71+ messages in thread
From: Jeff Garzik @ 2008-08-08 22:20 UTC (permalink / raw)
  To: Steve Wise, davem
  Cc: Divy Le Ray, Roland Dreier, Karen Xie, netdev, open-iscsi,
	michaelc, daisyc, wenxiong, bhua, Dimitrios Michailidis,
	Casey Leedom, linux-scsi, LKML

Jeff Garzik wrote:
> * however, giving the user the ability to co-manage IP addresses means 
> hacking up the kernel TCP code and userland tools for this new concept, 
> something that I think DaveM would rightly be a bit reluctant to do? You 
> are essentially adding a bunch of special case code whenever TCP ports 
> are used:
> 
>     if (port in list of "magic" TCP ports with special,
>         hardware-specific behavior)
>         ...
>     else
>         do what we've been doing for decades
> 
> ISTR Roland(?) pointing out code that already does a bit of this in the 
> IB space...  but the point is

grrr.   but the point is that the solution is not at all complete, with 
feature disconnects and security audit differences still outsanding, and 
non-hw-specific management apps still unwritten.

(I'm not calling for their existence, merely saying trying to strike the 
justification that current capability to limp along exists)

	Jeff



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-08 22:15           ` Jeff Garzik
  2008-08-08 22:20             ` Jeff Garzik
@ 2008-08-09  7:28             ` David Miller
  2008-08-09 14:04               ` Steve Wise
  2008-08-10  5:14               ` Roland Dreier
  2008-08-10  5:12             ` Roland Dreier
  2 siblings, 2 replies; 71+ messages in thread
From: David Miller @ 2008-08-09  7:28 UTC (permalink / raw)
  To: jgarzik
  Cc: swise, divy, rdreier, kxie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

From: Jeff Garzik <jgarzik@pobox.com>
Date: Fri, 08 Aug 2008 18:15:41 -0400

> * security updates for TCP problems mean that a single IP address can be 
> PARTIALLY SECURE, because security updates for kernel TCP stack and 
> h/w's firmware are inevitably updated separately (even if distributed 
> and compiled together).  Yay, we are introducing a wonderful new 
> security problem here.
> 
> * from a security, network scanner and packet classifier point of view, 
> a single IP address no longer behaves like Linux.  It behaves like 
> Linux... sometime.  Depending on whether it is a magic TCP port or not.

I agree with everything Jeff has stated.

Also, I find it ironic that the port abduction is being asked for in
order to be "compatible with existing tools" yet in fact this stuff
breaks everything.  You can't netfilter this traffic, you can't apply
qdiscs to it, you can't execut TC actions on them, you can't do
segmentation offload on them, you can't look for the usual TCP MIB
statistics on the connection, etc. etc. etc.

It is broken from every possible angle.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-09  7:28             ` David Miller
@ 2008-08-09 14:04               ` Steve Wise
  2008-08-10  5:14               ` Roland Dreier
  1 sibling, 0 replies; 71+ messages in thread
From: Steve Wise @ 2008-08-09 14:04 UTC (permalink / raw)
  To: David Miller
  Cc: jgarzik, divy, rdreier, kxie, netdev, open-iscsi, michaelc,
	daisyc, wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

David Miller wrote:
> From: Jeff Garzik <jgarzik@pobox.com>
> Date: Fri, 08 Aug 2008 18:15:41 -0400
>
>   
>> * security updates for TCP problems mean that a single IP address can be 
>> PARTIALLY SECURE, because security updates for kernel TCP stack and 
>> h/w's firmware are inevitably updated separately (even if distributed 
>> and compiled together).  Yay, we are introducing a wonderful new 
>> security problem here.
>>
>> * from a security, network scanner and packet classifier point of view, 
>> a single IP address no longer behaves like Linux.  It behaves like 
>> Linux... sometime.  Depending on whether it is a magic TCP port or not.
>>     
>
> I agree with everything Jeff has stated.
>
> Also, I find it ironic that the port abduction is being asked for in
> order to be "compatible with existing tools" yet in fact this stuff
> breaks everything.  You can't netfilter this traffic, you can't apply
> qdiscs to it, you can't execut TC actions on them, you can't do
> segmentation offload on them, you can't look for the usual TCP MIB
> statistics on the connection, etc. etc. etc.
>
> It is broken from every possible angle.
>   

I think a lot of these _could_ be implemented and integrated with the 
standard tools.






^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-08 22:15           ` Jeff Garzik
  2008-08-08 22:20             ` Jeff Garzik
  2008-08-09  7:28             ` David Miller
@ 2008-08-10  5:12             ` Roland Dreier
  2008-08-10  5:46               ` David Miller
                                 ` (2 more replies)
  2 siblings, 3 replies; 71+ messages in thread
From: Roland Dreier @ 2008-08-10  5:12 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Steve Wise, davem, Divy Le Ray, Karen Xie, netdev, open-iscsi,
	michaelc, daisyc, wenxiong, bhua, Dimitrios Michailidis,
	Casey Leedom, linux-scsi, LKML

 > * however, giving the user the ability to co-manage IP addresses means
 > hacking up the kernel TCP code and userland tools for this new
 > concept, something that I think DaveM would rightly be a bit reluctant
 > to do? You are essentially adding a bunch of special case code
 > whenever TCP ports are used:
 > 
 > 	if (port in list of "magic" TCP ports with special,
 > 	    hardware-specific behavior)
 > 		...
 > 	else
 > 		do what we've been doing for decades

I think you're arguing against something that no one is actually
pushing.  What I'm sure Chelsio and probably other iSCSI offload vendors
would like is a way to make iSCSI (and other) offloads not steal magic
ports but actually hook into the normal infrastructure so that the
offloaded connections show up in netstat, etc.  Having this solution
would be nice not just for TCP offload but also for things like in-band
system management, which currently lead to the same hard-to-diagnose
issues when someone hits the stolen port.  And it also would seem to
help "classifier NICs" (Sun Neptune, Solarflare, etc) where some traffic
might be steered to a userspace TCP stack.

I don't think the proposal of just using a separate MAC and IP for the
iSCSI HBA really works, for two reasons:

 - It doesn't work in theory, because the suggestion (I guess) is that
   the iSCSI HBA has its own MAC and IP and behaves like a separate
   system.  But this means that to start with the HBA needs its own ARP,
   ICMP, routing, etc interface, which means we need some (probably new)
   interface to configure all of this.  And then it doesn't work in lots
   of networks; for example the ethernet jack in my office doesn't work
   without 802.1x authentication, and putting all of that in an iSCSI
   HBA's firmware clearly is crazy (not to mention creating the
   interface to pass 802.1x credentials into the kernel to pass to the
   HBA).

 - It doesn't work in practice because most of the existing NICs that
   are capable of iSCSI offload, eg Chelsio and Broadcom as well as 3 or
   4 other vendors, don't handle ARP, ICMP, etc in the device -- they
   need the host system to do it.  Which means that either we have a
   separate ARP/ICMP stack for offload adapters (obviously untenable) or
   a separate implemention in each driver (even more untenable), or we
   use the normal stack for the adapter, which seems to force us into
   creating a normal netdev for the iSCSI offload interface, which in
   turn seems to force us to figure out a way for offload adapters to
   coexist with the host stack (assuming of course that we care about
   iSCSI HBAs and/or stuff like NFS/RDMA).

A long time ago, DaveM pointed me at the paper "TCP offload is a dumb
idea whose time has come" (<http://www.usenix.org/events/hotos03/tech/full_papers/mogul/mogul_html/index.html>)
which is an interesting paper that argues that this time really is
different, and OS developers need to figure out how transport offload
fits in.  As a side note, funnily enough back in the thread where DaveM
mentioned that paper, Alan Cox said "Take a look at who holds the
official internet land speed record. Its not a TOE using system" but at
least as of now the current record for IPv4
(http://www.internet2.edu/lsr/) *is* held by a TOE.

I think there are two ways to proceed:

 - Start trying to figure out the best way to support the iSCSI offload
   hardware that's out there.  I don't know the perfect answer but I'm
   sure we can figure something out if we make an honest effort.

 - Ignore the issue and let users of iSCSI offload hardware (and iWARP
   and NFS/RDMA etc) stick to hacky out-of-tree solutions.  This pays
   off if stuff like the Intel CRC32C instruction plus faster CPUs (or
   "multithreaded" NICs that use multicore better) makes offload
   irrelevant.  However this ignores the fundamental 3X memory bandwidth
   cost of not doing direct placement in the NIC, and risks us being in
   a "well Solaris has support" situation down the road.

To be honest I think the best thing to do is just to get support for
these iSCSI offload adapters upstream in whatever form we can all agree
on, so that we can see a) whether anyone cares and b) if someone does
care, whether there's some better way to do things.

 > ISTR Roland(?) pointing out code that already does a bit of this in
 > the IB space...  but the point is

Not me... and I don't think that there would be anything like this for
InfiniBand, since IB is a completely different animal that has nothing
to do with TCP/IP.  You may be thinking of iWARP (RDMA over TCP/IP), but
actually the current Linux iWARP support completely punts on the issue
of coexisting with the native stack (basically because of a lack of
interest in solving the problems from the netdev side of things), which
leads to nasty issues that show up when things happen to collide.  So
far people seem to be coping by using nasty out-of-tree hacks.

 - R.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-09  7:28             ` David Miller
  2008-08-09 14:04               ` Steve Wise
@ 2008-08-10  5:14               ` Roland Dreier
  2008-08-10  5:47                 ` David Miller
  1 sibling, 1 reply; 71+ messages in thread
From: Roland Dreier @ 2008-08-10  5:14 UTC (permalink / raw)
  To: David Miller
  Cc: jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

 > Also, I find it ironic that the port abduction is being asked for in
 > order to be "compatible with existing tools" yet in fact this stuff
 > breaks everything.  You can't netfilter this traffic, you can't apply
 > qdiscs to it, you can't execut TC actions on them, you can't do
 > segmentation offload on them, you can't look for the usual TCP MIB
 > statistics on the connection, etc. etc. etc.

We already support offloads that break other features, eg large receive
offload breaks forwarding.  We deal with it.

I'm sure if we thought about it we could come up with clean ways to fix
some of the issues you raise, and just disable the offload if someone
wanted to use a feature we can't support.

 - R.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-10  5:12             ` Roland Dreier
@ 2008-08-10  5:46               ` David Miller
  2008-08-11 16:07                 ` Roland Dreier
  2008-08-11 18:13                 ` Rick Jones
  2008-08-10  6:24               ` Herbert Xu
  2008-08-10  9:19               ` Alan Cox
  2 siblings, 2 replies; 71+ messages in thread
From: David Miller @ 2008-08-10  5:46 UTC (permalink / raw)
  To: rdreier
  Cc: jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

From: Roland Dreier <rdreier@cisco.com>
Date: Sat, 09 Aug 2008 22:12:07 -0700

> What I'm sure Chelsio and probably other iSCSI offload vendors
> would like is a way to make iSCSI (and other) offloads not steal magic
> ports but actually hook into the normal infrastructure so that the
> offloaded connections show up in netstat, etc.

Why show these special connections if the user cannot interact with or
shape the stream at all like normal ones?

This whole "make it look normal" argument is entirely bogus because
none of the standard Linux networking facilities can be applied to
these things.

And I even wonder, these days, if you probably get %90 or more of the
gain these "optimized" iSCSI connections obtain from things like LRO.
And since LRO can be done entirely in software (although stateless
HW assistence helps), it is even a NIC agnostic performance improvement.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-10  5:14               ` Roland Dreier
@ 2008-08-10  5:47                 ` David Miller
  2008-08-10  6:34                   ` Herbert Xu
                                     ` (2 more replies)
  0 siblings, 3 replies; 71+ messages in thread
From: David Miller @ 2008-08-10  5:47 UTC (permalink / raw)
  To: rdreier
  Cc: jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

From: Roland Dreier <rdreier@cisco.com>
Date: Sat, 09 Aug 2008 22:14:11 -0700

>  > Also, I find it ironic that the port abduction is being asked for in
>  > order to be "compatible with existing tools" yet in fact this stuff
>  > breaks everything.  You can't netfilter this traffic, you can't apply
>  > qdiscs to it, you can't execut TC actions on them, you can't do
>  > segmentation offload on them, you can't look for the usual TCP MIB
>  > statistics on the connection, etc. etc. etc.
> 
> We already support offloads that break other features, eg large receive
> offload breaks forwarding.  We deal with it.

We turn it off.  If I want to shape or filter one of these iSCSI
connections can we turn it off?

It's funny you mention LRO because it probably gives most of whatever
gain these special iSCSI TCP connection offload things get.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-10  5:12             ` Roland Dreier
  2008-08-10  5:46               ` David Miller
@ 2008-08-10  6:24               ` Herbert Xu
  2008-08-10  9:19               ` Alan Cox
  2 siblings, 0 replies; 71+ messages in thread
From: Herbert Xu @ 2008-08-10  6:24 UTC (permalink / raw)
  To: Roland Dreier
  Cc: jgarzik, swise, davem, divy, kxie, netdev, open-iscsi, michaelc,
	daisyc, wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

Roland Dreier <rdreier@cisco.com> wrote:
>
> I think there are two ways to proceed:
> 
> - Start trying to figure out the best way to support the iSCSI offload
>   hardware that's out there.  I don't know the perfect answer but I'm
>   sure we can figure something out if we make an honest effort.
> 
> - Ignore the issue and let users of iSCSI offload hardware (and iWARP
>   and NFS/RDMA etc) stick to hacky out-of-tree solutions.  This pays
>   off if stuff like the Intel CRC32C instruction plus faster CPUs (or
>   "multithreaded" NICs that use multicore better) makes offload
>   irrelevant.  However this ignores the fundamental 3X memory bandwidth
>   cost of not doing direct placement in the NIC, and risks us being in
>   a "well Solaris has support" situation down the road.

We've been here many times before.  This is just the smae old TOE
debate all over again.  The fact with TOE is that history has shown
that Dave's decision has been spot on.

So you're going to have to come up with some really convincing
evidence that shows we are all wrong and these TOE-like hardware
offload solutions is the only way to go.  You can start by collecting
solid benchmark numbers that we can all reproduce and look into.

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-10  5:47                 ` David Miller
@ 2008-08-10  6:34                   ` Herbert Xu
  2008-08-10 17:57                   ` Steve Wise
  2008-08-11 16:09                   ` Roland Dreier
  2 siblings, 0 replies; 71+ messages in thread
From: Herbert Xu @ 2008-08-10  6:34 UTC (permalink / raw)
  To: David Miller
  Cc: rdreier, jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc,
	daisyc, wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

David Miller <davem@davemloft.net> wrote:
>
>> We already support offloads that break other features, eg large receive
>> offload breaks forwarding.  We deal with it.
> 
> We turn it off.  If I want to shape or filter one of these iSCSI
> connections can we turn it off?

Actually one of my TODO items is to restructure software LRO
so that we preserve the original packet headers while aggregating
the packets.  That would allow us to easily refragment them on
output for forwarding.

In other words LRO (at least the software variant) is not
fundamentally incompatible with forwarding.

I'd also like to encourage all hardware manufacturers considering
LRO support to provide a way for us to access the original headers
so that it doesn't have to be turned off for forwarding.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-10  5:12             ` Roland Dreier
  2008-08-10  5:46               ` David Miller
  2008-08-10  6:24               ` Herbert Xu
@ 2008-08-10  9:19               ` Alan Cox
  2008-08-10 12:49                 ` Jeff Garzik
  2 siblings, 1 reply; 71+ messages in thread
From: Alan Cox @ 2008-08-10  9:19 UTC (permalink / raw)
  To: Roland Dreier
  Cc: Jeff Garzik, Steve Wise, davem, Divy Le Ray, Karen Xie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua,
	Dimitrios Michailidis, Casey Leedom, linux-scsi, LKML

>  - It doesn't work in theory, because the suggestion (I guess) is that
>    the iSCSI HBA has its own MAC and IP and behaves like a separate

The iSCSI HBA is its own system - that is the root of the problem.

>    system.  But this means that to start with the HBA needs its own ARP,
>    ICMP, routing, etc interface, which means we need some (probably new)
>    interface to configure all of this.  And then it doesn't work in lots

Its another system so surely SNMP ;)

More seriously I do think iSCSI is actually a subtly special case of TOE.
Most TOE disintegrates under carefully chosen "malicious" workloads
because of the way it is optimised, and the lack of security integration
ranges can be very very dangeorus. A pure iSCSI connection is generally
private, single purpose and really is the classic application of "pigs fly
given enough thrust" - which is the only way to make the pig in question
(iSCSI) work properly.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-10  9:19               ` Alan Cox
@ 2008-08-10 12:49                 ` Jeff Garzik
  2008-08-10 14:54                   ` James Bottomley
  0 siblings, 1 reply; 71+ messages in thread
From: Jeff Garzik @ 2008-08-10 12:49 UTC (permalink / raw)
  To: Alan Cox
  Cc: Roland Dreier, Steve Wise, davem, Divy Le Ray, Karen Xie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua,
	Dimitrios Michailidis, Casey Leedom, linux-scsi, LKML

Alan Cox wrote:
>>  - It doesn't work in theory, because the suggestion (I guess) is that
>>    the iSCSI HBA has its own MAC and IP and behaves like a separate
> 
> The iSCSI HBA is its own system - that is the root of the problem.

Indeed.

Just like with TOE, from the net stack's point of view, an iSCSI HBA is 
essentially a wholly asynchronous remote system [with a really fast 
communication bus like PCI Express].

As such, the task becomes updating the net stack such that 
formerly-private resources are now shared with an independent, external 
system...  with all the complexity, additional failure modes, and 
additional security complications that come along with that.

	Jeff

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-10 12:49                 ` Jeff Garzik
@ 2008-08-10 14:54                   ` James Bottomley
       [not found]                     ` <1218380086.3418.21.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
  0 siblings, 1 reply; 71+ messages in thread
From: James Bottomley @ 2008-08-10 14:54 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Alan Cox, Roland Dreier, Steve Wise, davem, Divy Le Ray,
	Karen Xie, netdev, open-iscsi, michaelc, daisyc, wenxiong, bhua,
	Dimitrios Michailidis, Casey Leedom, linux-scsi, LKML

On Sun, 2008-08-10 at 08:49 -0400, Jeff Garzik wrote:
> Alan Cox wrote:
> >>  - It doesn't work in theory, because the suggestion (I guess) is that
> >>    the iSCSI HBA has its own MAC and IP and behaves like a separate
> > 
> > The iSCSI HBA is its own system - that is the root of the problem.
> 
> Indeed.
> 
> Just like with TOE, from the net stack's point of view, an iSCSI HBA is 
> essentially a wholly asynchronous remote system [with a really fast 
> communication bus like PCI Express].
> 
> As such, the task becomes updating the net stack such that 
> formerly-private resources are now shared with an independent, external 
> system...  with all the complexity, additional failure modes, and 
> additional security complications that come along with that.

What's wrong with making it configurable identically to current software
iSCSI?  i.e. plumb the thing into the current iscsi transport class so
that we use the standard daemon for creating and binding sessions?
Then, only once the session is bound do you let your iSCSI TOE stack
take over.

That way the connection appears to the network as completely normal,
because it has an open socket associated with it; and, since the
transport class has done the connection login, it even looks like a
normal iSCSI connection to the usual tools.  iSCSI would manage
connection and authentication, so your TOE stack can be simply around
the block acceleration piece (i.e. you'd need to get the iscsi daemon to
do relogin and things).

I would assume net will require some indicator that the opened
connection has been subsumed, so it knows not to try to manage it, but
other than that I don't see it will need any alteration.  The usual
tools, like netfilter could even use this information to know the limits
of their management.

If this model works, we can use it for TOE acceleration of individual
applications (rather than the entire TCP stack) on an as needed basis.

This is like the port stealing proposal, but since the iSCSI daemon is
responsible for maintaining the session, the port isn't completely
stolen, just switched to accelerator mode when doing the iSCSI offload.

James

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-10  5:47                 ` David Miller
  2008-08-10  6:34                   ` Herbert Xu
@ 2008-08-10 17:57                   ` Steve Wise
  2008-08-11 16:09                   ` Roland Dreier
  2 siblings, 0 replies; 71+ messages in thread
From: Steve Wise @ 2008-08-10 17:57 UTC (permalink / raw)
  To: David Miller
  Cc: rdreier, jgarzik, divy, kxie, netdev, open-iscsi, michaelc,
	daisyc, wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

David Miller wrote:
> From: Roland Dreier <rdreier@cisco.com>
> Date: Sat, 09 Aug 2008 22:14:11 -0700
>
>   
>>  > Also, I find it ironic that the port abduction is being asked for in
>>  > order to be "compatible with existing tools" yet in fact this stuff
>>  > breaks everything.  You can't netfilter this traffic, you can't apply
>>  > qdiscs to it, you can't execut TC actions on them, you can't do
>>  > segmentation offload on them, you can't look for the usual TCP MIB
>>  > statistics on the connection, etc. etc. etc.
>>
>> We already support offloads that break other features, eg large receive
>> offload breaks forwarding.  We deal with it.
>>     
>
> We turn it off.  If I want to shape or filter one of these iSCSI
> connections can we turn it off?
>
>   
Sure.

Seems to me we _could_ architect this all so that these devices would 
have to support a method for the management/admin tools to tweak, and if 
nothing else kill, offload connections if policy rules change and the 
existing connections aren't implementing the policy.  IE: if the offload 
connection doesn't support whatever security or other facilities that 
the admin requires, then the admin should have the ability to disable 
that device.  And of course, some devices will allow doing things like 
netfilter, qos, tweaking vlan tags, etc even on active connection,  if 
the OS infrastructure is there to hook it all up.

BTW:  I think all these offload devices provide MIBs and could be pulled 
in to the normal management tools.

Steve.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-10  5:46               ` David Miller
@ 2008-08-11 16:07                 ` Roland Dreier
  2008-08-11 21:08                   ` David Miller
  2008-08-11 18:13                 ` Rick Jones
  1 sibling, 1 reply; 71+ messages in thread
From: Roland Dreier @ 2008-08-11 16:07 UTC (permalink / raw)
  To: David Miller
  Cc: jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

 > Why show these special connections if the user cannot interact with or
 > shape the stream at all like normal ones?

So that an admin can see what connections are open, so that the stack
doesn't try to reuse the same 4-tuple for another connection, etc, etc.

 > And I even wonder, these days, if you probably get %90 or more of the
 > gain these "optimized" iSCSI connections obtain from things like LRO.

Yes, that's the question -- are stateless offloads (plus CRC32C in the
CPU etc) going to give good enough performance that the whole TCP
offload exercise is pointless?  The only issue is that I don't see how
to avoid the fundamental 3X increase in memory bandwidth that is chewed
up if the NIC can't do direct placement.

 - R.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-10  5:47                 ` David Miller
  2008-08-10  6:34                   ` Herbert Xu
  2008-08-10 17:57                   ` Steve Wise
@ 2008-08-11 16:09                   ` Roland Dreier
  2008-08-11 21:09                     ` David Miller
  2 siblings, 1 reply; 71+ messages in thread
From: Roland Dreier @ 2008-08-11 16:09 UTC (permalink / raw)
  To: David Miller
  Cc: jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

 > We turn it off.  If I want to shape or filter one of these iSCSI
 > connections can we turn it off?

That seems like a reasonable idea to me -- the standard thing to do when
a NIC offload conflicts with something else is to turn off the offload
and fall back to software.

 - R.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
       [not found]                     ` <1218380086.3418.21.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
@ 2008-08-11 16:50                       ` Mike Christie
  0 siblings, 0 replies; 71+ messages in thread
From: Mike Christie @ 2008-08-11 16:50 UTC (permalink / raw)
  To: James Bottomley
  Cc: Jeff Garzik, Alan Cox, Roland Dreier, Steve Wise,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q, Divy Le Ray, Karen Xie,
	netdev-u79uwXL29TY76Z2rM5mHXA, open-iscsi-/JYPxA39Uh5TLH3MbocFFw,
	daisyc-r/Jw6+rmf7HQT0dZR+AlfA, wenxiong-r/Jw6+rmf7HQT0dZR+AlfA,
	bhua-r/Jw6+rmf7HQT0dZR+AlfA, Dimitrios Michailidis, Casey Leedom,
	linux-scsi, LKML


James Bottomley wrote:
> On Sun, 2008-08-10 at 08:49 -0400, Jeff Garzik wrote:
>> Alan Cox wrote:
>>>>  - It doesn't work in theory, because the suggestion (I guess) is that
>>>>    the iSCSI HBA has its own MAC and IP and behaves like a separate
>>> The iSCSI HBA is its own system - that is the root of the problem.
>> Indeed.
>>
>> Just like with TOE, from the net stack's point of view, an iSCSI HBA is 
>> essentially a wholly asynchronous remote system [with a really fast 
>> communication bus like PCI Express].
>>
>> As such, the task becomes updating the net stack such that 
>> formerly-private resources are now shared with an independent, external 
>> system...  with all the complexity, additional failure modes, and 
>> additional security complications that come along with that.
> 
> What's wrong with making it configurable identically to current software
> iSCSI?  i.e. plumb the thing into the current iscsi transport class so
> that we use the standard daemon for creating and binding sessions?
> Then, only once the session is bound do you let your iSCSI TOE stack
> take over.
> 
> That way the connection appears to the network as completely normal,
> because it has an open socket associated with it; and, since the
> transport class has done the connection login, it even looks like a
> normal iSCSI connection to the usual tools.  iSCSI would manage
> connection and authentication, so your TOE stack can be simply around
> the block acceleration piece (i.e. you'd need to get the iscsi daemon to
> do relogin and things).


This is what Chelsio and broadcom do today more or less. Chelsio did the 
socket trick you are proposing. Broadcom went with a different hack. But 
in the end both hook into the iscsi transport class (the current iscsi 
transport class works for this today), userspace daemon and tools, so 
that the iscsi daemon handles iscsi login, iscsi authentication and all 
other iscsi operations, like it does for software iscsi.


> 
> I would assume net will require some indicator that the opened
> connection has been subsumed, so it knows not to try to manage it, but
> other than that I don't see it will need any alteration.  The usual
> tools, like netfilter could even use this information to know the limits
> of their management.
> 
> If this model works, we can use it for TOE acceleration of individual
> applications (rather than the entire TCP stack) on an as needed basis.
> 
> This is like the port stealing proposal, but since the iSCSI daemon is
> responsible for maintaining the session, the port isn't completely
> stolen, just switched to accelerator mode when doing the iSCSI offload.
> 

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-10  5:46               ` David Miller
  2008-08-11 16:07                 ` Roland Dreier
@ 2008-08-11 18:13                 ` Rick Jones
  2008-08-11 21:12                   ` David Miller
  1 sibling, 1 reply; 71+ messages in thread
From: Rick Jones @ 2008-08-11 18:13 UTC (permalink / raw)
  To: David Miller
  Cc: rdreier, jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc,
	daisyc, wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

David Miller wrote:
> And I even wonder, these days, if you probably get %90 or more of the
> gain these "optimized" iSCSI connections obtain from things like LRO.
> And since LRO can be done entirely in software (although stateless
> HW assistence helps), it is even a NIC agnostic performance improvement.

Probably depends on whether or not the iSCSI offload solutions are doing 
zero-copy receive into the filecache?

rick jones

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 16:07                 ` Roland Dreier
@ 2008-08-11 21:08                   ` David Miller
  2008-08-11 21:39                     ` Roland Dreier
  0 siblings, 1 reply; 71+ messages in thread
From: David Miller @ 2008-08-11 21:08 UTC (permalink / raw)
  To: rdreier
  Cc: jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

From: Roland Dreier <rdreier@cisco.com>
Date: Mon, 11 Aug 2008 09:07:51 -0700

> Yes, that's the question -- are stateless offloads (plus CRC32C in the
> CPU etc) going to give good enough performance that the whole TCP
> offload exercise is pointless?

This is by definition true, over time.  And this has stedfastly proven
itself, over and over again.

That's why we call stateful offloads a point in time solution.
They are constantly being obsoleted by time.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 16:09                   ` Roland Dreier
@ 2008-08-11 21:09                     ` David Miller
  2008-08-11 21:37                       ` Roland Dreier
  2008-08-11 23:20                       ` Steve Wise
  0 siblings, 2 replies; 71+ messages in thread
From: David Miller @ 2008-08-11 21:09 UTC (permalink / raw)
  To: rdreier
  Cc: jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

From: Roland Dreier <rdreier@cisco.com>
Date: Mon, 11 Aug 2008 09:09:02 -0700

>  > We turn it off.  If I want to shape or filter one of these iSCSI
>  > connections can we turn it off?
> 
> That seems like a reasonable idea to me -- the standard thing to do when
> a NIC offload conflicts with something else is to turn off the offload
> and fall back to software.

But as Herbert says, we can make LRO such that turning it off
isn't necessary.

Can we shape the iSCSI offload traffic without turning it off?

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 18:13                 ` Rick Jones
@ 2008-08-11 21:12                   ` David Miller
  2008-08-11 21:41                     ` Roland Dreier
  2008-08-14 20:45                     ` Andrew Gallatin
  0 siblings, 2 replies; 71+ messages in thread
From: David Miller @ 2008-08-11 21:12 UTC (permalink / raw)
  To: rick.jones2
  Cc: rdreier, jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc,
	daisyc, wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

From: Rick Jones <rick.jones2@hp.com>
Date: Mon, 11 Aug 2008 11:13:25 -0700

> David Miller wrote:
> > And I even wonder, these days, if you probably get %90 or more of the
> > gain these "optimized" iSCSI connections obtain from things like LRO.
> > And since LRO can be done entirely in software (although stateless
> > HW assistence helps), it is even a NIC agnostic performance improvement.
> 
> Probably depends on whether or not the iSCSI offload solutions are doing 
> zero-copy receive into the filecache?

That's a data placement issue, which also can be solved with
stateless offloading.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 21:09                     ` David Miller
@ 2008-08-11 21:37                       ` Roland Dreier
  2008-08-11 21:51                         ` David Miller
  2008-08-11 23:20                       ` Steve Wise
  1 sibling, 1 reply; 71+ messages in thread
From: Roland Dreier @ 2008-08-11 21:37 UTC (permalink / raw)
  To: David Miller
  Cc: jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

 > But as Herbert says, we can make LRO such that turning it off
 > isn't necessary.
 > 
 > Can we shape the iSCSI offload traffic without turning it off?

Sure... the same way we can ask the HW vendors to keep old headers
around when aggregating for LRO, we can ask HW vendors for hooks for
shaping iSCSI traffic.  And the Chelsio TCP speed record seems to show
that they already have pretty sophisticated queueing/shaping in their
current HW.

 - R.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 21:08                   ` David Miller
@ 2008-08-11 21:39                     ` Roland Dreier
  2008-08-11 21:52                       ` David Miller
  0 siblings, 1 reply; 71+ messages in thread
From: Roland Dreier @ 2008-08-11 21:39 UTC (permalink / raw)
  To: David Miller
  Cc: jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

 > > Yes, that's the question -- are stateless offloads (plus CRC32C in the
 > > CPU etc) going to give good enough performance that the whole TCP
 > > offload exercise is pointless?
 > 
 > This is by definition true, over time.  And this has stedfastly proven
 > itself, over and over again.

By the definition of what?

 - R.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 21:12                   ` David Miller
@ 2008-08-11 21:41                     ` Roland Dreier
  2008-08-11 21:53                       ` David Miller
  2008-08-14 20:45                     ` Andrew Gallatin
  1 sibling, 1 reply; 71+ messages in thread
From: Roland Dreier @ 2008-08-11 21:41 UTC (permalink / raw)
  To: David Miller
  Cc: rick.jones2, jgarzik, swise, divy, kxie, netdev, open-iscsi,
	michaelc, daisyc, wenxiong, bhua, dm, leedom, linux-scsi,
	linux-kernel

 > > Probably depends on whether or not the iSCSI offload solutions are doing 
 > > zero-copy receive into the filecache?
 > 
 > That's a data placement issue, which also can be solved with
 > stateless offloading.

How can you place iSCSI data properly with only stateless offloads?

 - R.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 21:37                       ` Roland Dreier
@ 2008-08-11 21:51                         ` David Miller
  0 siblings, 0 replies; 71+ messages in thread
From: David Miller @ 2008-08-11 21:51 UTC (permalink / raw)
  To: rdreier
  Cc: jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

From: Roland Dreier <rdreier@cisco.com>
Date: Mon, 11 Aug 2008 14:37:59 -0700

>  > But as Herbert says, we can make LRO such that turning it off
>  > isn't necessary.
>  > 
>  > Can we shape the iSCSI offload traffic without turning it off?
> 
> Sure... the same way we can ask the HW vendors to keep old headers
> around when aggregating for LRO, we can ask HW vendors for hooks for
> shaping iSCSI traffic.  And the Chelsio TCP speed record seems to show
> that they already have pretty sophisticated queueing/shaping in their
> current HW.

You don't get it, you can't add the entire netfilter and qdisc
stack into the silly firmware.

And we can't fix bugs there either.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 21:39                     ` Roland Dreier
@ 2008-08-11 21:52                       ` David Miller
  0 siblings, 0 replies; 71+ messages in thread
From: David Miller @ 2008-08-11 21:52 UTC (permalink / raw)
  To: rdreier
  Cc: jgarzik, swise, divy, kxie, netdev, open-iscsi, michaelc, daisyc,
	wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

From: Roland Dreier <rdreier@cisco.com>
Date: Mon, 11 Aug 2008 14:39:47 -0700

>  > > Yes, that's the question -- are stateless offloads (plus CRC32C in the
>  > > CPU etc) going to give good enough performance that the whole TCP
>  > > offload exercise is pointless?
>  > 
>  > This is by definition true, over time.  And this has stedfastly proven
>  > itself, over and over again.
> 
> By the definition of what?

By definition of time always advancing forward, and cpus always
getting faster, and memory (albeit more slowly) increasing in
speed too,


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 21:41                     ` Roland Dreier
@ 2008-08-11 21:53                       ` David Miller
  2008-08-12 21:57                         ` Divy Le Ray
  2008-08-13 21:27                         ` Roland Dreier
  0 siblings, 2 replies; 71+ messages in thread
From: David Miller @ 2008-08-11 21:53 UTC (permalink / raw)
  To: rdreier
  Cc: rick.jones2, jgarzik, swise, divy, kxie, netdev, open-iscsi,
	michaelc, daisyc, wenxiong, bhua, dm, leedom, linux-scsi,
	linux-kernel

From: Roland Dreier <rdreier@cisco.com>
Date: Mon, 11 Aug 2008 14:41:16 -0700

>  > > Probably depends on whether or not the iSCSI offload solutions are doing 
>  > > zero-copy receive into the filecache?
>  > 
>  > That's a data placement issue, which also can be solved with
>  > stateless offloading.
> 
> How can you place iSCSI data properly with only stateless offloads?

By teaching the stateless offload how to parse the iSCSI headers
on the flow and place the data into pages at the correct offsets
such that you can place the pages hanging off of the SKB directly
into the page cache.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 21:09                     ` David Miller
  2008-08-11 21:37                       ` Roland Dreier
@ 2008-08-11 23:20                       ` Steve Wise
  2008-08-11 23:45                         ` Divy Le Ray
  2008-08-12  0:22                         ` David Miller
  1 sibling, 2 replies; 71+ messages in thread
From: Steve Wise @ 2008-08-11 23:20 UTC (permalink / raw)
  To: David Miller
  Cc: rdreier, jgarzik, divy, kxie, netdev, open-iscsi, michaelc,
	daisyc, wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

David Miller wrote:
> From: Roland Dreier <rdreier@cisco.com>
> Date: Mon, 11 Aug 2008 09:09:02 -0700
>
>   
>>  > We turn it off.  If I want to shape or filter one of these iSCSI
>>  > connections can we turn it off?
>>
>> That seems like a reasonable idea to me -- the standard thing to do when
>> a NIC offload conflicts with something else is to turn off the offload
>> and fall back to software.
>>     
>
> But as Herbert says, we can make LRO such that turning it off
> isn't necessary.
>
> Can we shape the iSCSI offload traffic without turning it off?
>   

With Chelsio's product you can do this.  Maybe Divy can provide details?




^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 23:20                       ` Steve Wise
@ 2008-08-11 23:45                         ` Divy Le Ray
  2008-08-12  0:22                         ` David Miller
  1 sibling, 0 replies; 71+ messages in thread
From: Divy Le Ray @ 2008-08-11 23:45 UTC (permalink / raw)
  To: Steve Wise
  Cc: David Miller, rdreier, jgarzik, Karen Xie, netdev, open-iscsi,
	michaelc, daisyc, wenxiong, bhua, Dimitrios Michailidis,
	Casey Leedom, linux-scsi, linux-kernel

On Monday 11 August 2008 04:20:07 pm Steve Wise wrote:
> David Miller wrote:
> > From: Roland Dreier <rdreier@cisco.com>
> > Date: Mon, 11 Aug 2008 09:09:02 -0700
> >
> >>  > We turn it off.  If I want to shape or filter one of these iSCSI
> >>  > connections can we turn it off?
> >>
> >> That seems like a reasonable idea to me -- the standard thing to do when
> >> a NIC offload conflicts with something else is to turn off the offload
> >> and fall back to software.
> >
> > But as Herbert says, we can make LRO such that turning it off
> > isn't necessary.
> >
> > Can we shape the iSCSI offload traffic without turning it off?
>
> With Chelsio's product you can do this.  Maybe Divy can provide details?

The T3 adapter is capable of performing rate control and pacing based on RTT 
on a per-connection basis.

Cheers,
Divy

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 23:20                       ` Steve Wise
  2008-08-11 23:45                         ` Divy Le Ray
@ 2008-08-12  0:22                         ` David Miller
  1 sibling, 0 replies; 71+ messages in thread
From: David Miller @ 2008-08-12  0:22 UTC (permalink / raw)
  To: swise
  Cc: rdreier, jgarzik, divy, kxie, netdev, open-iscsi, michaelc,
	daisyc, wenxiong, bhua, dm, leedom, linux-scsi, linux-kernel

From: Steve Wise <swise@opengridcomputing.com>
Date: Mon, 11 Aug 2008 18:20:07 -0500

> David Miller wrote:
> > From: Roland Dreier <rdreier@cisco.com>
> > Date: Mon, 11 Aug 2008 09:09:02 -0700
> >
> >   
> >>  > We turn it off.  If I want to shape or filter one of these iSCSI
> >>  > connections can we turn it off?
> >>
> >> That seems like a reasonable idea to me -- the standard thing to do when
> >> a NIC offload conflicts with something else is to turn off the offload
> >> and fall back to software.
> >>     
> >
> > But as Herbert says, we can make LRO such that turning it off
> > isn't necessary.
> >
> > Can we shape the iSCSI offload traffic without turning it off?
> >   
> 
> With Chelsio's product you can do this.  Maybe Divy can provide details?

When I say shape I mean apply any packet scheduler, any netfilter
module, and any other feature we support.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 21:53                       ` David Miller
@ 2008-08-12 21:57                         ` Divy Le Ray
  2008-08-12 22:01                           ` David Miller
  2008-08-12 22:02                           ` David Miller
  2008-08-13 21:27                         ` Roland Dreier
  1 sibling, 2 replies; 71+ messages in thread
From: Divy Le Ray @ 2008-08-12 21:57 UTC (permalink / raw)
  To: David Miller
  Cc: rdreier, rick.jones2, jgarzik, Steve Wise, Karen Xie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua,
	Dimitrios Michailidis, Casey Leedom, linux-scsi, linux-kernel

On Monday 11 August 2008 02:53:13 pm David Miller wrote:
> From: Roland Dreier <rdreier@cisco.com>
> Date: Mon, 11 Aug 2008 14:41:16 -0700
>
> >  > > Probably depends on whether or not the iSCSI offload solutions are
> >  > > doing zero-copy receive into the filecache?
> >  >
> >  > That's a data placement issue, which also can be solved with
> >  > stateless offloading.
> >
> > How can you place iSCSI data properly with only stateless offloads?
>
> By teaching the stateless offload how to parse the iSCSI headers
> on the flow and place the data into pages at the correct offsets
> such that you can place the pages hanging off of the SKB directly
> into the page cache.

Hi Dave,

iSCSI PDUs might spawn over multiple TCP segments, it is unclear to me how to 
do placement without keeping some state of the transactions.

In any case, such a stateless solution is not yet designed, whereas 
accelerated iSCSI is available now, from us and other companies.
The accelerated iSCSI streams benefit from the performance TOE provides, 
outlined in the following third party papers:
http://www.chelsio.com/assetlibrary/pdf/redhat-chelsio-toe-final_v2.pdf
http://www.chelsio.com/assetlibrary/pdf/RMDS6BNTChelsioRHEL5.pdf

iSCSI is primarily targeted to the data center, where the SW stack's traffic 
shaping features might be redundant with specialized equipment. It should 
however be possible to integrate security features on a per offoaded 
connection basis, and TOEs - at least ours :) - are capable of rate control 
and traffic shaping.

While CPU and - to a far lesser extent - memory performance improves, so does 
ethernet's. 40G, 100G are not too far ahead. It is not obvious at all that 
TOE is a point of time solution, especially for heavy load traffic as in a 
storage environment. It is quite the opposite actually.

There is room for co-existence of the SW managed traffic and accelerated 
traffic. As our submission shows, enabling accelerated iSCSI is not intrusive 
code wise to the stack. The port stealing issue is solved if we can grab a 
port from the stack.

Cheers,
Divy

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-12 21:57                         ` Divy Le Ray
@ 2008-08-12 22:01                           ` David Miller
  2008-08-12 22:02                           ` David Miller
  1 sibling, 0 replies; 71+ messages in thread
From: David Miller @ 2008-08-12 22:01 UTC (permalink / raw)
  To: divy
  Cc: rdreier, rick.jones2, jgarzik, swise, kxie, netdev, open-iscsi,
	michaelc, daisyc, wenxiong, bhua, dm, leedom, linux-scsi,
	linux-kernel

From: Divy Le Ray <divy@chelsio.com>
Date: Tue, 12 Aug 2008 14:57:09 -0700

> iSCSI PDUs might spawn over multiple TCP segments, it is unclear to me how to 
> do placement without keeping some state of the transactions.

You keep a flow table with buffer IDs and offsets.

The S2IO guys did something similar for one of their initial LRO
impelementations.

It's still strictly stateless, and best-effort.  Entries can fall out
of the flow cache which makes upcoming data use new buffers and
offsets.

But these are the kinds of tricks you hardware folks should be
more than adequately able to design, rather than me. :-)

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-12 21:57                         ` Divy Le Ray
  2008-08-12 22:01                           ` David Miller
@ 2008-08-12 22:02                           ` David Miller
  2008-08-12 22:21                             ` Divy Le Ray
  1 sibling, 1 reply; 71+ messages in thread
From: David Miller @ 2008-08-12 22:02 UTC (permalink / raw)
  To: divy
  Cc: rdreier, rick.jones2, jgarzik, swise, kxie, netdev, open-iscsi,
	michaelc, daisyc, wenxiong, bhua, dm, leedom, linux-scsi,
	linux-kernel

From: Divy Le Ray <divy@chelsio.com>
Date: Tue, 12 Aug 2008 14:57:09 -0700

> In any case, such a stateless solution is not yet designed, whereas 
> accelerated iSCSI is available now, from us and other companies.

So, WHAT?!

There are TOE pieces of crap out there too.

It's strictly not our problem.

Like Herbert said, this is the TOE discussion all over again.
The results will be the same, and as per our decisions wrt.
TOE, history speaks for itself.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-12 22:02                           ` David Miller
@ 2008-08-12 22:21                             ` Divy Le Ray
  2008-08-13  1:57                               ` Herbert Xu
  2008-08-13 18:35                               ` Vladislav Bolkhovitin
  0 siblings, 2 replies; 71+ messages in thread
From: Divy Le Ray @ 2008-08-12 22:21 UTC (permalink / raw)
  To: David Miller
  Cc: rdreier, rick.jones2, jgarzik, Steve Wise, Karen Xie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua,
	Dimitrios Michailidis, Casey Leedom, linux-scsi, linux-kernel

On Tuesday 12 August 2008 03:02:46 pm David Miller wrote:
> From: Divy Le Ray <divy@chelsio.com>
> Date: Tue, 12 Aug 2008 14:57:09 -0700
>
> > In any case, such a stateless solution is not yet designed, whereas
> > accelerated iSCSI is available now, from us and other companies.
>
> So, WHAT?!
>
> There are TOE pieces of crap out there too.

Well, there is demand for accerated iscsi out there, which is the driving 
reason of our driver submission. 

>
> It's strictly not our problem.
>
> Like Herbert said, this is the TOE discussion all over again.
> The results will be the same, and as per our decisions wrt.
> TOE, history speaks for itself.

Herbert requested some benchmark numbers, I consequently obliged.

Cheers,
Divy




^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-12 22:21                             ` Divy Le Ray
@ 2008-08-13  1:57                               ` Herbert Xu
  2008-08-13 18:35                               ` Vladislav Bolkhovitin
  1 sibling, 0 replies; 71+ messages in thread
From: Herbert Xu @ 2008-08-13  1:57 UTC (permalink / raw)
  To: Divy Le Ray
  Cc: davem, rdreier, rick.jones2, jgarzik, swise, kxie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua, dm, leedom,
	linux-scsi, linux-kernel

Divy Le Ray <divy@chelsio.com> wrote:
>
> Herbert requested some benchmark numbers, I consequently obliged.

Have you posted a hardware-accelerated iSCSI vs. LRO comparison?

Thanks,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-12 22:21                             ` Divy Le Ray
  2008-08-13  1:57                               ` Herbert Xu
@ 2008-08-13 18:35                               ` Vladislav Bolkhovitin
  2008-08-13 19:29                                 ` Jeff Garzik
  2008-08-13 20:23                                 ` David Miller
  1 sibling, 2 replies; 71+ messages in thread
From: Vladislav Bolkhovitin @ 2008-08-13 18:35 UTC (permalink / raw)
  To: David Miller
  Cc: open-iscsi, rdreier, rick.jones2, jgarzik, Steve Wise, Karen Xie,
	netdev, michaelc, daisyc, wenxiong, bhua, Dimitrios Michailidis,
	Casey Leedom, linux-scsi, linux-kernel

Divy Le Ray wrote:
> On Tuesday 12 August 2008 03:02:46 pm David Miller wrote:
>> From: Divy Le Ray <divy@chelsio.com>
>> Date: Tue, 12 Aug 2008 14:57:09 -0700
>>
>>> In any case, such a stateless solution is not yet designed, whereas
>>> accelerated iSCSI is available now, from us and other companies.
>> So, WHAT?!
>>
>> There are TOE pieces of crap out there too.
> 
> Well, there is demand for accerated iscsi out there, which is the driving 
> reason of our driver submission. 

I'm, as an iSCSI target developer, strongly voting for hardware iSCSI 
offload. Having possibility of the direct data placement is a *HUGE* 
performance gain.

For example, according to measurements done by one iSCSI-SCST user in 
system with iSCSI initiator and iSCSI target (with iSCSI-SCST 
(http://scst.sourceforge.net/target_iscsi.html) running), both with 
identical modern high speed hardware and 10GbE cards, the _INITIATOR_ is 
the bottleneck for READs (data transfers from target to initiator). This 
is because the target sends data in a zero-copy manner, so its CPU is 
capable to deal with the load, but on the initiator there are additional 
data copies from skb's to page cache and from page cache to application. 
As the result, in the measurements initiator got near 100% CPU load and 
only ~500MB/s throughput. Target had ~30% CPU load. For the opposite 
direction (WRITEs), where there is no the application data copy on the 
target, throughput was ~800MB/s with also near 100% CPU load, but in 
this case on the target. The initiator ran Linux with open-iscsi. The 
test was with real backstorage: target ran BLOCKIO (direct BIOs to/from 
backstorage) with 3ware card. Locally on the target the backstorage was 
able to provide 900+MB/s for READs and about 1GB/s for WRITEs. The 
commands queue in both cases was sufficiently big to eliminate the link 
and processing latencies (20-30 outstanding commands).

Vlad

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-13 18:35                               ` Vladislav Bolkhovitin
@ 2008-08-13 19:29                                 ` Jeff Garzik
  2008-08-13 20:13                                   ` David Miller
  2008-08-14 18:24                                   ` Vladislav Bolkhovitin
  2008-08-13 20:23                                 ` David Miller
  1 sibling, 2 replies; 71+ messages in thread
From: Jeff Garzik @ 2008-08-13 19:29 UTC (permalink / raw)
  To: Vladislav Bolkhovitin
  Cc: David Miller, open-iscsi, rdreier, rick.jones2, Steve Wise,
	Karen Xie, netdev, michaelc, daisyc, wenxiong, bhua,
	Dimitrios Michailidis, Casey Leedom, linux-scsi, linux-kernel

Vladislav Bolkhovitin wrote:
> Divy Le Ray wrote:
>> On Tuesday 12 August 2008 03:02:46 pm David Miller wrote:
>>> From: Divy Le Ray <divy@chelsio.com>
>>> Date: Tue, 12 Aug 2008 14:57:09 -0700
>>>
>>>> In any case, such a stateless solution is not yet designed, whereas
>>>> accelerated iSCSI is available now, from us and other companies.
>>> So, WHAT?!
>>>
>>> There are TOE pieces of crap out there too.
>>
>> Well, there is demand for accerated iscsi out there, which is the 
>> driving reason of our driver submission. 
> 
> I'm, as an iSCSI target developer, strongly voting for hardware iSCSI 
> offload. Having possibility of the direct data placement is a *HUGE* 
> performance gain.

Well, two responses here:

* no one is arguing against hardware iSCSI offload.  Rather, it is a 
problem with a specific implementation, one that falsely assumes two 
independent TCP stacks can co-exist peacefully on the same IP address 
and MAC.

* direct data placement is possible without offloading the entire TCP 
stack onto a firmware/chip.

There is plenty of room for hardware iSCSI offload...

	Jeff



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-13 19:29                                 ` Jeff Garzik
@ 2008-08-13 20:13                                   ` David Miller
  2008-08-14 18:24                                   ` Vladislav Bolkhovitin
  1 sibling, 0 replies; 71+ messages in thread
From: David Miller @ 2008-08-13 20:13 UTC (permalink / raw)
  To: jgarzik
  Cc: vst, open-iscsi, rdreier, rick.jones2, swise, kxie, netdev,
	michaelc, daisyc, wenxiong, bhua, dm, leedom, linux-scsi,
	linux-kernel

From: Jeff Garzik <jgarzik@pobox.com>
Date: Wed, 13 Aug 2008 15:29:55 -0400

> * direct data placement is possible without offloading the entire TCP 
> stack onto a firmware/chip.

I've even described in this thread how that's possible.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-13 18:35                               ` Vladislav Bolkhovitin
  2008-08-13 19:29                                 ` Jeff Garzik
@ 2008-08-13 20:23                                 ` David Miller
  2008-08-14 18:27                                   ` Vladislav Bolkhovitin
  1 sibling, 1 reply; 71+ messages in thread
From: David Miller @ 2008-08-13 20:23 UTC (permalink / raw)
  To: vst
  Cc: open-iscsi, rdreier, rick.jones2, jgarzik, swise, kxie, netdev,
	michaelc, daisyc, wenxiong, bhua, dm, leedom, linux-scsi,
	linux-kernel

From: Vladislav Bolkhovitin <vst@vlnb.net>
Date: Wed, 13 Aug 2008 22:35:34 +0400

> This is because the target sends data in a zero-copy manner, so its
> CPU is capable to deal with the load, but on the initiator there are
> additional data copies from skb's to page cache and from page cache
> to application.

If you've actually been reading at all what I've been saying in this
thread you'll see that I've described a method to do this copy
avoidance in a completely stateless manner.

You don't need to implement a TCP stack in the card in order to do
data placement optimizations.  They can be done completely stateless.

Also, large portions of the cpu overhead are transactional costs,
which are significantly reduced by existing technologies such as
LRO.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 21:53                       ` David Miller
  2008-08-12 21:57                         ` Divy Le Ray
@ 2008-08-13 21:27                         ` Roland Dreier
  2008-08-13 22:08                           ` David Miller
  1 sibling, 1 reply; 71+ messages in thread
From: Roland Dreier @ 2008-08-13 21:27 UTC (permalink / raw)
  To: David Miller
  Cc: rick.jones2, jgarzik, swise, divy, kxie, netdev, open-iscsi,
	michaelc, daisyc, wenxiong, bhua, dm, leedom, linux-scsi,
	linux-kernel

 > > How can you place iSCSI data properly with only stateless offloads?

 > By teaching the stateless offload how to parse the iSCSI headers
 > on the flow and place the data into pages at the correct offsets
 > such that you can place the pages hanging off of the SKB directly
 > into the page cache.

I don't see how this could work.  First, it seems that you have to let
the adapter know which connections are iSCSI connections so that it
knows when to try and parse iSCSI headers.  So you're already not
totally stateless.  Then, since (AFAIK -- I'm not an expert on iSCSI and
especially I'm not an expert on what common practice is for current
implementations) the iSCSI PDUs can start at any offset in the TCP
stream, I don't see how a stateless adapter can even find the PDU
headers to parse -- there's not any way that I know of to recognize
where a PDU boundary is without keeping track of the lengths of all the
PDUs that go by (ie you need per-connection state).

Even if the adapter could find the PDUs, I don't see how it could come
up with the correct offset to place the data -- PDUs with response data
just carry an opaque tag assigned by the iSCSI initiator.  Finally, if
there are ways around all of those difficulties, we would still have to
do major surgery to our block layer to cope with read requests that
complete into random pages, rather than using a scatter list passed into
the low-level driver.

But I think all this argument is missing the point anyway.  The real
issue is not hand-waving about what someone might build someday, but how
we want to support iSCSI offload with the existing Chelsio, Broadcom,
etc adapters.  The answer might be, "we don't," but I disagree with that
choice because:

 a. "No upstream support" really ends up being "enterprise distros and
    customers end up using hacky out-of-tree drivers and blaming us."

 b. It sends a bad message to vendors who put a lot of effort into
    writing a clean, mergable driver and responding to review if the
    answer is, "Sorry, your hardware is wrong so no driver for you."

Maybe the answer is that we just add the iSCSI HBA drivers with no help
from the networking stack, and ignore the port collision problem.  For
iSCSI initiators, it's really not an issue: for a 4-tuple to collide,
someone would have to use both offloaded and non-offloaded connections
to the same target and be unlucky in the source port chosen.  It would
be nice to be able to discuss solutions to port collisions, but it may
be that this is too emotional an issue for that to be possible.

 - R.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-13 21:27                         ` Roland Dreier
@ 2008-08-13 22:08                           ` David Miller
  2008-08-13 23:03                             ` Roland Dreier
  0 siblings, 1 reply; 71+ messages in thread
From: David Miller @ 2008-08-13 22:08 UTC (permalink / raw)
  To: rdreier
  Cc: rick.jones2, jgarzik, swise, divy, kxie, netdev, open-iscsi,
	michaelc, daisyc, wenxiong, bhua, dm, leedom, linux-scsi,
	linux-kernel

From: Roland Dreier <rdreier@cisco.com>
Date: Wed, 13 Aug 2008 14:27:50 -0700

> I don't see how this could work.  First, it seems that you have to let
> the adapter know which connections are iSCSI connections so that it
> knows when to try and parse iSCSI headers.

It always starts from offset zero for never seen before connections.

> So you're already not totally stateless.

Yes, we are.

> Then, since (AFAIK -- I'm not an expert on iSCSI and
> especially I'm not an expert on what common practice is for current
> implementations) the iSCSI PDUs can start at any offset in the TCP
> stream, I don't see how a stateless adapter can even find the PDU
> headers to parse -- there's not any way that I know of to recognize
> where a PDU boundary is without keeping track of the lengths of all the
> PDUs that go by (ie you need per-connection state).

Like I said, you retain a "flow cache" (say it a million times, "flow
cache") that remembers the current parameters and the buffers
currently assigned to that flow and what offset within those buffers.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-13 22:08                           ` David Miller
@ 2008-08-13 23:03                             ` Roland Dreier
  2008-08-13 23:12                               ` David Miller
  0 siblings, 1 reply; 71+ messages in thread
From: Roland Dreier @ 2008-08-13 23:03 UTC (permalink / raw)
  To: David Miller
  Cc: rick.jones2, jgarzik, swise, divy, kxie, netdev, open-iscsi,
	michaelc, daisyc, wenxiong, bhua, dm, leedom, linux-scsi,
	linux-kernel

 > Like I said, you retain a "flow cache" (say it a million times, "flow
 > cache") that remembers the current parameters and the buffers
 > currently assigned to that flow and what offset within those buffers.

OK, I admit you could make something work -- add hooks for the low-level
driver to ask the iSCSI initiator where PDU boundaries are so it can
resync when something is evicted from the flow cache, have the initiator
format its tags in a special way to encode placement data, etc, etc.
The scheme does bring to mind Alan's earlier comment about pigs and
propulsion, though.

In any case, as I said in the part of my email that you snipped, the
real issue is not designing hypothetical hardware, but deciding how to
support the Chelsio, Broadcom, etc hardware that exists today.

 - R.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-13 23:03                             ` Roland Dreier
@ 2008-08-13 23:12                               ` David Miller
  2008-08-14  1:26                                 ` Tom Tucker
  0 siblings, 1 reply; 71+ messages in thread
From: David Miller @ 2008-08-13 23:12 UTC (permalink / raw)
  To: rdreier
  Cc: rick.jones2, jgarzik, swise, divy, kxie, netdev, open-iscsi,
	michaelc, daisyc, wenxiong, bhua, dm, leedom, linux-scsi,
	linux-kernel

From: Roland Dreier <rdreier@cisco.com>
Date: Wed, 13 Aug 2008 16:03:15 -0700

> OK, I admit you could make something work -- add hooks for the low-level
> driver to ask the iSCSI initiator where PDU boundaries are so it can
> resync when something is evicted from the flow cache, have the initiator
> format its tags in a special way to encode placement data, etc, etc.
> The scheme does bring to mind Alan's earlier comment about pigs and
> propulsion, though.

There would need to be _NO_ hooks into the iSCSI initiator at all.

The card would land the block I/O data onto the necessary page boundaries
and the iSCSI code would just be able to thus use the pages directly
and as-is.

It would look perfectly like normal TCP receive traffic.  No hooks,
no special cases, nothing like that.

> In any case, as I said in the part of my email that you snipped, the
> real issue is not designing hypothetical hardware, but deciding how to
> support the Chelsio, Broadcom, etc hardware that exists today.

The same like we support TOE hardware that exists today.  That is, we
don't.



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-13 23:12                               ` David Miller
@ 2008-08-14  1:26                                 ` Tom Tucker
  2008-08-14  1:37                                   ` David Miller
  2008-08-14  2:09                                   ` David Miller
  0 siblings, 2 replies; 71+ messages in thread
From: Tom Tucker @ 2008-08-14  1:26 UTC (permalink / raw)
  To: David Miller
  Cc: rdreier, rick.jones2, jgarzik, swise, divy, kxie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua, dm, leedom,
	linux-scsi, linux-kernel

David Miller wrote:
> From: Roland Dreier <rdreier@cisco.com>
> Date: Wed, 13 Aug 2008 16:03:15 -0700
>
>   
>> OK, I admit you could make something work -- add hooks for the low-level
>> driver to ask the iSCSI initiator where PDU boundaries are so it can
>> resync when something is evicted from the flow cache, have the initiator
>> format its tags in a special way to encode placement data, etc, etc.
>> The scheme does bring to mind Alan's earlier comment about pigs and
>> propulsion, though.
>>     
>
> There would need to be _NO_ hooks into the iSCSI initiator at all.
>
> The card would land the block I/O data onto the necessary page boundaries
> and the iSCSI code would just be able to thus use the pages directly
> and as-is.
>
> It would look perfectly like normal TCP receive traffic.  No hooks,
> no special cases, nothing like that.
>
>   
>> In any case, as I said in the part of my email that you snipped, the
>> real issue is not designing hypothetical hardware, but deciding how to
>> support the Chelsio, Broadcom, etc hardware that exists today.
>>     
>
> The same like we support TOE hardware that exists today.  That is, we
> don't.
>
>   
Is there any chance your could discuss exactly how a stateless adapter 
can determine if a network segment
is in-order, next expected, minus productive ack, paws compliant, etc... 
without TCP state?

I get how you can optimize "flows", but "flows" are a fancy name for a 
key (typically the four-tuple) that looks into a TCAM  to get the 
"information" necessary to do header prediction.

Can you explain how this "information" somehow doesn't qualify as 
"state". Doesn't the next expected sequence number at the very least 
need to be updated? una? etc...?

Could you also include the "non-state-full" information necessary to do 
iSCSI header digest validation, data placement, and marker removal? 

Thanks,
Tom
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>   


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-14  1:26                                 ` Tom Tucker
@ 2008-08-14  1:37                                   ` David Miller
  2008-08-14  1:52                                     ` Steve Wise
  2008-08-14  1:57                                     ` Tom Tucker
  2008-08-14  2:09                                   ` David Miller
  1 sibling, 2 replies; 71+ messages in thread
From: David Miller @ 2008-08-14  1:37 UTC (permalink / raw)
  To: tom
  Cc: rdreier, rick.jones2, jgarzik, swise, divy, kxie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua, dm, leedom,
	linux-scsi, linux-kernel

From: Tom Tucker <tom@opengridcomputing.com>
Date: Wed, 13 Aug 2008 20:26:51 -0500

> Can you explain how this "information" somehow doesn't qualify as 
> "state". Doesn't the next expected sequence number at the very least 
> need to be updated? una? etc...?
> 
> Could you also include the "non-state-full" information necessary to do 
> iSCSI header digest validation, data placement, and marker removal? 

It's stateless because the full packet traverses the real networking
stack and thus can be treated like any other packet.

The data placement is a side effect that the networking stack can
completely ignore if it chooses to.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-14  1:37                                   ` David Miller
@ 2008-08-14  1:52                                     ` Steve Wise
  2008-08-14  2:05                                       ` David Miller
  2008-08-14  1:57                                     ` Tom Tucker
  1 sibling, 1 reply; 71+ messages in thread
From: Steve Wise @ 2008-08-14  1:52 UTC (permalink / raw)
  To: David Miller
  Cc: tom, rdreier, rick.jones2, jgarzik, divy, kxie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua, dm, leedom,
	linux-scsi, linux-kernel

David Miller wrote:
> From: Tom Tucker <tom@opengridcomputing.com>
> Date: Wed, 13 Aug 2008 20:26:51 -0500
>
>   
>> Can you explain how this "information" somehow doesn't qualify as 
>> "state". Doesn't the next expected sequence number at the very least 
>> need to be updated? una? etc...?
>>
>> Could you also include the "non-state-full" information necessary to do 
>> iSCSI header digest validation, data placement, and marker removal? 
>>     
>
> It's stateless because the full packet traverses the real networking
> stack and thus can be treated like any other packet.
>
> The data placement is a side effect that the networking stack can
> completely ignore if it chooses to.
>   
How do you envision programming such a device?  It will need TCP and 
iSCSI state to have any chance of doing useful and productive placement 
of data.  The smarts about the iSCSI stateless offload hw will be in the 
device driver, probably the iscsi device driver.  How will it gather the 
information from the TCP stack to insert the correct state for a flow 
into the hw cache?



^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-14  1:37                                   ` David Miller
  2008-08-14  1:52                                     ` Steve Wise
@ 2008-08-14  1:57                                     ` Tom Tucker
  2008-08-14  2:07                                       ` David Miller
  1 sibling, 1 reply; 71+ messages in thread
From: Tom Tucker @ 2008-08-14  1:57 UTC (permalink / raw)
  To: David Miller
  Cc: rdreier, rick.jones2, jgarzik, swise, divy, kxie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua, dm, leedom,
	linux-scsi, linux-kernel

David Miller wrote:
> From: Tom Tucker <tom@opengridcomputing.com>
> Date: Wed, 13 Aug 2008 20:26:51 -0500
>
>   
>> Can you explain how this "information" somehow doesn't qualify as 
>> "state". Doesn't the next expected sequence number at the very least 
>> need to be updated? una? etc...?
>>
>> Could you also include the "non-state-full" information necessary to do 
>> iSCSI header digest validation, data placement, and marker removal? 
>>     
>
> It's stateless because the full packet traverses the real networking
> stack and thus can be treated like any other packet.
>
> The data placement is a side effect that the networking stack can
> completely ignore if it chooses to.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>   
Ok. Maybe we're getting somewhere here ... or at least I am :-)

I'm not trying to be pedantic here but let me try and restate what I 
think you said above:

- The "header" traverses the real networking stack
- The "payload" is placed either by by the hardware if possible or by 
the native stack if on the exception path
- The "header" may aggregate multiple  PDU (RSO)
- Data ready indications are controlled entirely by the software/real 
networking stack

Thanks,
Tom


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-14  1:52                                     ` Steve Wise
@ 2008-08-14  2:05                                       ` David Miller
  2008-08-14  2:44                                         ` Steve Wise
  0 siblings, 1 reply; 71+ messages in thread
From: David Miller @ 2008-08-14  2:05 UTC (permalink / raw)
  To: swise
  Cc: tom, rdreier, rick.jones2, jgarzik, divy, kxie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua, dm, leedom,
	linux-scsi, linux-kernel

From: Steve Wise <swise@opengridcomputing.com>
Date: Wed, 13 Aug 2008 20:52:47 -0500

> How do you envision programming such a device?

There should be no special programming.

> It will need TCP and iSCSI state to have any chance of doing useful
> and productive placement of data.

The card can see the entire TCP stream, it doesn't need anything
more than that.  It can parse every packet header, see what kind
of data transfer is being requested or responded to, etc.

Look, I'm not going to design this whole friggin' thing for you guys.

I've stated clearly what the base requirement is, which is that the
packet is fully processed by the networking stack and that the card
merely does data placement optimizations that the stack can completely
ignore if it wants to.

You have an entire engine in there that can interpret an iSCSI
transport stream, you have the logic to do these kinds of things,
and it can be done without managing the connection on the card.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-14  1:57                                     ` Tom Tucker
@ 2008-08-14  2:07                                       ` David Miller
  0 siblings, 0 replies; 71+ messages in thread
From: David Miller @ 2008-08-14  2:07 UTC (permalink / raw)
  To: tom
  Cc: rdreier, rick.jones2, jgarzik, swise, divy, kxie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua, dm, leedom,
	linux-scsi, linux-kernel

From: Tom Tucker <tom@opengridcomputing.com>
Date: Wed, 13 Aug 2008 20:57:08 -0500

> I'm not trying to be pedantic here but let me try and restate what I 
> think you said above:
> 
> - The "header" traverses the real networking stack
> - The "payload" is placed either by by the hardware if possible or by 
> the native stack if on the exception path
> - The "header" may aggregate multiple  PDU (RSO)
> - Data ready indications are controlled entirely by the software/real 
> networking stack

SKB's can be paged, in fact many devices already work by chopping
up lists of pages that the driver gives to the card.  NIU is one
of several examples.

The only difference between what a device like NIU is doing now and
what I propose is smart determination of at what offset and into
which buffers to do the demarcation.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-14  1:26                                 ` Tom Tucker
  2008-08-14  1:37                                   ` David Miller
@ 2008-08-14  2:09                                   ` David Miller
  1 sibling, 0 replies; 71+ messages in thread
From: David Miller @ 2008-08-14  2:09 UTC (permalink / raw)
  To: tom
  Cc: rdreier, rick.jones2, jgarzik, swise, divy, kxie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua, dm, leedom,
	linux-scsi, linux-kernel

From: Tom Tucker <tom@opengridcomputing.com>
Date: Wed, 13 Aug 2008 20:26:51 -0500

> Is there any chance your could discuss exactly how a stateless adapter 
> can determine if a network segment
> is in-order, next expected, minus productive ack, paws compliant, etc... 
> without TCP state?

If you're getting packets out of order, data placement optimizations
are the least of your concerns.

In fact this is exactly where we want all of the advanced loss
handling algorithms of the Linux TCP stack to get engaged.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-14  2:05                                       ` David Miller
@ 2008-08-14  2:44                                         ` Steve Wise
  0 siblings, 0 replies; 71+ messages in thread
From: Steve Wise @ 2008-08-14  2:44 UTC (permalink / raw)
  To: David Miller
  Cc: tom, rdreier, rick.jones2, jgarzik, divy, kxie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua, dm, leedom,
	linux-scsi, linux-kernel

David Miller wrote:
> I've stated clearly what the base requirement is, which is that the
> packet is fully processed by the networking stack and that the card
> merely does data placement optimizations that the stack can completely
> ignore if it wants to.
>
> You have an entire engine in there that can interpret an iSCSI
> transport stream, you have the logic to do these kinds of things,
> and it can be done without managing the connection on the card.
>   

Thanks for finally stating it clearly.


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-13 19:29                                 ` Jeff Garzik
  2008-08-13 20:13                                   ` David Miller
@ 2008-08-14 18:24                                   ` Vladislav Bolkhovitin
  2008-08-14 21:59                                     ` Nicholas A. Bellinger
  1 sibling, 1 reply; 71+ messages in thread
From: Vladislav Bolkhovitin @ 2008-08-14 18:24 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: David Miller, open-iscsi, rdreier, rick.jones2, Steve Wise,
	Karen Xie, netdev, michaelc, daisyc, wenxiong, bhua,
	Dimitrios Michailidis, Casey Leedom, linux-scsi, linux-kernel

Jeff Garzik wrote:
> Vladislav Bolkhovitin wrote:
>> Divy Le Ray wrote:
>>> On Tuesday 12 August 2008 03:02:46 pm David Miller wrote:
>>>> From: Divy Le Ray <divy@chelsio.com>
>>>> Date: Tue, 12 Aug 2008 14:57:09 -0700
>>>>
>>>>> In any case, such a stateless solution is not yet designed, whereas
>>>>> accelerated iSCSI is available now, from us and other companies.
>>>> So, WHAT?!
>>>>
>>>> There are TOE pieces of crap out there too.
>>> Well, there is demand for accerated iscsi out there, which is the 
>>> driving reason of our driver submission. 
>> I'm, as an iSCSI target developer, strongly voting for hardware iSCSI 
>> offload. Having possibility of the direct data placement is a *HUGE* 
>> performance gain.
> 
> Well, two responses here:
> 
> * no one is arguing against hardware iSCSI offload.  Rather, it is a 
> problem with a specific implementation, one that falsely assumes two 
> independent TCP stacks can co-exist peacefully on the same IP address 
> and MAC.
> 
> * direct data placement is possible without offloading the entire TCP 
> stack onto a firmware/chip.
> 
> There is plenty of room for hardware iSCSI offload...

Sure, nobody is arguing against that. My points are:

1. All those are things not for near future. I don't think it can be 
implemented earlier than in a year time, but there is a huge demand for 
high speed and low CPU overhead iSCSI _now_. Nobody's satisfied by the 
fact that with the latest high end hardware he can saturate 10GbE link 
on only less than 50%(!). Additionally, for me, as an iSCSI target 
developer, it looks especially annoying that hardware requirements for 
_clients_ (initiators) are significantly higher than for _server_ 
(target). This situation for me looks as a nonsense.

2. I believe, that iSCSI/TCP pair is sufficiently heavy weighted 
protocol to be completely offloaded to hardware. All partial offloads 
will never make it comparably efficient. It still would consume a lot of 
CPU. For example, consider digests. Even if they computed by new CRC32C 
instruction, the computation still would need a chunk of CPU power. I 
think, at least as much as to copy the computed block to new location. 
Can we save it? Sure, with hardware offload. The additional CPU load can 
be acceptable if only data are transferred and there are no other 
activities, but in real life this is quite rare. Consider, for instance, 
a VPS server, like VMware. It always lacks CPU power and 30% CPU load 
during data transfers makes a huge difference. Another example is a 
target doing some processing of transferred data, like encryption or 
de-duplication.

Note, I'm not advocating this particular cxgb3 driver. I have not 
examined it closely enough and don't have sufficient knowledge about the 
hardware to judge it. But I'm advocating the concept of full offload 
HBAs, because they provide a real gain, which IMHO can't be reached by 
any partial offloads.

Actually, in the Fibre Channel world from the very beginning the entire 
FC protocol has been implemented on hardware and everybody have been 
happy with that. Now FCoE is coming, which means that Linux kernel is 
going to have implemented in software a big chunk of FC protocol. Then, 
hopefully, nobody would declare all existing FC cards as a crap and 
force FC vendors redesign their hardware to use Linux FC implementation 
and make partial offloads for it? ;) Instead, several implementations 
would live in a peace. The situation is the same with iSCSI. What we 
need is only to find an acceptable way for two TCP implementations to 
coexist. Then iSCSI on 10GbE hardware would have good chances to 
outperform 8Gbps FC in both performance and CPU efficiency.

Vlad

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-13 20:23                                 ` David Miller
@ 2008-08-14 18:27                                   ` Vladislav Bolkhovitin
  2008-08-14 18:30                                     ` Vladislav Bolkhovitin
  0 siblings, 1 reply; 71+ messages in thread
From: Vladislav Bolkhovitin @ 2008-08-14 18:27 UTC (permalink / raw)
  To: David Miller
  Cc: open-iscsi, rdreier, rick.jones2, jgarzik, swise, kxie, netdev,
	michaelc, daisyc, wenxiong, bhua, dm, leedom, linux-scsi,
	linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1392 bytes --]

David Miller wrote:
> From: Vladislav Bolkhovitin <vst@vlnb.net>
> Date: Wed, 13 Aug 2008 22:35:34 +0400
> 
>> This is because the target sends data in a zero-copy manner, so its
>> CPU is capable to deal with the load, but on the initiator there are
>> additional data copies from skb's to page cache and from page cache
>> to application.
> 
> If you've actually been reading at all what I've been saying in this
> thread you'll see that I've described a method to do this copy
> avoidance in a completely stateless manner.
> 
> You don't need to implement a TCP stack in the card in order to do
> data placement optimizations.  They can be done completely stateless.

Sure, I read what you wrote before writing (although, frankly, didn't 
get the idea). But I don't think that overall it would be as efficient 
as full hardware offload. See my reply to Jeff Garzik about that.

> Also, large portions of the cpu overhead are transactional costs,
> which are significantly reduced by existing technologies such as
> LRO.

The test used Myricom Myri-10G cards (myri10ge driver), which support 
LRO. And from ethtool -S output I conclude it was enabled. Just in case, 
I attached it, so you can recheck me.

Thus, apparently, LRO doesn't make a fundamental difference. Maybe this 
particular implementation isn't too efficient, I don't know. I don't 
have enough information for that.

Vlad


[-- Attachment #2: ethtool_initiator.txt --]
[-- Type: text/plain, Size: 1498 bytes --]

NIC statistics:
     rx_packets: 471090527
     tx_packets: 175404246
     rx_bytes: 683684492944
     tx_bytes: 636200696592
     rx_errors: 0
     tx_errors: 0
     rx_dropped: 0
     tx_dropped: 0
     multicast: 0
     collisions: 0
     rx_length_errors: 0
     rx_over_errors: 0
     rx_crc_errors: 0
     rx_frame_errors: 0
     rx_fifo_errors: 0
     rx_missed_errors: 0
     tx_aborted_errors: 0
     tx_carrier_errors: 0
     tx_fifo_errors: 0
     tx_heartbeat_errors: 0
     tx_window_errors: 0
     rx_skbs: 0
     alloc_order: 0
     builtin_fw: 0
     napi: 1
     tx_boundary: 4096
     WC: 2
     irq: 1268
     MSI: 1
     MSIX: 0
     read_dma_bw_MBs: 1575
     write_dma_bw_MBs: 1375
     read_write_dma_bw_MBs: 2406
     serial_number: 320283
     watchdog_resets: 0
     link_changes: 2
     link_up: 1
     dropped_link_overflow: 0
     dropped_link_error_or_filtered: 0
     dropped_pause: 0
     dropped_bad_phy: 0
     dropped_bad_crc32: 0
     dropped_unicast_filtered: 0
     dropped_multicast_filtered: 0
     dropped_runt: 0
     dropped_overrun: 0
     dropped_no_small_buffer: 0
     dropped_no_big_buffer: 479
     ----------- slice ---------: 0
     tx_pkt_start: 176354843
     tx_pkt_done: 176354843
     tx_req: 474673372
     tx_done: 474673372
     rx_small_cnt: 19592127
     rx_big_cnt: 462319631
     wake_queue: 0
     stop_queue: 0
     tx_linearized: 0
     LRO aggregated: 481899984
     LRO flushed: 43071334
     LRO avg aggr: 11
     LRO no_desc: 0

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-14 18:27                                   ` Vladislav Bolkhovitin
@ 2008-08-14 18:30                                     ` Vladislav Bolkhovitin
  0 siblings, 0 replies; 71+ messages in thread
From: Vladislav Bolkhovitin @ 2008-08-14 18:30 UTC (permalink / raw)
  To: David Miller
  Cc: open-iscsi, rdreier, rick.jones2, jgarzik, swise, kxie, netdev,
	michaelc, daisyc, wenxiong, bhua, dm, leedom, linux-scsi,
	linux-kernel

Vladislav Bolkhovitin wrote:
> David Miller wrote:
>> From: Vladislav Bolkhovitin <vst@vlnb.net>
>> Date: Wed, 13 Aug 2008 22:35:34 +0400
>>
>>> This is because the target sends data in a zero-copy manner, so its
>>> CPU is capable to deal with the load, but on the initiator there are
>>> additional data copies from skb's to page cache and from page cache
>>> to application.
>> If you've actually been reading at all what I've been saying in this
>> thread you'll see that I've described a method to do this copy
>> avoidance in a completely stateless manner.
>>
>> You don't need to implement a TCP stack in the card in order to do
>> data placement optimizations.  They can be done completely stateless.
> 
> Sure, I read what you wrote before writing (although, frankly, didn't 
> get the idea). But I don't think that overall it would be as efficient 
> as full hardware offload. See my reply to Jeff Garzik about that.
> 
>> Also, large portions of the cpu overhead are transactional costs,
>> which are significantly reduced by existing technologies such as
>> LRO.
> 
> The test used Myricom Myri-10G cards (myri10ge driver), which support 
> LRO. And from ethtool -S output I conclude it was enabled. Just in case, 
> I attached it, so you can recheck me.

Also, there wasn't big difference between MTU 1500 and 9000, which is 
another point to think that LRO was working.

> Thus, apparently, LRO doesn't make a fundamental difference. Maybe this 
> particular implementation isn't too efficient, I don't know. I don't 
> have enough information for that.
> 
> Vlad
> 
> 


^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-11 21:12                   ` David Miller
  2008-08-11 21:41                     ` Roland Dreier
@ 2008-08-14 20:45                     ` Andrew Gallatin
  2008-08-14 22:23                       ` David Miller
  1 sibling, 1 reply; 71+ messages in thread
From: Andrew Gallatin @ 2008-08-14 20:45 UTC (permalink / raw)
  To: David Miller
  Cc: rick.jones2, rdreier, jgarzik, swise, divy, kxie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua, dm, lee

David Miller wrote:
 > From: Rick Jones <rick.jones2@hp.com>
 > Date: Mon, 11 Aug 2008 11:13:25 -0700
 >
 >> David Miller wrote:
 >>> And I even wonder, these days, if you probably get %90 or more of the
 >>> gain these "optimized" iSCSI connections obtain from things like LRO.
 >>> And since LRO can be done entirely in software (although stateless
 >>> HW assistence helps), it is even a NIC agnostic performance 
improvement.
 >> Probably depends on whether or not the iSCSI offload solutions are doing
 >> zero-copy receive into the filecache?
 >
 > That's a data placement issue, which also can be solved with
 > stateless offloading.

Speaking of stateless data placement.  Assume you have a page or set
of pages allocated by a network driver which contain exactly the data
a block driver is interested in to fulfill a read request (eg, the NIC
understands the protocol just well enough to split headers).  Is it
possible in the block driver to simply replace the pages that are
attached to the buf with pages allocated by the network driver?

It was my impression that the pages associated with a buf are in
fairly magical states and that you cannot replace them.  Rather,
you actually need to register them with the NIC, so the NIC can
receive into them rather than into an anonymously allocated page.
At this point, the NIC needs to be smart enough to match the block
read requests with the correct buffer, and you need some kind of
side channel between the network driver and the block driver to
pass the DMA address of the buf's pages and associated read request
tag.

Is this true?

Drew

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-14 18:24                                   ` Vladislav Bolkhovitin
@ 2008-08-14 21:59                                     ` Nicholas A. Bellinger
  0 siblings, 0 replies; 71+ messages in thread
From: Nicholas A. Bellinger @ 2008-08-14 21:59 UTC (permalink / raw)
  To: Vladislav Bolkhovitin
  Cc: Jeff Garzik, David Miller, open-iscsi, rdreier, rick.jones2,
	Steve Wise, Karen Xie, netdev, michaelc, daisyc, wenxiong, bhua,
	Dimitrios Michailidis, Casey Leedom, linux-scsi, linux-kernel

On Thu, 2008-08-14 at 22:24 +0400, Vladislav Bolkhovitin wrote:
> Jeff Garzik wrote:
> > Vladislav Bolkhovitin wrote:
> >> Divy Le Ray wrote:
> >>> On Tuesday 12 August 2008 03:02:46 pm David Miller wrote:
> >>>> From: Divy Le Ray <divy@chelsio.com>
> >>>> Date: Tue, 12 Aug 2008 14:57:09 -0700
> >>>>
> >>>>> In any case, such a stateless solution is not yet designed, whereas
> >>>>> accelerated iSCSI is available now, from us and other companies.
> >>>> So, WHAT?!
> >>>>
> >>>> There are TOE pieces of crap out there too.
> >>> Well, there is demand for accerated iscsi out there, which is the 
> >>> driving reason of our driver submission. 
> >> I'm, as an iSCSI target developer, strongly voting for hardware iSCSI 
> >> offload. Having possibility of the direct data placement is a *HUGE* 
> >> performance gain.
> > 
> > Well, two responses here:
> > 
> > * no one is arguing against hardware iSCSI offload.  Rather, it is a 
> > problem with a specific implementation, one that falsely assumes two 
> > independent TCP stacks can co-exist peacefully on the same IP address 
> > and MAC.
> > 
> > * direct data placement is possible without offloading the entire TCP 
> > stack onto a firmware/chip.
> > 
> > There is plenty of room for hardware iSCSI offload...
>
> Sure, nobody is arguing against that. My points are:
> 
> 1. All those are things not for near future. I don't think it can be 
> implemented earlier than in a year time, but there is a huge demand for 
> high speed and low CPU overhead iSCSI _now_.

Well, the first step wrt to this for us software folks is getting the
Slicing by 8 algoritim CRC32C into the kernel..  This would be a great
benefit for not just traditional iSCSI/TCP, but Linux/SCTP and
Linux/iWARP software codebases.

>  Nobody's satisfied by the 
> fact that with the latest high end hardware he can saturate 10GbE link 
> on only less than 50%(!). Additionally, for me, as an iSCSI target 
> developer, it looks especially annoying that hardware requirements for 
> _clients_ (initiators) are significantly higher than for _server_ 
> (target). This situation for me looks as a nonsense.
> 

I have always found this to be the historical case wrt iSCSI on x86
hardware.  The rough estimate was that given identical hardware and
network configuration, an iSCSI target talking to a SCSI subsystem layer
would be able to handle 2x throughput compared to an iSCSI Initiator,
obviously as long as the actual storage could handle it.

> 2. I believe, that iSCSI/TCP pair is sufficiently heavy weighted 
> protocol to be completely offloaded to hardware.

Heh, I think the period of designing news ASICs for traditional iSCSI
offload is probably slowing.  Aside from the actual difficulting of
doing this and competing with software iSCSI on commodity x86 4x & 8x
core (8x and 16x thread) micropressors with highly efficent software
implementation, that can do BOTH traditional iSCSI offload (where
available) and real deal OS independent connection recovery
(ErrorRecoveryLevel=2) between multiple stateless iSER iWARP/TCP
connections across both hardware *AND* software iWARP RNICs.

>  All partial offloads will never make it comparably efficient.

With traditional iSCSI, I definately agree on this.

With iWARP and iSER however, I believe the end balance of simplicity is
greater for both hardware and software, and allows both hardware and
software to scale more effectively because  The simple gain of having a
Framed PDU on top of legacy TCP with RFC 504[0-4] in order to determine
the offload of the received packet that will be mapped to storage
subsystem later memory for eventual hardware DMA on a vast array of
Linux supported storage hardware and CPU architectures.

>  It still would consume a lot of 
> CPU. For example, consider digests. Even if they computed by new CRC32C 
> instruction, the computation still would need a chunk of CPU power. I 
> think, at least as much as to copy the computed block to new location. 
> Can we save it? Sure, with hardware offload.

So yes, we are talking about quite a few possible cases:

I) Traditional iSCSI:

1) Complete hardware offload for legacy HBAs

2) Hybrid of hardware/software 

As mentioned, reducing application layer checksum overhead for current
software implementations is very important for our quickly increase user
base.  Using the Slicing by 8 CRC32C will help the current code, but I
think the only other real optimization by network ASIC design folks
would be to do something along the lines with traditional iSCSI with the
application layer that the say the e1000 driver does with transport and
network layer checksums today.  I believe the complexity and time to
market considerations of a complete traditional iSCSI offload solution
compared to highly optimized software iSCSI on dedicated commodity cores
still outweighs the benefit IMHO.

Not that I am saying there is no room for improvement from the current
set iSCSI Initiator TOEs.  Again I could build a children's fortress
from iSCSI TOE's and their retail boxes that I have in my office that I
have gotten over the years.   I would definately like to see them
running on the LIO production fabric and VHACS bare-metal storage clouds
at some point for validation purposes, et al.  But as for new designs,
this is still a very difficult proposition,  I am glad to see it being
discussed here..

II) iWARP/TCP and iSER

1) Hardware RNIC w/ iWARP/TCP with software iSER

2) Software RNIC w/ iWARP/TCP with software iSER

3) More possible iSER logic in hardware for latency/performance
optimizations (We won't know this until #1 and #2 happen)

Ahh, now this is the interesting case for scaling vendor independent IP
storage fabric to multiple port full duplex 10 Gb/sec fabrics.  As this
hardware on PCIe gets out (yes, I have some AMSO1100 goodness too
Steve :-), and iSER initiator/targets on iWARP/TCP come online, I
believe the common code between the different flavours of implemenations
will be much larger here.   For example, I previously mentioned ERL=2 in
the context of traditional iSCSI/iSER.  This logic is independent of
what RFC5045 knows a network fabric capable of of direct data placement.
I will also make this code independent in lio-target-2.6.git for my
upstream work.

>  The additional CPU load can 
> be acceptable if only data are transferred and there are no other 
> activities, but in real life this is quite rare. Consider, for instance, 
> a VPS server, like VMware. It always lacks CPU power and 30% CPU load 
> during data transfers makes a huge difference. Another example is a 
> target doing some processing of transferred data, like encryption or 
> de-duplication.

Well, I think alot of this depends on hardware.  For example, there is
the X3100 adapter from Neterion today that can do 10 Gb/sec line rate
with x86_64 virtualization.  Obviously, the Linux kernel (and my
project, Linux-iSCSI.org) wants to be able to support this as vendor
neutral as possible, which is why we make extensive use of multiple
technologies in our production fabrics, and in the VHACS stack. :-)

Also, the Nested Page Tables would be a big win for this particular
case, but I am not familar with the exact numbers..

> 
> Actually, in the Fibre Channel world from the very beginning the entire 
> FC protocol has been implemented on hardware and everybody have been 
> happy with that. Now FCoE is coming, which means that Linux kernel is 
> going to have implemented in software a big chunk of FC protocol. Then, 
> hopefully, nobody would declare all existing FC cards as a crap and 
> force FC vendors redesign their hardware to use Linux FC implementation 
> and make partial offloads for it? ;) Instead, several implementations 
> would live in a peace. The situation is the same with iSCSI. What we 
> need is only to find an acceptable way for two TCP implementations to 
> coexist. Then iSCSI on 10GbE hardware would have good chances to 
> outperform 8Gbps FC in both performance and CPU efficiency.
> 

<nod> :-)

--nab

> Vlad
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-scsi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-14 20:45                     ` Andrew Gallatin
@ 2008-08-14 22:23                       ` David Miller
  2008-08-15 12:19                         ` Andrew Gallatin
  0 siblings, 1 reply; 71+ messages in thread
From: David Miller @ 2008-08-14 22:23 UTC (permalink / raw)
  To: gallatin
  Cc: rick.jones2, rdreier, jgarzik, swise, divy, kxie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua, dm, lee

From: Andrew Gallatin <gallatin@myri.com>
Date: Thu, 14 Aug 2008 16:45:57 -0400

> Speaking of stateless data placement.  Assume you have a page or set
> of pages allocated by a network driver which contain exactly the data
> a block driver is interested in to fulfill a read request (eg, the NIC
> understands the protocol just well enough to split headers).  Is it
> possible in the block driver to simply replace the pages that are
> attached to the buf with pages allocated by the network driver?
> 
> It was my impression that the pages associated with a buf are in
> fairly magical states and that you cannot replace them.

This, if true, would be quite easy to rectify.

^ permalink raw reply	[flat|nested] 71+ messages in thread

* Re: [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator
  2008-08-14 22:23                       ` David Miller
@ 2008-08-15 12:19                         ` Andrew Gallatin
  0 siblings, 0 replies; 71+ messages in thread
From: Andrew Gallatin @ 2008-08-15 12:19 UTC (permalink / raw)
  To: David Miller
  Cc: rick.jones2, rdreier, jgarzik, swise, divy, kxie, netdev,
	open-iscsi, michaelc, daisyc, wenxiong, bhua, dm, lee

David Miller wrote:
 > From: Andrew Gallatin <gallatin@myri.com>
 > Date: Thu, 14 Aug 2008 16:45:57 -0400
 >
 >> Speaking of stateless data placement.  Assume you have a page or set
 >> of pages allocated by a network driver which contain exactly the data
 >> a block driver is interested in to fulfill a read request (eg, the NIC
 >> understands the protocol just well enough to split headers).  Is it
 >> possible in the block driver to simply replace the pages that are
 >> attached to the buf with pages allocated by the network driver?
 >>
 >> It was my impression that the pages associated with a buf are in
 >> fairly magical states and that you cannot replace them.
 >
 > This, if true, would be quite easy to rectify.

That's great news.

The reason I think this is because a few months ago, I tried to make
the AOE block driver zero-copy on read.  I modified the myri10ge
ethernet driver and firmware to split AOE headers, so that the AOE
read data started nicely aligned in a page allocated by myri10ge.
(this obviously only works for for jumbo MTUs..).  This part was
trivial, but I got bogged down trying to figure out how to replace the
buf associated pages in the AOE driver, as they seemed to be in
various states of entanglement with the LRU and page caches.

I'm more of a network driver guy, and my lack of understanding of the
block layer and page cache probably made me give up too quickly.  I'll
dust off this work and ask for help.

Thanks,

Drew

^ permalink raw reply	[flat|nested] 71+ messages in thread

end of thread, other threads:[~2008-08-15 12:27 UTC | newest]

Thread overview: 71+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-07-30  0:19 [RFC][PATCH 1/1] cxgb3i: cxgb3 iSCSI initiator Karen Xie
2008-07-30 18:15 ` Shyam_Iyer
2008-07-30 18:37   ` Karen Xie
2008-07-30 19:21 ` Roland Dreier
2008-07-30 19:35 ` Jeff Garzik
2008-07-30 21:35   ` Roland Dreier
2008-08-01  0:51     ` Divy Le Ray
2008-08-07 18:45       ` Divy Le Ray
2008-08-07 20:07         ` Mike Christie
2008-08-08 18:09         ` Steve Wise
2008-08-08 22:15           ` Jeff Garzik
2008-08-08 22:20             ` Jeff Garzik
2008-08-09  7:28             ` David Miller
2008-08-09 14:04               ` Steve Wise
2008-08-10  5:14               ` Roland Dreier
2008-08-10  5:47                 ` David Miller
2008-08-10  6:34                   ` Herbert Xu
2008-08-10 17:57                   ` Steve Wise
2008-08-11 16:09                   ` Roland Dreier
2008-08-11 21:09                     ` David Miller
2008-08-11 21:37                       ` Roland Dreier
2008-08-11 21:51                         ` David Miller
2008-08-11 23:20                       ` Steve Wise
2008-08-11 23:45                         ` Divy Le Ray
2008-08-12  0:22                         ` David Miller
2008-08-10  5:12             ` Roland Dreier
2008-08-10  5:46               ` David Miller
2008-08-11 16:07                 ` Roland Dreier
2008-08-11 21:08                   ` David Miller
2008-08-11 21:39                     ` Roland Dreier
2008-08-11 21:52                       ` David Miller
2008-08-11 18:13                 ` Rick Jones
2008-08-11 21:12                   ` David Miller
2008-08-11 21:41                     ` Roland Dreier
2008-08-11 21:53                       ` David Miller
2008-08-12 21:57                         ` Divy Le Ray
2008-08-12 22:01                           ` David Miller
2008-08-12 22:02                           ` David Miller
2008-08-12 22:21                             ` Divy Le Ray
2008-08-13  1:57                               ` Herbert Xu
2008-08-13 18:35                               ` Vladislav Bolkhovitin
2008-08-13 19:29                                 ` Jeff Garzik
2008-08-13 20:13                                   ` David Miller
2008-08-14 18:24                                   ` Vladislav Bolkhovitin
2008-08-14 21:59                                     ` Nicholas A. Bellinger
2008-08-13 20:23                                 ` David Miller
2008-08-14 18:27                                   ` Vladislav Bolkhovitin
2008-08-14 18:30                                     ` Vladislav Bolkhovitin
2008-08-13 21:27                         ` Roland Dreier
2008-08-13 22:08                           ` David Miller
2008-08-13 23:03                             ` Roland Dreier
2008-08-13 23:12                               ` David Miller
2008-08-14  1:26                                 ` Tom Tucker
2008-08-14  1:37                                   ` David Miller
2008-08-14  1:52                                     ` Steve Wise
2008-08-14  2:05                                       ` David Miller
2008-08-14  2:44                                         ` Steve Wise
2008-08-14  1:57                                     ` Tom Tucker
2008-08-14  2:07                                       ` David Miller
2008-08-14  2:09                                   ` David Miller
2008-08-14 20:45                     ` Andrew Gallatin
2008-08-14 22:23                       ` David Miller
2008-08-15 12:19                         ` Andrew Gallatin
2008-08-10  6:24               ` Herbert Xu
2008-08-10  9:19               ` Alan Cox
2008-08-10 12:49                 ` Jeff Garzik
2008-08-10 14:54                   ` James Bottomley
     [not found]                     ` <1218380086.3418.21.camel-bi+AKbBUZKY6gyzm1THtWbp2dZbC/Bob@public.gmane.org>
2008-08-11 16:50                       ` Mike Christie
2008-07-31  1:24   ` Karen Xie
2008-07-31 12:45     ` Boaz Harrosh
2008-07-31 12:33 ` Boaz Harrosh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).