Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH] SIW: Object management
From: Bernard Metzler @ 2010-10-05  6:54 UTC (permalink / raw)
  To: netdev; +Cc: linux-rdma, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_obj.c |  499 +++++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/siw/siw_obj.h |  109 ++++++++
 2 files changed, 608 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_obj.c
 create mode 100644 drivers/infiniband/hw/siw/siw_obj.h

diff --git a/drivers/infiniband/hw/siw/siw_obj.c b/drivers/infiniband/hw/siw/siw_obj.c
new file mode 100644
index 0000000..b5a1a3d
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_obj.c
@@ -0,0 +1,499 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt@zurich.ibm.com>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+
+/*
+ * siw_objhdr_init()
+ *
+ * Initialize the reference counter embedded in a siw object header.
+ */
+void siw_objhdr_init(struct siw_objhdr *hdr)
+{
+	kref_init(&hdr->ref);
+}
+
+/*
+ * siw_idr_init()
+ *
+ * Set up the per-device id spaces for QPs, CQs, PDs and memory objects,
+ * together with the single spinlock that is handed to the add/remove
+ * helpers in this file.
+ */
+void siw_idr_init(struct siw_dev *dev)
+{
+	spin_lock_init(&dev->idr_lock);
+
+	idr_init(&dev->qp_idr);
+	idr_init(&dev->cq_idr);
+	idr_init(&dev->pd_idr);
+	idr_init(&dev->mem_idr);
+}
+
+/*
+ * siw_idr_release()
+ *
+ * Destroy the four per-device id spaces set up by siw_idr_init().
+ * NOTE(review): objects still registered are not released here --
+ * presumably the caller has removed them all before; confirm.
+ */
+void siw_idr_release(struct siw_dev *dev)
+{
+	idr_destroy(&dev->qp_idr);
+	idr_destroy(&dev->cq_idr);
+	idr_destroy(&dev->pd_idr);
+	idr_destroy(&dev->mem_idr);
+}
+
+/*
+ * siw_add_obj()
+ *
+ * Insert @obj into @idr and store the allocated id in the object
+ * header. Allocation starts at a random 16 bit value; if the id space
+ * above that value is exhausted (-ENOSPC), one more attempt is made
+ * starting at 1.
+ *
+ * The reference counter is initialized before the object is inserted:
+ * siw_get_obj() takes a reference as soon as the object can be found
+ * in the idr, and a later kref_init() would silently reset that count.
+ *
+ * @lock:	spinlock protecting @idr
+ * @idr:	id space to insert into
+ * @obj:	object header to register
+ *
+ * Returns 0 on success, -ENOMEM if idr_pre_get() fails, or the error
+ * reported by idr_get_new_above().
+ */
+static inline int siw_add_obj(spinlock_t *lock, struct idr *idr,
+			      struct siw_objhdr *obj)
+{
+	u32		pre_id, id;
+	unsigned long	flags;
+	int		rv;
+
+	get_random_bytes(&pre_id, sizeof pre_id);
+	pre_id &= 0xffff;
+
+	/* must be valid before the object becomes visible to lookups */
+	siw_objhdr_init(obj);
+again:
+	do {
+		if (!(idr_pre_get(idr, GFP_KERNEL)))
+			return -ENOMEM;
+
+		spin_lock_irqsave(lock, flags);
+		rv = idr_get_new_above(idr, obj, pre_id, &id);
+		spin_unlock_irqrestore(lock, flags);
+
+	} while  (rv == -EAGAIN);
+
+	if (rv == 0) {
+		obj->id = id;
+		dprint(DBG_OBJ, "(OBJ%d): IDR New Object\n", id);
+	} else if (rv == -ENOSPC && pre_id != 1) {
+		/* nothing free above the random start: retry from 1 */
+		pre_id = 1;
+		goto again;
+	} else {
+		dprint(DBG_OBJ|DBG_ON, "(OBJ??): IDR New Object failed!\n");
+	}
+	return rv;
+}
+
+/*
+ * siw_get_obj()
+ *
+ * Look up @id in @idr. If an object is registered under that id,
+ * take a reference on it and return its header, otherwise return NULL.
+ */
+static inline struct siw_objhdr *siw_get_obj(struct idr *idr, int id)
+{
+	struct siw_objhdr *found = idr_find(idr, id);
+
+	if (!found)
+		return NULL;
+
+	kref_get(&found->ref);
+	return found;
+}
+
+/*
+ * siw_cq_id2obj()
+ *
+ * Resolve a CQ from its id. On success the CQ is returned with an
+ * additional reference held; NULL if the id is not registered.
+ */
+struct siw_cq *siw_cq_id2obj(struct siw_dev *dev, int id)
+{
+	struct siw_objhdr *found = siw_get_obj(&dev->cq_idr, id);
+
+	return found ? container_of(found, struct siw_cq, hdr) : NULL;
+}
+
+/*
+ * siw_qp_id2obj()
+ *
+ * Resolve a QP from its id. On success the QP is returned with an
+ * additional reference held; NULL if the id is not registered.
+ */
+struct siw_qp *siw_qp_id2obj(struct siw_dev *dev, int id)
+{
+	struct siw_objhdr *found = siw_get_obj(&dev->qp_idr, id);
+
+	return found ? container_of(found, struct siw_qp, hdr) : NULL;
+}
+
+/*
+ * siw_mem_id2obj()
+ *
+ * Resolves memory from the STag given by id and takes a reference
+ * on it. Might be called from:
+ * o process context before sending out of sgl
+ * o or in softirq when resolving target memory
+ *
+ * Returns the referenced memory object, or NULL if the id is not
+ * registered.
+ */
+struct siw_mem *siw_mem_id2obj(struct siw_dev *dev, int id)
+{
+	struct siw_objhdr *obj;
+	unsigned long flags;
+
+	/* lookup and reference are taken under the idr lock */
+	spin_lock_irqsave(&dev->idr_lock, flags);
+	obj = siw_get_obj(&dev->mem_idr, id);
+	spin_unlock_irqrestore(&dev->idr_lock, flags);
+
+	if (!obj) {
+		dprint(DBG_MM|DBG_OBJ|DBG_ON, "(MEM%d): not found!\n", id);
+		return NULL;
+	}
+	/* read the counter via atomic_read() like the get/put helpers */
+	dprint(DBG_MM|DBG_OBJ, "(MEM%d): New refcount: %d\n",
+	       obj->id, atomic_read(&obj->ref.refcount));
+
+	return container_of(obj, struct siw_mem, hdr);
+}
+
+/*
+ * siw_qp_add()
+ *
+ * Register a QP in the device's QP id space and bind it to the device.
+ * Returns 0 on success or the error reported by siw_add_obj().
+ */
+int siw_qp_add(struct siw_dev *dev, struct siw_qp *qp)
+{
+	int err = siw_add_obj(&dev->idr_lock, &dev->qp_idr, &qp->hdr);
+
+	if (err)
+		return err;
+
+	dprint(DBG_OBJ, "(QP%d): New Object\n", QP_ID(qp));
+	qp->hdr.dev = dev;
+
+	return 0;
+}
+
+/*
+ * siw_cq_add()
+ *
+ * Register a CQ in the device's CQ id space and bind it to the device.
+ * Returns 0 on success or the error reported by siw_add_obj().
+ */
+int siw_cq_add(struct siw_dev *dev, struct siw_cq *cq)
+{
+	int err = siw_add_obj(&dev->idr_lock, &dev->cq_idr, &cq->hdr);
+
+	if (err)
+		return err;
+
+	dprint(DBG_OBJ, "(CQ%d): New Object\n", cq->hdr.id);
+	cq->hdr.dev = dev;
+
+	return 0;
+}
+
+/*
+ * siw_pd_add()
+ *
+ * Register a PD in the device's PD id space and bind it to the device.
+ * Returns 0 on success or the error reported by siw_add_obj().
+ */
+int siw_pd_add(struct siw_dev *dev, struct siw_pd *pd)
+{
+	int err = siw_add_obj(&dev->idr_lock, &dev->pd_idr, &pd->hdr);
+
+	if (err)
+		return err;
+
+	dprint(DBG_OBJ, "(PD%d): New Object\n", pd->hdr.id);
+	pd->hdr.dev = dev;
+
+	return 0;
+}
+
+/*
+ * siw_mem_add()
+ *
+ * Register memory object @m in the device's STag id space.
+ *
+ * Stag lookup is based on its index part only (24 bits)
+ * It is assumed that the idr_get_new_above(,,1,) function will
+ * always return a new id within this range (0x1...0xffffff),
+ * if one is available.
+ * The code avoids special Stag of zero and tries to randomize
+ * STag values.
+ *
+ * Reference counter and device pointer are set up before the object
+ * is inserted, since siw_mem_id2obj() takes a reference as soon as
+ * the id can be found in the idr.
+ *
+ * Returns 0 on success, -ENOMEM if idr_pre_get() fails, -ENOSPC if no
+ * id within the STag range is left, or any other error reported by
+ * idr_get_new_above().
+ */
+int siw_mem_add(struct siw_dev *dev, struct siw_mem *m)
+{
+	/* id is only written on success; keep error prints well defined */
+	u32		id = 0, pre_id;
+	unsigned long	flags;
+	int		rv;
+
+	do {
+		get_random_bytes(&pre_id, sizeof pre_id);
+		pre_id &= 0xffff;
+	} while (pre_id == 0);
+
+	siw_objhdr_init(&m->hdr);
+	m->hdr.dev = dev;
+again:
+	do {
+		if (!(idr_pre_get(&dev->mem_idr, GFP_KERNEL)))
+			return -ENOMEM;
+
+		spin_lock_irqsave(&dev->idr_lock, flags);
+		rv = idr_get_new_above(&dev->mem_idr, m, pre_id, &id);
+		spin_unlock_irqrestore(&dev->idr_lock, flags);
+
+	} while (rv == -EAGAIN);
+
+	if (rv == -ENOSPC || (rv == 0 && id > SIW_STAG_MAX)) {
+		if (rv == 0) {
+			/* id outside of the STag range: give it back */
+			spin_lock_irqsave(&dev->idr_lock, flags);
+			idr_remove(&dev->mem_idr, id);
+			spin_unlock_irqrestore(&dev->idr_lock, flags);
+		}
+		if (pre_id == 1) {
+			/* already searched the whole range */
+			dprint(DBG_OBJ|DBG_MM|DBG_ON,
+				"(IDR): New Object failed: %d\n", pre_id);
+			return -ENOSPC;
+		}
+		pre_id = 1;
+		goto again;
+	} else if (rv) {
+		dprint(DBG_OBJ|DBG_MM|DBG_ON,
+			"(IDR%d): New Object failed: rv %d\n", id, rv);
+		return rv;
+	}
+	m->hdr.id = id;
+	dprint(DBG_OBJ|DBG_MM, "(IDR%d): New Object\n", id);
+
+	return 0;
+}
+
+/*
+ * siw_remove_obj()
+ *
+ * Remove the id of @hdr from @idr while holding @lock. The object
+ * itself and its reference count are not touched.
+ */
+void siw_remove_obj(spinlock_t *lock, struct idr *idr,
+		      struct siw_objhdr *hdr)
+{
+	unsigned long	flags;
+
+	dprint(DBG_OBJ, "(OBJ%d): IDR Remove Object\n", hdr->id);
+
+	spin_lock_irqsave(lock, flags);
+	idr_remove(idr, hdr->id);
+	spin_unlock_irqrestore(lock, flags);
+}
+
+
+/********** routines to put objs back and free if no ref left *****/
+
+/* kref release function for CQs, invoked when the last reference is put */
+static void siw_free_cq(struct kref *ref)
+{
+	struct siw_objhdr *objhdr = container_of(ref, struct siw_objhdr, ref);
+	struct siw_cq *cq = container_of(objhdr, struct siw_cq, hdr);
+
+	dprint(DBG_OBJ, "(CQ%d): Free Object\n", cq->hdr.id);
+
+	kfree(cq);
+}
+
+/*
+ * kref release function for QPs, invoked when the last reference is
+ * put. Also drops the QP's reference on its connection endpoint, if any.
+ */
+static void siw_free_qp(struct kref *ref)
+{
+	struct siw_objhdr *objhdr = container_of(ref, struct siw_objhdr, ref);
+	struct siw_qp *qp = container_of(objhdr, struct siw_qp, hdr);
+
+	dprint(DBG_OBJ|DBG_CM, "(QP%d): Free Object\n", QP_ID(qp));
+
+	if (qp->cep)
+		siw_cep_put(qp->cep);
+
+	kfree(qp);
+}
+
+/* kref release function for PDs, invoked when the last reference is put */
+static void siw_free_pd(struct kref *ref)
+{
+	struct siw_objhdr *objhdr = container_of(ref, struct siw_objhdr, ref);
+	struct siw_pd *pd = container_of(objhdr, struct siw_pd, hdr);
+
+	dprint(DBG_OBJ, "(PD%d): Free Object\n", pd->hdr.id);
+
+	kfree(pd);
+}
+
+/*
+ * kref release function for memory objects. A memory object is
+ * embedded either in a memory window or in a memory region; free the
+ * enclosing structure, releasing the umem first in the region case.
+ */
+static void siw_free_mem(struct kref *ref)
+{
+	struct siw_objhdr *objhdr = container_of(ref, struct siw_objhdr, ref);
+	struct siw_mem *m = container_of(objhdr, struct siw_mem, hdr);
+	struct siw_mr *mr;
+
+	dprint(DBG_MM|DBG_OBJ, "(MEM%d): Free Object\n", OBJ_ID(m));
+
+	if (SIW_MEM_IS_MW(m)) {
+		kfree(container_of(m, struct siw_mw, mem));
+		return;
+	}
+	mr = container_of(m, struct siw_mr, mem);
+	dprint(DBG_MM|DBG_OBJ, "(MEM%d): Release UMem\n", OBJ_ID(m));
+	ib_umem_release(mr->umem);
+	kfree(mr);
+}
+
+
+/*
+ * Drop one reference on a CQ; siw_free_cq() runs when it was the last.
+ * The debug print shows the count before the put.
+ */
+void siw_cq_put(struct siw_cq *cq)
+{
+	dprint(DBG_OBJ, "(CQ%d): Old refcount: %d\n",
+		OBJ_ID(cq), atomic_read(&cq->hdr.ref.refcount));
+	kref_put(&cq->hdr.ref, siw_free_cq);
+}
+
+/*
+ * Drop one reference on a QP; siw_free_qp() runs when it was the last.
+ * The debug print shows the count before the put.
+ */
+void siw_qp_put(struct siw_qp *qp)
+{
+	dprint(DBG_OBJ, "(QP%d): Old refcount: %d\n",
+		QP_ID(qp), atomic_read(&qp->hdr.ref.refcount));
+	kref_put(&qp->hdr.ref, siw_free_qp);
+}
+
+/*
+ * Drop one reference on a PD; siw_free_pd() runs when it was the last.
+ * The debug print shows the count before the put.
+ */
+void siw_pd_put(struct siw_pd *pd)
+{
+	dprint(DBG_OBJ, "(PD%d): Old refcount: %d\n",
+		OBJ_ID(pd), atomic_read(&pd->hdr.ref.refcount));
+	kref_put(&pd->hdr.ref, siw_free_pd);
+}
+
+/*
+ * Drop one reference on a memory object; siw_free_mem() runs when it
+ * was the last. The debug print shows the count before the put.
+ */
+void siw_mem_put(struct siw_mem *m)
+{
+	dprint(DBG_MM|DBG_OBJ, "(MEM%d): Old refcount: %d\n",
+		OBJ_ID(m), atomic_read(&m->hdr.ref.refcount));
+	kref_put(&m->hdr.ref, siw_free_mem);
+}
+
+
+/***** routines for WQE handling ***/
+
+/*
+ * siw_wqe_get()
+ *
+ * Get a new WQE. A READ RESPONSE WQE is taken from the QP's free list,
+ * whose size is bounded by the maximum number of inbound READs. All
+ * other WQEs are malloc'ed, which creates some overhead. Consider a
+ * change to
+ *
+ * 1. malloc a WR only if it cannot be synchronously completed, or
+ * 2. operate an own cache of reusable WQEs.
+ *
+ * Current code trusts in malloc efficiency.
+ *
+ * A WQE handed out holds a reference on @qp. Returns NULL if the free
+ * list is empty or the allocation fails.
+ */
+inline struct siw_wqe *siw_wqe_get(struct siw_qp *qp, enum siw_wr_opcode op)
+{
+	struct siw_wqe *wqe = NULL;
+
+	if (op != SIW_WR_RDMA_READ_RESP) {
+		wqe = kzalloc(sizeof(struct siw_wqe), GFP_KERNEL);
+		dprint(DBG_OBJ|DBG_WR, "(QP%d): New WQE p: %p\n",
+			QP_ID(qp), wqe);
+	} else {
+		spin_lock(&qp->freelist_lock);
+		if (!list_empty(&qp->wqe_freelist)) {
+			wqe = list_entry(qp->wqe_freelist.next,
+					 struct siw_wqe, list);
+			list_del(&wqe->list);
+		}
+		spin_unlock(&qp->freelist_lock);
+
+		if (wqe) {
+			wqe->processed = 0;
+			dprint(DBG_OBJ|DBG_WR,
+				"(QP%d): WQE from FreeList p: %p\n",
+				QP_ID(qp), wqe);
+		} else {
+			dprint(DBG_ON|DBG_OBJ|DBG_WR,
+				"(QP%d): FreeList empty!\n", QP_ID(qp));
+		}
+	}
+	if (!wqe)
+		return NULL;
+
+	INIT_LIST_HEAD(&wqe->list);
+	siw_qp_get(qp);
+	wqe->qp = qp;
+
+	return wqe;
+}
+
+/*
+ * siw_srq_wqe_get()
+ *
+ * Allocate a zeroed WQE for a shared receive queue. No QP is assigned
+ * and no QP reference is taken here. Returns NULL if the allocation
+ * fails.
+ */
+inline struct siw_wqe *siw_srq_wqe_get(struct siw_srq *srq)
+{
+	struct siw_wqe *wqe = kzalloc(sizeof(struct siw_wqe), GFP_KERNEL);
+
+	dprint(DBG_OBJ|DBG_WR, "(SRQ%p): New WQE p: %p\n", srq, wqe);
+	if (!wqe)
+		return NULL;
+
+	INIT_LIST_HEAD(&wqe->list);
+	/* already implied by kzalloc(), spelled out for clarity */
+	wqe->qp = NULL;
+
+	return wqe;
+}
+
+/*
+ * siw_srq_fetch_wqe()
+ *
+ * Fetch one RQ WQE from the SRQ of @qp and bind it to @qp. Informs the
+ * user by an IB_EVENT_SRQ_LIMIT_REACHED event if the SRQ is armed and
+ * its fill level dropped below the limit.
+ *
+ * Returns NULL if the SRQ is empty.
+ */
+inline struct siw_wqe *siw_srq_fetch_wqe(struct siw_qp *qp)
+{
+	struct siw_srq *srq = qp->srq;
+	struct siw_wqe *wqe;
+	int qlen;
+
+	lock_srq(srq);
+	if (list_empty(&srq->rq)) {
+		unlock_srq(srq);
+		return NULL;
+	}
+	wqe = list_first_wqe(&srq->rq);
+	list_del_init(&wqe->list);
+	/* number of WQEs still queued after taking this one */
+	qlen = srq->max_wr - atomic_inc_return(&srq->space);
+	unlock_srq(srq);
+
+	wqe->qp = qp;
+	if (srq->armed && qlen < srq->limit) {
+		srq->armed = 0;
+		siw_async_srq_ev(srq, IB_EVENT_SRQ_LIMIT_REACHED);
+	}
+	return wqe;
+}
+
+/* Free the data buffers of the first @num_sge elements of an inline SGL */
+inline void siw_free_inline_sgl(struct siw_sge *sge, int num_sge)
+{
+	for (; num_sge--; sge++)
+		kfree(sge->mem.buf); /* kfree handles NULL pointers */
+}
+
+/*
+ * Drop the memory object reference held by each of the first @num_sge
+ * elements of an SGL. Elements without a memory object are skipped.
+ */
+inline void siw_unref_mem_sgl(struct siw_sge *sge, int num_sge)
+{
+	for (; num_sge--; sge++) {
+		if (sge->mem.obj != NULL)
+			siw_mem_put(sge->mem.obj);
+	}
+}
+
+
+/*
+ * siw_wqe_put()
+ *
+ * Release a WQE after processing: drop the memory references (or free
+ * the inline buffers) of its SGL, then free the WQE. READ RESPONSE
+ * WQEs are not freed but returned to the free list of their QP.
+ * Finally the QP reference held by the WQE is dropped.
+ *
+ * WQEs allocated via siw_srq_wqe_get() carry no QP (and no QP
+ * reference) until one is assigned, so the final put is skipped for
+ * them. READ RESPONSE WQEs must always have a QP assigned.
+ */
+void siw_wqe_put(struct siw_wqe *wqe)
+{
+	/* sample the QP first: the WQE is gone after kfree()/list_add */
+	struct siw_qp *qp = wqe->qp;
+	unsigned long flags;
+
+	dprint(DBG_OBJ|DBG_WR, " WQE: %llu:, type: %d, p: %p\n",
+		(unsigned long long)wr_id(wqe), wr_type(wqe), wqe);
+
+	switch (wr_type(wqe)) {
+
+	case SIW_WR_SEND:
+	case SIW_WR_RDMA_WRITE:
+		if (likely(!SIW_INLINED_DATA(wqe)))
+			siw_unref_mem_sgl(wqe->wr.sgl.sge,
+					  wqe->wr.sgl.num_sge);
+		else
+			siw_free_inline_sgl(wqe->wr.sgl.sge,
+					    wqe->wr.sgl.num_sge);
+		/* fall through */
+	case SIW_WR_RDMA_WRITE_WITH_IMM:
+	case SIW_WR_SEND_WITH_IMM:
+		kfree(wqe);
+		break;
+
+	case SIW_WR_RECEIVE:
+	case SIW_WR_RDMA_READ_REQ:
+		siw_unref_mem_sgl(wqe->wr.sgl.sge, wqe->wr.sgl.num_sge);
+		kfree(wqe);
+		break;
+
+	case SIW_WR_RDMA_READ_RESP:
+		siw_unref_mem_sgl(wqe->wr.sgl.sge, 1);
+		wqe->wr.sgl.sge[0].mem.obj = NULL;
+		/*
+		 * freelist can be accessed by tx processing (rresp done)
+		 * and rx softirq (get new wqe for rresponse scheduling)
+		 */
+		INIT_LIST_HEAD(&wqe->list);
+		spin_lock_irqsave(&qp->freelist_lock, flags);
+		list_add_tail(&wqe->list, &qp->wqe_freelist);
+		spin_unlock_irqrestore(&qp->freelist_lock, flags);
+		break;
+
+	default:
+		WARN_ON(1);
+	}
+	if (qp)
+		siw_qp_put(qp);
+}
diff --git a/drivers/infiniband/hw/siw/siw_obj.h b/drivers/infiniband/hw/siw/siw_obj.h
new file mode 100644
index 0000000..7b8af6c
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_obj.h
@@ -0,0 +1,109 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt@zurich.ibm.com>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _SIW_OBJ_H
+#define _SIW_OBJ_H
+
+#include <linux/idr.h>
+#include <linux/rwsem.h>
+#include <linux/version.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "siw_debug.h"
+
+
+/* Map an ib_device to the siw_dev structure it is embedded in */
+static inline struct siw_dev *siw_dev_ofa2siw(struct ib_device *ofa_dev)
+{
+	return container_of(ofa_dev, struct siw_dev, ofa_dev);
+}
+
+/* Take one additional reference on a CQ; pairs with siw_cq_put() */
+static inline void siw_cq_get(struct siw_cq *cq)
+{
+	kref_get(&cq->hdr.ref);
+	dprint(DBG_OBJ, "(CQ%d): New refcount: %d\n",
+		OBJ_ID(cq), atomic_read(&cq->hdr.ref.refcount));
+}
+/* Take one additional reference on a QP; pairs with siw_qp_put() */
+static inline void siw_qp_get(struct siw_qp *qp)
+{
+	kref_get(&qp->hdr.ref);
+	dprint(DBG_OBJ, "(QP%d): New refcount: %d\n",
+		OBJ_ID(qp), atomic_read(&qp->hdr.ref.refcount));
+}
+/* Take one additional reference on a PD; pairs with siw_pd_put() */
+static inline void siw_pd_get(struct siw_pd *pd)
+{
+	kref_get(&pd->hdr.ref);
+	dprint(DBG_OBJ, "(PD%d): New refcount: %d\n",
+		OBJ_ID(pd), atomic_read(&pd->hdr.ref.refcount));
+}
+/* Take one additional reference on a memory object; pairs with siw_mem_put() */
+static inline void siw_mem_get(struct siw_mem *mem)
+{
+	kref_get(&mem->hdr.ref);
+	dprint(DBG_OBJ|DBG_MM, "(MEM%d): New refcount: %d\n",
+		OBJ_ID(mem), atomic_read(&mem->hdr.ref.refcount));
+}
+
+/* id space management */
+extern void siw_remove_obj(spinlock_t *lock, struct idr *idr,
+				struct siw_objhdr *hdr);
+
+extern void siw_objhdr_init(struct siw_objhdr *);
+extern void siw_idr_init(struct siw_dev *);
+extern void siw_idr_release(struct siw_dev *);
+
+/* id lookup; a returned object has an additional reference held */
+extern struct siw_cq *siw_cq_id2obj(struct siw_dev *, int);
+extern struct siw_qp *siw_qp_id2obj(struct siw_dev *, int);
+extern struct siw_mem *siw_mem_id2obj(struct siw_dev *, int);
+
+/* object registration; return 0 or a negative errno */
+extern int siw_qp_add(struct siw_dev *, struct siw_qp *);
+extern int siw_cq_add(struct siw_dev *, struct siw_cq *);
+extern int siw_pd_add(struct siw_dev *, struct siw_pd *);
+extern int siw_mem_add(struct siw_dev *, struct siw_mem *m);
+
+/* WQE allocation; return NULL if no WQE is available */
+extern struct siw_wqe *siw_wqe_get(struct siw_qp *, enum siw_wr_opcode);
+extern struct siw_wqe *siw_srq_wqe_get(struct siw_srq *);
+extern struct siw_wqe *siw_srq_fetch_wqe(struct siw_qp *);
+
+/* reference release; the object is freed with its last reference */
+extern void siw_cq_put(struct siw_cq *);
+extern void siw_qp_put(struct siw_qp *);
+extern void siw_pd_put(struct siw_pd *);
+extern void siw_mem_put(struct siw_mem *);
+extern void siw_wqe_put(struct siw_wqe *);
+
+#endif
-- 
1.5.4.3


^ permalink raw reply related

* [PATCH] SIW: Queue pair
From: Bernard Metzler @ 2010-10-05  6:55 UTC (permalink / raw)
  To: netdev; +Cc: linux-rdma, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_qp.c |  989 ++++++++++++++++++++++++++++++++++++
 1 files changed, 989 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_qp.c

diff --git a/drivers/infiniband/hw/siw/siw_qp.c b/drivers/infiniband/hw/siw/siw_qp.c
new file mode 100644
index 0000000..42bc143
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_qp.c
@@ -0,0 +1,989 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt@zurich.ibm.com>
+ *          Fredy Neeser <nfd@zurich.ibm.com>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/file.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+
+/*
+ * Printable names of the QP states, indexed by state value. Each slot
+ * is sized to hold the longest name ("TERMINATE") including its
+ * terminating NUL.
+ */
+char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
+	[SIW_QP_STATE_IDLE]		= "IDLE",
+	[SIW_QP_STATE_RTR]		= "RTR",
+	[SIW_QP_STATE_RTS]		= "RTS",
+	[SIW_QP_STATE_CLOSING]		= "CLOSING",
+	[SIW_QP_STATE_TERMINATE]	= "TERMINATE",
+	[SIW_QP_STATE_ERROR]		= "ERROR",
+	[SIW_QP_STATE_MORIBUND]		= "MORIBUND",
+	[SIW_QP_STATE_UNDEF]		= "UNDEF"
+};
+
+
+/*
+ * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
+ * per-RDMAP message basis. Please keep order of initializer: the entries
+ * are positional, presumably indexed by RDMAP opcode (the array holds
+ * RDMAP_TERMINATE + 1 slots) -- confirm against the opcode definitions.
+ * All MPA len is initialized to minimum packet size, i.e. the size of
+ * the message header minus 2 bytes (presumably the MPA length field
+ * itself, which is not counted -- TODO confirm).
+ *
+ * Each entry carries the header length, a preset control word
+ * (DDP/RDMAP version, opcode, tagged and last flags) and the handler
+ * assigned to that message type.
+ */
+struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] =
+{ {
+	/* RDMA Write: tagged */
+	.hdr_len = sizeof(struct iwarp_rdma_write),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_RDMA_WRITE,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 1,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_write
+},
+{
+	/* RDMA Read Request: untagged */
+	.hdr_len = sizeof(struct iwarp_rdma_rreq),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_RDMA_READ_REQ,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 0,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_rreq
+},
+{
+	/* RDMA Read Response: tagged */
+	.hdr_len = sizeof(struct iwarp_rdma_rresp),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_RDMA_READ_RESP,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 1,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_rresp
+},
+{
+	/* Send: untagged */
+	.hdr_len = sizeof(struct iwarp_send),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_SEND,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 0,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_send
+},
+{
+	/* Send with Invalidate: routed to the 'unsupported' handler */
+	.hdr_len = sizeof(struct iwarp_send_inv),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_SEND_INVAL,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 0,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_unsupp
+},
+{
+	/* Send with Solicited Event: same header and handler as Send */
+	.hdr_len = sizeof(struct iwarp_send),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_SEND_SE,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 0,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_send
+},
+{
+	/* Send with SE and Invalidate: routed to the 'unsupported' handler */
+	.hdr_len = sizeof(struct iwarp_send_inv),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_SEND_SE_INVAL,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 0,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_unsupp
+},
+{
+	/* Terminate: untagged */
+	.hdr_len = sizeof(struct iwarp_terminate),
+	.ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
+	.ctrl.dv = DDP_VERSION,
+	.ctrl.opcode = RDMAP_TERMINATE,
+	.ctrl.rv = RDMAP_VERSION,
+	.ctrl.t = 0,
+	.ctrl.l = 1,
+	.proc_data = siw_proc_terminate
+} };
+
+
+/*
+ * siw_qp_llp_data_ready()
+ *
+ * Socket callback, installed as sk_data_ready by siw_qp_socket_assoc().
+ * If the socket is bound to a QP, the QP state lock can be taken
+ * without blocking and the QP is in RTS, received data is consumed
+ * via tcp_read_sock() with siw_tcp_rx_data() as the receive actor.
+ * In all other cases the callback returns without reading.
+ */
+static void siw_qp_llp_data_ready(struct sock *sk, int flags)
+{
+	struct siw_qp		*qp;
+
+	read_lock(&sk->sk_callback_lock);
+
+	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk))) {
+		dprint(DBG_ON, " No QP: %p\n", sk->sk_user_data);
+		goto done;
+	}
+	qp = sk_to_qp(sk);
+
+	/* trylock only: this callback must not sleep on the state lock */
+	if (down_read_trylock(&qp->state_lock)) {
+		read_descriptor_t	rd_desc = {.arg.data = qp, .count = 1};
+
+		dprint(DBG_SK|DBG_RX, "(QP%d): "
+			"state (before tcp_read_sock)=%d, flags=%x\n",
+			QP_ID(qp), qp->attrs.state, flags);
+
+		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
+			/*
+			 * Implements data receive operation during
+			 * socket callback. TCP gracefully catches
+			 * the case where there is nothing to receive
+			 * (not calling siw_tcp_rx_data() then).
+			 */
+			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
+
+		dprint(DBG_SK|DBG_RX, "(QP%d): "
+			"state (after tcp_read_sock)=%d, flags=%x\n",
+			QP_ID(qp), qp->attrs.state, flags);
+
+		up_read(&qp->state_lock);
+	} else {
+		dprint(DBG_SK|DBG_RX, "(QP%d): "
+			"Unable to acquire state_lock\n", QP_ID(qp));
+	}
+done:
+	read_unlock(&sk->sk_callback_lock);
+}
+
+
+/*
+ * siw_qp_llp_close()
+ *
+ * React to the loss of the lower layer (LLP) connection. With the
+ * state lock held for writing: suspend RX and TX processing, forget
+ * the stream handle, move the QP to ERROR (or to IDLE if it was
+ * CLOSING with idle TX) and flush both send and receive queue.
+ */
+void siw_qp_llp_close(struct siw_qp *qp)
+{
+	dprint(DBG_CM, "(QP%d): Enter: SIW QP state = %s, cep=0x%p\n",
+		QP_ID(qp), siw_qp_state_to_string[qp->attrs.state],
+		qp->cep);
+
+	down_write(&qp->state_lock);
+
+	qp->rx_ctx.rx_suspend = 1;
+	qp->tx_ctx.tx_suspend = 1;
+	qp->attrs.llp_stream_handle = NULL;
+
+	switch (qp->attrs.state) {
+
+	case SIW_QP_STATE_RTS:
+	case SIW_QP_STATE_RTR:
+	case SIW_QP_STATE_IDLE:
+	case SIW_QP_STATE_TERMINATE:
+
+		qp->attrs.state = SIW_QP_STATE_ERROR;
+
+		break;
+	/*
+	 * SIW_QP_STATE_CLOSING:
+	 *
+	 * This is a forced close. shall the QP be moved to
+	 * ERROR or IDLE ?
+	 */
+	case SIW_QP_STATE_CLOSING:
+		if (!TX_IDLE(qp))
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+		else
+			qp->attrs.state = SIW_QP_STATE_IDLE;
+
+		break;
+
+	default:
+		dprint(DBG_CM, " No state transition needed: %d\n",
+			qp->attrs.state);
+		break;
+	}
+	/* queues are flushed regardless of the resulting state */
+	siw_sq_flush(qp);
+	siw_rq_flush(qp);
+
+	up_write(&qp->state_lock);
+
+	dprint(DBG_CM, "(QP%d): Exit: SIW QP state = %s\n",
+		QP_ID(qp), siw_qp_state_to_string[qp->attrs.state]);
+}
+
+
+/*
+ * socket callback routine informing about newly available send space.
+ * Function schedules SQ work for processing SQ items.
+ *
+ * With SIW_TX_FULLSEGS defined, SQ work is only scheduled once the
+ * free send space can hold the current FPDU (tx_ctx.fpdu_len);
+ * otherwise the decision is left to sk_stream_write_space() and SQ
+ * work is scheduled whenever SOCK_NOSPACE is found cleared afterwards.
+ *
+ * NOTE(review): unlike siw_qp_llp_data_ready(), qp is used without a
+ * NULL check, and the #else branch dereferences sk->sk_socket
+ * unchecked while the SIW_TX_FULLSEGS branch tests it -- confirm both
+ * cannot be NULL when this callback runs.
+ */
+static void siw_qp_llp_write_space(struct sock *sk)
+{
+	struct siw_qp	*qp = sk_to_qp(sk);
+
+	/*
+	 * TODO:
+	 * Resemble sk_stream_write_space() logic for iWARP constraints:
+	 * Clear SOCK_NOSPACE only if sendspace may hold some reasonable
+	 * sized FPDU.
+	 */
+#ifdef SIW_TX_FULLSEGS
+	struct socket *sock = sk->sk_socket;
+	if (sk_stream_wspace(sk) >= (int)qp->tx_ctx.fpdu_len && sock) {
+		clear_bit(SOCK_NOSPACE, &sock->flags);
+		siw_sq_queue_work(qp);
+	}
+#else
+	sk_stream_write_space(sk);
+
+	if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+		siw_sq_queue_work(qp);
+#endif
+}
+
+/*
+ * siw_qp_socket_assoc()
+ *
+ * Bind socket @s to @qp: remember the socket as the QP's stream handle
+ * and redirect the socket's data-ready and write-space upcalls to the
+ * QP handlers. Done under the socket callback lock.
+ */
+static void siw_qp_socket_assoc(struct socket *s, struct siw_qp *qp)
+{
+	struct sock *sk = s->sk;
+
+	write_lock_bh(&sk->sk_callback_lock);
+
+	qp->attrs.llp_stream_handle = s;
+	sk->sk_data_ready = siw_qp_llp_data_ready;
+	sk->sk_write_space = siw_qp_llp_write_space;
+
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+
+/*
+ * siw_qp_irq_init()
+ *
+ * Populate the QP's free list with READ RESPONSE WQEs for an inbound
+ * read queue depth of @i. On allocation failure the free list is
+ * flushed again and -ENOMEM is returned; 0 on success.
+ */
+static int siw_qp_irq_init(struct siw_qp *qp, int i)
+{
+	/*
+	 * Give the IRD one extra entry since after sending
+	 * the RResponse it may trigger another peer RRequest
+	 * before the RResponse goes back to freelist.
+	 */
+	int remaining = i + 1;
+
+	dprint(DBG_CM|DBG_WR, "(QP%d): irq size: %d\n", QP_ID(qp), i);
+
+	INIT_LIST_HEAD(&qp->wqe_freelist);
+
+	while (remaining--) {
+		struct siw_wqe *wqe = kzalloc(sizeof(struct siw_wqe),
+					      GFP_KERNEL);
+
+		if (!wqe) {
+			siw_qp_freeq_flush(qp);
+			return -ENOMEM;
+		}
+		INIT_LIST_HEAD(&wqe->list);
+		wr_type(wqe) = SIW_WR_RDMA_READ_RESP;
+		list_add(&wqe->list, &qp->wqe_freelist);
+	}
+	return 0;
+}
+
+
+/*
+ * siw_send_terminate()
+ *
+ * Placeholder: sending an RDMAP TERMINATE message is not implemented.
+ * The packet is only zeroed locally, nothing goes out on the wire.
+ */
+static void siw_send_terminate(struct siw_qp *qp)
+{
+	struct iwarp_terminate	pkt;
+
+	memset(&pkt, 0, sizeof pkt);
+	/*
+	 * TODO: send TERMINATE
+	 */
+	dprint(DBG_CM, "(QP%d): Todo\n", QP_ID(qp));
+}
+
+
+/*
+ * siw_qp_modify()
+ *
+ * Apply attribute changes and/or a state transition to a QP.
+ *
+ * @qp:		QP to be modified
+ * @attrs:	new attribute values; only those selected by @mask are used
+ * @mask:	selects the attributes to change
+ *
+ * Access flags are updated first. If SIW_QP_ATTR_STATE is set, the
+ * state machine is run afterwards; transitions which are not defined
+ * for the current state are logged and ignored, except out of CLOSING.
+ * A connection drop is initiated if the transition requires it.
+ *
+ * Returns 0 on success, -EINVAL if RTS is requested without LLP handle
+ * or MPA attributes, the error of siw_qp_irq_init() if the IRD free
+ * list cannot be set up, or -ECONNABORTED for an undefined transition
+ * out of CLOSING.
+ *
+ * caller holds qp->state_lock
+ */
+int
+siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
+	      enum siw_qp_attr_mask mask)
+{
+	int	drop_conn, rv;
+
+	if (!mask)
+		return 0;
+
+	dprint(DBG_CM, "(QP%d)\n", QP_ID(qp));
+
+	if (mask != SIW_QP_ATTR_STATE) {
+		/*
+		 * changes of qp attributes (maybe state, too)
+		 */
+		if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
+
+			if (attrs->flags & SIW_RDMA_BIND_ENABLED)
+				qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
+			else
+				qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
+
+			if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
+				qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
+			else
+				qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
+
+			/* each flag must only affect its own permission bit */
+			if (attrs->flags & SIW_RDMA_READ_ENABLED)
+				qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
+			else
+				qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
+
+		}
+		/*
+		 * TODO: what else ??
+		 */
+	}
+	if (!(mask & SIW_QP_ATTR_STATE))
+		return 0;
+
+	dprint(DBG_CM, "(QP%d): SIW QP state: %s => %s\n", QP_ID(qp),
+		siw_qp_state_to_string[qp->attrs.state],
+		   siw_qp_state_to_string[attrs->state]);
+
+	drop_conn = 0;
+
+	switch (qp->attrs.state) {
+
+	case SIW_QP_STATE_IDLE:
+	case SIW_QP_STATE_RTR:
+
+		switch (attrs->state) {
+
+		case SIW_QP_STATE_RTS:
+
+			if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
+				dprint(DBG_ON, "(QP%d): socket?\n", QP_ID(qp));
+				return -EINVAL;
+			}
+			if (!(mask & SIW_QP_ATTR_MPA)) {
+				dprint(DBG_ON, "(QP%d): MPA?\n", QP_ID(qp));
+				return -EINVAL;
+			}
+			dprint(DBG_CM, "(QP%d): Enter RTS: "
+				"peer 0x%08x, local 0x%08x\n", QP_ID(qp),
+				qp->cep->llp.raddr.sin_addr.s_addr,
+				qp->cep->llp.laddr.sin_addr.s_addr);
+			/*
+			 * Initialize global iWARP TX state
+			 */
+			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
+			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
+			qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
+
+			/*
+			 * Initialize global iWARP RX state
+			 */
+			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
+			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
+			qp->rx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
+
+			/*
+			 * init IRD freequeue, caller has already checked
+			 * limits
+			 */
+			rv = siw_qp_irq_init(qp, attrs->ird);
+			if (rv)
+				return rv;
+
+			atomic_set(&qp->orq_space, attrs->ord);
+
+			qp->attrs.ord = attrs->ord;
+			qp->attrs.ird = attrs->ird;
+			qp->attrs.mpa = attrs->mpa;
+			/*
+			 * move socket rx and tx under qp's control
+			 */
+			siw_qp_socket_assoc(attrs->llp_stream_handle, qp);
+
+			qp->attrs.state = SIW_QP_STATE_RTS;
+			/*
+			 * set initial mss
+			 */
+			qp->tx_ctx.tcp_seglen =
+				get_tcp_mss(attrs->llp_stream_handle->sk);
+
+			break;
+
+		case SIW_QP_STATE_ERROR:
+			siw_rq_flush(qp);
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+			drop_conn = 1;
+			break;
+
+		case SIW_QP_STATE_RTR:
+			/* ignore */
+			break;
+
+		default:
+			dprint(DBG_CM,
+				" QP state transition undefined: %s => %s\n",
+				siw_qp_state_to_string[qp->attrs.state],
+				siw_qp_state_to_string[attrs->state]);
+			break;
+		}
+		break;
+
+	case SIW_QP_STATE_RTS:
+
+		switch (attrs->state) {
+
+		case SIW_QP_STATE_CLOSING:
+			/*
+			 * Verbs: move to IDLE if SQ and ORQ are empty.
+			 * Move to ERROR otherwise. But first of all we must
+			 * close the connection. So we keep CLOSING or ERROR
+			 * as a transient state, schedule connection drop work
+			 * and wait for the socket state change upcall to
+			 * come back closed.
+			 */
+			if (TX_IDLE(qp))
+				qp->attrs.state = SIW_QP_STATE_CLOSING;
+			else {
+				qp->attrs.state = SIW_QP_STATE_ERROR;
+				siw_sq_flush(qp);
+			}
+			siw_rq_flush(qp);
+
+			drop_conn = 1;
+			break;
+
+		case SIW_QP_STATE_TERMINATE:
+			qp->attrs.state = SIW_QP_STATE_TERMINATE;
+			siw_send_terminate(qp);
+			drop_conn = 1;
+
+			break;
+
+		case SIW_QP_STATE_ERROR:
+			/*
+			 * This is an emergency close.
+			 *
+			 * Any in progress transmit operation will get
+			 * cancelled.
+			 * This will likely result in a protocol failure,
+			 * if a TX operation is in transit. The caller
+			 * could unconditional wait to give the current
+			 * operation a chance to complete.
+			 * Esp., how to handle the non-empty IRQ case?
+			 * The peer was asking for data transfer at a valid
+			 * point in time.
+			 */
+			siw_sq_flush(qp);
+			siw_rq_flush(qp);
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+			drop_conn = 1;
+
+			break;
+
+		default:
+			dprint(DBG_ON,
+				" QP state transition undefined: %s => %s\n",
+				siw_qp_state_to_string[qp->attrs.state],
+				siw_qp_state_to_string[attrs->state]);
+			break;
+		}
+		break;
+
+	case SIW_QP_STATE_TERMINATE:
+
+		switch (attrs->state) {
+
+		case SIW_QP_STATE_ERROR:
+			siw_rq_flush(qp);
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+
+			if (!TX_IDLE(qp))
+				siw_sq_flush(qp);
+
+			break;
+
+		default:
+			dprint(DBG_ON,
+				" QP state transition undefined: %s => %s\n",
+				siw_qp_state_to_string[qp->attrs.state],
+				siw_qp_state_to_string[attrs->state]);
+		}
+		break;
+
+	case SIW_QP_STATE_CLOSING:
+
+		switch (attrs->state) {
+
+		case SIW_QP_STATE_IDLE:
+			BUG_ON(!TX_IDLE(qp));
+			qp->attrs.state = SIW_QP_STATE_IDLE;
+
+			break;
+
+		case SIW_QP_STATE_CLOSING:
+			/*
+			 * The LLP may already moved the QP to closing
+			 * due to graceful peer close init
+			 */
+			break;
+
+		case SIW_QP_STATE_ERROR:
+			/*
+			 * QP was moved to CLOSING by LLP event
+			 * not yet seen by user.
+			 */
+			qp->attrs.state = SIW_QP_STATE_ERROR;
+
+			if (!TX_IDLE(qp))
+				siw_sq_flush(qp);
+
+			siw_rq_flush(qp);
+
+			break;
+
+		default:
+			dprint(DBG_CM,
+				" QP state transition undefined: %s => %s\n",
+				siw_qp_state_to_string[qp->attrs.state],
+				siw_qp_state_to_string[attrs->state]);
+			return -ECONNABORTED;
+		}
+		break;
+
+	default:
+		dprint(DBG_CM, " NOP: State: %d\n", qp->attrs.state);
+		break;
+	}
+	if (drop_conn)
+		siw_qp_cm_drop(qp, 0);
+
+	return 0;
+}
+
+/*
+ * siw_get_ofaqp()
+ *
+ * Look up the siw QP with the given id on @dev and hand back the
+ * ib_qp embedded in it, or NULL if no such QP exists.
+ *
+ * The reference taken by the lookup is dropped again before returning.
+ */
+struct ib_qp *siw_get_ofaqp(struct ib_device *dev, int id)
+{
+	struct siw_qp *qp;
+
+	qp = siw_qp_id2obj(siw_dev_ofa2siw(dev), id);
+
+	dprint(DBG_OBJ, ": dev_name: %s, OFA QPID: %d, QP: %p\n",
+		dev->name, id, qp);
+
+	if (!qp)
+		return (struct ib_qp *)NULL;
+
+	/* siw_qp_id2obj() increments object reference count */
+	siw_qp_put(qp);
+	dprint(DBG_OBJ, " QPID: %d\n", QP_ID(qp));
+
+	return &qp->ofa_qp;
+}
+
+/*
+ * siw_check_mem()
+ *
+ * Check protection domain, STAG state, access permissions and
+ * address range for memory object.
+ *
+ * @pd:		Protection Domain memory should belong to
+ * @mem:	memory to be checked
+ * @addr:	starting addr of mem
+ * @perms:	requested access permissions
+ * @len:	len of memory interval to be checked
+ *
+ * Returns 0 if all checks pass, -EINVAL on a PD mismatch or an
+ * out of bounds interval, and -EPERM if the STAG is invalid or the
+ * memory object lacks one of the requested permissions.
+ */
+int siw_check_mem(struct siw_pd *pd, struct siw_mem *mem, u64 addr,
+		  enum siw_access_flags perms, int len)
+{
+	/* memory must have been registered under the caller's PD */
+	if (siw_mem2mr(mem)->pd != pd) {
+		dprint(DBG_WR|DBG_ON, "(PD%d): PD mismatch %p : %p\n",
+			OBJ_ID(pd),
+			siw_mem2mr(mem)->pd, pd);
+
+		return -EINVAL;
+	}
+	if (mem->stag_state == STAG_INVALID) {
+		dprint(DBG_WR|DBG_ON, "(PD%d): STAG 0x%08x invalid\n",
+			OBJ_ID(pd), OBJ_ID(mem));
+		return -EPERM;
+	}
+	/*
+	 * check access permissions: every bit set in @perms must
+	 * also be set in mem->perms
+	 */
+	if ((mem->perms & perms) < perms) {
+		dprint(DBG_WR|DBG_ON, "(PD%d): "
+			"INSUFFICIENT permissions 0x%08x : 0x%08x\n",
+			OBJ_ID(pd), mem->perms, perms);
+		return -EPERM;
+	}
+	/*
+	 * Check address interval: we relax check to allow memory shrunk
+	 * from the start address _after_ placing or fetching len bytes.
+	 * TODO: this relaxation is probably overdone
+	 *
+	 * NOTE(review): as written, the condition rejects every interval
+	 * that is not fully inside [mem->va, mem->va + mem->len); confirm
+	 * which relaxation the text above refers to.
+	 */
+	if (addr < mem->va || addr + len > mem->va + mem->len) {
+		dprint(DBG_WR|DBG_ON, "(PD%d): MEM interval len %d "
+			"[0x%016llx, 0x%016llx) out of bounds "
+			"[0x%016llx, 0x%016llx) for LKey=0x%08x\n",
+			OBJ_ID(pd), len, (unsigned long long)addr,
+			(unsigned long long)(addr + len),
+			(unsigned long long)mem->va,
+			(unsigned long long)(mem->va + mem->len),
+			OBJ_ID(mem));
+
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * siw_check_sge()
+ *
+ * Validate access rights for a byte interval of one SGE.
+ *
+ * @pd:		Protection Domain memory should belong to
+ * @sge:	SGE to be checked
+ * @perms:	requested access permissions
+ * @off:	starting offset in SGE
+ * @len:	len of memory interval to be checked
+ *
+ * Returns 0 on success, -EPERM if @off + @len exceeds the SGE length,
+ * -EINVAL if the memory object cannot be resolved, or the result of
+ * siw_check_mem().
+ *
+ * NOTE: If sge->mem.obj is not yet set, the memory object is looked
+ * up and referenced here. That new reference is kept if the check
+ * succeeds and released if the check fails. If sge->mem.obj is already
+ * valid, no new lookup is done and mem is not released if the check
+ * fails.
+ */
+int
+siw_check_sge(struct siw_pd *pd, struct siw_sge *sge,
+	      enum siw_access_flags perms, u32 off, int len)
+{
+	struct siw_dev	*dev = pd->hdr.dev;
+	struct siw_mem	*mem;
+	int		looked_up = 0, rv;
+
+	if (len + off > sge->len)
+		return -EPERM;
+
+	mem = sge->mem.obj;
+	if (mem == NULL) {
+		/* resolve and reference the memory object on first use */
+		mem = siw_mem_id2obj(dev, sge->lkey >> 8);
+		if (!mem)
+			return -EINVAL;
+
+		sge->mem.obj = mem;
+		looked_up = 1;
+	}
+
+	rv = siw_check_mem(pd, mem, sge->addr + off, perms, len);
+	if (rv && looked_up) {
+		/* undo only the reference taken by this call */
+		siw_mem_put(mem);
+		sge->mem.obj = NULL;
+	}
+	return rv;
+}
+
+
+/*
+ * siw_check_sgl()
+ *
+ * Check permissions for a list of SGE's (SGL)
+ *
+ * @pd:		Protection Domain SGL should belong to
+ * @sge:	List of SGE to be checked
+ * @perms:	requested access permissions
+ * @off:	starting offset in SGL
+ * @len:	len of memory interval to be checked
+ *
+ * Walks the SGL until @len bytes are accounted for. The walk starts
+ * at byte offset @off, which must be within the length of the
+ * first SGE. Returns 0 on success or the first error reported by
+ * siw_check_sge().
+ *
+ * The caller is responsible for keeping @len + @off within
+ * the total byte len of the SGL.
+ */
+
+int siw_check_sgl(struct siw_pd *pd, struct siw_sge *sge,
+		  enum siw_access_flags perms, u32 off, int len)
+{
+	int	rv = 0;
+
+	dprint(DBG_WR, "(PD%d): Enter\n", OBJ_ID(pd));
+
+	BUG_ON(off >= sge->len);
+
+	for (; len > 0; sge++) {
+		dprint(DBG_WR, "(PD%d): sge=%p, perms=0x%x, "
+			"len=%d, off=%u, sge->len=%d\n",
+			OBJ_ID(pd), sge, perms, len, off, sge->len);
+
+		/* rdma verbs: do not check stag for a zero length sge */
+		if (sge->len == 0)
+			continue;
+
+		rv = siw_check_sge(pd, sge, perms, off, sge->len - off);
+		if (rv)
+			break;
+
+		/* only the first SGE is entered at a non-zero offset */
+		len -= sge->len - off;
+		off = 0;
+	}
+	return rv;
+}
+
+/*
+ * siw_crc_array()
+ *
+ * Feed @len bytes starting at @start into the hash described by @desc.
+ * Returns the result of crypto_hash_update().
+ */
+int siw_crc_array(struct hash_desc *desc, u8 *start, size_t len)
+{
+	struct scatterlist one;
+
+	/* wrap the linear buffer into a single-entry scatterlist */
+	sg_init_one(&one, start, len);
+
+	return crypto_hash_update(desc, &one, len);
+}
+
+/*
+ * siw_crc_sg()
+ *
+ * Feed @len bytes of the page referenced by @sg into the hash
+ * described by @desc. With a non-zero @off, a temporary scatterlist
+ * entry pointing at that offset within the page is used instead of @sg.
+ * Returns the result of crypto_hash_update().
+ */
+int siw_crc_sg(struct hash_desc *desc, struct scatterlist *sg,
+	       int off, int len)
+{
+	struct scatterlist t_sg;
+
+	if (off == 0)
+		return crypto_hash_update(desc, sg, len);
+
+	sg_init_table(&t_sg, 1);
+	sg_set_page(&t_sg, sg_page(sg), len, off);
+
+	return crypto_hash_update(desc, &t_sg, len);
+}
+
+/*
+ * siw_qp_freeq_flush()
+ *
+ * Unlink and kfree() every WQE on the QP's free list.
+ */
+void siw_qp_freeq_flush(struct siw_qp *qp)
+{
+	struct list_head	*cur, *next;
+
+	dprint(DBG_OBJ|DBG_CM|DBG_WR, "(QP%d): Enter\n", QP_ID(qp));
+
+	/* the iteration is a no-op on an empty list */
+	list_for_each_safe(cur, next, &qp->wqe_freelist) {
+		struct siw_wqe *wqe = list_entry_wqe(cur);
+
+		list_del(&wqe->list);
+		kfree(wqe);
+	}
+}
+
+
+/*
+ * siw_sq_flush()
+ *
+ * Flush SQ and ORRQ entries to CQ.
+ * IRRQ entries are silently dropped.
+ *
+ * If the QP has no send CQ, flushed entries are unlinked and their
+ * reference is dropped instead. An IB_EVENT_SQ_DRAINED async event is
+ * raised if any unfinished ORQ entry was flushed or the SQ was
+ * non-empty.
+ *
+ * TODO: Add termination code for in-progress WQE.
+ * TODO: an in-progress WQE may have been partially
+ *       processed. It should be enforced, that transmission
+ *       of a started DDP segment must be completed if possible
+ *       by any chance.
+ *
+ * Must be called with qp state write lock held.
+ * Therefore, SQ and ORQ lock must not be taken.
+ */
+void siw_sq_flush(struct siw_qp *qp)
+{
+	struct list_head	*pos, *n;
+	struct siw_wqe		*wqe = tx_wqe(qp);
+	struct siw_cq		*cq = qp->scq;
+	int			async_event = 0;
+
+	dprint(DBG_OBJ|DBG_CM|DBG_WR, "(QP%d): Enter\n", QP_ID(qp));
+
+	/*
+	 * flush the in-progress wqe, if there.
+	 */
+	if (wqe) {
+		/*
+		 * TODO: Add iWARP Termination code
+		 */
+		tx_wqe(qp) = NULL;
+
+		dprint(DBG_WR,
+			" (QP%d): Flush current WQE %p, type %d\n",
+			QP_ID(qp), wqe, wr_type(wqe));
+
+		if (wr_type(wqe) == SIW_WR_RDMA_READ_RESP) {
+			/* read responses are dropped, not completed */
+			siw_wqe_put(wqe);
+			wqe = NULL;
+		} else if (wr_type(wqe) != SIW_WR_RDMA_READ_REQ) {
+			/*
+			 *  A RREQUEST is already on the ORRQ
+			 */
+			list_add_tail(&wqe->list, &qp->orq);
+		}
+	}
+	if (!list_empty(&qp->irq)) {
+		list_for_each_safe(pos, n, &qp->irq) {
+			wqe = list_entry_wqe(pos);
+			dprint(DBG_WR,
+				" (QP%d): Flush IRQ WQE %p, status %d\n",
+				QP_ID(qp), wqe, wqe->wr_status);
+			list_del(&wqe->list);
+			siw_wqe_put(wqe);
+		}
+	}
+
+	if (!list_empty(&qp->orq)) {
+		list_for_each_safe(pos, n, &qp->orq) {
+			wqe = list_entry_wqe(pos);
+			dprint(DBG_WR,
+				" (QP%d): Flush ORQ WQE %p, type %d,"
+				" status %d\n", QP_ID(qp), wqe, wr_type(wqe),
+				wqe->wr_status);
+			/* entries already done keep their status */
+			if (wqe->wr_status != SR_WR_DONE) {
+				async_event = 1;
+				wqe->wc_status = IB_WC_WR_FLUSH_ERR;
+				wqe->wr_status = SR_WR_DONE;
+			}
+			if (cq) {
+				lock_cq(cq);
+				list_move_tail(&wqe->list, &cq->queue);
+				/* TODO: enforce CQ limits */
+				atomic_inc(&cq->qlen);
+				unlock_cq(cq);
+			} else {
+				list_del(&wqe->list);
+				siw_wqe_put(wqe);
+			}
+		}
+	}
+	/*
+	 * Braces are required here: without them only the assignment
+	 * is guarded and the loop merely looks conditional.
+	 */
+	if (!list_empty(&qp->sq)) {
+		async_event = 1;
+		list_for_each_safe(pos, n, &qp->sq) {
+			wqe = list_entry_wqe(pos);
+			dprint(DBG_WR,
+				" (QP%d): Flush SQ WQE %p, type %d\n",
+				QP_ID(qp), wqe, wr_type(wqe));
+			if (cq) {
+				wqe->wc_status = IB_WC_WR_FLUSH_ERR;
+				wqe->wr_status = SR_WR_DONE;
+				lock_cq(cq);
+				list_move_tail(&wqe->list, &cq->queue);
+				/* TODO: enforce CQ limits */
+				atomic_inc(&cq->qlen);
+				unlock_cq(cq);
+			} else  {
+				list_del(&wqe->list);
+				siw_wqe_put(wqe);
+			}
+		}
+	}
+	/* the whole SQ is available again */
+	atomic_set(&qp->sq_space, qp->attrs.sq_size);
+
+	/*
+	 * wqe is only tested against NULL here; it is non-NULL if any
+	 * entry was visited above and must not be dereferenced.
+	 */
+	if (wqe != NULL && cq != NULL && cq->ofa_cq.comp_handler != NULL)
+		(*cq->ofa_cq.comp_handler)(&cq->ofa_cq, cq->ofa_cq.cq_context);
+
+	if (async_event)
+		siw_async_ev(qp, NULL, IB_EVENT_SQ_DRAINED);
+}
+
+/*
+ * siw_rq_flush()
+ *
+ * Flush recv queue entries to cq. An in-progress WQE may have some bytes
+ * processed (wqe->processed).
+ *
+ * If the QP has no receive CQ, the flushed entries are released
+ * instead. For every flushed entry one unit of RQ space (or SRQ
+ * space, if the QP uses an SRQ) is given back.
+ *
+ * Must be called with qp state write lock held.
+ * Therefore, RQ lock must not be taken.
+ */
+void siw_rq_flush(struct siw_qp *qp)
+{
+	struct list_head	*pos, *n;
+	struct siw_wqe		*wqe;
+	struct siw_cq		*cq;
+
+	dprint(DBG_OBJ|DBG_CM|DBG_WR, "(QP%d): Enter\n", QP_ID(qp));
+
+	/*
+	 * Flush an in-progress WQE if present: unless the current
+	 * inbound operation is an RDMA WRITE, the WQE is put back at the
+	 * head of the RQ so the loop below flushes it first. For an RDMA
+	 * WRITE only the rx memory reference is dropped.
+	 */
+	if (rx_wqe(qp)) {
+		if (qp->rx_ctx.hdr.ctrl.opcode != RDMAP_RDMA_WRITE)
+			list_add(&rx_wqe(qp)->list, &qp->rq);
+		else
+			siw_mem_put(rx_mem(qp));
+
+		rx_wqe(qp) = NULL;
+	}
+	/* nothing to flush: return without calling the completion handler */
+	if (list_empty(&qp->rq))
+		return;
+
+	cq = qp->rcq;
+
+	list_for_each_safe(pos, n, &qp->rq) {
+		wqe = list_entry_wqe(pos);
+		list_del_init(&wqe->list);
+		if (cq) {
+			wqe->wc_status = IB_WC_WR_FLUSH_ERR;
+			lock_cq(cq);
+			list_add_tail(&wqe->list, &cq->queue);
+			/* TODO: enforce CQ limits */
+			atomic_inc(&cq->qlen);
+			unlock_cq(cq);
+		} else
+			siw_wqe_put(wqe);
+
+		/* give the receive slot back to its owner */
+		if (!qp->srq)
+			atomic_inc(&qp->rq_space);
+		else
+			atomic_inc(&qp->srq->space);
+
+	}
+	if (cq != NULL && cq->ofa_cq.comp_handler != NULL)
+		(*cq->ofa_cq.comp_handler)(&cq->ofa_cq, cq->ofa_cq.cq_context);
+}
-- 
1.5.4.3


^ permalink raw reply related

* [PATCH] SIW: Completion queue
From: Bernard Metzler @ 2010-10-05  6:55 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_cq.c |  243 ++++++++++++++++++++++++++++++++++++
 1 files changed, 243 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_cq.c

diff --git a/drivers/infiniband/hw/siw/siw_cq.c b/drivers/infiniband/hw/siw/siw_cq.c
new file mode 100644
index 0000000..441f128
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_cq.c
@@ -0,0 +1,243 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/list.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+/*
+ * Maps a siw work request type (array index) to the IB work completion
+ * opcode reported by siw_wc_siw2ofa().
+ */
+static int siw_wc_op_siw2ofa[SIW_WR_NUM] = {
+	[SIW_WR_RDMA_WRITE]		= IB_WC_RDMA_WRITE,
+	[SIW_WR_RDMA_WRITE_WITH_IMM]	= IB_WC_RDMA_WRITE,
+	[SIW_WR_SEND]			= IB_WC_SEND,
+	[SIW_WR_SEND_WITH_IMM]		= IB_WC_SEND,
+	[SIW_WR_RDMA_READ_REQ]		= IB_WC_RDMA_READ,
+	[SIW_WR_ATOMIC_CMP_AND_SWP]	= IB_WC_COMP_SWAP,
+	[SIW_WR_ATOMIC_FETCH_AND_ADD]	= IB_WC_FETCH_ADD,
+	[SIW_WR_BIND_MW]		= IB_WC_BIND_MW,
+	[SIW_WR_FASTREG]		= IB_WC_FAST_REG_MR,
+	[SIW_WR_INVAL_STAG]		= IB_WC_LOCAL_INV,
+	[SIW_WR_RECEIVE]		= IB_WC_RECV,
+	[SIW_WR_RDMA_READ_RESP]		= 0 /* not used */
+};
+
+/*
+ * Fill an OFA work completion from a completed siw WQE.
+ *
+ * Only wr_id, status, byte_len, qp and opcode are set; all other
+ * fields of @ofa_wc are left zeroed.
+ */
+static void siw_wc_siw2ofa(struct siw_wqe *siw_wc, struct ib_wc *ofa_wc)
+{
+	memset(ofa_wc, 0, sizeof(*ofa_wc));
+
+	ofa_wc->qp = &siw_wc->qp->ofa_qp;
+	ofa_wc->wr_id = wr_id(siw_wc);
+	ofa_wc->byte_len = siw_wc->processed;
+	ofa_wc->status = siw_wc->wc_status;
+
+	/* the WR type indexes the translation table */
+	BUG_ON(wr_type(siw_wc) >= SIW_WR_NUM);
+	ofa_wc->opcode = siw_wc_op_siw2ofa[wr_type(siw_wc)];
+
+	/*
+	 * Fields intentionally left at zero by the memset above:
+	 * imm_data, vendor_err, src_qp, wc_flags (ADD immediate data
+	 * support), pkey_index, slid, sl, dlid_path_bits, port_num.
+	 */
+}
+
+/*
+ * Reap one CQE from the CQ.
+ *
+ * Removes the first element of the CQ, if any, translates it into
+ * @ofa_wc and drops the WQE reference.
+ * Returns 1 if a CQE was reaped, 0 if the CQ was empty.
+ *
+ * Caller must hold qp read lock
+ *
+ * TODO: Provide routine which can read more than one CQE
+ */
+int siw_reap_cqe(struct siw_cq *cq, struct ib_wc *ofa_wc)
+{
+	struct siw_wqe	*head = NULL;
+	unsigned long flags;
+
+	lock_cq_rxsave(cq, flags);
+	if (!list_empty(&cq->queue)) {
+		head = list_first_wqe(&cq->queue);
+		list_del(&head->list);
+		atomic_dec(&cq->qlen);
+	}
+	unlock_cq_rxsave(cq, flags);
+
+	if (!head)
+		return 0;
+
+	siw_wc_siw2ofa(head, ofa_wc);
+
+	dprint(DBG_WR, " QP%d, CQ%d: Reap WQE type: %d, p: %p\n",
+		  QP_ID(head->qp), OBJ_ID(cq), wr_type(head), head);
+
+	siw_wqe_put(head);
+
+	return 1;
+}
+
+/*
+ * siw_cq_flush()
+ *
+ * Unlink all CQ elements, drop their WQE references and reset the
+ * CQ length. No CQ lock is taken.
+ */
+void siw_cq_flush(struct siw_cq *cq)
+{
+	struct list_head	*cur, *next;
+
+	dprint(DBG_CM|DBG_OBJ, "(CQ%d:) Enter\n", OBJ_ID(cq));
+
+	if (!list_empty(&cq->queue)) {
+		list_for_each_safe(cur, next, &cq->queue) {
+			struct siw_wqe *cqe = list_entry_wqe(cur);
+
+			list_del(&cqe->list);
+
+			dprint(DBG_OBJ|DBG_WR,
+				" WQE: 0x%llu:, type: %d, p: %p\n",
+				(unsigned long long)wr_id(cqe),
+				wr_type(cqe), cqe);
+
+			siw_wqe_put(cqe);
+		}
+		/* the length is only reset if something was queued */
+		atomic_set(&cq->qlen, 0);
+	}
+}
+
+
+
+/*
+ * siw_rq_complete()
+ *
+ * Appends RQ/SRQ WQE to CQ, if assigned.
+ * Without a receive CQ the WQE reference is dropped instead.
+ * For a QP without SRQ, one unit of RQ space is given back in
+ * both cases.
+ *
+ * Must be called with qp state read locked
+ */
+void siw_rq_complete(struct siw_wqe *wqe, struct siw_qp *qp)
+{
+	struct siw_cq	*cq = qp->rcq;
+	unsigned long flags;
+
+	dprint(DBG_OBJ|DBG_WR, " QP%d WQE: 0x%llu:, type: %d, p: %p\n",
+		QP_ID(qp),
+		(unsigned long long)wr_id(wqe), wr_type(wqe), wqe);
+
+	if (cq) {
+		lock_cq_rxsave(cq, flags);
+
+		list_add_tail(&wqe->list, &cq->queue);
+		atomic_inc(&cq->qlen); /* FIXME: test overflow */
+
+		unlock_cq_rxsave(cq, flags);
+
+		/*
+		 * SRQ space was already incremented when WQE was fetched
+		 * by some QP
+		 */
+		if (!qp->srq)	/* XXX to be deferred to reaping ? */
+			atomic_inc(&qp->rq_space);
+
+		/*
+		 * Call the completion handler if the CQ is armed for all
+		 * completions, or armed for solicited ones and this WQE
+		 * carries IB_SEND_SOLICITED. The CQ is disarmed first.
+		 * Note that cq->notify is read and written here after
+		 * the CQ lock was released.
+		 */
+		if (cq->ofa_cq.comp_handler != NULL &&
+			((cq->notify & SIW_CQ_NOTIFY_ALL) ||
+			 (cq->notify == SIW_CQ_NOTIFY_SOLICITED &&
+			  wr_flags(wqe) & IB_SEND_SOLICITED))) {
+				cq->notify = SIW_CQ_NOTIFY_NOT;
+				(*cq->ofa_cq.comp_handler)
+					(&cq->ofa_cq, cq->ofa_cq.cq_context);
+		}
+	} else {
+		if (!qp->srq)
+			atomic_inc(&qp->rq_space);
+		siw_wqe_put(wqe);
+	}
+}
+
+/*
+ * siw_sq_complete()
+ * Appends list of former SQ WQE's to CQ, if assigned.
+ * Without a send CQ, the WQE's are unlinked from @c_list and their
+ * references are dropped.
+ *
+ * @c_list:	list of completed WQE's
+ * @qp:		QP the WQE's belong to
+ * @num:	number of WQE's on @c_list; added to the CQ length and
+ *		given back as SQ space
+ * @send_flags:	checked for IB_SEND_SOLICITED when the CQ is armed
+ *		for solicited completions only
+ *
+ * Must be called with qp state read locked
+ */
+void siw_sq_complete(struct list_head *c_list, struct siw_qp *qp, int num,
+		     enum ib_send_flags send_flags)
+{
+	struct siw_cq		*cq = qp->scq;
+	unsigned long flags;
+
+	if (cq) {
+		lock_cq_rxsave(cq, flags);
+
+		list_splice_tail(c_list, &cq->queue);
+		atomic_add(num, &cq->qlen); /* FIXME: test overflow */
+
+
+		dprint(DBG_WR, " CQ%d: add %d from QP%d, CQ len %d\n",
+			OBJ_ID(cq), num, QP_ID(qp), atomic_read(&cq->qlen));
+
+		/* XXX to be deferred to reaping */
+		atomic_add(num, &qp->sq_space);
+
+		/*
+		 * NOTE(review): unlike siw_rq_complete(), the completion
+		 * handler is called with the CQ lock still held; confirm
+		 * this is intended.
+		 */
+		if (cq->ofa_cq.comp_handler != NULL &&
+			((cq->notify & SIW_CQ_NOTIFY_ALL) ||
+			 (cq->notify == SIW_CQ_NOTIFY_SOLICITED &&
+			  send_flags & IB_SEND_SOLICITED))) {
+				cq->notify = SIW_CQ_NOTIFY_NOT;
+				(*cq->ofa_cq.comp_handler)
+					(&cq->ofa_cq, cq->ofa_cq.cq_context);
+		}
+		unlock_cq_rxsave(cq, flags);
+	} else {
+		struct list_head *pos, *n;
+
+		/*
+		 * The put may release the WQE that embeds the list
+		 * linkage, so fetch the successor up front and unlink
+		 * the entry before dropping it.
+		 */
+		list_for_each_safe(pos, n, c_list) {
+			struct siw_wqe *wqe = list_entry_wqe(pos);
+
+			list_del(&wqe->list);
+			siw_wqe_put(wqe);
+		}
+
+		atomic_add(num, &qp->sq_space);
+	}
+}
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] SIW: Transmit path
From: Bernard Metzler @ 2010-10-05  6:55 UTC (permalink / raw)
  To: netdev; +Cc: linux-rdma, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_qp_tx.c | 1309 +++++++++++++++++++++++++++++++++
 1 files changed, 1309 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_qp_tx.c

diff --git a/drivers/infiniband/hw/siw/siw_qp_tx.c b/drivers/infiniband/hw/siw/siw_qp_tx.c
new file mode 100644
index 0000000..ef774eb
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_qp_tx.c
@@ -0,0 +1,1309 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt@zurich.ibm.com>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+/*
+ * Module parameter (mode 0644), enabled by default. A non-zero value
+ * is one of the preconditions siw_qp_prepare_tx() tests before
+ * selecting the sendpage transmit path.
+ */
+static int zcopy_tx = 1;
+module_param(zcopy_tx, int, 0644);
+MODULE_PARM_DESC(zcopy_tx, "Zero copy user data transmit if possible");
+
+/*
+ * Per-CPU atomic counter.
+ * NOTE(review): presumably the number of queued work items per CPU;
+ * confirm against its users.
+ */
+DEFINE_PER_CPU(atomic_t, siw_workq_len);
+
+/*
+ * (Re)start the MPA CRC of the tx context and feed the first
+ * ctx->ctrl_len bytes of the packet header into it.
+ * Returns the result of siw_crc_array().
+ */
+static inline int siw_crc_txhdr(struct siw_iwarp_tx *ctx)
+{
+	u8 *hdr = (u8 *)&ctx->pkt;
+
+	crypto_hash_init(&ctx->mpa_crc_hd);
+
+	return siw_crc_array(&ctx->mpa_crc_hd, hdr, ctx->ctrl_len);
+}
+
+#define PKT_FRAGMENTED 1
+#define PKT_COMPLETE 0
+
+/*
+ * siw_qp_prepare_tx()
+ *
+ * Prepare tx state for sending out one fpdu. Builds complete pkt
+ * if no user data or only immediate data are present.
+ *
+ * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise.
+ */
+static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
+{
+	struct siw_wqe		*wqe = c_tx->wqe;
+	u32			*crc = NULL;
+
+	dprint(DBG_TX, "(QP%d):\n", TX_QPID(c_tx));
+
+	switch (wr_type(wqe)) {
+
+	case SIW_WR_RDMA_READ_REQ:
+		memcpy(&c_tx->pkt.ctrl,
+		       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
+		       sizeof(struct iwarp_ctrl));
+
+		c_tx->pkt.rreq.rsvd = 0;
+		c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
+		c_tx->pkt.rreq.ddp_msn =
+			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]);
+		c_tx->pkt.rreq.ddp_mo = 0;
+		c_tx->pkt.rreq.sink_stag = htonl(wqe->wr.rread.sge[0].lkey);
+		c_tx->pkt.rreq.sink_to =
+			cpu_to_be64(wqe->wr.rread.sge[0].addr); /* abs addr! */
+		c_tx->pkt.rreq.source_stag = htonl(wqe->wr.rread.rtag);
+		c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->wr.rread.raddr);
+		c_tx->pkt.rreq.read_size = htonl(wqe->bytes);
+
+		dprint(DBG_TX, ": RREQ: Sink: %x, 0x%016llx\n",
+			wqe->wr.rread.sge[0].lkey, wqe->wr.rread.sge[0].addr);
+
+		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq);
+		crc = &c_tx->pkt.rreq_pkt.crc;
+		break;
+
+	case SIW_WR_SEND:
+		if (wr_flags(wqe) & IB_SEND_SOLICITED)
+			memcpy(&c_tx->pkt.ctrl,
+			       &iwarp_pktinfo[RDMAP_SEND_SE].ctrl,
+			       sizeof(struct iwarp_ctrl));
+		else
+			memcpy(&c_tx->pkt.ctrl,
+			       &iwarp_pktinfo[RDMAP_SEND].ctrl,
+			       sizeof(struct iwarp_ctrl));
+
+		c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
+		c_tx->pkt.send.ddp_msn =
+			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+		c_tx->pkt.send.ddp_mo = 0;
+		c_tx->pkt.send.rsvd = 0;
+
+		c_tx->ctrl_len = sizeof(struct iwarp_send);
+
+		if (!wqe->bytes)
+			crc = &c_tx->pkt.send_pkt.crc;
+		break;
+
+	case SIW_WR_RDMA_WRITE:
+		memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl,
+		       sizeof(struct iwarp_ctrl));
+
+		c_tx->pkt.rwrite.sink_stag = htonl(wqe->wr.write.rtag);
+		c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->wr.write.raddr);
+		c_tx->ctrl_len = sizeof(struct iwarp_rdma_write);
+
+		if (!wqe->bytes)
+			crc = &c_tx->pkt.write_pkt.crc;
+		break;
+
+	case SIW_WR_RDMA_READ_RESP:
+		memcpy(&c_tx->pkt.ctrl,
+		       &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl,
+		       sizeof(struct iwarp_ctrl));
+
+		/* NBO */
+		c_tx->pkt.rresp.sink_stag = wqe->wr.rresp.rtag;
+		c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->wr.rresp.raddr);
+
+		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp);
+
+		dprint(DBG_TX, ": RRESP: Sink: %x, 0x%016llx\n",
+			wqe->wr.rresp.rtag, wqe->wr.rresp.raddr);
+
+		if (!wqe->bytes)
+			crc = &c_tx->pkt.rresp_pkt.crc;
+		break;
+
+	default:
+		dprint(DBG_ON, "Unsupported WQE type %d\n", wr_type(wqe));
+		BUG();
+		break;
+	}
+	c_tx->ctrl_sent = 0;
+	c_tx->sge_idx = 0;
+	c_tx->sge_off = 0;
+	c_tx->pg_idx = 0;
+	c_tx->umem_chunk = NULL;
+
+	/*
+	 * Do complete CRC if enabled and short packet
+	 */
+	if (crc) {
+		*crc = 0;
+		if (c_tx->crc_enabled) {
+			if (siw_crc_txhdr(c_tx) != 0)
+				return -EINVAL;
+			crypto_hash_final(&c_tx->mpa_crc_hd, (u8 *)crc);
+		}
+	}
+	c_tx->ctrl_len += MPA_CRC_SIZE;
+
+	/*
+	 * Allow direct sending out of user buffer if WR is non signalled
+	 * and payload is over threshold and no CRC is enabled.
+	 * Per RDMA verbs, the application should not change the send buffer
+	 * until the work completed. In iWarp, work completion is only
+	 * local delivery to TCP. TCP may reuse the buffer for
+	 * retransmission or may even did not yet sent the data. Changing
+	 * unsent data also breaks the CRC, if applied.
+	 */
+	if (zcopy_tx &&
+	     !(wr_flags(wqe) & IB_SEND_SIGNALED) &&
+	     wqe->bytes > SENDPAGE_THRESH &&
+	     wr_type(wqe) != SIW_WR_RDMA_READ_REQ)
+		c_tx->use_sendpage = 1;
+	else
+		c_tx->use_sendpage = 0;
+
+	return crc == NULL ? PKT_FRAGMENTED : PKT_COMPLETE;
+}
+
+/*
+ * Send out one complete FPDU. Used for fixed sized packets like
+ * Read Requests or zero length SENDs, WRITEs, READ.responses.
+ * Also used for pushing an FPDU hdr only.
+ *
+ * Sends the not yet transmitted part of the control header.
+ * Returns 0 if the header is now completely sent, -EAGAIN if part
+ * of it is still outstanding, or the negative error returned by
+ * kernel_sendmsg().
+ */
+static inline int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
+			      int flags)
+{
+	struct msghdr msg = {.msg_flags = flags};
+	size_t left = c_tx->ctrl_len - c_tx->ctrl_sent;
+	struct kvec iov = {
+		.iov_base = (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent,
+		.iov_len = left};
+	int rv;
+
+	rv = kernel_sendmsg(s, &msg, &iov, 1, left);
+
+	dprint(DBG_TX, " (QP%d): op=%d, %d of %d sent (%d)\n",
+		TX_QPID(c_tx), c_tx->pkt.ctrl.opcode,
+		c_tx->ctrl_sent + rv, c_tx->ctrl_len, rv);
+
+	if (rv < 0)
+		return rv;
+
+	c_tx->ctrl_sent += rv;
+
+	/* more bytes sent than the header holds must never happen */
+	if (c_tx->ctrl_sent > c_tx->ctrl_len)
+		BUG();
+
+	if (c_tx->ctrl_sent < c_tx->ctrl_len)
+		return -EAGAIN;
+
+	siw_dprint_hdr(&c_tx->pkt.hdr, TX_QPID(c_tx), "CTRL sent");
+
+	if (!(flags & MSG_MORE))
+		c_tx->new_tcpseg = 1;
+
+	return 0;
+}
+
+/*
+ * 0copy TCP transmit interface.
+ *
+ * Push page array page by page or in one shot.
+ * Pushing the whole page array requires the inner do_tcp_sendpages
+ * function to be exported by the kernel.
+ *
+ * @s:		socket to send on
+ * @page:	array of pages holding the data
+ * @offset:	byte offset of the data within the first page
+ * @size:	number of bytes to send
+ *
+ * With SIW_SENDPAGES_EXPORT the result of do_tcp_sendpages() is
+ * returned, with -EAGAIN mapped to 0. Otherwise the number of bytes
+ * pushed is returned if the last sendpage call succeeded or returned
+ * -EAGAIN; any other error is returned as is.
+ */
+static int siw_tcp_sendpages(struct socket *s, struct page **page,
+			     int offset, size_t size)
+{
+	int rv = 0;
+
+#ifdef SIW_SENDPAGES_EXPORT
+	struct sock *sk = s->sk;
+
+	if (!(sk->sk_route_caps & NETIF_F_SG) ||
+	    !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) {
+		/* FIXME:
+		 * This should also be handled in a
+		 * loop
+		 */
+		return -EFAULT;
+	}
+
+	lock_sock(sk);
+	TCP_CHECK_TIMER(sk);
+
+	/*
+	 * just return what sendpages has returned
+	 */
+	rv = do_tcp_sendpages(sk, page, offset, size, MSG_MORE|MSG_DONTWAIT);
+
+	TCP_CHECK_TIMER(sk);
+	release_sock(sk);
+	if (rv == -EAGAIN)
+		rv = 0;
+#else
+	/*
+	 * If do_tcp_sendpages() function is not exported
+	 * push page by page
+	 */
+	size_t todo = size;
+	int i;
+
+	for (i = 0; size > 0; i++) {
+		/* at most up to the end of the current page */
+		size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);
+
+		rv = s->ops->sendpage(s, page[i], offset, bytes,
+				      MSG_MORE|MSG_DONTWAIT);
+		if (rv <= 0)
+			break;
+
+		size -= rv;
+
+		/* short send: stop and report what went out so far */
+		if (rv != bytes)
+			break;
+
+		/* only the first page is entered at an offset */
+		offset = 0;
+	}
+	if (rv >= 0 || rv == -EAGAIN)
+		rv = todo - size;
+#endif
+	return rv;
+}
+
+/*
+ * siw_0copy_tx()
+ *
+ * Pushes list of pages to TCP socket. If pages from multiple
+ * SGE's, all referenced pages of each SGE are pushed in one
+ * shot.
+ *
+ * @s:		socket to send on
+ * @page:	array of pages referenced by the SGE's
+ * @sge:	first SGE to send from
+ * @offset:	byte offset into the first SGE
+ * @size:	total number of bytes to send
+ *
+ * Returns the number of bytes pushed, which may be less than @size,
+ * or the negative error returned by siw_tcp_sendpages().
+ */
+static int siw_0copy_tx(struct socket *s, struct page **page,
+			struct siw_sge *sge, unsigned int offset,
+			unsigned int size)
+{
+	int i = 0, sent = 0, rv;
+	int sge_bytes = min(sge->len - offset, size);
+
+	/* from here on, offset is the offset within the first page */
+	offset  = (sge->addr + offset) & ~PAGE_MASK;
+
+	while (sent != size) {
+
+		rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes);
+		if (rv >= 0) {
+			sent += rv;
+			/* all done, or short send within the current SGE */
+			if (size == sent || sge_bytes > rv)
+				break;
+
+			/* skip the pages covered by the SGE just sent */
+			i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT;
+			sge++;
+			sge_bytes = min(sge->len, size - sent);
+			offset = sge->addr & ~PAGE_MASK;
+		} else {
+			/* report the error instead of a byte count */
+			sent = rv;
+			break;
+		}
+	}
+	return sent;
+}
+
+/*
+ * siw_tx_umem_init()
+ *
+ * Resolve the umem chunk holding the page of a virtual address and
+ * the index of that page within the chunk.
+ *
+ * @chunk:	set to the resolved umem chunk
+ * @page_index:	set to the page index within that chunk
+ * @mr:		Memory Region
+ * @va:		Virtual Address within MR
+ *
+ */
+static void siw_tx_umem_init(struct ib_umem_chunk **chunk, int *page_index,
+			     struct siw_mr *mr, u64 va)
+{
+	struct ib_umem_chunk *ck;
+	int idx;
+
+	BUG_ON(va < mr->mem.va);
+
+	/*
+	 * Page number relative to the first page of the MR;
+	 * equivalent to
+	 * va += mr->umem->offset;
+	 * va = va >> PAGE_SHIFT;
+	 */
+	va -= mr->mem.va & PAGE_MASK;
+	idx = va >> PAGE_SHIFT;
+
+	/* skip the chunks in front of the one holding the page */
+	list_for_each_entry(ck, &mr->umem->chunk_list, list) {
+		if (idx < ck->nents)
+			break;
+		idx -= ck->nents;
+	}
+	BUG_ON(idx >= ck->nents);
+
+	dprint(DBG_MM, "(): New chunk 0x%p: Page idx %d, nents %d\n",
+		ck, idx, ck->nents);
+
+	*page_index = idx;
+	*chunk = ck;
+}
+
+/*
+ * update memory chunk and page index from given starting point
+ * before current transmit described by: c_tx->sge_off,
+ * sge->addr, c_tx->pg_idx, and c_tx->umem_chunk
+ *
+ * @c_tx:	tx context whose umem_chunk and pg_idx are advanced
+ * @mr:		memory region providing the chunk list
+ * @sge:	SGE the current transmit position belongs to
+ * @off:	number of bytes to advance
+ */
+static inline void
+siw_umem_chunk_update(struct siw_iwarp_tx *c_tx, struct siw_mr *mr,
+		      struct siw_sge *sge, unsigned int off)
+{
+	struct ib_umem_chunk *chunk = c_tx->umem_chunk;
+	u64 va_start = sge->addr + c_tx->sge_off;
+
+	off += (unsigned int)(va_start & ~PAGE_MASK); /* + first page offset */
+	off >>= PAGE_SHIFT; 	/* bytes offset becomes pages offset */
+
+	/*
+	 * Walk forward from the current chunk until the target page
+	 * falls into a chunk; for every chunk skipped, the pages left
+	 * in it are subtracted and the index restarts at 0.
+	 */
+	list_for_each_entry_from(chunk, &mr->umem->chunk_list, list) {
+		if (c_tx->pg_idx + off < chunk->nents)
+			break;
+		off -= chunk->nents - c_tx->pg_idx;
+		c_tx->pg_idx = 0;
+	}
+	c_tx->pg_idx += off;
+
+	c_tx->umem_chunk = chunk;
+}
+
+/*
+ * Upper bound of the trailer length siw_tx_hdt() puts on the wire.
+ * NOTE(review): presumably up to 4 pad bytes plus the 4 byte MPA
+ * CRC; confirm against the trailer definition.
+ */
+#define MAX_TRAILER 8
+#define MAX_ARRAY 130	/* Max number of kernel_sendmsg elements */
+
+/*
+ * Record the current transmit position (SGE index and offset,
+ * umem chunk and page index) in the tx context.
+ */
+static inline void
+siw_save_txstate(struct siw_iwarp_tx *c_tx, struct ib_umem_chunk *chunk,
+		 unsigned int pg_idx, unsigned int sge_idx,
+		 unsigned int sge_off)
+{
+	c_tx->sge_idx = sge_idx;
+	c_tx->sge_off = sge_off;
+
+	c_tx->umem_chunk = chunk;
+	c_tx->pg_idx = pg_idx;
+}
+/*
+ * siw_tx_hdt()
+ *
+ * Write out iov referencing hdr, data and trailer of current FPDU.
+ * Update transmit state dependent on write return status.
+ *
+ * In sendpage mode the header is pushed first via siw_tx_ctrl() and the
+ * payload pages are handed to siw_0copy_tx(); otherwise the header
+ * becomes iov[0] and the payload pages are kmap()'ed into the iov.
+ * Inlined data (SIW_INLINED_DATA) is referenced directly by address.
+ * The trailer always uses the iov slot following the last payload
+ * fragment, so payload fragments may use at most MAX_ARRAY - 1 slots.
+ *
+ * Returns 0 if the complete FPDU was handed over, -EAGAIN if only a
+ * part was accepted (tx state is updated for resuming), -EINVAL if the
+ * FPDU would need too many fragments, or the negative value returned
+ * by the send routines.
+ *
+ * @c_tx:	transmit context
+ * @s:		socket to send on
+ */
+static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
+{
+	struct siw_wqe		*wqe = c_tx->wqe;
+	struct siw_sge		*sge = &wqe->wr.sgl.sge[c_tx->sge_idx],
+				*first_sge = sge;
+	struct siw_mr		*mr = siw_mem2mr(sge->mem.obj);
+	struct ib_umem_chunk 	*chunk = c_tx->umem_chunk;
+
+	struct kvec		iov[MAX_ARRAY];
+	struct page 		*page_array[MAX_ARRAY];
+	struct msghdr		msg = {.msg_flags = MSG_DONTWAIT};
+
+	int			seg = 0, do_crc = c_tx->do_crc, kbuf = 0,
+				rv;
+	unsigned int		data_len = c_tx->bytes_unsent,
+				hdr_len = 0,
+				trl_len = 0,
+				sge_off = c_tx->sge_off,
+				sge_idx = c_tx->sge_idx,
+				pg_idx = c_tx->pg_idx;
+
+	if (SIW_INLINED_DATA(wqe)) {
+		kbuf = 1;
+		chunk = NULL;
+	}
+
+	if (c_tx->state == SIW_SEND_HDR) {
+		if (c_tx->use_sendpage) {
+			rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT|MSG_MORE);
+			if (rv)
+				goto done;
+
+			c_tx->state = SIW_SEND_DATA;
+		} else {
+			iov[0].iov_base =
+				(char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent;
+			iov[0].iov_len = hdr_len =
+				c_tx->ctrl_len - c_tx->ctrl_sent;
+			seg = 1;
+			siw_dprint_hdr(&c_tx->pkt.hdr, TX_QPID(c_tx),
+					"HDR to send: ");
+		}
+	}
+
+	/* optimistic: corrected below if not everything gets pushed */
+	wqe->processed += data_len;
+
+	while (data_len) { /* walk the list of SGE's */
+		unsigned int sge_len = min(sge->len - sge_off, data_len);
+		unsigned int fp_off = (sge->addr + sge_off) & ~PAGE_MASK;
+
+		BUG_ON(!sge_len);
+
+		if (kbuf) {
+			/*
+			 * In kernel buffers to be tx'ed.
+			 *
+			 * NOTE(review): no bound check on seg here;
+			 * presumably the SGE count of inlined data stays
+			 * far below MAX_ARRAY - confirm.
+			 */
+			iov[seg].iov_base =
+				(void *)(unsigned long)(sge->addr + sge_off);
+			iov[seg].iov_len = sge_len;
+			if (do_crc)
+				siw_crc_array(&c_tx->mpa_crc_hd,
+					      iov[seg].iov_base, sge_len);
+			sge_off += sge_len;
+			data_len -= sge_len;
+			seg++;
+			goto sge_done;
+		}
+		while (sge_len) {
+			struct scatterlist *sl;
+			size_t plen;
+
+			if (!chunk) {
+				mr = siw_mem2mr(sge->mem.obj);
+				siw_tx_umem_init(&chunk, &pg_idx, mr,
+						 sge->addr + sge_off);
+
+				if (!c_tx->umem_chunk)
+					/* Starting first tx for this WQE */
+					siw_save_txstate(c_tx, chunk, pg_idx,
+							 sge_idx, sge_off);
+			}
+			sl = &chunk->page_list[pg_idx];
+			plen = min((int)PAGE_SIZE - fp_off, sge_len);
+
+			BUG_ON(!plen);
+
+			page_array[seg] = sg_page(sl);
+
+			if (!c_tx->use_sendpage) {
+				iov[seg].iov_base = kmap(sg_page(sl)) + fp_off;
+				iov[seg].iov_len = plen;
+			}
+			if (do_crc)
+				siw_crc_sg(&c_tx->mpa_crc_hd, sl, fp_off, plen);
+
+			sge_len -= plen;
+			sge_off += plen;
+			data_len -= plen;
+
+			/* advance to next page, and next chunk if needed */
+			if (plen + fp_off == PAGE_SIZE &&
+			    sge_off < sge->len && ++pg_idx == chunk->nents) {
+				chunk = mem_chunk_next(chunk);
+				pg_idx = 0;
+			}
+			fp_off = 0;
+			/*
+			 * The trailer goes to iov[seg] after the loop, so
+			 * seg must stay below MAX_ARRAY. Checking for
+			 * '>= MAX_ARRAY' here keeps both the next fragment
+			 * and the trailer inside iov[] and page_array[].
+			 */
+			if (++seg >= MAX_ARRAY) {
+				dprint(DBG_ON, "(QP%d): Too many fragments\n",
+				       TX_QPID(c_tx));
+				if (!c_tx->use_sendpage) {
+					/*
+					 * Pages were kmap()'ed only if not
+					 * in sendpage mode; unmap all of
+					 * them, skipping the hdr slot.
+					 */
+					int i = (hdr_len > 0) ? 1 : 0;
+					while (i < seg)
+						kunmap(page_array[i++]);
+				}
+				wqe->processed = 0;
+				rv = -EINVAL;
+				goto done_crc;
+			}
+		}
+sge_done:
+		/* Update SGE variables at end of SGE */
+		if (sge_off == sge->len && wqe->processed < wqe->bytes) {
+			sge_idx++;
+			sge++;
+			sge_off = 0;
+			chunk = NULL;
+		}
+	}
+	/* trailer */
+	if (likely(c_tx->state != SIW_SEND_TRAILER)) {
+		iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad];
+		iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad);
+	} else {
+		/* resuming a partially sent trailer */
+		iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent];
+		iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent;
+	}
+
+	if (c_tx->pad) {
+		*(u32 *)c_tx->trailer.pad = 0;
+		if (do_crc)
+			siw_crc_array(&c_tx->mpa_crc_hd,
+				      (u8 *)&c_tx->trailer.crc - c_tx->pad,
+				      c_tx->pad);
+	}
+	if (!c_tx->crc_enabled)
+		c_tx->trailer.crc = 0;
+	else if (do_crc)
+		crypto_hash_final(&c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc);
+
+	/* data_len was consumed by the loop above: restore it */
+	data_len = c_tx->bytes_unsent;
+
+	if (c_tx->tcp_seglen >= (int)MPA_MIN_FRAG && TX_MORE_WQE(TX_QP(c_tx))) {
+		msg.msg_flags |= MSG_MORE;
+		c_tx->new_tcpseg = 0;
+	} else
+		c_tx->new_tcpseg = 1;
+
+	if (c_tx->use_sendpage) {
+		rv = siw_0copy_tx(s, page_array, first_sge, c_tx->sge_off,
+				  data_len);
+		if (rv == data_len) {
+			/* all data out: now push the trailer */
+			rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len);
+			if (rv > 0)
+				rv += data_len;
+			else
+				rv = data_len;
+		}
+	} else {
+		rv = kernel_sendmsg(s, &msg, iov, seg + 1,
+				    hdr_len + data_len + trl_len);
+		if (!kbuf) {
+			int i = (hdr_len > 0) ? 1 : 0;
+			while (i < seg)
+				kunmap(page_array[i++]);
+		}
+	}
+	if (rv < (int)hdr_len) {
+		/* Not even complete hdr pushed or negative rv */
+		wqe->processed -= data_len;
+		if (rv >= 0) {
+			c_tx->ctrl_sent += rv;
+			rv = -EAGAIN;
+		}
+		goto done_crc;
+	}
+
+	rv -= hdr_len;
+
+	if (rv >= (int)data_len) {
+		/* all user data pushed to TCP or no data to push */
+		if (data_len > 0 && wqe->processed < wqe->bytes)
+			/* Save the current state for next tx */
+			siw_save_txstate(c_tx, chunk, pg_idx, sge_idx, sge_off);
+
+		rv -= data_len;
+
+		if (rv == trl_len) /* all pushed */
+			rv = 0;
+		else {
+			c_tx->state = SIW_SEND_TRAILER;
+			c_tx->ctrl_len = MAX_TRAILER;
+			c_tx->ctrl_sent = rv + 4 - c_tx->pad;
+			c_tx->bytes_unsent = 0;
+			rv = -EAGAIN;
+		}
+
+	} else if (data_len > 0) {
+		/* Maybe some user data pushed to TCP */
+		c_tx->state = SIW_SEND_DATA;
+		wqe->processed -= data_len - rv;
+
+		if (rv) {
+			/*
+			 * Some bytes out. Recompute tx state based
+			 * on old state and bytes pushed
+			 */
+			c_tx->bytes_unsent -= rv;
+			sge = &wqe->wr.sgl.sge[c_tx->sge_idx];
+
+			if (c_tx->sge_idx == sge_idx && c_tx->umem_chunk)
+				/*
+				 * same SGE as starting SGE for this FPDU
+				 */
+				siw_umem_chunk_update(c_tx, mr, sge, rv);
+			else {
+				while (sge->len <= c_tx->sge_off + rv) {
+					rv -= sge->len - c_tx->sge_off;
+					sge = &wqe->wr.sgl.sge[++c_tx->sge_idx];
+					c_tx->sge_off = 0;
+				}
+				c_tx->umem_chunk = NULL;
+			}
+			c_tx->sge_off += rv;
+			BUG_ON(c_tx->sge_off >= sge->len);
+		}
+		rv = -EAGAIN;
+	}
+done_crc:
+	/* CRC over the payload was computed (or abandoned) above */
+	c_tx->do_crc = 0;
+done:
+	return rv;
+}
+
+/*
+ * siw_calculate_tcpseg()
+ *
+ * Refresh the TCP segment length from get_tcp_mss() if we start a new
+ * segment, or the remaining segment length is less than MPA_MIN_FRAG,
+ * or the socket send buffer is empty. Otherwise keep the current value.
+ */
+static void siw_calculate_tcpseg(struct siw_iwarp_tx *c_tx, struct socket *s)
+{
+	struct sock *sk = s->sk;
+
+	/* still filling a segment that has enough room left */
+	if (!c_tx->new_tcpseg && c_tx->tcp_seglen >= (int)MPA_MIN_FRAG &&
+	    tcp_send_head(sk))
+		return;
+
+	c_tx->tcp_seglen = get_tcp_mss(sk);
+}
+
+
+/*
+ * siw_unseg_txlen()
+ *
+ * Compute the complete tcp payload len needed if all unsent bytes
+ * would go into one FPDU: control header, payload, padding of the
+ * payload to a multiple of 4 bytes, and the MPA CRC.
+ */
+static inline int siw_unseg_txlen(struct siw_iwarp_tx *c_tx)
+{
+	int pad = 0;
+
+	if (c_tx->bytes_unsent)
+		pad = -c_tx->bytes_unsent & 0x3;
+
+	return c_tx->bytes_unsent + c_tx->ctrl_len + pad + MPA_CRC_SIZE;
+}
+
+
+/*
+ * siw_prepare_fpdu()
+ *
+ * Set up the transmit context for sending one FPDU which carries user
+ * data that is not immediate data: header length, target offset in
+ * the DDP header, payload length, padding, MPA length and CRC state.
+ * The payload is trimmed to what is left of the current TCP segment.
+ * Finally the memory to be sent is checked via siw_check_sgl() or, for
+ * a READ response, siw_check_sge().
+ *
+ * Returns 0, or the result of that memory check.
+ *
+ * @qp:		QP from which to transmit
+ * @wqe:	Current WQE causing transmission
+ *
+ * TODO: Take into account real available sendspace on socket
+ *       to avoid header misalignment due to send pausing within
+ *       fpdu transmission
+ */
+int siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+	struct siw_iwarp_tx	*c_tx  = &qp->tx_ctx;
+	struct siw_sge		*sge;
+
+	/*
+	 * TODO: TCP Fragmentation dynamics needs for further investigation.
+	 * 	 Resuming SQ processing may start with full-sized packet
+	 *	 or short packet which resets MSG_MORE and thus helps
+	 *	 to synchronize.
+	 *	 This version resumes with short packet.
+	 */
+	c_tx->ctrl_sent = 0;
+	c_tx->ctrl_len = iwarp_pktinfo[c_tx->pkt.ctrl.opcode].hdr_len;
+
+	/* Target buffer offset: MO if untagged, TO if tagged */
+	if (!c_tx->pkt.ctrl.t)
+		c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed);
+	else if (wr_type(wqe) == SIW_WR_RDMA_READ_RESP)
+		c_tx->pkt.c_tagged.ddp_to =
+		    cpu_to_be64(wqe->wr.rresp.raddr + wqe->processed);
+	else
+		c_tx->pkt.c_tagged.ddp_to =
+		    cpu_to_be64(wqe->wr.write.raddr + wqe->processed);
+
+	/* Start out assuming everything left goes into one DDP segment */
+	c_tx->bytes_unsent = wqe->bytes - wqe->processed;
+	c_tx->tcp_seglen -= siw_unseg_txlen(c_tx);
+
+	if (c_tx->tcp_seglen < 0) {
+		/* Does not fit: cut payload, keep it 4-byte aligned */
+		c_tx->bytes_unsent += c_tx->tcp_seglen;
+		c_tx->bytes_unsent &= ~0x3;
+		c_tx->pad = 0;
+		c_tx->pkt.ctrl.l = 0;
+	} else {
+		/* Fits: this is the last DDP segment of the message */
+		c_tx->pkt.ctrl.l = 1;
+		c_tx->pad = -c_tx->bytes_unsent & 0x3;
+	}
+	c_tx->pkt.ctrl.mpa_len =
+		htons(c_tx->ctrl_len + c_tx->bytes_unsent - MPA_HDR_SIZE);
+
+#ifdef SIW_TX_FULLSEGS
+	c_tx->fpdu_len =
+		c_tx->ctrl_len + c_tx->bytes_unsent + c_tx->pad + MPA_CRC_SIZE;
+#endif
+	/* Start MPA CRC with the header, payload is added while sending */
+	if (c_tx->crc_enabled) {
+		siw_crc_txhdr(c_tx);
+		c_tx->do_crc = 1;
+	}
+
+	/* Nothing to check without payload or with inlined payload */
+	if (!c_tx->bytes_unsent || SIW_INLINED_DATA(wqe))
+		return 0;
+
+	sge = &wqe->wr.sgl.sge[c_tx->sge_idx];
+
+	BUG_ON(c_tx->sge_idx > wqe->wr.sgl.num_sge - 1);
+
+	/* Reference memory to be tx'd */
+	if (wr_type(wqe) == SIW_WR_RDMA_READ_RESP)
+		return siw_check_sge(qp->pd, sge, SR_MEM_RREAD,
+				     c_tx->sge_off, c_tx->bytes_unsent);
+
+	return siw_check_sgl(qp->pd, sge, SR_MEM_LREAD,
+			     c_tx->sge_off, c_tx->bytes_unsent);
+}
+
+#ifdef SIW_TX_FULLSEGS
+/*
+ * siw_test_wspace()
+ *
+ * Test under the socket lock if the socket's write space can take the
+ * complete FPDU (c_tx->fpdu_len). If not, flag SOCK_NOSPACE on the
+ * socket and return -EAGAIN. Returns 0 otherwise.
+ */
+static inline int siw_test_wspace(struct socket *s, struct siw_iwarp_tx *c_tx)
+{
+	struct sock *sock = s->sk;
+	int enough;
+
+	lock_sock(sock);
+
+	enough = sk_stream_wspace(sock) >= (int)c_tx->fpdu_len;
+	if (!enough)
+		set_bit(SOCK_NOSPACE, &s->flags);
+
+	release_sock(sock);
+
+	return enough ? 0 : -EAGAIN;
+}
+#endif
+/*
+ * siw_qp_sq_proc_tx()
+ *
+ * Process one WQE which needs transmission on the wire.
+ * A WQE in state SR_WR_QUEUED gets prepared for sending first. Then
+ * either one short FPDU is sent via siw_tx_ctrl(), or FPDUs are sent
+ * one after the other via siw_tx_hdt() until the last DDP segment is
+ * out or sending must stop.
+ *
+ * Return with:
+ *	-EAGAIN, if handover to tcp remained incomplete
+ *	0,	 if handover to tcp complete
+ *	< 0,	 if other errors happened.
+ *
+ * @qp:		QP to send from
+ * @wqe:	WQE causing transmission
+ */
+static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+	struct socket	 	*s = qp->attrs.llp_stream_handle;
+	struct siw_iwarp_tx	*c_tx = &qp->tx_ctx;
+	int			rv = 0;
+
+	if (wqe->wr_status == SR_WR_QUEUED) {
+		/* fresh WQE: set up the tx context */
+		wqe->wr_status = SR_WR_INPROGRESS;
+
+		siw_calculate_tcpseg(c_tx, s);
+
+		rv = siw_qp_prepare_tx(c_tx);
+		if (rv == PKT_FRAGMENTED) {
+			c_tx->state = SIW_SEND_HDR;
+			rv = siw_prepare_fpdu(qp, wqe);
+			if (rv)
+				return rv;
+		} else if (rv == PKT_COMPLETE) {
+			c_tx->state = SIW_SEND_SHORT_FPDU;
+		} else {
+			return rv;
+		}
+	}
+
+	for (;;) {
+#ifdef SIW_TX_FULLSEGS
+		rv = siw_test_wspace(s, c_tx);
+		if (rv < 0)
+			return rv;
+#endif
+		if (c_tx->state == SIW_SEND_SHORT_FPDU) {
+			enum siw_wr_opcode tx_type = wr_type(wqe);
+
+			/*
+			 * Always end current TCP segment (no MSG_MORE flag):
+			 * trying to fill segment would result in excessive
+			 * delay.
+			 */
+			rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT);
+
+			if (!rv && tx_type != SIW_WR_RDMA_READ_REQ)
+				wqe->processed = wqe->bytes;
+
+			return rv;
+		}
+
+		rv = siw_tx_hdt(c_tx, s);
+		if (rv)
+			return rv;
+
+		/*
+		 * Verbs, 6.4.: Try stopping sending after a full DDP segment
+		 * if the connection goes down (== peer halfclose)
+		 */
+		if (unlikely(c_tx->tx_suspend))
+			return -ECONNABORTED;
+
+		/*
+		 * One segment sent. Processing completed if last segment.
+		 * Do next segment otherwise. Stop if tx error.
+		 */
+		if (c_tx->pkt.ctrl.l == 1) {
+			dprint(DBG_TX, "(QP%d): WR completed\n", QP_ID(qp));
+			return 0;
+		}
+		c_tx->state = SIW_SEND_HDR;
+
+		siw_calculate_tcpseg(c_tx, s);
+
+		rv = siw_prepare_fpdu(qp, wqe);
+		if (rv)
+			return rv;
+	}
+}
+
+
+/*
+ * siw_wqe_sq_processed()
+ *
+ * Called after WQE processing completed.
+ * If WQE is not of signalled type, it can be released: the SQ space
+ * counter is incremented and the WQE reference is dropped.
+ * If the ORQ is empty, a signalled WQE is completed immediately via
+ * siw_sq_complete(). Otherwise, it is appended to the end of the ORQ
+ * for later completion. To keep WQE ordering, the ORQ is always
+ * consumed FIFO.
+ *
+ * @wqe:	WQE whose processing finished
+ * @qp:		QP the WQE belongs to
+ */
+static void siw_wqe_sq_processed(struct siw_wqe *wqe, struct siw_qp *qp)
+{
+	unsigned long flags;
+	LIST_HEAD(c_list);
+
+	if (!(wr_flags(wqe) & IB_SEND_SIGNALED)) {
+		atomic_inc(&qp->sq_space);
+		siw_wqe_put(wqe);
+		return;
+	}
+	lock_orq_rxsave(qp, flags);
+
+	if (ORQ_EMPTY(qp)) {
+		unlock_orq_rxsave(qp, flags);
+		dprint(DBG_WR|DBG_TX,
+			"(QP%d): Immediate completion, wr_type %d\n",
+			QP_ID(qp), wr_type(wqe));
+		list_add_tail(&wqe->list, &c_list);
+		siw_sq_complete(&c_list, qp, 1, wr_flags(wqe));
+	} else {
+		list_add_tail(&wqe->list, &qp->orq);
+		/*
+		 * The ORQ lock taken above must be dropped on this path
+		 * as well, or it is held (and flags stay unrestored)
+		 * after returning.
+		 */
+		unlock_orq_rxsave(qp, flags);
+		dprint(DBG_WR|DBG_TX,
+			"(QP%d): Defer completion, wr_type %d\n",
+			QP_ID(qp), wr_type(wqe));
+	}
+}
+
+int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe)
+{
+	printk(KERN_ERR "local WR's not yet implemented\n"); /* stub only: neither @qp nor @wqe is used */
+	BUG();		/* deliberately fatal: siw_qp_sq_process() calls this for WQEs failing SIW_WQE_IS_TX() */
+	return 0;	/* satisfies the int return type after BUG() */
+}
+
+
+/*
+ * siw_qp_sq_process()
+ *
+ * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket.
+ * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more
+ * MPA FPDUs, each containing a DDP segment.
+ *
+ * SQ processing may occur in user context as a result of posting
+ * new WQE's or from siw_sq_work_handler() context.
+ *
+ * SQ processing may get paused anytime, possibly in the middle of a WR
+ * or FPDU, if insufficient send space is available. SQ processing
+ * gets resumed from siw_sq_work_handler(), if send space becomes
+ * available again.
+ *
+ * Must be called with the QP state read-locked.
+ *
+ * Returns 0 if processing completed or was paused (-EAGAIN from the
+ * send path is mapped to 0), or the negative error which made WQE
+ * processing fail.
+ *
+ * @qp:		QP whose SQ is to be processed
+ * @user_ctx:	non-zero, if called from user context: limits the number
+ *		of WQE's processed to SQ_USER_MAXBURST before the work
+ *		is handed over to the SQ work queue
+ *
+ * TODO:
+ * To be solved more seriously: an outbound RREQ can be satisfied
+ * by the corresponding RRESP _before_ it gets assigned to the ORQ.
+ * This happens regularly in RDMA READ via loopback case. Since both
+ * outbound RREQ and inbound RRESP can be handled by the same CPU
+ * locking the ORQ is dead-lock prone and thus not an option.
+ * Tentatively, the RREQ gets assigned to the ORQ _before_ being
+ * sent (and pulled back in case of send failure).
+ */
+int siw_qp_sq_process(struct siw_qp *qp, int user_ctx)
+{
+	struct siw_wqe		*wqe;
+	enum siw_wr_opcode	tx_type;
+	unsigned long		flags;
+	int			rv = 0;
+	int			max_burst;
+
+	if (user_ctx)
+		max_burst = SQ_USER_MAXBURST;
+	else
+		max_burst = max(qp->attrs.sq_size, qp->attrs.ird);
+
+	/*
+	 * Serialize SQ processing for this QP.
+	 *
+	 * NOTE(review): with more than one waiter, in_use may never read
+	 * as 1 for any of them - looks like this can stall; confirm.
+	 */
+	atomic_inc(&qp->tx_ctx.in_use);
+
+	wait_event(qp->tx_ctx.waitq, atomic_read(&qp->tx_ctx.in_use) == 1);
+
+	wqe = tx_wqe(qp);
+	BUG_ON(wqe == NULL);
+
+next_wqe:
+	/*
+	 * Stop QP processing if SQ state changed
+	 */
+	if (unlikely(qp->tx_ctx.tx_suspend)) {
+		dprint(DBG_WR|DBG_TX, "(QP%d): tx suspend\n", QP_ID(qp));
+		goto done;
+	}
+	tx_type = wr_type(wqe);
+
+	dprint(DBG_WR|DBG_TX,
+		" QP(%d): WR type %d, state %d, data %u, sent %u, id %llu\n",
+		QP_ID(qp), wr_type(wqe), wqe->wr_status, wqe->bytes,
+		wqe->processed, (unsigned long long)wr_id(wqe));
+
+	if (SIW_WQE_IS_TX(wqe))
+		rv = siw_qp_sq_proc_tx(qp, wqe);
+	else
+		rv = siw_qp_sq_proc_local(qp, wqe);
+
+	if (!rv) {
+		/*
+		 * WQE processing done
+		 */
+		switch (tx_type) {
+
+		case SIW_WR_SEND:
+		case SIW_WR_RDMA_WRITE:
+
+			wqe->wc_status = IB_WC_SUCCESS;
+			wqe->wr_status = SR_WR_DONE;
+			siw_wqe_sq_processed(wqe, qp);
+			break;
+
+		case SIW_WR_RDMA_READ_REQ:
+			/*
+			 * already enqueued to ORQ queue
+			 */
+			break;
+
+		case SIW_WR_RDMA_READ_RESP:
+			/*
+			 * silently recycle wqe
+			 */
+			/* XXX DEBUG AID, please remove */
+			wqe->wr_status = SR_WR_DONE;
+			siw_wqe_put(wqe);
+			break;
+		default:
+			BUG();
+		}
+
+		/* fetch next WQE to be processed, if any */
+		lock_sq_rxsave(qp, flags);
+
+		wqe = siw_next_tx_wqe(qp);
+		if (!wqe) {
+			tx_wqe(qp) = NULL;
+			unlock_sq_rxsave(qp, flags);
+			goto done;
+		}
+		if (wr_type(wqe) == SIW_WR_RDMA_READ_REQ) {
+			if (ORD_SUSPEND_SQ(qp)) {
+				tx_wqe(qp) = NULL;
+				unlock_sq_rxsave(qp, flags);
+				dprint(DBG_WR|DBG_TX,
+					" QP%d PAUSE SQ: ORD limit\n",
+					QP_ID(qp));
+				goto done;
+			} else {
+				tx_wqe(qp) = wqe;
+				siw_rreq_queue(wqe, qp);
+			}
+		} else  {
+			list_del_init(&wqe->list);
+			tx_wqe(qp) = wqe;
+		}
+		unlock_sq_rxsave(qp, flags);
+
+		if (--max_burst == 0) {
+			if (user_ctx) {
+				/*
+				 * Avoid to keep the user sending from its
+				 * context for too long (blocking user thread)
+				 */
+				siw_sq_queue_work(qp);
+				goto done;
+			} else {
+				/*
+				 * Avoid to starve other QP's tx if consumer
+				 * keeps posting new tx work for current cpu.
+				 */
+				int workq_len =
+				    atomic_read(&get_cpu_var(siw_workq_len));
+
+				put_cpu_var(siw_workq_len);
+
+				if (workq_len) {
+					/* Another QP's work on same WQ */
+					siw_sq_queue_work(qp);
+					goto done;
+				}
+			}
+			max_burst = max(qp->attrs.sq_size, qp->attrs.ird);
+		}
+		goto next_wqe;
+
+	} else if (rv == -EAGAIN) {
+		/* paused, not failed: report success to the caller */
+		dprint(DBG_WR|DBG_TX,
+			"(QP%d): SQ paused: hd/tr %d of %d, data %d\n",
+			QP_ID(qp), qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len,
+			qp->tx_ctx.bytes_unsent);
+		rv = 0;
+		goto done;
+	} else {
+		/*
+		 * WQE processing failed.
+		 * Verbs 8.3.2:
+		 * o It turns any WQE into a signalled WQE.
+		 * o Local catastrophic error must be surfaced
+		 * o QP must be moved into Terminate state: done by code
+		 *   doing socket state change processing
+		 *
+		 * o TODO: Termination message must be sent.
+		 * o TODO: Implement more precise work completion errors,
+		 *         see enum ib_wc_status in ib_verbs.h
+		 */
+		dprint(DBG_ON, " (QP%d): WQE type %d processing failed: %d\n",
+				QP_ID(qp), wr_type(wqe), rv);
+
+		lock_sq_rxsave(qp, flags);
+		/*
+		 * RREQ may have already been completed by inbound RRESP!
+		 *
+		 * tx_type holds an enum siw_wr_opcode (from wr_type()),
+		 * so it must be tested against SIW_WR_RDMA_READ_REQ like
+		 * everywhere else in this function, not against the
+		 * RDMAP wire opcode.
+		 */
+		if (tx_type == SIW_WR_RDMA_READ_REQ) {
+			lock_orq(qp);
+			if (!ORQ_EMPTY(qp) &&
+			    wqe == list_entry_wqe(qp->orq.prev)) {
+				/*
+				 * wqe still on the ORQ
+				 * TODO: fix a potential race condition if the
+				 * rx path is currently referencing the wqe(!)
+				 */
+				dprint(DBG_ON, " (QP%d): Bad RREQ in ORQ\n",
+					QP_ID(qp));
+				list_del_init(&wqe->list);
+				unlock_orq(qp);
+			} else {
+				/*
+				 * already completed by inbound RRESP
+				 */
+				dprint(DBG_ON,
+					" (QP%d): Bad RREQ already Completed\n",
+					QP_ID(qp));
+				unlock_orq(qp);
+				tx_wqe(qp) = NULL;
+				unlock_sq_rxsave(qp, flags);
+
+				goto done;
+			}
+		}
+		tx_wqe(qp) = NULL;
+		unlock_sq_rxsave(qp, flags);
+		/*
+		 * immediately suspends further TX processing
+		 */
+		if (!qp->tx_ctx.tx_suspend)
+			siw_qp_cm_drop(qp, 0);
+
+		switch (tx_type) {
+
+		case SIW_WR_SEND:
+		case SIW_WR_RDMA_WRITE:
+		case SIW_WR_RDMA_READ_REQ:
+			wqe->wr_status = SR_WR_DONE;
+			wqe->wc_status = IB_WC_LOC_QP_OP_ERR;
+			wqe->error = rv;
+			wr_flags(wqe) |= IB_SEND_SIGNALED;
+			if (tx_type != SIW_WR_RDMA_READ_REQ)
+				/*
+				 * RREQ already enqueued to ORQ queue
+				 */
+				siw_wqe_sq_processed(wqe, qp);
+
+			siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
+
+			break;
+
+		case SIW_WR_RDMA_READ_RESP:
+			/*
+			 * Recycle wqe
+			 */
+			dprint(DBG_WR|DBG_TX|DBG_ON, "(QP%d): "
+				   "Processing RRESPONSE failed with %d\n",
+				    QP_ID(qp), rv);
+
+			siw_async_ev(qp, NULL, IB_EVENT_QP_REQ_ERR);
+
+			siw_wqe_put(wqe);
+			break;
+
+		default:
+			BUG();
+		}
+	}
+done:
+	/* let the next waiter in */
+	atomic_dec(&qp->tx_ctx.in_use);
+	wake_up(&qp->tx_ctx.waitq);
+
+	return rv;
+}
+
+static struct workqueue_struct *siw_sq_wq;
+
+/*
+ * siw_sq_worker_init()
+ *
+ * Create the work queue used for SQ processing.
+ * Returns 0 on success, -ENOMEM if the work queue cannot be created.
+ */
+int __init siw_sq_worker_init(void)
+{
+	siw_sq_wq = create_workqueue("siw_sq_wq");
+
+	if (siw_sq_wq) {
+		dprint(DBG_TX|DBG_OBJ, " Init WQ\n");
+		return 0;
+	}
+	return -ENOMEM;
+}
+
+
+/*
+ * siw_sq_worker_exit()
+ *
+ * Flush and destroy the SQ work queue, if it was created.
+ */
+void __exit siw_sq_worker_exit(void)
+{
+	dprint(DBG_TX|DBG_OBJ, " Destroy WQ\n");
+
+	if (!siw_sq_wq)
+		return;
+
+	flush_workqueue(siw_sq_wq);
+	destroy_workqueue(siw_sq_wq);
+}
+
+
+/*
+ * siw_sq_work_handler()
+ *
+ * Scheduled by siw_qp_llp_write_space() socket callback if socket
+ * send space became available again. This function resumes SQ
+ * processing.
+ *
+ * SQ processing is only resumed if the QP state lock can be taken for
+ * reading without blocking, the QP is in RTS state and tx is not
+ * suspended. If SQ processing fails, the connection gets dropped.
+ * The QP reference is released in any case.
+ */
+static void siw_sq_work_handler(struct work_struct *w)
+{
+	struct siw_sq_work	*sq_work;
+	struct siw_qp		*qp;
+	int			rv;
+
+	/* one work item less pending for this CPU */
+	atomic_dec(&get_cpu_var(siw_workq_len));
+	put_cpu_var(siw_workq_len);
+
+	sq_work = container_of(w, struct siw_sq_work, work);
+	qp = container_of(sq_work, struct siw_qp, sq_work);
+
+	dprint(DBG_TX|DBG_OBJ, "(QP%d)\n", QP_ID(qp));
+
+	if (!down_read_trylock(&qp->state_lock)) {
+		dprint(DBG_ON|DBG_TX, "(QP%d): QP locked\n", QP_ID(qp));
+		goto out;
+	}
+	if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS ||
+		     qp->tx_ctx.tx_suspend)) {
+		dprint(DBG_ON|DBG_TX, "(QP%d): state: %d %d\n",
+			QP_ID(qp), qp->attrs.state,
+				qp->tx_ctx.tx_suspend);
+		up_read(&qp->state_lock);
+		goto out;
+	}
+
+	rv = siw_qp_sq_process(qp, 0);
+	up_read(&qp->state_lock);
+
+	if (rv < 0) {
+		dprint(DBG_TX, "(QP%d): failed: %d\n",
+			QP_ID(qp), rv);
+
+		if (!qp->tx_ctx.tx_suspend)
+			siw_qp_cm_drop(qp, 0);
+	}
+out:
+	siw_qp_put(qp);
+}
+
+
+/*
+ * siw_sq_queue_work()
+ *
+ * Queue SQ processing of @qp to the SQ work queue. A QP reference is
+ * taken here and released by siw_sq_work_handler().
+ *
+ * If called from softirq context, the work is not queued to the
+ * current CPU: it goes to the CPU last used for this QP, or, if that
+ * is the current one, to another online CPU if there is any.
+ *
+ * Returns the result of queue_work_on().
+ *
+ * NOTE(review): INIT_WORK() is done on each call; this looks unsafe if
+ * the work item can still be pending from an earlier call - confirm.
+ */
+int siw_sq_queue_work(struct siw_qp *qp)
+{
+	int cpu, rv;
+
+	dprint(DBG_TX|DBG_OBJ, "(QP%d)\n", QP_ID(qp));
+
+	siw_qp_get(qp);
+
+	INIT_WORK(&qp->sq_work.work, siw_sq_work_handler);
+
+	cpu = get_cpu();
+
+	if (in_softirq()) {
+		if (cpu == qp->cpu) {
+			int other;
+
+			/*
+			 * Try not to use the current CPU for tx traffic.
+			 * A separate cursor is used so that 'cpu' keeps a
+			 * valid value if no other CPU is online: after a
+			 * complete run of for_each_online_cpu() the cursor
+			 * itself is not a usable CPU id.
+			 */
+			for_each_online_cpu(other) {
+				if (other != qp->cpu) {
+					cpu = other;
+					break;
+				}
+			}
+		} else
+			cpu = qp->cpu;
+	}
+	/* balanced by the atomic_dec() in siw_sq_work_handler() */
+	atomic_inc(&per_cpu(siw_workq_len, cpu));
+	rv = queue_work_on(cpu, siw_sq_wq, &qp->sq_work.work);
+	/*
+	 * Remember CPU: Avoid spreading SQ work of QP over WQ's
+	 */
+	qp->cpu = cpu;
+
+	put_cpu();
+
+	return rv;
+}
-- 
1.5.4.3


^ permalink raw reply related

* [PATCH] SIW: Receive path
From: Bernard Metzler @ 2010-10-05  6:55 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_qp_rx.c | 1493 +++++++++++++++++++++++++++++++++
 1 files changed, 1493 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_qp_rx.c

diff --git a/drivers/infiniband/hw/siw/siw_qp_rx.c b/drivers/infiniband/hw/siw/siw_qp_rx.c
new file mode 100644
index 0000000..dd9edd4
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_qp_rx.c
@@ -0,0 +1,1493 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *          Fredy Neeser <nfd-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_umem.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+#include "siw_cm.h"
+
+
+/*
+ * ----------------------------
+ * DDP reassembly for Softiwarp
+ * ----------------------------
+ * For the ordering of transmitted DDP segments, the relevant iWARP ordering
+ * rules are as follows:
+ *
+ * - RDMAP (RFC 5040): Section 7.5, Rule 17:
+ *   "RDMA Read Response Message processing at the Remote Peer (reading
+ *    the specified Tagged Buffer) MUST be started only after the RDMA
+ *    Read Request Message has been Delivered by the DDP layer (thus,
+ *    all previous RDMA Messages have been properly submitted for
+ *    ordered Placement)."
+ *
+ * - DDP (RFC 5041): Section 5.3:
+ *   "At the Data Source, DDP:
+ *    o MUST transmit DDP Messages in the order they were submitted to
+ *      the DDP layer,
+ *    o SHOULD transmit DDP Segments within a DDP Message in increasing
+ *      MO order for Untagged DDP Messages, and in increasing TO order
+ *      for Tagged DDP Messages."
+ *
+ * Combining these rules implies that, although RDMAP does not provide
+ * ordering between operations that are generated from the two ends of an
+ * RDMAP stream, DDP *must not* transmit an RDMA Read Response Message before
+ * it has finished transmitting SQ operations that were already submitted
+ * to the DDP layer. It follows that an iWARP transmitter must fully
+ * serialize RDMAP messages belonging to the same QP.
+ *
+ * Given that a TCP socket receives DDP segments in peer transmit order,
+ * we obtain the following ordering of received DDP segments:
+ *
+ * (i)  the received DDP segments of RDMAP messages for the same QP
+ *      cannot be interleaved
+ * (ii) the received DDP segments of a single RDMAP message *should*
+ *      arrive in order.
+ *
+ * The Softiwarp transmitter obeys rule #2 in DDP Section 5.3.
+ * With this property, the "should" becomes a "must" in (ii) above,
+ * which simplifies DDP reassembly considerably.
+ * The Softiwarp receiver currently relies on this property
+ * and reports an error if DDP segments of the same RDMAP message
+ * do not arrive in sequence.
+ */
+
+/*
+ * siw_crc_rxhdr()
+ *
+ * (Re)start the receive CRC computation and feed it with the part of
+ * the header received so far (ctx->fpdu_part_rcvd bytes).
+ * Returns the result of siw_crc_array().
+ */
+static inline int siw_crc_rxhdr(struct siw_iwarp_rx *ctx)
+{
+	int rv;
+
+	crypto_hash_init(&ctx->mpa_crc_hd);
+
+	rv = siw_crc_array(&ctx->mpa_crc_hd, (u8 *)&ctx->hdr,
+			   ctx->fpdu_part_rcvd);
+	return rv;
+}
+
+
+/*
+ * siw_rx_umem_init()
+ *
+ * Given memory region @mr and tagged offset @t_off within @mr,
+ * resolve corresponding ib_umem_chunk memory chunk pointer
+ * and update receive context variables to point at receive position.
+ * Returns 0 on success, or -EINVAL if the chunk list of @mr is too
+ * short to contain the page addressed by @t_off.
+ *
+ * NOTE: This function expects virtual addresses.
+ * TODO: Function needs generalization to support relative addressing
+ *       aka "ZBVA".
+ *
+ * @rctx:	Receive Context to be updated
+ * @mr:		Memory Region
+ * @t_off:	Offset within Memory Region
+ *
+ */
+static int siw_rx_umem_init(struct siw_iwarp_rx *rctx, struct siw_mr *mr,
+			    u64 t_off)
+{
+	struct ib_umem_chunk	*chunk,
+				*found = NULL; /* chunk holding the page */
+	u64			off_mr;   /* offset into MR */
+	int			psge_idx; /* Index of PSGE */
+
+	off_mr = t_off - (mr->mem.va & PAGE_MASK);
+	/*
+	 * Equivalent to
+	 * off_mr = t_off - mr->mem.va;
+	 * off_mr += mr->umem->offset;
+	 */
+
+	/*
+	 * Skip pages not referenced by t_off
+	 *
+	 * NOTE(review): presumably the caller already verified that
+	 * t_off lies within the MR, so the index fits an int - confirm.
+	 */
+	psge_idx = off_mr >> PAGE_SHIFT;
+
+	/*
+	 * After a complete run of list_for_each_entry() the cursor does
+	 * not point to a real chunk and must not be dereferenced:
+	 * remember a hit explicitly.
+	 */
+	list_for_each_entry(chunk, &mr->umem->chunk_list, list) {
+		if (psge_idx < chunk->nents) {
+			found = chunk;
+			break;
+		}
+		psge_idx -= chunk->nents;
+	}
+	if (!found) {
+		dprint(DBG_MM|DBG_ON, "(QP%d): Short chunk list\n",
+			RX_QPID(rctx));
+		return -EINVAL;
+	}
+	rctx->pg_idx = psge_idx;
+	rctx->pg_off = off_mr & ~PAGE_MASK;
+	rctx->umem_chunk = found;
+
+	dprint(DBG_MM, "(QP%d): New chunk, idx %d\n", RX_QPID(rctx), psge_idx);
+	return 0;
+}
+
+
+/*
+ * siw_rx_umem()
+ *
+ * Receive data of @len into target referenced by @rctx.
+ * Data are copied page by page from rctx->skb, starting at
+ * rctx->skb_offset, to the page referenced by rctx->umem_chunk and
+ * rctx->pg_idx at offset rctx->pg_off. If CRC is enabled, the data
+ * placed are added to the CRC computation.
+ *
+ * This function does not check if umem is within bounds requested by
+ * @len and @t_off. @umem_ends indicates if routine should
+ * not update chunk position pointers after the point it is
+ * currently receiving
+ *
+ * Returns the number of bytes placed, or -EFAULT if skb_copy_bits()
+ * or siw_crc_sg() returned non-zero. The skb counters of @rctx
+ * are updated in both cases; chunk pointer and page offset are only
+ * stored back if no error occurred.
+ *
+ * @rctx:	Receive Context
+ * @len:	Number of bytes to place
+ * @umem_ends:	1, if rctx chunk pointer should not be updated after len.
+ */
+static int siw_rx_umem(struct siw_iwarp_rx *rctx, int len, int umem_ends)
+{
+	struct scatterlist	*p_list;
+	void			*dest;
+	struct ib_umem_chunk    *chunk = rctx->umem_chunk;
+	int			pg_off = rctx->pg_off,
+				copied = 0,
+				bytes,
+				rv;
+
+	while (len) {
+		bytes  = min(len, (int)PAGE_SIZE - pg_off); /* never cross a page boundary */
+		p_list = &chunk->page_list[rctx->pg_idx];
+
+		dest = kmap_atomic(sg_page(p_list), KM_SOFTIRQ0);
+
+		rv = skb_copy_bits(rctx->skb, rctx->skb_offset, dest + pg_off,
+				   bytes);
+
+		dprint(DBG_RX, "(QP%d): Page #%d, "
+			"bytes=%u, rv=%d returned by skb_copy_bits()\n",
+			RX_QPID(rctx), rctx->pg_idx, bytes, rv);
+
+		if (likely(!rv)) {
+			if (rctx->crc_enabled)
+				rv = siw_crc_sg(&rctx->mpa_crc_hd, p_list,
+						pg_off, bytes);
+
+			rctx->skb_offset += bytes; /* advanced even if the CRC update just failed */
+			copied += bytes;
+			len -= bytes;
+			pg_off += bytes;
+		}
+
+		kunmap_atomic(dest, KM_SOFTIRQ0);
+
+		if (unlikely(rv)) {
+			rctx->skb_copied += copied; /* account for what was placed before the failure */
+			rctx->skb_new -= copied;
+			copied = -EFAULT;
+
+			dprint(DBG_RX|DBG_ON, "(QP%d): failed with %d\n",
+				RX_QPID(rctx), rv);
+
+			goto out;
+		}
+		if (pg_off == PAGE_SIZE) {
+			/*
+			 * end of page
+			 */
+			pg_off = 0;
+			/*
+			 * reference next page chunk if
+			 * - all pages in chunk used AND
+			 * - current loop fills more into this umem
+			 *   OR the next receive will go into this umem
+			 *   starting at the position where we are leaving
+			 *   the routine.
+			 */
+			if (++rctx->pg_idx == chunk->nents &&
+				(len > 0 || !umem_ends)) {
+
+				rctx->pg_idx = 0;
+				chunk = mem_chunk_next(chunk);
+			}
+		}
+	}
+	/*
+	 * store chunk position for resume
+	 */
+	rctx->umem_chunk = chunk;
+	rctx->pg_off = pg_off;
+
+	rctx->skb_copied += copied;
+	rctx->skb_new -= copied;
+out:
+	return copied;
+}
+
+
+/*
+ * siw_rresp_check_ntoh()
+ *
+ * Convert sink STag and sink TO of an incoming RRESP header to host
+ * byte order (in place) and compare them with the values expected.
+ * For the first DDP segment, the values expected are taken from the
+ * first SGE of the READ WQE. If more DDP segments follow, the TO
+ * expected next is advanced; otherwise the total length received must
+ * match the length of the READ WQE.
+ *
+ * Returns 0 if the header is as expected, -EINVAL otherwise.
+ *
+ * NOTE: This function must be called only if a RRESP DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static inline int siw_rresp_check_ntoh(struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe		*rreq = rctx->dest.wqe;
+	struct iwarp_rdma_rresp	*hdr = &rctx->hdr.rresp;
+
+	hdr->sink_to   = be64_to_cpu(hdr->sink_to);
+	hdr->sink_stag = be32_to_cpu(hdr->sink_stag);
+
+	if (rctx->first_ddp_seg) {
+		rctx->ddp_to   = rreq->wr.rread.sge[0].addr;
+		rctx->ddp_stag = rreq->wr.rread.sge[0].lkey;
+	}
+
+	/*
+	 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+	 * for both STag and TO mismatch
+	 */
+	if (hdr->sink_stag != rctx->ddp_stag) {
+		dprint(DBG_RX|DBG_ON,
+			" received STAG=%08x, expected STAG=%08x\n",
+			hdr->sink_stag, rctx->ddp_stag);
+		return -EINVAL;
+	}
+	if (hdr->sink_to != rctx->ddp_to) {
+		dprint(DBG_RX|DBG_ON,
+			" received TO=%016llx, expected TO=%016llx\n",
+			(unsigned long long)hdr->sink_to,
+			(unsigned long long)rctx->ddp_to);
+		return -EINVAL;
+	}
+
+	if (rctx->more_ddp_segs) {
+		/* TO expected for the next DDP segment */
+		rctx->ddp_to += rctx->fpdu_part_rem;
+		return 0;
+	}
+	/* last DDP segment: total length must match the request */
+	if (rreq->processed + rctx->fpdu_part_rem == rreq->bytes)
+		return 0;
+
+	dprint(DBG_RX|DBG_ON,
+		" RRESP length does not match RREQ, "
+		"peer sent=%d, expected %d\n",
+		rreq->processed + rctx->fpdu_part_rem, rreq->bytes);
+	return -EINVAL;
+}
+
+/*
+ * siw_write_check_ntoh()
+ *
+ * Convert sink STag and sink TO of an incoming WRITE header to host
+ * byte order (in place). The first DDP segment defines the values
+ * expected; for any further segment, the header values are compared
+ * with them. If more DDP segments follow, the TO expected next is
+ * advanced.
+ *
+ * Returns 0 if the header is as expected, -EINVAL otherwise.
+ *
+ * NOTE: This function must be called only if a WRITE DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static inline int siw_write_check_ntoh(struct siw_iwarp_rx *rctx)
+{
+	struct iwarp_rdma_write	*hdr = &rctx->hdr.rwrite;
+
+	hdr->sink_to   = be64_to_cpu(hdr->sink_to);
+	hdr->sink_stag = be32_to_cpu(hdr->sink_stag);
+
+	/*
+	 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+	 * for both STag and TO mismatch
+	 */
+	if (rctx->first_ddp_seg) {
+		rctx->ddp_to   = hdr->sink_to;
+		rctx->ddp_stag = hdr->sink_stag;
+	} else if (hdr->sink_stag != rctx->ddp_stag) {
+		dprint(DBG_RX|DBG_ON,
+			" received STAG=%08x, expected STAG=%08x\n",
+			hdr->sink_stag, rctx->ddp_stag);
+		return -EINVAL;
+	} else if (hdr->sink_to != rctx->ddp_to) {
+		dprint(DBG_RX|DBG_ON,
+			" received TO=%016llx, expected TO=%016llx\n",
+			(unsigned long long)hdr->sink_to,
+			(unsigned long long)rctx->ddp_to);
+		return -EINVAL;
+	}
+
+	/* TO expected for the next incoming DDP segment */
+	if (rctx->more_ddp_segs)
+		rctx->ddp_to += rctx->fpdu_part_rem;
+
+	return 0;
+}
+
+/*
+ * siw_send_check_ntoh()
+ *
+ * Check incoming SEND fragment header against expected
+ * header values and update expected MSN if no next
+ * fragment expected
+ *
+ * MSN, MO and QN of the header are converted to host byte order in
+ * place. The QN must be RDMAP_UNTAGGED_QN_SEND, the MSN must match the
+ * MSN expected for that queue, the MO must match the number of bytes
+ * already processed for the receive WQE, and the data must fit into
+ * the remaining receive space. For the first DDP segment, the write
+ * position within the receive SGL is reset.
+ *
+ * Returns 0 if the header is as expected, -EINVAL otherwise. If the
+ * receive space is too short, wqe->wc_status is set to
+ * IB_WC_LOC_LEN_ERR in addition.
+ *
+ * NOTE: This function must be called only if a SEND DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static inline int siw_send_check_ntoh(struct siw_iwarp_rx *rctx)
+{
+	struct iwarp_send	*send = &rctx->hdr.send;
+	struct siw_wqe		*wqe = rctx->dest.wqe;
+
+	send->ddp_msn = be32_to_cpu(send->ddp_msn);
+	send->ddp_mo  = be32_to_cpu(send->ddp_mo);
+	send->ddp_qn  = be32_to_cpu(send->ddp_qn);
+
+	if (send->ddp_qn != RDMAP_UNTAGGED_QN_SEND) {
+		dprint(DBG_RX|DBG_ON, " Invalid DDP QN %d for SEND\n",
+			send->ddp_qn);
+		return -EINVAL;
+	}
+	if (send->ddp_msn != rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]) {
+		/* received value first, expected value second */
+		dprint(DBG_RX|DBG_ON, " received MSN=%d, expected MSN=%d\n",
+			send->ddp_msn, rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+		/*
+		 * TODO: Error handling
+		 * async_event= RI_EVENT_QP_RQ_PROTECTION_ERROR_MSN_GAP;
+		 * cmpl_status= RI_WC_STATUS_LOCAL_QP_CATASTROPHIC;
+		 */
+		return -EINVAL;
+	}
+	if (send->ddp_mo != wqe->processed) {
+		dprint(DBG_RX|DBG_ON, " Received MO=%u, expected MO=%u\n",
+			send->ddp_mo, wqe->processed);
+		/*
+		 * Verbs: RI_EVENT_QP_LLP_INTEGRITY_ERROR_BAD_FPDU
+		 */
+		return -EINVAL;
+	}
+	if (rctx->first_ddp_seg) {
+		/* initialize user memory write position */
+		rctx->sge_idx = 0;
+		rctx->sge_off = 0;
+	}
+	if (wqe->bytes < wqe->processed + rctx->fpdu_part_rem) {
+		dprint(DBG_RX|DBG_ON, " Receive space short: %d < %d\n",
+			wqe->bytes - wqe->processed, rctx->fpdu_part_rem);
+		wqe->wc_status = IB_WC_LOC_LEN_ERR;
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * siw_get_rqe()
+ *
+ * Fetch the next receive WQE for an inbound SEND: from the shared
+ * receive queue if the QP is attached to one, from the head of the
+ * QP's own receive queue otherwise. The WQE taken from the QP's
+ * receive queue is unlinked under the RQ lock.
+ *
+ * Returns the WQE, or NULL if the queue in use is empty.
+ */
+static inline struct siw_wqe *siw_get_rqe(struct siw_qp *qp)
+{
+	struct siw_wqe	*rqe;
+
+	if (qp->srq) {
+		rqe = siw_srq_fetch_wqe(qp);
+		if (!rqe)
+			dprint(DBG_RX, " QP(%d): SRQ empty!\n", QP_ID(qp));
+
+		return rqe;
+	}
+	lock_rq(qp);
+
+	if (list_empty(&qp->rq)) {
+		unlock_rq(qp);
+		dprint(DBG_RX, " QP(%d): RQ empty!\n", QP_ID(qp));
+
+		return NULL;
+	}
+	rqe = list_first_wqe(&qp->rq);
+	list_del_init(&rqe->list);
+
+	unlock_rq(qp);
+
+	return rqe;
+}
+
+
+/*
+ * siw_proc_send:
+ *
+ * Process one incoming SEND and place data into memory referenced by
+ * receive wqe.
+ *
+ * Function supports partially received sends (suspending/resuming
+ * current receive wqe processing)
+ *
+ * On the first DDP segment of a message a receive WQE is fetched and
+ * attached to the QP's receive context; later calls continue with
+ * that WQE. If called in state SIW_GET_DATA_START the SEND header is
+ * checked first. Payload is then placed SGE by SGE, each SGE being
+ * access-checked against the PD of the QP (or of the SRQ if present).
+ *
+ * return value:
+ *	0:       reached the end of a DDP segment
+ *	-EAGAIN: to be called again to finish the DDP segment
+ *	-ENOENT: no receive WQE available
+ *	other negative value: header check, memory check or data
+ *	         placement failed
+ */
+int siw_proc_send(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe	*wqe;
+	struct siw_sge	*sge;
+	struct siw_mr	*mr;
+	u32		data_bytes,	/* all data bytes available */
+			rcvd_bytes;	/* sum of data bytes rcvd */
+	int		rv = 0;
+
+	if (rctx->first_ddp_seg) {
+		WARN_ON(rx_wqe(qp) != NULL);
+
+		wqe = siw_get_rqe(qp);
+		if (!wqe)
+			return -ENOENT;
+
+		rx_wqe(qp) = wqe;
+		wqe->wr_status = SR_WR_INPROGRESS;
+	} else  {
+		wqe = rx_wqe(qp);
+		if (!wqe) {
+			/*
+			 * this is a siw bug!
+			 */
+			dprint(DBG_ON, "QP(%d): RQ failure\n", QP_ID(qp));
+			return -EPROTO;
+		}
+	}
+	if (rctx->state == SIW_GET_DATA_START) {
+		rv = siw_send_check_ntoh(rctx);
+		if (rv) {
+			siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
+			return rv;
+		}
+		if (!rctx->fpdu_part_rem) /* zero length SEND */
+			return 0;
+	}
+	/* place no more than what this skb holds of the current FPDU */
+	data_bytes = min(rctx->fpdu_part_rem, rctx->skb_new);
+	rcvd_bytes = 0;
+
+	while (data_bytes) {
+		struct siw_pd	*pd;
+		u32	sge_bytes;	/* data bytes avail for SGE */
+		int	umem_ends;	/* 1 if umem ends with current rcv */
+
+		sge = &wqe->wr.sgl.sge[rctx->sge_idx];
+
+		if (!sge->len) {
+			/* just skip empty sge's */
+			rctx->sge_idx++;
+			rctx->sge_off = 0;
+			continue;
+		}
+		sge_bytes = min(data_bytes, sge->len - rctx->sge_off);
+
+		/*
+		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
+		 */
+		pd = qp->srq == NULL ? qp->pd : qp->srq->pd;
+
+		rv = siw_check_sge(pd, sge, SR_MEM_LWRITE, rctx->sge_off,
+				   sge_bytes);
+		if (rv) {
+			siw_async_ev(qp, NULL, IB_EVENT_QP_ACCESS_ERR);
+			break;
+		}
+		mr = siw_mem2mr(sge->mem.obj);
+
+		if (rctx->sge_off == 0) {
+			/*
+			 * started a new sge: update receive pointers
+			 */
+			rv = siw_rx_umem_init(rctx, mr, sge->addr);
+			if (rv)
+				break;
+		}
+		/*
+		 * Are we going to finish placing
+		 * - the last fragment of the current SGE or
+		 * - the last DDP segment (L=1) of the current RDMAP message?
+		 *
+		 * siw_rx_umem() must advance umem page_chunk position
+		 * after successful receive only, if receive into current
+		 * umem does not end. umem ends, if:
+		 * - current SGE gets completely filled, OR
+		 * - current MPA FPDU is last AND gets consumed now
+		 *
+		 * fpdu_part_rem is decremented below as data gets placed,
+		 * so it always holds what is still outstanding for this
+		 * FPDU: the FPDU gets consumed now exactly if the bytes
+		 * placed in this step equal fpdu_part_rem.
+		 */
+		umem_ends = ((sge_bytes + rctx->sge_off == sge->len) ||
+			      (!rctx->more_ddp_segs &&
+			       sge_bytes == rctx->fpdu_part_rem)) ? 1 : 0;
+
+		rv = siw_rx_umem(rctx, sge_bytes, umem_ends);
+		if (rv != sge_bytes) {
+			/*
+			 * siw_rx_umem() must have updated
+			 * skb_new and skb_copied
+			 */
+			wqe->processed += rcvd_bytes;
+			return -EINVAL;
+		}
+		rctx->sge_off += rv;
+
+		if (rctx->sge_off == sge->len) {
+			/* SGE completely filled, continue with next one */
+			rctx->sge_idx++;
+			rctx->sge_off = 0;
+		}
+		data_bytes -= rv;
+		rcvd_bytes += rv;
+
+		rctx->fpdu_part_rem -= rv;
+		rctx->fpdu_part_rcvd += rv;
+	}
+	wqe->processed += rcvd_bytes;
+
+	if (!rctx->fpdu_part_rem)
+		return 0;
+
+	/* rv < 0 only if the loop was left with an error */
+	return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_proc_write:
+ *
+ * Place incoming WRITE after referencing and checking target buffer
+ *
+ * Function supports partially received WRITEs (suspending/resuming
+ * current receive processing)
+ *
+ * If called in state SIW_GET_DATA_START, the WRITE header is checked
+ * via siw_write_check_ntoh() unless the segment carries no payload.
+ * On the first DDP segment of a message the memory object is looked
+ * up by STag and attached to the QP's receive context; it is reused
+ * for all following fragments.
+ *
+ * return value:
+ *	0:       reached the end of a DDP segment
+ *	-EAGAIN: to be called again to finish the DDP segment
+ *	-EFAULT: a memory object was still attached when a new
+ *	         message started
+ *	-EINVAL: STag lookup, umem setup or data placement failed
+ *	other:   error code of siw_write_check_ntoh() or siw_check_mem()
+ */
+
+int siw_proc_write(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct siw_dev		*dev = qp->hdr.dev;
+	struct iwarp_rdma_write	*write = &rctx->hdr.rwrite;
+	struct siw_mem		*mem;
+	int			bytes,
+				last_write,
+				rv;
+
+	if (rctx->state == SIW_GET_DATA_START) {
+
+		/* NOTE: a zero length WRITE returns before its header is checked */
+		if (!rctx->fpdu_part_rem) /* zero length WRITE */
+			return 0;
+
+		rv = siw_write_check_ntoh(rctx);
+		if (rv) {
+			siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
+			return rv;
+		}
+	}
+	/* place no more than what this skb holds of the current FPDU */
+	bytes = min(rctx->fpdu_part_rem, rctx->skb_new);
+
+	/*
+	 * NOTE: bytes > 0 is always true, since this routine
+	 * gets only called if so.
+	 */
+	if (rctx->first_ddp_seg) {
+		/* DEBUG Code, to be removed */
+		if (rx_mem(qp) != 0) {
+			dprint(DBG_RX|DBG_ON, "(QP%d): Stale rctx state!\n",
+				QP_ID(qp));
+			return -EFAULT;
+		}
+		/* object lookup uses the STag without its low 8 bits */
+		rx_mem(qp) = siw_mem_id2obj(dev, rctx->ddp_stag >> 8);
+	}
+	if (rx_mem(qp) == NULL) {
+		dprint(DBG_RX|DBG_ON, "(QP%d): "
+			"Sink STag not found or invalid,  STag=0x%08x\n",
+			QP_ID(qp), rctx->ddp_stag);
+		return -EINVAL;
+	}
+	mem = rx_mem(qp);
+	/*
+	 * Rtag not checked against mem's tag again because
+	 * hdr check guarantees same tag as before if fragmented
+	 */
+	/* check only the part placed now: header TO plus bytes already placed */
+	rv = siw_check_mem(qp->pd, mem, write->sink_to + rctx->fpdu_part_rcvd,
+			   SR_MEM_RWRITE, bytes);
+	if (rv) {
+		siw_async_ev(qp, NULL, IB_EVENT_QP_ACCESS_ERR);
+		return rv;
+	}
+	if (rctx->first_ddp_seg) {
+		rv = siw_rx_umem_init(rctx, siw_mem2mr(mem), write->sink_to);
+		if (rv)
+			return -EINVAL;
+
+	} else if (!rctx->umem_chunk) {
+		/*
+		 * This should never happen.
+		 *
+		 * TODO: Remove tentative debug aid.
+		 */
+		dprint(DBG_RX|DBG_ON, "(QP%d): "
+			"Umem chunk not resolved!\n", QP_ID(qp));
+		return -EINVAL;
+	}
+	/*
+	 * Are we going to place the last piece of the last
+	 * DDP segment of the current RDMAP message?
+	 *
+	 * It is last if:
+	 * - rctx->fpdu_part_rem <= rctx->skb_new AND
+	 * - no more DDP segments are announced for this message
+	 */
+	last_write = ((rctx->fpdu_part_rem <= rctx->skb_new) &&
+		      !rctx->more_ddp_segs) ? 1 : 0;
+
+	rv = siw_rx_umem(rctx, bytes, last_write);
+	if (rv != bytes)
+		return -EINVAL;
+
+	rctx->fpdu_part_rem -= rv;
+	rctx->fpdu_part_rcvd += rv;
+
+	if (!rctx->fpdu_part_rem)
+		return 0;
+
+	/* rv equals bytes at this point (checked above) */
+	return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_proc_rreq:
+ *
+ * Data receive handler for inbound RREQ's. A READ REQUEST cannot
+ * carry user data, so any payload announced for the current FPDU
+ * is a protocol violation.
+ *
+ * return value:
+ *	0:       no payload present
+ *	-EPROTO: payload present
+ */
+int siw_proc_rreq(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	if (rctx->fpdu_part_rem) {
+		dprint(DBG_ON|DBG_RX, "(QP%d): RREQ with MPA len %d\n",
+			QP_ID(qp), rctx->hdr.ctrl.mpa_len);
+
+		return -EPROTO;
+	}
+	return 0;
+}
+
+/*
+ * siw_init_rresp:
+ *
+ * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE
+ * from the fields of the received READ REQUEST header.
+ * If no WQE is in transmit processing, the new WQE becomes the
+ * current one and transmit processing gets scheduled. Otherwise
+ * the WQE is appended to the tail of the IRQ.
+ *
+ * Can be called from softirq context and from process
+ * context (RREAD socket loopback case!)
+ *
+ * return value:
+ *	0:       success
+ *	-EPROTO: no READ RESPONSE WQE available
+ */
+
+int siw_init_rresp(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe	*rsp = siw_wqe_get(qp, SIW_WR_RDMA_READ_RESP);
+
+	if (!rsp) {
+		dprint(DBG_RX|DBG_ON, "(QP%d): IRD exceeded!\n", QP_ID(qp));
+		return -EPROTO;
+	}
+	/*
+	 * Data source of the response: length, address and STag
+	 * as requested by the peer.
+	 */
+	rsp->wr.rresp.sge.len = be32_to_cpu(rctx->hdr.rreq.read_size);
+	rsp->wr.rresp.sge.addr = be64_to_cpu(rctx->hdr.rreq.source_to);
+	rsp->wr.rresp.sge.lkey = be32_to_cpu(rctx->hdr.rreq.source_stag);
+	rsp->wr.rresp.sge.mem.obj = NULL;	/* defer lookup */
+
+	rsp->bytes = rsp->wr.rresp.sge.len;	/* redundant */
+	rsp->processed = 0;
+	rsp->wr.rresp.num_sge = rsp->bytes ? 1 : 0;
+
+	/*
+	 * Data sink at the peer: the sink STag is kept in
+	 * network byte order.
+	 */
+	rsp->wr.rresp.raddr = be64_to_cpu(rctx->hdr.rreq.sink_to);
+	rsp->wr.rresp.rtag = rctx->hdr.rreq.sink_stag; /* NBO */
+
+	rsp->wr_status = SR_WR_QUEUED;
+
+	/*
+	 * Insert into IRQ
+	 *
+	 * TODO: Revisit ordering of genuine SQ WRs and Read Response
+	 * pseudo-WRs. RDMAP specifies that there is no ordering among
+	 * the two directions of transmission, so there is a degree of
+	 * freedom.
+	 *
+	 * The current logic favours Read Responses over SQ work requests
+	 * that are queued but not already in progress.
+	 */
+	lock_sq(qp);
+
+	if (tx_wqe(qp)) {
+		/* transmit path busy: queue the response */
+		list_add_tail(&rsp->list, &qp->irq);
+		unlock_sq(qp);
+
+		return 0;
+	}
+	tx_wqe(qp) = rsp;
+	unlock_sq(qp);
+	/*
+	 * schedule TX work, even if SQ was suspended due to
+	 * ORD limit: it is always OK (and may even prevent peers
+	 * from appl lock) to send RRESPONSE's
+	 */
+	siw_sq_queue_work(qp);
+
+	return 0;
+}
+
+/*
+ * siw_proc_rresp:
+ *
+ * Place incoming RRESP data into memory referenced by RREQ WQE.
+ *
+ * Function supports partially received RRESP's (suspending/resuming
+ * current receive processing)
+ *
+ * On the first DDP segment of a message the pending READ REQUEST WQE
+ * is taken off the head of the ORQ and attached to the QP's receive
+ * context; later calls continue with that WQE.
+ *
+ * return value:
+ *	0:        reached the end of a DDP segment (or segment
+ *	          carries no payload)
+ *	-EAGAIN:  to be called again to finish the DDP segment
+ *	-ENODATA: no pending READ REQUEST found
+ *	other negative value: header check, memory check or data
+ *	          placement failed
+ */
+int siw_proc_rresp(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe	*wqe;
+	struct siw_mr	*mr;
+	struct siw_sge	*sge;
+	int		bytes,
+			is_last,
+			rv;
+
+	if (rctx->first_ddp_seg) {
+		WARN_ON(rx_wqe(qp) != NULL);
+		/*
+		 * fetch pending RREQ from orq
+		 */
+		lock_orq(qp);
+		if (!list_empty(&qp->orq)) {
+			wqe = list_first_entry(&qp->orq, struct siw_wqe, list);
+			list_del_init(&wqe->list);
+		} else {
+			unlock_orq(qp);
+			dprint(DBG_RX|DBG_ON, "(QP%d): ORQ empty\n",
+				QP_ID(qp));
+			/*
+			 * TODO: Should generate an async error
+			 */
+			rv = -ENODATA; /* or -ENOENT ? */
+			goto done;
+		}
+		unlock_orq(qp);
+
+		/* from here on the WQE is owned by the receive context */
+		rx_wqe(qp) = wqe;
+
+		/* head of ORQ must be an untouched READ REQUEST */
+		if (wr_type(wqe) != SIW_WR_RDMA_READ_REQ || wqe->processed) {
+			WARN_ON(wqe->processed);
+			WARN_ON(wr_type(wqe) != SIW_WR_RDMA_READ_REQ);
+			rv = -EINVAL;
+			goto done;
+		}
+
+		wqe->wr_status = SR_WR_INPROGRESS;
+
+		/*
+		 * NOTE(review): the header check is done in this branch
+		 * only, i.e. not for follow-up DDP segments of the same
+		 * message - confirm this is intended.
+		 */
+		rv = siw_rresp_check_ntoh(rctx);
+		if (rv) {
+			siw_async_ev(qp, NULL, IB_EVENT_QP_FATAL);
+			goto done;
+		}
+	} else {
+		wqe = rx_wqe(qp);
+		if (!wqe) {
+			WARN_ON(1);
+			rv = -ENODATA;
+			goto done;
+		}
+	}
+	if (!rctx->fpdu_part_rem) /* zero length RRESPONSE */
+		return 0;
+
+	/* place no more than what this skb holds of the current FPDU */
+	bytes = min(rctx->fpdu_part_rem, rctx->skb_new);
+	sge = wqe->wr.rread.sge; /* there is only one */
+
+	/*
+	 * check target memory which resolves memory on first fragment
+	 */
+	rv = siw_check_sge(qp->pd, sge, SR_MEM_LWRITE, wqe->processed, bytes);
+	if (rv) {
+		dprint(DBG_RX|DBG_ON, "(QP%d): siw_check_sge failed: %d\n",
+			QP_ID(qp), rv);
+		wqe->wc_status = IB_WC_LOC_PROT_ERR;
+		siw_async_ev(qp, NULL, IB_EVENT_QP_ACCESS_ERR);
+		goto done;
+	}
+	mr = siw_mem2mr(sge->mem.obj);
+
+	if (rctx->first_ddp_seg) {
+		rv = siw_rx_umem_init(rctx, mr, sge->addr);
+		if (rv) {
+			wqe->wc_status = IB_WC_LOC_PROT_ERR;
+			goto done;
+		}
+	} else if (!rctx->umem_chunk) {
+		/*
+		 * This should never happen.
+		 *
+		 * TODO: Remove tentative debug aid.
+		 */
+		dprint(DBG_RX|DBG_ON, "(QP%d): No target mem!\n", QP_ID(qp));
+		wqe->wc_status = IB_WC_GENERAL_ERR;
+		rv = -EPROTO;
+		goto done;
+	}
+	/*
+	 * Are we going to finish placing the last DDP segment (L=1)
+	 * of the current RDMAP message?
+	 *
+	 * NOTE: siw_rresp_check_ntoh() guarantees that the
+	 * last inbound RDMAP Read Response message exactly matches
+	 * with the RREQ WR.
+	 */
+	is_last = (bytes + wqe->processed == wqe->bytes) ? 1 : 0;
+
+	rv = siw_rx_umem(rctx,  bytes, is_last);
+	if (rv != bytes) {
+		wqe->wc_status = IB_WC_GENERAL_ERR;
+		rv = -EINVAL;
+		goto done;
+	}
+	rctx->fpdu_part_rem -= rv;
+	rctx->fpdu_part_rcvd += rv;
+
+	wqe->processed += rv;
+
+	if (!rctx->fpdu_part_rem)
+		return 0;
+done:
+	/* reached with rv < 0 on error, or rv == bytes if data is outstanding */
+	return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_drain_pkt()
+ *
+ * Discard payload of the current FPDU: consume up to
+ * rctx->fpdu_part_rem bytes from the current skb and update the
+ * receive context accordingly.
+ *
+ * Never consumes more than the skb holds (rctx->skb_new) and never
+ * copies more than the bounce buffer can take. If the skb runs out
+ * first, rctx->fpdu_part_rem stays non-zero and the caller has to
+ * come back with the next skb.
+ *
+ * NOTE(review): drained bytes are not fed into the MPA CRC here -
+ * confirm whether the CRC check of such FPDUs is expected to pass.
+ */
+static void siw_drain_pkt(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	char	buf[256];	/* small bounce buffer, lives on kernel stack */
+	int	len;
+
+	dprint(DBG_ON|DBG_RX, " (QP%d): drain %d bytes\n",
+		QP_ID(qp), rctx->fpdu_part_rem);
+
+	while (rctx->fpdu_part_rem > 0 && rctx->skb_new > 0) {
+		/* bound by FPDU remainder, skb content and buffer size */
+		len = min(rctx->fpdu_part_rem, rctx->skb_new);
+		len = min_t(int, len, sizeof(buf));
+
+		skb_copy_bits(rctx->skb, rctx->skb_offset, buf, len);
+
+		rctx->skb_copied += len;
+		rctx->skb_offset += len;
+		rctx->skb_new -= len;
+		rctx->fpdu_part_rem -= len;
+	}
+}
+
+/*
+ * siw_proc_unsupp:
+ *
+ * Data receive handler for opcodes without receive processing:
+ * warn and discard the payload.
+ *
+ * return value:
+ *	0:       reached the end of a DDP segment
+ *	-EAGAIN: to be called again to finish the DDP segment
+ */
+int siw_proc_unsupp(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	WARN_ON(1);
+	siw_drain_pkt(qp, rctx);
+
+	return rctx->fpdu_part_rem > 0 ? -EAGAIN : 0;
+}
+
+
+/*
+ * siw_proc_terminate:
+ *
+ * Data receive handler for inbound TERMINATE messages: log error
+ * type, layer and error code from the header and discard the payload.
+ *
+ * return value:
+ *	0:       reached the end of a DDP segment
+ *	-EAGAIN: to be called again to finish the DDP segment
+ */
+int siw_proc_terminate(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct iwarp_terminate	*term = &rctx->hdr.terminate;
+
+	printk(KERN_INFO "(QP%d): RX Terminate: etype=%d, layer=%d, ecode=%d\n",
+		QP_ID(qp), term->term_ctrl.etype, term->term_ctrl.layer,
+		term->term_ctrl.ecode);
+
+	siw_drain_pkt(qp, rctx);
+
+	return rctx->fpdu_part_rem > 0 ? -EAGAIN : 0;
+}
+
+
+/*
+ * siw_get_trailer()
+ *
+ * Receive the FPDU trailer (pad bytes, if any, followed by the CRC)
+ * from the current skb. May get called several times for one
+ * trailer if it arrives in pieces; rctx->fpdu_part_rcvd keeps the
+ * write position in between.
+ * Once the trailer is complete and CRC checking is enabled, the pad
+ * bytes are added to the running CRC and the result is compared
+ * with the received CRC.
+ *
+ * return value:
+ *	0:       trailer complete (and CRC correct, if checked)
+ *	-EAGAIN: trailer not yet complete
+ *	-EINVAL: CRC computation failed or CRC mismatch
+ */
+static int siw_get_trailer(struct siw_qp *qp, struct siw_iwarp_rx *rctx)
+{
+	struct sk_buff	*skb = rctx->skb;
+	/*
+	 * Start 'pad' bytes in front of the crc field, so that the
+	 * CRC itself ends up in rctx->trailer.crc.
+	 * NOTE(review): presumably the trailer provides room for the
+	 * pad bytes in front of crc - confirm against its definition.
+	 */
+	u8		*tbuf = (u8 *)&rctx->trailer.crc - rctx->pad;
+	int		avail;
+
+	/* take no more than what this skb holds of the trailer */
+	avail = min(rctx->skb_new, rctx->fpdu_part_rem);
+
+	skb_copy_bits(skb, rctx->skb_offset,
+		      tbuf + rctx->fpdu_part_rcvd, avail);
+
+	rctx->fpdu_part_rcvd += avail;
+	rctx->fpdu_part_rem -= avail;
+
+	rctx->skb_new -= avail;
+	rctx->skb_offset += avail;
+	rctx->skb_copied += avail;
+
+	dprint(DBG_RX, " (QP%d): %d remaining (%d)\n", QP_ID(qp),
+		rctx->fpdu_part_rem, avail);
+
+	if (!rctx->fpdu_part_rem) {
+		u32	crc_in, crc_own = 0;
+		/*
+		 * check crc if required
+		 */
+		if (!rctx->crc_enabled)
+			return 0;
+
+		/* pad bytes are covered by the CRC as well */
+		if (rctx->pad && siw_crc_array(&rctx->mpa_crc_hd,
+					       tbuf, rctx->pad) != 0)
+			return -EINVAL;
+
+		crypto_hash_final(&rctx->mpa_crc_hd, (u8 *)&crc_own);
+
+		/*
+		 * CRC32 is computed, transmitted and received directly in NBO,
+		 * so there's never a reason to convert byte order.
+		 */
+		crc_in = rctx->trailer.crc;
+
+		if (crc_in != crc_own) {
+			dprint(DBG_RX|DBG_ON,
+				" (QP%d): CRC ERROR in:=%08x, own=%08x\n",
+				QP_ID(qp), crc_in, crc_own);
+			return -EINVAL;
+		}
+		return 0;
+	}
+	return -EAGAIN;
+}
+
+
+/*
+ * siw_get_hdr()
+ *
+ * Receive the iWARP header of the next FPDU from the current skb.
+ * May get called several times for one header if it arrives in
+ * pieces; rctx->fpdu_part_rcvd keeps the write position in between.
+ *
+ * The header is received in two steps: first the fixed size control
+ * part, which gets validated (opcode, DDP and RDMAP version) as soon
+ * as it is complete, then the opcode specific remainder whose length
+ * is taken from iwarp_pktinfo[]. The validation must never be skipped,
+ * since the opcode is used as index into iwarp_pktinfo[].
+ *
+ * After the header is complete, first_ddp_seg, more_ddp_segs and
+ * prev_ddp_opcode of the receive context are updated.
+ *
+ * return value:
+ *	0:       header complete
+ *	-EAGAIN: header not yet complete
+ *	-EINVAL: bad opcode or version
+ *	-EPROTO: opcode changed within a sequence of DDP segments
+ */
+static int siw_get_hdr(struct siw_iwarp_rx *rctx)
+{
+	struct sk_buff		*skb = rctx->skb;
+	struct iwarp_ctrl	*c_hdr = &rctx->hdr.ctrl;
+
+	int bytes;
+
+	if (rctx->fpdu_part_rcvd < sizeof(struct iwarp_ctrl)) {
+		/*
+		 * copy first fix part of iwarp hdr
+		 */
+		bytes = min_t(int, rctx->skb_new,
+			      sizeof(struct iwarp_ctrl) - rctx->fpdu_part_rcvd);
+
+		skb_copy_bits(skb, rctx->skb_offset,
+			      (char *)c_hdr + rctx->fpdu_part_rcvd, bytes);
+
+		rctx->fpdu_part_rcvd += bytes;
+
+		rctx->skb_new -= bytes;
+		rctx->skb_offset += bytes;
+		rctx->skb_copied += bytes;
+
+		if (rctx->fpdu_part_rcvd < sizeof(struct iwarp_ctrl))
+			return -EAGAIN;
+
+		/*
+		 * Control part just got complete: validate it right now,
+		 * even if the skb is exhausted. This block is not entered
+		 * again for the current header, so there is no second
+		 * chance to check the opcode before it is used as index.
+		 */
+		if (c_hdr->opcode > RDMAP_TERMINATE) {
+			dprint(DBG_RX|DBG_ON, " opcode %d\n", c_hdr->opcode);
+			return -EINVAL;
+		}
+		if (c_hdr->dv != DDP_VERSION) {
+			dprint(DBG_RX|DBG_ON, " dversion %d\n", c_hdr->dv);
+			return -EINVAL;
+		}
+		if (c_hdr->rv != RDMAP_VERSION) {
+			dprint(DBG_RX|DBG_ON, " rversion %d\n", c_hdr->rv);
+			return -EINVAL;
+		}
+		dprint(DBG_RX, "(QP%d): New Header, opcode:%d\n",
+			RX_QPID(rctx), c_hdr->opcode);
+	}
+	/*
+	 * figure out len of current hdr: variable length of
+	 * iwarp hdr forces us to copy hdr information
+	 *
+	 * With an exhausted skb this copies nothing and we
+	 * leave with -EAGAIN below.
+	 */
+	bytes = min(rctx->skb_new,
+		  iwarp_pktinfo[c_hdr->opcode].hdr_len - rctx->fpdu_part_rcvd);
+
+	skb_copy_bits(skb, rctx->skb_offset,
+		      (char *)c_hdr + rctx->fpdu_part_rcvd, bytes);
+
+	rctx->fpdu_part_rcvd += bytes;
+
+	rctx->skb_new -= bytes;
+	rctx->skb_offset += bytes;
+	rctx->skb_copied += bytes;
+
+	if (rctx->fpdu_part_rcvd == iwarp_pktinfo[c_hdr->opcode].hdr_len) {
+		/*
+		 * HDR receive completed. Check if the current DDP segment
+		 * starts a new RDMAP message or continues a previously
+		 * started RDMAP message.
+		 *
+		 * Note well from the comments on DDP reassembly:
+		 * - Support for unordered reception of DDP segments
+		 *   (or FPDUs) from different RDMAP messages is not needed.
+		 * - Unordered reception of DDP segments of the same
+		 *   RDMAP message is not supported. It is probably not
+		 *   needed with most peers.
+		 */
+		siw_dprint_hdr(&rctx->hdr, RX_QPID(rctx), "HDR received");
+
+		if (rctx->more_ddp_segs != 0) {
+			/* previous segment announced a follow-up segment */
+			rctx->first_ddp_seg = 0;
+			if (rctx->prev_ddp_opcode != c_hdr->opcode) {
+				dprint(DBG_ON,
+					"packet intersection: %d <> %d\n",
+					rctx->prev_ddp_opcode, c_hdr->opcode);
+				return -EPROTO;
+			}
+		} else {
+			rctx->prev_ddp_opcode = c_hdr->opcode;
+			rctx->first_ddp_seg = 1;
+		}
+		/* L bit not set: more segments of this message to come */
+		rctx->more_ddp_segs = (c_hdr->l == 0) ? 1 : 0;
+
+		return 0;
+	}
+	return -EAGAIN;
+}
+
+/*
+ * Payload length of the current FPDU: MPA length from the control
+ * header plus MPA_HDR_SIZE, minus the bytes received so far
+ * (rctx->fpdu_part_rcvd).
+ */
+static inline int siw_fpdu_payload_len(struct siw_iwarp_rx *rctx)
+{
+	int mpa_len = (int)rctx->hdr.ctrl.mpa_len;
+
+	return mpa_len - rctx->fpdu_part_rcvd + MPA_HDR_SIZE;
+}
+
+/*
+ * Trailer length of the current FPDU: MPA_CRC_SIZE plus the number
+ * of bytes (0..3) needed to round MPA length + MPA_HDR_SIZE up to
+ * a multiple of 4.
+ */
+static inline int siw_fpdu_trailer_len(struct siw_iwarp_rx *rctx)
+{
+	int fpdu_len = (int)rctx->hdr.ctrl.mpa_len + MPA_HDR_SIZE;
+	int pad = -fpdu_len & 0x3;
+
+	return MPA_CRC_SIZE + pad;
+}
+
+/*
+ * siw_rreq_complete()
+ *
+ * Complete the current READ REQUEST after READ RESPONSE processing.
+ * It may complete consecutive WQE's which were already SQ
+ * processed before but are awaiting completion due to completion
+ * ordering (see verbs 8.2.2.2).
+ * The READ RESPONSE may also resume SQ processing if it was stalled
+ * due to ORD exhaustion (see verbs 8.2.2.18)
+ * Function stops completion when next READ REQUEST found or ORQ empty.
+ *
+ * @wqe:	the READ REQUEST WQE to be completed
+ * @error:	not referenced by the function body
+ */
+static void siw_rreq_complete(struct siw_wqe *wqe, int error)
+{
+	struct siw_qp		*qp = wqe->qp;
+	int			num_wc = 1;
+	enum ib_send_flags	flags;
+	LIST_HEAD(c_list);
+
+	flags = wr_flags(wqe);
+
+	if (flags & IB_SEND_SIGNALED)
+		list_add(&wqe->list, &c_list);
+	else {
+		/* unsignaled: release the WQE without work completion */
+		atomic_inc(&qp->sq_space);
+		siw_wqe_put(wqe);
+		num_wc = 0;
+	}
+
+	lock_orq(qp);
+
+	/* More WQE's to complete following this RREQ? */
+	if (!list_empty(&qp->orq)) {
+		struct list_head *pos, *n;
+		list_for_each_safe(pos, n, &qp->orq) {
+			wqe = list_entry_wqe(pos);
+			if (wr_type(wqe) == SIW_WR_RDMA_READ_REQ)
+				break;
+			/* collect flags of all WQE's handed to completion */
+			flags |= wr_flags(wqe);
+			num_wc++;
+			dprint(DBG_WR|DBG_ON,
+				"(QP%d): Resume completion, wr_type %d\n",
+				QP_ID(qp), wr_type(wqe));
+			list_move_tail(pos, &c_list);
+		}
+	}
+	unlock_orq(qp);
+
+	if (num_wc)
+		siw_sq_complete(&c_list, qp, num_wc, flags);
+
+	/*
+	 * Check if SQ processing was stalled due to ORD limit
+	 */
+	if (ORD_SUSPEND_SQ(qp)) {
+		lock_sq(qp);
+
+		wqe = siw_next_tx_wqe(qp);
+
+		if (wqe && !tx_wqe(qp)) {
+			WARN_ON(wr_type(wqe) != SIW_WR_RDMA_READ_REQ);
+			list_del_init(&wqe->list);
+			tx_wqe(qp) = wqe;
+
+			/*
+			 * NOTE(review): the ORQ is modified while holding
+			 * the SQ lock, not the ORQ lock taken above -
+			 * confirm this is safe.
+			 */
+			list_add_tail(&wqe->list, &qp->orq);
+
+			unlock_sq(qp);
+
+			dprint(DBG_RX, "(QP%d): SQ resume (%d)\n",
+				QP_ID(qp), atomic_read(&qp->sq_space));
+
+			siw_sq_queue_work(qp);
+		} else {
+			/* only new ORQ space if not next RREQ queued */
+			atomic_inc(&qp->orq_space);
+			unlock_sq(qp);
+		}
+	} else
+		atomic_inc(&qp->orq_space);
+}
+
+/*
+ * siw_rdmap_complete()
+ *
+ * complete processing of an RDMA message after receiving all
+ * DDP segments
+ *
+ *   o SENDs + RRESPs need completion of the WQE attached to the
+ *     receive context,
+ *   o RREQs need READ RESPONSE initialization,
+ *   o WRITEs need memory dereferencing
+ *
+ * Finally the WQE/memory reference of the receive context is cleared.
+ *
+ * Returns 0, or the error code of siw_init_rresp() for an RREQ.
+ *
+ * TODO: Could siw_[s,r]_complete() fail? (CQ full)
+ */
+static inline int siw_rdmap_complete(struct siw_qp *qp,
+				     struct siw_iwarp_rx *rctx)
+{
+	struct siw_wqe	*wqe;
+	int rv = 0;
+
+	switch (rctx->hdr.ctrl.opcode) {
+
+	case RDMAP_SEND_SE:
+		wr_flags(rx_wqe(qp)) |= IB_SEND_SOLICITED;
+		/* fall through */
+	case RDMAP_SEND:
+		/* next SEND must carry the next MSN */
+		rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
+
+		wqe = rx_wqe(qp);
+
+		wqe->wc_status = IB_WC_SUCCESS;
+		wqe->wr_status = SR_WR_DONE;
+
+		siw_rq_complete(wqe, qp);
+
+		break;
+
+	case RDMAP_RDMA_READ_RESP:
+		rctx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
+
+		wqe = rx_wqe(qp);
+
+		wqe->wc_status = IB_WC_SUCCESS;
+		wqe->wr_status = SR_WR_DONE;
+
+		siw_rreq_complete(wqe, 0);
+
+		break;
+
+	case RDMAP_RDMA_READ_REQ:
+		rv = siw_init_rresp(qp, rctx);
+
+		break;
+
+	case RDMAP_RDMA_WRITE:
+		/*
+		 * Free References from memory object if
+		 * attached to receive context (inbound WRITE)
+		 * While a zero-length WRITE is allowed, the
+		 * current implementation does not create
+		 * a memory reference (it is unclear if memory
+		 * rights should be checked in that case!).
+		 *
+		 * TODO: check zero length WRITE semantics
+		 */
+		if (rx_mem(qp))
+			siw_mem_put(rx_mem(qp));
+		break;
+
+	default:
+		break;
+
+	}
+	rctx->umem_chunk = NULL; /* DEBUG aid, tentatively */
+	rx_wqe(qp) = NULL;	/* also clears MEM object for WRITE */
+
+	return rv;
+}
+
+/*
+ * siw_rdmap_error()
+ *
+ * Abort processing of RDMAP message after failure.
+ * SENDs + RRESPs need receive completion with an error status,
+ * if already started. A work completion status set by the receive
+ * path before is preserved; IB_WC_GENERAL_ERR is reported otherwise.
+ *
+ * @qp:		QP the failure occurred on
+ * @rctx:	receive context of the QP
+ * @status:	negative errno describing the failure. It selects
+ *		the handling but is not a work completion status and
+ *		must not be stored as such.
+ *
+ * TODO: WRITE need local error to be surfaced.
+ *
+ */
+static inline void
+siw_rdmap_error(struct siw_qp *qp, struct siw_iwarp_rx *rctx, int status)
+{
+	struct siw_wqe	*wqe;
+
+	switch (rctx->hdr.ctrl.opcode) {
+
+	case RDMAP_SEND_SE:
+	case RDMAP_SEND:
+		rctx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
+
+		wqe = rx_wqe(qp);
+		if (!wqe)
+			return;
+
+		if (rctx->hdr.ctrl.opcode == RDMAP_SEND_SE)
+			wr_flags(wqe) |= IB_SEND_SOLICITED;
+
+		if (!wqe->wc_status)
+			wqe->wc_status = IB_WC_GENERAL_ERR;
+
+		wqe->wr_status = SR_WR_DONE;
+		siw_rq_complete(wqe, qp);
+
+		break;
+
+	case RDMAP_RDMA_READ_RESP:
+		/*
+		 * A READ RESPONSE may flush consecutive WQE's
+		 * which were SQ processed before
+		 */
+		rctx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
+
+		if (rctx->state == SIW_GET_HDR || status == -ENODATA)
+			/*  eventual RREQ left untouched */
+			break;
+
+		wqe = rx_wqe(qp);
+		if (wqe) {
+			/*
+			 * Keep a specific status set while receiving,
+			 * same as for SENDs above.
+			 */
+			if (!wqe->wc_status)
+				wqe->wc_status = IB_WC_GENERAL_ERR;
+
+			wqe->wr_status = SR_WR_DONE;
+			/*
+			 * All errors turn the wqe into signalled.
+			 */
+			wr_flags(wqe) |= IB_SEND_SIGNALED;
+			siw_rreq_complete(wqe, status);
+		}
+		break;
+
+	case RDMAP_RDMA_WRITE:
+		/*
+		 * Free References from memory object if
+		 * attached to receive context (inbound WRITE)
+		 * While a zero-length WRITE is allowed, the
+		 * current implementation does not create
+		 * a memory reference (it is unclear if memory
+		 * rights should be checked in that case!).
+		 *
+		 * TODO: check zero length WRITE semantics
+		 */
+		if (rx_mem(qp))
+			siw_mem_put(rx_mem(qp));
+		break;
+
+	default:
+		break;
+	}
+	rctx->umem_chunk = NULL; /* DEBUG aid, tentatively */
+	rx_wqe(qp) = NULL;	/* also clears MEM object for WRITE */
+}
+
+/*
+ * siw_tcp_rx_data()
+ *
+ * Main routine to consume inbound TCP payload
+ *
+ * Runs the FPDU receive state machine (header -> data -> trailer)
+ * over the payload of one skb. Processing state is kept in the QP's
+ * receive context, so an FPDU may span any number of skb's.
+ * Any error other than -EAGAIN aborts the current RDMAP message
+ * and drops the connection.
+ *
+ * @rd_desc:	read descriptor
+ * @skb:	socket buffer
+ * @off:	offset in skb
+ * @len:	skb->len - offset : payload in skb
+ *
+ * Returns the number of bytes consumed from the skb. 0 is returned
+ * without touching the data if receive processing is suspended or
+ * the QP is not in RTS state.
+ */
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+		    unsigned int off, size_t len)
+{
+	struct siw_qp		*qp = rd_desc->arg.data;
+	struct siw_iwarp_rx	*rctx = &qp->rx_ctx;
+	int			rv;
+
+	rctx->skb = skb;
+	rctx->skb_new = skb->len - off;
+	rctx->skb_offset = off;
+	rctx->skb_copied = 0;
+
+	dprint(DBG_RX, "(QP%d): new data %d, rx-state %d\n", QP_ID(qp),
+		rctx->skb_new, rctx->state);
+
+	if (unlikely(rctx->rx_suspend == 1 ||
+		     qp->attrs.state != SIW_QP_STATE_RTS)) {
+		dprint(DBG_RX|DBG_ON, "(QP%d): failed. state rx:%d, qp:%d\n",
+			QP_ID(qp), qp->rx_ctx.state, qp->attrs.state);
+		return 0;
+	}
+	while (rctx->skb_new) {
+
+		switch (rctx->state) {
+
+		case SIW_GET_HDR:
+			rv = siw_get_hdr(rctx);
+			if (!rv) {
+				int	payload_len;
+
+				if (rctx->crc_enabled &&
+				    siw_crc_rxhdr(rctx) != 0) {
+					rv = -EINVAL;
+					break;
+				}
+				rctx->hdr.ctrl.mpa_len =
+					ntohs(rctx->hdr.ctrl.mpa_len);
+
+				/*
+				 * The MPA length is peer supplied: it must
+				 * at least cover the header just received.
+				 * Never let a negative payload length get
+				 * used as byte count by the data receive
+				 * functions.
+				 */
+				payload_len = siw_fpdu_payload_len(rctx);
+				if (payload_len < 0) {
+					dprint(DBG_RX|DBG_ON,
+						"(QP%d): MPA len %d too short\n",
+						QP_ID(qp),
+						rctx->hdr.ctrl.mpa_len);
+					rv = -EPROTO;
+					break;
+				}
+				rctx->fpdu_part_rem = payload_len;
+
+				if (rctx->fpdu_part_rem)
+					rctx->pad = -rctx->fpdu_part_rem & 0x3;
+				else
+					rctx->pad = 0;
+
+				rctx->state = SIW_GET_DATA_START;
+				rctx->fpdu_part_rcvd = 0;
+			}
+			break;
+
+		case SIW_GET_DATA_MORE:
+			/*
+			 * Another data fragment of the same DDP segment.
+			 * Headers will not be checked again by the
+			 * opcode-specific data receive function below.
+			 * Setting first_ddp_seg = 0 avoids repeating
+			 * initializations that may occur only once per
+			 * DDP segment.
+			 */
+			rctx->first_ddp_seg = 0;
+			/* fall through */
+
+		case SIW_GET_DATA_START:
+			/*
+			 * Headers will be checked by the opcode-specific
+			 * data receive function below.
+			 */
+			rv = siw_rx_data(qp, rctx);
+			if (!rv) {
+				rctx->fpdu_part_rem =
+					siw_fpdu_trailer_len(rctx);
+				rctx->fpdu_part_rcvd = 0;
+				rctx->state = SIW_GET_TRAILER;
+			} else
+				rctx->state = SIW_GET_DATA_MORE;
+
+			break;
+
+		case SIW_GET_TRAILER:
+			/*
+			 * read CRC + any padding
+			 */
+			rv = siw_get_trailer(qp, rctx);
+			if (!rv) {
+				/*
+				 * FPDU completed.
+				 * complete RDMAP message if last fragment
+				 */
+				rctx->state = SIW_GET_HDR;
+				rctx->fpdu_part_rcvd = 0;
+
+				if (!rctx->hdr.ctrl.l)
+					/* more frags */
+					break;
+
+				rv = siw_rdmap_complete(qp, rctx);
+				if (rv)
+					break;
+			}
+			break;
+
+		default:
+			WARN_ON(1);
+			rv = -EAGAIN;
+		}
+
+		if (unlikely(rv != 0 && rv != -EAGAIN)) {
+			/*
+			 * TODO: implement graceful error handling including
+			 *       generation (and processing) of TERMINATE
+			 *       messages.
+			 *
+			 *	 for now we are left with a bogus rx status
+			 *	 unable to receive any further byte.
+			 *	 BUT: code must handle difference between
+			 *
+			 *	 o protocol syntax (FATAL, framing lost)
+			 *	 o crc	(FATAL, framing lost since we do not
+			 *	        trust packet header (??))
+			 *	 o local resource (maybe non fatal, framing
+			 *	   not lost)
+			 *
+			 *	 errors.
+			 */
+			siw_rdmap_error(qp, rctx, rv);
+
+			dprint(DBG_RX|DBG_ON,
+				"(QP%d): RX ERROR %d at RX state %d\n",
+				QP_ID(qp), rv, rctx->state);
+
+			siw_dprint_rctx(rctx);
+			/*
+			 * Calling siw_cm_queue_work() is safe without
+			 * releasing qp->state_lock because the QP state
+			 * will be transitioned to SIW_QP_STATE_ERROR
+			 * by the siw_work_handler() workqueue handler
+			 * after we return from siw_qp_llp_data_ready().
+			 */
+			siw_qp_cm_drop(qp, 1);
+
+			break;
+		}
+		if (rv) {
+			/* -EAGAIN: current FPDU part continues in next skb */
+			dprint(DBG_RX, "(QP%d): "
+				"Misaligned FPDU: State: %d, missing: %d\n",
+				QP_ID(qp), rctx->state, rctx->fpdu_part_rem);
+			break;
+		}
+	}
+	return rctx->skb_copied;
+}
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] SIW: Debugging and Tracing
From: Bernard Metzler @ 2010-10-05  6:55 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 drivers/infiniband/hw/siw/siw_debug.c |  198 +++++++++++++++++++++++++++++++++
 drivers/infiniband/hw/siw/siw_debug.h |  159 ++++++++++++++++++++++++++
 2 files changed, 357 insertions(+), 0 deletions(-)
 create mode 100644 drivers/infiniband/hw/siw/siw_debug.c
 create mode 100644 drivers/infiniband/hw/siw/siw_debug.h

diff --git a/drivers/infiniband/hw/siw/siw_debug.c b/drivers/infiniband/hw/siw/siw_debug.c
new file mode 100644
index 0000000..6340272
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_debug.c
@@ -0,0 +1,198 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Bernard Metzler <bmt-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *          Fredy Neeser <nfd-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <net/tcp.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "siw.h"
+#include "siw_obj.h"
+
+
+/*
+ * siw_print_qp_attr_mask - print the names of all QP attribute bits set
+ *
+ * @attr_mask:	QP attribute mask to decode
+ * @msg:	caller supplied tag, printed in the opening and the closing
+ *		delimiter line
+ *
+ * Emits one KERN_INFO line per IB_QP_* flag found in @attr_mask, framed
+ * by a start and an end marker line which both carry @msg.
+ */
+void siw_print_qp_attr_mask(enum ib_qp_attr_mask attr_mask, char *msg)
+{
+	/* Flag/name pairs, listed in the order they get reported */
+	static const struct {
+		int		flag;
+		const char	*name;
+	} attr_tbl[] = {
+		{ IB_QP_STATE,			"IB_QP_STATE\n" },
+		{ IB_QP_CUR_STATE,		"IB_QP_CUR_STATE\n" },
+		{ IB_QP_EN_SQD_ASYNC_NOTIFY,	"IB_QP_EN_SQD_ASYNC_NOTIFY\n" },
+		{ IB_QP_ACCESS_FLAGS,		"IB_QP_ACCESS_FLAGS\n" },
+		{ IB_QP_PKEY_INDEX,		"IB_QP_PKEY_INDEX\n" },
+		{ IB_QP_PORT,			"IB_QP_PORT\n" },
+		{ IB_QP_QKEY,			"IB_QP_QKEY\n" },
+		{ IB_QP_AV,			"IB_QP_AV\n" },
+		{ IB_QP_PATH_MTU,		"IB_QP_PATH_MTU\n" },
+		{ IB_QP_TIMEOUT,		"IB_QP_TIMEOUT\n" },
+		{ IB_QP_RETRY_CNT,		"IB_QP_RETRY_CNT\n" },
+		{ IB_QP_RNR_RETRY,		"IB_QP_RNR_RETRY\n" },
+		{ IB_QP_RQ_PSN,			"IB_QP_RQ_PSN\n" },
+		{ IB_QP_MAX_QP_RD_ATOMIC,	"IB_QP_MAX_QP_RD_ATOMIC\n" },
+		{ IB_QP_ALT_PATH,		"IB_QP_ALT_PATH\n" },
+		{ IB_QP_MIN_RNR_TIMER,		"IB_QP_MIN_RNR_TIMER\n" },
+		{ IB_QP_SQ_PSN,			"IB_QP_SQ_PSN\n" },
+		{ IB_QP_MAX_DEST_RD_ATOMIC,	"IB_QP_MAX_DEST_RD_ATOMIC\n" },
+		{ IB_QP_PATH_MIG_STATE,		"IB_QP_PATH_MIG_STATE\n" },
+		{ IB_QP_CAP,			"IB_QP_CAP\n" },
+		{ IB_QP_DEST_QPN,		"IB_QP_DEST_QPN\n" },
+	};
+	unsigned int i;
+
+	printk(KERN_INFO "-------- %s -------\n", msg);
+
+	for (i = 0; i < sizeof(attr_tbl) / sizeof(attr_tbl[0]); i++) {
+		if (attr_tbl[i].flag & attr_mask)
+			printk(KERN_INFO "%s", attr_tbl[i].name);
+	}
+
+	printk(KERN_INFO "-------- %s -(end)-\n", msg);
+}
+
+
+/*
+ * siw_print_hdr - print a one-line summary of an iWARP header
+ *
+ * @hdr:	header to decode; the set of fields printed is selected
+ *		by hdr->ctrl.opcode
+ * @qp_id:	QP identifier, printed as line prefix
+ * @msg:	caller supplied text, printed in front of the opcode name
+ *
+ * The MPA length is converted with ntohs() before it gets printed. All
+ * other header fields are printed exactly as stored in @hdr. An opcode
+ * not handled below results in a "?????" line.
+ */
+void siw_print_hdr(union iwarp_hdrs *hdr, int qp_id, char *msg)
+{
+	/* Common to all known opcodes, so convert it just once */
+	int mpa_len = ntohs(hdr->ctrl.mpa_len);
+
+	switch (hdr->ctrl.opcode) {
+
+	/* The four SEND flavours share the untagged DDP header layout */
+	case RDMAP_SEND:
+		printk(KERN_INFO
+			"QP%04d %s(SEND, MPA len %d): %08x %08x %08x\n",
+			qp_id, msg, mpa_len,
+			hdr->send.ddp_qn, hdr->send.ddp_msn, hdr->send.ddp_mo);
+		break;
+
+	case RDMAP_SEND_INVAL:
+		printk(KERN_INFO
+			"QP%04d %s(S_INV, MPA len %d): %08x %08x %08x\n",
+			qp_id, msg, mpa_len,
+			hdr->send.ddp_qn, hdr->send.ddp_msn, hdr->send.ddp_mo);
+		break;
+
+	case RDMAP_SEND_SE:
+		printk(KERN_INFO
+			"QP%04d %s(S_SE, MPA len %d): %08x %08x %08x\n",
+			qp_id, msg, mpa_len,
+			hdr->send.ddp_qn, hdr->send.ddp_msn, hdr->send.ddp_mo);
+		break;
+
+	case RDMAP_SEND_SE_INVAL:
+		printk(KERN_INFO
+			"QP%04d %s(S_SE_INV, MPA len %d): %08x %08x %08x\n",
+			qp_id, msg, mpa_len,
+			hdr->send.ddp_qn, hdr->send.ddp_msn, hdr->send.ddp_mo);
+		break;
+
+	/* Tagged messages: sink STag and sink tagged offset */
+	case RDMAP_RDMA_WRITE:
+		printk(KERN_INFO
+			"QP%04d %s(WRITE, MPA len %d): %08x %016llx\n",
+			qp_id, msg, mpa_len,
+			hdr->rwrite.sink_stag, hdr->rwrite.sink_to);
+		break;
+
+	case RDMAP_RDMA_READ_RESP:
+		printk(KERN_INFO
+			"QP%04d %s(RRESP, MPA len %d): %08x %016llx\n",
+			qp_id, msg, mpa_len,
+			hdr->rresp.sink_stag, hdr->rresp.sink_to);
+		break;
+
+	case RDMAP_RDMA_READ_REQ:
+		printk(KERN_INFO
+			"QP%04d %s(RREQ, MPA len %d): "
+			"%08x %08x %08x %08x %016llx %08x %08x %016llx\n",
+			qp_id, msg, mpa_len,
+			hdr->rreq.ddp_qn, hdr->rreq.ddp_msn,
+			hdr->rreq.ddp_mo, hdr->rreq.sink_stag,
+			hdr->rreq.sink_to, hdr->rreq.read_size,
+			hdr->rreq.source_stag, hdr->rreq.source_to);
+		break;
+
+	case RDMAP_TERMINATE:
+		printk(KERN_INFO "QP%04d %s(TERM, MPA len %d):\n",
+			qp_id, msg, mpa_len);
+		break;
+
+	default:
+		printk(KERN_INFO "QP%04d %s ?????\n", qp_id, msg);
+		break;
+	}
+}
+
+/*
+ * siw_print_rctx - dump the receive context of a QP
+ *
+ * @rctx:	RX context to print
+ *
+ * Prints the header of the packet currently held in @rctx (via
+ * siw_print_hdr()), followed by the skb, FPDU, receive memory, DDP and
+ * MPA related fields of @rctx, one KERN_INFO message per group.
+ */
+void siw_print_rctx(struct siw_iwarp_rx *rctx)
+{
+	printk(KERN_INFO "---RX Context-->\n");
+	siw_print_hdr(&rctx->hdr, RX_QPID(rctx), "\nCurrent Pkt:\t");
+	printk(KERN_INFO "Skbuf State:\tp:0x%p, new:%d, off:%d, copied:%d\n",
+		rctx->skb, rctx->skb_new, rctx->skb_offset, rctx->skb_copied);
+	printk(KERN_INFO "FPDU State:\trx_state:%d,\n\t\trcvd:%d, rem:%d, "
+		"pad:%d\n", rctx->state, rctx->fpdu_part_rcvd,
+		rctx->fpdu_part_rem, rctx->pad);
+	/* mem_id is reported as the STag with its low 8 bits shifted out */
+	printk(KERN_INFO "Rx Mem:\t\tp:0x%p, chunk:0x%p,\n\t\tp_ix:%d, "
+		"p_off:%d, stag:0x%08x, mem_id:%d\n",
+		rctx->dest.wqe, rctx->umem_chunk, rctx->pg_idx, rctx->pg_off,
+		rctx->ddp_stag, rctx->ddp_stag >> 8);
+	printk(KERN_INFO "DDP State:\tprev_op:%d, first_seg:%d, "
+		"more_segs:%d\n", rctx->prev_ddp_opcode, rctx->first_ddp_seg,
+		rctx->more_ddp_segs);
+	/*
+	 * siw_print_hdr() above treats hdr.ctrl.mpa_len of this very
+	 * header as a network byte order value. Convert it here as well,
+	 * so that both lines report the same MPA length.
+	 */
+	printk(KERN_INFO "MPA State:\tlen:%d, crc_enabled:%d, crc:0x%x\n",
+		ntohs(rctx->hdr.ctrl.mpa_len), rctx->crc_enabled,
+		rctx->trailer.crc);
+	printk(KERN_INFO "<---------------\n");
+}
+
+#if DPRINT_MASK > 0
+/*
+ * Printable names of the IB QP states, indexed by the IB_QPS_* value.
+ * Only built if debug printing is enabled (DPRINT_MASK > 0).
+ * Each entry is sized to hold "RESET", the longest name in the table,
+ * including its terminating NUL.
+ */
+char ib_qp_state_to_string[IB_QPS_ERR+1][sizeof "RESET"] = {
+	[IB_QPS_RESET]	= "RESET",
+	[IB_QPS_INIT]	= "INIT",
+	[IB_QPS_RTR]	= "RTR",
+	[IB_QPS_RTS]	= "RTS",
+	[IB_QPS_SQD]	= "SQD",
+	[IB_QPS_SQE]	= "SQE",
+	[IB_QPS_ERR]	= "ERR"
+};
+#endif
diff --git a/drivers/infiniband/hw/siw/siw_debug.h b/drivers/infiniband/hw/siw/siw_debug.h
new file mode 100644
index 0000000..58615fd
--- /dev/null
+++ b/drivers/infiniband/hw/siw/siw_debug.h
@@ -0,0 +1,159 @@
+/*
+ * Software iWARP device driver for Linux
+ *
+ * Authors: Fredy Neeser <nfd-OA+xvbQnYDHMbYB6QlFGEg@public.gmane.org>
+ *
+ * Copyright (c) 2008-2010, IBM Corporation
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ *   Redistribution and use in source and binary forms, with or
+ *   without modification, are permitted provided that the following
+ *   conditions are met:
+ *
+ *   - Redistributions of source code must retain the above copyright notice,
+ *     this list of conditions and the following disclaimer.
+ *
+ *   - Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ *   - Neither the name of IBM nor the names of its contributors may be
+ *     used to endorse or promote products derived from this software without
+ *     specific prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _SIW_DEBUG_H
+#define _SIW_DEBUG_H
+
+#include <linux/uaccess.h>
+#include <linux/hardirq.h>	/* in_interrupt() */
+
+/*
+ * dprint: Selective debug printing
+ *
+ * Use an OR combination of DBG_* as dbgcat in dprint*(dbgcat,...)
+ * to assign debug messages to categories:
+ *
+ * dbgcat	Debug message belongs to category
+ * -----------------------------------------------------------------------------
+ * DBG_ON	Always on, for really important events or error conditions
+ * DBG_TMP	Temporarily on for fine-grained debugging
+ * DBG_OBJ	Object management (object construction/destruction/refcounting)
+ * DBG_MM	Memory management
+ * DBG_EH	Event handling (completion events and asynchronous events)
+ * DBG_CM	Connection management, QP states
+ * DBG_WR	Work requests
+ * DBG_TX	iWARP TX path
+ * DBG_RX	iWARP RX path
+ * DBG_SK	Socket operations
+ * DBG_KT	Kernel threads
+ * DBG_IRQ	Interrupt context (SoftIRQ or HardIRQ)
+ * DBG_DM	Device management
+ * DBG_HDR	Packet HDRs
+ * DBG_ALL	All categories above
+ */
+/* Single debug categories, one bit each */
+#define DBG_ON		0x00000001
+#define DBG_TMP		0x00000002
+#define DBG_OBJ		0x00000004
+#define DBG_MM		0x00000008
+#define DBG_EH		0x00000010
+#define DBG_CM		0x00000020
+#define DBG_WR		0x00000040
+#define DBG_TX		0x00000080
+#define DBG_RX		0x00000100
+#define DBG_SK		0x00000200
+#define DBG_KT		0x00000400
+#define DBG_IRQ		0x00000800
+#define DBG_DM		0x00001000
+#define DBG_HDR		0x00002000
+/* Union of all single categories defined above */
+#define DBG_ALL		(DBG_IRQ|DBG_KT|DBG_SK|DBG_RX|DBG_TX|DBG_WR|\
+DBG_CM|DBG_EH|DBG_MM|DBG_OBJ|DBG_TMP|DBG_DM|DBG_ON|DBG_HDR)
+/* Same as DBG_ALL, but without DBG_HDR */
+#define DBG_ALL_NOHDR	(DBG_IRQ|DBG_KT|DBG_SK|DBG_RX|DBG_TX|DBG_WR|\
+DBG_CM|DBG_EH|DBG_MM|DBG_OBJ|DBG_TMP|DBG_DM|DBG_ON)
+/* Always-on messages plus connection and device management */
+#define DBG_CTRL	(DBG_ON|DBG_CM|DBG_DM)
+
+/*
+ * Set DPRINT_MASK to tailor your debugging needs:
+ *
+ * DPRINT_MASK value		Enables debug messages for
+ * ---------------------------------------------------------------------
+ * DBG_ON			Important events / error conditions only
+ *				(minimum number of debug messages)
+ * OR-ed combination of DBG_*	Selective debugging
+ * DBG_KT|DBG_ON		Kernel threads
+ * DBG_ALL			All categories
+ */
+#define DPRINT_MASK	0
+
+extern void siw_print_hdr(union iwarp_hdrs *, int, char *);
+extern void siw_print_rctx(struct siw_iwarp_rx *);
+extern void siw_print_qp_attr_mask(enum ib_qp_attr_mask, char *);
+
+#if DPRINT_MASK > 0
+
+/**
+ * dprint - Selective debug print for process, SoftIRQ or HardIRQ context
+ *
+ * Debug print with selectable debug categories,
+ * starting with header
+ * - "( pid /cpu) __func__" for process context
+ * - "( irq /cpu) __func__" for IRQ context
+ *
+ * The message is printed at KERN_INFO level, and only if at least one
+ * of the categories in @dbgcat is enabled in DPRINT_MASK.
+ *
+ * @dbgcat	: Set of debug categories (OR-ed combination of DBG_* above),
+ *		  to which this debug message is assigned.
+ * @fmt		: printf compliant format string
+ * @args	: printf compliant argument list
+ */
+#define dprint(dbgcat, fmt, args...)					\
+	do {								\
+		if ((dbgcat) & DPRINT_MASK) {				\
+			if (!in_interrupt())				\
+				printk(KERN_INFO "(%5d/%1d) %s" fmt,	\
+					current->pid,			\
+					current_thread_info()->cpu,	\
+					__func__, ## args);		\
+			else						\
+				printk(KERN_INFO "( irq /%1d) %s" fmt,	\
+					current_thread_info()->cpu,	\
+					__func__, ## args);		\
+		}							\
+	} while (0)
+
+
+/* With debugging enabled, the RX context dump is done unconditionally */
+#define siw_dprint_rctx(r)	siw_print_rctx(r)
+/* QP state names, indexed by IB_QPS_* value */
+extern char ib_qp_state_to_string[IB_QPS_ERR+1][sizeof "RESET"];
+
+#else
+/*
+ * Debugging disabled (DPRINT_MASK is 0): both macros expand to an empty
+ * statement, so none of their arguments gets evaluated.
+ */
+#define dprint(dbgcat, fmt, args...)	do { } while (0)
+#define siw_dprint_rctx(r)	do { } while (0)
+#endif
+
+
+/* Header dumps are compiled in only if DBG_HDR is part of DPRINT_MASK */
+#if DPRINT_MASK & DBG_HDR
+#define siw_dprint_hdr(h, i, m)	siw_print_hdr(h, i, m)
+#else
+#define siw_dprint_hdr(h, i, m)	do { } while (0)
+#endif
+
+/*
+ * QP attribute mask dumps are compiled in only if DBG_CM is part of
+ * DPRINT_MASK. The name of the calling function is passed as message tag.
+ */
+#if DPRINT_MASK & DBG_CM
+#define siw_dprint_qp_attr_mask(mask)\
+		siw_print_qp_attr_mask(mask, (char *)__func__)
+#else
+#define siw_dprint_qp_attr_mask(mask)	do { } while (0)
+#endif
+
+#endif
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* [PATCH] SIW: Documentation (initial)
From: Bernard Metzler @ 2010-10-05  6:55 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Bernard Metzler

---
 Documentation/networking/siw.txt |   91 ++++++++++++++++++++++++++++++++++++++
 1 files changed, 91 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/networking/siw.txt

diff --git a/Documentation/networking/siw.txt b/Documentation/networking/siw.txt
new file mode 100644
index 0000000..f051d8b
--- /dev/null
+++ b/Documentation/networking/siw.txt
@@ -0,0 +1,91 @@
+SoftiWARP: Software iWARP kernel driver module.
+
+General
+-------
+SoftiWARP (siw) implements the iWARP protocol suite (MPA/DDP/RDMAP,
+IETF-RFC 5044/5041/5040) completely in software as a Linux kernel module.
+siw runs on top of TCP kernel sockets and exports the Linux kernel ibverbs
+RDMA interface. siw interfaces with the iwcm connection manager.
+
+
+Transmit Path
+-------------
+If a send queue (SQ) work queue element gets posted, siw tries to send
+it directly out of the application context. If the SQ was non-empty,
+SQ processing is done asynchronously by a kernel worker thread. This
+thread gets scheduled, if the TCP socket signals new write space to
+be available. If during send operation the socket send space gets
+exhausted, SQ processing is abandoned until new socket write space
+becomes available.
+
+
+Receive Path
+------------
+All application data is placed into target buffers within softirq
+socket callback. Application notification is asynchronous.
+
+
+User Interface
+--------------
+All fast path operations such as posting of work requests and
+reaping of work completions currently involve a system call into
+the siw module. Kernel/user-mapped send and receive as well as 
+completion queues are not part of the current code. In
+particular, mapped completion queues may improve performance,
+since reaping completion queue entries as well as re-arming
+the completion queue could be done more efficiently.
+
+
+Memory Management
+-----------------
+siw currently uses the kernel's ib_umem_get() function to pin memory for later
+use in data transfer operations. Transmit and receive memory is checked
+against correct access permissions only in the moment of access by the
+network input path or before pushing it to the socket for transmission.
+ib_umem_get() provides DMA mappings for the requested address space which
+is not used by siw.
+
+
+Module Parameters
+-----------------
+The following siw module parameters are recognized.
+loopback_enabled:
+	If set, siw also attaches to the loopback device. Checked only
+	during module insertion.
+
+mpa_crc_enabled:
+	If set, the MPA CRC gets generated and checked both in tx and rx
+	path. Without hardware support, setting this flag will severely
+	hurt throughput. 
+
+zcopy_tx:
+	If set, payload of non signalled work requests
+	(such as non signalled WRITE or SEND as well as all READ
+	responses) are transferred using the TCP sockets
+	sendpage interface. This parameter can be switched on and
+	off dynamically (echo 1 >> /sys/module/siw/parameters/zcopy_tx
+	for enablement, 0 for disabling). System load may benefit from
+	using 0copy data transmission. 0copy is not enabled if
+	mpa_crc_enabled is set.
+
+
+Compile Time Flags:
+-DCHECK_DMA_CAPABILITIES
+	Checks if the device siw wants to attach to provides
+	DMA capabilities. While DMA capabilities are currently not
+	needed (siw works on top of a kernel TCP socket), siw
+	uses ib_umem_get() which performs a (not used) DMA address
+	translation. Writing a siw private memory reservation and
+	pinning routine would solve the issue.
+
+-DSIW_TX_FULLSEGS
+	Experimental, not enabled by default. If set,
+	siw tries not to overrun the socket (not sending until
+	-EAGAIN return), but stops sending if the current segment
+	would not fit into the socket's estimated tx buffer. With that,
+	wire FPDUs may get truncated by the TCP stack far less often.
+	Since this feature manipulates the sock's SOCK_NOSPACE
+	bit, it violates strict layering and is therefore considered
+	proprietary.
+	Since TCP is a byte stream protocol, no guarantee can be given
+	that FPDUs are not fragmented.
-- 
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: pull-request: bluetooth-2.6 2010-09-27
From: David Miller @ 2010-10-05  7:06 UTC (permalink / raw)
  To: padovan-Y3ZbgMPKUGA34EUeqzHoZw
  Cc: linville-2XuSBdqkA4R54TAoqtyWWQ, marcel-kz+m5ild9QBg9hUCZPvPmw,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20101004223513.GB3234@vigoh>

From: "Gustavo F. Padovan" <padovan-Y3ZbgMPKUGA34EUeqzHoZw@public.gmane.org>
Date: Mon, 4 Oct 2010 19:35:13 -0300

> Follow the output of git show for that change, if we agree on the change I
> can append it to the bluetooth pull request.

That makes sense to me, thanks for doing this audit.

Append that commit and send a new pull request.

Thanks!

^ permalink raw reply

* Re: [PATCH 1/4] genetlink: introduce pre_doit/post_doit hooks
From: David Miller @ 2010-10-05  7:08 UTC (permalink / raw)
  To: johannes-cdvu00un1VgdHxzADdlk8Q
  Cc: linville-2XuSBdqkA4R54TAoqtyWWQ, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20101004191838.763690480-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>

From: Johannes Berg <johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>
Date: Mon, 04 Oct 2010 21:14:03 +0200

> From: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
> 
> Each family may have some amount of boilerplate
> locking code that applies to most, or even all,
> commands.
> 
> This allows a family to handle such things in
> a more generic way, by allowing it to
>  a) include private flags in each operation
>  b) specify a pre_doit hook that is called,
>     before an operation's doit() callback and
>     may return an error directly,
>  c) specify a post_doit hook that can undo
>     locking or similar things done by pre_doit,
>     and finally
>  d) include two private pointers in each info
>     struct passed between all these operations
>     including doit(). (It's two because I'll
>     need two in nl80211 -- can be extended.)
> 
> Signed-off-by: Johannes Berg <johannes.berg-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>

Acked-by: David S. Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH] bonding: fix to rejoin multicast groups immediately
From: David Miller @ 2010-10-05  7:13 UTC (permalink / raw)
  To: fleitner; +Cc: netdev
In-Reply-To: <1285744327-1194-1-git-send-email-fleitner@redhat.com>

From: Flavio Leitner <fleitner@redhat.com>
Date: Wed, 29 Sep 2010 04:12:07 -0300

> It should rejoin multicast groups immediately when
> the failover happens to restore the multicast traffic.
> 
> Signed-off-by: Flavio Leitner <fleitner@redhat.com>

I suspect the IGMPv3 handling via a delayed action, as is currently
implemented, is on purpose and is done so to follow the specification
of the IGMPv3 RFCs.

Therefore you have to explain why your new behavior is so desirable
and in particular why something as undesirable as violating the RFCs
is therefore warranted.

^ permalink raw reply

* Re: [PATCH] skge: add quirk to limit DMA
From: David Miller @ 2010-10-05  7:18 UTC (permalink / raw)
  To: sgruszka; +Cc: shemminger, netdev, luya
In-Reply-To: <20100929092515.GA6804@redhat.com>

From: Stanislaw Gruszka <sgruszka@redhat.com>
Date: Wed, 29 Sep 2010 11:33:23 +0200

> Skge devices installed on some Gigabyte motherboards are not able to
> perform 64bit DMA correctly due to board PCI implementation, so limit
> DMA to 32bit if such boards are detected.
> 
> Bug was reported here:
> https://bugzilla.redhat.com/show_bug.cgi?id=447489
> 
> Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
> Tested-by: Luya Tshimbalanga <luya@fedoraproject.org>

Stephen?  Can I get an ACK or some kind of other status on this?

^ permalink raw reply

* Re: [PATCH net-next V3] net: dynamic ingress_queue allocation
From: David Miller @ 2010-10-05  7:24 UTC (permalink / raw)
  To: eric.dumazet; +Cc: jarkao2, hadi, netdev
In-Reply-To: <1286035915.2582.2472.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 02 Oct 2010 18:11:55 +0200

> [PATCH net-next V3] net: dynamic ingress_queue allocation
> 
> ingress being not used very much, and net_device->ingress_queue being
> quite a big object (128 or 256 bytes), use a dynamic allocation if
> needed (tc qdisc add dev eth0 ingress ...)
> 
> dev_ingress_queue(dev) helper should be used only with RTNL taken.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

This looks good to me, applied, thanks Eric.

^ permalink raw reply

* Re: ipvs: Use frag walker helper in SCTP proto support.
From: David Miller @ 2010-10-05  7:27 UTC (permalink / raw)
  To: horms; +Cc: netdev, netfilter-devel, lvs-devel, kaber
In-Reply-To: <20101004075911.GB2359@verge.net.au>

From: Simon Horman <horms@verge.net.au>
Date: Mon, 4 Oct 2010 16:59:12 +0900

> On Sun, Oct 03, 2010 at 11:46:01PM -0700, David Miller wrote:
>> 
>> Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> Acked-by: Simon Horman <horms@verge.net.au>
> 
> Dave, I'm happy for this to go via your tree or Patrick's.
> I don't believe it conflicts with any of the other changes
> that are pending.

I'll toss it into net-next-2.6, thanks for reviewing.

^ permalink raw reply

* Re: [PATCH net-next] net: relax rtnl_dereference()
From: David Miller @ 2010-10-05  7:29 UTC (permalink / raw)
  To: eric.dumazet; +Cc: hadi, netdev, jarkao2
In-Reply-To: <1286182812.18293.22.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 04 Oct 2010 11:00:12 +0200

> Le lundi 04 octobre 2010 à 10:42 +0200, Eric Dumazet a écrit :
> 
>> BTW, rtnl_dereference() should be changed to use
>> rcu_dereference_protected() instead of rcu_dereference_check() :
>> If RTNL is held, there is no need to force a barrier.
>> 
> 
> [PATCH net-next] net: relax rtnl_dereference()
> 
> rtnl_dereference() is used in contexts where RTNL is held, to fetch an
> RCU protected pointer.
>  
> Updates to this pointer are prevented by RTNL, so we dont need
> smp_read_barrier_depends() and the ACCESS_ONCE() provided in
> rcu_dereference_check().
> 
> rtnl_dereference() is mainly a macro to document the locking invariant.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH net-next V3] net: dynamic ingress_queue allocation
From: Eric Dumazet @ 2010-10-05  7:31 UTC (permalink / raw)
  To: David Miller; +Cc: jarkao2, hadi, netdev
In-Reply-To: <20101005.002450.58425682.davem@davemloft.net>

Le mardi 05 octobre 2010 à 00:24 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Sat, 02 Oct 2010 18:11:55 +0200
> 
> > [PATCH net-next V3] net: dynamic ingress_queue allocation
> > 
> > ingress being not used very much, and net_device->ingress_queue being
> > quite a big object (128 or 256 bytes), use a dynamic allocation if
> > needed (tc qdisc add dev eth0 ingress ...)
> > 
> > dev_ingress_queue(dev) helper should be used only with RTNL taken.
> > 
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> This looks good to me, applied, thanks Eric.

Thanks to Jarek, Jamal, and you :)



^ permalink raw reply

* [PATCH V4] fs: allow for more than 2^31 files
From: Eric Dumazet @ 2010-10-05  7:32 UTC (permalink / raw)
  To: Andrew Morton
  Cc: David Miller, Robin Holt, dipankar, viro, bcrl, den, mingo,
	mszeredi, cmm, npiggin, xemul, linux-kernel, netdev
In-Reply-To: <1285910958.2705.56.camel@edumazet-laptop>

Andrew,

Could you please review this patch, you probably are the right guy to
take it, because it crosses fs and net trees.

Note : /proc/sys/fs/file-nr is a read-only file, so this patch doesnt
depend on previous patch (sysctl: fix min/max handling in
__do_proc_doulongvec_minmax())

Thanks !

[PATCH V4] fs: allow for more than 2^31 files

Robin Holt tried to boot a 16TB system and found af_unix was overflowing
a 32bit value :

<quote>

We were seeing a failure which prevented boot.  The kernel was incapable
of creating either a named pipe or unix domain socket.  This comes down
to a common kernel function called unix_create1() which does:

        atomic_inc(&unix_nr_socks);
        if (atomic_read(&unix_nr_socks) > 2 * get_max_files())
                goto out;

The function get_max_files() is a simple return of files_stat.max_files.
files_stat.max_files is a signed integer and is computed in
fs/file_table.c's files_init().

        n = (mempages * (PAGE_SIZE / 1024)) / 10;
        files_stat.max_files = n;

In our case, mempages (total_ram_pages) is approx 3,758,096,384
(0xe0000000).  That leaves max_files at approximately 1,503,238,553.
This causes 2 * get_max_files() to integer overflow.

</quote>

Fix is to let /proc/sys/fs/file-nr & /proc/sys/fs/file-max use long
integers, and change af_unix to use an atomic_long_t instead of
atomic_t.

get_max_files() is changed to return an unsigned long.
get_nr_files() is changed to return a long.

unix_nr_socks is changed from atomic_t to atomic_long_t, while not
strictly needed to address Robin's problem.
 
Before patch (on a 64bit kernel) :
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
-18446744071562067968

After patch:
# echo 2147483648 >/proc/sys/fs/file-max
# cat /proc/sys/fs/file-max
2147483648
# cat /proc/sys/fs/file-nr
704     0       2147483648


Reported-by: Robin Holt <holt@sgi.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: David Miller <davem@davemloft.net>
Reviewed-by: Robin Holt <holt@sgi.com>
Tested-by: Robin Holt <holt@sgi.com>
---
 fs/file_table.c    |   17 +++++++----------
 include/linux/fs.h |    8 ++++----
 kernel/sysctl.c    |    6 +++---
 net/unix/af_unix.c |   14 +++++++-------
 4 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index a04bdd8..c3dee38 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -60,7 +60,7 @@ static inline void file_free(struct file *f)
 /*
  * Return the total number of open files in the system
  */
-static int get_nr_files(void)
+static long get_nr_files(void)
 {
 	return percpu_counter_read_positive(&nr_files);
 }
@@ -68,7 +68,7 @@ static int get_nr_files(void)
 /*
  * Return the maximum number of open files in the system
  */
-int get_max_files(void)
+unsigned long get_max_files(void)
 {
 	return files_stat.max_files;
 }
@@ -82,7 +82,7 @@ int proc_nr_files(ctl_table *table, int write,
                      void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	files_stat.nr_files = get_nr_files();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #else
 int proc_nr_files(ctl_table *table, int write,
@@ -105,7 +105,7 @@ int proc_nr_files(ctl_table *table, int write,
 struct file *get_empty_filp(void)
 {
 	const struct cred *cred = current_cred();
-	static int old_max;
+	static long old_max;
 	struct file * f;
 
 	/*
@@ -140,8 +140,7 @@ struct file *get_empty_filp(void)
 over:
 	/* Ran out of filps - report that */
 	if (get_nr_files() > old_max) {
-		printk(KERN_INFO "VFS: file-max limit %d reached\n",
-					get_max_files());
+		pr_info("VFS: file-max limit %lu reached\n", get_max_files());
 		old_max = get_nr_files();
 	}
 	goto fail;
@@ -487,7 +486,7 @@ retry:
 
 void __init files_init(unsigned long mempages)
 { 
-	int n; 
+	unsigned long n;
 
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
@@ -498,9 +497,7 @@ void __init files_init(unsigned long mempages)
 	 */ 
 
 	n = (mempages * (PAGE_SIZE / 1024)) / 10;
-	files_stat.max_files = n; 
-	if (files_stat.max_files < NR_FILE)
-		files_stat.max_files = NR_FILE;
+	files_stat.max_files = max_t(unsigned long, n, NR_FILE);
 	files_defer_init();
 	lg_lock_init(files_lglock);
 	percpu_counter_init(&nr_files, 0);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 63d069b..8c06590 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -34,9 +34,9 @@
 
 /* And dynamically-tunable limits and defaults: */
 struct files_stat_struct {
-	int nr_files;		/* read only */
-	int nr_free_files;	/* read only */
-	int max_files;		/* tunable */
+	unsigned long nr_files;		/* read only */
+	unsigned long nr_free_files;	/* read only */
+	unsigned long max_files;		/* tunable */
 };
 
 struct inodes_stat_t {
@@ -404,7 +404,7 @@ extern void __init inode_init_early(void);
 extern void __init files_init(unsigned long);
 
 extern struct files_stat_struct files_stat;
-extern int get_max_files(void);
+extern unsigned long get_max_files(void);
 extern int sysctl_nr_open;
 extern struct inodes_stat_t inodes_stat;
 extern int leases_enable, lease_break_time;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f88552c..f789a0a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1352,16 +1352,16 @@ static struct ctl_table fs_table[] = {
 	{
 		.procname	= "file-nr",
 		.data		= &files_stat,
-		.maxlen		= 3*sizeof(int),
+		.maxlen		= sizeof(files_stat),
 		.mode		= 0444,
 		.proc_handler	= proc_nr_files,
 	},
 	{
 		.procname	= "file-max",
 		.data		= &files_stat.max_files,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(files_stat.max_files),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_doulongvec_minmax,
 	},
 	{
 		.procname	= "nr_open",
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0b39b24..3e1d7d1 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -117,7 +117,7 @@
 
 static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
 static DEFINE_SPINLOCK(unix_table_lock);
-static atomic_t unix_nr_socks = ATOMIC_INIT(0);
+static atomic_long_t unix_nr_socks;
 
 #define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])
 
@@ -360,13 +360,13 @@ static void unix_sock_destructor(struct sock *sk)
 	if (u->addr)
 		unix_release_addr(u->addr);
 
-	atomic_dec(&unix_nr_socks);
+	atomic_long_dec(&unix_nr_socks);
 	local_bh_disable();
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 	local_bh_enable();
 #ifdef UNIX_REFCNT_DEBUG
-	printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk,
-		atomic_read(&unix_nr_socks));
+	printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
+		atomic_long_read(&unix_nr_socks));
 #endif
 }
 
@@ -606,8 +606,8 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	struct sock *sk = NULL;
 	struct unix_sock *u;
 
-	atomic_inc(&unix_nr_socks);
-	if (atomic_read(&unix_nr_socks) > 2 * get_max_files())
+	atomic_long_inc(&unix_nr_socks);
+	if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 		goto out;
 
 	sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
@@ -632,7 +632,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
 	if (sk == NULL)
-		atomic_dec(&unix_nr_socks);
+		atomic_long_dec(&unix_nr_socks);
 	else {
 		local_bh_disable();
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);



^ permalink raw reply related

* Re: [PATCH 1/2] net/phy: fix many "defined but unused" warnings
From: David Miller @ 2010-10-05  7:36 UTC (permalink / raw)
  To: u.kleine-koenig; +Cc: netdev
In-Reply-To: <1286185413-22924-1-git-send-email-u.kleine-koenig@pengutronix.de>

From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon,  4 Oct 2010 11:43:32 +0200

> MODULE_DEVICE_TABLE only expands to something if it's compiled
> for a module.  So when building-in support for the phys, the
> mdio_device_id tables are unused.  Marking them with __maybe_unused
> fixes the following warnings:
> 
> 	drivers/net/phy/bcm63xx.c:134: warning: 'bcm63xx_tbl' defined but not used
> 	drivers/net/phy/broadcom.c:933: warning: 'broadcom_tbl' defined but not used
> 	drivers/net/phy/cicada.c:162: warning: 'cicada_tbl' defined but not used
> 	drivers/net/phy/davicom.c:222: warning: 'davicom_tbl' defined but not used
> 	drivers/net/phy/et1011c.c:114: warning: 'et1011c_tbl' defined but not used
> 	drivers/net/phy/icplus.c:137: warning: 'icplus_tbl' defined but not used
> 	drivers/net/phy/lxt.c:226: warning: 'lxt_tbl' defined but not used
> 	drivers/net/phy/marvell.c:724: warning: 'marvell_tbl' defined but not used
> 	drivers/net/phy/micrel.c:234: warning: 'micrel_tbl' defined but not used
> 	drivers/net/phy/national.c:154: warning: 'ns_tbl' defined but not used
> 	drivers/net/phy/qsemi.c:141: warning: 'qs6612_tbl' defined but not used
> 	drivers/net/phy/realtek.c:82: warning: 'realtek_tbl' defined but not used
> 	drivers/net/phy/smsc.c:257: warning: 'smsc_tbl' defined but not used
> 	drivers/net/phy/ste10Xp.c:135: warning: 'ste10Xp_tbl' defined but not used
> 	drivers/net/phy/vitesse.c:195: warning: 'vitesse_tbl' defined but not used
> 
> Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>

Unfortunate but necessary for now, so applied, thanks.

Maybe we can eventually put something into MODULE_DEVICE_TABLE to add a nop-style
reference, enough to shut up the compiler but not actually compile the table into
the code.

^ permalink raw reply

* Re: [PATCH 2/2] [RFC] don't let BCM63XX_PHY depend on non-existant symbol
From: David Miller @ 2010-10-05  7:36 UTC (permalink / raw)
  To: u.kleine-koenig; +Cc: netdev, mbizon, florian, ralf
In-Reply-To: <1286185413-22924-2-git-send-email-u.kleine-koenig@pengutronix.de>

From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Date: Mon,  4 Oct 2010 11:43:33 +0200

> The kernel doesn't have a symbol called BCM63XX.  There is a symbol
> BCM63XX_ENET (introduced in 9b1fc55a0500, 6 weeks after 09bb9aa0ed that
> introduced BCM63XX_PHY), but the driver compiles without that, too.
> 
> Cc: Maxime Bizon <mbizon@freebox.fr>
> Cc: Florian Fainelli <florian@openwrt.org>
> Cc: David S. Miller <davem@davemloft.net>
> Cc: Ralf Baechle <ralf@linux-mips.org>
> Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>

Looks good to me, applied.

Thanks.

^ permalink raw reply

* Re: [patch] cls_u32: signedness bug
From: David Miller @ 2010-10-05  7:40 UTC (permalink / raw)
  To: error27; +Cc: hadi, shemminger, elendil, xiaosuo, netdev, kernel-janitors
In-Reply-To: <20101004122836.GB5692@bicker>

From: Dan Carpenter <error27@gmail.com>
Date: Mon, 4 Oct 2010 14:28:36 +0200

> skb_headroom() is unsigned so "skb_headroom(skb) + toff" is also
> unsigned and can't be less than zero.  This test was added in 66d50d25:
> "u32: negative offset fix"  It was supposed to fix a regression.
> 
> Signed-off-by: Dan Carpenter <error27@gmail.com>
> ---
> Compile tested only.  Please check.

This looks correct to me, thanks for fixing this.

Applied.

^ permalink raw reply

* Re: [PATCH net-next] wimax: make functions local
From: David Miller @ 2010-10-05  7:48 UTC (permalink / raw)
  To: shemminger; +Cc: inaky, linux-wimax, netdev
In-Reply-To: <20101005145959.362fbd85@s6510>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 5 Oct 2010 14:59:59 +0900

> Make wimax variables and functions local if possible.
> Compile tested only.
> 
> This also removes a couple of unused EXPORT_SYMBOL.
> If this breaks some out of tree code, please fix that
> by putting the code in the kernel tree.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] qlcnic: remove dead code
From: David Miller @ 2010-10-05  7:48 UTC (permalink / raw)
  To: shemminger
  Cc: amit.salecha, netdev, ameen.rahman, anirban.chakraborty,
	sritej.velaga
In-Reply-To: <20101005104430.554c03e6@s6510>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 5 Oct 2010 10:44:30 +0900

> This driver has several pieces of dead code (found by running
> make namespacecheck). This patch removes them.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] fib: cleanups
From: David Miller @ 2010-10-05  7:48 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev
In-Reply-To: <1286258418.2457.7.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 05 Oct 2010 08:00:18 +0200

> Code style cleanups before upcoming functional changes.
> C99 initializer for fib_props array.
> 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] fib: fib_rules_cleanup can be static
From: David Miller @ 2010-10-05  7:49 UTC (permalink / raw)
  To: shemminger; +Cc: netdev
In-Reply-To: <20101005151417.57eae0b0@s6510>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 5 Oct 2010 15:14:17 +0900

> fib_rules_cleanup_ops is only defined and used in one place.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] ipv6: make __ipv6_isatap_ifid static
From: David Miller @ 2010-10-05  7:49 UTC (permalink / raw)
  To: shemminger; +Cc: netdev
In-Reply-To: <20101005151753.363121b8@s6510>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 5 Oct 2010 15:17:53 +0900

> Another exported symbol only used in one file
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Applied.

^ permalink raw reply

* Re: [PATCH] caif: remove duplicated include
From: David Miller @ 2010-10-05  7:49 UTC (permalink / raw)
  To: sjur.brandeland; +Cc: nikai, netdev, linux-kernel
In-Reply-To: <81C3A93C17462B4BBD7E272753C1057919238FAF3A@EXDCVYMBSTM005.EQ1STM.local>

From: Sjur BRENDELAND <sjur.brandeland@stericsson.com>
Date: Tue, 5 Oct 2010 08:34:13 +0200

> Nicolas Kaiser wrote: 
>> Remove duplicated include.
>> 
>> Signed-off-by: Nicolas Kaiser <nikai@nikai.net>
> 
> Looks good, thanks. 
> Acked-by: Sjur Braendeland <sjur.brandeland@stericsson.com>

Applied.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox