Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 07/10] RDMA/cxgb4: DB Drop Recovery for RDMA and LLD queues.
From: Vipul Pandya @ 2011-10-19 17:11 UTC (permalink / raw)
  To: linux-rdma, netdev; +Cc: roland, davem, divy, dm, kumaras, swise, Vipul Pandya
In-Reply-To: <1319044264-779-1-git-send-email-vipul@chelsio.com>

    - add module option db_fc_threshold which is the count of active QPs
    that trigger automatic db flow control mode.

    - automatically transition to/from flow control mode when the active qp
    count crosses db_fc_theshold.

    - add more db debugfs stats

    - on DB DROP event from the LLD, recover all the iwarp queues.

Signed-off-by: Vipul Pandya <vipul@chelsio.com>
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
---
 drivers/infiniband/hw/cxgb4/device.c   |  176 ++++++++++++++++++++++++++++++-
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h |   24 ++++-
 drivers/infiniband/hw/cxgb4/qp.c       |   47 ++++++++-
 drivers/infiniband/hw/cxgb4/t4.h       |   24 +++++
 4 files changed, 259 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 9062ed9..bdb398f 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -246,6 +246,8 @@ static const struct file_operations stag_debugfs_fops = {
 	.llseek  = default_llseek,
 };
 
+static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY"};
+
 static int stats_show(struct seq_file *seq, void *v)
 {
 	struct c4iw_dev *dev = seq->private;
@@ -272,6 +274,9 @@ static int stats_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "  DB FULL: %10llu\n", dev->rdev.stats.db_full);
 	seq_printf(seq, " DB EMPTY: %10llu\n", dev->rdev.stats.db_empty);
 	seq_printf(seq, "  DB DROP: %10llu\n", dev->rdev.stats.db_drop);
+	seq_printf(seq, " DB State: %s Transitions %llu\n",
+		   db_state_str[dev->db_state],
+		   dev->rdev.stats.db_state_transitions);
 	return 0;
 }
 
@@ -295,6 +300,7 @@ static ssize_t stats_clear(struct file *file, const char __user *buf,
 	dev->rdev.stats.db_full = 0;
 	dev->rdev.stats.db_empty = 0;
 	dev->rdev.stats.db_drop = 0;
+	dev->rdev.stats.db_state_transitions = 0;
 	mutex_unlock(&dev->rdev.stats.lock);
 	return count;
 }
@@ -677,8 +683,11 @@ static int disable_qp_db(int id, void *p, void *data)
 static void stop_queues(struct uld_ctx *ctx)
 {
 	spin_lock_irq(&ctx->dev->lock);
-	ctx->dev->db_state = FLOW_CONTROL;
-	idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL);
+	if (ctx->dev->db_state == NORMAL) {
+		ctx->dev->rdev.stats.db_state_transitions++;
+		ctx->dev->db_state = FLOW_CONTROL;
+		idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL);
+	}
 	spin_unlock_irq(&ctx->dev->lock);
 }
 
@@ -693,9 +702,165 @@ static int enable_qp_db(int id, void *p, void *data)
 static void resume_queues(struct uld_ctx *ctx)
 {
 	spin_lock_irq(&ctx->dev->lock);
-	ctx->dev->db_state = NORMAL;
-	idr_for_each(&ctx->dev->qpidr, enable_qp_db, NULL);
+	if (ctx->dev->qpcnt <= db_fc_threshold &&
+	    ctx->dev->db_state == FLOW_CONTROL) {
+		ctx->dev->db_state = NORMAL;
+		ctx->dev->rdev.stats.db_state_transitions++;
+		idr_for_each(&ctx->dev->qpidr, enable_qp_db, NULL);
+	}
+	spin_unlock_irq(&ctx->dev->lock);
+}
+
+struct qp_list {
+	unsigned idx;
+	struct c4iw_qp **qps;
+};
+
+static int add_and_ref_qp(int id, void *p, void *data)
+{
+	struct qp_list *qp_listp = data;
+	struct c4iw_qp *qp = p;
+
+	c4iw_qp_add_ref(&qp->ibqp);
+	qp_listp->qps[qp_listp->idx++] = qp;
+	return 0;
+}
+
+static int count_qps(int id, void *p, void *data)
+{
+	unsigned *countp = data;
+	(*countp)++;
+	return 0;
+}
+
+static void deref_qps(struct qp_list qp_list)
+{
+	int idx;
+
+	for (idx = 0; idx < qp_list.idx; idx++)
+		c4iw_qp_rem_ref(&qp_list.qps[idx]->ibqp);
+}
+
+static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list)
+{
+	int idx;
+	int ret;
+
+	for (idx = 0; idx < qp_list->idx; idx++) {
+		struct c4iw_qp *qp = qp_list->qps[idx];
+
+		ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0],
+					  qp->wq.sq.qid,
+					  t4_sq_host_wq_pidx(&qp->wq),
+					  t4_sq_wq_size(&qp->wq));
+		if (ret) {
+			printk(KERN_ERR MOD "%s: Fatal error - "
+			       "DB overflow recovery failed - "
+			       "error syncing SQ qid %u\n",
+			       pci_name(ctx->lldi.pdev), qp->wq.sq.qid);
+			return;
+		}
+
+		ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0],
+					  qp->wq.rq.qid,
+					  t4_rq_host_wq_pidx(&qp->wq),
+					  t4_rq_wq_size(&qp->wq));
+
+		if (ret) {
+			printk(KERN_ERR MOD "%s: Fatal error - "
+			       "DB overflow recovery failed - "
+			       "error syncing RQ qid %u\n",
+			       pci_name(ctx->lldi.pdev), qp->wq.rq.qid);
+			return;
+		}
+
+		/* Wait for the dbfifo to drain */
+		while (cxgb4_dbfifo_count(qp->rhp->rdev.lldi.ports[0], 1) > 0) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(usecs_to_jiffies(10));
+		}
+	}
+}
+
+static void recover_queues(struct uld_ctx *ctx)
+{
+	int count = 0;
+	struct qp_list qp_list;
+	int ret;
+
+	/* lock out kernel db ringers */
+	mutex_lock(&ctx->dev->db_mutex);
+
+	/* put all queues in to recovery mode */
+	spin_lock_irq(&ctx->dev->lock);
+	ctx->dev->db_state = RECOVERY;
+	ctx->dev->rdev.stats.db_state_transitions++;
+	idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL);
+	spin_unlock_irq(&ctx->dev->lock);
+
+	/* slow everybody down */
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	schedule_timeout(usecs_to_jiffies(1000));
+
+	/* Wait for the dbfifo to completely drain. */
+	while (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) > 0) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(usecs_to_jiffies(10));
+	}
+
+	/* flush the SGE contexts */
+	ret = cxgb4_flush_eq_cache(ctx->dev->rdev.lldi.ports[0]);
+	if (ret) {
+		printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n",
+		       pci_name(ctx->lldi.pdev));
+		goto out;
+	}
+
+	/* Count active queues so we can build a list of queues to recover */
+	spin_lock_irq(&ctx->dev->lock);
+	idr_for_each(&ctx->dev->qpidr, count_qps, &count);
+
+	qp_list.qps = kzalloc(count * sizeof *qp_list.qps, GFP_ATOMIC);
+	if (!qp_list.qps) {
+		printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n",
+		       pci_name(ctx->lldi.pdev));
+		spin_unlock_irq(&ctx->dev->lock);
+		goto out;
+	}
+	qp_list.idx = 0;
+
+	/* add and ref each qp so it doesn't get freed */
+	idr_for_each(&ctx->dev->qpidr, add_and_ref_qp, &qp_list);
+
 	spin_unlock_irq(&ctx->dev->lock);
+
+	/* now traverse the list in a safe context to recover the db state*/
+	recover_lost_dbs(ctx, &qp_list);
+
+	/* we're almost done!  deref the qps and clean up */
+	deref_qps(qp_list);
+	kfree(qp_list.qps);
+
+	/* Wait for the dbfifo to completely drain again */
+	while (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1) > 0) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(usecs_to_jiffies(10));
+	}
+
+	/* resume the queues */
+	spin_lock_irq(&ctx->dev->lock);
+	if (ctx->dev->qpcnt > db_fc_threshold)
+		ctx->dev->db_state = FLOW_CONTROL;
+	else {
+		ctx->dev->db_state = NORMAL;
+		idr_for_each(&ctx->dev->qpidr, enable_qp_db, NULL);
+	}
+	ctx->dev->rdev.stats.db_state_transitions++;
+	spin_unlock_irq(&ctx->dev->lock);
+
+out:
+	/* start up kernel db ringers again */
+	mutex_unlock(&ctx->dev->db_mutex);
 }
 
 static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...)
@@ -716,8 +881,7 @@ static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...)
 		mutex_unlock(&ctx->dev->rdev.stats.lock);
 		break;
 	case CXGB4_CONTROL_DB_DROP:
-		printk(KERN_WARNING MOD "%s: Fatal DB DROP\n",
-		       pci_name(ctx->lldi.pdev));
+		recover_queues(ctx);
 		mutex_lock(&ctx->dev->rdev.stats.lock);
 		ctx->dev->rdev.stats.db_drop++;
 		mutex_unlock(&ctx->dev->rdev.stats.lock);
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 2ce7741..75d643c 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -120,6 +120,7 @@ struct c4iw_stats {
 	u64  db_full;
 	u64  db_empty;
 	u64  db_drop;
+	u64  db_state_transitions;
 };
 
 struct c4iw_rdev {
@@ -212,6 +213,7 @@ struct c4iw_dev {
 	struct mutex db_mutex;
 	struct dentry *debugfs_root;
 	enum db_state db_state;
+	int qpcnt;
 };
 
 static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev)
@@ -271,11 +273,25 @@ static inline int insert_handle_nolock(struct c4iw_dev *rhp, struct idr *idr,
 	return _insert_handle(rhp, idr, handle, id, 0);
 }
 
-static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id)
+static inline void _remove_handle(struct c4iw_dev *rhp, struct idr *idr,
+				   u32 id, int lock)
 {
-	spin_lock_irq(&rhp->lock);
+	if (lock)
+		spin_lock_irq(&rhp->lock);
 	idr_remove(idr, id);
-	spin_unlock_irq(&rhp->lock);
+	if (lock)
+		spin_unlock_irq(&rhp->lock);
+}
+
+static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id)
+{
+	_remove_handle(rhp, idr, id, 1);
+}
+
+static inline void remove_handle_nolock(struct c4iw_dev *rhp,
+					 struct idr *idr, u32 id)
+{
+	_remove_handle(rhp, idr, id, 0);
 }
 
 struct c4iw_pd {
@@ -842,5 +858,7 @@ void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe);
 extern struct cxgb4_client t4c_client;
 extern c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS];
 extern int c4iw_max_read_depth;
+extern int db_fc_threshold;
+
 
 #endif
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 36fc94d..215b66a 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -42,6 +42,11 @@ static int ocqp_support = 1;
 module_param(ocqp_support, int, 0644);
 MODULE_PARM_DESC(ocqp_support, "Support on-chip SQs (default=1)");
 
+int db_fc_threshold = 2000;
+module_param(db_fc_threshold, int, 0644);
+MODULE_PARM_DESC(db_fc_threshold, "QP count/threshold that triggers automatic "
+		 "db flow control mode (default = 2000)");
+
 static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state)
 {
 	unsigned long flag;
@@ -1132,13 +1137,19 @@ static int ring_kernel_db(struct c4iw_qp *qhp, u32 qid, u16 inc)
 
 	mutex_lock(&qhp->rhp->db_mutex);
 	do {
-		if (cxgb4_dbfifo_count(qhp->rhp->rdev.lldi.ports[0], 1) < 768) {
+
+		/*
+		 * The interrupt threshold is dbfifo_int_thresh << 6. So
+		 * make sure we don't cross that and generate an interrupt.
+		 */
+		if (cxgb4_dbfifo_count(qhp->rhp->rdev.lldi.ports[0], 1) <
+		    (qhp->rhp->rdev.lldi.dbfifo_int_thresh << 5)) {
 			writel(V_QID(qid) | V_PIDX(inc), qhp->wq.db);
 			break;
 		}
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule_timeout(usecs_to_jiffies(delay));
-		delay = min(delay << 1, 200000);
+		delay = min(delay << 1, 2000);
 	} while (1);
 	mutex_unlock(&qhp->rhp->db_mutex);
 	return 0;
@@ -1373,6 +1384,14 @@ out:
 	return ret;
 }
 
+static int enable_qp_db(int id, void *p, void *data)
+{
+	struct c4iw_qp *qp = p;
+
+	t4_enable_wq_db(&qp->wq);
+	return 0;
+}
+
 int c4iw_destroy_qp(struct ib_qp *ib_qp)
 {
 	struct c4iw_dev *rhp;
@@ -1390,7 +1409,16 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)
 		c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0);
 	wait_event(qhp->wait, !qhp->ep);
 
-	remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
+	spin_lock_irq(&rhp->lock);
+	remove_handle_nolock(rhp, &rhp->qpidr, qhp->wq.sq.qid);
+	rhp->qpcnt--;
+	BUG_ON(rhp->qpcnt < 0);
+	if (rhp->qpcnt <= db_fc_threshold && rhp->db_state == FLOW_CONTROL) {
+		rhp->rdev.stats.db_state_transitions++;
+		rhp->db_state = NORMAL;
+		idr_for_each(&rhp->qpidr, enable_qp_db, NULL);
+	}
+	spin_unlock_irq(&rhp->lock);
 	atomic_dec(&qhp->refcnt);
 	wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
 
@@ -1404,6 +1432,14 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)
 	return 0;
 }
 
+static int disable_qp_db(int id, void *p, void *data)
+{
+	struct c4iw_qp *qp = p;
+
+	t4_disable_wq_db(&qp->wq);
+	return 0;
+}
+
 struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
 			     struct ib_udata *udata)
 {
@@ -1493,6 +1529,11 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
 	spin_lock_irq(&rhp->lock);
 	if (rhp->db_state != NORMAL)
 		t4_disable_wq_db(&qhp->wq);
+	if (++rhp->qpcnt > db_fc_threshold && rhp->db_state == NORMAL) {
+		rhp->rdev.stats.db_state_transitions++;
+		rhp->db_state = FLOW_CONTROL;
+		idr_for_each(&rhp->qpidr, disable_qp_db, NULL);
+	}
 	ret = insert_handle_nolock(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
 	spin_unlock_irq(&rhp->lock);
 	if (ret)
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index c0221ee..16f26ab 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -62,6 +62,10 @@ struct t4_status_page {
 	__be16 pidx;
 	u8 qp_err;	/* flit 1 - sw owns */
 	u8 db_off;
+	u8 pad;
+	u16 host_wq_pidx;
+	u16 host_cidx;
+	u16 host_pidx;
 };
 
 #define T4_EQ_ENTRY_SIZE 64
@@ -375,6 +379,16 @@ static inline void t4_rq_consume(struct t4_wq *wq)
 		wq->rq.cidx = 0;
 }
 
+static inline u16 t4_rq_host_wq_pidx(struct t4_wq *wq)
+{
+	return wq->rq.queue[wq->rq.size].status.host_wq_pidx;
+}
+
+static inline u16 t4_rq_wq_size(struct t4_wq *wq)
+{
+		return wq->rq.size * T4_RQ_NUM_SLOTS;
+}
+
 static inline int t4_sq_onchip(struct t4_sq *sq)
 {
 	return sq->flags & T4_SQ_ONCHIP;
@@ -412,6 +426,16 @@ static inline void t4_sq_consume(struct t4_wq *wq)
 		wq->sq.cidx = 0;
 }
 
+static inline u16 t4_sq_host_wq_pidx(struct t4_wq *wq)
+{
+	return wq->sq.queue[wq->sq.size].status.host_wq_pidx;
+}
+
+static inline u16 t4_sq_wq_size(struct t4_wq *wq)
+{
+		return wq->sq.size * T4_SQ_NUM_SLOTS;
+}
+
 static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc)
 {
 	wmb();
-- 
1.7.1

^ permalink raw reply related

* [PATCH 08/10] RDMA/cxgb4: Use vmalloc for debugfs qp dump. Allows dumping thousands of qps.
From: Vipul Pandya @ 2011-10-19 17:11 UTC (permalink / raw)
  To: linux-rdma, netdev; +Cc: roland, davem, divy, dm, kumaras, swise, Vipul Pandya
In-Reply-To: <1319044264-779-1-git-send-email-vipul@chelsio.com>

	Log active open failures of interest.

Signed-off-by: Vipul Pandya <vipul@chelsio.com>
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
---
 drivers/infiniband/hw/cxgb4/cm.c     |   18 ++++++++++++++++++
 drivers/infiniband/hw/cxgb4/device.c |    4 ++--
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index c51818a..3d87d02 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -1411,6 +1411,24 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
 		return 0;
 	}
 
+	/*
+	 * Log interesting failures.
+	 */
+	switch (status) {
+	case CPL_ERR_CONN_RESET:
+	case CPL_ERR_CONN_TIMEDOUT:
+		break;
+	default:
+		printk(KERN_INFO MOD "Active open failure - "
+		       "atid %u status %u errno %d %pI4:%u->%pI4:%u\n",
+		       atid, status, status2errno(status),
+		       &ep->com.local_addr.sin_addr.s_addr,
+		       ntohs(ep->com.local_addr.sin_port),
+		       &ep->com.remote_addr.sin_addr.s_addr,
+		       ntohs(ep->com.remote_addr.sin_port));
+		break;
+	}
+
 	connect_reply_upcall(ep, status2errno(status));
 	state_set(&ep->com, DEAD);
 
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index bdb398f..8545629 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -121,7 +121,7 @@ static int qp_release(struct inode *inode, struct file *file)
 		printk(KERN_INFO "%s null qpd?\n", __func__);
 		return 0;
 	}
-	kfree(qpd->buf);
+	vfree(qpd->buf);
 	kfree(qpd);
 	return 0;
 }
@@ -145,7 +145,7 @@ static int qp_open(struct inode *inode, struct file *file)
 	spin_unlock_irq(&qpd->devp->lock);
 
 	qpd->bufsize = count * 128;
-	qpd->buf = kmalloc(qpd->bufsize, GFP_KERNEL);
+	qpd->buf = vmalloc(qpd->bufsize);
 	if (!qpd->buf) {
 		ret = -ENOMEM;
 		goto err1;
-- 
1.7.1

^ permalink raw reply related

* [PATCH 09/10] RDMA/cxgb4: remove kfifo usage
From: Vipul Pandya @ 2011-10-19 17:11 UTC (permalink / raw)
  To: linux-rdma, netdev; +Cc: roland, davem, divy, dm, kumaras, swise, Vipul Pandya
In-Reply-To: <1319044264-779-1-git-send-email-vipul@chelsio.com>

	Using kfifos for ID management was limiting the number of QPs and
	preventing NP384 MPI jobs.  So replace it with a simple bitmap
	allocator.

	Remove IDs from the IDR tables before deallocating them. This bug was
	causing the BUG_ON() in insert_handle() to fire because the ID was
	getting reused before being removed from the IDR table.

Signed-off-by: Vipul Pandya <vipul@chelsio.com>
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
---
 drivers/infiniband/hw/cxgb4/Makefile   |    2 +-
 drivers/infiniband/hw/cxgb4/device.c   |   37 +++++---
 drivers/infiniband/hw/cxgb4/id_table.c |  112 ++++++++++++++++++++++++
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h |   35 ++++++--
 drivers/infiniband/hw/cxgb4/mem.c      |   10 +--
 drivers/infiniband/hw/cxgb4/provider.c |    9 +--
 drivers/infiniband/hw/cxgb4/resource.c |  148 ++++++++------------------------
 7 files changed, 203 insertions(+), 150 deletions(-)
 create mode 100644 drivers/infiniband/hw/cxgb4/id_table.c

diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile
index 46b878c..e11cf72 100644
--- a/drivers/infiniband/hw/cxgb4/Makefile
+++ b/drivers/infiniband/hw/cxgb4/Makefile
@@ -2,4 +2,4 @@ ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4
 
 obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o
 
-iw_cxgb4-y :=  device.o cm.o provider.o mem.o cq.o qp.o resource.o ev.o
+iw_cxgb4-y :=  device.o cm.o provider.o mem.o cq.o qp.o resource.o ev.o id_table.o
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 8545629..c8fd1d8 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -252,25 +252,26 @@ static int stats_show(struct seq_file *seq, void *v)
 {
 	struct c4iw_dev *dev = seq->private;
 
-	seq_printf(seq, " Object: %10s %10s %10s\n", "Total", "Current", "Max");
-	seq_printf(seq, "     PDID: %10llu %10llu %10llu\n",
+	seq_printf(seq, "   Object: %10s %10s %10s %10s\n", "Total", "Current",
+		   "Max", "Fail");
+	seq_printf(seq, "     PDID: %10llu %10llu %10llu %10llu\n",
 			dev->rdev.stats.pd.total, dev->rdev.stats.pd.cur,
-			dev->rdev.stats.pd.max);
-	seq_printf(seq, "      QID: %10llu %10llu %10llu\n",
+			dev->rdev.stats.pd.max, dev->rdev.stats.pd.fail);
+	seq_printf(seq, "      QID: %10llu %10llu %10llu %10llu\n",
 			dev->rdev.stats.qid.total, dev->rdev.stats.qid.cur,
-			dev->rdev.stats.qid.max);
-	seq_printf(seq, "   TPTMEM: %10llu %10llu %10llu\n",
+			dev->rdev.stats.qid.max, dev->rdev.stats.qid.fail);
+	seq_printf(seq, "   TPTMEM: %10llu %10llu %10llu %10llu\n",
 			dev->rdev.stats.stag.total, dev->rdev.stats.stag.cur,
-			dev->rdev.stats.stag.max);
-	seq_printf(seq, "   PBLMEM: %10llu %10llu %10llu\n",
+			dev->rdev.stats.stag.max, dev->rdev.stats.stag.fail);
+	seq_printf(seq, "   PBLMEM: %10llu %10llu %10llu %10llu\n",
 			dev->rdev.stats.pbl.total, dev->rdev.stats.pbl.cur,
-			dev->rdev.stats.pbl.max);
-	seq_printf(seq, "   RQTMEM: %10llu %10llu %10llu\n",
+			dev->rdev.stats.pbl.max, dev->rdev.stats.pbl.fail);
+	seq_printf(seq, "   RQTMEM: %10llu %10llu %10llu %10llu\n",
 			dev->rdev.stats.rqt.total, dev->rdev.stats.rqt.cur,
-			dev->rdev.stats.rqt.max);
-	seq_printf(seq, "  OCQPMEM: %10llu %10llu %10llu\n",
+			dev->rdev.stats.rqt.max, dev->rdev.stats.rqt.fail);
+	seq_printf(seq, "  OCQPMEM: %10llu %10llu %10llu %10llu\n",
 			dev->rdev.stats.ocqp.total, dev->rdev.stats.ocqp.cur,
-			dev->rdev.stats.ocqp.max);
+			dev->rdev.stats.ocqp.max, dev->rdev.stats.ocqp.fail);
 	seq_printf(seq, "  DB FULL: %10llu\n", dev->rdev.stats.db_full);
 	seq_printf(seq, " DB EMPTY: %10llu\n", dev->rdev.stats.db_empty);
 	seq_printf(seq, "  DB DROP: %10llu\n", dev->rdev.stats.db_drop);
@@ -292,11 +293,17 @@ static ssize_t stats_clear(struct file *file, const char __user *buf,
 
 	mutex_lock(&dev->rdev.stats.lock);
 	dev->rdev.stats.pd.max = 0;
+	dev->rdev.stats.pd.fail = 0;
 	dev->rdev.stats.qid.max = 0;
+	dev->rdev.stats.qid.fail = 0;
 	dev->rdev.stats.stag.max = 0;
+	dev->rdev.stats.stag.fail = 0;
 	dev->rdev.stats.pbl.max = 0;
+	dev->rdev.stats.pbl.fail = 0;
 	dev->rdev.stats.rqt.max = 0;
+	dev->rdev.stats.rqt.fail = 0;
 	dev->rdev.stats.ocqp.max = 0;
+	dev->rdev.stats.ocqp.fail = 0;
 	dev->rdev.stats.db_full = 0;
 	dev->rdev.stats.db_empty = 0;
 	dev->rdev.stats.db_drop = 0;
@@ -350,8 +357,8 @@ void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
 		entry = list_entry(pos, struct c4iw_qid_list, entry);
 		list_del_init(&entry->entry);
 		if (!(entry->qid & rdev->qpmask)) {
-			c4iw_put_resource(&rdev->resource.qid_fifo, entry->qid,
-					&rdev->resource.qid_fifo_lock);
+			c4iw_put_resource(&rdev->resource.qid_table,
+					  entry->qid);
 			mutex_lock(&rdev->stats.lock);
 			rdev->stats.qid.cur -= rdev->qpmask + 1;
 			mutex_unlock(&rdev->stats.lock);
diff --git a/drivers/infiniband/hw/cxgb4/id_table.c b/drivers/infiniband/hw/cxgb4/id_table.c
new file mode 100644
index 0000000..f95e5df
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/id_table.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2011 Chelsio Communications.  All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include "iw_cxgb4.h"
+
+#define RANDOM_SKIP 16
+
+/*
+ * Trivial bitmap-based allocator. If the random flag is set, the
+ * allocator is designed to:
+ * - pseudo-randomize the id returned such that it is not trivially predictable.
+ * - avoid reuse of recently used id (at the expense of predictability)
+ */
+u32 c4iw_id_alloc(struct c4iw_id_table *alloc)
+{
+	unsigned long flags;
+	u32 obj;
+
+	spin_lock_irqsave(&alloc->lock, flags);
+
+	obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last);
+	if (obj >= alloc->max)
+		obj = find_first_zero_bit(alloc->table, alloc->max);
+
+	if (obj < alloc->max) {
+		if (alloc->flags & C4IW_ID_TABLE_F_RANDOM)
+			alloc->last += random32() % RANDOM_SKIP;
+		else
+			alloc->last = obj + 1;
+		if (alloc->last >= alloc->max)
+			alloc->last = 0;
+		set_bit(obj, alloc->table);
+		obj += alloc->start;
+	} else
+		obj = -1;
+
+	spin_unlock_irqrestore(&alloc->lock, flags);
+	return obj;
+}
+
+void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj)
+{
+	unsigned long flags;
+
+	obj -= alloc->start;
+	BUG_ON((int)obj < 0);
+
+	spin_lock_irqsave(&alloc->lock, flags);
+	clear_bit(obj, alloc->table);
+	spin_unlock_irqrestore(&alloc->lock, flags);
+}
+
+int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num,
+			u32 reserved, u32 flags)
+{
+	int i;
+
+	alloc->start = start;
+	alloc->flags = flags;
+	if (flags & C4IW_ID_TABLE_F_RANDOM)
+		alloc->last = random32() % RANDOM_SKIP;
+	else
+		alloc->last = 0;
+	alloc->max  = num;
+	spin_lock_init(&alloc->lock);
+	alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof(long),
+				GFP_KERNEL);
+	if (!alloc->table)
+		return -ENOMEM;
+
+	bitmap_zero(alloc->table, num);
+	if (!(alloc->flags & C4IW_ID_TABLE_F_EMPTY))
+		for (i = 0; i < reserved; ++i)
+			set_bit(i, alloc->table);
+
+	return 0;
+}
+
+void c4iw_id_table_free(struct c4iw_id_table *alloc)
+{
+	kfree(alloc->table);
+}
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 75d643c..9e26b76 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -45,7 +45,6 @@
 #include <linux/kref.h>
 #include <linux/timer.h>
 #include <linux/io.h>
-#include <linux/kfifo.h>
 
 #include <asm/byteorder.h>
 
@@ -79,13 +78,22 @@ static inline void *cplhdr(struct sk_buff *skb)
 	return skb->data;
 }
 
+#define C4IW_ID_TABLE_F_RANDOM 1       /* Pseudo-randomize the id's returned */
+#define C4IW_ID_TABLE_F_EMPTY  2       /* Table is initially empty */
+
+struct c4iw_id_table {
+	u32 flags;
+	u32 start;              /* logical minimal id */
+	u32 last;               /* hint for find */
+	u32 max;
+	spinlock_t lock;
+	unsigned long *table;
+};
+
 struct c4iw_resource {
-	struct kfifo tpt_fifo;
-	spinlock_t tpt_fifo_lock;
-	struct kfifo qid_fifo;
-	spinlock_t qid_fifo_lock;
-	struct kfifo pdid_fifo;
-	spinlock_t pdid_fifo_lock;
+	struct c4iw_id_table tpt_table;
+	struct c4iw_id_table qid_table;
+	struct c4iw_id_table pdid_table;
 };
 
 struct c4iw_qid_list {
@@ -107,6 +115,7 @@ struct c4iw_stat {
 	u64 total;
 	u64 cur;
 	u64 max;
+	u64 fail;
 };
 
 struct c4iw_stats {
@@ -253,7 +262,7 @@ static inline int _insert_handle(struct c4iw_dev *rhp, struct idr *idr,
 		if (lock)
 			spin_lock_irq(&rhp->lock);
 		ret = idr_get_new_above(idr, handle, id, &newid);
-		BUG_ON(newid != id);
+		BUG_ON(!ret && newid != id);
 		if (lock)
 			spin_unlock_irq(&rhp->lock);
 	} while (ret == -EAGAIN);
@@ -754,14 +763,20 @@ static inline int compute_wscale(int win)
 	return wscale;
 }
 
+u32 c4iw_id_alloc(struct c4iw_id_table *alloc);
+void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj);
+int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num,
+			u32 reserved, u32 flags);
+void c4iw_id_table_free(struct c4iw_id_table *alloc);
+
 typedef int (*c4iw_handler_func)(struct c4iw_dev *dev, struct sk_buff *skb);
 
 int c4iw_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new,
 		     struct l2t_entry *l2t);
 void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qpid,
 		   struct c4iw_dev_ucontext *uctx);
-u32 c4iw_get_resource(struct kfifo *fifo, spinlock_t *lock);
-void c4iw_put_resource(struct kfifo *fifo, u32 entry, spinlock_t *lock);
+u32 c4iw_get_resource(struct c4iw_id_table *id_table);
+void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry);
 int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid);
 int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev);
 int c4iw_pblpool_create(struct c4iw_rdev *rdev);
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
index 2a87379..57e07c6 100644
--- a/drivers/infiniband/hw/cxgb4/mem.c
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -131,8 +131,7 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
 	stag_idx = (*stag) >> 8;
 
 	if ((!reset_tpt_entry) && (*stag == T4_STAG_UNSET)) {
-		stag_idx = c4iw_get_resource(&rdev->resource.tpt_fifo,
-					     &rdev->resource.tpt_fifo_lock);
+		stag_idx = c4iw_get_resource(&rdev->resource.tpt_table);
 		if (!stag_idx)
 			return -ENOMEM;
 		mutex_lock(&rdev->stats.lock);
@@ -171,8 +170,7 @@ static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
 				sizeof(tpt), &tpt);
 
 	if (reset_tpt_entry) {
-		c4iw_put_resource(&rdev->resource.tpt_fifo, stag_idx,
-				  &rdev->resource.tpt_fifo_lock);
+		c4iw_put_resource(&rdev->resource.tpt_table, stag_idx);
 		mutex_lock(&rdev->stats.lock);
 		rdev->stats.stag.cur -= 32;
 		mutex_unlock(&rdev->stats.lock);
@@ -695,8 +693,8 @@ int c4iw_dealloc_mw(struct ib_mw *mw)
 	mhp = to_c4iw_mw(mw);
 	rhp = mhp->rhp;
 	mmid = (mw->rkey) >> 8;
-	deallocate_window(&rhp->rdev, mhp->attr.stag);
 	remove_handle(rhp, &rhp->mmidr, mmid);
+	deallocate_window(&rhp->rdev, mhp->attr.stag);
 	kfree(mhp);
 	PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp);
 	return 0;
@@ -798,12 +796,12 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr)
 	mhp = to_c4iw_mr(ib_mr);
 	rhp = mhp->rhp;
 	mmid = mhp->attr.stag >> 8;
+	remove_handle(rhp, &rhp->mmidr, mmid);
 	dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
 		       mhp->attr.pbl_addr);
 	if (mhp->attr.pbl_size)
 		c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
 				  mhp->attr.pbl_size << 3);
-	remove_handle(rhp, &rhp->mmidr, mmid);
 	if (mhp->kva)
 		kfree((void *) (unsigned long) mhp->kva);
 	if (mhp->umem)
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index c2554ef..db1debc 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -188,8 +188,7 @@ static int c4iw_deallocate_pd(struct ib_pd *pd)
 	php = to_c4iw_pd(pd);
 	rhp = php->rhp;
 	PDBG("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid);
-	c4iw_put_resource(&rhp->rdev.resource.pdid_fifo, php->pdid,
-			  &rhp->rdev.resource.pdid_fifo_lock);
+	c4iw_put_resource(&rhp->rdev.resource.pdid_table, php->pdid);
 	mutex_lock(&rhp->rdev.stats.lock);
 	rhp->rdev.stats.pd.cur--;
 	mutex_unlock(&rhp->rdev.stats.lock);
@@ -207,14 +206,12 @@ static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
 
 	PDBG("%s ibdev %p\n", __func__, ibdev);
 	rhp = (struct c4iw_dev *) ibdev;
-	pdid =  c4iw_get_resource(&rhp->rdev.resource.pdid_fifo,
-				  &rhp->rdev.resource.pdid_fifo_lock);
+	pdid =  c4iw_get_resource(&rhp->rdev.resource.pdid_table);
 	if (!pdid)
 		return ERR_PTR(-EINVAL);
 	php = kzalloc(sizeof(*php), GFP_KERNEL);
 	if (!php) {
-		c4iw_put_resource(&rhp->rdev.resource.pdid_fifo, pdid,
-				  &rhp->rdev.resource.pdid_fifo_lock);
+		c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid);
 		return ERR_PTR(-ENOMEM);
 	}
 	php->pdid = pdid;
diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c
index 1b948d1..cdef4d7 100644
--- a/drivers/infiniband/hw/cxgb4/resource.c
+++ b/drivers/infiniband/hw/cxgb4/resource.c
@@ -30,96 +30,25 @@
  * SOFTWARE.
  */
 /* Crude resource management */
-#include <linux/kernel.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <linux/kfifo.h>
 #include <linux/spinlock.h>
-#include <linux/errno.h>
 #include <linux/genalloc.h>
 #include <linux/ratelimit.h>
 #include "iw_cxgb4.h"
 
-#define RANDOM_SIZE 16
-
-static int __c4iw_init_resource_fifo(struct kfifo *fifo,
-				   spinlock_t *fifo_lock,
-				   u32 nr, u32 skip_low,
-				   u32 skip_high,
-				   int random)
-{
-	u32 i, j, entry = 0, idx;
-	u32 random_bytes;
-	u32 rarray[16];
-	spin_lock_init(fifo_lock);
-
-	if (kfifo_alloc(fifo, nr * sizeof(u32), GFP_KERNEL))
-		return -ENOMEM;
-
-	for (i = 0; i < skip_low + skip_high; i++)
-		kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32));
-	if (random) {
-		j = 0;
-		random_bytes = random32();
-		for (i = 0; i < RANDOM_SIZE; i++)
-			rarray[i] = i + skip_low;
-		for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) {
-			if (j >= RANDOM_SIZE) {
-				j = 0;
-				random_bytes = random32();
-			}
-			idx = (random_bytes >> (j * 2)) & 0xF;
-			kfifo_in(fifo,
-				(unsigned char *) &rarray[idx],
-				sizeof(u32));
-			rarray[idx] = i;
-			j++;
-		}
-		for (i = 0; i < RANDOM_SIZE; i++)
-			kfifo_in(fifo,
-				(unsigned char *) &rarray[i],
-				sizeof(u32));
-	} else
-		for (i = skip_low; i < nr - skip_high; i++)
-			kfifo_in(fifo, (unsigned char *) &i, sizeof(u32));
-
-	for (i = 0; i < skip_low + skip_high; i++)
-		if (kfifo_out_locked(fifo, (unsigned char *) &entry,
-				     sizeof(u32), fifo_lock))
-			break;
-	return 0;
-}
-
-static int c4iw_init_resource_fifo(struct kfifo *fifo, spinlock_t * fifo_lock,
-				   u32 nr, u32 skip_low, u32 skip_high)
-{
-	return __c4iw_init_resource_fifo(fifo, fifo_lock, nr, skip_low,
-					  skip_high, 0);
-}
-
-static int c4iw_init_resource_fifo_random(struct kfifo *fifo,
-				   spinlock_t *fifo_lock,
-				   u32 nr, u32 skip_low, u32 skip_high)
-{
-	return __c4iw_init_resource_fifo(fifo, fifo_lock, nr, skip_low,
-					  skip_high, 1);
-}
-
-static int c4iw_init_qid_fifo(struct c4iw_rdev *rdev)
+static int c4iw_init_qid_table(struct c4iw_rdev *rdev)
 {
 	u32 i;
 
-	spin_lock_init(&rdev->resource.qid_fifo_lock);
-
-	if (kfifo_alloc(&rdev->resource.qid_fifo, rdev->lldi.vr->qp.size *
-			sizeof(u32), GFP_KERNEL))
+	if (c4iw_id_table_alloc(&rdev->resource.qid_table,
+				rdev->lldi.vr->qp.start,
+				rdev->lldi.vr->qp.size,
+				rdev->lldi.vr->qp.size, 0))
 		return -ENOMEM;
 
 	for (i = rdev->lldi.vr->qp.start;
-	     i < rdev->lldi.vr->qp.start + rdev->lldi.vr->qp.size; i++)
+		i < rdev->lldi.vr->qp.start + rdev->lldi.vr->qp.size; i++)
 		if (!(i & rdev->qpmask))
-			kfifo_in(&rdev->resource.qid_fifo,
-				    (unsigned char *) &i, sizeof(u32));
+			c4iw_id_free(&rdev->resource.qid_table, i);
 	return 0;
 }
 
@@ -127,44 +56,42 @@ static int c4iw_init_qid_fifo(struct c4iw_rdev *rdev)
 int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid)
 {
 	int err = 0;
-	err = c4iw_init_resource_fifo_random(&rdev->resource.tpt_fifo,
-					     &rdev->resource.tpt_fifo_lock,
-					     nr_tpt, 1, 0);
+	err = c4iw_id_table_alloc(&rdev->resource.tpt_table, 0, nr_tpt, 1,
+					C4IW_ID_TABLE_F_RANDOM);
 	if (err)
 		goto tpt_err;
-	err = c4iw_init_qid_fifo(rdev);
+	err = c4iw_init_qid_table(rdev);
 	if (err)
 		goto qid_err;
-	err = c4iw_init_resource_fifo(&rdev->resource.pdid_fifo,
-				      &rdev->resource.pdid_fifo_lock,
-				      nr_pdid, 1, 0);
+	err = c4iw_id_table_alloc(&rdev->resource.pdid_table, 0,
+					nr_pdid, 1, 0);
 	if (err)
 		goto pdid_err;
 	return 0;
-pdid_err:
-	kfifo_free(&rdev->resource.qid_fifo);
-qid_err:
-	kfifo_free(&rdev->resource.tpt_fifo);
-tpt_err:
+ pdid_err:
+	c4iw_id_table_free(&rdev->resource.qid_table);
+ qid_err:
+	c4iw_id_table_free(&rdev->resource.tpt_table);
+ tpt_err:
 	return -ENOMEM;
 }
 
 /*
  * returns 0 if no resource available
  */
-u32 c4iw_get_resource(struct kfifo *fifo, spinlock_t *lock)
+u32 c4iw_get_resource(struct c4iw_id_table *id_table)
 {
 	u32 entry;
-	if (kfifo_out_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock))
-		return entry;
-	else
+	entry = c4iw_id_alloc(id_table);
+	if (entry == (u32)(-1))
 		return 0;
+	return entry;
 }
 
-void c4iw_put_resource(struct kfifo *fifo, u32 entry, spinlock_t *lock)
+void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry)
 {
 	PDBG("%s entry 0x%x\n", __func__, entry);
-	kfifo_in_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock);
+	c4iw_id_free(id_table, entry);
 }
 
 u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
@@ -181,8 +108,7 @@ u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
 		qid = entry->qid;
 		kfree(entry);
 	} else {
-		qid = c4iw_get_resource(&rdev->resource.qid_fifo,
-					&rdev->resource.qid_fifo_lock);
+		qid = c4iw_get_resource(&rdev->resource.qid_table);
 		if (!qid)
 			goto out;
 		mutex_lock(&rdev->stats.lock);
@@ -252,8 +178,7 @@ u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
 		qid = entry->qid;
 		kfree(entry);
 	} else {
-		qid = c4iw_get_resource(&rdev->resource.qid_fifo,
-					&rdev->resource.qid_fifo_lock);
+		qid = c4iw_get_resource(&rdev->resource.qid_table);
 		if (!qid)
 			goto out;
 		mutex_lock(&rdev->stats.lock);
@@ -311,9 +236,9 @@ void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid,
 
 void c4iw_destroy_resource(struct c4iw_resource *rscp)
 {
-	kfifo_free(&rscp->tpt_fifo);
-	kfifo_free(&rscp->qid_fifo);
-	kfifo_free(&rscp->pdid_fifo);
+	c4iw_id_table_free(&rscp->tpt_table);
+	c4iw_id_table_free(&rscp->qid_table);
+	c4iw_id_table_free(&rscp->pdid_table);
 }
 
 /*
@@ -326,16 +251,14 @@ u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size)
 {
 	unsigned long addr = gen_pool_alloc(rdev->pbl_pool, size);
 	PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size);
-	if (!addr)
-		printk_ratelimited(KERN_WARNING MOD "%s: Out of PBL memory\n",
-		       pci_name(rdev->lldi.pdev));
+	mutex_lock(&rdev->stats.lock);
 	if (addr) {
-		mutex_lock(&rdev->stats.lock);
 		rdev->stats.pbl.cur += roundup(size, 1 << MIN_PBL_SHIFT);
 		if (rdev->stats.pbl.cur > rdev->stats.pbl.max)
 			rdev->stats.pbl.max = rdev->stats.pbl.cur;
-		mutex_unlock(&rdev->stats.lock);
-	}
+	} else
+		rdev->stats.pbl.fail++;
+	mutex_unlock(&rdev->stats.lock);
 	return (u32)addr;
 }
 
@@ -401,13 +324,14 @@ u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size)
 	if (!addr)
 		printk_ratelimited(KERN_WARNING MOD "%s: Out of RQT memory\n",
 		       pci_name(rdev->lldi.pdev));
+	mutex_lock(&rdev->stats.lock);
 	if (addr) {
-		mutex_lock(&rdev->stats.lock);
 		rdev->stats.rqt.cur += roundup(size << 6, 1 << MIN_RQT_SHIFT);
 		if (rdev->stats.rqt.cur > rdev->stats.rqt.max)
 			rdev->stats.rqt.max = rdev->stats.rqt.cur;
-		mutex_unlock(&rdev->stats.lock);
-	}
+	} else
+		rdev->stats.rqt.fail++;
+	mutex_unlock(&rdev->stats.lock);
 	return (u32)addr;
 }
 
-- 
1.7.1

^ permalink raw reply related

* [PATCH 10/10] RDMA/cxgb4: Add query_qp support in driver to query the qp state before flushing.
From: Vipul Pandya @ 2011-10-19 17:11 UTC (permalink / raw)
  To: linux-rdma, netdev; +Cc: roland, davem, divy, dm, kumaras, swise, Vipul Pandya
In-Reply-To: <1319044264-779-1-git-send-email-vipul@chelsio.com>

Signed-off-by: Vipul Pandya <vipul@chelsio.com>
Signed-off-by: Steve Wise <swise@opengridcomputing.com>
---
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h |   19 +++++++++++++++++++
 drivers/infiniband/hw/cxgb4/provider.c |    2 ++
 drivers/infiniband/hw/cxgb4/qp.c       |   11 +++++++++++
 3 files changed, 32 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 9e26b76..8aa1744 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -550,6 +550,23 @@ static inline int c4iw_convert_state(enum ib_qp_state ib_state)
 	}
 }
 
+static inline int to_ib_qp_state(int c4iw_qp_state)
+{
+	switch (c4iw_qp_state) {
+	case C4IW_QP_STATE_IDLE:
+		return IB_QPS_INIT;
+	case C4IW_QP_STATE_RTS:
+		return IB_QPS_RTS;
+	case C4IW_QP_STATE_CLOSING:
+		return IB_QPS_SQD;
+	case C4IW_QP_STATE_TERMINATE:
+		return IB_QPS_SQE;
+	case C4IW_QP_STATE_ERROR:
+		return IB_QPS_ERR;
+	}
+	return IB_QPS_ERR;
+}
+
 static inline u32 c4iw_ib_to_tpt_access(int a)
 {
 	return (a & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) |
@@ -845,6 +862,8 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd,
 			     struct ib_udata *udata);
 int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 				 int attr_mask, struct ib_udata *udata);
+int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		     int attr_mask, struct ib_qp_init_attr *init_attr);
 struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn);
 u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size);
 void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size);
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index db1debc..3cb1128 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -443,6 +443,7 @@ int c4iw_register_device(struct c4iw_dev *dev)
 	    (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
 	    (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
 	    (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+	    (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
 	    (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
 	    (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
 	    (1ull << IB_USER_VERBS_CMD_POST_SEND) |
@@ -465,6 +466,7 @@ int c4iw_register_device(struct c4iw_dev *dev)
 	dev->ibdev.destroy_ah = c4iw_ah_destroy;
 	dev->ibdev.create_qp = c4iw_create_qp;
 	dev->ibdev.modify_qp = c4iw_ib_modify_qp;
+	dev->ibdev.query_qp = c4iw_ib_query_qp;
 	dev->ibdev.destroy_qp = c4iw_destroy_qp;
 	dev->ibdev.create_cq = c4iw_create_cq;
 	dev->ibdev.destroy_cq = c4iw_destroy_cq;
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 215b66a..a7b8713 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -1696,3 +1696,14 @@ struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn)
 	PDBG("%s ib_dev %p qpn 0x%x\n", __func__, dev, qpn);
 	return (struct ib_qp *)get_qhp(to_c4iw_dev(dev), qpn);
 }
+
+int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+		     int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+	struct c4iw_qp *qhp = to_c4iw_qp(ibqp);
+
+	memset(attr, 0, sizeof *attr);
+	memset(init_attr, 0, sizeof *init_attr);
+	attr->qp_state = to_ib_qp_state(qhp->attr.state);
+	return 0;
+}
-- 
1.7.1

^ permalink raw reply related

* Re: [PATCH] Disable TCP_DEBUG and FASTRETRANS_DEBUG by default
From: Flavio Leitner @ 2011-10-19 17:16 UTC (permalink / raw)
  To: David Miller
  Cc: dpmcgee, netdev, kuznet, jmorris, yoshfuji, kaber, linux-kernel
In-Reply-To: <20111017.175238.18474279385644215.davem@davemloft.net>

On Mon, 17 Oct 2011 17:52:38 -0400 (EDT)
David Miller <davem@davemloft.net> wrote:

> From: Dan McGee <dpmcgee@gmail.com>
> Date: Mon, 17 Oct 2011 15:25:24 -0500
> 
> > If these are truly debug options, they should be turned off by default
> > and can be tweaked if necessary. Fix one usage of the flag to use #if
> > instead of #ifdef so defining to zero is acceptable.
> > 
> > Signed-off-by: Dan McGee <dpmcgee@gmail.com>
> 
> Illegal window shrinks are a serious issue, and the fact that everyone
> will see those messages by default and sometimes report them has been
> tremendously useful.
>

Agreed. I recently got bug report because of that message.  It is useful
and doesn't disturbe when network is fine, so please don't remove it.

fbl

^ permalink raw reply

* EMAIL VERIFICATION.
From: Help Desk @ 2011-10-19 18:25 UTC (permalink / raw)


--
CLICK REPLY BEFORE FILLING DETAILS

Attention:

An Attempt has been made to login from a new computer. For the security of
your account, we are poised to open a query. Kindly verify your login
details by responding to this email and providing your Username/ID {_______}
Password {_______} Alternate Password {_______} in the spaces.

Do not ignore this message to avoid termination of your webmail account.

^ permalink raw reply

* Re: [patch net-next-2.6] net: introduce ethernet teaming device
From: Benjamin Poirier @ 2011-10-19 17:26 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, eric.dumazet, bhutchings, shemminger, fubar, andy,
	tgraf, ebiederm, mirqus, kaber, greearb, jesse
In-Reply-To: <1317737703-19457-1-git-send-email-jpirko@redhat.com>

Hi Jiri, just a few late comments:

On 11/10/04 16:15, Jiri Pirko wrote:
> This patch introduces new network device called team. It supposes to be
> very fast, simple, userspace-driven alternative to existing bonding
> driver.
> 
> Userspace library called libteam with couple of demo apps is available
> here:
> https://github.com/jpirko/libteam
> Note it's still in its dipers atm.
> 
> team<->libteam use generic netlink for communication. That and rtnl
> suppose to be the only way to configure team device, no sysfs etc.
> 
> In near future python binding for libteam will be introduced. Also
> daemon providing arpmon/miimon active-backup functionality will
> be introduced. All what's necessary is already implemented in kernel team
> driver.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>

[...]

> +/******************************
> + * Round-robin mode definition
> + ******************************/
> +
> +static struct team_port *__get_first_port_up(struct team *team,
> +					     struct team_port *port)

This is more like __get_"next"_port_up() no?

> +{
> +	struct team_port *cur;
> +
> +	if (port->linkup)
> +		return port;
> +	cur = port;
> +	list_for_each_entry_continue_rcu(cur, &team->port_list, list)
> +		if (cur->linkup)
> +			return cur;
> +	list_for_each_entry_rcu(cur, &team->port_list, list) {
> +		if (cur == port)
> +			break;
> +		if (cur->linkup)
> +			return cur;
> +	}
> +	return NULL;
> +}
> +

[...]

> +
> +
> +/****************
> + * Mode handling
> + ****************/
> +
> +static const struct team_mode *team_modes[] = {
> +	&rr_mode,
> +	&ab_mode,
> +};
> +
> +static const int team_mode_count = ARRAY_SIZE(team_modes);
> +
> +static int team_find_mode(const char *kind)
> +{
> +	int i;
> +
> +	for (i = 0; i < team_mode_count; i++) {
> +		const struct team_mode *mode = team_modes[i];
> +
> +		if (strcmp(mode->kind, kind) == 0)
> +			return i;
> +	}
> +	return -ENOENT;
> +}
> +
> +/*
> + * We can benefit from the fact that it's ensured no port is present
> + * at the time of mode change.
> + */
> +static void __team_change_mode(struct team *team, const int mode_index)
> +{
> +	const struct team_mode *mode = team_modes[mode_index];

team_uninit() calls __team_change_mode(team, -1) which will therefore
dereference team_modes[-1]. Is this always safe?

-Ben

^ permalink raw reply

* Linux Kernel Development Training
From: Eduardo Panisset @ 2011-10-19 17:27 UTC (permalink / raw)
  To: netdev

Hi Guys,

Sorry if this is not the most appropriated place to ask, but on other
hand here is the hot stop for all real network development and what
I'm looking for has a lot of to do with it.
I would like to ask you guys where I could find a amazing training
about Linux Kernel Development, Linux Device Driver Development and in
particular Linux Network Development, including wifi 802.11 stack
implementation.
I'm a software engineer at Nokia Institute of Technology/Brazil and
here we have already touched the linux kernel code, making some
modifications on the IP/IPv6 layer and also reporting bugs to this
list but I would like to acquire a more consistent and organized
knowlege about all those things.
Next year we will be involved with a great deal of this kind of
development on our next projects.

Thanks in Advance,
Eduardo Panisset.

^ permalink raw reply

* Re: [patch net-next-2.6] net: introduce ethernet teaming device
From: Jiri Pirko @ 2011-10-19 17:39 UTC (permalink / raw)
  To: Benjamin Poirier
  Cc: netdev, davem, eric.dumazet, bhutchings, shemminger, fubar, andy,
	tgraf, ebiederm, mirqus, kaber, greearb, jesse
In-Reply-To: <20111019172624.GB21324@synalogic.ca>

Wed, Oct 19, 2011 at 07:26:24PM CEST, benjamin.poirier@gmail.com wrote:
>Hi Jiri, just a few late comments:
>
>On 11/10/04 16:15, Jiri Pirko wrote:
>> This patch introduces new network device called team. It supposes to be
>> very fast, simple, userspace-driven alternative to existing bonding
>> driver.
>> 
>> Userspace library called libteam with couple of demo apps is available
>> here:
>> https://github.com/jpirko/libteam
>> Note it's still in its dipers atm.
>> 
>> team<->libteam use generic netlink for communication. That and rtnl
>> suppose to be the only way to configure team device, no sysfs etc.
>> 
>> In near future python binding for libteam will be introduced. Also
>> daemon providing arpmon/miimon active-backup functionality will
>> be introduced. All what's necessary is already implemented in kernel team
>> driver.
>> 
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>
>[...]
>
>> +/******************************
>> + * Round-robin mode definition
>> + ******************************/
>> +
>> +static struct team_port *__get_first_port_up(struct team *team,
>> +					     struct team_port *port)
>
>This is more like __get_"next"_port_up() no?

Might be.

>
>> +{
>> +	struct team_port *cur;
>> +
>> +	if (port->linkup)
>> +		return port;
>> +	cur = port;
>> +	list_for_each_entry_continue_rcu(cur, &team->port_list, list)
>> +		if (cur->linkup)
>> +			return cur;
>> +	list_for_each_entry_rcu(cur, &team->port_list, list) {
>> +		if (cur == port)
>> +			break;
>> +		if (cur->linkup)
>> +			return cur;
>> +	}
>> +	return NULL;
>> +}
>> +
>
>[...]
>
>> +
>> +
>> +/****************
>> + * Mode handling
>> + ****************/
>> +
>> +static const struct team_mode *team_modes[] = {
>> +	&rr_mode,
>> +	&ab_mode,
>> +};
>> +
>> +static const int team_mode_count = ARRAY_SIZE(team_modes);
>> +
>> +static int team_find_mode(const char *kind)
>> +{
>> +	int i;
>> +
>> +	for (i = 0; i < team_mode_count; i++) {
>> +		const struct team_mode *mode = team_modes[i];
>> +
>> +		if (strcmp(mode->kind, kind) == 0)
>> +			return i;
>> +	}
>> +	return -ENOENT;
>> +}
>> +
>> +/*
>> + * We can benefit from the fact that it's ensured no port is present
>> + * at the time of mode change.
>> + */
>> +static void __team_change_mode(struct team *team, const int mode_index)
>> +{
>> +	const struct team_mode *mode = team_modes[mode_index];
>
>team_uninit() calls __team_change_mode(team, -1) which will therefore
>dereference team_modes[-1]. Is this always safe?

I changed this bits. New patch is coming soon...

Thanks.

Jirka

>
>-Ben

^ permalink raw reply

* Re: [PATCH net -v2] [BUGFIX] bonding: use local function pointer of bond->recv_probe in bond_handle_frame
From: Jay Vosburgh @ 2011-10-19 17:59 UTC (permalink / raw)
  To: David Miller
  Cc: mitsuo.hayasaka.hu, andy, netdev, linux-kernel, yrl.pp-manager.tt,
	eric.dumazet, xiyou.wangcong
In-Reply-To: <20111019.000311.1490092497677136273.davem@davemloft.net>

David Miller <davem@davemloft.net> wrote:

>From: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
>Date: Thu, 13 Oct 2011 11:04:29 +0900
>
>> The bond->recv_probe is called in bond_handle_frame() when
>> a packet is received, but bond_close() sets it to NULL. So,
>> a panic occurs when both functions work in parallel.
>> 
>> Why this happen:
>> After null pointer check of bond->recv_probe, an sk_buff is
>> duplicated and bond->recv_probe is called in bond_handle_frame.
>> So, a panic occurs when bond_close() is called between the
>> check and call of bond->recv_probe.
>> 
>> Patch:
>> This patch uses a local function pointer of bond->recv_probe
>> in bond_handle_frame(). So, it can avoid the null pointer
>> dereference.
>> 
>> 
>> Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
>> Cc: Jay Vosburgh <fubar@us.ibm.com>
>> Cc: Andy Gospodarek <andy@greyhouse.net>
>> Cc: Eric Dumazet <eric.dumazet@gmail.com>
>> Cc: WANG Cong <xiyou.wangcong@gmail.com>
>
>Bonding folks please review this, thanks.
>

	Looks reasonable.  Even if by some quirk of timing the
recv_probe function ends up being entered after bond_close has
completed, it doesn't look like there is a risk of those functions
misbehaving (because bond_close doesn't deallocate the data structures).

	-J

Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* Re: [PATCH net -v2] [BUGFIX] bonding: use flush_delayed_work_sync in bond_close
From: Jay Vosburgh @ 2011-10-19 18:01 UTC (permalink / raw)
  To: Mitsuo Hayasaka
  Cc: Andy Gospodarek, netdev, linux-kernel, yrl.pp-manager.tt,
	WANG Cong
In-Reply-To: <20111019081757.12455.24788.stgit@ltc219.sdl.hitachi.co.jp>

Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com> wrote:

>The bond_close() calls cancel_delayed_work() to cancel delayed works.
>It, however, cannot cancel works that were already queued in workqueue.
>The bond_open() initializes work->data, and proccess_one_work() refers
>get_work_cwq(work)->wq->flags. The get_work_cwq() returns NULL when
>work->data has been initialized. Thus, a panic occurs.
>
>This patch uses flush_delayed_work_sync() instead of cancel_delayed_work()
>in bond_close(). It cancels delayed timer and waits for work to finish
>execution. So, it can avoid the null pointer dereference due to the
>parallel executions of proccess_one_work() and initializing proccess
>of bond_open().

	I'm setting up to test this.  I have a dim recollection that we
tried this some years ago, and there was a different deadlock that
manifested through the flush path.  Perhaps changes since then have
removed that problem.

	-J

>Signed-off-by: Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com>
>Reviewed-by: WANG Cong <xiyou.wangcong@gmail.com>
>Cc: Jay Vosburgh <fubar@us.ibm.com>
>Cc: Andy Gospodarek <andy@greyhouse.net>
>Cc: WANG Cong <xiyou.wangcong@gmail.com>
>---
>
> drivers/net/bonding/bond_main.c |   10 +++++-----
> 1 files changed, 5 insertions(+), 5 deletions(-)
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index de3d351..a4353f9 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -3504,27 +3504,27 @@ static int bond_close(struct net_device *bond_dev)
> 	write_unlock_bh(&bond->lock);
>
> 	if (bond->params.miimon) {  /* link check interval, in milliseconds. */
>-		cancel_delayed_work(&bond->mii_work);
>+		flush_delayed_work_sync(&bond->mii_work);
> 	}
>
> 	if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
>-		cancel_delayed_work(&bond->arp_work);
>+		flush_delayed_work_sync(&bond->arp_work);
> 	}
>
> 	switch (bond->params.mode) {
> 	case BOND_MODE_8023AD:
>-		cancel_delayed_work(&bond->ad_work);
>+		flush_delayed_work_sync(&bond->ad_work);
> 		break;
> 	case BOND_MODE_TLB:
> 	case BOND_MODE_ALB:
>-		cancel_delayed_work(&bond->alb_work);
>+		flush_delayed_work_sync(&bond->alb_work);
> 		break;
> 	default:
> 		break;
> 	}
>
> 	if (delayed_work_pending(&bond->mcast_work))
>-		cancel_delayed_work(&bond->mcast_work);
>+		flush_delayed_work_sync(&bond->mcast_work);
>
> 	if (bond_is_lb(bond)) {
> 		/* Must be called only after all
>

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* Re: [PATCH] route: fix ICMP redirect validation
From: Flavio Leitner @ 2011-10-19 18:05 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20111017.194344.510280595317217573.davem@davemloft.net>

On Mon, 17 Oct 2011 19:43:44 -0400 (EDT)
David Miller <davem@davemloft.net> wrote:

> From: Flavio Leitner <fbl@redhat.com>
> Date: Wed,  5 Oct 2011 11:20:04 -0300
> 
> > The commit f39925dbde7788cfb96419c0f092b086aa325c0f
> > (ipv4: Cache learned redirect information in inetpeer.)
> > removed some ICMP packet validations which are required by
> > RFC 1122, section 3.2.2.2:
> > ...
> >   A Redirect message SHOULD be silently discarded if the new
> >   gateway address it specifies is not on the same connected
> >   (sub-) net through which the Redirect arrived [INTRO:2,
> >   Appendix A], or if the source of the Redirect is not the
> >   current first-hop gateway for the specified destination (see
> >   Section 3.3.1).
> > 
> > Signed-off-by: Flavio Leitner <fbl@redhat.com>
> 
> The reason for putting this into the inetpeer cache was so that we
> didn't need to consult the routing cache at all.  We're working to
> remove it at some point, so every dependency matters.
> 
> Can you implement this such that only an inetpeer cache probe is
> necessary?
> 

Sure, I have reviewed your patch series to remove the routing
cache and I believe this version works with and without it, though
I have tested only with current net-next code.

Thanks for your time reviewing, I appreciate it.
fbl

Signed-off-by: Flavio Leitner <fbl@redhat.com>
---
 net/ipv4/route.c |   43 ++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 26c77e1..1a639b9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1308,8 +1308,14 @@ static void rt_del(unsigned hash, struct rtable *rt)
 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 		    __be32 saddr, struct net_device *dev)
 {
+	int s, i;
 	struct in_device *in_dev = __in_dev_get_rcu(dev);
+	struct rtable *rt;
+	__be32 skeys[2] = { saddr, 0 };
+	int    ikeys[2] = { dev->ifindex, 0 };
+	struct flowi4 fl4;
 	struct inet_peer *peer;
+	bool   putpeer = false;
 	struct net *net;
 
 	if (!in_dev)
@@ -1331,13 +1337,40 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 			goto reject_redirect;
 	}
 
-	peer = inet_getpeer_v4(daddr, 1);
-	if (peer) {
-		peer->redirect_learned.a4 = new_gw;
+	memset(&fl4, 0, sizeof(fl4));
+	fl4.daddr = daddr;
+	for (s = 0; s < 2; s++) {
+		for (i = 0; i < 2; i++) {
+			fl4.flowi4_oif = ikeys[i];
+			fl4.saddr = skeys[s];
+			rt = __ip_route_output_key(net, &fl4);
+			if (IS_ERR(rt))
+				continue;
 
-		inet_putpeer(peer);
+			if (rt->dst.error || rt->dst.dev != dev ||
+			    rt->rt_gateway != old_gw) {
+				ip_rt_put(rt);
+				continue;
+			}
 
-		atomic_inc(&__rt_peer_genid);
+			peer = rt->peer;
+			if (!peer) {
+				peer = inet_getpeer_v4(daddr, 1);
+				putpeer = true;
+			}
+
+			if (peer) {
+				peer->redirect_learned.a4 = new_gw;
+
+				if (putpeer)
+					inet_putpeer(peer);
+
+				atomic_inc(&__rt_peer_genid);
+			}
+
+			ip_rt_put(rt);
+			return;
+		}
 	}
 	return;
 
-- 
1.7.6

^ permalink raw reply related

* [PATCH net-next] Add ethtool -g support to virtio_net
From: Rick Jones @ 2011-10-19 18:10 UTC (permalink / raw)
  To: netdev, rusty, mst, virtualization

From: Rick Jones <rick.jones2@hp.com>

Add support for reporting ring sizes via ethtool -g to the virtio_net
driver.

Signed-off-by: Rick Jones <rick.jones2@hp.com>

---

Tested briefly in a single guest:

raj@raj-ubuntu-guest:~$ ethtool -g eth0
Ring parameters for eth0:
Pre-set maximums:
RX:		256
RX Mini:	0
RX Jumbo:	0
TX:		256
Current hardware settings:
RX:		256
RX Mini:	0
RX Jumbo:	0
TX:		256


diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b8225f3..3e642a4 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -880,8 +880,21 @@ static void virtnet_vlan_rx_kill_vid(struct net_device *dev, u16 vid)
 		dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
 }
 
+static void virtnet_get_ringparam(struct net_device *dev,
+				struct ethtool_ringparam *ring)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+
+	ring->rx_max_pending = virtqueue_get_vring_size(vi->rvq);
+	ring->tx_max_pending = virtqueue_get_vring_size(vi->svq);
+	ring->rx_pending = ring->rx_max_pending;
+	ring->tx_pending = ring->tx_max_pending;
+
+}
+
 static const struct ethtool_ops virtnet_ethtool_ops = {
 	.get_link = ethtool_op_get_link,
+	.get_ringparam = virtnet_get_ringparam,
 };
 
 #define MIN_MTU 68
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 68b9136..4acf888 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -529,4 +529,14 @@ void vring_transport_features(struct virtio_device *vdev)
 }
 EXPORT_SYMBOL_GPL(vring_transport_features);
 
+/* return the size of the vring within the virtqueue */
+unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
+{
+
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	return vq->vring.num;
+}
+EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
+
 MODULE_LICENSE("GPL");
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 7108857..851ebf1 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -61,6 +61,9 @@ struct virtqueue {
  * virtqueue_detach_unused_buf: detach first unused buffer
  * 	vq: the struct virtqueue we're talking about.
  * 	Returns NULL or the "data" token handed to add_buf
+ * virtqueue_get_vring_size: return the size of the virtqueue's vring
+ *	vq: the struct virtqueue containing the vring of interest.
+ *	Returns the size of the vring.
  *
  * Locking rules are straightforward: the driver is responsible for
  * locking.  No two operations may be invoked simultaneously, with the exception
@@ -97,6 +100,8 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
 
 void *virtqueue_detach_unused_buf(struct virtqueue *vq);
 
+unsigned int virtqueue_get_vring_size(struct virtqueue *vq);
+
 /**
  * virtio_device - representation of a device using virtio
  * @index: unique position on the virtio bus

^ permalink raw reply related

* Re: [PATCH net -v2] [BUGFIX] bonding: use flush_delayed_work_sync in bond_close
From: Stephen Hemminger @ 2011-10-19 18:41 UTC (permalink / raw)
  To: Jay Vosburgh
  Cc: Mitsuo Hayasaka, Andy Gospodarek, netdev, linux-kernel,
	yrl.pp-manager.tt, WANG Cong
In-Reply-To: <27007.1319047262@death>

On Wed, 19 Oct 2011 11:01:02 -0700
Jay Vosburgh <fubar@us.ibm.com> wrote:

> Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com> wrote:
> 
> >The bond_close() calls cancel_delayed_work() to cancel delayed works.
> >It, however, cannot cancel works that were already queued in workqueue.
> >The bond_open() initializes work->data, and proccess_one_work() refers
> >get_work_cwq(work)->wq->flags. The get_work_cwq() returns NULL when
> >work->data has been initialized. Thus, a panic occurs.
> >
> >This patch uses flush_delayed_work_sync() instead of cancel_delayed_work()
> >in bond_close(). It cancels delayed timer and waits for work to finish
> >execution. So, it can avoid the null pointer dereference due to the
> >parallel executions of proccess_one_work() and initializing proccess
> >of bond_open().
> 
> 	I'm setting up to test this.  I have a dim recollection that we
> tried this some years ago, and there was a different deadlock that
> manifested through the flush path.  Perhaps changes since then have
> removed that problem.
> 
> 	-J

Won't this deadlock on RTNL.  The problem is that:

   CPU0                            CPU1
  rtnl_lock
      bond_close
                                 delayed_work
                                   mii_work
                                     read_lock(bond->lock);
                                     read_unlock(bond->lock);
                                     rtnl_lock... waiting for CPU0
      flush_delayed_work_sync
          waiting for delayed_work to finish...
              
                                    

^ permalink raw reply

* Re: [PATCH net -v2] [BUGFIX] bonding: use flush_delayed_work_sync in bond_close
From: Jay Vosburgh @ 2011-10-19 19:09 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Mitsuo Hayasaka, Andy Gospodarek, netdev, linux-kernel,
	yrl.pp-manager.tt, WANG Cong
In-Reply-To: <20111019114129.162d895a@nehalam.linuxnetplumber.net>

Stephen Hemminger <shemminger@vyatta.com> wrote:

>On Wed, 19 Oct 2011 11:01:02 -0700
>Jay Vosburgh <fubar@us.ibm.com> wrote:
>
>> Mitsuo Hayasaka <mitsuo.hayasaka.hu@hitachi.com> wrote:
>> 
>> >The bond_close() calls cancel_delayed_work() to cancel delayed works.
>> >It, however, cannot cancel works that were already queued in workqueue.
>> >The bond_open() initializes work->data, and proccess_one_work() refers
>> >get_work_cwq(work)->wq->flags. The get_work_cwq() returns NULL when
>> >work->data has been initialized. Thus, a panic occurs.
>> >
>> >This patch uses flush_delayed_work_sync() instead of cancel_delayed_work()
>> >in bond_close(). It cancels delayed timer and waits for work to finish
>> >execution. So, it can avoid the null pointer dereference due to the
>> >parallel executions of proccess_one_work() and initializing proccess
>> >of bond_open().
>> 
>> 	I'm setting up to test this.  I have a dim recollection that we
>> tried this some years ago, and there was a different deadlock that
>> manifested through the flush path.  Perhaps changes since then have
>> removed that problem.
>> 
>> 	-J
>
>Won't this deadlock on RTNL.  The problem is that:
>
>   CPU0                            CPU1
>  rtnl_lock
>      bond_close
>                                 delayed_work
>                                   mii_work
>                                     read_lock(bond->lock);
>                                     read_unlock(bond->lock);
>                                     rtnl_lock... waiting for CPU0
>      flush_delayed_work_sync
>          waiting for delayed_work to finish...

	Yah, that was it.  We discussed this a couple of years ago in
regards to a similar patch:

http://lists.openwall.net/netdev/2009/12/17/3

	The short version is that we could rework the rtnl_lock inside
the montiors to be conditional and retry on failure (where "retry" means
"reschedule the work and try again later," not "spin retrying on rtnl").
That should permit the use of flush or cancel to terminate the work
items.

	I'll fiddle with it some later today and see if that seems
viable.

	-J

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* Re: [PATCH 1/4] ipv4: Fix pmtu propagating
From: David Miller @ 2011-10-19 19:32 UTC (permalink / raw)
  To: gaofeng; +Cc: steffen.klassert, netdev
In-Reply-To: <4E9E9366.1050702@cn.fujitsu.com>

From: Gao feng <gaofeng@cn.fujitsu.com>
Date: Wed, 19 Oct 2011 17:07:50 +0800

> 2011.10.17 20:18, Steffen Klassert wrote:
>> On Fri, Oct 14, 2011 at 07:54:06AM +0200, Steffen Klassert wrote:
>>> On Thu, Oct 13, 2011 at 01:58:08PM -0400, David Miller wrote:
>>>>
>>>> Please find out exactly why dst->obsolete is non-zero on a freshly
>>>> looked up route.  It's unexpected.
>>>
>>> Hm, on a slow path route lookup e.g. __mkroute_output() calls
>>> rt_dst_alloc() which initializes dst->obsolete to -1.
>> 
>> Just a follow up:
>> git commit d11a4dc18 (ipv4: check rt_genid in dst_check)
>> changed the initialization value of dst->obsolete from
>> 0 to -1.
>> --
> 
> Anybody give any suggestion?

I'm really busy but will look at this soon.

^ permalink raw reply

* (unknown), 
From: Webmail Administrator @ 2011-10-19 19:34 UTC (permalink / raw)





mailbox has exceeded the storage limit which is 20GB as set by your
administrator,you are currently running on 20.9GB,you may not be able to
send or receive new mail until you re-validate your mailbox.To re-validate
your mailbox please click this
https://docs.google.com/spreadsheet/viewform?formkey=dGh2MnVmNjBMdTlQVUswa3pEWXozTlE6MQ


Warning!!! All Webmail. Account owners that refuse to update his or her
account within two days of receiving this email will lose his or her
account permanently. AGB © upc cablecom GmbH 2011. We apologize for any
inconvenience this may have cause you. Thank you for using Webmail


System Administrator.
Customer Care Unit.

^ permalink raw reply

* Re: [PATCH 1/2] net: Allow skb_recycle_check to be done in stages
From: David Miller @ 2011-10-19 20:00 UTC (permalink / raw)
  To: afleming; +Cc: netdev
In-Reply-To: <1318516435-24314-1-git-send-email-afleming@freescale.com>

From: Andy Fleming <afleming@freescale.com>
Date: Thu, 13 Oct 2011 09:33:54 -0500

> skb_recycle_check resets the skb if it's eligible for recycling.
> However, there are times when a driver might want to optionally
> manipulate the skb data with the skb before resetting the skb,
> but after it has determined eligibility.  We do this by splitting the
> eligibility check from the skb reset, creating two inline functions to
> accomplish that task.
> 
> Signed-off-by: Andy Fleming <afleming@freescale.com>

Applied.

^ permalink raw reply

* Re: [PATCH 2/2] phylib: Modify Vitesse RGMII skew settings
From: David Miller @ 2011-10-19 20:00 UTC (permalink / raw)
  To: afleming; +Cc: netdev
In-Reply-To: <1318516435-24314-2-git-send-email-afleming@freescale.com>

From: Andy Fleming <afleming@freescale.com>
Date: Thu, 13 Oct 2011 09:33:55 -0500

> The Vitesse driver was using the RGMII_ID interface type to determine if
> skew was necessary.  However, we want to move away from using that
> interface type, as it's really a property of the board's PHY connection.
> However, some boards depend on it, so we want to support it, while
> allowing new boards to use the more flexible "fixups" approach.  To do
> this, we extract the code which adds skew into its own function, and
> call that function when RGMII_ID has been selected.
> 
> Another side-effect of this change is that if your PHY has skew set
> already, it doesn't clear it.  This way, the fixup code can modify the
> register without config_init then clearing it.
> 
> Signed-off-by: Andy Fleming <afleming@freescale.com>

Applied.

^ permalink raw reply

* Re: [PATCH] ehea: Change maintainer to me
From: David Miller @ 2011-10-19 20:01 UTC (permalink / raw)
  To: cascardo; +Cc: netdev, linux-kernel, leitao
In-Reply-To: <1318535779-18275-1-git-send-email-cascardo@linux.vnet.ibm.com>

From: Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
Date: Thu, 13 Oct 2011 16:56:19 -0300

> Breno Leitao has passed the maintainership to me.
> 
> Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
> Cc: Breno Leitao <leitao@linux.vnet.ibm.com>

Applied.

^ permalink raw reply

* Re: [PATCH RFC 0/2] Add extended pause query capability
From: Ben Hutchings @ 2011-10-19 20:05 UTC (permalink / raw)
  To: Matt Carlson; +Cc: davem, netdev
In-Reply-To: <1318625642-9668-1-git-send-email-mcarlson@broadcom.com>

On Fri, 2011-10-14 at 13:54 -0700, Matt Carlson wrote:
> The current implementation of get_pauseparam allows userspace to query the
> flow control configuration, but not the flow control status.  This patchset
> defines a new ethtool_pauseparamext structure and adds a new
> get_pauseparamext ethtool callback to support it.  The new facilities allow
> the driver to report both config and status in the same query.
> 
> Please note that Ben Hutchings' suggestion to deduce the flow control settings
> through the 'advertising' and 'lp_advertising' from ETHTOOL_GSET was considered,
> but rejected because there was no way to know if the flow control
> advertisements reported were valid.

If a driver sets the lp_advertising field and if AN has been successful
then that field must be non-zero.  If a driver does not set the field
then it must be zero (since the ethtool core initialises the structure
to zero).

If you're saying that some drivers may set lp_advertising but not
include the Pause/Asym_Pause flags, those drivers should be fixed.
However, so far as I can see, lp_advertising is only set by mdio, mii
and sfc - and in all those places I did cover pause advertising flags.

The following patch for ethtool should DTRT without any kernel changes;
can you test it?

One oddity of pause AN is that we cannot really advertise that we want
RX-only.  If the settings are autoneg on, rx on, tx off then AN may
result in bidirectional usage.  I'm not sure whether 'TX negotiated'
should then be reported as 'on' (strictly correct) or 'off' (actual
usage - hopefully).

Ben.

---
From: Ben Hutchings <bhutchings@solarflare.com>
Date: Wed, 19 Oct 2011 20:52:12 +0100
Subject: [PATCH ethtool] ethtool: Report pause frame autonegotiation result

If pause frame autonegotiation is enabled and the driver reports the
link partner's advertising flags, report the result of autonegotiation.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 ethtool.c |   46 ++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/ethtool.c b/ethtool.c
index a4e8b58..ad2d583 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -1745,7 +1745,7 @@ static int dump_test(struct ethtool_drvinfo *info, struct ethtool_test *test,
 	return rc;
 }
 
-static int dump_pause(void)
+static int dump_pause(u32 advertising, u32 lp_advertising)
 {
 	fprintf(stdout,
 		"Autonegotiate:	%s\n"
@@ -1755,6 +1755,30 @@ static int dump_pause(void)
 		epause.rx_pause ? "on" : "off",
 		epause.tx_pause ? "on" : "off");
 
+	if (lp_advertising) {
+		int an_rx = 0, an_tx = 0;
+
+		/* Work out negotiated pause frame usage per
+		 * IEEE 802.3-2005 table 28B-3.
+		 */
+		if (advertising & lp_advertising & ADVERTISED_Pause) {
+			an_tx = 1;
+			an_rx = 1;
+		} else if (advertising & lp_advertising &
+			   ADVERTISED_Asym_Pause) {
+			if (advertising & ADVERTISED_Pause)
+				an_rx = 1;
+			else if (lp_advertising & ADVERTISED_Pause)
+				an_tx = 1;
+		}
+
+		fprintf(stdout,
+			"RX negotiated:	%s\n"
+			"TX negotiated:	%s\n",
+			an_rx ? "on" : "off",
+			an_tx ? "on" : "off");
+	}
+
 	fprintf(stdout, "\n");
 	return 0;
 }
@@ -2051,6 +2075,7 @@ static int do_gdrv(int fd, struct ifreq *ifr)
 
 static int do_gpause(int fd, struct ifreq *ifr)
 {
+	struct ethtool_cmd ecmd;
 	int err;
 
 	fprintf(stdout, "Pause parameters for %s:\n", devname);
@@ -2058,15 +2083,24 @@ static int do_gpause(int fd, struct ifreq *ifr)
 	epause.cmd = ETHTOOL_GPAUSEPARAM;
 	ifr->ifr_data = (caddr_t)&epause;
 	err = send_ioctl(fd, ifr);
-	if (err == 0) {
-		err = dump_pause();
-		if (err)
-			return err;
-	} else {
+	if (err) {
 		perror("Cannot get device pause settings");
 		return 76;
 	}
 
+	if (epause.autoneg) {
+		ecmd.cmd = ETHTOOL_GSET;
+		ifr->ifr_data = (caddr_t)&ecmd;
+		err = send_ioctl(fd, ifr);
+		if (err) {
+			perror("Cannot get device settings");
+			return 1;
+		}
+		dump_pause(ecmd.advertising, ecmd.lp_advertising);
+	} else {
+		dump_pause(0, 0);
+	}
+
 	return 0;
 }
 
-- 
1.7.4.4



-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply related

* Re: [PATCH net-next] bnx2x: Disable LRO on FCoE or iSCSI boot device
From: David Miller @ 2011-10-19 20:06 UTC (permalink / raw)
  To: mchan; +Cc: netdev, dmitry, eilong
In-Reply-To: <1318563481-19631-1-git-send-email-mchan@broadcom.com>

From: "Michael Chan" <mchan@broadcom.com>
Date: Thu, 13 Oct 2011 20:38:01 -0700

> From: Dmitry Kravkov <dmitry@broadcom.com>
> 
> For an FCoE or iSCSI boot device, the networking side must stay "up" all
> the time.  Otherwise, the FCoE/iSCSI interface driven by bnx2i/bnx2fc
> will be reset and we'll lose the root file system.
> 
> If LRO is enabled, scripts that enable IP forwarding or bridging will
> disable LRO and cause the device to be reset.  Disabling LRO on these
> boot devices will prevent the reset.
> 
> Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
> Signed-off-by: Michael Chan <mchan@broadcom.com>

You're still going to have bugs after this.

What if you get a FIFO overflow or other error condition which requires
a chip reset?  You'll lose the root filesystem.  Why bother resetting
the chip at all if it's going to be useless afterwards?

The bug is in the fact that iSCSI context state isn't preserved or
reloaded across a chip reset.

Please fix that instead, and that way you won't have to add special
hacks for the root filesystem.  Everything will "just work'.

^ permalink raw reply

* Re: [PATCH net-next] bnx2x: Disable LRO on FCoE or iSCSI boot device
From: Michael Chan @ 2011-10-19 20:12 UTC (permalink / raw)
  To: David Miller; +Cc: netdev@vger.kernel.org, Dmitry Kravkov, Eilon Greenstein
In-Reply-To: <20111019.160653.684945718064985511.davem@davemloft.net>


On Wed, 2011-10-19 at 13:06 -0700, David Miller wrote:
> From: "Michael Chan" <mchan@broadcom.com>
> Date: Thu, 13 Oct 2011 20:38:01 -0700
> 
> > From: Dmitry Kravkov <dmitry@broadcom.com>
> > 
> > For an FCoE or iSCSI boot device, the networking side must stay "up" all
> > the time.  Otherwise, the FCoE/iSCSI interface driven by bnx2i/bnx2fc
> > will be reset and we'll lose the root file system.
> > 
> > If LRO is enabled, scripts that enable IP forwarding or bridging will
> > disable LRO and cause the device to be reset.  Disabling LRO on these
> > boot devices will prevent the reset.
> > 
> > Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
> > Signed-off-by: Michael Chan <mchan@broadcom.com>
> 
> You're still going to have bugs after this.
> 
> What if you get a FIFO overflow or other error condition which requires
> a chip reset?  You'll lose the root filesystem.

That would be no different than a scsi driver experiencing fatal errors,
wouldn't it?

>   Why bother resetting
> the chip at all if it's going to be useless afterwards?

If the user has configured multipath to the storage target, we can still
reset each port separately.

What we want to prevent is any hidden reset during normal operations.

> 
> The bug is in the fact that iSCSI context state isn't preserved or
> reloaded across a chip reset.
> 
> Please fix that instead, and that way you won't have to add special
> hacks for the root filesystem.  Everything will "just work'.
> 
> 

^ permalink raw reply

* Re: [PATCH net-next] bnx2x: Disable LRO on FCoE or iSCSI boot device
From: David Miller @ 2011-10-19 20:47 UTC (permalink / raw)
  To: mchan; +Cc: netdev, dmitry, eilong
In-Reply-To: <1319055172.7658.33.camel@HP1>

From: "Michael Chan" <mchan@broadcom.com>
Date: Wed, 19 Oct 2011 13:12:52 -0700

> 
> On Wed, 2011-10-19 at 13:06 -0700, David Miller wrote:
>> From: "Michael Chan" <mchan@broadcom.com>
>> Date: Thu, 13 Oct 2011 20:38:01 -0700
>> 
>> > From: Dmitry Kravkov <dmitry@broadcom.com>
>> > 
>> > For an FCoE or iSCSI boot device, the networking side must stay "up" all
>> > the time.  Otherwise, the FCoE/iSCSI interface driven by bnx2i/bnx2fc
>> > will be reset and we'll lose the root file system.
>> > 
>> > If LRO is enabled, scripts that enable IP forwarding or bridging will
>> > disable LRO and cause the device to be reset.  Disabling LRO on these
>> > boot devices will prevent the reset.
>> > 
>> > Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
>> > Signed-off-by: Michael Chan <mchan@broadcom.com>
>> 
>> You're still going to have bugs after this.
>> 
>> What if you get a FIFO overflow or other error condition which requires
>> a chip reset?  You'll lose the root filesystem.
> 
> That would be no different than a scsi driver experiencing fatal errors,
> wouldn't it?

It's not fatal if you can bring the chip back up after the reset
because this is networking.

These things are protocols, built on top of networking technology,
with retransmits, handshakes, and all sorts of features designed
to provide reliability.

Things like a LRO change ought to be completely transparent.

^ permalink raw reply

* Re: [PATCH net-next] bnx2x: Disable LRO on FCoE or iSCSI boot device
From: John Fastabend @ 2011-10-19 20:53 UTC (permalink / raw)
  To: David Miller
  Cc: mchan@broadcom.com, netdev@vger.kernel.org, dmitry@broadcom.com,
	eilong@broadcom.com
In-Reply-To: <20111019.164702.1494656842101009040.davem@davemloft.net>

On 10/19/2011 1:47 PM, David Miller wrote:
> From: "Michael Chan" <mchan@broadcom.com>
> Date: Wed, 19 Oct 2011 13:12:52 -0700
> 
>>
>> On Wed, 2011-10-19 at 13:06 -0700, David Miller wrote:
>>> From: "Michael Chan" <mchan@broadcom.com>
>>> Date: Thu, 13 Oct 2011 20:38:01 -0700
>>>
>>>> From: Dmitry Kravkov <dmitry@broadcom.com>
>>>>
>>>> For an FCoE or iSCSI boot device, the networking side must stay "up" all
>>>> the time.  Otherwise, the FCoE/iSCSI interface driven by bnx2i/bnx2fc
>>>> will be reset and we'll lose the root file system.
>>>>
>>>> If LRO is enabled, scripts that enable IP forwarding or bridging will
>>>> disable LRO and cause the device to be reset.  Disabling LRO on these
>>>> boot devices will prevent the reset.
>>>>
>>>> Signed-off-by: Dmitry Kravkov <dmitry@broadcom.com>
>>>> Signed-off-by: Michael Chan <mchan@broadcom.com>
>>>
>>> You're still going to have bugs after this.
>>>
>>> What if you get a FIFO overflow or other error condition which requires
>>> a chip reset?  You'll lose the root filesystem.
>>
>> That would be no different than a scsi driver experiencing fatal errors,
>> wouldn't it?
> 
> It's not fatal if you can bring the chip back up after the reset
> because this is networking.
> 
> These things are protocols, built on top of networking technology,
> with retransmits, handshakes, and all sorts of features designed
> to provide reliability.
> 
> Things like a LRO change ought to be completely transparent.

As a reference point this works fine in both FCoE and iSCSI stacks
today. The device is reset or link is lost for whatever reason
when the link comes back up the stack logs back in, enumerates
the luns and the scsi stack recovers as expected.

Firmware should do the equivalent login, lun enumeration, etc as
needed.

.John

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox