Netdev List
 help / color / mirror / Atom feed
* [PATCH 9/9 Rev3] [IPoIB] Implement the new batching API
From: Krishna Kumar @ 2007-08-08  9:32 UTC (permalink / raw)
  To: johnpol, kaber, shemminger, davem, sri
  Cc: jagana, Robert.Olsson, peter.p.waskiewicz.jr, herbert, gaagaan,
	kumarkr, rdreier, rick.jones2, mcarlson, jeff, general, mchan,
	tgraf, hadi, netdev, Krishna Kumar, xma
In-Reply-To: <20070808093114.15396.22797.sendpatchset@localhost.localdomain>

IPoIB: implement the new batching API.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
 ipoib_main.c |  189 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 184 insertions(+), 5 deletions(-)

diff -ruNp ORG/drivers/infiniband/ulp/ipoib/ipoib_main.c NEW/drivers/infiniband/ulp/ipoib/ipoib_main.c
--- ORG/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-07-12 08:55:06.000000000 +0530
+++ NEW/drivers/infiniband/ulp/ipoib/ipoib_main.c	2007-08-07 13:11:19.000000000 +0530
@@ -558,7 +558,8 @@ static void neigh_add_path(struct sk_buf
 				goto err_drop;
 			}
 		} else
-			ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
+			ipoib_send(dev, skb, path->ah,
+				   IPOIB_QPN(skb->dst->neighbour->ha), 1);
 	} else {
 		neigh->ah  = NULL;
 
@@ -638,7 +639,7 @@ static void unicast_arp_send(struct sk_b
 		ipoib_dbg(priv, "Send unicast ARP to %04x\n",
 			  be16_to_cpu(path->pathrec.dlid));
 
-		ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
+		ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr), 1);
 	} else if ((path->query || !path_rec_start(dev, path)) &&
 		   skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
 		/* put pseudoheader back on for next time */
@@ -704,7 +705,8 @@ static int ipoib_start_xmit(struct sk_bu
 				goto out;
 			}
 
-			ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha));
+			ipoib_send(dev, skb, neigh->ah,
+				   IPOIB_QPN(skb->dst->neighbour->ha), 1);
 			goto out;
 		}
 
@@ -753,6 +755,153 @@ out:
 	return NETDEV_TX_OK;
 }
 
+#define	XMIT_QUEUED_SKBS()						\
+	do {								\
+		if (wr_num) {						\
+			ipoib_send(dev, NULL, old_neigh->ah, old_qpn,	\
+				   wr_num);				\
+			wr_num = 0;					\
+		}							\
+	} while (0)
+
+/*
+ * TODO: Merge with ipoib_start_xmit to use the same code and have a
+ * transparent wrapper caller to xmit's, etc. Status: Done, needs testing.
+ */
+static int ipoib_start_xmit_frames(struct net_device *dev)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	struct sk_buff *skb;
+	struct sk_buff_head *blist = dev->skb_blist;
+	int max_skbs, wr_num = 0;
+	u32 qpn, old_qpn = 0;
+	struct ipoib_neigh *neigh, *old_neigh = NULL;
+	unsigned long flags;
+
+	if (unlikely(!spin_trylock_irqsave(&priv->tx_lock, flags)))
+		return NETDEV_TX_LOCKED;
+
+	/*
+	 * Figure out how many skbs can be sent. This prevents the device
+	 * getting full and avoids checking for queue stopped after each
+	 * iteration.
+	 */
+	max_skbs = ipoib_sendq_size - (priv->tx_head - priv->tx_tail);
+	while (max_skbs-- > 0 && (skb = __skb_dequeue(blist)) != NULL) {
+		if (likely(skb->dst && skb->dst->neighbour)) {
+			if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
+				XMIT_QUEUED_SKBS();
+				ipoib_path_lookup(skb, dev);
+				continue;
+			}
+
+			neigh = *to_ipoib_neigh(skb->dst->neighbour);
+
+			if (ipoib_cm_get(neigh)) {
+				if (ipoib_cm_up(neigh)) {
+					XMIT_QUEUED_SKBS();
+					ipoib_cm_send(dev, skb,
+						      ipoib_cm_get(neigh));
+					continue;
+				}
+			} else if (neigh->ah) {
+				if (unlikely(memcmp(&neigh->dgid.raw,
+						    skb->dst->neighbour->ha + 4,
+						    sizeof(union ib_gid)))) {
+					spin_lock(&priv->lock);
+					/*
+					 * It's safe to call ipoib_put_ah()
+					 * inside priv->lock here, because we
+					 * know that path->ah will always hold
+					 * one more reference, so ipoib_put_ah()
+					 * will never do more than decrement
+					 * the ref count.
+					 */
+					ipoib_put_ah(neigh->ah);
+					list_del(&neigh->list);
+					ipoib_neigh_free(dev, neigh);
+					spin_unlock(&priv->lock);
+					XMIT_QUEUED_SKBS();
+					ipoib_path_lookup(skb, dev);
+					continue;
+				}
+
+				qpn = IPOIB_QPN(skb->dst->neighbour->ha);
+				if (neigh != old_neigh || qpn != old_qpn) {
+					/*
+					 * Sending to a different destination
+					 * from earlier skb's - send all
+					 * existing skbs (if any), and restart.
+					 */
+					XMIT_QUEUED_SKBS();
+					old_neigh = neigh;
+					old_qpn = qpn;
+				}
+
+				if (likely(!ipoib_process_skb(dev, skb, priv,
+							      neigh->ah, qpn,
+							      wr_num)))
+					wr_num++;
+
+				continue;
+			}
+
+			if (skb_queue_len(&neigh->queue) <
+			    IPOIB_MAX_PATH_REC_QUEUE) {
+				spin_lock(&priv->lock);
+				__skb_queue_tail(&neigh->queue, skb);
+				spin_unlock(&priv->lock);
+			} else {
+				dev_kfree_skb_any(skb);
+				++priv->stats.tx_dropped;
+				++max_skbs;
+			}
+		} else {
+			struct ipoib_pseudoheader *phdr =
+				(struct ipoib_pseudoheader *) skb->data;
+			skb_pull(skb, sizeof *phdr);
+
+			if (phdr->hwaddr[4] == 0xff) {
+				/* Add in the P_Key for multicast*/
+				phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+				phdr->hwaddr[9] = priv->pkey & 0xff;
+
+				XMIT_QUEUED_SKBS();
+				ipoib_mcast_send(dev, phdr->hwaddr + 4, skb);
+			} else {
+				/* unicast GID -- should be ARP or RARP reply */
+
+				if ((be16_to_cpup((__be16 *) skb->data) !=
+				    ETH_P_ARP) &&
+				    (be16_to_cpup((__be16 *) skb->data) !=
+				    ETH_P_RARP)) {
+					ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x "
+						IPOIB_GID_FMT "\n",
+						skb->dst ? "neigh" : "dst",
+						be16_to_cpup((__be16 *)
+						skb->data),
+						IPOIB_QPN(phdr->hwaddr),
+						IPOIB_GID_RAW_ARG(phdr->hwaddr
+								  + 4));
+					dev_kfree_skb_any(skb);
+					++priv->stats.tx_dropped;
+					++max_skbs;
+					continue;
+				}
+				XMIT_QUEUED_SKBS();
+				unicast_arp_send(skb, dev, phdr);
+			}
+		}
+	}
+
+	/* Send out last packets (if any) */
+	XMIT_QUEUED_SKBS();
+
+	spin_unlock_irqrestore(&priv->tx_lock, flags);
+
+	return skb_queue_empty(blist) ? NETDEV_TX_OK : NETDEV_TX_BUSY;
+}
+
 static struct net_device_stats *ipoib_get_stats(struct net_device *dev)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -896,13 +1045,37 @@ int ipoib_dev_init(struct net_device *de
 		goto out_rx_ring_cleanup;
 	}
 
-	/* priv->tx_head & tx_tail are already 0 */
+	/* priv->tx_head & tx_tail & tx_priv_tail are already 0 */
 
-	if (ipoib_ib_dev_init(dev, ca, port))
+	/* Allocate tx_sge */
+	priv->tx_sge = kmalloc(ipoib_sendq_size * sizeof *priv->tx_sge,
+			       GFP_KERNEL);
+	if (!priv->tx_sge) {
+		printk(KERN_WARNING "%s: failed to allocate TX sge (%d entries)\n",
+		       ca->name, ipoib_sendq_size);
 		goto out_tx_ring_cleanup;
+	}
+
+	/* Allocate tx_wr */
+	priv->tx_wr = kmalloc(ipoib_sendq_size * sizeof *priv->tx_wr,
+			      GFP_KERNEL);
+	if (!priv->tx_wr) {
+		printk(KERN_WARNING "%s: failed to allocate TX wr (%d entries)\n",
+		       ca->name, ipoib_sendq_size);
+		goto out_tx_sge_cleanup;
+	}
+
+	if (ipoib_ib_dev_init(dev, ca, port))
+		goto out_tx_wr_cleanup;
 
 	return 0;
 
+out_tx_wr_cleanup:
+	kfree(priv->tx_wr);
+
+out_tx_sge_cleanup:
+	kfree(priv->tx_sge);
+
 out_tx_ring_cleanup:
 	kfree(priv->tx_ring);
 
@@ -930,9 +1103,13 @@ void ipoib_dev_cleanup(struct net_device
 
 	kfree(priv->rx_ring);
 	kfree(priv->tx_ring);
+	kfree(priv->tx_sge);
+	kfree(priv->tx_wr);
 
 	priv->rx_ring = NULL;
 	priv->tx_ring = NULL;
+	priv->tx_sge = NULL;
+	priv->tx_wr = NULL;
 }
 
 static void ipoib_setup(struct net_device *dev)
@@ -943,6 +1120,7 @@ static void ipoib_setup(struct net_devic
 	dev->stop 		 = ipoib_stop;
 	dev->change_mtu 	 = ipoib_change_mtu;
 	dev->hard_start_xmit 	 = ipoib_start_xmit;
+	dev->hard_start_xmit_batch = ipoib_start_xmit_frames;
 	dev->get_stats 		 = ipoib_get_stats;
 	dev->tx_timeout 	 = ipoib_timeout;
 	dev->hard_header 	 = ipoib_hard_header;
@@ -979,6 +1157,7 @@ static void ipoib_setup(struct net_devic
 
 	spin_lock_init(&priv->lock);
 	spin_lock_init(&priv->tx_lock);
+	spin_lock_init(&priv->comp_lock);
 
 	mutex_init(&priv->mcast_mutex);
 	mutex_init(&priv->vlan_mutex);

^ permalink raw reply

* [ofa-general] [PATCH 6/9 Rev3] [IPoIB] CM & Multicast changes
From: Krishna Kumar @ 2007-08-08  9:32 UTC (permalink / raw)
  To: johnpol, sri, shemminger, kaber, davem
  Cc: jagana, Robert.Olsson, peter.p.waskiewicz.jr, herbert, gaagaan,
	kumarkr, rdreier, mcarlson, jeff, general, mchan, tgraf, hadi,
	netdev
In-Reply-To: <20070808093114.15396.22797.sendpatchset@localhost.localdomain>

IPoIB CM & Multicast changes based on header file changes.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
 ipoib_cm.c        |   13 +++++++++----
 ipoib_multicast.c |    4 ++--
 2 files changed, 11 insertions(+), 6 deletions(-)

diff -ruNp ORG/drivers/infiniband/ulp/ipoib/ipoib_cm.c NEW/drivers/infiniband/ulp/ipoib/ipoib_cm.c
--- ORG/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-07-17 08:48:35.000000000 +0530
+++ NEW/drivers/infiniband/ulp/ipoib/ipoib_cm.c	2007-08-07 13:11:19.000000000 +0530
@@ -493,14 +493,19 @@ static inline int post_send(struct ipoib
 			    unsigned int wr_id,
 			    u64 addr, int len)
 {
+	int ret;
 	struct ib_send_wr *bad_wr;
 
-	priv->tx_sge.addr             = addr;
-	priv->tx_sge.length           = len;
+	priv->tx_sge[0].addr          = addr;
+	priv->tx_sge[0].length        = len;
+
+	priv->tx_wr[0].wr_id 	      = wr_id;
 
-	priv->tx_wr.wr_id 	      = wr_id;
+	priv->tx_wr[0].next = NULL;
+	ret = ib_post_send(tx->qp, priv->tx_wr, &bad_wr);
+	priv->tx_wr[0].next = &priv->tx_wr[1];
 
-	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
+	return ret;
 }
 
 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
diff -ruNp ORG/drivers/infiniband/ulp/ipoib/ipoib_multicast.c NEW/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
--- ORG/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2007-07-12 08:55:06.000000000 +0530
+++ NEW/drivers/infiniband/ulp/ipoib/ipoib_multicast.c	2007-08-07 13:11:19.000000000 +0530
@@ -217,7 +217,7 @@ static int ipoib_mcast_join_finish(struc
 	if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
 		    sizeof (union ib_gid))) {
 		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
-		priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
+		priv->tx_wr[0].wr.ud.remote_qkey = priv->qkey;
 	}
 
 	if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
@@ -736,7 +736,7 @@ out:
 			}
 		}
 
-		ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
+		ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN, 1);
 	}
 
 unlock:

^ permalink raw reply

* [ofa-general] [PATCH 7/9 Rev3] [IPoIB] Verb changes
From: Krishna Kumar @ 2007-08-08  9:32 UTC (permalink / raw)
  To: johnpol, davem, shemminger, kaber, sri
  Cc: jagana, Robert.Olsson, herbert, gaagaan, kumarkr, rdreier,
	peter.p.waskiewicz.jr, mcarlson, jeff, general, mchan, tgraf,
	hadi, netdev
In-Reply-To: <20070808093114.15396.22797.sendpatchset@localhost.localdomain>

IPoIB verb changes to use batching.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
 ipoib_verbs.c |   23 ++++++++++++++---------
 1 files changed, 14 insertions(+), 9 deletions(-)

diff -ruNp ORG/drivers/infiniband/ulp/ipoib/ipoib_verbs.c NEW/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
--- ORG/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2007-07-12 08:55:06.000000000 +0530
+++ NEW/drivers/infiniband/ulp/ipoib/ipoib_verbs.c	2007-08-07 13:11:19.000000000 +0530
@@ -152,11 +152,11 @@ int ipoib_transport_dev_init(struct net_
 			.max_send_sge = 1,
 			.max_recv_sge = 1
 		},
-		.sq_sig_type = IB_SIGNAL_ALL_WR,
+		.sq_sig_type = IB_SIGNAL_REQ_WR,	/* 11.2.4.1 */
 		.qp_type     = IB_QPT_UD
 	};
-
-	int ret, size;
+	struct ib_send_wr *next_wr = NULL;
+	int i, ret, size;
 
 	priv->pd = ib_alloc_pd(priv->ca);
 	if (IS_ERR(priv->pd)) {
@@ -197,12 +197,17 @@ int ipoib_transport_dev_init(struct net_
 	priv->dev->dev_addr[2] = (priv->qp->qp_num >>  8) & 0xff;
 	priv->dev->dev_addr[3] = (priv->qp->qp_num      ) & 0xff;
 
-	priv->tx_sge.lkey 	= priv->mr->lkey;
-
-	priv->tx_wr.opcode 	= IB_WR_SEND;
-	priv->tx_wr.sg_list 	= &priv->tx_sge;
-	priv->tx_wr.num_sge 	= 1;
-	priv->tx_wr.send_flags 	= IB_SEND_SIGNALED;
+	for (i = ipoib_sendq_size - 1; i >= 0; i--) {
+		priv->tx_sge[i].lkey		= priv->mr->lkey;
+		priv->tx_wr[i].opcode		= IB_WR_SEND;
+		priv->tx_wr[i].sg_list		= &priv->tx_sge[i];
+		priv->tx_wr[i].num_sge		= 1;
+		priv->tx_wr[i].send_flags	= 0;
+
+		/* Link the list properly for provider to use */
+		priv->tx_wr[i].next		= next_wr;
+		next_wr				= &priv->tx_wr[i];
+	}
 
 	return 0;
 

^ permalink raw reply

* [PATCH 3/9 Rev3] [sched] Modify qdisc_run to support batching
From: Krishna Kumar @ 2007-08-08  9:31 UTC (permalink / raw)
  To: johnpol, davem, shemminger, kaber, sri
  Cc: jagana, Robert.Olsson, peter.p.waskiewicz.jr, herbert, gaagaan,
	kumarkr, rdreier, rick.jones2, mcarlson, jeff, general, mchan,
	tgraf, hadi, netdev, Krishna Kumar, xma
In-Reply-To: <20070808093114.15396.22797.sendpatchset@localhost.localdomain>

Modify qdisc_run() to support batching. Modify callers of qdisc_run to
use batching, modify qdisc_restart to implement batching.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
 include/net/pkt_sched.h |    6 +--
 net/core/dev.c          |    5 +--
 net/sched/sch_generic.c |   77 +++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 69 insertions(+), 19 deletions(-)

diff -ruNp ORG/include/net/pkt_sched.h NEW/include/net/pkt_sched.h
--- ORG/include/net/pkt_sched.h	2007-07-17 08:48:37.000000000 +0530
+++ NEW/include/net/pkt_sched.h	2007-08-07 13:11:19.000000000 +0530
@@ -80,13 +80,13 @@ extern struct qdisc_rate_table *qdisc_ge
 		struct rtattr *tab);
 extern void qdisc_put_rtab(struct qdisc_rate_table *tab);
 
-extern void __qdisc_run(struct net_device *dev);
+extern void __qdisc_run(struct net_device *dev, struct sk_buff_head *blist);
 
-static inline void qdisc_run(struct net_device *dev)
+static inline void qdisc_run(struct net_device *dev, struct sk_buff_head *blist)
 {
 	if (!netif_queue_stopped(dev) &&
 	    !test_and_set_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
-		__qdisc_run(dev);
+		__qdisc_run(dev, blist);
 }
 
 extern int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
diff -ruNp ORG/net/sched/sch_generic.c NEW/net/sched/sch_generic.c
--- ORG/net/sched/sch_generic.c	2007-07-12 08:55:20.000000000 +0530
+++ NEW/net/sched/sch_generic.c	2007-08-07 13:11:19.000000000 +0530
@@ -59,10 +59,12 @@ static inline int qdisc_qlen(struct Qdis
 static inline int dev_requeue_skb(struct sk_buff *skb, struct net_device *dev,
 				  struct Qdisc *q)
 {
-	if (unlikely(skb->next))
-		dev->gso_skb = skb;
-	else
-		q->ops->requeue(skb, q);
+	if (likely(skb)) {
+		if (unlikely(skb->next))
+			dev->gso_skb = skb;
+		else
+			q->ops->requeue(skb, q);
+	}
 
 	netif_schedule(dev);
 	return 0;
@@ -91,17 +93,22 @@ static inline int handle_dev_cpu_collisi
 		/*
 		 * Same CPU holding the lock. It may be a transient
 		 * configuration error, when hard_start_xmit() recurses. We
-		 * detect it by checking xmit owner and drop the packet when
-		 * deadloop is detected. Return OK to try the next skb.
+		 * detect it by checking xmit owner and drop the packet (or
+		 * all packets in batching case) when deadloop is detected.
+		 * Return OK to try the next skb.
 		 */
-		kfree_skb(skb);
+		if (likely(skb))
+			kfree_skb(skb);
+		else if (!skb_queue_empty(dev->skb_blist))
+			skb_queue_purge(dev->skb_blist);
+
 		if (net_ratelimit())
 			printk(KERN_WARNING "Dead loop on netdevice %s, "
 			       "fix it urgently!\n", dev->name);
 		ret = qdisc_qlen(q);
 	} else {
 		/*
-		 * Another cpu is holding lock, requeue & delay xmits for
+		 * Another cpu is holding lock, requeue skb & delay xmits for
 		 * some time.
 		 */
 		__get_cpu_var(netdev_rx_stat).cpu_collision++;
@@ -112,6 +119,38 @@ static inline int handle_dev_cpu_collisi
 }
 
 /*
+ * Algorithm to get skb(s) is:
+ *	- Non batching drivers, or if the batch list is empty and there is
+ *	  1 skb in the queue - dequeue skb and put it in *skbp to tell the
+ *	  caller to use the single xmit API.
+ *	- Batching drivers where the batch list already contains atleast one
+ *	  skb, or if there are multiple skbs in the queue: keep dequeue'ing
+ *	  skb's upto a limit and set *skbp to NULL to tell the caller to use
+ *	  the multiple xmit API.
+ *
+ * Returns:
+ *	1 - atleast one skb is to be sent out, *skbp contains skb or NULL
+ *	    (in case >1 skbs present in blist for batching)
+ *	0 - no skbs to be sent.
+ */
+static inline int get_skb(struct net_device *dev, struct Qdisc *q,
+			  struct sk_buff_head *blist, struct sk_buff **skbp)
+{
+	if (likely(!blist || (!skb_queue_len(blist) && qdisc_qlen(q) <= 1))) {
+		return likely((*skbp = dev_dequeue_skb(dev, q)) != NULL);
+	} else {
+		int max = dev->tx_queue_len - skb_queue_len(blist);
+		struct sk_buff *skb;
+
+		while (max > 0 && (skb = dev_dequeue_skb(dev, q)) != NULL)
+			max -= dev_add_skb_to_blist(skb, dev);
+
+		*skbp = NULL;
+		return 1;	/* we have atleast one skb in blist */
+	}
+}
+
+/*
  * NOTE: Called under dev->queue_lock with locally disabled BH.
  *
  * __LINK_STATE_QDISC_RUNNING guarantees only one CPU can process this
@@ -130,7 +169,8 @@ static inline int handle_dev_cpu_collisi
  *				>0 - queue is not empty.
  *
  */
-static inline int qdisc_restart(struct net_device *dev)
+static inline int qdisc_restart(struct net_device *dev,
+				struct sk_buff_head *blist)
 {
 	struct Qdisc *q = dev->qdisc;
 	struct sk_buff *skb;
@@ -138,7 +178,7 @@ static inline int qdisc_restart(struct n
 	int ret;
 
 	/* Dequeue packet */
-	if (unlikely((skb = dev_dequeue_skb(dev, q)) == NULL))
+	if (unlikely(!get_skb(dev, q, blist, &skb)))
 		return 0;
 
 	/*
@@ -158,7 +198,10 @@ static inline int qdisc_restart(struct n
 	/* And release queue */
 	spin_unlock(&dev->queue_lock);
 
-	ret = dev_hard_start_xmit(skb, dev);
+	if (likely(skb))
+		ret = dev_hard_start_xmit(skb, dev);
+	else
+		ret = dev->hard_start_xmit_batch(dev);
 
 	if (!lockless)
 		netif_tx_unlock(dev);
@@ -168,7 +211,7 @@ static inline int qdisc_restart(struct n
 
 	switch (ret) {
 	case NETDEV_TX_OK:
-		/* Driver sent out skb successfully */
+		/* Driver sent out skb (or entire skb_blist) successfully */
 		ret = qdisc_qlen(q);
 		break;
 
@@ -190,10 +233,10 @@ static inline int qdisc_restart(struct n
 	return ret;
 }
 
-void __qdisc_run(struct net_device *dev)
+void __qdisc_run(struct net_device *dev, struct sk_buff_head *blist)
 {
 	do {
-		if (!qdisc_restart(dev))
+		if (!qdisc_restart(dev, blist))
 			break;
 	} while (!netif_queue_stopped(dev));
 
@@ -563,6 +606,12 @@ void dev_deactivate(struct net_device *d
 	qdisc = dev->qdisc;
 	dev->qdisc = &noop_qdisc;
 
+	if (dev->skb_blist) {
+		/* Release skbs on batch list */
+		if (!skb_queue_empty(dev->skb_blist))
+			skb_queue_purge(dev->skb_blist);
+	}
+
 	qdisc_reset(qdisc);
 
 	skb = dev->gso_skb;
diff -ruNp ORG/net/core/dev.c NEW/net/core/dev.c
--- ORG/net/core/dev.c	2007-08-06 08:25:40.000000000 +0530
+++ NEW/net/core/dev.c	2007-08-07 13:11:19.000000000 +0530
@@ -1699,7 +1699,7 @@ gso:
 			/* reset queue_mapping to zero */
 			skb->queue_mapping = 0;
 			rc = q->enqueue(skb, q);
-			qdisc_run(dev);
+			qdisc_run(dev, NULL);
 			spin_unlock(&dev->queue_lock);
 
 			rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
@@ -1896,7 +1896,8 @@ static void net_tx_action(struct softirq
 			clear_bit(__LINK_STATE_SCHED, &dev->state);
 
 			if (spin_trylock(&dev->queue_lock)) {
-				qdisc_run(dev);
+				/* Send all skbs if driver supports batching */
+				qdisc_run(dev, dev->skb_blist);
 				spin_unlock(&dev->queue_lock);
 			} else {
 				netif_schedule(dev);

^ permalink raw reply

* [PATCH 8/9 Rev3] [IPoIB] Post and work completion handler changes
From: Krishna Kumar @ 2007-08-08  9:32 UTC (permalink / raw)
  To: johnpol, sri, shemminger, davem, kaber
  Cc: jagana, Robert.Olsson, rick.jones2, herbert, gaagaan, kumarkr,
	rdreier, peter.p.waskiewicz.jr, mcarlson, jeff, general, mchan,
	tgraf, hadi, netdev, Krishna Kumar, xma
In-Reply-To: <20070808093114.15396.22797.sendpatchset@localhost.localdomain>

IPoIB internal post and work completion handler changes.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
 ipoib_ib.c |  217 ++++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 173 insertions(+), 44 deletions(-)

diff -ruNp ORG/drivers/infiniband/ulp/ipoib/ipoib_ib.c NEW/drivers/infiniband/ulp/ipoib/ipoib_ib.c
--- ORG/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-07-17 08:48:35.000000000 +0530
+++ NEW/drivers/infiniband/ulp/ipoib/ipoib_ib.c	2007-08-07 13:11:19.000000000 +0530
@@ -242,6 +242,8 @@ repost:
 static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int i, num_completions;
+	unsigned int tx_ring_index;
 	unsigned int wr_id = wc->wr_id;
 	struct ipoib_tx_buf *tx_req;
 	unsigned long flags;
@@ -255,18 +257,56 @@ static void ipoib_ib_handle_tx_wc(struct
 		return;
 	}
 
-	tx_req = &priv->tx_ring[wr_id];
+	/*
+	 * Handle skbs completion from tx_tail to wr_id. Two issues :
+	 *	- Need to stop other WC's from mangling same skb's if they
+	 *	  run at the same time. Use tx_prev_tail to demarcate WC's.
+	 *	- Handle WC's from earlier (possibly multiple) post_sends in
+	 *	  this iteration as we move from tx_prev_tail to wr_id, since
+	 *	  if the last WR (which is the one which requested completion
+	 *	  notification) failed to be sent for any of those earlier
+	 *	  request(s), no completion notification is generated for
+	 *	  successful WR's of those earlier request(s).
+	 */
+	spin_lock_irqsave(&priv->comp_lock, flags);
+
+	/* Get start index */
+	tx_ring_index = priv->tx_prev_tail & (ipoib_sendq_size - 1);
+
+	/*Find number of WC's */
+	num_completions = wr_id - tx_ring_index + 1;
+	if (unlikely(num_completions <= 0))
+		num_completions += ipoib_sendq_size;
+
+	/* Save new start index for any parallel WC's */
+	priv->tx_prev_tail += num_completions;
+
+	spin_unlock_irqrestore(&priv->comp_lock, flags);
 
-	ib_dma_unmap_single(priv->ca, tx_req->mapping,
-			    tx_req->skb->len, DMA_TO_DEVICE);
+	tx_req = &priv->tx_ring[tx_ring_index];
+	for (i = 0; i < num_completions; i++) {
+		if (likely(tx_req->skb)) {
+			ib_dma_unmap_single(priv->ca, tx_req->mapping,
+					    tx_req->skb->len, DMA_TO_DEVICE);
 
-	++priv->stats.tx_packets;
-	priv->stats.tx_bytes += tx_req->skb->len;
+			++priv->stats.tx_packets;
+			priv->stats.tx_bytes += tx_req->skb->len;
 
-	dev_kfree_skb_any(tx_req->skb);
+			dev_kfree_skb_any(tx_req->skb);
+		}
+		/*
+		 * else this skb failed synchronously when posted and was
+		 * freed immediately.
+		 */
+
+		if (likely(++tx_ring_index != ipoib_sendq_size))
+			tx_req++;
+		else
+			tx_req = &priv->tx_ring[0];
+	}
 
 	spin_lock_irqsave(&priv->tx_lock, flags);
-	++priv->tx_tail;
+	priv->tx_tail += num_completions;
 	if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
 	    priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
 		clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
@@ -340,29 +380,57 @@ void ipoib_ib_completion(struct ib_cq *c
 	netif_rx_schedule(dev_ptr);
 }
 
-static inline int post_send(struct ipoib_dev_priv *priv,
-			    unsigned int wr_id,
-			    struct ib_ah *address, u32 qpn,
-			    u64 addr, int len)
+/*
+ * post_send : Post WR(s) to the device.
+ *
+ * num_skbs is the number of WR's, first_wr is the first slot in tx_wr[] (or
+ * tx_sge[]). first_wr is normally zero unless a previous post_send returned
+ * error and we are trying to post the untried WR's, in which case first_wr
+ * is the index to the first untried WR.
+ *
+ * Break the WR link before posting so that provider knows how many WR's to
+ * process, and this is set back after the post.
+ */
+static inline int post_send(struct ipoib_dev_priv *priv, u32 qpn,
+			    int first_wr, int num_skbs,
+			    struct ib_send_wr **bad_wr)
 {
-	struct ib_send_wr *bad_wr;
+	int ret;
+	struct ib_send_wr *last_wr, *next_wr;
+
+	last_wr = &priv->tx_wr[first_wr + num_skbs - 1];
+
+	/* Set Completion Notification for last WR */
+	last_wr->send_flags = IB_SEND_SIGNALED;
 
-	priv->tx_sge.addr             = addr;
-	priv->tx_sge.length           = len;
+	/* Terminate the last WR */
+	next_wr = last_wr->next;
+	last_wr->next = NULL;
 
-	priv->tx_wr.wr_id 	      = wr_id;
-	priv->tx_wr.wr.ud.remote_qpn  = qpn;
-	priv->tx_wr.wr.ud.ah 	      = address;
+	/* Send all the WR's in one doorbell */
+	ret = ib_post_send(priv->qp, &priv->tx_wr[first_wr], bad_wr);
 
-	return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+	/* Restore send_flags & WR chain */
+	last_wr->send_flags = 0;
+	last_wr->next = next_wr;
+
+	return ret;
 }
 
-void ipoib_send(struct net_device *dev, struct sk_buff *skb,
-		struct ipoib_ah *address, u32 qpn)
+/*
+ * Map skb & store skb/mapping in tx_ring; and details of the WR in tx_wr
+ * to pass to the provider.
+ *
+ * Returns:
+ *	1: Error and the skb is freed.
+ *	0 skb processed successfully.
+ */
+int ipoib_process_skb(struct net_device *dev, struct sk_buff *skb,
+		      struct ipoib_dev_priv *priv, struct ipoib_ah *address,
+		      u32 qpn, int wr_num)
 {
-	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_tx_buf *tx_req;
 	u64 addr;
+	unsigned int tx_ring_index;
 
 	if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
 		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
@@ -370,7 +438,7 @@ void ipoib_send(struct net_device *dev, 
 		++priv->stats.tx_dropped;
 		++priv->stats.tx_errors;
 		ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
-		return;
+		return 1;
 	}
 
 	ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
@@ -383,35 +451,96 @@ void ipoib_send(struct net_device *dev, 
 	 * means we have to make sure everything is properly recorded and
 	 * our state is consistent before we call post_send().
 	 */
-	tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
-	tx_req->skb = skb;
-	addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
-				 DMA_TO_DEVICE);
+	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
 	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
 		++priv->stats.tx_errors;
 		dev_kfree_skb_any(skb);
-		return;
+		return 1;
 	}
-	tx_req->mapping = addr;
 
-	if (unlikely(post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
-			       address->ah, qpn, addr, skb->len))) {
-		ipoib_warn(priv, "post_send failed\n");
-		++priv->stats.tx_errors;
-		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
-		dev_kfree_skb_any(skb);
-	} else {
-		dev->trans_start = jiffies;
+	tx_ring_index = priv->tx_head & (ipoib_sendq_size - 1);
+
+	/* Save till completion handler executes */
+	priv->tx_ring[tx_ring_index].skb = skb;
+	priv->tx_ring[tx_ring_index].mapping = addr;
+
+	/* Set WR values for the provider to use */
+	priv->tx_sge[wr_num].addr = addr;
+	priv->tx_sge[wr_num].length = skb->len;
+
+	priv->tx_wr[wr_num].wr_id = tx_ring_index;
+	priv->tx_wr[wr_num].wr.ud.remote_qpn = qpn;
+	priv->tx_wr[wr_num].wr.ud.ah = address->ah;
+
+	priv->tx_head++;
+
+	if (unlikely(priv->tx_head - priv->tx_tail == ipoib_sendq_size)) {
+		ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+		netif_stop_queue(dev);
+		set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
+	}
 
-		address->last_send = priv->tx_head;
-		++priv->tx_head;
+	return 0;
+}
 
-		if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
-			ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
-			netif_stop_queue(dev);
-			set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
+/*
+ * Send num_skbs to the device. If an skb is passed to this function, it is
+ * single, unprocessed skb send case; otherwise it means that all skbs are
+ * already processed and put on priv->tx_wr,tx_sge,tx_ring, etc.
+ */
+void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+		struct ipoib_ah *address, u32 qpn, int num_skbs)
+{
+	struct ipoib_dev_priv *priv = netdev_priv(dev);
+	int first_wr = 0;
+
+	if (skb && ipoib_process_skb(dev, skb, priv, address, qpn, 0))
+		return;
+
+	/* Send all skb's in one post */
+	do {
+		struct ib_send_wr *bad_wr;
+
+		if (unlikely((post_send(priv, qpn, first_wr, num_skbs,
+					&bad_wr)))) {
+			int done;
+
+			ipoib_warn(priv, "post_send failed\n");
+
+			/* Get number of WR's that finished successfully */
+			done = bad_wr - &priv->tx_wr[first_wr];
+
+			/* Handle 1 error */
+			priv->stats.tx_errors++;
+			ib_dma_unmap_single(priv->ca,
+				priv->tx_sge[first_wr + done].addr,
+				priv->tx_sge[first_wr + done].length,
+				DMA_TO_DEVICE);
+
+			/* Free failed WR & reset for WC handler to recognize */
+			dev_kfree_skb_any(priv->tx_ring[bad_wr->wr_id].skb);
+			priv->tx_ring[bad_wr->wr_id].skb = NULL;
+
+			/* Handle 'n' successes */
+			if (done) {
+				dev->trans_start = jiffies;
+				address->last_send = priv->tx_head - (num_skbs -
+								      done) - 1;
+			}
+
+			/* Get count of skbs that were not tried */
+			num_skbs -= (done + 1);
+				/* + 1 for WR that was tried & failed */
+
+			/* Get start index for next iteration */
+			first_wr += (done + 1);
+		} else {
+			dev->trans_start = jiffies;
+
+			address->last_send = priv->tx_head - 1;
+			num_skbs = 0;
 		}
-	}
+	} while (num_skbs);
 }
 
 static void __ipoib_reap_ah(struct net_device *dev)

^ permalink raw reply

* [patch] ipvs: force read of atomic_t in while loop
From: Heiko Carstens @ 2007-08-08  9:33 UTC (permalink / raw)
  To: Andrew Morton, David Miller
  Cc: linux-kernel, netdev, Martin Schwidefsky, Wensong Zhang,
	Simon Horman

From: Heiko Carstens <heiko.carstens@de.ibm.com>

For architectures that don't have a volatile atomic_ts constructs like
while (atomic_read(&something)); might result in endless loops since a
barrier() is missing which forces the compiler to generate code that
actually reads memory contents.
Fix this in ipvs by using the IP_VS_WAIT_WHILE macro which resolves to
while (expr) { cpu_relax(); }
(why isn't this open coded btw?)

Cc: Wensong Zhang <wensong@linux-vs.org>
Cc: Simon Horman <horms@verge.net.au>
Cc: David Miller <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---

Just saw this while grepping for atomic_reads in a while loops.
Maybe we should re-add the volatile to atomic_t. Not sure.

 net/ipv4/ipvs/ip_vs_ctl.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6/net/ipv4/ipvs/ip_vs_ctl.c
===================================================================
--- linux-2.6.orig/net/ipv4/ipvs/ip_vs_ctl.c
+++ linux-2.6/net/ipv4/ipvs/ip_vs_ctl.c
@@ -909,7 +909,7 @@ ip_vs_edit_dest(struct ip_vs_service *sv
 	write_lock_bh(&__ip_vs_svc_lock);
 
 	/* Wait until all other svc users go away */
-	while (atomic_read(&svc->usecnt) > 1) {};
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
 
 	/* call the update_service, because server weight may be changed */
 	svc->scheduler->update_service(svc);

^ permalink raw reply

* [PATCH 4/9 Rev3] [ethtool] Add ethtool support
From: Krishna Kumar @ 2007-08-08  9:31 UTC (permalink / raw)
  To: johnpol, sri, shemminger, davem, kaber
  Cc: jagana, Robert.Olsson, rick.jones2, herbert, gaagaan, kumarkr,
	rdreier, peter.p.waskiewicz.jr, mcarlson, jeff, general, mchan,
	tgraf, hadi, netdev, Krishna Kumar, xma
In-Reply-To: <20070808093114.15396.22797.sendpatchset@localhost.localdomain>

Add ethtool support to enable/disable batching.

Signed-off-by: Krishna Kumar <krkumar2@in.ibm.com>
---
 include/linux/ethtool.h   |    2 ++
 include/linux/netdevice.h |    2 ++
 net/core/ethtool.c        |   26 ++++++++++++++++++++++++++
 3 files changed, 30 insertions(+)

diff -ruNp ORG/include/linux/ethtool.h NEW/include/linux/ethtool.h
--- ORG/include/linux/ethtool.h	2007-08-06 08:25:37.000000000 +0530
+++ NEW/include/linux/ethtool.h	2007-08-07 13:11:19.000000000 +0530
@@ -410,6 +410,8 @@ struct ethtool_ops {
 #define ETHTOOL_SUFO		0x00000022 /* Set UFO enable (ethtool_value) */
 #define ETHTOOL_GGSO		0x00000023 /* Get GSO enable (ethtool_value) */
 #define ETHTOOL_SGSO		0x00000024 /* Set GSO enable (ethtool_value) */
+#define ETHTOOL_GBATCH		0x00000024 /* Get Batch (ethtool_value) */
+#define ETHTOOL_SBATCH		0x00000025 /* Set Batch (ethtool_value) */
 
 /* compatibility with older code */
 #define SPARC_ETH_GSET		ETHTOOL_GSET
diff -ruNp ORG/include/linux/netdevice.h NEW/include/linux/netdevice.h
--- ORG/include/linux/netdevice.h	2007-08-06 08:25:37.000000000 +0530
+++ NEW/include/linux/netdevice.h	2007-08-07 13:11:19.000000000 +0530
@@ -1112,6 +1112,8 @@ extern void		dev_set_promiscuity(struct 
 extern void		dev_set_allmulti(struct net_device *dev, int inc);
 extern void		netdev_state_change(struct net_device *dev);
 extern void		netdev_features_change(struct net_device *dev);
+extern int		dev_change_tx_batch_skb(struct net_device *dev,
+						unsigned long new_batch_skb);
 /* Load a device via the kmod */
 extern void		dev_load(const char *name);
 extern void		dev_mcast_init(void);
diff -ruNp ORG/net/core/ethtool.c NEW/net/core/ethtool.c
--- ORG/net/core/ethtool.c	2007-08-06 08:25:40.000000000 +0530
+++ NEW/net/core/ethtool.c	2007-08-07 13:11:19.000000000 +0530
@@ -638,6 +638,25 @@ static int ethtool_set_gso(struct net_de
 	return 0;
 }
 
+static int ethtool_get_batch(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata = { ETHTOOL_GBATCH };
+
+	edata.data = dev->skb_blist != NULL;
+	if (copy_to_user(useraddr, &edata, sizeof(edata)))
+		 return -EFAULT;
+	return 0;
+}
+
+static int ethtool_set_gso(struct net_device *dev, char __user *useraddr)
+{
+	struct ethtool_value edata;
+
+	if (copy_from_user(&edata, useraddr, sizeof(edata)))
+		return -EFAULT;
+	return dev_change_tx_batch_skb(dev, edata.data);
+}
+
 static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
 {
 	struct ethtool_test test;
@@ -817,6 +836,7 @@ int dev_ethtool(struct ifreq *ifr)
 	case ETHTOOL_GPERMADDR:
 	case ETHTOOL_GUFO:
 	case ETHTOOL_GGSO:
+	case ETHTOOL_GBATCH:
 		break;
 	default:
 		if (!capable(CAP_NET_ADMIN))
@@ -935,6 +955,12 @@ int dev_ethtool(struct ifreq *ifr)
 	case ETHTOOL_SGSO:
 		rc = ethtool_set_gso(dev, useraddr);
 		break;
+	case ETHTOOL_GBATCH:
+		rc = ethtool_get_batch(dev, useraddr);
+		break;
+	case ETHTOOL_SBATCH:
+		rc = ethtool_set_batch(dev, useraddr);
+		break;
 	default:
 		rc = -EOPNOTSUPP;
 	}

^ permalink raw reply

* Re: [patch] ipvs: force read of atomic_t in while loop
From: Horms @ 2007-08-08  9:45 UTC (permalink / raw)
  To: Heiko Carstens
  Cc: Andrew Morton, David Miller, linux-kernel, netdev,
	Martin Schwidefsky, Wensong Zhang
In-Reply-To: <20070808093300.GA14530@osiris.boeblingen.de.ibm.com>

On Wed, Aug 08, 2007 at 11:33:00AM +0200, Heiko Carstens wrote:
> From: Heiko Carstens <heiko.carstens@de.ibm.com>
> 
> For architectures that don't have a volatile atomic_ts constructs like
> while (atomic_read(&something)); might result in endless loops since a
> barrier() is missing which forces the compiler to generate code that
> actually reads memory contents.
> Fix this in ipvs by using the IP_VS_WAIT_WHILE macro which resolves to
> while (expr) { cpu_relax(); }
> (why isn't this open coded btw?)
> 
> Cc: Wensong Zhang <wensong@linux-vs.org>
> Cc: Simon Horman <horms@verge.net.au>
> Cc: David Miller <davem@davemloft.net>
> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
> Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
> ---
> 
> Just saw this while grepping for atomic_reads in a while loops.
> Maybe we should re-add the volatile to atomic_t. Not sure.

This looks good to me. A little wile back I noticed a few places
where IP_VS_WAIT_WHILE seemed to be curiously unused, then I got
distracted...

Signed-off-by: Simon Horman <horms@verge.net.au>

> 
>  net/ipv4/ipvs/ip_vs_ctl.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> Index: linux-2.6/net/ipv4/ipvs/ip_vs_ctl.c
> ===================================================================
> --- linux-2.6.orig/net/ipv4/ipvs/ip_vs_ctl.c
> +++ linux-2.6/net/ipv4/ipvs/ip_vs_ctl.c
> @@ -909,7 +909,7 @@ ip_vs_edit_dest(struct ip_vs_service *sv
>  	write_lock_bh(&__ip_vs_svc_lock);
>  
>  	/* Wait until all other svc users go away */
> -	while (atomic_read(&svc->usecnt) > 1) {};
> +	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
>  
>  	/* call the update_service, because server weight may be changed */
>  	svc->scheduler->update_service(svc);

-- 
Horms
  H: http://www.vergenet.net/~horms/
  W: http://www.valinux.co.jp/en/

^ permalink raw reply

* Block device throttling [Re: Distributed storage.]
From: Evgeniy Polyakov @ 2007-08-08  9:54 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Daniel Phillips, netdev, linux-kernel, linux-fsdevel,
	Peter Zijlstra
In-Reply-To: <20070807205538.GB5245@kernel.dk>

On Tue, Aug 07, 2007 at 10:55:38PM +0200, Jens Axboe (jens.axboe@oracle.com) wrote:
> I don't like structure bloat, but I do like nice design. Overloading is

So, what did we decide? To bloat bio a bit (add a queue pointer) or to
use physical device limits? The latter requires to replace all occurence
of bio->bi_bdev = something_new with blk_set_bdev(bio, somthing_new),
where queue limits will be appropriately charged. So far I'm testing
second case, but I only changed DST for testing, can change all other
users if needed though.

-- 
	Evgeniy Polyakov

^ permalink raw reply

* [1/1] Block device throttling [Re: Distributed storage.]
From: Evgeniy Polyakov @ 2007-08-08 10:17 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Daniel Phillips, netdev, linux-kernel, linux-fsdevel,
	Peter Zijlstra
In-Reply-To: <20070808095448.GA3440@2ka.mipt.ru>

This throttling mechanism allows to limit maximum amount of queued bios 
per physical device. By default it is turned off and old block layer 
behaviour with unlimited number of bios is used. When turned on (queue
limit is set to something different than -1U via blk_set_queue_limit()),
generic_make_request() will sleep until there is room in the queue.
number of bios is increased in generic_make_request() and reduced either
in bio_endio(), when bio is completely processed (bi_size is zero), and
recharged from original queue when new device is assigned to bio via
blk_set_bdev(). All oerations are not atomic, since we do not care about
precise number of bios, but a fact, that we are close or close enough to
the limit.

Tested on distributed storage device - with limit of 2 bios it works
slow :)

Signed-off-by: Evgeniy Polyakov <johnpol@2ka.mipt.ru>

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index c99b463..1882c9b 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -1851,6 +1851,10 @@ request_queue_t *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
 	q->backing_dev_info.unplug_io_data = q;
 
+	q->bio_limit = -1U;
+	q->bio_queued = 0;
+	init_waitqueue_head(&q->wait);
+
 	mutex_init(&q->sysfs_lock);
 
 	return q;
@@ -3237,6 +3241,16 @@ end_io:
  */
 void generic_make_request(struct bio *bio)
 {
+	request_queue_t *q;
+
+	BUG_ON(!bio->bi_bdev)
+
+	q = bdev_get_queue(bio->bi_bdev);
+	if (q && q->bio_limit != -1U) {
+		wait_event_interruptible(q->wait, q->bio_queued + 1 <= q->bio_limit);
+		q->bio_queued++;
+	}
+
 	if (current->bio_tail) {
 		/* make_request is active */
 		*(current->bio_tail) = bio;
diff --git a/fs/bio.c b/fs/bio.c
index 093345f..0a33958 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1028,6 +1028,16 @@ void bio_endio(struct bio *bio, unsigned int bytes_done, int error)
 	bio->bi_size -= bytes_done;
 	bio->bi_sector += (bytes_done >> 9);
 
+	if (!bio->bi_size && bio->bi_bdev) {
+		request_queue_t *q;
+
+		q = bdev_get_queue(bio->bi_bdev);
+		if (q) {
+			q->bio_queued--;
+			wake_up(&q->wait);
+		}
+	}
+
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio, bytes_done, error);
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index db5b00a..7ce0cd7 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -467,6 +467,9 @@ struct request_queue
 	struct request		*orig_bar_rq;
 	unsigned int		bi_size;
 
+	wait_queue_head_t	wait;
+	unsigned int		bio_limit, bio_queued;
+
 	struct mutex		sysfs_lock;
 };
 
@@ -764,6 +767,30 @@ extern long nr_blockdev_pages(void);
 int blk_get_queue(request_queue_t *);
 request_queue_t *blk_alloc_queue(gfp_t);
 request_queue_t *blk_alloc_queue_node(gfp_t, int);
+
+static inline void blk_queue_set_limit(request_queue_t *q, unsigned int limit)
+{
+	q->bio_limit = limit;
+}
+
+static inline void blk_set_bdev(struct bio *bio, struct block_device *bdev)
+{
+	request_queue_t *q;
+
+	if (!bio->bi_bdev) {
+		bio->bi_bdev = bdev;
+		return;
+	}
+	
+	q = bdev_get_queue(bio->bi_bdev);
+	if (q) {
+		q->bio_queued--;
+		wake_up(&q->wait);
+	}
+
+	bio->bi_bdev = bdev;
+}
+
 extern void blk_put_queue(request_queue_t *);
 
 /*

-- 
	Evgeniy Polyakov

^ permalink raw reply related

* Re: [RFC] allow device to stop packet mirror behaviour
From: David Miller @ 2007-08-08 10:17 UTC (permalink / raw)
  To: johannes-cdvu00un1VgdHxzADdlk8Q
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, andy-/Zus8d0mwwtBDgjK7y7TUQ,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1186564441.11717.5.camel-YfaajirXv214zXjbi5bjpg@public.gmane.org>

From: Johannes Berg <johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>
Date: Wed, 08 Aug 2007 11:14:01 +0200

> On Tue, 2007-08-07 at 18:06 -0700, David Miller wrote:
> > From: Johannes Berg <johannes-cdvu00un1VgdHxzADdlk8Q@public.gmane.org>
> > Date: Tue, 07 Aug 2007 10:25:55 +0200
> > 
> > > The only way to solve this problem therefore seems to be to suppress the
> > > mirroring out of the packet by dev_queue_xmit_nit(). The patch below
> > > does that by way of adding a new netdev flag.
> > 
> > Multicast packets also get looped back in a similar manner in the ipv4
> > code.  These will also be seen twice due to this issue.
> 
> I don't think these other places are of any interest because of the
> radiotap+802.11 framing on the devices where it is relevant to us; you
> can't actually add an IP route to a monitor interface as far as I can
> tell.

Then I don't understand your problem.  If they are specific 802.11
protocol packets, the radio stack is in a much better situation to
filter out things like this.

^ permalink raw reply

* Re: [patch] ipvs: force read of atomic_t in while loop
From: David Miller @ 2007-08-08 10:21 UTC (permalink / raw)
  To: heiko.carstens; +Cc: akpm, linux-kernel, netdev, schwidefsky, wensong, horms
In-Reply-To: <20070808093300.GA14530@osiris.boeblingen.de.ibm.com>

From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 8 Aug 2007 11:33:00 +0200

> Just saw this while grepping for atomic_reads in a while loops.
> Maybe we should re-add the volatile to atomic_t. Not sure.

I think whatever the choice, it should be done consistently
on every architecture.

It's just asking for trouble if your arch does it differently from
every other.

^ permalink raw reply

* Re: [patch] ipvs: force read of atomic_t in while loop
From: Heiko Carstens @ 2007-08-08 10:28 UTC (permalink / raw)
  To: David Miller; +Cc: akpm, linux-kernel, netdev, schwidefsky, wensong, horms
In-Reply-To: <20070808.032131.35507346.davem@davemloft.net>

On Wed, Aug 08, 2007 at 03:21:31AM -0700, David Miller wrote:
> From: Heiko Carstens <heiko.carstens@de.ibm.com>
> Date: Wed, 8 Aug 2007 11:33:00 +0200
> 
> > Just saw this while grepping for atomic_reads in a while loops.
> > Maybe we should re-add the volatile to atomic_t. Not sure.
> 
> I think whatever the choice, it should be done consistently
> on every architecture.
> 
> It's just asking for trouble if your arch does it differently from
> every other.

Well..currently it's i386/x86_64 and s390 which have no volatile
in atomic_t. And yes, of course I agree it should be consistent
across all architectures. But it isn't.

^ permalink raw reply

* Re: [RFC] allow device to stop packet mirror behaviour
From: Johannes Berg @ 2007-08-08 10:32 UTC (permalink / raw)
  To: David Miller
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, andy-/Zus8d0mwwtBDgjK7y7TUQ,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20070808.031755.91441405.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 2380 bytes --]

On Wed, 2007-08-08 at 03:17 -0700, David Miller wrote:

> Then I don't understand your problem.  If they are specific 802.11
> protocol packets, the radio stack is in a much better situation to
> filter out things like this.

What do you mean by radio stack? You can't really send any frames into
the monitor devices *except* raw radiotap+802.11 frames, and these are
exactly the problem because they're mirrored out right away. But because
we have a different radiotap header on outgoing than on incoming frames
to allow userspace to see the transmission indication (was the packet
ACKed by the other side etc) we ourselves mirror them out as well.

We also mirror out packets from other virtual devices associated with
the same PHY as the virtual monitor.

So think of it this way:

[ monitor interface (radiotap+802.11) ]  [ STA mode interface (802.3) ]
                                  |       |
                                  \       /
                                   \     /
                                    \   /
                                     \ /
                                [ mac80211 ]
                                      |
                                   [ PHY ]
                                      |
                             [ wireless medium ]

Now for packets that come in on the right path, they get mirrored out
back to the STA mode interface as well. That's fine. For packets that
come from the bottom, they get cloned and sent to both devices at the
top with different framing.

Additionally, packets that come in via any interface on the top are
redirected to all monitor interfaces *after* their transmission via the
PHY and are amended by a radiotap header that indicates whether the
transmission was successful. So if you run tcpdump on both interfaces
above you'll see the IPv4 multicast packet sent to the STA mode
interface come up on the STA mode interface right away, and you'll also
see it on the monitor interface after it was transmitted by the PHY.

The problem now comes in when you actually transmit a 802.11 framed
packet down the monitor device. Because the mac80211 code mirrors all
transmitted packets up to all monitor interfaces, and the generic code
mirrors the packet as soon as it was sent to the device (via
dev_queue_xmit_nit) you see it twice.

johannes

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 190 bytes --]

^ permalink raw reply

* Re: [PATCH] Virtual ethernet device (v.4)
From: Pavel Emelyanov @ 2007-08-08 10:42 UTC (permalink / raw)
  To: David Miller; +Cc: Patrick McHardy, Linux Netdev List, devel
In-Reply-To: <469F2DE7.9090407@openvz.org>

Hi, David.

What are your plans about this driver?

Thanks,
Pavel

> Veth stands for Virtual ETHernet. It is a simple tunnel driver
> that works at the link layer and looks like a pair of ethernet
> devices interconnected with each other.
> 
> Mainly it allows to communicate between network namespaces but
> it can be used as is as well. E.g. one may join to bridged
> networking segments together.
> 
> Eric recently sent a similar driver called etun with the sysfs
> interface. This implementation uses another interface - the RTM_NRELINK 
> message introduced by Patric.
> 
> The newlink callback is organized that way to make it easy to create the 
> peer device in the separate namespace when we have them in kernel.
> 
> Many thanks to Patrick for reviewing the patches and his advises
> on how to make driver cleaner.
> 
> Changes from v.3:
> * Reserved place for struct ifinfomsg in IFLA_INFO_DATA part
>   of the packet. This is not used yet, but may be in the
>   future.
> 
> Changes from v.2.1:
> * Made the generic routine for link creation to be used
>   by veth driver, any other tunnel driver that needs to
>   create several devices at once and rtnl_newlink() code.
> 
> Changes from v.2:
> * Rebase over latest netdev tree. No actual changes;
> * Small code rework.
> 
> Changes from v.1:
> * Per-cpu statistics;
> * Standard convention for nla policy names;
> * Module alias added;
> * Xmit function fixes noticed by Patric;
> * Code cleanup.
> 
> The patch for an ip utility is also provided.
> 
> Signed-off-by: Pavel Emelianov <xemul@openvz.org>
> Acked-by: Patrick McHardy <kaber@trash.net>
> 


^ permalink raw reply

* [ofa-general] Re: [PATCH 0/9 Rev3] Implement batching skb API and support in IPoIB
From: David Miller @ 2007-08-08 10:49 UTC (permalink / raw)
  To: krkumar2
  Cc: johnpol, jagana, peter.p.waskiewicz.jr, herbert, gaagaan,
	Robert.Olsson, kumarkr, rdreier, mcarlson, jeff, hadi, general,
	mchan, tgraf, netdev, shemminger, kaber, sri
In-Reply-To: <20070808093114.15396.22797.sendpatchset@localhost.localdomain>

From: Krishna Kumar <krkumar2@in.ibm.com>
Date: Wed, 08 Aug 2007 15:01:14 +0530

> RESULTS: The performance improvement for TCP No Delay is in the range of -8%
> 	to 320% (with -8% being the sole negative), with many individual tests
> 	giving 50% or more improvement (I think it is to do with the hw slots
> 	getting full quicker resulting in more batching when the queue gets
> 	woken). The results for TCP is in the range of -11% to 93%, with most
> 	of the tests (8/12) giving improvements.

Not because I think it obviates your work, but rather because I'm
curious, could you test a TSO-in-hardware driver converted to
batching and see how TSO alone compares to batching for a pure
TCP workload?

I personally don't think it will help for that case at all as
TSO likely does better job of coalescing the work _and_ reducing
bus traffic as well as work in the TCP stack.

^ permalink raw reply

* Re: [RFC] stuff from tcp-2.6 partially merged to upcoming net-2.6.24?
From: David Miller @ 2007-08-08 10:55 UTC (permalink / raw)
  To: ilpo.jarvinen; +Cc: netdev
In-Reply-To: <Pine.LNX.4.64.0708071356060.8788@kivilampi-30.cs.helsinki.fi>

From: "Ilpo_Järvinen" <ilpo.jarvinen@helsinki.fi>
Date: Tue, 7 Aug 2007 16:19:58 +0300 (EEST)

> Do you have any suggestion how I should proceed? Or do you perhaps object 
> such partial merge completely? ...I could try to come up with a cleaned up 
> patch series which has original and their bug fix parts combined to a 
> single patch per change (would provide cleaner history and shouldn't be 
> very hard to do either)...

Thank you for doing the rebase.  I agree with you that we should
seperate out as much of the non-rb-tree stuff as possible and put it
into net-2.6.24

I'll will try hard to make time to look at your rebase and try to move
things forward.  I've been all-consumed by the NAPI work and a driver
I've been writing in the background for the past few weeks.


^ permalink raw reply

* [ofa-general] Re: [PATCH 2/9 Rev3] [core] Add skb_blist & hard_start_xmit_batch
From: Stephen Hemminger @ 2007-08-08 10:59 UTC (permalink / raw)
  To: Krishna Kumar
  Cc: johnpol, jagana, peter.p.waskiewicz.jr, herbert, gaagaan,
	Robert.Olsson, kumarkr, rdreier, mcarlson, kaber, jeff, general,
	mchan, tgraf, netdev, hadi, davem, sri
In-Reply-To: <20070808093135.15396.29687.sendpatchset@localhost.localdomain>

How do qdisc's that do rate control work now? Did you add logic to
breakup batches for token bucket, etc.

^ permalink raw reply

* [patch] genirq: temporary fix for level-triggered IRQ resend
From: Jarek Poplawski @ 2007-08-08 11:00 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Thomas Gleixner, Jean-Baptiste Vignaud,
	linux-kernel, shemminger, linux-net, netdev, Andrew Morton,
	Alan Cox, marcin.slusarz
In-Reply-To: <20070731160008.GA7776@elte.hu>

Hi,

I see there is a bit of complaining on this original resend temporary
patch. But, since it seems to do a good job for some people, here is
my proposal to limit the 'range of fire' a little bit.

Marcin and Jean-Baptiste: try to test this with 2.6.23-rc2, please.
(Unless Ingo or Thomas have other plans with this problem?)

Thanks,
Jarek P.

-------->

Subject: [patch] genirq: temporary fix for level-triggered IRQ resend


Marcin Slusarz reported a ne2k-pci "hung network interface" regression.

delayed disable relies on the ability to re-trigger the interrupt in the
case that a real interrupt happens after the software disable was set.
In this case we actually disable the interrupt on the hardware level
_after_ it occurred.

On enable_irq, we need to re-trigger the interrupt. On i386 this relies
on a hardware resend mechanism (send_IPI_self()).

Actually we only need the resend for edge type interrupts. Level type
interrupts come back once enable_irq() re-enables the interrupt line.

Marcin found that when he disables the irq line on the hardware level
(removing the delayed disable) the card is kept alive.

Thomas Gleixner prepared a testing patch which proved to fix hung ups
after limiting irqs resending to edge type only. Then Ingo Molnar's
version of this patch was applied to 2.6.23-rc2 kernel as a temporary
solution. Since, the problem is still diagnosed, but seems to affect
mainly X86_64 arch, here is a modified, but still temporary, version of
this solution, which should limit the range of warnings and changes in
interrupt handling to minimum.


Signed-off-by: Jarek Poplawski <jarkao2@o2.pl>
Cc: Marcin Slusarz <marcin.slusarz@gmail.com>
Cc: Jean-Baptiste Vignaud <vignaud@xandmail.fr>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@elte.hu>

---

diff -Nurp 2.6.23-rc2-/kernel/irq/resend.c 2.6.23-rc2/kernel/irq/resend.c
--- 2.6.23-rc2-/kernel/irq/resend.c	2007-08-08 11:48:15.000000000 +0200
+++ 2.6.23-rc2/kernel/irq/resend.c	2007-08-08 11:58:42.000000000 +0200
@@ -62,18 +62,19 @@ void check_irq_resend(struct irq_desc *d
 	 */
 	desc->chip->enable(irq);
 
+	if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
+		desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
+
+#ifdef CONFIG_X86_64
 	/*
 	 * Temporary hack to figure out more about the problem, which
 	 * is causing the ancient network cards to die.
 	 */
-	if (desc->handle_irq != handle_edge_irq) {
-		WARN_ON_ONCE(1);
-		return;
-	}
-
-	if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
-		desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
-
+		if (desc->handle_irq == handle_fasteoi_irq) {
+			WARN_ON_ONCE(1);
+			return;
+		}
+#endif
 		if (!desc->chip || !desc->chip->retrigger ||
 					!desc->chip->retrigger(irq)) {
 #ifdef CONFIG_HARDIRQS_SW_RESEND

^ permalink raw reply

* Re: [PATCH RFC]: napi_struct V6
From: Stephen Hemminger @ 2007-08-08 11:06 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, jgarzik, hadi, rusty
In-Reply-To: <20070807.215432.09951753.davem@davemloft.net>

On Tue, 07 Aug 2007 21:54:32 -0700 (PDT)
David Miller <davem@davemloft.net> wrote:

> 
> Changes since V5:
> 
> 1) Revert unnecessary TX locking changes in bnx2 and tg3
> 
> 2) ++i
> 
> This is probably what I'll check into my net-2.6.24 tree
> 
> Thanks.
> 
> [NET]: Make NAPI polling independant of struct net_device objects.
> 
> Several devices have multiple independant RX queues per net
> device, and some have a single interrupt doorbell for several
> queues.
> 
> In either case, it's easier to support layouts like that if the
> structure representing the poll is independant from the net
> device itself.
> 
> The signature of the ->poll() call back goes from:
> 
> 	int foo_poll(struct net_device *dev, int *budget)
> 
> to
> 
> 	int foo_poll(struct napi_struct *napi, int budget)
> 
> The caller is returned the number of RX packets processed (or
> the number of "NAPI credits" consumed if you want to get
> abstract).  The callee no longer messes around bumping
> dev->quota, *budget, etc. because that is all handled in the
> caller upon return.
> 
> The napi_struct is to be embedded in the device driver private data
> structures.
> 
> Furthermore, it is the driver's responsibility to disable all NAPI
> instances in it's ->stop() device close handler.  Since the
> napi_struct is privatized into the driver's private data structures,
> only the driver knows how to get at all of the napi_struct instances
> it may have per-device.
> 
> With lots of help and suggestions from Rusty Russell.
> 
> [ Ported to current tree and all drivers converted.  -DaveM ]
> 
> Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
> Signed-off-by: David S. Miller <davem@davemloft.net>


Thanks for doing all this. Looks great.
Documentation of NAPI still needs more work. I'll take a start at getting
net_device docbook format cleaned up, then start on a redo of the API
documentation.  

^ permalink raw reply

* Re: 2.6.20->2.6.21 - networking dies after random time
From: Marcin Ślusarz @ 2007-08-08 11:09 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Ingo Molnar, Thomas Gleixner, Linus Torvalds,
	Jean-Baptiste Vignaud, linux-kernel, shemminger, linux-net,
	netdev, Andrew Morton, Alan Cox
In-Reply-To: <20070807121339.GA3946@ff.dom.local>

2007/8/7, Jarek Poplawski <jarkao2@o2.pl>:
> So, the let's try this idea yet: modified Ingo's "x86: activate
> HARDIRQS_SW_RESEND" patch.
> (Don't forget about make oldconfig before make.)
> For testing only.
>
> Cheers,
> Jarek P.
>
> PS: alas there was not even time for "compile checking"...
>
> ---
>
> diff -Nurp 2.6.22.1-/arch/i386/Kconfig 2.6.22.1/arch/i386/Kconfig
> --- 2.6.22.1-/arch/i386/Kconfig 2007-07-09 01:32:17.000000000 +0200
> +++ 2.6.22.1/arch/i386/Kconfig  2007-08-07 13:13:03.000000000 +0200
> @@ -1252,6 +1252,10 @@ config GENERIC_PENDING_IRQ
>         depends on GENERIC_HARDIRQS && SMP
>         default y
>
> +config HARDIRQS_SW_RESEND
> +       bool
> +       default y
> +
>  config X86_SMP
>         bool
>         depends on SMP && !X86_VOYAGER
> diff -Nurp 2.6.22.1-/arch/x86_64/Kconfig 2.6.22.1/arch/x86_64/Kconfig
> --- 2.6.22.1-/arch/x86_64/Kconfig       2007-07-09 01:32:17.000000000 +0200
> +++ 2.6.22.1/arch/x86_64/Kconfig        2007-08-07 13:13:03.000000000 +0200
> @@ -690,6 +690,10 @@ config GENERIC_PENDING_IRQ
>         depends on GENERIC_HARDIRQS && SMP
>         default y
>
> +config HARDIRQS_SW_RESEND
> +       bool
> +       default y
> +
>  menu "Power management options"
>
>  source kernel/power/Kconfig
> diff -Nurp 2.6.22.1-/kernel/irq/manage.c 2.6.22.1/kernel/irq/manage.c
> --- 2.6.22.1-/kernel/irq/manage.c       2007-07-09 01:32:17.000000000 +0200
> +++ 2.6.22.1/kernel/irq/manage.c        2007-08-07 13:13:03.000000000 +0200
> @@ -169,6 +169,14 @@ void enable_irq(unsigned int irq)
>                 desc->depth--;
>         }
>         spin_unlock_irqrestore(&desc->lock, flags);
> +#ifdef CONFIG_HARDIRQS_SW_RESEND
> +       /*
> +        * Do a bh disable/enable pair to trigger any pending
> +        * irq resend logic:
> +        */
> +       local_bh_disable();
> +       local_bh_enable();
> +#endif
>  }
>  EXPORT_SYMBOL(enable_irq);
>
> diff -Nurp 2.6.22.1-/kernel/irq/resend.c 2.6.22.1/kernel/irq/resend.c
> --- 2.6.22.1-/kernel/irq/resend.c       2007-07-09 01:32:17.000000000 +0200
> +++ 2.6.22.1/kernel/irq/resend.c        2007-08-07 13:57:54.000000000 +0200
> @@ -62,16 +62,24 @@ void check_irq_resend(struct irq_desc *d
>          */
>         desc->chip->enable(irq);
>
> +       /*
> +        * Temporary hack to figure out more about the problem, which
> +        * is causing the ancient network cards to die.
> +        */
> +
>         if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
>                 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
>
> -               if (!desc->chip || !desc->chip->retrigger ||
> -                                       !desc->chip->retrigger(irq)) {
> +               if (desc->handle_irq == handle_edge_irq) {
> +                       if (desc->chip->retrigger)
> +                               desc->chip->retrigger(irq);
> +                       return;
> +               }
>  #ifdef CONFIG_HARDIRQS_SW_RESEND
> -                       /* Set it pending and activate the softirq: */
> -                       set_bit(irq, irqs_resend);
> -                       tasklet_schedule(&resend_tasklet);
> +               WARN_ON_ONCE(1);
> +               /* Set it pending and activate the softirq: */
> +               set_bit(irq, irqs_resend);
> +               tasklet_schedule(&resend_tasklet);
>  #endif
> -               }
>         }
>  }
>
Works fine with:
WARNING: at kernel/irq/resend.c:79 check_irq_resend()

Call Trace:
 [<ffffffff8025e660>] check_irq_resend+0xc0/0xd0
 [<ffffffff8025e1cd>] enable_irq+0xed/0xf0
 [<ffffffff8807f21d>] :8390:ei_start_xmit+0x14d/0x30c
 [<ffffffff8024d055>] lock_release_non_nested+0xe5/0x190
 [<ffffffff80539b78>] __qdisc_run+0x98/0x1f0
 [<ffffffff80539b8e>] __qdisc_run+0xae/0x1f0
 [<ffffffff8052b65e>] dev_hard_start_xmit+0x26e/0x2d0
 [<ffffffff80539ba0>] __qdisc_run+0xc0/0x1f0
 [<ffffffff8052dc2f>] dev_queue_xmit+0x24f/0x310
 [<ffffffff805337a7>] neigh_resolve_output+0xe7/0x290
 [<ffffffff8054f5c0>] dst_output+0x0/0x10
 [<ffffffff80552aff>] ip_output+0x19f/0x340
 [<ffffffff80551f77>] ip_queue_xmit+0x217/0x430
 [<ffffffff80563b2a>] tcp_transmit_skb+0x40a/0x7c0
 [<ffffffff805657bb>] __tcp_push_pending_frames+0x11b/0x940
 [<ffffffff8055972a>] tcp_sendmsg+0x87a/0xc80
 [<ffffffff80577735>] inet_sendmsg+0x45/0x80
 [<ffffffff8051e2d4>] sock_aio_write+0x104/0x120
 [<ffffffff80285fc1>] do_sync_write+0xf1/0x130
 [<ffffffff80243290>] autoremove_wake_function+0x0/0x40
 [<ffffffff802868e9>] vfs_write+0x159/0x170
 [<ffffffff80286ef0>] sys_write+0x50/0x90
 [<ffffffff802097fe>] system_call+0x7e/0x83

^ permalink raw reply

* Re: 2.6.20->2.6.21 - networking dies after random time
From: Marcin Ślusarz @ 2007-08-08 11:11 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Ingo Molnar, Thomas Gleixner, Linus Torvalds,
	Jean-Baptiste Vignaud, linux-kernel, shemminger, linux-net,
	netdev, Andrew Morton, Alan Cox
In-Reply-To: <20070807125518.GA4669@ff.dom.local>

2007/8/7, Jarek Poplawski <jarkao2@o2.pl>:
> And here is one more patch to test the same idea (chip->retrigger()).
> Let's try i386 way! (I hope I will not be arrested for this...)
> (Should be tested without any previous patches.)
>
> Jarek P.
>
> PS: as above
>
> ---
>
> diff -Nurp 2.6.22.1-/arch/x86_64/kernel/io_apic.c 2.6.22.1/arch/x86_64/kernel/io_apic.c
> --- 2.6.22.1-/arch/x86_64/kernel/io_apic.c      2007-07-09 01:32:17.000000000 +0200
> +++ 2.6.22.1/arch/x86_64/kernel/io_apic.c       2007-08-07 14:37:45.000000000 +0200
> @@ -1311,15 +1311,8 @@ static unsigned int startup_ioapic_irq(u
>  static int ioapic_retrigger_irq(unsigned int irq)
>  {
>         struct irq_cfg *cfg = &irq_cfg[irq];
> -       cpumask_t mask;
> -       unsigned long flags;
> -
> -       spin_lock_irqsave(&vector_lock, flags);
> -       cpus_clear(mask);
> -       cpu_set(first_cpu(cfg->domain), mask);
>
> -       send_IPI_mask(mask, cfg->vector);
> -       spin_unlock_irqrestore(&vector_lock, flags);
> +       send_IPI_self(cfg->vector);
>
>         return 1;
>  }
>
Network card timed out with this patch.

Marcin

^ permalink raw reply

* Re: [PATCH 0/9 Rev3] Implement batching skb API and support in IPoIB
From: Krishna Kumar2 @ 2007-08-08 11:09 UTC (permalink / raw)
  To: David Miller
  Cc: gaagaan, general, hadi, herbert, jagana, jeff, johnpol, kaber,
	kumarkr, mcarlson, mchan, netdev, peter.p.waskiewicz.jr, rdreier,
	rick.jones2, Robert.Olsson, shemminger, sri, tgraf, xma
In-Reply-To: <20070808.034900.85820906.davem@davemloft.net>

David Miller <davem@davemloft.net> wrote on 08/08/2007 04:19:00 PM:

> From: Krishna Kumar <krkumar2@in.ibm.com>
> Date: Wed, 08 Aug 2007 15:01:14 +0530
>
> > RESULTS: The performance improvement for TCP No Delay is in the range
of -8%
> >    to 320% (with -8% being the sole negative), with many individual
tests
> >    giving 50% or more improvement (I think it is to do with the hw
slots
> >    getting full quicker resulting in more batching when the queue gets
> >    woken). The results for TCP is in the range of -11% to 93%, with
most
> >    of the tests (8/12) giving improvements.
>
> Not because I think it obviates your work, but rather because I'm
> curious, could you test a TSO-in-hardware driver converted to
> batching and see how TSO alone compares to batching for a pure
> TCP workload?
>
> I personally don't think it will help for that case at all as
> TSO likely does better job of coalescing the work _and_ reducing
> bus traffic as well as work in the TCP stack.

Definitely, I will try to do this.

What do you generally think of the patch/implementation ? :)

thanks,

- KK


^ permalink raw reply

* [PATCH] net/tulip/xircom_cb.c: remove superfulous priv assignment
From: Mariusz Kozlowski @ 2007-08-08 11:20 UTC (permalink / raw)
  To: Jeff Garzik, Arjan van de Ven; +Cc: netdev, linux-kernel

Hello,

	Unpatched version does sth like this:

dev = alloc_etherdev(...
private = netdev_priv(dev);
...
dev->priv = private;

which doesn't make much sense (does it?) because this is done in
alloc_netdev() already.

struct net_device *alloc_netdev(...
{
...
	if (sizeof_priv)
		dev->priv = netdev_priv(dev);

This patch removes superfluous code.

Signed-off-by: Mariusz Kozlowski <m.kozlowski@tuxland.pl>

 drivers/net/tulip/xircom_cb.c | 32853 -> 32831 (-22 bytes)
 drivers/net/tulip/xircom_cb.o | 123984 -> 123984 (0 bytes)

 drivers/net/tulip/xircom_cb.c |    1 -
 1 file changed, 1 deletion(-)

--- linux-2.6.23-rc1-mm2-a/drivers/net/tulip/xircom_cb.c	2007-08-01 08:43:46.000000000 +0200
+++ linux-2.6.23-rc1-mm2-b/drivers/net/tulip/xircom_cb.c	2007-08-08 11:06:22.000000000 +0200
@@ -271,7 +271,6 @@ static int __devinit xircom_probe(struct
 	dev->hard_start_xmit = &xircom_start_xmit;
 	dev->stop = &xircom_close;
 	dev->get_stats = &xircom_get_stats;
-	dev->priv = private;
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	dev->poll_controller = &xircom_poll_controller;
 #endif

^ permalink raw reply

* IPV6_PKTINFO socket option
From: Gerrit Renker @ 2007-08-08 11:16 UTC (permalink / raw)
  To: yoshfuji; +Cc: netdev

Yoshifuji-san,

a few weeks earlier I enquired about the IPV6_PKTINFO socket option to get at the
destination address of datagrams, where you replied that this option is `deprecated'.

There are three problems:
 
 1. On i386 it works as described in section 4 of RFC 3542, using IPV6_RECVPKTINFO
    as sticky socket option to pull out the IPV6_PKTINFO cmsg header fields.
   
 2. On sparc64 with the same kernel IPV6_PKTINFO works without problems, even pulls
    out the cmsg fields correctly. Conversely, when trying to set the IPV6_RECVPKTINFO
    sticky option on the socket, no cmsg fields are generated. 
    The kernel is of the same date and revision as the i386 kernel - library issue ???
    It is very annoying, since the application needs to run on both architectures.

 3. Misc:
    * The option is mentioned 9 times in the index of "Unix Network Programming"
      (3rd ed., p. 969). Moreover, an entire section (27.7) is devoted to this topic.
    * It might be good to give at least a warning message in the syslog that the
      IPV6_PKTINFO socket option is no longer supported. That would save many users
      grief. An example how this was done in DCCP is in net/dccp/proto.c:

        case DCCP_SOCKOPT_PACKET_SIZE:
                DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app");
   

Maybe this is due to a misunderstanding - in which case I'd be grateful for any clarifications.

Thanks
Gerrit

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox