Netdev List
 help / color / mirror / Atom feed
* [PATCH] net_sched: factorize qdisc stats handling
From: Eric Dumazet @ 2011-01-08  9:26 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Changli Gao, David Miller, Fabio Checconi, netdev, Luigi Rizzo
In-Reply-To: <20110107200234.3f5e7ff8@nehalam>

Le vendredi 07 janvier 2011 à 20:02 -0800, Stephen Hemminger a écrit :
> On Sat, 8 Jan 2011 10:56:33 +0800
> Changli Gao <xiaosuo@gmail.com> wrote:
> 
> > > +       cl->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;  
> > 
> > Hmm, no other packet scheduler accounts packets in this way. Which
> > one is better? I am not sure. And in this patch, qstats.drops isn't
> > maintained in the same way. Should these two be consistent?
> 
> HTB uses this accounting.

Yes, but we should use generic helpers and avoid duplicating this kind
of magic here and there ;)


[PATCH] net_sched: factorize qdisc stats handling

HTB accounts for segmented (GSO) skbs in its stats updates, counting
gso_segs packets instead of one. Generalize this to all schedulers.

They should use the qdisc_bstats_update() helper instead of manipulating
bstats.bytes and bstats.packets directly.

Also add a bstats_update() helper for classes that use
gnet_stats_basic_packed fields.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/sch_generic.h |   15 +++++++++++----
 net/core/dev.c            |    2 +-
 net/sched/act_csum.c      |    3 +--
 net/sched/act_ipt.c       |    3 +--
 net/sched/act_mirred.c    |    3 +--
 net/sched/act_nat.c       |    3 +--
 net/sched/act_pedit.c     |    3 +--
 net/sched/act_police.c    |    3 +--
 net/sched/act_simple.c    |    3 +--
 net/sched/act_skbedit.c   |    3 +--
 net/sched/sch_atm.c       |    6 ++----
 net/sched/sch_cbq.c       |    6 ++----
 net/sched/sch_drr.c       |    8 ++------
 net/sched/sch_dsmark.c    |    3 +--
 net/sched/sch_hfsc.c      |    6 ++----
 net/sched/sch_htb.c       |   17 ++++++-----------
 net/sched/sch_ingress.c   |    3 +--
 net/sched/sch_multiq.c    |    3 +--
 net/sched/sch_netem.c     |    6 ++----
 net/sched/sch_prio.c      |    3 +--
 net/sched/sch_red.c       |    3 +--
 net/sched/sch_sfq.c       |    3 +--
 net/sched/sch_tbf.c       |    3 +--
 net/sched/sch_teql.c      |    3 +--
 24 files changed, 44 insertions(+), 70 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 0af57eb..389bbcb 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -426,10 +426,17 @@ static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch)
 	return qdisc_enqueue(skb, sch) & NET_XMIT_MASK;
 }
 
-static inline void __qdisc_update_bstats(struct Qdisc *sch, unsigned int len)
+
+static inline void bstats_update(struct gnet_stats_basic_packed *bstats,
+				 struct sk_buff *skb)
+{
+	bstats->bytes += qdisc_pkt_len(skb);
+	bstats->packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
+}
+
+static inline void qdisc_bstats_update(struct Qdisc *sch, struct sk_buff *skb)
 {
-	sch->bstats.bytes += len;
-	sch->bstats.packets++;
+	bstats_update(&sch->bstats, skb);
 }
 
 static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
@@ -437,7 +444,7 @@ static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
 {
 	__skb_queue_tail(list, skb);
 	sch->qstats.backlog += qdisc_pkt_len(skb);
-	__qdisc_update_bstats(sch, qdisc_pkt_len(skb));
+	qdisc_bstats_update(sch, skb);
 
 	return NET_XMIT_SUCCESS;
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index a215269..ab60f58 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2301,7 +2301,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		 */
 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
 			skb_dst_force(skb);
-		__qdisc_update_bstats(q, skb->len);
+		qdisc_bstats_update(q, skb);
 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
 			if (unlikely(contended)) {
 				spin_unlock(&q->busylock);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 67dc7ce..83ddfc0 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -508,8 +508,7 @@ static int tcf_csum(struct sk_buff *skb,
 
 	spin_lock(&p->tcf_lock);
 	p->tcf_tm.lastuse = jiffies;
-	p->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	p->tcf_bstats.packets++;
+	bstats_update(&p->tcf_bstats, skb);
 	action = p->tcf_action;
 	update_flags = p->update_flags;
 	spin_unlock(&p->tcf_lock);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 8daef96..c2a7c20 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -209,8 +209,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
 	spin_lock(&ipt->tcf_lock);
 
 	ipt->tcf_tm.lastuse = jiffies;
-	ipt->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	ipt->tcf_bstats.packets++;
+	bstats_update(&ipt->tcf_bstats, skb);
 
 	/* yes, we have to worry about both in and out dev
 	 worry later - danger - this API seems to have changed
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 0c311be..d765067 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -165,8 +165,7 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
 
 	spin_lock(&m->tcf_lock);
 	m->tcf_tm.lastuse = jiffies;
-	m->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	m->tcf_bstats.packets++;
+	bstats_update(&m->tcf_bstats, skb);
 
 	dev = m->tcfm_dev;
 	if (!dev) {
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 186eb83..178a4bd 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -125,8 +125,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 	egress = p->flags & TCA_NAT_FLAG_EGRESS;
 	action = p->tcf_action;
 
-	p->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	p->tcf_bstats.packets++;
+	bstats_update(&p->tcf_bstats, skb);
 
 	spin_unlock(&p->tcf_lock);
 
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index a0593c9..445bef7 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -187,8 +187,7 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
 bad:
 	p->tcf_qstats.overlimits++;
 done:
-	p->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	p->tcf_bstats.packets++;
+	bstats_update(&p->tcf_bstats, skb);
 	spin_unlock(&p->tcf_lock);
 	return p->tcf_action;
 }
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 7ebf743..e2f08b1 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -298,8 +298,7 @@ static int tcf_act_police(struct sk_buff *skb, struct tc_action *a,
 
 	spin_lock(&police->tcf_lock);
 
-	police->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	police->tcf_bstats.packets++;
+	bstats_update(&police->tcf_bstats, skb);
 
 	if (police->tcfp_ewma_rate &&
 	    police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 97e84f3..7287cff 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -42,8 +42,7 @@ static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result
 
 	spin_lock(&d->tcf_lock);
 	d->tcf_tm.lastuse = jiffies;
-	d->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	d->tcf_bstats.packets++;
+	bstats_update(&d->tcf_bstats, skb);
 
 	/* print policy string followed by _ then packet count
 	 * Example if this was the 3rd packet and the string was "hello"
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 66cbf4e..836f5fe 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -46,8 +46,7 @@ static int tcf_skbedit(struct sk_buff *skb, struct tc_action *a,
 
 	spin_lock(&d->tcf_lock);
 	d->tcf_tm.lastuse = jiffies;
-	d->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	d->tcf_bstats.packets++;
+	bstats_update(&d->tcf_bstats, skb);
 
 	if (d->flags & SKBEDIT_F_PRIORITY)
 		skb->priority = d->priority;
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 2825407..943d733 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -422,10 +422,8 @@ drop: __maybe_unused
 		}
 		return ret;
 	}
-	sch->bstats.bytes += qdisc_pkt_len(skb);
-	sch->bstats.packets++;
-	flow->bstats.bytes += qdisc_pkt_len(skb);
-	flow->bstats.packets++;
+	qdisc_bstats_update(sch, skb);
+	bstats_update(&flow->bstats, skb);
 	/*
 	 * Okay, this may seem weird. We pretend we've dropped the packet if
 	 * it goes via ATM. The reason for this is that the outer qdisc
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index eb76315..c80d1c2 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -390,8 +390,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	ret = qdisc_enqueue(skb, cl->q);
 	if (ret == NET_XMIT_SUCCESS) {
 		sch->q.qlen++;
-		sch->bstats.packets++;
-		sch->bstats.bytes += qdisc_pkt_len(skb);
+		qdisc_bstats_update(sch, skb);
 		cbq_mark_toplevel(q, cl);
 		if (!cl->next_alive)
 			cbq_activate_class(cl);
@@ -650,8 +649,7 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
 		ret = qdisc_enqueue(skb, cl->q);
 		if (ret == NET_XMIT_SUCCESS) {
 			sch->q.qlen++;
-			sch->bstats.packets++;
-			sch->bstats.bytes += qdisc_pkt_len(skb);
+			qdisc_bstats_update(sch, skb);
 			if (!cl->next_alive)
 				cbq_activate_class(cl);
 			return 0;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index aa8b531..de55e64 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -351,7 +351,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct drr_sched *q = qdisc_priv(sch);
 	struct drr_class *cl;
-	unsigned int len;
 	int err;
 
 	cl = drr_classify(skb, sch, &err);
@@ -362,7 +361,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return err;
 	}
 
-	len = qdisc_pkt_len(skb);
 	err = qdisc_enqueue(skb, cl->qdisc);
 	if (unlikely(err != NET_XMIT_SUCCESS)) {
 		if (net_xmit_drop_count(err)) {
@@ -377,10 +375,8 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		cl->deficit = cl->quantum;
 	}
 
-	cl->bstats.packets++;
-	cl->bstats.bytes += len;
-	sch->bstats.packets++;
-	sch->bstats.bytes += len;
+	bstats_update(&cl->bstats, skb);
+	qdisc_bstats_update(sch, skb);
 
 	sch->q.qlen++;
 	return err;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 1d295d6..60f4bdd 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -260,8 +260,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return err;
 	}
 
-	sch->bstats.bytes += qdisc_pkt_len(skb);
-	sch->bstats.packets++;
+	qdisc_bstats_update(sch, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 069c62b..2e45791 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1599,10 +1599,8 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (cl->qdisc->q.qlen == 1)
 		set_active(cl, qdisc_pkt_len(skb));
 
-	cl->bstats.packets++;
-	cl->bstats.bytes += qdisc_pkt_len(skb);
-	sch->bstats.packets++;
-	sch->bstats.bytes += qdisc_pkt_len(skb);
+	bstats_update(&cl->bstats, skb);
+	qdisc_bstats_update(sch, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 01b519d..984c1b0 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -569,15 +569,12 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		}
 		return ret;
 	} else {
-		cl->bstats.packets +=
-			skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
-		cl->bstats.bytes += qdisc_pkt_len(skb);
+		bstats_update(&cl->bstats, skb);
 		htb_activate(q, cl);
 	}
 
 	sch->q.qlen++;
-	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
-	sch->bstats.bytes += qdisc_pkt_len(skb);
+	qdisc_bstats_update(sch, skb);
 	return NET_XMIT_SUCCESS;
 }
 
@@ -648,12 +645,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 				htb_add_to_wait_tree(q, cl, diff);
 		}
 
-		/* update byte stats except for leaves which are already updated */
-		if (cl->level) {
-			cl->bstats.bytes += bytes;
-			cl->bstats.packets += skb_is_gso(skb)?
-					skb_shinfo(skb)->gso_segs:1;
-		}
+		/* update basic stats except for leaves which are already updated */
+		if (cl->level)
+			bstats_update(&cl->bstats, skb);
+
 		cl = cl->parent;
 	}
 }
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index f10e34a..bce1665 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -63,8 +63,7 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	result = tc_classify(skb, p->filter_list, &res);
 
-	sch->bstats.packets++;
-	sch->bstats.bytes += qdisc_pkt_len(skb);
+	qdisc_bstats_update(sch, skb);
 	switch (result) {
 	case TC_ACT_SHOT:
 		result = TC_ACT_SHOT;
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 32690de..21f13da 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -83,8 +83,7 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	ret = qdisc_enqueue(skb, qdisc);
 	if (ret == NET_XMIT_SUCCESS) {
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, skb);
 		sch->q.qlen++;
 		return NET_XMIT_SUCCESS;
 	}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index e5593c0..1c4bce8 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -240,8 +240,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	if (likely(ret == NET_XMIT_SUCCESS)) {
 		sch->q.qlen++;
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, skb);
 	} else if (net_xmit_drop_count(ret)) {
 		sch->qstats.drops++;
 	}
@@ -477,8 +476,7 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 		__skb_queue_after(list, skb, nskb);
 
 		sch->qstats.backlog += qdisc_pkt_len(nskb);
-		sch->bstats.bytes += qdisc_pkt_len(nskb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, nskb);
 
 		return NET_XMIT_SUCCESS;
 	}
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index b1c95bc..966158d 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -84,8 +84,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	ret = qdisc_enqueue(skb, qdisc);
 	if (ret == NET_XMIT_SUCCESS) {
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, skb);
 		sch->q.qlen++;
 		return NET_XMIT_SUCCESS;
 	}
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index a67ba3c..a6009c5 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -94,8 +94,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 
 	ret = qdisc_enqueue(skb, child);
 	if (likely(ret == NET_XMIT_SUCCESS)) {
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, skb);
 		sch->q.qlen++;
 	} else if (net_xmit_drop_count(ret)) {
 		q->stats.pdrop++;
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index d54ac94..239ec53 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -403,8 +403,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		slot->allot = q->scaled_quantum;
 	}
 	if (++sch->q.qlen <= q->limit) {
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, skb);
 		return NET_XMIT_SUCCESS;
 	}
 
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 641a30d..77565e7 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -134,8 +134,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 	}
 
 	sch->q.qlen++;
-	sch->bstats.bytes += qdisc_pkt_len(skb);
-	sch->bstats.packets++;
+	qdisc_bstats_update(sch, skb);
 	return NET_XMIT_SUCCESS;
 }
 
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 106479a..af9360d 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -83,8 +83,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 
 	if (q->q.qlen < dev->tx_queue_len) {
 		__skb_queue_tail(&q->q, skb);
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, skb);
 		return NET_XMIT_SUCCESS;
 	}
 



^ permalink raw reply related

* Re: [PATCH net-next-2.6 v3 1/1] can: c_can: Added support for Bosch C_CAN controller
From: Wolfgang Grandegger @ 2011-01-08  9:09 UTC (permalink / raw)
  To: Bhupesh Sharma
  Cc: Socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1294135195-9448-1-git-send-email-bhupesh.sharma-qxv4g6HH51o@public.gmane.org>

Hi Bhupesh,

the patch already looks quite good. Just a few more issues...

On 01/04/2011 10:59 AM, Bhupesh Sharma wrote:
> Bosch C_CAN controller is a full-CAN implementation which is compliant
> to CAN protocol version 2.0 part A and B. Bosch C_CAN user manual can be
> obtained from:
> http://www.semiconductors.bosch.de/pdf/Users_Manual_C_CAN.pdf
> 
> This patch adds the support for this controller.
> The following are the design choices made while writing the controller
> driver:
> 1. Only the Interface Register set IF1 is used in the current design.
> 2. Out of the 32 Message objects available, 16 are kept aside for RX
>    purposes and the rest for TX purposes.
> 3. NAPI implementation is such that both the TX and RX paths function
>    in polling mode.
> 
> Changes since V2:
> 1. Separately implemented a bus independent interface "c_can.c" and
>    a bus sensitive driver "c_can_platform.c". The bus sensitive driver
>    essentially caters to the details of registers mapping/arch differences
>    found on different SoCs.
> 2. Changed RX poll method to allow *in-order packet reception*.
> 3. Implemented LEC (last error code) as an enum.
> 4. Implemented CAN_CTRLMODE_BERR_REPORTING.
> 5. Corrected "quota" handling in RX poll routine.
> 6. Implemented and used priv->can.do_get_berr_counter.
> 7. Improved timeout-handling while programming IF command request
>    register.
> 8. Corrected register offset typecasting to allow the same to work on
>    64-bit systems.
> 
> Signed-off-by: Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>
> ---
>  drivers/net/can/Kconfig                |    2 +
>  drivers/net/can/Makefile               |    1 +
>  drivers/net/can/c_can/Kconfig          |   15 +
>  drivers/net/can/c_can/Makefile         |    8 +
>  drivers/net/can/c_can/c_can.c          |  960 ++++++++++++++++++++++++++++++++
>  drivers/net/can/c_can/c_can.h          |  235 ++++++++
>  drivers/net/can/c_can/c_can_platform.c |  210 +++++++
>  7 files changed, 1431 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/net/can/c_can/Kconfig
>  create mode 100644 drivers/net/can/c_can/Makefile
>  create mode 100644 drivers/net/can/c_can/c_can.c
>  create mode 100644 drivers/net/can/c_can/c_can.h
>  create mode 100644 drivers/net/can/c_can/c_can_platform.c
> 
> diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig
> index 9d9e453..50549b5 100644
> --- a/drivers/net/can/Kconfig
> +++ b/drivers/net/can/Kconfig
> @@ -86,6 +86,8 @@ source "drivers/net/can/mscan/Kconfig"
>  
>  source "drivers/net/can/sja1000/Kconfig"
>  
> +source "drivers/net/can/c_can/Kconfig"
> +
>  source "drivers/net/can/usb/Kconfig"
>  
>  config CAN_DEBUG_DEVICES
> diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile
> index 0057537..c3efeb3 100644
> --- a/drivers/net/can/Makefile
> +++ b/drivers/net/can/Makefile
> @@ -11,6 +11,7 @@ obj-y				+= usb/
>  
>  obj-$(CONFIG_CAN_SJA1000)	+= sja1000/
>  obj-$(CONFIG_CAN_MSCAN)		+= mscan/
> +obj-$(CONFIG_CAN_C_CAN)		+= c_can/
>  obj-$(CONFIG_CAN_AT91)		+= at91_can.o
>  obj-$(CONFIG_CAN_TI_HECC)	+= ti_hecc.o
>  obj-$(CONFIG_CAN_MCP251X)	+= mcp251x.o
> diff --git a/drivers/net/can/c_can/Kconfig b/drivers/net/can/c_can/Kconfig
> new file mode 100644
> index 0000000..ffb9773
> --- /dev/null
> +++ b/drivers/net/can/c_can/Kconfig
> @@ -0,0 +1,15 @@
> +menuconfig CAN_C_CAN
> +	tristate "Bosch C_CAN devices"
> +	depends on CAN_DEV && HAS_IOMEM
> +
> +if CAN_C_CAN
> +
> +config CAN_C_CAN_PLATFORM
> +	tristate "Generic Platform Bus based C_CAN driver"
> +	---help---
> +	  This driver adds support for the C_CAN chips connected to
> +	  the "platform bus" (Linux abstraction for directly to the
> +	  processor attached devices) which can be found on various
> +	  boards from ST Microelectronics (http://www.st.com)
> +	  like the SPEAr1310 and SPEAr320 evaluation boards.
> +endif

> diff --git a/drivers/net/can/c_can/Makefile b/drivers/net/can/c_can/Makefile
> new file mode 100644
> index 0000000..9273f6d
> --- /dev/null
> +++ b/drivers/net/can/c_can/Makefile
> @@ -0,0 +1,8 @@
> +#
> +#  Makefile for the Bosch C_CAN controller drivers.
> +#
> +
> +obj-$(CONFIG_CAN_C_CAN) += c_can.o
> +obj-$(CONFIG_CAN_C_CAN_PLATFORM) += c_can_platform.o
> +
> +ccflags-$(CONFIG_CAN_DEBUG_DEVICES) := -DDEBUG
> diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c
> new file mode 100644
> index 0000000..206e650
> --- /dev/null
> +++ b/drivers/net/can/c_can/c_can.c
> @@ -0,0 +1,960 @@
> +/*
> + * CAN bus driver for Bosch C_CAN controller
> + *
> + * Copyright (C) 2010 ST Microelectronics
> + * Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>
> + *
> + * Borrowed heavily from the C_CAN driver originally written by:
> + * Copyright (C) 2007
> + * - Sascha Hauer, Marc Kleine-Budde, Pengutronix <s.hauer-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
> + * - Simon Kallweit, intefo AG <simon.kallweit-+G9qxTFKJT/tRgLqZ5aouw@public.gmane.org>
> + *
> + * TX and RX NAPI implementation has been borrowed from at91 CAN driver
> + * written by:
> + * Copyright
> + * (C) 2007 by Hans J. Koch <hjk-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org>
> + * (C) 2008, 2009 by Marc Kleine-Budde <kernel-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
> + *
> + * Bosch C_CAN controller is compliant to CAN protocol version 2.0 part A and B.
> + * Bosch C_CAN user manual can be obtained from:
> + * http://www.semiconductors.bosch.de/pdf/Users_Manual_C_CAN.pdf

Unfortunately, this link is not valid any more.

> + *
> + * This file is licensed under the terms of the GNU General Public
> + * License version 2. This program is licensed "as is" without any
> + * warranty of any kind, whether express or implied.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/version.h>
> +#include <linux/module.h>
> +#include <linux/interrupt.h>
> +#include <linux/delay.h>
> +#include <linux/netdevice.h>
> +#include <linux/if_arp.h>
> +#include <linux/if_ether.h>
> +#include <linux/list.h>
> +#include <linux/delay.h>
> +#include <linux/workqueue.h>

Do you need that include?

> +#include <linux/io.h>
> +#include <linux/platform_device.h>
> +#include <linux/clk.h>

...and the two above? They are related to platform code.

> +#include <linux/can.h>
> +#include <linux/can/dev.h>
> +#include <linux/can/error.h>
> +
> +#include "c_can.h"
> +
> +static struct can_bittiming_const c_can_bittiming_const = {
> +	.name = KBUILD_MODNAME,
> +	.tseg1_min = 2,		/* Time segment 1 = prop_seg + phase_seg1 */
> +	.tseg1_max = 16,
> +	.tseg2_min = 1,		/* Time segment 2 = phase_seg2 */
> +	.tseg2_max = 8,
> +	.sjw_max = 4,
> +	.brp_min = 1,
> +	.brp_max = 1024,	/* 6-bit BRP field + 4-bit BRPE field*/
> +	.brp_inc = 1,
> +};
> +
> +static inline int get_tx_next_msg_obj(const struct c_can_priv *priv)
> +{
> +	return (priv->tx_next & C_CAN_NEXT_MSG_OBJ_MASK) +
> +			C_CAN_MSG_OBJ_TX_FIRST;
> +}
> +
> +static inline int get_tx_echo_msg_obj(const struct c_can_priv *priv)
> +{
> +	return (priv->tx_echo & C_CAN_NEXT_MSG_OBJ_MASK) +
> +			C_CAN_MSG_OBJ_TX_FIRST;
> +}
> +
> +static u32 c_can_read_reg32(struct c_can_priv *priv, void *reg)
> +{
> +	u32 val = priv->read_reg(priv, reg);
> +	val |= ((u32) priv->read_reg(priv, reg + 2)) << 16;
> +	return val;
> +}
> +
> +void c_can_enable_all_interrupts(struct c_can_priv *priv,
> +						int enable)
> +{
> +	unsigned int cntrl_save = priv->read_reg(priv,
> +						&priv->reg_base->control);
> +
> +	if (enable)
> +		cntrl_save |= (CONTROL_SIE | CONTROL_EIE | CONTROL_IE);
> +	else
> +		cntrl_save &= ~(CONTROL_EIE | CONTROL_IE | CONTROL_SIE);
> +
> +	priv->write_reg(priv, &priv->reg_base->control, cntrl_save);
> +}
> +EXPORT_SYMBOL_GPL(c_can_enable_all_interrupts);

Do you really need to export that function? More later.

> +
> +static inline void c_can_object_get(struct net_device *dev,
> +					int iface, int objno, int mask)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	int count = MIN_TIMEOUT_VALUE;
> +
> +	/*
> +	 * As per specs, after writting the message object number in the
> +	 * IF command request register the transfer b/w interface
> +	 * register and message RAM must be complete in 6 CAN-CLK
> +	 * period.
> +	 */
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].com_mask,
> +			IFX_WRITE_LOW_16BIT(mask));
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].com_reg,
> +			IFX_WRITE_LOW_16BIT(objno + 1));
> +
> +	while (count) {
> +		if (!(priv->read_reg(priv,
> +					&priv->reg_base->ifreg[iface].com_reg) &
> +					IF_COMR_BUSY))
> +			break;

Could be shortened to:

	while (count && priv->read_reg(priv,
				&priv->reg_base->ifreg[iface].com_reg) &
				IF_COMR_BUSY)


> +		count--;
> +		udelay(1);
> +	}
> +
> +	if (!count)
> +		dev_err(dev->dev.parent, "timed out in object get\n");
> +}
> +
> +static inline void c_can_object_put(struct net_device *dev,
> +					int iface, int objno, int mask)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	int count = MIN_TIMEOUT_VALUE;
> +
> +	/*
> +	 * As per specs, after writting the message object number in the
> +	 * IF command request register the transfer b/w interface
> +	 * register and message RAM must be complete in 6 CAN-CLK
> +	 * period.
> +	 */
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].com_mask,
> +			(IF_COMM_WR | IFX_WRITE_LOW_16BIT(mask)));
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].com_reg,
> +			IFX_WRITE_LOW_16BIT(objno + 1));
> +
> +	while (count) {
> +		if (!(priv->read_reg(priv,
> +				&priv->reg_base->ifreg[iface].com_reg) &
> +				IF_COMR_BUSY))
> +			break;

Ditto. Also, this is duplicated code. An (inline) helper function would make sense.

> +
> +		count--;
> +		udelay(1);
> +	}
> +
> +	if (!count)
> +		dev_err(dev->dev.parent, "timed out in object put\n");
> +}
> +
> +int c_can_write_msg_object(struct net_device *dev,
> +			int iface, struct can_frame *frame, int objno)
> +{
> +	int i;
> +	u16 flags = 0;
> +	unsigned int id;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	if (frame->can_id & CAN_EFF_FLAG) {
> +		id = frame->can_id & CAN_EFF_MASK;
> +		flags |= IF_ARB_MSGXTD;
> +	} else
> +		id = ((frame->can_id & CAN_SFF_MASK) << 18);
> +
> +	if (!(frame->can_id & CAN_RTR_FLAG))
> +		flags |= IF_ARB_TRANSMIT;
> +
> +	flags |= IF_ARB_MSGVAL;
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].arb1,
> +				IFX_WRITE_LOW_16BIT(id));
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].arb2, flags |
> +				IFX_WRITE_HIGH_16BIT(id));
> +
> +	for (i = 0; i < frame->can_dlc; i += 2) {
> +		priv->write_reg(priv, &priv->reg_base->ifreg[iface].data[i / 2],
> +				frame->data[i] | (frame->data[i + 1] << 8));
> +	}
> +
> +	return frame->can_dlc;
> +}
> +
> +static inline void c_can_mark_rx_msg_obj(struct net_device *dev,
> +						int iface, int ctrl_mask,
> +						int obj)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].msg_cntrl,
> +			ctrl_mask & ~(IF_MCONT_MSGLST | IF_MCONT_INTPND));
> +
> +	c_can_object_put(dev, iface, obj, IF_COMM_CONTROL);
> +

Please remove empty line above.

> +}
> +
> +static inline void c_can_activate_all_lower_rx_msg_obj(struct net_device *dev,
> +						int iface,
> +						int ctrl_mask)
> +{
> +	int i;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	for (i = 0; i < C_CAN_MSG_RX_LOW_LAST; i++) {
> +		priv->write_reg(priv, &priv->reg_base->ifreg[iface].msg_cntrl,
> +				ctrl_mask & ~(IF_MCONT_MSGLST |
> +					IF_MCONT_INTPND | IF_MCONT_NEWDAT));
> +		c_can_object_put(dev, iface, i + 1, IF_COMM_CONTROL);
> +	}
> +}
> +
> +static inline void c_can_activate_rx_msg_obj(struct net_device *dev,
> +						int iface, int ctrl_mask,
> +						int obj)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].msg_cntrl,
> +			ctrl_mask & ~(IF_MCONT_MSGLST |
> +				IF_MCONT_INTPND | IF_MCONT_NEWDAT));
> +
> +	c_can_object_put(dev, iface, obj, IF_COMM_CONTROL);

Ditto.

> +}
> +
> +static void c_can_handle_lost_msg_obj(struct net_device *dev,
> +					int iface, int objno)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	struct net_device_stats *stats = &dev->stats;
> +	struct sk_buff *skb;
> +	struct can_frame *frame;
> +
> +	dev_err(dev->dev.parent, "msg lost in buffer %d\n", objno);
> +
> +	c_can_object_get(dev, iface, objno, IF_COMM_ALL &
> +						~IF_COMM_TXRQST);
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].msg_cntrl,
> +			IF_MCONT_CLR_MSGLST);
> +
> +	c_can_object_put(dev, 0, objno, IF_COMM_CONTROL);
> +
> +	/* create an error msg */
> +	skb = alloc_can_err_skb(dev, &frame);
> +	if (unlikely(!skb))
> +		return;
> +
> +	frame->can_id |= CAN_ERR_CRTL;
> +	frame->data[1] = CAN_ERR_CRTL_RX_OVERFLOW;
> +	stats->rx_errors++;
> +	stats->rx_over_errors++;
> +
> +	netif_receive_skb(skb);
> +}
> +
> +static int c_can_read_msg_object(struct net_device *dev, int iface, int ctrl,
> +				int objno)
> +{
> +	u16 flags, data;
> +	int i;
> +	unsigned int val;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	struct net_device_stats *stats = &dev->stats;
> +	struct sk_buff *skb;
> +	struct can_frame *frame;
> +
> +	skb = alloc_can_skb(dev, &frame);
> +	if (!skb) {
> +		stats->rx_dropped++;
> +		return -ENOMEM;
> +	}
> +
> +	frame->can_dlc = get_can_dlc(ctrl & 0x0F);
> +
> +	for (i = 0; i < frame->can_dlc; i += 2) {
> +		data = priv->read_reg(priv,
> +				&priv->reg_base->ifreg[iface].data[i / 2]);
> +		frame->data[i] = data;
> +		frame->data[i + 1] = data >> 8;
> +	}
> +
> +	flags =	priv->read_reg(priv, &priv->reg_base->ifreg[iface].arb2);
> +	val = priv->read_reg(priv, &priv->reg_base->ifreg[iface].arb1) |
> +		(flags << 16);
> +
> +	if (flags & IF_ARB_MSGXTD)
> +		frame->can_id = (val & CAN_EFF_MASK) | CAN_EFF_FLAG;
> +	else
> +		frame->can_id = (val >> 18) & CAN_SFF_MASK;
> +
> +	if (flags & IF_ARB_TRANSMIT)
> +		frame->can_id |= CAN_RTR_FLAG;
> +
> +	netif_receive_skb(skb);
> +
> +	stats->rx_packets++;
> +	stats->rx_bytes += frame->can_dlc;
> +
> +	return 0;
> +}
> +
> +static void c_can_setup_receive_object(struct net_device *dev, int iface,
> +					int objno, unsigned int mask,
> +					unsigned int id, unsigned int mcont)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].mask1,
> +			IFX_WRITE_LOW_16BIT(mask));
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].mask2,
> +			IFX_WRITE_HIGH_16BIT(mask));
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].arb1,
> +			IFX_WRITE_LOW_16BIT(id));
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].arb2,
> +			(IF_ARB_MSGVAL | IFX_WRITE_HIGH_16BIT(id)));
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].msg_cntrl, mcont);
> +	c_can_object_put(dev, iface, objno, IF_COMM_ALL &
> +						~IF_COMM_TXRQST);

Should fit on one line.

> +
> +	dev_dbg(dev->dev.parent, "obj no:%d, msgval:0x%08x\n", objno,
> +			c_can_read_reg32(priv, &priv->reg_base->msgval1));

Please remove empty line above.

> +}
> +
> +static void c_can_inval_msg_object(struct net_device *dev, int iface, int objno)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].arb1, 0);
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].arb2, 0);
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].msg_cntrl, 0);
> +
> +	c_can_object_put(dev, iface, objno,
> +				IF_COMM_ARB | IF_COMM_CONTROL);
> +
> +	dev_dbg(dev->dev.parent, "obj no:%d, msgval:0x%08x\n", objno,
> +			c_can_read_reg32(priv, &priv->reg_base->msgval1));

Ditto.

> +}
> +
> +static netdev_tx_t c_can_start_xmit(struct sk_buff *skb,
> +					struct net_device *dev)
> +{
> +	u32 val;
> +	u32 msg_obj_no;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	struct can_frame *frame = (struct can_frame *)skb->data;
> +
> +	if (can_dropped_invalid_skb(dev, skb))
> +		return NETDEV_TX_OK;
> +
> +	msg_obj_no = get_tx_next_msg_obj(priv);
> +
> +	/* prepare message object for transmission */
> +	val = c_can_write_msg_object(dev, 0, frame, msg_obj_no);
> +
> +	/* enable interrupt for this message object */
> +	priv->write_reg(priv, &priv->reg_base->ifreg[0].msg_cntrl,
> +			IF_MCONT_TXIE | IF_MCONT_TXRQST | IF_MCONT_EOB |
> +			(val & 0xf));
> +	c_can_object_put(dev, 0, msg_obj_no, IF_COMM_ALL);
> +
> +	can_put_echo_skb(skb, dev, msg_obj_no - C_CAN_MSG_OBJ_TX_FIRST);
> +
> +	priv->tx_next++;
> +	if ((priv->tx_next & C_CAN_NEXT_MSG_OBJ_MASK) == 0)
> +		netif_stop_queue(dev);
> +
> +	return NETDEV_TX_OK;
> +}
> +
> +static int c_can_set_bittiming(struct net_device *dev)
> +{
> +	unsigned int reg_btr, reg_brpe, ctrl_save;
> +	u8 brp, brpe, sjw, tseg1, tseg2;
> +	u32 ten_bit_brp;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	const struct can_bittiming *bt = &priv->can.bittiming;
> +
> +	/* c_can provides a 6-bit brp and 4-bit brpe fields */
> +	ten_bit_brp = bt->brp - 1;
> +	brp = ten_bit_brp & BTR_BRP_MASK;
> +	brpe = ten_bit_brp >> 6;
> +
> +	sjw = bt->sjw - 1;
> +	tseg1 = bt->prop_seg + bt->phase_seg1 - 1;
> +	tseg2 = bt->phase_seg2 - 1;
> +
> +	reg_btr = ((brp) | (sjw << BTR_SJW_SHIFT) | (tseg1 << BTR_TSEG1_SHIFT) |
> +			(tseg2 << BTR_TSEG2_SHIFT));

The outer brackets are not needed.

> +	reg_brpe = brpe & BRP_EXT_BRPE_MASK;
> +
> +	dev_info(dev->dev.parent,
> +		"setting BTR=%04x BRPE=%04x\n", reg_btr, reg_brpe);
> +
> +	ctrl_save = priv->read_reg(priv, &priv->reg_base->control);
> +	priv->write_reg(priv, &priv->reg_base->control,
> +			ctrl_save | CONTROL_CCE | CONTROL_INIT);
> +	priv->write_reg(priv, &priv->reg_base->btr, reg_btr);
> +	priv->write_reg(priv, &priv->reg_base->brp_ext, reg_brpe);
> +	priv->write_reg(priv, &priv->reg_base->control, ctrl_save);
> +
> +	return 0;
> +}
> +
> +/*
> + * Configure C_CAN message objects for Tx and Rx purposes:
> + * C_CAN provides a total of 32 message objects that can be configured
> + * either for Tx or Rx purposes. Here the first 16 message objects are used as
> + * a reception FIFO. The end of reception FIFO is signified by the EoB bit
> + * being SET. The remaining 16 message objects are kept aside for Tx purposes.
> + * See user guide document for further details on configuring message
> + * objects.
> + */
> +static void c_can_configure_msg_objects(struct net_device *dev)
> +{
> +	int i;
> +
> +	/* first invalidate all message objects */
> +	for (i = 0; i <= C_CAN_NO_OF_OBJECTS; i++)
> +		c_can_inval_msg_object(dev, 0, i);
> +
> +	/* setup receive message objects */
> +	for (i = C_CAN_MSG_OBJ_RX_FIRST + 1 ; i < C_CAN_MSG_OBJ_RX_LAST; i++)
> +		c_can_setup_receive_object(dev, 0, i, 0, 0,
> +			((IF_MCONT_RXIE | IF_MCONT_UMASK) & ~IF_MCONT_EOB));

Ditto.

> +	c_can_setup_receive_object(dev, 0, C_CAN_MSG_OBJ_RX_LAST, 0, 0,
> +			IF_MCONT_EOB | IF_MCONT_RXIE | IF_MCONT_UMASK);
> +}
> +
> +/*
> + * Configure C_CAN chip:
> + * - enable/disable auto-retransmission
> + * - set operating mode
> + * - configure message objects
> + */
> +static void c_can_chip_config(struct net_device *dev)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	if (priv->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT)
> +		/* disable automatic retransmission */
> +		priv->write_reg(priv, &priv->reg_base->control,
> +				CONTROL_DISABLE_AR);
> +	else
> +		/* enable automatic retransmission */
> +		priv->write_reg(priv, &priv->reg_base->control,
> +				CONTROL_ENABLE_AR);
> +
> +	if (priv->can.ctrlmode & (CAN_CTRLMODE_LISTENONLY &
> +					CAN_CTRLMODE_LOOPBACK)) {
> +		/* loopback + silent mode : useful for hot self-test */
> +		priv->write_reg(priv, &priv->reg_base->control, (CONTROL_EIE |
> +				CONTROL_SIE | CONTROL_IE | CONTROL_TEST));

Outer brackets are not needed.

> +		priv->write_reg(priv, &priv->reg_base->test,
> +				(TEST_LBACK | TEST_SILENT));
> +	} else if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) {
> +		/* loopback mode : useful for self-test function */
> +		priv->write_reg(priv, &priv->reg_base->control, (CONTROL_EIE |
> +				CONTROL_SIE | CONTROL_IE | CONTROL_TEST));

Ditto.

> +		priv->write_reg(priv, &priv->reg_base->test, TEST_LBACK);
> +	} else if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) {
> +		/* silent mode : bus-monitoring mode */
> +		priv->write_reg(priv, &priv->reg_base->control, (CONTROL_EIE |
> +				CONTROL_SIE | CONTROL_IE | CONTROL_TEST));

Ditto.

> +		priv->write_reg(priv, &priv->reg_base->test, TEST_SILENT);
> +	} else
> +		/* normal mode*/
> +		priv->write_reg(priv, &priv->reg_base->control,
> +				(CONTROL_EIE | CONTROL_SIE | CONTROL_IE));

Ditto.

> +	/* configure message objects */
> +	c_can_configure_msg_objects(dev);
> +}
> +
> +static void c_can_start(struct net_device *dev)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	/* enable status change, error and module interrupts */
> +	c_can_enable_all_interrupts(priv, ENABLE_ALL_INTERRUPTS);
> +
> +	/* basic c_can configuration */
> +	c_can_chip_config(dev);
> +
> +	priv->can.state = CAN_STATE_ERROR_ACTIVE;
> +
> +	/* reset tx helper pointers */
> +	priv->tx_next = priv->tx_echo = 0;
> +}
> +
> +static void c_can_stop(struct net_device *dev)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	/* disable all interrupts */
> +	c_can_enable_all_interrupts(priv, DISABLE_ALL_INTERRUPTS);
> +
> +	/* set the state as STOPPED */
> +	priv->can.state = CAN_STATE_STOPPED;
> +}
> +
> +static int c_can_set_mode(struct net_device *dev, enum can_mode mode)
> +{
> +	switch (mode) {
> +	case CAN_MODE_START:
> +		c_can_start(dev);
> +		netif_wake_queue(dev);
> +		break;
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +
> +	return 0;
> +}
> +
> +static int c_can_get_berr_counter(const struct net_device *dev,
> +					struct can_berr_counter *bec)
> +{
> +	unsigned int reg_err_counter;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	reg_err_counter = priv->read_reg(priv, &priv->reg_base->error_counter);
> +	bec->rxerr = ((reg_err_counter & ERR_COUNTER_REC_MASK) >>
> +				ERR_COUNTER_REC_SHIFT);

You don't need the outer brackets.

> +	bec->txerr = (reg_err_counter & ERR_COUNTER_TEC_MASK);

Ditto.

> +	return 0;
> +}
> +
> +/*
> + * theory of operation:
> + *
> + * priv->tx_echo holds the number of the oldest can_frame put for
> + * transmission into the hardware, but not yet ACKed by the CAN tx
> + * complete IRQ.
> + *
> + * We iterate from priv->tx_echo to priv->tx_next and check if the
> + * packet has been transmitted, echo it back to the CAN framework.
> + * If we discover a not yet transmitted package, stop looking for more.
> + */
> +static void c_can_do_tx(struct net_device *dev)
> +{
> +	u32 val;
> +	u32 msg_obj_no;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	struct net_device_stats *stats = &dev->stats;
> +
> +	for (/* nix */; (priv->tx_next - priv->tx_echo) > 0; priv->tx_echo++) {
> +		msg_obj_no = get_tx_echo_msg_obj(priv);
> +		c_can_inval_msg_object(dev, 0, msg_obj_no);
> +		val = c_can_read_reg32(priv, &priv->reg_base->txrqst1);
> +		if (!(val & (1 << msg_obj_no))) {
> +			can_get_echo_skb(dev,
> +					msg_obj_no - C_CAN_MSG_OBJ_TX_FIRST);
> +			stats->tx_bytes += priv->read_reg(priv,
> +					&priv->reg_base->ifreg[0].msg_cntrl)
> +					& IF_MCONT_DLC_MASK;
> +			stats->tx_packets++;
> +		}
> +	}
> +
> +	/* restart queue if wrap-up or if queue stalled on last pkt */
> +	if (((priv->tx_next & C_CAN_NEXT_MSG_OBJ_MASK) != 0) ||
> +			((priv->tx_echo & C_CAN_NEXT_MSG_OBJ_MASK) == 0))
> +		netif_wake_queue(dev);
> +}
> +
> +/*
> + * theory of operation:
> + *
> + * c_can core saves a received CAN message into the first free message
> + * object it finds free (starting with the lowest). Bits NEWDAT and
> + * INTPND are set for this message object indicating that a new message
> + * has arrived. To work-around this issue, we keep two groups of message
> + * objects whose partitioning is defined by C_CAN_MSG_OBJ_RX_SPLIT.
> + *
> + * To ensure in-order frame reception we use the following
> + * approach while re-activating a message object to receive further
> + * frames:
> + * - if the current message object number is lower than
> + *   C_CAN_MSG_RX_LOW_LAST, do not clear the NEWDAT bit while clearing
> + *   the INTPND bit.
> + * - if the current message object number is equal to
> + *   C_CAN_MSG_RX_LOW_LAST then clear the NEWDAT bit of all lower
> + *   receive message objects.
> + * - if the current message object number is greater than
> + *   C_CAN_MSG_RX_LOW_LAST then clear the NEWDAT bit of
> + *   only this message object.
> + */
> +static int c_can_do_rx_poll(struct net_device *dev, int quota)
> +{
> +	u32 num_rx_pkts = 0;
> +	unsigned int msg_obj, msg_ctrl_save;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	u32 val = c_can_read_reg32(priv, &priv->reg_base->intpnd1);
> +
> +	for (msg_obj = C_CAN_MSG_OBJ_RX_FIRST;
> +			msg_obj <= C_CAN_MSG_OBJ_RX_LAST && quota > 0;
> +			msg_obj++) {
> +		if (val & (1 << msg_obj)) {
> +			c_can_object_get(dev, 0, msg_obj, IF_COMM_ALL &
> +					~IF_COMM_TXRQST);
> +			msg_ctrl_save = priv->read_reg(priv,
> +					&priv->reg_base->ifreg[0].msg_cntrl);
> +
> +			if (msg_ctrl_save & IF_MCONT_EOB)
> +				return num_rx_pkts;
> +
> +			if (msg_ctrl_save & IF_MCONT_MSGLST) {
> +				c_can_handle_lost_msg_obj(dev, 0, msg_obj);
> +				num_rx_pkts++;
> +				quota--;
> +				continue;
> +			}
> +
> +			if (!(msg_ctrl_save & IF_MCONT_NEWDAT))
> +				continue;
> +
> +			/* read the data from the message object */
> +			c_can_read_msg_object(dev, 0, msg_ctrl_save, msg_obj);
> +
> +			if (msg_obj < C_CAN_MSG_RX_LOW_LAST)
> +				c_can_mark_rx_msg_obj(dev, 0,
> +						msg_ctrl_save, msg_obj);
> +			else if (msg_obj > C_CAN_MSG_RX_LOW_LAST)
> +				/* activate this msg obj */
> +				c_can_activate_rx_msg_obj(dev, 0,
> +						msg_ctrl_save, msg_obj);
> +			else if (msg_obj == C_CAN_MSG_RX_LOW_LAST)
> +				/* activate all lower message objects */
> +				c_can_activate_all_lower_rx_msg_obj(dev,
> +						0, msg_ctrl_save);
> +
> +			num_rx_pkts++;
> +			quota--;
> +		}
> +		val = c_can_read_reg32(priv, &priv->reg_base->intpnd1);
> +	}
> +
> +	return num_rx_pkts;
> +}
> +
> +static inline int c_can_has_and_handle_berr(struct c_can_priv *priv)
> +{
> +	return (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) &&
> +		(priv->current_status & STATUS_LEC_MASK);
> +}
> +
> +static int c_can_err(struct net_device *dev,
> +				enum c_can_bus_error_types error_type,
> +				enum c_can_lec_type lec_type)
> +{
> +	unsigned int reg_err_counter;
> +	unsigned int rx_err_passive;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	struct net_device_stats *stats = &dev->stats;
> +	struct can_frame *cf;
> +	struct sk_buff *skb;
> +	struct can_berr_counter bec;
> +
> +	/* propogate the error condition to the CAN stack */
> +	skb = alloc_can_err_skb(dev, &cf);
> +	if (unlikely(!skb))
> +		return 0;
> +
> +	c_can_get_berr_counter(dev, &bec);
> +	reg_err_counter = priv->read_reg(priv, &priv->reg_base->error_counter);
> +	rx_err_passive = ((reg_err_counter & ERR_COUNTER_RP_MASK) >>
> +				ERR_COUNTER_RP_SHIFT);

Outer brackets?

> +	if (error_type & C_CAN_ERROR_WARNING) {
> +		/* error warning state */
> +		priv->can.can_stats.error_warning++;
> +		priv->can.state = CAN_STATE_ERROR_WARNING;
> +		cf->can_id |= CAN_ERR_CRTL;
> +		if (bec.rxerr > 96)
> +			cf->data[1] |= CAN_ERR_CRTL_RX_WARNING;
> +		if (bec.txerr > 96)
> +			cf->data[1] |= CAN_ERR_CRTL_TX_WARNING;
> +	}
> +	if (error_type & C_CAN_ERROR_PASSIVE) {
> +		/* error passive state */
> +		priv->can.can_stats.error_passive++;
> +		priv->can.state = CAN_STATE_ERROR_PASSIVE;
> +		cf->can_id |= CAN_ERR_CRTL;
> +		if (rx_err_passive)
> +			cf->data[1] |= CAN_ERR_CRTL_RX_PASSIVE;
> +		if (bec.txerr > 127)
> +			cf->data[1] |= CAN_ERR_CRTL_TX_PASSIVE;
> +	}
> +	if (error_type & C_CAN_BUS_OFF) {
> +		/* bus-off state */
> +		priv->can.state = CAN_STATE_BUS_OFF;
> +		cf->can_id |= CAN_ERR_BUSOFF;
> +		/* disable all interrupts in bus-off mode to ensure that
> +		 * the CPU is not hogged down
> +		 */

Please use the following style:

	/*
	 * Comment
 	 */

> +		c_can_enable_all_interrupts(priv, DISABLE_ALL_INTERRUPTS);
> +		can_bus_off(dev);
> +	}
> +
> +	/*
> +	 * check for 'last error code' which tells us the
> +	 * type of the last error to occur on the CAN bus
> +	 */
> +	switch (lec_type) {
> +		/* common for all type of bus errors */
> +		priv->can.can_stats.bus_error++;
> +		stats->rx_errors++;
> +		cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR;
> +		cf->data[2] |= CAN_ERR_PROT_UNSPEC;

Are you sure that this part is ever executed? Statements placed before the
first case label of a switch are never reached. I wonder why the compiler
does not complain.

> +	case LEC_STUFF_ERROR:
> +		dev_dbg(dev->dev.parent, "stuff error\n");
> +		cf->data[2] |= CAN_ERR_PROT_STUFF;
> +		break;
> +
> +	case LEC_FORM_ERROR:
> +		dev_dbg(dev->dev.parent, "form error\n");
> +		cf->data[2] |= CAN_ERR_PROT_FORM;
> +		break;
> +
> +	case LEC_ACK_ERROR:
> +		dev_dbg(dev->dev.parent, "ack error\n");
> +		cf->data[2] |= (CAN_ERR_PROT_LOC_ACK |
> +				CAN_ERR_PROT_LOC_ACK_DEL);
> +		break;
> +
> +	case LEC_BIT1_ERROR:
> +		dev_dbg(dev->dev.parent, "bit1 error\n");
> +		cf->data[2] |= CAN_ERR_PROT_BIT1;
> +		break;
> +
> +	case LEC_BIT0_ERROR:
> +		dev_dbg(dev->dev.parent, "bit0 error\n");
> +		cf->data[2] |= CAN_ERR_PROT_BIT0;
> +		break;
> +
> +	case LEC_CRC_ERROR:
> +		dev_dbg(dev->dev.parent, "CRC error\n");
> +		cf->data[2] |= (CAN_ERR_PROT_LOC_CRC_SEQ |
> +				CAN_ERR_PROT_LOC_CRC_DEL);
> +		break;
> +	}
> +
> +	netif_receive_skb(skb);
> +	stats->rx_packets++;
> +	stats->rx_bytes += cf->can_dlc;
> +
> +	return 1;
> +}
> +
> +static int c_can_poll(struct napi_struct *napi, int quota)
> +{
> +	u16 irqstatus;
> +	int lec_type = 0;
> +	int work_done = 0;
> +	struct net_device *dev = napi->dev;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	enum c_can_bus_error_types error_type = C_CAN_NO_ERROR;
> +
> +	irqstatus = priv->read_reg(priv, &priv->reg_base->ir);
> +
> +	/* status events have the highest priority */
> +	if (irqstatus == STATUS_INTERRUPT) {
> +		priv->current_status = priv->read_reg(priv,
> +					&priv->reg_base->status);
> +
> +		/* handle Tx/Rx events */
> +		if (priv->current_status & STATUS_TXOK)
> +			priv->write_reg(priv, &priv->reg_base->status,
> +					(priv->current_status & ~STATUS_TXOK));

Outer brackets are not needed, here and in similar expressions below.

> +
> +		if (priv->current_status & STATUS_RXOK)
> +			priv->write_reg(priv, &priv->reg_base->status,
> +					(priv->current_status & ~STATUS_RXOK));
> +
> +		/* handle bus error events */
> +		if (priv->current_status & STATUS_EWARN) {
> +			dev_dbg(dev->dev.parent,
> +					"entered error warning state\n");
> +			error_type = C_CAN_ERROR_WARNING;
> +		}
> +		if ((priv->current_status & STATUS_EPASS) &&
> +				(!(priv->last_status & STATUS_EPASS))) {
> +			dev_dbg(dev->dev.parent,
> +					"entered error passive state\n");
> +			error_type = C_CAN_ERROR_PASSIVE;
> +		}
> +		if ((priv->current_status & STATUS_BOFF) &&
> +				(!(priv->last_status & STATUS_BOFF))) {
> +			dev_dbg(dev->dev.parent,
> +					"entered bus off state\n");
> +			error_type = C_CAN_BUS_OFF;
> +		}
> +
> +		/* handle bus recovery events */
> +		if ((!(priv->current_status & STATUS_EPASS)) &&
> +				(priv->last_status & STATUS_EPASS)) {
> +			dev_dbg(dev->dev.parent,
> +					"left error passive state\n");
> +			priv->can.state = CAN_STATE_ERROR_ACTIVE;
> +		}
> +		if ((!(priv->current_status & STATUS_BOFF)) &&
> +				(priv->last_status & STATUS_BOFF)) {
> +			dev_dbg(dev->dev.parent,
> +					"left bus off state\n");
> +			priv->can.state = CAN_STATE_ERROR_ACTIVE;
> +		}
> +
> +		priv->last_status = priv->current_status;
> +
> +		/* handle error on the bus */
> +		lec_type = c_can_has_and_handle_berr(priv);
> +		if (lec_type && (error_type != C_CAN_NO_ERROR))
> +			work_done += c_can_err(dev, error_type, lec_type);
> +	} else if ((irqstatus > C_CAN_MSG_OBJ_RX_FIRST) &&
> +			(irqstatus <= C_CAN_MSG_OBJ_RX_LAST)) {
> +		/* handle events corresponding to receive message objects */
> +		work_done += c_can_do_rx_poll(dev, (quota - work_done));
> +	} else if ((irqstatus > C_CAN_MSG_OBJ_TX_FIRST) &&
> +			(irqstatus <= C_CAN_MSG_OBJ_TX_LAST)) {
> +		/* handle events corresponding to transmit message objects */
> +		c_can_do_tx(dev);
> +	}
> +
> +	if (work_done < quota) {
> +		napi_complete(napi);
> +		/* enable all IRQs */
> +		c_can_enable_all_interrupts(priv, ENABLE_ALL_INTERRUPTS);
> +	}
> +
> +	return work_done;
> +}
> +
> +static irqreturn_t c_can_isr(int irq, void *dev_id)
> +{
> +	struct net_device *dev = (struct net_device *)dev_id;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	/* disable all interrupts and schedule the NAPI */
> +	c_can_enable_all_interrupts(priv, DISABLE_ALL_INTERRUPTS);
> +	napi_schedule(&priv->napi);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static int c_can_open(struct net_device *dev)
> +{
> +	int err;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	/* open the can device */
> +	err = open_candev(dev);
> +	if (err) {
> +		dev_err(dev->dev.parent, "failed to open can device\n");
> +		return err;
> +	}
> +
> +	/* register interrupt handler */
> +	err = request_irq(dev->irq, &c_can_isr, priv->irq_flags, dev->name,
> +				dev);
> +	if (err < 0) {
> +		dev_err(dev->dev.parent, "failed to attach interrupt\n");

s/attach/request/ ?

> +		goto exit_irq_fail;
> +	}
> +
> +	/* start the c_can controller */
> +	c_can_start(dev);
> +
> +	napi_enable(&priv->napi);
> +	netif_start_queue(dev);
> +
> +	return 0;
> +
> +exit_irq_fail:
> +	close_candev(dev);
> +	return err;
> +}
> +
> +static int c_can_close(struct net_device *dev)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	netif_stop_queue(dev);
> +	napi_disable(&priv->napi);
> +	c_can_stop(dev);
> +	free_irq(dev->irq, dev);
> +	close_candev(dev);
> +
> +	return 0;
> +}
> +
> +struct net_device *alloc_c_can_dev(void)
> +{
> +	struct net_device *dev;
> +	struct c_can_priv *priv;
> +
> +	dev = alloc_candev(sizeof(struct c_can_priv), C_CAN_MSG_OBJ_TX_NUM);
> +	if (!dev)
> +		return NULL;
> +
> +	priv = netdev_priv(dev);
> +	netif_napi_add(dev, &priv->napi, c_can_poll, C_CAN_NAPI_WEIGHT);
> +
> +	priv->dev = dev;
> +	priv->can.bittiming_const = &c_can_bittiming_const;
> +	priv->can.do_set_bittiming = c_can_set_bittiming;
> +	priv->can.do_set_mode = c_can_set_mode;
> +	priv->can.do_get_berr_counter = c_can_get_berr_counter;
> +	priv->can.ctrlmode_supported = CAN_CTRLMODE_ONE_SHOT |
> +					CAN_CTRLMODE_LOOPBACK |
> +					CAN_CTRLMODE_LISTENONLY |
> +					CAN_CTRLMODE_BERR_REPORTING;
> +
> +	return dev;
> +}
> +EXPORT_SYMBOL_GPL(alloc_c_can_dev);
> +
> +void free_c_can_dev(struct net_device *dev)
> +{
> +	free_candev(dev);
> +}
> +EXPORT_SYMBOL_GPL(free_c_can_dev);
> +
> +static const struct net_device_ops c_can_netdev_ops = {
> +	.ndo_open = c_can_open,
> +	.ndo_stop = c_can_close,
> +	.ndo_start_xmit = c_can_start_xmit,
> +};
> +
> +int register_c_can_dev(struct net_device *dev)
> +{
> +	dev->flags |= IFF_ECHO;	/* we support local echo */
> +	dev->netdev_ops = &c_can_netdev_ops;
> +
> +	return register_candev(dev);
> +}
> +EXPORT_SYMBOL_GPL(register_c_can_dev);
> +
> +void unregister_c_can_dev(struct net_device *dev)
> +{
> +	unregister_candev(dev);
> +}
> +EXPORT_SYMBOL_GPL(unregister_c_can_dev);
> +
> +MODULE_AUTHOR("Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>");
> +MODULE_LICENSE("GPL v2");
> +MODULE_DESCRIPTION("CAN bus driver for Bosch C_CAN controller");
> diff --git a/drivers/net/can/c_can/c_can.h b/drivers/net/can/c_can/c_can.h
> new file mode 100644
> index 0000000..fafc5e6
> --- /dev/null
> +++ b/drivers/net/can/c_can/c_can.h
> @@ -0,0 +1,235 @@
> +/*
> + * CAN bus driver for Bosch C_CAN controller
> + *
> + * Copyright (C) 2010 ST Microelectronics
> + * Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>
> + *
> + * Borrowed heavily from the C_CAN driver originally written by:
> + * Copyright (C) 2007
> + * - Sascha Hauer, Marc Kleine-Budde, Pengutronix <s.hauer-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
> + * - Simon Kallweit, intefo AG <simon.kallweit-+G9qxTFKJT/tRgLqZ5aouw@public.gmane.org>
> + *
> + * TX and RX NAPI implementation has been borrowed from at91 CAN driver
> + * written by:
> + * Copyright
> + * (C) 2007 by Hans J. Koch <hjk-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org>
> + * (C) 2008, 2009 by Marc Kleine-Budde <kernel-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
> + *
> + * Bosch C_CAN controller is compliant to CAN protocol version 2.0 part A and B.
> + * Bosch C_CAN user manual can be obtained from:
> + * http://www.semiconductors.bosch.de/pdf/Users_Manual_C_CAN.pdf
> + *
> + * This file is licensed under the terms of the GNU General Public
> + * License version 2. This program is licensed "as is" without any
> + * warranty of any kind, whether express or implied.
> + */
> +
> +#ifndef C_CAN_H
> +#define C_CAN_H
> +
> +/* control register */
> +#define CONTROL_TEST		BIT(7)
> +#define CONTROL_CCE		BIT(6)
> +#define CONTROL_DISABLE_AR	BIT(5)
> +#define CONTROL_ENABLE_AR	(0 << 5)
> +#define CONTROL_EIE		BIT(3)
> +#define CONTROL_SIE		BIT(2)
> +#define CONTROL_IE		BIT(1)
> +#define CONTROL_INIT		BIT(0)
> +
> +/* test register */
> +#define TEST_RX			BIT(7)
> +#define TEST_TX1		BIT(6)
> +#define TEST_TX2		BIT(5)
> +#define TEST_LBACK		BIT(4)
> +#define TEST_SILENT		BIT(3)
> +#define TEST_BASIC		BIT(2)
> +
> +/* status register */
> +#define STATUS_BOFF		BIT(7)
> +#define STATUS_EWARN		BIT(6)
> +#define STATUS_EPASS		BIT(5)
> +#define STATUS_RXOK		BIT(4)
> +#define STATUS_TXOK		BIT(3)
> +#define STATUS_LEC_MASK		0x07
> +
> +/* error counter register */
> +#define ERR_COUNTER_TEC_MASK	0xff
> +#define ERR_COUNTER_TEC_SHIFT	0
> +#define ERR_COUNTER_REC_SHIFT	8
> +#define ERR_COUNTER_REC_MASK	(0x7f << ERR_COUNTER_REC_SHIFT)
> +#define ERR_COUNTER_RP_SHIFT	15
> +#define ERR_COUNTER_RP_MASK	(0x1 << ERR_COUNTER_RP_SHIFT)
> +
> +/* bit-timing register */
> +#define BTR_BRP_MASK		0x3f
> +#define BTR_BRP_SHIFT		0
> +#define BTR_SJW_SHIFT		6
> +#define BTR_SJW_MASK		(0x3 << BTR_SJW_SHIFT)
> +#define BTR_TSEG1_SHIFT		8
> +#define BTR_TSEG1_MASK		(0xf << BTR_TSEG1_SHIFT)
> +#define BTR_TSEG2_SHIFT		12
> +#define BTR_TSEG2_MASK		(0x7 << BTR_TSEG2_SHIFT)
> +
> +/* brp extension register */
> +#define BRP_EXT_BRPE_MASK	0x0f
> +#define BRP_EXT_BRPE_SHIFT	0
> +
> +/* IFx command request */
> +#define IF_COMR_BUSY		BIT(15)
> +
> +/* IFx command mask */
> +#define IF_COMM_WR		BIT(7)
> +#define IF_COMM_MASK		BIT(6)
> +#define IF_COMM_ARB		BIT(5)
> +#define IF_COMM_CONTROL		BIT(4)
> +#define IF_COMM_CLR_INT_PND	BIT(3)
> +#define IF_COMM_TXRQST		BIT(2)
> +#define IF_COMM_DATAA		BIT(1)
> +#define IF_COMM_DATAB		BIT(0)
> +#define IF_COMM_ALL		(IF_COMM_MASK | IF_COMM_ARB | \
> +				IF_COMM_CONTROL | IF_COMM_TXRQST | \
> +				IF_COMM_DATAA | IF_COMM_DATAB)
> +
> +/* IFx arbitration */
> +#define IF_ARB_MSGVAL		BIT(15)
> +#define IF_ARB_MSGXTD		BIT(14)
> +#define IF_ARB_TRANSMIT		BIT(13)
> +
> +/* IFx message control */
> +#define IF_MCONT_NEWDAT		BIT(15)
> +#define IF_MCONT_MSGLST		BIT(14)
> +#define IF_MCONT_CLR_MSGLST	(0 << 14)
> +#define IF_MCONT_INTPND		BIT(13)
> +#define IF_MCONT_UMASK		BIT(12)
> +#define IF_MCONT_TXIE		BIT(11)
> +#define IF_MCONT_RXIE		BIT(10)
> +#define IF_MCONT_RMTEN		BIT(9)
> +#define IF_MCONT_TXRQST		BIT(8)
> +#define IF_MCONT_EOB		BIT(7)
> +#define IF_MCONT_DLC_MASK	0xf
> +
> +/*
> + * IFx register masks:
> + * allow easy operation on 16-bit registers when the
> + * argument is 32-bit instead
> + */
> +#define IFX_WRITE_LOW_16BIT(x)	((x) & 0xFFFF)
> +#define IFX_WRITE_HIGH_16BIT(x)	(((x) & 0xFFFF0000) >> 16)
> +
> +/* message object split */
> +#define C_CAN_NO_OF_OBJECTS	31
> +#define C_CAN_MSG_OBJ_RX_NUM	16
> +#define C_CAN_MSG_OBJ_TX_NUM	16
> +
> +#define C_CAN_MSG_OBJ_RX_FIRST	0
> +#define C_CAN_MSG_OBJ_RX_LAST	(C_CAN_MSG_OBJ_RX_FIRST + \
> +				C_CAN_MSG_OBJ_RX_NUM - 1)
> +
> +#define C_CAN_MSG_OBJ_TX_FIRST	(C_CAN_MSG_OBJ_RX_LAST + 1)
> +#define C_CAN_MSG_OBJ_TX_LAST	(C_CAN_MSG_OBJ_TX_FIRST + \
> +				C_CAN_MSG_OBJ_TX_NUM - 1)
> +
> +#define C_CAN_MSG_OBJ_RX_SPLIT	8
> +#define C_CAN_MSG_RX_LOW_LAST	(C_CAN_MSG_OBJ_RX_SPLIT - 1)
> +
> +#define C_CAN_NEXT_MSG_OBJ_MASK	(C_CAN_MSG_OBJ_TX_NUM - 1)
> +#define RECEIVE_OBJECT_BITS	0x0000ffff
> +
> +/* status interrupt */
> +#define STATUS_INTERRUPT	0x8000
> +
> +/* global interrupt masks */
> +#define ENABLE_ALL_INTERRUPTS	1
> +#define DISABLE_ALL_INTERRUPTS	0
> +
> +/* minimum timeout for checking BUSY status */
> +#define MIN_TIMEOUT_VALUE	6
> +
> +/* napi related */
> +#define C_CAN_NAPI_WEIGHT	C_CAN_MSG_OBJ_RX_NUM
> +
> +/* c_can IF registers */
> +struct c_can_if_regs {
> +	u16 com_reg;
> +	u16 com_mask;
> +	u16 mask1;
> +	u16 mask2;
> +	u16 arb1;
> +	u16 arb2;
> +	u16 msg_cntrl;
> +	u16 data[4];
> +	u16 _reserved[13];
> +};
> +
> +/* c_can hardware registers */
> +struct c_can_regs {
> +	u16 control;
> +	u16 status;
> +	u16 error_counter;
> +	u16 btr;
> +	u16 ir;
> +	u16 test;
> +	u16 brp_ext;
> +	u16 _reserved1;
> +	struct c_can_if_regs ifreg[2]; /* [0] = IF1 and [1] = IF2 */

Why not just "if" instead of "ifreg"? That would also nicely shorten
many long expressions.

> +	u16 _reserved2[8];
> +	u16 txrqst1;
> +	u16 txrqst2;
> +	u16 _reserved3[6];
> +	u16 newdat1;
> +	u16 newdat2;
> +	u16 _reserved4[6];
> +	u16 intpnd1;
> +	u16 intpnd2;
> +	u16 _reserved5[6];
> +	u16 msgval1;
> +	u16 msgval2;
> +	u16 _reserved6[6];
> +};

Above you use both rather long and heavily abbreviated names, e.g.
"error_counter" vs. "ir". Something in between would be nice.

> +/* c_can lec values */
> +enum c_can_lec_type {
> +	LEC_STUFF_ERROR = 1,
> +	LEC_FORM_ERROR,
> +	LEC_ACK_ERROR,
> +	LEC_BIT1_ERROR,
> +	LEC_BIT0_ERROR,
> +	LEC_CRC_ERROR,
> +};
> +
> +/*
> + * c_can error types:
> + * Bus errors (BUS_OFF, ERROR_WARNING, ERROR_PASSIVE) are supported
> + */
> +enum c_can_bus_error_types {
> +	C_CAN_NO_ERROR = 0,
> +	C_CAN_BUS_OFF,
> +	C_CAN_ERROR_WARNING,
> +	C_CAN_ERROR_PASSIVE,
> +};
> +
> +/* c_can private data structure */
> +struct c_can_priv {
> +	struct can_priv can;	/* must be the first member */
> +	struct napi_struct napi;
> +	struct net_device *dev;
> +	int tx_object;
> +	int current_status;
> +	int last_status;
> +	u16 (*read_reg) (struct c_can_priv *priv, void *reg);
> +	void (*write_reg) (struct c_can_priv *priv, void *reg, u16 val);
> +	struct c_can_regs __iomem *reg_base;

s/reg_base/regs/ seems more logical to me. reg_base sounds like a "void *"
member. 

> +	unsigned long irq_flags; /* for request_irq() */
> +	unsigned int tx_next;
> +	unsigned int tx_echo;
> +	struct clk *clk;

clk is a platform-specific variable, e.g. a PCI-based driver will not need it.
Therefore a member "priv" would make sense. Also it would nicely shorten
many long expressions.

> +};
> +
> +void c_can_enable_all_interrupts(struct c_can_priv *priv, int enable);
> +struct net_device *alloc_c_can_dev(void);
> +void free_c_can_dev(struct net_device *dev);
> +int register_c_can_dev(struct net_device *dev);
> +void unregister_c_can_dev(struct net_device *dev);
> +
> +#endif /* C_CAN_H */
> diff --git a/drivers/net/can/c_can/c_can_platform.c b/drivers/net/can/c_can/c_can_platform.c
> new file mode 100644
> index 0000000..482a57e
> --- /dev/null
> +++ b/drivers/net/can/c_can/c_can_platform.c
> @@ -0,0 +1,210 @@
> +/*
> + * Platform CAN bus driver for Bosch C_CAN controller
> + *
> + * Copyright (C) 2010 ST Microelectronics
> + * Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>
> + *
> + * Borrowed heavily from the C_CAN driver originally written by:
> + * Copyright (C) 2007
> + * - Sascha Hauer, Marc Kleine-Budde, Pengutronix <s.hauer-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
> + * - Simon Kallweit, intefo AG <simon.kallweit-+G9qxTFKJT/tRgLqZ5aouw@public.gmane.org>
> + *
> + * Bosch C_CAN controller is compliant to CAN protocol version 2.0 part A and B.
> + * Bosch C_CAN user manual can be obtained from:
> + * http://www.semiconductors.bosch.de/pdf/Users_Manual_C_CAN.pdf
> + *
> + * This file is licensed under the terms of the GNU General Public
> + * License version 2. This program is licensed "as is" without any
> + * warranty of any kind, whether express or implied.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/version.h>
> +#include <linux/module.h>
> +#include <linux/interrupt.h>
> +#include <linux/delay.h>
> +#include <linux/netdevice.h>
> +#include <linux/if_arp.h>
> +#include <linux/if_ether.h>
> +#include <linux/list.h>
> +#include <linux/delay.h>
> +#include <linux/io.h>
> +#include <linux/platform_device.h>
> +#include <linux/clk.h>
> +
> +#include <linux/can/dev.h>
> +
> +#include "c_can.h"
> +
> +/*
> + * 16-bit c_can registers can be arranged differently in the memory
> + * architecture of different implementations. For example: 16-bit
> + * registers can be aligned to a 16-bit boundary or 32-bit boundary etc.
> + * Handle the same by providing a common read/write interface.
> + */
> +static u16 c_can_plat_read_reg_aligned_to_16bit(struct c_can_priv *priv,
> +						void *reg)
> +{
> +	return readw(reg);
> +}
> +
> +static void c_can_plat_write_reg_aligned_to_16bit(struct c_can_priv *priv,
> +						void *reg, u16 val)
> +{
> +	writew(val, reg);
> +}
> +
> +static u16 c_can_plat_read_reg_aligned_to_32bit(struct c_can_priv *priv,
> +						void *reg)
> +{
> +	return readw(reg + (long)reg - (long)priv->reg_base);
> +}
> +
> +static void c_can_plat_write_reg_aligned_to_32bit(struct c_can_priv *priv,
> +						void *reg, u16 val)
> +{
> +	writew(val, reg + (long)reg - (long)priv->reg_base);
> +}
> +
> +static int __devinit c_can_plat_probe(struct platform_device *pdev)
> +{
> +	int ret;
> +	void __iomem *addr;
> +	struct net_device *dev;
> +	struct c_can_priv *priv;
> +	struct resource *mem, *irq;
> +	struct clk *clk;
> +
> +	/* get the appropriate clk */
> +	clk = clk_get(&pdev->dev, NULL);
> +	if (IS_ERR(clk)) {
> +		dev_err(&pdev->dev, "no clock defined\n");
> +		ret = -ENODEV;
> +		goto exit;
> +	}
> +
> +	/* get the platform data */
> +	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> +	irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
> +	if (!mem || (irq <= 0)) {
> +		ret = -ENODEV;
> +		goto exit_free_clk;
> +	}
> +
> +	if (!request_mem_region(mem->start, resource_size(mem),
> +				KBUILD_MODNAME)) {
> +		dev_err(&pdev->dev, "resource unavailable\n");
> +		ret = -ENODEV;
> +		goto exit_free_clk;
> +	}
> +
> +	addr = ioremap(mem->start, resource_size(mem));
> +	if (!addr) {
> +		dev_err(&pdev->dev, "failed to map can port\n");
> +		ret = -ENOMEM;
> +		goto exit_release_mem;
> +	}
> +
> +	/* allocate the c_can device */
> +	dev = alloc_c_can_dev();
> +	if (!dev) {
> +		ret = -ENOMEM;
> +		goto exit_iounmap;
> +	}
> +
> +	priv = netdev_priv(dev);
> +
> +	dev->irq = irq->start;
> +	priv->irq_flags = irq->flags;
> +	priv->reg_base = addr;
> +	priv->can.clock.freq = clk_get_rate(clk);
> +	priv->clk = clk;
> +
> +	switch (mem->flags & IORESOURCE_MEM_TYPE_MASK) {
> +	case IORESOURCE_MEM_32BIT:
> +		priv->read_reg = c_can_plat_read_reg_aligned_to_32bit;
> +		priv->write_reg = c_can_plat_write_reg_aligned_to_32bit;
> +		break;
> +	case IORESOURCE_MEM_16BIT:
> +	default:
> +		priv->read_reg = c_can_plat_read_reg_aligned_to_16bit;
> +		priv->write_reg = c_can_plat_write_reg_aligned_to_16bit;
> +		break;
> +	}
> +
> +	platform_set_drvdata(pdev, dev);
> +	SET_NETDEV_DEV(dev, &pdev->dev);
> +
> +	ret = register_c_can_dev(dev);
> +	if (ret) {
> +		dev_err(&pdev->dev, "registering %s failed (err=%d)\n",
> +			KBUILD_MODNAME, ret);
> +		goto exit_free_device;
> +	}
> +
> +	dev_info(&pdev->dev, "%s device registered (reg_base=%p, irq=%d)\n",
> +		 KBUILD_MODNAME, priv->reg_base, dev->irq);
> +	return 0;
> +
> +exit_free_device:
> +	platform_set_drvdata(pdev, NULL);
> +	free_c_can_dev(dev);
> +exit_iounmap:
> +	iounmap(addr);
> +exit_release_mem:
> +	release_mem_region(mem->start, resource_size(mem));
> +exit_free_clk:
> +	clk_put(clk);
> +exit:
> +	dev_err(&pdev->dev, "probe failed\n");
> +
> +	return ret;
> +}
> +
> +static int __devexit c_can_plat_remove(struct platform_device *pdev)
> +{
> +	struct net_device *dev = platform_get_drvdata(pdev);
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	struct resource *mem;
> +
> +	/* disable all interrupts */
> +	c_can_enable_all_interrupts(priv, DISABLE_ALL_INTERRUPTS);

To avoid exporting that function, couldn't it be done at the beginning of
unregister_c_can_dev()?

> +
> +	unregister_c_can_dev(dev);
> +	platform_set_drvdata(pdev, NULL);
> +
> +	free_c_can_dev(dev);
> +	iounmap(priv->reg_base);
> +
> +	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> +	release_mem_region(mem->start, resource_size(mem));
> +
> +	clk_put(priv->clk);
> +
> +	return 0;
> +}
> +
> +static struct platform_driver c_can_plat_driver = {
> +	.driver = {
> +		.name = KBUILD_MODNAME,
> +		.owner = THIS_MODULE,
> +	},
> +	.probe = c_can_plat_probe,
> +	.remove = __devexit_p(c_can_plat_remove),
> +};
> +
> +static int __init c_can_plat_init(void)
> +{
> +	return platform_driver_register(&c_can_plat_driver);
> +}
> +module_init(c_can_plat_init);
> +
> +static void __exit c_can_plat_exit(void)
> +{
> +	platform_driver_unregister(&c_can_plat_driver);
> +}
> +module_exit(c_can_plat_exit);
> +
> +MODULE_AUTHOR("Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>");
> +MODULE_LICENSE("GPL v2");
> +MODULE_DESCRIPTION("Platform CAN bus driver for Bosch C_CAN controller");

Thanks for your contribution.

Wolfgang.

^ permalink raw reply

* [RFC v3 PATCH] m68knommu: added dm9000 support
From: Angelo Dureghello @ 2011-01-08  9:08 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-m68k

This patch allows the dm9000 network chip to be used with a big-endian
m68knommu cpu. From the circuit-wiring point of view, the cpu data bus
connected to the dm9000 chip should be byte-swapped in hardware, by
crossing the byte lanes (D0:7 to D24:31, etc.).
In any case, an option has also been added to swap the bytes in the driver,
for boards where the cpu has been wired straight (D0:D31) to the dm9000.

Signed-off-by: Angelo Dureghello <angelo70@gmail.com>

---
--- linux/drivers/net/Kconfig.orig	2011-01-05 17:11:37.992376124 +0100
+++ linux/drivers/net/Kconfig	2011-01-08 09:53:48.231300064 +0100
@@ -960,7 +960,7 @@ config TI_DAVINCI_EMAC
 
 config DM9000
 	tristate "DM9000 support"
-	depends on ARM || BLACKFIN || MIPS
+	depends on COLDFIRE || ARM || BLACKFIN || MIPS
 	select CRC32
 	select MII
 	---help---
@@ -986,6 +986,14 @@ config DM9000_FORCE_SIMPLE_PHY_POLL
 	  costly MII PHY reads. Note, this will not work if the chip is
 	  operating with an external PHY.
 
+config DM9000_32BIT_SW_SWAP
+	bool "Software byte swap for 32 bit data bus"
+	depends on DM9000 && COLDFIRE
+	---help---
+	  This configuration allows to swap data bytes from the dm9000
+	  driver itself, when the big endian cpu is wired straight to
+	  the dm9000 32 bit data bus.
+
 config ENC28J60
 	tristate "ENC28J60 support"
 	depends on EXPERIMENTAL && SPI && NET_ETHERNET

--- linux/drivers/net/dm9000.c.orig	2010-12-30 23:19:39.747836070 +0100
+++ linux/drivers/net/dm9000.c	2011-01-08 09:54:28.543551323 +0100
@@ -158,9 +158,17 @@ dm9000_reset(board_info_t * db)
 	dev_dbg(db->dev, "resetting device\n");
 
 	/* RESET device */
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(DM9000_NCR, db->io_addr);
+#else
 	writeb(DM9000_NCR, db->io_addr);
+#endif
 	udelay(200);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(NCR_RST, db->io_data);
+#else
 	writeb(NCR_RST, db->io_data);
+#endif
 	udelay(200);
 }
 
@@ -170,8 +178,13 @@ dm9000_reset(board_info_t * db)
 static u8
 ior(board_info_t * db, int reg)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg, db->io_addr);
+	return (u8)readl(db->io_data);
+#else
 	writeb(reg, db->io_addr);
 	return readb(db->io_data);
+#endif
 }
 
 /*
@@ -181,43 +194,72 @@ ior(board_info_t * db, int reg)
 static void
 iow(board_info_t * db, int reg, int value)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg, db->io_addr);
+	writel(value, db->io_data);
+#else
 	writeb(reg, db->io_addr);
 	writeb(value, db->io_data);
+#endif
 }
 
 /* routines for sending block to chip */
 
 static void dm9000_outblk_8bit(void __iomem *reg, void *data, int count)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writesbsw(reg, data, count);
+#else
 	writesb(reg, data, count);
+#endif
 }
 
 static void dm9000_outblk_16bit(void __iomem *reg, void *data, int count)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writeswsw(reg, data, (count+1) >> 1);
+#else
 	writesw(reg, data, (count+1) >> 1);
+#endif
 }
 
 static void dm9000_outblk_32bit(void __iomem *reg, void *data, int count)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writeslsw(reg, data, (count+3) >> 2);
+#else
 	writesl(reg, data, (count+3) >> 2);
+#endif
 }
 
 /* input block from chip to memory */
 
 static void dm9000_inblk_8bit(void __iomem *reg, void *data, int count)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	readsbsw(reg, data, count);
+#else
 	readsb(reg, data, count);
+#endif
 }
 
 
 static void dm9000_inblk_16bit(void __iomem *reg, void *data, int count)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	readswsw(reg, data, (count+1) >> 1);
+#else
 	readsw(reg, data, (count+1) >> 1);
+#endif
 }
 
 static void dm9000_inblk_32bit(void __iomem *reg, void *data, int count)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	readslsw(reg, data, (count+3) >> 2);
+#else
 	readsl(reg, data, (count+3) >> 2);
+#endif
 }
 
 /* dump block from chip to null */
@@ -863,8 +905,12 @@ static void dm9000_timeout(struct net_de
 	netif_wake_queue(dev);
 
 	/* Restore previous register address */
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg_save, db->io_addr);
+#else
 	writeb(reg_save, db->io_addr);
-	spin_unlock_irqrestore(&db->lock, flags);
+#endif
+	spin_unlock_irqrestore(&db->lock,flags);
 }
 
 static void dm9000_send_packet(struct net_device *dev,
@@ -908,7 +954,11 @@ dm9000_start_xmit(struct sk_buff *skb, s
 	spin_lock_irqsave(&db->lock, flags);
 
 	/* Move data to DM9000 TX RAM */
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+   writel(DM9000_MWCMD, db->io_addr);
+#else
 	writeb(DM9000_MWCMD, db->io_addr);
+#endif	
 
 	(db->outblk)(db->io_data, skb->data, skb->len);
 	dev->stats.tx_bytes += skb->len;
@@ -981,7 +1031,11 @@ dm9000_rx(struct net_device *dev)
 		ior(db, DM9000_MRCMDX);	/* Dummy read */
 
 		/* Get most updated data */
-		rxbyte = readb(db->io_data);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+      rxbyte = (u8)readl(db->io_data);
+#else
+      rxbyte = readb(db->io_data);
+#endif
 
 		/* Status check: this byte must be 0 or 1 */
 		if (rxbyte & DM9000_PKT_ERR) {
@@ -996,7 +1050,12 @@ dm9000_rx(struct net_device *dev)
 
 		/* A packet ready now  & Get status/length */
 		GoodPacket = true;
+
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+		writel(DM9000_MRCMD, db->io_addr);
+#else
 		writeb(DM9000_MRCMD, db->io_addr);
+#endif
 
 		(db->inblk)(db->io_data, &rxhdr, sizeof(rxhdr));
 
@@ -1085,7 +1144,11 @@ static irqreturn_t dm9000_interrupt(int
 	spin_lock_irqsave(&db->lock, flags);
 
 	/* Save previous register address */
-	reg_save = readb(db->io_addr);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+   reg_save = (u8)readl(db->io_addr);
+#else
+   reg_save = readb(db->io_addr);
+#endif
 
 	/* Disable all interrupts */
 	iow(db, DM9000_IMR, IMR_PAR);
@@ -1116,7 +1179,11 @@ static irqreturn_t dm9000_interrupt(int
 	iow(db, DM9000_IMR, db->imr_all);
 
 	/* Restore previous register address */
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg_save, db->io_addr);
+#else
 	writeb(reg_save, db->io_addr);
+#endif
 
 	spin_unlock_irqrestore(&db->lock, flags);
 
@@ -1237,7 +1304,11 @@ dm9000_phy_read(struct net_device *dev,
 	spin_lock_irqsave(&db->lock,flags);
 
 	/* Save previous register address */
-	reg_save = readb(db->io_addr);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+   reg_save = (u8)readl(db->io_addr);
+#else
+   reg_save = readb(db->io_addr);
+#endif
 
 	/* Fill the phyxcer register into REG_0C */
 	iow(db, DM9000_EPAR, DM9000_PHY | reg);
@@ -1250,7 +1321,11 @@ dm9000_phy_read(struct net_device *dev,
 	dm9000_msleep(db, 1);		/* Wait read complete */
 
 	spin_lock_irqsave(&db->lock,flags);
-	reg_save = readb(db->io_addr);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+   reg_save = (u8)readl(db->io_addr);
+#else
+   reg_save = readb(db->io_addr);
+#endif
 
 	iow(db, DM9000_EPCR, 0x0);	/* Clear phyxcer read command */
 
@@ -1258,7 +1333,11 @@ dm9000_phy_read(struct net_device *dev,
 	ret = (ior(db, DM9000_EPDRH) << 8) | ior(db, DM9000_EPDRL);
 
 	/* restore the previous address */
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg_save, db->io_addr);
+#else
 	writeb(reg_save, db->io_addr);
+#endif
 	spin_unlock_irqrestore(&db->lock,flags);
 
 	mutex_unlock(&db->addr_lock);
@@ -1284,7 +1363,11 @@ dm9000_phy_write(struct net_device *dev,
 	spin_lock_irqsave(&db->lock,flags);
 
 	/* Save previous register address */
-	reg_save = readb(db->io_addr);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+   reg_save = (u8)readl(db->io_addr);
+#else
+   reg_save = readb(db->io_addr);
+#endif
 
 	/* Fill the phyxcer register into REG_0C */
 	iow(db, DM9000_EPAR, DM9000_PHY | reg);
@@ -1295,18 +1378,30 @@ dm9000_phy_write(struct net_device *dev,
 
 	iow(db, DM9000_EPCR, EPCR_EPOS | EPCR_ERPRW);	/* Issue phyxcer write command */
 
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg_save, db->io_addr);
+#else
 	writeb(reg_save, db->io_addr);
+#endif
 	spin_unlock_irqrestore(&db->lock, flags);
 
 	dm9000_msleep(db, 1);		/* Wait write complete */
 
 	spin_lock_irqsave(&db->lock,flags);
-	reg_save = readb(db->io_addr);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+   reg_save = (u8)readl(db->io_addr);
+#else
+   reg_save = readb(db->io_addr);
+#endif
 
 	iow(db, DM9000_EPCR, 0x0);	/* Clear phyxcer write command */
 
 	/* restore the previous address */
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg_save, db->io_addr);
+#else
 	writeb(reg_save, db->io_addr);
+#endif
 
 	spin_unlock_irqrestore(&db->lock, flags);
 	mutex_unlock(&db->addr_lock);

--- linux/arch/m68k/include/asm/io_no.h.orig	2011-01-08 09:53:16.835301417 +0100
+++ linux/arch/m68k/include/asm/io_no.h	2011-01-08 09:53:18.523299757 +0100
@@ -47,6 +47,90 @@ static inline unsigned int _swapl(volati
 #define writew(b,addr) (void)((*(volatile unsigned short *) (addr)) = (b))
 #define writel(b,addr) (void)((*(volatile unsigned int *) (addr)) = (b))
 
+static inline void writesb (void __iomem *reg, void *data, int count)
+{
+	unsigned char *p = (unsigned char*) data;
+
+	while (count--) writeb(*p++, reg);
+}
+
+static inline void writesbsw (void __iomem *reg, void *data, int count)
+{
+	unsigned char *p = (unsigned char *) data;
+
+	while (count--) writel((int)(*p++), reg);
+}
+
+static inline void writesw (void __iomem *reg, void *data, int count)
+{
+   unsigned short *p = (unsigned short*) data;
+
+   while (count--) writew(*p++, reg);
+}
+
+static inline void writeswsw (void __iomem *reg, void *data, int count)
+{
+   unsigned short *p = (unsigned short *) data;
+
+   while (count--) writel((int)(_swapw(*p++)), reg);
+}
+
+static inline void writesl (void __iomem *reg, void *data, int count)
+{
+   unsigned long *p = (unsigned long*) data;
+
+   while (count--) writel(*p++, reg);
+}
+
+static inline void writeslsw (void __iomem *reg, void *data, int count)
+{
+   unsigned long *p = (unsigned long *) data;
+
+   while (count--) writel((int)(_swapl(*p++)), reg);
+}
+
+static inline void readsb (void __iomem *reg, void *data, int count)
+{
+   unsigned char *p = (unsigned char *) data;
+
+   while (count--) *p++ = readb(reg);
+}
+
+static inline void readsbsw (void __iomem *reg, void *data, int count)
+{
+   unsigned char *p = (unsigned char *) data;
+
+   while (count--) *p++ = (unsigned char)readl(reg);
+}
+
+static inline void readsw (void __iomem *reg, void *data, int count)
+{
+   unsigned short *p = (unsigned short *) data;
+
+   while (count--) *p++ = readb(reg);
+}
+
+static inline void readswsw (void __iomem *reg, void *data, int count)
+{
+   unsigned short *p = (unsigned short *) data;
+
+   while (count--) *p++ = _swapw((unsigned short)readw(reg));
+}
+
+static inline void readsl (void __iomem *reg, void *data, int count)
+{
+   unsigned long *p = (unsigned long *) data;
+
+   while (count--) *p++ = readb(reg);
+}
+
+static inline void readslsw (void __iomem *reg, void *data, int count)
+{
+   unsigned long *p = (unsigned long *) data;
+
+   while (count--) *p++ = _swapl(readl(reg));
+}
+
 #define __raw_readb readb
 #define __raw_readw readw
 #define __raw_readl readl

^ permalink raw reply

* Re: [PATCH v2] net: ppp: use {get,put}_unaligned_be{16,32}
From: David Miller @ 2011-01-08  4:37 UTC (permalink / raw)
  To: paulus; +Cc: xiaosuo, harvey.harrison, linux-ppp, netdev
In-Reply-To: <20110108031320.GA28926@brick.ozlabs.ibm.com>

From: Paul Mackerras <paulus@samba.org>
Date: Sat, 8 Jan 2011 14:13:20 +1100

> On Fri, Jan 07, 2011 at 05:15:34PM -0800, David Miller wrote:
> 
>> I have to say that every time I go read the header parsing code in the
>> PPP driver, I absolutely regret it.
>> 
>> And Changli's patch fixes some of the readability problems.
> 
> It's up to you whether you merge the patch or not, but surely you
> agree it needs more than a zero-line description?

It's entirely sufficient to me.

He de-open-coded {get,put}_unaligned_be{16,32}() and when open-coding
is eliminated in this way a commit message of "Use {helper function
foo}." is more than enough.

^ permalink raw reply

* Re: [RFC] sched: QFQ - quick fair queue scheduler
From: David Miller @ 2011-01-08  4:34 UTC (permalink / raw)
  To: xiaosuo; +Cc: shemminger, dada1, fabio, netdev, rizzo
In-Reply-To: <AANLkTi=pqZ3CwLTAZnhd-cyQNj8OSeBHsP_bFiH3hJ-_@mail.gmail.com>


Changli, please do not quote an entire patch just to comment upon
one specific portion of that patch.  Quote only the hunks of the
patch you actually want to talk about.

When you quote the entire patch, it wastes bandwidth, and makes it
harder for people to scan around to see your feedback.

I think people who do this have no idea how much pain they cause
for every single person reading their postings.  Nor do they
realize that this makes their feedback get unread completely by
many people.

Please, never do this again.  You contribute far too much for this
to become a habit.

Thank you.

^ permalink raw reply

* [PATCH 2/2] sky2: convert to new VLAN model (v0.2)
From: Stephen Hemminger @ 2011-01-08  4:13 UTC (permalink / raw)
  To: Jesse Gross, David Miller; +Cc: netdev
In-Reply-To: <AANLkTikwGQFByOZGgCCjTJySPa8QYndZ903CFmOkS1Ha@mail.gmail.com>

This converts sky2 to new VLAN offload flags control via ethtool.
It also allows for transmit offload of vlan tagged frames which
was not possible before.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
Changed the setting of vlan_features in this version to keep
non-offload settings (GRO|HIGHDMA) even if vlan offload is not
enabled.

--- a/drivers/net/sky2.c	2011-01-07 20:06:03.082168965 -0800
+++ b/drivers/net/sky2.c	2011-01-07 20:09:06.006180327 -0800
@@ -46,10 +46,6 @@
 
 #include <asm/irq.h>
 
-#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
-#define SKY2_VLAN_TAG_USED 1
-#endif
-
 #include "sky2.h"
 
 #define DRV_NAME		"sky2"
@@ -1326,39 +1322,34 @@ static int sky2_ioctl(struct net_device
 	return err;
 }
 
-#ifdef SKY2_VLAN_TAG_USED
-static void sky2_set_vlan_mode(struct sky2_hw *hw, u16 port, bool onoff)
-{
-	if (onoff) {
-		sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T),
-			     RX_VLAN_STRIP_ON);
-		sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T),
-			     TX_VLAN_TAG_ON);
-	} else {
-		sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T),
-			     RX_VLAN_STRIP_OFF);
-		sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T),
-			     TX_VLAN_TAG_OFF);
-	}
-}
+#define NETIF_F_ALL_VLAN (NETIF_F_HW_VLAN_TX|NETIF_F_HW_VLAN_RX)
 
-static void sky2_vlan_rx_register(struct net_device *dev, struct vlan_group *grp)
+static void sky2_vlan_mode(struct net_device *dev)
 {
 	struct sky2_port *sky2 = netdev_priv(dev);
 	struct sky2_hw *hw = sky2->hw;
 	u16 port = sky2->port;
 
-	netif_tx_lock_bh(dev);
-	napi_disable(&hw->napi);
+	if (dev->features & NETIF_F_HW_VLAN_RX)
+		sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T),
+			     RX_VLAN_STRIP_ON);
+	else
+		sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T),
+			     RX_VLAN_STRIP_OFF);
 
-	sky2->vlgrp = grp;
-	sky2_set_vlan_mode(hw, port, grp != NULL);
+	dev->vlan_features = dev->features &~ NETIF_F_ALL_VLAN;
+	if (dev->features & NETIF_F_HW_VLAN_TX)
+		sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T),
+			     TX_VLAN_TAG_ON);
+	else {
+		sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T),
+			     TX_VLAN_TAG_OFF);
 
-	sky2_read32(hw, B0_Y2_SP_LISR);
-	napi_enable(&hw->napi);
-	netif_tx_unlock_bh(dev);
+		/* Can't do transmit offload of vlan without hw vlan */
+		dev->vlan_features &= ~(NETIF_F_TSO | NETIF_F_SG
+					| NETIF_F_ALL_CSUM);
+	}
 }
-#endif
 
 /* Amount of required worst case padding in rx buffer */
 static inline unsigned sky2_rx_pad(const struct sky2_hw *hw)
@@ -1635,9 +1626,7 @@ static void sky2_hw_up(struct sky2_port
 	sky2_prefetch_init(hw, txqaddr[port], sky2->tx_le_map,
 			   sky2->tx_ring_size - 1);
 
-#ifdef SKY2_VLAN_TAG_USED
-	sky2_set_vlan_mode(hw, port, sky2->vlgrp != NULL);
-#endif
+	sky2_vlan_mode(sky2->netdev);
 
 	sky2_rx_start(sky2);
 }
@@ -1780,7 +1769,7 @@ static netdev_tx_t sky2_xmit_frame(struc
 	}
 
 	ctrl = 0;
-#ifdef SKY2_VLAN_TAG_USED
+
 	/* Add VLAN tag, can piggyback on LRGLEN or ADDR64 */
 	if (vlan_tx_tag_present(skb)) {
 		if (!le) {
@@ -1792,7 +1781,6 @@ static netdev_tx_t sky2_xmit_frame(struc
 		le->length = cpu_to_be16(vlan_tx_tag_get(skb));
 		ctrl |= INS_VLAN;
 	}
-#endif
 
 	/* Handle TCP checksum offload */
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -2432,11 +2420,8 @@ static struct sk_buff *sky2_receive(stru
 	struct sk_buff *skb = NULL;
 	u16 count = (status & GMR_FS_LEN) >> 16;
 
-#ifdef SKY2_VLAN_TAG_USED
-	/* Account for vlan tag */
-	if (sky2->vlgrp && (status & GMR_FS_VLAN))
-		count -= VLAN_HLEN;
-#endif
+	if (status & GMR_FS_VLAN)
+		count -= VLAN_HLEN;	/* Account for vlan tag */
 
 	netif_printk(sky2, rx_status, KERN_DEBUG, dev,
 		     "rx slot %u status 0x%x len %d\n",
@@ -2504,17 +2489,9 @@ static inline void sky2_tx_done(struct n
 static inline void sky2_skb_rx(const struct sky2_port *sky2,
 			       u32 status, struct sk_buff *skb)
 {
-#ifdef SKY2_VLAN_TAG_USED
-	u16 vlan_tag = be16_to_cpu(sky2->rx_tag);
-	if (sky2->vlgrp && (status & GMR_FS_VLAN)) {
-		if (skb->ip_summed == CHECKSUM_NONE)
-			vlan_hwaccel_receive_skb(skb, sky2->vlgrp, vlan_tag);
-		else
-			vlan_gro_receive(&sky2->hw->napi, sky2->vlgrp,
-					 vlan_tag, skb);
-		return;
-	}
-#endif
+	if (status & GMR_FS_VLAN)
+		__vlan_hwaccel_put_tag(skb, be16_to_cpu(sky2->rx_tag));
+
 	if (skb->ip_summed == CHECKSUM_NONE)
 		netif_receive_skb(skb);
 	else
@@ -2631,7 +2608,6 @@ static int sky2_status_intr(struct sky2_
 				goto exit_loop;
 			break;
 
-#ifdef SKY2_VLAN_TAG_USED
 		case OP_RXVLAN:
 			sky2->rx_tag = length;
 			break;
@@ -2639,7 +2615,6 @@ static int sky2_status_intr(struct sky2_
 		case OP_RXCHKSVLAN:
 			sky2->rx_tag = length;
 			/* fall through */
-#endif
 		case OP_RXCHKS:
 			if (likely(sky2->flags & SKY2_FLAG_RX_CHECKSUM))
 				sky2_rx_checksum(sky2, status);
@@ -3042,6 +3017,10 @@ static int __devinit sky2_init(struct sk
 			| SKY2_HW_NEW_LE
 			| SKY2_HW_AUTO_TX_SUM
 			| SKY2_HW_ADV_POWER_CTL;
+
+		/* The workaround for status conflicts VLAN tag detection. */
+		if (hw->chip_rev == CHIP_REV_YU_FE2_A0)
+			hw->flags |= SKY2_HW_VLAN_BROKEN;
 		break;
 
 	case CHIP_ID_YUKON_SUPR:
@@ -4237,15 +4216,28 @@ static int sky2_set_eeprom(struct net_de
 static int sky2_set_flags(struct net_device *dev, u32 data)
 {
 	struct sky2_port *sky2 = netdev_priv(dev);
-	u32 supported =
-		(sky2->hw->flags & SKY2_HW_RSS_BROKEN) ? 0 : ETH_FLAG_RXHASH;
+	unsigned long old_feat = dev->features;
+	u32 supported = 0;
 	int rc;
 
+	if (!(sky2->hw->flags & SKY2_HW_RSS_BROKEN))
+		supported |= ETH_FLAG_RXHASH;
+
+	if (!(sky2->hw->flags & SKY2_HW_VLAN_BROKEN))
+		supported |= ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN;
+
+	printk(KERN_DEBUG "sky2 set_flags: supported %x data %x\n",
+	       supported, data);
+
 	rc = ethtool_op_set_flags(dev, data, supported);
 	if (rc)
 		return rc;
 
-	rx_set_rss(dev);
+	if ((old_feat ^ dev->features) & NETIF_F_RXHASH)
+		rx_set_rss(dev);
+
+	if ((old_feat ^ dev->features) & NETIF_F_ALL_VLAN)
+		sky2_vlan_mode(dev);
 
 	return 0;
 }
@@ -4281,6 +4273,7 @@ static const struct ethtool_ops sky2_eth
 	.get_sset_count = sky2_get_sset_count,
 	.get_ethtool_stats = sky2_get_ethtool_stats,
 	.set_flags	= sky2_set_flags,
+	.get_flags	= ethtool_op_get_flags,
 };
 
 #ifdef CONFIG_SKY2_DEBUG
@@ -4562,9 +4555,6 @@ static const struct net_device_ops sky2_
 	.ndo_change_mtu		= sky2_change_mtu,
 	.ndo_tx_timeout		= sky2_tx_timeout,
 	.ndo_get_stats64	= sky2_get_stats,
-#ifdef SKY2_VLAN_TAG_USED
-	.ndo_vlan_rx_register	= sky2_vlan_rx_register,
-#endif
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= sky2_netpoll,
 #endif
@@ -4580,9 +4570,6 @@ static const struct net_device_ops sky2_
 	.ndo_change_mtu		= sky2_change_mtu,
 	.ndo_tx_timeout		= sky2_tx_timeout,
 	.ndo_get_stats64	= sky2_get_stats,
-#ifdef SKY2_VLAN_TAG_USED
-	.ndo_vlan_rx_register	= sky2_vlan_rx_register,
-#endif
   },
 };
 
@@ -4633,7 +4620,8 @@ static __devinit struct net_device *sky2
 	sky2->port = port;
 
 	dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG
-		| NETIF_F_TSO  | NETIF_F_GRO;
+		| NETIF_F_TSO | NETIF_F_GRO;
+
 	if (highmem)
 		dev->features |= NETIF_F_HIGHDMA;
 
@@ -4641,13 +4629,8 @@ static __devinit struct net_device *sky2
 	if (!(hw->flags & SKY2_HW_RSS_BROKEN))
 		dev->features |= NETIF_F_RXHASH;
 
-#ifdef SKY2_VLAN_TAG_USED
-	/* The workaround for FE+ status conflicts with VLAN tag detection. */
-	if (!(sky2->hw->chip_id == CHIP_ID_YUKON_FE_P &&
-	      sky2->hw->chip_rev == CHIP_REV_YU_FE2_A0)) {
+	if (!(hw->flags & SKY2_HW_VLAN_BROKEN))
 		dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
-	}
-#endif
 
 	/* read the mac address */
 	memcpy_fromio(dev->dev_addr, hw->regs + B2_MAC_1 + port * 8, ETH_ALEN);
--- a/drivers/net/sky2.h	2011-01-07 20:05:59.982101189 -0800
+++ b/drivers/net/sky2.h	2011-01-07 20:06:03.094169226 -0800
@@ -2236,11 +2236,8 @@ struct sky2_port {
 	u16		     rx_pending;
 	u16		     rx_data_size;
 	u16		     rx_nfrags;
-
-#ifdef SKY2_VLAN_TAG_USED
 	u16		     rx_tag;
-	struct vlan_group    *vlgrp;
-#endif
+
 	struct {
 		unsigned long last;
 		u32	mac_rp;
@@ -2284,6 +2281,7 @@ struct sky2_hw {
 #define SKY2_HW_AUTO_TX_SUM	0x00000040	/* new IP decode for Tx */
 #define SKY2_HW_ADV_POWER_CTL	0x00000080	/* additional PHY power regs */
 #define SKY2_HW_RSS_BROKEN	0x00000100
+#define SKY2_HW_VLAN_BROKEN     0x00000200
 
 	u8	     	     chip_id;
 	u8		     chip_rev;

^ permalink raw reply

* Re: [RFC] sched: QFQ - quick fair queue scheduler
From: Stephen Hemminger @ 2011-01-08  4:02 UTC (permalink / raw)
  To: Changli Gao
  Cc: David Miller, Eric Dumazet, Fabio Checconi, netdev, Luigi Rizzo
In-Reply-To: <AANLkTi=pqZ3CwLTAZnhd-cyQNj8OSeBHsP_bFiH3hJ-_@mail.gmail.com>

On Sat, 8 Jan 2011 10:56:33 +0800
Changli Gao <xiaosuo@gmail.com> wrote:

> > +       cl->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;  
> 
> Hmm, there is no other packets schedulers which account packets in
> this way. Which one is better? I am not sure. And in this patch,
> qstats.drops isn't maintained in the same way. Would these two be
> consistent.

HTB uses this accounting.


^ permalink raw reply

* Re: [PATCH v2] net: ppp: use {get,put}_unaligned_be{16,32}
From: Paul Mackerras @ 2011-01-08  3:13 UTC (permalink / raw)
  To: David Miller; +Cc: xiaosuo, harvey.harrison, linux-ppp, netdev
In-Reply-To: <20110107.171534.193718114.davem@davemloft.net>

On Fri, Jan 07, 2011 at 05:15:34PM -0800, David Miller wrote:

> I have to say that every time I go read the header parsing code in the
> PPP driver, I absolutely regret it.
> 
> And Changli's patch fixes some of the readability problems.

It's up to you whether you merge the patch or not, but surely you
agree it needs more than a zero-line description?

Paul.

^ permalink raw reply

* Re: [RFC] sched: QFQ - quick fair queue scheduler
From: Changli Gao @ 2011-01-08  2:56 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, Eric Dumazet, Fabio Checconi, netdev, Luigi Rizzo
In-Reply-To: <20110106195614.20dbc402@nehalam>

On Fri, Jan 7, 2011 at 11:56 AM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> This is an implementation of the Quick Fair Queue scheduler developed
> by Fabio Checconi and Luigi Rizzo. The same algorithm is already implemented in ipfw
> in FreeBSD. Fabio had an earlier version developed on Linux, I just
> did some cleanup, and backporting of FreeBSD version.
>
> For more information see web page: http://info.iet.unipi.it/~luigi/qfq/
> and Google tech talk: http://www.youtube.com/watch?v=r8vBmybeKlE
>
> This is for inspection at this point, barely tested.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
> ---
> Patch against net-next-2.6.
> Configuration may get patch fuzz because of testing CHOKe in
> same tree.
>
>  include/linux/pkt_sched.h |   14
>  net/sched/Kconfig         |   11
>  net/sched/Makefile        |    1
>  net/sched/sch_qfq.c       | 1012 ++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 1038 insertions(+)
>
> --- a/include/linux/pkt_sched.h 2011-01-05 09:01:33.268032043 -0800
> +++ b/include/linux/pkt_sched.h 2011-01-05 23:17:20.637390255 -0800
> @@ -481,4 +481,18 @@ struct tc_drr_stats {
>        __u32   deficit;
>  };
>
> +/* QFQ */
> +enum {
> +       TCA_QFQ_WEIGHT,
> +       TCA_QFQ_LMAX,
> +       __TCA_QFQ_MAX
> +};
> +
> +#define TCA_QFQ_MAX    (__TCA_QFQ_MAX - 1)
> +
> +struct tc_qfq_stats {
> +       __u32 weight;
> +       __u32 lmax;
> +};
> +
>  #endif
> --- a/net/sched/Kconfig 2011-01-05 09:01:33.280032462 -0800
> +++ b/net/sched/Kconfig 2011-01-05 23:17:20.637390255 -0800
> @@ -216,6 +216,17 @@ config NET_SCH_CHOKE
>          To compile this code as a module, choose M here: the
>          module will be called sch_choke.
>
> +config NET_SCH_QFQ
> +        tristate "Quick Fair Queueing Scheduler (QFQ)"
> +       help
> +         Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ)
> +         packet scheduling algorithm.
> +
> +         To compile this driver as a module, choose M here: the module
> +         will be called sch_qfq.
> +
> +         If unsure, say N.
> +
>  config NET_SCH_INGRESS
>        tristate "Ingress Qdisc"
>        depends on NET_CLS_ACT
> --- a/net/sched/Makefile        2011-01-05 09:01:33.284032598 -0800
> +++ b/net/sched/Makefile        2011-01-05 23:17:20.645389829 -0800
> @@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)  += sch_mult
>  obj-$(CONFIG_NET_SCH_ATM)      += sch_atm.o
>  obj-$(CONFIG_NET_SCH_NETEM)    += sch_netem.o
>  obj-$(CONFIG_NET_SCH_DRR)      += sch_drr.o
> +obj-$(CONFIG_NET_SCH_QFQ)      += sch_qfq.o
>  obj-$(CONFIG_NET_SCH_CHOKE)    += sch_choke.o
>  obj-$(CONFIG_NET_CLS_U32)      += cls_u32.o
>  obj-$(CONFIG_NET_CLS_ROUTE4)   += cls_route.o
> --- /dev/null   1970-01-01 00:00:00.000000000 +0000
> +++ b/net/sched/sch_qfq.c       2011-01-06 12:51:28.498280327 -0800
> @@ -0,0 +1,1125 @@
> +/*
> + * net/sched/sch_qfq.c         Quick Fair Queueing Scheduler.
> + *
> + * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * version 2 as published by the Free Software Foundation.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <linux/bitops.h>
> +#include <linux/errno.h>
> +#include <linux/netdevice.h>
> +#include <linux/pkt_sched.h>
> +#include <net/sch_generic.h>
> +#include <net/pkt_sched.h>
> +#include <net/pkt_cls.h>
> +
> +/*  Quick Fair Queueing
> +    ===================
> +
> +    Sources:
> +    Fabio Checconi and Scuola Superiore and S. Anna
> +    and Paolo Valente and Luigi Riz "QFQ: Efficient Packet Scheduling
> +    with Tight Bandwidth Distribution Guarantees", SIGCOMM 2010
> +
> +    See also:
> +    http://retis.sssup.it/~fabio/linux/qfq/
> + */
> +
> +/*
> +
> +  Virtual time computations.
> +
> +  S, F and V are all computed in fixed point arithmetic with
> +  FRAC_BITS decimal bits.
> +
> +  QFQ_MAX_INDEX is the maximum index allowed for a group. We need
> +       one bit per index.
> +  QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
> +
> +  The layout of the bits is as below:
> +
> +                   [ MTU_SHIFT ][      FRAC_BITS    ]
> +                   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
> +                                ^.__grp->index = 0
> +                                *.__grp->slot_shift
> +
> +  where MIN_SLOT_SHIFT is derived by difference from the others.
> +
> +  The max group index corresponds to Lmax/w_min, where
> +  Lmax=1<<MTU_SHIFT, w_min = 1 .
> +  From this, and knowing how many groups (MAX_INDEX) we want,
> +  we can derive the shift corresponding to each group.
> +
> +  Because we often need to compute
> +       F = S + len/w_i  and V = V + len/wsum
> +  instead of storing w_i store the value
> +       inv_w = (1<<FRAC_BITS)/w_i
> +  so we can do F = S + len * inv_w * wsum.
> +  We use W_TOT in the formulas so we can easily move between
> +  static and adaptive weight sum.
> +
> +  The per-scheduler-instance data contain all the data structures
> +  for the scheduler: bitmaps and bucket lists.
> +
> + */
> +
> +/*
> + * Maximum number of consecutive slots occupied by backlogged classes
> + * inside a group.
> + */
> +#define QFQ_MAX_SLOTS  32
> +
> +/*
> + * Shifts used for class<->group mapping.  We allow class weights that are
> + * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the
> + * group with the smallest index that can support the L_i / r_i configured
> + * for the class.
> + *
> + * grp->index is the index of the group; and grp->slot_shift
> + * is the shift for the corresponding (scaled) sigma_i.
> + */
> +#define QFQ_MAX_INDEX          19
> +#define QFQ_MAX_WSHIFT         16
> +
> +#define        QFQ_MAX_WEIGHT          (1<<QFQ_MAX_WSHIFT)
> +#define QFQ_MAX_WSUM           (2*QFQ_MAX_WEIGHT)
> +
> +#define FRAC_BITS              30      /* fixed point arithmetic */
> +#define ONE_FP                 (1UL << FRAC_BITS)
> +#define IWSUM                  (ONE_FP/QFQ_MAX_WSUM)
> +
> +#define QFQ_MTU_SHIFT          11
> +#define QFQ_MIN_SLOT_SHIFT     (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
> +
> +/*
> + * Possible group states.  These values are used as indexes for the bitmaps
> + * array of struct qfq_queue.
> + */
> +enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
> +
> +struct qfq_group;
> +
> +struct qfq_class {
> +       struct Qdisc_class_common common;
> +
> +       unsigned int refcnt;
> +       unsigned int filter_cnt;
> +
> +       struct gnet_stats_basic_packed bstats;
> +       struct gnet_stats_queue qstats;
> +       struct gnet_stats_rate_est rate_est;
> +       struct Qdisc *qdisc;
> +
> +       struct qfq_class *next; /* Link for the slot list. */
> +       u64 S, F;               /* flow timestamps (exact) */
> +
> +       /* group we belong to. In principle we would need the index,
> +        * which is log_2(lmax/weight), but we never reference it
> +        * directly, only the group.
> +        */
> +       struct qfq_group *grp;
> +
> +       /* these are copied from the flowset. */
> +       u32     inv_w;          /* ONE_FP/weight */
> +       u32     lmax;           /* Max packet size for this flow. */
> +};
> +
> +struct qfq_group {
> +       uint64_t S, F;                  /* group timestamps (approx). */
> +       unsigned int slot_shift;        /* Slot shift. */
> +       unsigned int index;             /* Group index. */
> +       unsigned int front;             /* Index of the front slot. */
> +       unsigned long full_slots;       /* non-empty slots */
> +
> +       /* Array of RR lists of active classes. */
> +       struct qfq_class *slots[QFQ_MAX_SLOTS];
> +};
> +
> +struct qfq_sched {
> +       struct tcf_proto *filter_list;
> +       struct Qdisc_class_hash clhash;
> +
> +       uint64_t        V;              /* Precise virtual time. */
> +       u32 wsum;                       /* weight sum */
> +
> +       unsigned long bitmaps[QFQ_MAX_STATE];       /* Group bitmaps. */
> +       struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
> +};
> +
> +static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct Qdisc_class_common *clc;
> +
> +       clc = qdisc_class_find(&q->clhash, classid);
> +       if (clc == NULL)
> +               return NULL;
> +       return container_of(clc, struct qfq_class, common);
> +}
> +
> +static void qfq_purge_queue(struct qfq_class *cl)
> +{
> +       unsigned int len = cl->qdisc->q.qlen;
> +
> +       qdisc_reset(cl->qdisc);
> +       qdisc_tree_decrease_qlen(cl->qdisc, len);
> +}
> +
> +static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
> +       [TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
> +       [TCA_QFQ_LMAX] = { .type = NLA_U32 },
> +};
> +
> +/*
> + * Calculate a flow index, given its weight and maximum packet length.
> + * index = log_2(maxlen/weight) but we need to apply the scaling.
> + * This is used only once at flow creation.
> + */
> +static int qfq_calc_index(u32 inv_w, unsigned int maxlen)
> +{
> +       u64 slot_size = (u64)maxlen *inv_w;
> +       unsigned long size_map;
> +       int index = 0;
> +
> +       size_map = slot_size >> QFQ_MIN_SLOT_SHIFT;
> +       if (!size_map)
> +               goto out;
> +
> +       index = __fls(size_map) + 1;    /* basically a log_2 */
> +       index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
> +
> +       if (index < 0)
> +               index = 0;
> +out:
> +       pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
> +                (unsigned long) ONE_FP/inv_w, maxlen, index);
> +
> +       return index;
> +}
> +
> +/*
> + * Create a new class, or change the weight of an existing one.
> + *
> + * @sch:      the QFQ qdisc
> + * @classid:  id given to a newly created class
> + * @parentid: not used
> + * @tca:      netlink attributes; TCA_OPTIONS is mandatory, TCA_RATE optional
> + * @arg:      in: the existing class, or 0 to create one; out: the new class
> + *
> + * Returns 0 on success or a negative errno.
> + *
> + * q->wsum is kept equal to the sum of the (rounded) weights of all classes:
> + * on a change it is adjusted by the weight delta, on creation the new weight
> + * is added only once nothing can fail any more.  For an existing class only
> + * the weight and the rate estimator are updated; TCA_QFQ_LMAX is validated
> + * but not applied.
> + */
> +static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
> +                           struct nlattr **tca, unsigned long *arg)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_class *cl = (struct qfq_class *)*arg;
> +       struct nlattr *tb[TCA_QFQ_MAX + 1];
> +       u32 weight, old_weight, lmax, inv_w;
> +       int i, err;
> +
> +       if (tca[TCA_OPTIONS] == NULL)
> +               return -EINVAL;
> +
> +       err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy);
> +       if (err < 0)
> +               return err;
> +
> +       if (tb[TCA_QFQ_WEIGHT]) {
> +               weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
> +               if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) {
> +                       pr_notice("qfq: invalid weight %u\n", weight);
> +                       return -EINVAL;
> +               }
> +       } else
> +               weight = 1;
> +
> +       inv_w = ONE_FP / weight;
> +       /* round the weight to the value that inv_w actually represents */
> +       weight = ONE_FP / inv_w;
> +       /* an existing class already contributes its old weight to wsum */
> +       old_weight = cl != NULL ? ONE_FP / cl->inv_w : 0;
> +       if (q->wsum - old_weight + weight > QFQ_MAX_WSUM) {
> +               pr_notice("qfq: total weight out of range (%u + %u)\n",
> +                         weight, q->wsum);
> +               return -EINVAL;
> +       }
> +
> +       if (tb[TCA_QFQ_LMAX]) {
> +               lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
> +               if (!lmax || lmax > (1UL << QFQ_MTU_SHIFT)) {
> +                       pr_notice("qfq: invalid max length %u\n", lmax);
> +                       return -EINVAL;
> +               }
> +       } else
> +               lmax = 1UL << QFQ_MTU_SHIFT;
> +
> +       if (cl != NULL) {
> +               if (tca[TCA_RATE]) {
> +                       err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
> +                                                   qdisc_root_sleeping_lock(sch),
> +                                                   tca[TCA_RATE]);
> +                       if (err)
> +                               return err;
> +               }
> +
> +               sch_tree_lock(sch);
> +               if (tb[TCA_QFQ_WEIGHT]) {
> +                       /* replace the old contribution, do not overwrite the sum */
> +                       q->wsum += weight - old_weight;
> +                       cl->inv_w = inv_w;
> +               }
> +               sch_tree_unlock(sch);
> +
> +               return 0;
> +       }
> +
> +       cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
> +       if (cl == NULL)
> +               return -ENOBUFS;
> +
> +       cl->refcnt = 1;
> +       cl->common.classid = classid;
> +       cl->lmax = lmax;
> +       cl->inv_w = inv_w;
> +       i = qfq_calc_index(cl->inv_w, cl->lmax);
> +
> +       cl->grp = &q->groups[i];
> +
> +       cl->qdisc = qdisc_create_dflt(sch->dev_queue,
> +                                     &pfifo_qdisc_ops, classid);
> +       if (cl->qdisc == NULL)
> +               cl->qdisc = &noop_qdisc;
> +
> +       if (tca[TCA_RATE]) {
> +               err = gen_new_estimator(&cl->bstats, &cl->rate_est,
> +                                       qdisc_root_sleeping_lock(sch),
> +                                       tca[TCA_RATE]);
> +               if (err) {
> +                       qdisc_destroy(cl->qdisc);
> +                       kfree(cl);
> +                       return err;
> +               }
> +       }
> +
> +       /* past the last failure point: now the class may be accounted */
> +       q->wsum += weight;
> +
> +       sch_tree_lock(sch);
> +       qdisc_class_hash_insert(&q->clhash, &cl->common);
> +       sch_tree_unlock(sch);
> +
> +       qdisc_class_hash_grow(sch, &q->clhash);
> +
> +       *arg = (unsigned long)cl;
> +       return 0;
> +}
> +
> +/*
> + * Release a class: give its weight back to q->wsum, kill its rate
> + * estimator, destroy its leaf qdisc and free the class itself.
> + */
> +static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
> +{
> +       /*
> +        * The scheduler state is the qdisc's private area, not the
> +        * struct Qdisc itself: use qdisc_priv() like the rest of the file.
> +        */
> +       struct qfq_sched *q = qdisc_priv(sch);
> +
> +       /* inv_w is zeroed afterwards so the weight is never subtracted twice */
> +       if (cl->inv_w) {
> +               q->wsum -= ONE_FP / cl->inv_w;
> +               cl->inv_w = 0;
> +       }
> +
> +       gen_kill_estimator(&cl->bstats, &cl->rate_est);
> +       qdisc_destroy(cl->qdisc);
> +       kfree(cl);
> +}
> +
> +/*
> + * Delete a class.  Fails with -EBUSY while filters are still bound to it
> + * (filter_cnt > 0).  Otherwise, under the tree lock, the class queue is
> + * purged, the class is removed from the hash and one reference is dropped;
> + * the class is destroyed if that was the last reference.
> + */
> +static int qfq_delete_class(struct Qdisc *sch, unsigned long arg)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +
> +       if (cl->filter_cnt > 0)
> +               return -EBUSY;
> +
> +       sch_tree_lock(sch);
> +
> +       qfq_purge_queue(cl);
> +       qdisc_class_hash_remove(&q->clhash, &cl->common);
> +
> +       if (--cl->refcnt == 0)
> +               qfq_destroy_class(sch, cl);
> +
> +       sch_tree_unlock(sch);
> +       return 0;
> +}
> +
> +/*
> + * Take a reference on the class identified by classid.
> + * Returns the class as an opaque handle, or 0 if it does not exist.
> + */
> +static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid)
> +{
> +       struct qfq_class *found = qfq_find_class(sch, classid);
> +
> +       if (!found)
> +               return 0;
> +
> +       found->refcnt++;
> +       return (unsigned long)found;
> +}
> +
> +/* Drop one reference on a class; destroy it when the count reaches zero. */
> +static void qfq_put_class(struct Qdisc *sch, unsigned long arg)
> +{
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +
> +       if (--cl->refcnt == 0)
> +               qfq_destroy_class(sch, cl);
> +}
> +
> +/*
> + * Return the filter chain.  Filters are kept only at the qdisc level:
> + * a non-zero class handle yields NULL.
> + */
> +static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl)
> +{
> +       struct qfq_sched *priv = qdisc_priv(sch);
> +
> +       return cl ? NULL : &priv->filter_list;
> +}
> +
> +/*
> + * Account a filter bound to the class classid (parent is not used).
> + * Returns the class as an opaque handle, or 0 when it is not found.
> + */
> +static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
> +                                 u32 classid)
> +{
> +       struct qfq_class *cl = qfq_find_class(sch, classid);
> +
> +       if (cl != NULL)
> +               cl->filter_cnt++;
> +
> +       return (unsigned long)cl;
> +}
> +
> +/* Undo qfq_bind_tcf(): one filter less is bound to the class. */
> +static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
> +{
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +
> +       cl->filter_cnt--;
> +}
> +
> +/*
> + * Replace the leaf qdisc of a class.
> + *
> + * When new is NULL a default pfifo is created instead, falling back to
> + * noop_qdisc if that allocation fails.  The current queue is purged and
> + * the previous leaf is handed back through *old.  Always returns 0.
> + */
> +static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
> +                          struct Qdisc *new, struct Qdisc **old)
> +{
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +
> +       if (new == NULL) {
> +               new = qdisc_create_dflt(sch->dev_queue,
> +                                       &pfifo_qdisc_ops, cl->common.classid);
> +               if (new == NULL)
> +                       new = &noop_qdisc;
> +       }
> +
> +       sch_tree_lock(sch);
> +       qfq_purge_queue(cl);
> +       *old = cl->qdisc;
> +       cl->qdisc = new;
> +       sch_tree_unlock(sch);
> +       return 0;
> +}
> +
> +/* Return the leaf qdisc attached to a class. */
> +static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
> +{
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +
> +       return cl->qdisc;
> +}
> +
> +/*
> + * Dump the configuration of a class: the tcmsg header is filled in and
> + * weight and lmax are emitted inside a TCA_OPTIONS nest.
> + * Returns the result of nla_nest_end(), or -EMSGSIZE on failure.
> + */
> +static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
> +                         struct sk_buff *skb, struct tcmsg *tcm)
> +{
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +       struct nlattr *nest;
> +
> +       tcm->tcm_parent = TC_H_ROOT;
> +       tcm->tcm_handle = cl->common.classid;
> +       tcm->tcm_info   = cl->qdisc->handle;
> +
> +       nest = nla_nest_start(skb, TCA_OPTIONS);
> +       if (nest == NULL)
> +               goto nla_put_failure;
> +       /* NOTE(review): NLA_PUT_U32 presumably jumps to nla_put_failure - confirm */
> +       NLA_PUT_U32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w);
> +       NLA_PUT_U32(skb, TCA_QFQ_LMAX, cl->lmax);
> +       return nla_nest_end(skb, nest);
> +
> +nla_put_failure:
> +       /* NOTE(review): also reached with nest == NULL; verify that is accepted */
> +       nla_nest_cancel(skb, nest);
> +       return -EMSGSIZE;
> +}
> +
> +/*
> + * Dump the statistics of a class: basic counters, rate estimate, the
> + * queue stats of the leaf qdisc and the QFQ specific xstats (weight, lmax).
> + * Returns -1 if one of the generic copies fails, otherwise the result of
> + * gnet_stats_copy_app().
> + */
> +static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
> +                               struct gnet_dump *d)
> +{
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +       struct tc_qfq_stats xstats;
> +
> +       /* zero the whole struct before filling in the two fields set below */
> +       memset(&xstats, 0, sizeof(xstats));
> +
> +       xstats.weight = ONE_FP/cl->inv_w;
> +       xstats.lmax = cl->lmax;
> +
> +       if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
> +           gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
> +           gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
> +               return -1;
> +
> +       return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
> +}
> +
> +/*
> + * Iterate over all classes in the hash, calling arg->fn on each one.
> + * The first arg->skip classes are only counted; the walk stops, with
> + * arg->stop set, as soon as the callback returns a negative value.
> + */
> +static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_class *cl;
> +       struct hlist_node *n;
> +       unsigned int i;
> +
> +       if (arg->stop)
> +               return;
> +
> +       for (i = 0; i < q->clhash.hashsize; i++) {
> +               hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
> +                       if (arg->count < arg->skip) {
> +                               arg->count++;
> +                               continue;
> +                       }
> +                       if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
> +                               arg->stop = 1;
> +                               return;
> +                       }
> +                       arg->count++;
> +               }
> +       }
> +}
> +
> +/*
> + * Select the class for a packet.
> + *
> + * skb->priority is tried first, when its major number matches the handle
> + * of this qdisc; otherwise the filter chain is consulted.
> + * Returns NULL when no class is selected; *qerr is written only when the
> + * filters are consulted.
> + */
> +static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
> +                                     int *qerr)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_class *cl;
> +       struct tcf_result res;
> +       int result;
> +
> +       if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
> +               cl = qfq_find_class(sch, skb->priority);
> +               if (cl != NULL)
> +                       return cl;
> +       }
> +
> +       *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
> +       result = tc_classify(skb, q->filter_list, &res);
> +       if (result >= 0) {
> +#ifdef CONFIG_NET_CLS_ACT
> +               switch (result) {
> +               case TC_ACT_QUEUED:
> +               case TC_ACT_STOLEN:
> +                       *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
> +                       /* fall through: no class is returned either way */
> +               case TC_ACT_SHOT:
> +                       return NULL;
> +               }
> +#endif
> +               /* prefer the class in the result, else look up its classid */
> +               cl = (struct qfq_class *)res.class;
> +               if (cl == NULL)
> +                       cl = qfq_find_class(sch, res.classid);
> +               return cl;
> +       }
> +
> +       return NULL;
> +}
> +
> +/*
> + * Generic comparison function, handling wraparound.
> + * Returns non-zero when a is greater than b; the difference is evaluated
> + * as a signed 64-bit value.
> + */
> +static inline int qfq_gt(u64 a, u64 b)
> +{
> +       return (s64)(a - b) > 0;
> +}
> +
> +/*
> + * Round a precise timestamp to its slotted value,
> + * i.e. clear the low 'shift' bits of ts.
> + */
> +static inline u64 qfq_round_down(u64 ts, unsigned int shift)
> +{
> +       return ts & ~((1ULL << shift) - 1);
> +}
> +
> +/*
> + * return the pointer to the group with lowest index in the bitmap
> + * (the callers visible in this file only pass a non-zero bitmap)
> + */
> +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
> +                                       unsigned long bitmap)
> +{
> +       int index = __ffs(bitmap); // zero-based
> +       return &q->groups[index];
> +}
> +/*
> + * Calculate a mask to mimic what would be ffs_from():
> + * keep only the bits of bitmap at position 'from' and above.
> + */
> +static inline unsigned long mask_from(unsigned long bitmap, int from)
> +{
> +       return bitmap & ~((1UL << from) - 1);
> +}
> +
> +/*
> + * The state computation relies on ER=0, IR=1, EB=2, IB=3
> + * First compute eligibility comparing grp->S, q->V,
> + * then check if someone is blocking us and possibly add EB
> + *
> + * Returns the state, usable as an index into q->bitmaps[].
> + */
> +static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
> +{
> +       /* if S > V we are not eligible */
> +       unsigned int state = qfq_gt(grp->S, q->V);
> +       /* groups in ER with an index not lower than ours */
> +       unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
> +       struct qfq_group *next;
> +
> +       if (mask) {
> +               next = qfq_ffs(q, mask);
> +               /* blocked when the first such group finishes before us */
> +               if (qfq_gt(grp->F, next->F))
> +                       state |= EB;
> +       }
> +
> +       return state;
> +}
> +
> +
> +/*
> + * Move the groups selected by mask from the set src to the set dst.
> + *
> + * In principle
> + *     q->bitmaps[dst] |= q->bitmaps[src] & mask;
> + *     q->bitmaps[src] &= ~mask;
> + * but we should make sure that src != dst
> + */
> +static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
> +                                  int src, int dst)
> +{
> +       q->bitmaps[dst] |= q->bitmaps[src] & mask;
> +       q->bitmaps[src] &= ~mask;
> +}
> +
> +/*
> + * Unblock the groups with an index lower than 'index' (EB -> ER, IB -> IR).
> + * Nothing is done while ER still holds a group with a higher index whose
> + * finish time is not greater than old_F.
> + */
> +static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
> +{
> +       /* groups in ER with an index strictly above 'index' */
> +       unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
> +       struct qfq_group *next;
> +
> +       if (mask) {
> +               next = qfq_ffs(q, mask);
> +               if (!qfq_gt(next->F, old_F))
> +                       return;
> +       }
> +
> +       /* all the groups below 'index' */
> +       mask = (1UL << index) - 1;
> +       qfq_move_groups(q, mask, EB, ER);
> +       qfq_move_groups(q, mask, IB, IR);
> +}
> +
> +/*
> + * Make eligible the groups affected by the advance of the virtual time
> + * from old_V to q->V: every group whose index is not above the highest
> + * bit that differs between the old and the new slot number is moved
> + * IR -> ER and IB -> EB.
> + *
> + * The position of that bit is computed with __fls(), which works on the
> + * full unsigned long, rather than with fls(), which takes an int.  The
> + * shift is guarded as well: a shift by the width of the type is undefined.
> + *
> + * perhaps
> + *
> + *     old_V ^= q->V;
> + *     old_V >>= QFQ_MIN_SLOT_SHIFT;
> + *     if (old_V) {
> + *             ...
> + *     }
> + *
> + */
> +static void qfq_make_eligible(struct qfq_sched *q, u64 old_V)
> +{
> +       unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
> +       unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
> +       unsigned long mask;
> +       unsigned int last_flip_pos;
> +
> +       if (vslot == old_vslot)
> +               return;
> +
> +       /* 1-based position of the highest changed bit; the operand is != 0 */
> +       last_flip_pos = __fls(vslot ^ old_vslot) + 1;
> +       if (last_flip_pos >= BITS_PER_LONG)
> +               mask = ~0UL;            /* every group is affected */
> +       else
> +               mask = (1UL << last_flip_pos) - 1;
> +
> +       qfq_move_groups(q, mask, IR, ER);
> +       qfq_move_groups(q, mask, IB, EB);
> +}
> +
> +/*
> + * Insert cl at the head of the list of the slot matching roundedS.
> + *
> + * XXX we should make sure that slot becomes less than 32.
> + * This is guaranteed by the input values.
> + * roundedS is always cl->S rounded on grp->slot_shift bits.
> + */
> +static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl,
> +                                  u64 roundedS)
> +{
> +       /* distance, in slots, from the start of the group */
> +       u64 slot = (roundedS - grp->S) >> grp->slot_shift;
> +       /* position in the circular array, which begins at grp->front */
> +       unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;
> +
> +       cl->next = grp->slots[i];
> +       grp->slots[i] = cl;
> +       /* full_slots is indexed by the distance, not by the array position */
> +       __set_bit(slot, &grp->full_slots);
> +}
> +
> +/*
> + * remove the entry from the slot
> + *
> + * The first class of the front slot is unlinked; bit 0 of full_slots is
> + * cleared when the slot becomes empty.  The front slot must not be empty.
> + */
> +static void qfq_front_slot_remove(struct qfq_group *grp)
> +{
> +       struct qfq_class **h = &grp->slots[grp->front];
> +
> +       *h = (*h)->next;
> +       if (!*h)
> +               __clear_bit(0, &grp->full_slots);
> +}
> +
> +/*
> + * Returns the first full queue in a group. As a side effect,
> + * adjust the bucket list so the first non-empty bucket is at
> + * position 0 in full_slots.
> + *
> + * Returns NULL when the group has no full slot.
> + */
> +static struct qfq_class *qfq_slot_scan(struct qfq_group *grp)
> +{
> +       unsigned int i;
> +
> +       pr_debug("qfq slot_scan: grp %u full %#lx\n",
> +                grp->index, grp->full_slots);
> +
> +       if (!grp->full_slots)
> +               return NULL;
> +
> +       i = __ffs(grp->full_slots);  /* zero based */
> +       if (i > 0) {
> +               /* advance front and shift the map by the same amount */
> +               grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
> +               grp->full_slots >>= i;
> +       }
> +
> +       return grp->slots[grp->front];
> +}
> +
> +/*
> + * adjust the bucket list. When the start time of a group decreases,
> + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
> + * move the objects. The mask of occupied slots must be shifted
> + * because we use ffs() to find the first non-empty slot.
> + * This covers decreases in the group's start time, but what about
> + * increases of the start time ?
> + * Here too we should make sure that i is less than 32
> + */
> +static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
> +{
> +       /* number of slots by which the start time moves back */
> +       unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
> +
> +       grp->full_slots <<= i;
> +       /* NOTE(review): the unsigned wrap here relies on QFQ_MAX_SLOTS being a power of two - confirm */
> +       grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
> +}
> +
> +/*
> + * Re-evaluate the ineligible groups after a change of the virtual time
> + * (old_V is its previous value).  If no group is in ER, V is first
> + * advanced to the start time of the lowest-index ineligible group, when
> + * that start time is ahead of V.
> + */
> +static void qfq_update_eligible(struct qfq_sched *q, u64 old_V)
> +{
> +       struct qfq_group *grp;
> +       unsigned long ineligible;
> +
> +       ineligible = q->bitmaps[IR] | q->bitmaps[IB];
> +       if (ineligible) {
> +               if (!q->bitmaps[ER]) {
> +                       grp = qfq_ffs(q, ineligible);
> +                       if (qfq_gt(grp->S, q->V))
> +                               q->V = grp->S;
> +               }
> +               qfq_make_eligible(q, old_V);
> +       }
> +}
> +
> +/*
> + * Length of the packet at the head of the queue, as reported by the
> + * qdisc's peek operation; 0 when there is no such packet.
> + */
> +static unsigned int qdisc_peek_len(struct Qdisc *sch)
> +{
> +       struct sk_buff *head = sch->ops->peek(sch);
> +
> +       if (!head)
> +               return 0;
> +
> +       return qdisc_pkt_len(head);
> +}
> +
> +/*
> + * Updates the class, returns true if also the group needs to be updated.
> + *
> + * The new start time is the old finish time.  A class whose queue is now
> + * empty leaves the front slot; otherwise its finish time is recomputed
> + * from the next packet and the class is moved to the slot of its new
> + * start time, unless that is still the front slot (false is returned).
> + */
> +static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl)
> +{
> +       unsigned int len = qdisc_peek_len(cl->qdisc);
> +
> +       cl->S = cl->F;
> +       if (!len)
> +               qfq_front_slot_remove(grp);     /* queue is empty */
> +       else {
> +               u64 roundedS;
> +
> +               cl->F = cl->S + (u64)len * cl->inv_w;
> +               roundedS = qfq_round_down(cl->S, grp->slot_shift);
> +               /* same slot as the group start: the class stays where it is */
> +               if (roundedS == grp->S)
> +                       return false;
> +
> +               qfq_front_slot_remove(grp);
> +               qfq_slot_insert(grp, cl, roundedS);
> +       }
> +
> +       return true;
> +}
> +
> +/*
> + * Dequeue one packet.
> + *
> + * The packet is taken from the first class of the front slot of the
> + * lowest-index group in ER.  The virtual time is then advanced by
> + * len * IWSUM, the timestamps of the class are updated and, if required,
> + * so are the group and the state bitmaps.
> + *
> + * Returns NULL when no group is in ER or when the leaf qdisc hands out
> + * no packet.
> + */
> +static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_group *grp;
> +       struct qfq_class *cl;
> +       struct sk_buff *skb;
> +       unsigned int len;
> +       u64 old_V;
> +
> +       if (!q->bitmaps[ER])
> +               return NULL;
> +
> +       grp = qfq_ffs(q, q->bitmaps[ER]);
> +
> +       cl = grp->slots[grp->front];
> +       skb = qdisc_dequeue_peeked(cl->qdisc);
> +       if (!skb) {
> +               WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
> +               return NULL;
> +       }
> +
> +       sch->q.qlen--;
> +
> +       old_V = q->V;
> +       len = qdisc_pkt_len(skb);
> +       q->V += (u64)len * IWSUM;
> +       /* label the message with the path it is really emitted from */
> +       pr_debug("qfq dequeue: len %u F %lld now %lld\n",
> +                len, (unsigned long long) cl->F, (unsigned long long) q->V);
> +
> +       if (qfq_update_class(grp, cl)) {
> +               /* remember the finish time: grp->F may be rewritten below */
> +               u64 old_F = grp->F;
> +
> +               cl = qfq_slot_scan(grp);
> +               if (!cl)
> +                       __clear_bit(grp->index, &q->bitmaps[ER]);
> +               else {
> +                       u64 roundedS = qfq_round_down(cl->S, grp->slot_shift);
> +                       unsigned int s;
> +
> +                       if (grp->S == roundedS)
> +                               goto skip_unblock;
> +                       grp->S = roundedS;
> +                       grp->F = roundedS + (2ULL << grp->slot_shift);
> +                       /* new timestamps: recompute the state of the group */
> +                       __clear_bit(grp->index, &q->bitmaps[ER]);
> +                       s = qfq_calc_state(q, grp);
> +                       __set_bit(grp->index, &q->bitmaps[s]);
> +               }
> +
> +               qfq_unblock_groups(q, grp->index, old_F);
> +       }
> +
> +skip_unblock:
> +       qfq_update_eligible(q, old_V);
> +
> +       return skb;
> +}
> +
> +/*
> + * Assign a reasonable start time for a new flow k in group i.
> + * Admissible values for \hat(F) are multiples of \sigma_i
> + * no greater than V+\sigma_i . Larger values mean that
> + * we had a wraparound so we consider the timestamp to be stale.
> + *
> + * If F is not stale and F >= V then we set S = F.
> + * Otherwise we should assign S = V, but this may violate
> + * the ordering in ER. So, if we have groups in ER, set S to
> + * the F_j of the first group j which would be blocking us.
> + * We are guaranteed not to move S backward because
> + * otherwise our group i would still be blocked.
> + *
> + * The rounded finish time and the limit are kept in 64 bits, like the
> + * timestamps they are derived from and compared with: narrowing them
> + * would drop the upper bits before the wraparound-aware comparison.
> + */
> +static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
> +{
> +       unsigned long mask;
> +       u64 limit, roundedF;
> +       int slot_shift = cl->grp->slot_shift;
> +
> +       roundedF = qfq_round_down(cl->F, slot_shift);
> +       /* 1ULL: the slot size must be computed in 64 bits as well */
> +       limit = qfq_round_down(q->V, slot_shift) + (1ULL << slot_shift);
> +
> +       if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
> +               /* timestamp was stale */
> +               mask = mask_from(q->bitmaps[ER], cl->grp->index);
> +               if (mask) {
> +                       struct qfq_group *next = qfq_ffs(q, mask);
> +                       if (qfq_gt(roundedF, next->F)) {
> +                               cl->S = next->F;
> +                               return;
> +                       }
> +               }
> +               cl->S = q->V;
> +       } else { /* timestamp is not stale */
> +               cl->S = cl->F;
> +       }
> +}
> +
> +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_group *grp;
> +       struct qfq_class *cl;
> +       unsigned int len;
> +       int err;
> +       u64 roundedS;
> +       int s;
> +
> +       cl = qfq_classify(skb, sch, &err);
> +       if (cl == NULL || cl->qdisc->q.qlen > 80) {
> +               if (err & __NET_XMIT_BYPASS)
> +                       sch->qstats.drops++;
> +               kfree_skb(skb);
> +               return err;
> +       }
> +
> +       len = qdisc_pkt_len(skb);
> +       err = qdisc_enqueue(skb, cl->qdisc);
> +       if (unlikely(err != NET_XMIT_SUCCESS)) {
> +               if (net_xmit_drop_count(err)) {
> +                       cl->qstats.drops++;
> +                       sch->qstats.drops++;
> +               }
> +               return err;
> +       }
> +
> +       cl->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;

Hmm, no other packet scheduler accounts packets in this way. Which one
is better? I am not sure. Also, in this patch qstats.drops isn't
maintained in the same way. Shouldn't these two be consistent?

> +       cl->bstats.bytes += qdisc_pkt_len(skb);
> +
> +       sch->q.qlen++;
> +       sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
> +       sch->bstats.bytes += qdisc_pkt_len(skb);
> +
> +       if (qdisc_peek_head(sch) != skb)
> +               return err;

I suspect that it is wrong.

Here is the pseudo-code from the paper:


5  if (flow.queue.head != pkt)
6      return; // Flow already backlogged, we are done

So the correct code should be:
    if (qdisc_peek_head(cl->qdisc) != skb)
           return err;

However, we can't assume the cl->qdisc is work conserving, so the code
should be:
   if (cl->qdisc->q.qlen > 1)
          return err;

> +
> +       /* If reach this point, queue q was idle */
> +       grp = cl->grp;
> +       qfq_update_start(q, cl);
> +
> +       /* compute new finish time and rounded start. */
> +       cl->F = cl->S + (u64)qdisc_pkt_len(skb) * cl->inv_w;
> +       roundedS = qfq_round_down(cl->S, grp->slot_shift);
> +
> +       /*
> +        * insert cl in the correct bucket.
> +        * If cl->S >= grp->S we don't need to adjust the
> +        * bucket list and simply go to the insertion phase.
> +        * Otherwise grp->S is decreasing, we must make room
> +        * in the bucket list, and also recompute the group state.
> +        * Finally, if there were no flows in this group and nobody
> +        * was in ER make sure to adjust V.
> +        */
> +       if (grp->full_slots) {
> +               if (!qfq_gt(grp->S, cl->S))
> +                       goto skip_update;
> +
> +               /* create a slot for this cl->S */
> +               qfq_slot_rotate(grp, roundedS);
> +               /* group was surely ineligible, remove */
> +               __clear_bit(grp->index, &q->bitmaps[IR]);
> +               __clear_bit(grp->index, &q->bitmaps[IB]);
> +       } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
> +               q->V = roundedS;
> +
> +       grp->S = roundedS;
> +       grp->F = roundedS + (2ULL << grp->slot_shift);
> +       s = qfq_calc_state(q, grp);
> +       __set_bit(grp->index, &q->bitmaps[s]);
> +
> +       pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
> +                s, q->bitmaps[s],
> +                (unsigned long long) cl->S,
> +                (unsigned long long) cl->F,
> +                (unsigned long long) q->V);
> +
> +skip_update:
> +       qfq_slot_insert(grp, cl, roundedS);
> +
> +       return err;
> +}
> +
> +
> +/*
> + * Unlink cl from the slot it is queued in.
> + *
> + * @pprev: address of the pointer that links to cl, or NULL to have the
> + *         slot list searched for it.
> + *
> + * The bit of the slot in full_slots is cleared when the slot gets empty.
> + */
> +static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
> +                           struct qfq_class *cl, struct qfq_class **pprev)
> +{
> +       unsigned int i, offset;
> +       u64 roundedS;
> +
> +       /* same slot computation as in qfq_slot_insert() */
> +       roundedS = qfq_round_down(cl->S, grp->slot_shift);
> +       offset = (roundedS - grp->S) >> grp->slot_shift;
> +       i = (grp->front + offset) % QFQ_MAX_SLOTS;
> +
> +       if (!pprev) {
> +               pprev = &grp->slots[i];
> +               while (*pprev && *pprev != cl)
> +                       pprev = &(*pprev)->next;
> +       }
> +
> +       *pprev = cl->next;
> +       if (!grp->slots[i])
> +               __clear_bit(offset, &grp->full_slots);
> +}
> +
> +/*
> + * called to forcibly destroy a queue.
> + * If the queue is not in the front bucket, or if it has
> + * other queues in the front bucket, we can simply remove
> + * the queue with no other side effects.
> + * Otherwise we must propagate the event up.
> + *
> + * @pprev: address of the pointer that links to cl in its slot list, or
> + *         NULL to let qfq_slot_remove() look it up.
> + */
> +static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
> +                                struct qfq_class **pprev)
> +{
> +       struct qfq_group *grp = cl->grp;
> +       unsigned long mask;
> +       u64 roundedS;
> +       int s;
> +
> +       cl->F = cl->S;
> +       qfq_slot_remove(q, grp, cl, pprev);
> +
> +       if (!grp->full_slots) {
> +               /* the group is empty now: take it out of every set */
> +               __clear_bit(grp->index, &q->bitmaps[IR]);
> +               __clear_bit(grp->index, &q->bitmaps[EB]);
> +               __clear_bit(grp->index, &q->bitmaps[IB]);
> +
> +               /*
> +                * The group was in ER and no group with a higher index is:
> +                * move groups from the blocked sets to the unblocked ones,
> +                * starting at the highest lower-index group left in ER.
> +                */
> +               if (test_bit(grp->index, &q->bitmaps[ER]) &&
> +                   !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
> +                       mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
> +                       if (mask)
> +                               mask = ~((1UL << __fls(mask)) - 1);
> +                       else
> +                               mask = ~0UL;
> +                       qfq_move_groups(q, mask, EB, ER);
> +                       qfq_move_groups(q, mask, IB, IR);
> +               }
> +               __clear_bit(grp->index, &q->bitmaps[ER]);
> +       } else if (!grp->slots[grp->front]) {
> +               /* the front slot got empty: restart from the next full one */
> +               cl = qfq_slot_scan(grp);
> +               roundedS = qfq_round_down(cl->S, grp->slot_shift);
> +               if (grp->S != roundedS) {
> +                       __clear_bit(grp->index, &q->bitmaps[ER]);
> +                       __clear_bit(grp->index, &q->bitmaps[IR]);
> +                       __clear_bit(grp->index, &q->bitmaps[EB]);
> +                       __clear_bit(grp->index, &q->bitmaps[IB]);
> +                       grp->S = roundedS;
> +                       grp->F = roundedS + (2ULL << grp->slot_shift);
> +                       s = qfq_calc_state(q, grp);
> +                       __set_bit(grp->index, &q->bitmaps[s]);
> +               }
> +       }
> +
> +       qfq_update_eligible(q, q->V);
> +}
> +
> +/*
> + * Notification that the queue length of a class changed: a class whose
> + * leaf qdisc is now empty is removed from the scheduler.
> + */
> +static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
> +{
> +       /*
> +        * The scheduler state is the qdisc's private area, not the
> +        * struct Qdisc itself: use qdisc_priv() like the rest of the file.
> +        */
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +
> +       if (cl->qdisc->q.qlen == 0)
> +               qfq_deactivate_class(q, cl, NULL);
> +}
> +
> +/*
> + * Drop one packet: the classes are visited group by group and slot by
> + * slot, and the first leaf qdisc whose drop operation succeeds is used.
> + * Returns the value reported by that drop operation, or 0 when no leaf
> + * dropped anything.
> + */
> +static unsigned int qfq_drop(struct Qdisc *sch)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_group *grp;
> +       struct qfq_class *cl, **pp;
> +       unsigned int i, j, len;
> +
> +       for (i = 0; i <= QFQ_MAX_INDEX; i++) {
> +               grp = &q->groups[i];
> +               for (j = 0; j < QFQ_MAX_SLOTS; j++) {
> +                       for (pp = &grp->slots[j]; *pp; pp = &(*pp)->next) {
> +                               cl = *pp;
> +                               if (!cl->qdisc->ops->drop)
> +                                       continue;
> +
> +                               len = cl->qdisc->ops->drop(cl->qdisc);
> +                               if (len > 0) {
> +                                       sch->q.qlen--;
> +                                       /* the list is not walked any further after the unlink */
> +                                       if (!cl->qdisc->q.qlen)
> +                                               qfq_deactivate_class(q, cl, pp);
> +
> +                                       return len;
> +                               }
> +                       }
> +               }
> +       }
> +
> +       return 0;
> +}
> +
> +/*
> + * Set up a new QFQ qdisc: initialise the class hash and number the groups.
> + * Returns 0, or the negative error reported by qdisc_class_hash_init().
> + */
> +static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
> +{
> +       struct qfq_sched *priv = qdisc_priv(sch);
> +       int idx, ret;
> +
> +       ret = qdisc_class_hash_init(&priv->clhash);
> +       if (ret < 0)
> +               return ret;
> +
> +       /* every group records its own position in the array */
> +       for (idx = 0; idx <= QFQ_MAX_INDEX; idx++)
> +               priv->groups[idx].index = idx;
> +
> +       return 0;
> +}
> +
> +/*
> + * Reset the qdisc: deactivate every backlogged class, reset all the leaf
> + * qdiscs and clear the queue length.
> + */
> +static void qfq_reset_qdisc(struct Qdisc *sch)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_group *grp;
> +       struct qfq_class *cl, **pp;
> +       struct hlist_node *n;
> +       unsigned int i, j;
> +
> +       for (i = 0; i <= QFQ_MAX_INDEX; i++) {
> +               grp = &q->groups[i];
> +               for (j = 0; j < QFQ_MAX_SLOTS; j++) {
> +                       pp = &grp->slots[j];
> +                       while ((cl = *pp) != NULL) {
> +                               /*
> +                                * Deactivation unlinks cl, so *pp already
> +                                * is its successor: advance pp only when
> +                                * cl stays in the list.
> +                                */
> +                               if (cl->qdisc->q.qlen)
> +                                       qfq_deactivate_class(q, cl, pp);
> +                               else
> +                                       pp = &cl->next;
> +                       }
> +               }
> +       }
> +
> +       for (i = 0; i < q->clhash.hashsize; i++) {
> +               hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
> +                       qdisc_reset(cl->qdisc);
> +       }
> +       sch->q.qlen = 0;
> +}
> +
> +/*
> + * Tear down the qdisc: destroy the filter chain, every class found in
> + * the hash and finally the hash itself.
> + */
> +static void qfq_destroy_qdisc(struct Qdisc *sch)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_class *cl;
> +       struct hlist_node *n, *next;
> +       unsigned int i;
> +
> +       tcf_destroy_chain(&q->filter_list);
> +
> +       for (i = 0; i < q->clhash.hashsize; i++) {
> +               /* _safe variant: the current entry is freed by the loop body */
> +               hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
> +                                         common.hnode)
> +                       qfq_destroy_class(sch, cl);
> +       }
> +       qdisc_class_hash_destroy(&q->clhash);
> +}
> +
> +/* Class operations of the QFQ qdisc. */
> +static const struct Qdisc_class_ops qfq_class_ops = {
> +       .change         = qfq_change_class,
> +       .delete         = qfq_delete_class,
> +       .get            = qfq_get_class,
> +       .put            = qfq_put_class,
> +       .tcf_chain      = qfq_tcf_chain,
> +       .bind_tcf       = qfq_bind_tcf,
> +       .unbind_tcf     = qfq_unbind_tcf,
> +       .graft          = qfq_graft_class,
> +       .leaf           = qfq_class_leaf,
> +       .qlen_notify    = qfq_qlen_notify,
> +       .dump           = qfq_dump_class,
> +       .dump_stats     = qfq_dump_class_stats,
> +       .walk           = qfq_walk,
> +};
> +
> +/* Qdisc operations registered under the id "qfq". */
> +static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
> +       .cl_ops         = &qfq_class_ops,
> +       .id             = "qfq",
> +       .priv_size      = sizeof(struct qfq_sched),
> +       .enqueue        = qfq_enqueue,
> +       .dequeue        = qfq_dequeue,
> +       .peek           = qdisc_peek_dequeued,
> +       .drop           = qfq_drop,
> +       .init           = qfq_init_qdisc,
> +       .reset          = qfq_reset_qdisc,
> +       .destroy        = qfq_destroy_qdisc,
> +       .owner          = THIS_MODULE,
> +};
> +
> +/* Module entry point: register the qdisc, returning register_qdisc()'s result. */
> +static int __init qfq_init(void)
> +{
> +       return register_qdisc(&qfq_qdisc_ops);
> +}
> +
> +/* Module exit point: unregister the qdisc. */
> +static void __exit qfq_exit(void)
> +{
> +       unregister_qdisc(&qfq_qdisc_ops);
> +}
> +
> +/* Hook the init and exit functions into the module loader. */
> +module_init(qfq_init);
> +module_exit(qfq_exit);
> +MODULE_LICENSE("GPL");
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>



-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH v2] net: ppp: use {get,put}_unaligned_be{16,32}
From: David Miller @ 2011-01-08  1:15 UTC (permalink / raw)
  To: xiaosuo; +Cc: paulus, harvey.harrison, linux-ppp, netdev
In-Reply-To: <AANLkTim=1GCah0qp8HJK31LWXPb5vAZYp+d2BTQM+Q+B@mail.gmail.com>

From: Changli Gao <xiaosuo@gmail.com>
Date: Sat, 8 Jan 2011 08:43:01 +0800

> On Fri, Jan 7, 2011 at 11:01 AM, Paul Mackerras <paulus@samba.org> wrote:
>> On Fri, Jan 07, 2011 at 07:37:36AM +0800, Changli Gao wrote:
>>
>>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>>
>> This patch description is inadequate.  It should tell us why you are
>> making this change.  Does it result in smaller and/or faster code, and
>> if so by how much on what sort of machine?  Do you think it makes the
>> code clearer?  (I don't.)  Or is there some other motivation for this?
>>
> 
> Well-designed APIs always make code clearer, smaller and faster. I
> think that is obvious enough.

I have to say that every time I go read the header parsing code in the
PPP driver, I absolutely regret it.

And Changli's patch fixes some of the readability problems.

^ permalink raw reply

* Re: [PATCH v2] net: ppp: use {get,put}_unaligned_be{16,32}
From: Changli Gao @ 2011-01-08  0:43 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: David S. Miller, Harvey Harrison, linux-ppp, netdev
In-Reply-To: <20110107030145.GA8021@brick.ozlabs.ibm.com>

On Fri, Jan 7, 2011 at 11:01 AM, Paul Mackerras <paulus@samba.org> wrote:
> On Fri, Jan 07, 2011 at 07:37:36AM +0800, Changli Gao wrote:
>
>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>
> This patch description is inadequate.  It should tell us why you are
> making this change.  Does it result in smaller and/or faster code, and
> if so by how much on what sort of machine?  Do you think it makes the
> code clearer?  (I don't.)  Or is there some other motivation for this?
>

Well-designed APIs always make code clearer, smaller and faster. I
think that is obvious enough.

The names of the functions imply the endianness, like comments. On
some MIPS architectures which support unaligned load and store
instructions, the APIs result in smaller and faster code.


-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [GIT] Networking
From: Francois Romieu @ 2011-01-08  0:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ben Hutchings, David Miller, Hayes Wang, David Woodhouse, akpm,
	netdev, linux-kernel
In-Reply-To: <AANLkTinANp+Jp8TPKzii6iZLUFpMyrXhmxYr42nZrXQU@mail.gmail.com>

Linus Torvalds <torvalds@linux-foundation.org> :
[...]
> Hmm. I never even waited for 60 seconds. Maybe my boot would have
> continued after the delay.

The 60 seconds delay is here :

drivers/base/firmware_class.c
[...]
static int loading_timeout = 60;        /* In seconds */

It can be read and set through sysfs.

I'll give a try at moving the request-firmware dependent stuff to
device-open time in drivers/net/r8169.c tomorrow morning. It's Friday.

-- 
Ueimor

^ permalink raw reply

* Re: [net-next-2.6 PATCH v7 2/2] net_sched: implement a root container qdisc sch_mqprio
From: Jarek Poplawski @ 2011-01-07 23:28 UTC (permalink / raw)
  To: John Fastabend
  Cc: davem, hadi, eric.dumazet, shemminger, tgraf, bhutchings, nhorman,
	netdev
In-Reply-To: <20110107224549.19830.3961.stgit@jf-dev1-dcblab>

On Fri, Jan 07, 2011 at 02:45:49PM -0800, John Fastabend wrote:
> This implements a mqprio queueing discipline that by default creates
> a pfifo_fast qdisc per tx queue and provides the needed configuration
> interface.
> 
> Using the mqprio qdisc the number of tcs currently in use along
> with the range of queues allotted to each class can be configured. By
> default skbs are mapped to traffic classes using the skb priority.
> This mapping is configurable.
> 
> Configurable parameters,
> 
> struct tc_mqprio_qopt {
>         __u8    num_tc;
>         __u8    prio_tc_map[TC_BITMASK + 1];
>         __u8    hw;
>         __u16   count[TC_MAX_QUEUE];
>         __u16   offset[TC_MAX_QUEUE];
> };
> 
> Here the count/offset pairing give the queue alignment and the
> prio_tc_map gives the mapping from skb->priority to tc.
> 
> The hw bit determines if the hardware should configure the count
> and offset values. If the hardware bit is set then the operation
> will fail if the hardware does not implement the ndo_setup_tc
> operation. This is to avoid undetermined states where the hardware
> may or may not control the queue mapping. Also minimal bounds
> checking is done on the count/offset to verify a queue does not
> exceed num_tx_queues and that queue ranges do not overlap. Otherwise
> it is left to user policy or hardware configuration to create
> useful mappings.
> 
> It is expected that hardware QOS schemes can be implemented by
> creating appropriate mappings of queues in ndo_setup_tc().
> 
> One expected use case is drivers will use the ndo_setup_tc to map
> queue ranges onto 802.1Q traffic classes. This provides a generic
> mechanism to map network traffic onto these traffic classes and
> removes the need for lower layer drivers to know specifics about
> traffic types.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

Acked-by: Jarek Poplawski <jarkao2@gmail.com>

^ permalink raw reply

* [net-next-2.6 PATCH v7 2/2] net_sched: implement a root container qdisc sch_mqprio
From: John Fastabend @ 2011-01-07 22:45 UTC (permalink / raw)
  To: davem
  Cc: jarkao2, hadi, eric.dumazet, shemminger, tgraf, bhutchings,
	nhorman, netdev
In-Reply-To: <20110107224543.19830.74009.stgit@jf-dev1-dcblab>

This implements a mqprio queueing discipline that by default creates
a pfifo_fast qdisc per tx queue and provides the needed configuration
interface.

Using the mqprio qdisc the number of tcs currently in use along
with the range of queues allotted to each class can be configured. By
default skbs are mapped to traffic classes using the skb priority.
This mapping is configurable.

Configurable parameters,

struct tc_mqprio_qopt {
        __u8    num_tc;
        __u8    prio_tc_map[TC_BITMASK + 1];
        __u8    hw;
        __u16   count[TC_MAX_QUEUE];
        __u16   offset[TC_MAX_QUEUE];
};

Here the count/offset pairing give the queue alignment and the
prio_tc_map gives the mapping from skb->priority to tc.

The hw bit determines if the hardware should configure the count
and offset values. If the hardware bit is set then the operation
will fail if the hardware does not implement the ndo_setup_tc
operation. This is to avoid undetermined states where the hardware
may or may not control the queue mapping. Also minimal bounds
checking is done on the count/offset to verify a queue does not
exceed num_tx_queues and that queue ranges do not overlap. Otherwise
it is left to user policy or hardware configuration to create
useful mappings.

It is expected that hardware QOS schemes can be implemented by
creating appropriate mappings of queues in ndo_setup_tc().

One expected use case is drivers will use the ndo_setup_tc to map
queue ranges onto 802.1Q traffic classes. This provides a generic
mechanism to map network traffic onto these traffic classes and
removes the need for lower layer drivers to know specifics about
traffic types.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 include/linux/pkt_sched.h |   12 +
 net/sched/Kconfig         |   12 +
 net/sched/Makefile        |    1 
 net/sched/sch_generic.c   |    4 
 net/sched/sch_mqprio.c    |  418 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 447 insertions(+), 0 deletions(-)
 create mode 100644 net/sched/sch_mqprio.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 2cfa4bc..776cd93 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -481,4 +481,16 @@ struct tc_drr_stats {
 	__u32	deficit;
 };
 
+/* MQPRIO */
+#define TC_QOPT_BITMASK 15
+#define TC_QOPT_MAX_QUEUE 16
+
+struct tc_mqprio_qopt {
+	__u8	num_tc;
+	__u8	prio_tc_map[TC_QOPT_BITMASK + 1];
+	__u8	hw;
+	__u16	count[TC_QOPT_MAX_QUEUE];
+	__u16	offset[TC_QOPT_MAX_QUEUE];
+};
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a36270a..f52f5eb 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -205,6 +205,18 @@ config NET_SCH_DRR
 
 	  If unsure, say N.
 
+config NET_SCH_MQPRIO
+	tristate "Multi-queue priority scheduler (MQPRIO)"
+	help
+	  Say Y here if you want to use the Multi-queue Priority scheduler.
+	  This scheduler allows QOS to be offloaded on NICs that have support
+	  for offloading QOS schedulers.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called sch_mqprio.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 960f5db..26ce681 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
+obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
 obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 34dc598..723b278 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -540,6 +540,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.dump		=	pfifo_fast_dump,
 	.owner		=	THIS_MODULE,
 };
+EXPORT_SYMBOL(pfifo_fast_ops);
 
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 			  struct Qdisc_ops *ops)
@@ -674,6 +675,7 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 
 	return oqdisc;
 }
+EXPORT_SYMBOL(dev_graft_qdisc);
 
 static void attach_one_default_qdisc(struct net_device *dev,
 				     struct netdev_queue *dev_queue,
@@ -761,6 +763,7 @@ void dev_activate(struct net_device *dev)
 		dev_watchdog_up(dev);
 	}
 }
+EXPORT_SYMBOL(dev_activate);
 
 static void dev_deactivate_queue(struct net_device *dev,
 				 struct netdev_queue *dev_queue,
@@ -840,6 +843,7 @@ void dev_deactivate(struct net_device *dev)
 	list_add(&dev->unreg_list, &single);
 	dev_deactivate_many(&single);
 }
+EXPORT_SYMBOL(dev_deactivate);
 
 static void dev_init_scheduler_queue(struct net_device *dev,
 				     struct netdev_queue *dev_queue,
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
new file mode 100644
index 0000000..705bdfa
--- /dev/null
+++ b/net/sched/sch_mqprio.c
@@ -0,0 +1,418 @@
+/*
+ * net/sched/sch_mqprio.c
+ *
+ * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct mqprio_sched {
+	struct Qdisc		**qdiscs;
+	int hw_owned;
+};
+
+static void mqprio_destroy(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	unsigned int ntx;
+
+	if (!priv->qdiscs)
+		return;
+
+	for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
+		qdisc_destroy(priv->qdiscs[ntx]);
+
+	if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
+		dev->netdev_ops->ndo_setup_tc(dev, 0, dev->real_num_tx_queues);
+	else
+		netdev_set_num_tc(dev, 0);
+
+	kfree(priv->qdiscs);
+}
+
+static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+{
+	int i, j;
+
+	/* Verify num_tc is not out of max range */
+	if (qopt->num_tc > TC_MAX_QUEUE)
+		return -EINVAL;
+
+	/* Verify priority mapping uses valid tcs */
+	for (i = 0; i < TC_BITMASK + 1; i++) {
+		if (qopt->prio_tc_map[i] >= qopt->num_tc)
+			return -EINVAL;
+	}
+
+	/* net_device does not support requested operation */
+	if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
+		return -EINVAL;
+
+	/* if hw owned qcount and qoffset are taken from LLD so
+	 * no reason to verify them here
+	 */
+	if (qopt->hw)
+		return 0;
+
+	for (i = 0; i < qopt->num_tc; i++) {
+		unsigned int last = qopt->offset[i] + qopt->count[i];
+
+		/* Verify the queue count is in tx range; being equal to the
+		 * real_num_tx_queues indicates the last queue is in use.
+		 */
+		if (qopt->offset[i] >= dev->real_num_tx_queues ||
+		    !qopt->count[i] ||
+		    last > dev->real_num_tx_queues)
+			return -EINVAL;
+
+		/* Verify that the offset and counts do not overlap */
+		for (j = i + 1; j < qopt->num_tc; j++) {
+			if (last > qopt->offset[j])
+				return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	struct netdev_queue *dev_queue;
+	struct Qdisc *qdisc;
+	int i, err = -EOPNOTSUPP;
+	struct tc_mqprio_qopt *qopt = NULL;
+
+	BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
+	BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
+
+	if (sch->parent != TC_H_ROOT)
+		return -EOPNOTSUPP;
+
+	if (!netif_is_multiqueue(dev))
+		return -EOPNOTSUPP;
+
+	if (nla_len(opt) < sizeof(*qopt))
+		return -EINVAL;
+
+	qopt = nla_data(opt);
+	if (mqprio_parse_opt(dev, qopt))
+		return -EINVAL;
+
+	/* pre-allocate qdisc, attachment can't fail */
+	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
+			       GFP_KERNEL);
+	if (priv->qdiscs == NULL) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		dev_queue = netdev_get_tx_queue(dev, i);
+		qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+					  TC_H_MAKE(TC_H_MAJ(sch->handle),
+						    TC_H_MIN(i + 1)));
+		if (qdisc == NULL) {
+			err = -ENOMEM;
+			goto err;
+		}
+		qdisc->flags |= TCQ_F_CAN_BYPASS;
+		priv->qdiscs[i] = qdisc;
+	}
+
+	/* If the mqprio options indicate that hardware should own
+	 * the queue mapping then run ndo_setup_tc otherwise use the
+	 * supplied and verified mapping
+	 */
+	if (qopt->hw) {
+		priv->hw_owned = 1;
+		err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc,
+						    dev->real_num_tx_queues);
+		if (err)
+			goto err;
+	} else {
+		netdev_set_num_tc(dev, qopt->num_tc);
+		for (i = 0; i < qopt->num_tc; i++)
+			netdev_set_tc_queue(dev, i,
+					    qopt->count[i], qopt->offset[i]);
+	}
+
+	/* Always use supplied priority mappings */
+	for (i = 0; i < TC_BITMASK + 1; i++)
+		netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);
+
+	sch->flags |= TCQ_F_MQROOT;
+	return 0;
+
+err:
+	mqprio_destroy(sch);
+	return err;
+}
+
+static void mqprio_attach(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	struct Qdisc *qdisc;
+	unsigned int ntx;
+
+	/* Attach underlying qdisc */
+	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		qdisc = priv->qdiscs[ntx];
+		qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+		if (qdisc)
+			qdisc_destroy(qdisc);
+	}
+	kfree(priv->qdiscs);
+	priv->qdiscs = NULL;
+}
+
+static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
+					     unsigned long cl)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);
+
+	if (ntx >= dev->num_tx_queues)
+		return NULL;
+	return netdev_get_tx_queue(dev, ntx);
+}
+
+static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+		    struct Qdisc **old)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+	if (!dev_queue)
+		return -EINVAL;
+
+	if (dev->flags & IFF_UP)
+		dev_deactivate(dev);
+
+	*old = dev_graft_qdisc(dev_queue, new);
+
+	if (dev->flags & IFF_UP)
+		dev_activate(dev);
+
+	return 0;
+}
+
+static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tc_mqprio_qopt opt;
+	struct Qdisc *qdisc;
+	unsigned int i;
+
+	sch->q.qlen = 0;
+	memset(&sch->bstats, 0, sizeof(sch->bstats));
+	memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+		spin_lock_bh(qdisc_lock(qdisc));
+		sch->q.qlen		+= qdisc->q.qlen;
+		sch->bstats.bytes	+= qdisc->bstats.bytes;
+		sch->bstats.packets	+= qdisc->bstats.packets;
+		sch->qstats.qlen	+= qdisc->qstats.qlen;
+		sch->qstats.backlog	+= qdisc->qstats.backlog;
+		sch->qstats.drops	+= qdisc->qstats.drops;
+		sch->qstats.requeues	+= qdisc->qstats.requeues;
+		sch->qstats.overlimits	+= qdisc->qstats.overlimits;
+		spin_unlock_bh(qdisc_lock(qdisc));
+	}
+
+	opt.num_tc = netdev_get_num_tc(dev);
+	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+	opt.hw = priv->hw_owned;
+
+	for (i = 0; i < netdev_get_num_tc(dev); i++) {
+		opt.count[i] = dev->tc_to_txq[i].count;
+		opt.offset[i] = dev->tc_to_txq[i].offset;
+	}
+
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+	return skb->len;
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
+{
+	struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+	if (!dev_queue)
+		return NULL;
+
+	return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long mqprio_get(struct Qdisc *sch, u32 classid)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned int ntx = TC_H_MIN(classid);
+
+	if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev))
+		return 0;
+	return ntx;
+}
+
+static void mqprio_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
+			 struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct net_device *dev = qdisc_dev(sch);
+
+	if (cl <= netdev_get_num_tc(dev)) {
+		tcm->tcm_parent = TC_H_ROOT;
+		tcm->tcm_info = 0;
+	} else {
+		int i;
+		struct netdev_queue *dev_queue;
+
+		dev_queue = mqprio_queue_get(sch, cl);
+		tcm->tcm_parent = 0;
+		for (i = 0; i < netdev_get_num_tc(dev); i++) {
+			struct netdev_tc_txq tc = dev->tc_to_txq[i];
+			int q_idx = cl - netdev_get_num_tc(dev);
+
+			if (q_idx > tc.offset &&
+			    q_idx <= tc.offset + tc.count) {
+				tcm->tcm_parent =
+					TC_H_MAKE(TC_H_MAJ(sch->handle),
+						  TC_H_MIN(i + 1));
+				break;
+			}
+		}
+		tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+	}
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+			       struct gnet_dump *d)
+{
+	struct net_device *dev = qdisc_dev(sch);
+
+	if (cl <= netdev_get_num_tc(dev)) {
+		int i;
+		struct Qdisc *qdisc;
+		struct gnet_stats_queue qstats = {0};
+		struct gnet_stats_basic_packed bstats = {0};
+		struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];
+
+		/* Drop lock here; it will be reclaimed before touching
+		 * statistics. This is required because the d->lock we
+		 * hold here is the lock on dev_queue->qdisc_sleeping,
+		 * also acquired below.
+		 */
+		spin_unlock_bh(d->lock);
+
+		for (i = tc.offset; i < tc.offset + tc.count; i++) {
+			qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+			spin_lock_bh(qdisc_lock(qdisc));
+			bstats.bytes      += qdisc->bstats.bytes;
+			bstats.packets    += qdisc->bstats.packets;
+			qstats.qlen       += qdisc->qstats.qlen;
+			qstats.backlog    += qdisc->qstats.backlog;
+			qstats.drops      += qdisc->qstats.drops;
+			qstats.requeues   += qdisc->qstats.requeues;
+			qstats.overlimits += qdisc->qstats.overlimits;
+			spin_unlock_bh(qdisc_lock(qdisc));
+		}
+		/* Reclaim root sleeping lock before completing stats */
+		spin_lock_bh(d->lock);
+		if (gnet_stats_copy_basic(d, &bstats) < 0 ||
+		    gnet_stats_copy_queue(d, &qstats) < 0)
+			return -1;
+	} else {
+		struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+		sch = dev_queue->qdisc_sleeping;
+		sch->qstats.qlen = sch->q.qlen;
+		if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
+		    gnet_stats_copy_queue(d, &sch->qstats) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntx;
+
+	if (arg->stop)
+		return;
+
+	/* Walk hierarchy with a virtual class per tc */
+	arg->count = arg->skip;
+	for (ntx = arg->skip;
+	     ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
+	     ntx++) {
+		if (arg->fn(sch, ntx + 1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops mqprio_class_ops = {
+	.graft		= mqprio_graft,
+	.leaf		= mqprio_leaf,
+	.get		= mqprio_get,
+	.put		= mqprio_put,
+	.walk		= mqprio_walk,
+	.dump		= mqprio_dump_class,
+	.dump_stats	= mqprio_dump_class_stats,
+};
+
+struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
+	.cl_ops		= &mqprio_class_ops,
+	.id		= "mqprio",
+	.priv_size	= sizeof(struct mqprio_sched),
+	.init		= mqprio_init,
+	.destroy	= mqprio_destroy,
+	.attach		= mqprio_attach,
+	.dump		= mqprio_dump,
+	.owner		= THIS_MODULE,
+};
+
+static int __init mqprio_module_init(void)
+{
+	return register_qdisc(&mqprio_qdisc_ops);
+}
+
+static void __exit mqprio_module_exit(void)
+{
+	unregister_qdisc(&mqprio_qdisc_ops);
+}
+
+module_init(mqprio_module_init);
+module_exit(mqprio_module_exit);
+
+MODULE_LICENSE("GPL");


^ permalink raw reply related

* [net-next-2.6 PATCH v7 1/2] net: implement mechanism for HW based QOS
From: John Fastabend @ 2011-01-07 22:45 UTC (permalink / raw)
  To: davem
  Cc: jarkao2, hadi, eric.dumazet, shemminger, tgraf, bhutchings,
	nhorman, netdev

This patch provides a mechanism for lower layer devices to
steer traffic using skb->priority to tx queues. This allows
for hardware based QOS schemes to use the default qdisc without
incurring the penalties related to global state and the qdisc
lock, while reliably receiving skbs on the correct tx ring
to avoid head of line blocking resulting from shuffling in
the LLD. Finally, all the goodness from txq caching and xps/rps
can still be leveraged.

Many drivers and hardware exist with the ability to implement
QOS schemes in the hardware but currently these drivers tend
to rely on firmware to reroute specific traffic, a driver
specific select_queue or the queue_mapping action in the
qdisc.

By using select_queue for this drivers need to be updated for
each and every traffic type and we lose the goodness of much
of the upstream work. Firmware solutions are inherently
inflexible. And finally if admins are expected to build a
qdisc and filter rules to steer traffic this requires knowledge
of how the hardware is currently configured. The number of tx
queues and the queue offsets may change depending on resources.
Also this approach incurs all the overhead of a qdisc with filters.

With the mechanism in this patch users can set skb priority using
expected methods ie setsockopt() or the stack can set the priority
directly. Then the skb will be steered to the correct tx queues
aligned with hardware QOS traffic classes. In the normal case with
a single traffic class and all queues in this class everything
works as is until the LLD enables multiple tcs.

To steer the skb we mask out the lower 4 bits of the priority
and allow the hardware to configure up to 15 distinct classes
of traffic. This is expected to be sufficient for most applications;
at any rate it is more than the 8021Q spec designates and is
equal to the number of prio bands currently implemented in
the default qdisc.

This in conjunction with a userspace application such as
lldpad can be used to implement 8021Q transmission selection
algorithms one of these algorithms being the extended transmission
selection algorithm currently being used for DCB.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 include/linux/netdevice.h |   65 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/dev.c            |   61 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 1 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0f6b1c9..b1dbbed 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -646,6 +646,14 @@ struct xps_dev_maps {
     (nr_cpu_ids * sizeof(struct xps_map *)))
 #endif /* CONFIG_XPS */
 
+#define TC_MAX_QUEUE	16
+#define TC_BITMASK	15
+/* HW offloaded queuing disciplines txq count and offset maps */
+struct netdev_tc_txq {
+	u16 count;
+	u16 offset;
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -756,6 +764,7 @@ struct xps_dev_maps {
  * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
  *			  struct nlattr *port[]);
  * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
+ * int (*ndo_setup_tc)(struct net_device *dev, u8 tc, unsigned int txq)
  */
 #define HAVE_NET_DEVICE_OPS
 struct net_device_ops {
@@ -814,6 +823,8 @@ struct net_device_ops {
 						   struct nlattr *port[]);
 	int			(*ndo_get_vf_port)(struct net_device *dev,
 						   int vf, struct sk_buff *skb);
+	int			(*ndo_setup_tc)(struct net_device *dev, u8 tc,
+						unsigned int txq);
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	int			(*ndo_fcoe_enable)(struct net_device *dev);
 	int			(*ndo_fcoe_disable)(struct net_device *dev);
@@ -1146,6 +1157,9 @@ struct net_device {
 	/* Data Center Bridging netlink ops */
 	const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
+	u8 num_tc;
+	struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
+	u8 prio_tc_map[TC_BITMASK + 1];
 
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	/* max exchange id for FCoE LRO by ddp */
@@ -1162,6 +1176,57 @@ struct net_device {
 #define	NETDEV_ALIGN		32
 
 static inline
+int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio)
+{
+	return dev->prio_tc_map[prio & TC_BITMASK];
+}
+
+static inline
+int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
+{
+	if (tc >= dev->num_tc)
+		return -EINVAL;
+
+	dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK;
+	return 0;
+}
+
+static inline
+void netdev_reset_tc(struct net_device *dev)
+{
+	dev->num_tc = 0;
+	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
+	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
+}
+
+static inline
+int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
+{
+	if (tc >= dev->num_tc)
+		return -EINVAL;
+
+	dev->tc_to_txq[tc].count = count;
+	dev->tc_to_txq[tc].offset = offset;
+	return 0;
+}
+
+static inline
+int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
+{
+	if (num_tc > TC_MAX_QUEUE)
+		return -EINVAL;
+
+	dev->num_tc = num_tc;
+	return 0;
+}
+
+static inline
+int netdev_get_num_tc(struct net_device *dev)
+{
+	return dev->num_tc;
+}
+
+static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 					 unsigned int index)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index a215269..7c9b1aa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1593,6 +1593,54 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 	rcu_read_unlock();
 }
 
+/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this if the net_device supports ndo_setup_tc
+ * call the ops routine with the new queue number. If the ops is not
+ * available verify the tc mapping remains valid and if not NULL the
+ * mapping. With no priorities mapping to this offset/count pair it
+ * will no longer be used. In the worst case TC0 is invalid nothing
+ * can be done so disable priority mappings.
+ */
+void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (ops->ndo_setup_tc) {
+		ops->ndo_setup_tc(dev, dev->num_tc, txq);
+	} else {
+		int i;
+		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+		/* If TC0 is invalidated disable TC mapping */
+		if (tc->offset + tc->count > txq) {
+			pr_warning("Number of in use tx queues changed "
+				   "invalidating tc mappings. Priority "
+				   "traffic classification disabled!\n");
+			dev->num_tc = 0;
+			return;
+		}
+
+		/* Invalidated prio to tc mappings set to TC0 */
+		for (i = 1; i < TC_BITMASK + 1; i++) {
+			int q = netdev_get_prio_tc_map(dev, i);
+
+			tc = &dev->tc_to_txq[q];
+			if (tc->offset + tc->count > txq) {
+				pr_warning("Number of in use tx queues "
+					   "changed. Priority %i to tc "
+					   "mapping %i is no longer valid "
+					   "setting map to 0\n",
+					   i, q);
+				netdev_set_prio_tc_map(dev, i, 0);
+			}
+		}
+	}
+}
+
 /*
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -1612,6 +1660,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 		if (rc)
 			return rc;
 
+		if (dev->num_tc)
+			netif_setup_tc(dev, txq);
+
 		if (txq < dev->real_num_tx_queues)
 			qdisc_reset_all_tx_gt(dev, txq);
 	}
@@ -2165,6 +2216,8 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
 		  unsigned int num_tx_queues)
 {
 	u32 hash;
+	u16 qoffset = 0;
+	u16 qcount = num_tx_queues;
 
 	if (skb_rx_queue_recorded(skb)) {
 		hash = skb_get_rx_queue(skb);
@@ -2173,13 +2226,19 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
 		return hash;
 	}
 
+	if (dev->num_tc) {
+		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+		qoffset = dev->tc_to_txq[tc].offset;
+		qcount = dev->tc_to_txq[tc].count;
+	}
+
 	if (skb->sk && skb->sk->sk_hash)
 		hash = skb->sk->sk_hash;
 	else
 		hash = (__force u16) skb->protocol ^ skb->rxhash;
 	hash = jhash_1word(hash, hashrnd);
 
-	return (u16) (((u64) hash * num_tx_queues) >> 32);
+	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
 }
 EXPORT_SYMBOL(__skb_tx_hash);
 


^ permalink raw reply related

* Re: [GIT] Networking
From: Linus Torvalds @ 2011-01-07 22:48 UTC (permalink / raw)
  To: Francois Romieu
  Cc: Ben Hutchings, David Miller, Hayes Wang, David Woodhouse, akpm,
	netdev, linux-kernel
In-Reply-To: <20110107215505.GA1892@electric-eye.fr.zoreil.com>

On Fri, Jan 7, 2011 at 1:55 PM, Francois Romieu <romieu@fr.zoreil.com> wrote:
> Linus Torvalds <torvalds@linux-foundation.org> :
> [...]
>> I just confirmed that building it as a module works
>
> I have just tried a non-modular build and it worked without firmware.
>
...
> [    4.340876] sd 1:0:0:0: [sda] Attached SCSI disk
> [   63.968081] r8169 0000:02:00.0: eth0: unable to apply firmware patch
>
> It's here. After a 60 seconds black-out.

Hmm. I never even waited for 60 seconds. Maybe my boot would have
continued after the delay.

                    Linus

^ permalink raw reply

* Re: [net-next-2.6 PATCH v6 1/2] net: implement mechanism for HW based QOS
From: John Fastabend @ 2011-01-07 22:48 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: davem@davemloft.net, hadi@cyberus.ca, eric.dumazet@gmail.com,
	shemminger@vyatta.com, tgraf@infradead.org,
	bhutchings@solarflare.com, nhorman@tuxdriver.com,
	netdev@vger.kernel.org
In-Reply-To: <20110107214645.GB2050@del.dom.local>

On 1/7/2011 1:46 PM, Jarek Poplawski wrote:
> On Thu, Jan 06, 2011 at 07:12:11PM -0800, John Fastabend wrote:
>> This patch provides a mechanism for lower layer devices to
>> steer traffic using skb->priority to tx queues. This allows
>> for hardware based QOS schemes to use the default qdisc without
>> incurring the penalties related to global state and the qdisc
>> lock. While reliably receiving skbs on the correct tx ring
>> to avoid head of line blocking resulting from shuffling in
>> the LLD. Finally, all the goodness from txq caching and xps/rps
>> can still be leveraged.
>>
>> Many drivers and hardware exist with the ability to implement
>> QOS schemes in the hardware but currently these drivers tend
>> to rely on firmware to reroute specific traffic, a driver
>> specific select_queue or the queue_mapping action in the
>> qdisc.
>>
>> By using select_queue for this drivers need to be updated for
>> each and every traffic type and we lose the goodness of much
>> of the upstream work. Firmware solutions are inherently
>> inflexible. And finally if admins are expected to build a
>> qdisc and filter rules to steer traffic this requires knowledge
>> of how the hardware is currently configured. The number of tx
>> queues and the queue offsets may change depending on resources.
>> Also this approach incurs all the overhead of a qdisc with filters.
>>
>> With the mechanism in this patch users can set skb priority using
>> expected methods ie setsockopt() or the stack can set the priority
>> directly. Then the skb will be steered to the correct tx queues
>> aligned with hardware QOS traffic classes. In the normal case with
>> a single traffic class and all queues in this class everything
>> works as is until the LLD enables multiple tcs.
>>
>> To steer the skb we mask out the lower 4 bits of the priority
>> and allow the hardware to configure up to 15 distinct classes
>> of traffic. This is expected to be sufficient for most applications;
>> at any rate it is more than the 8021Q spec designates and is
>> equal to the number of prio bands currently implemented in
>> the default qdisc.
>>
>> This in conjunction with a userspace application such as
>> lldpad can be used to implement 8021Q transmission selection
>> algorithms one of these algorithms being the extended transmission
>> selection algorithm currently being used for DCB.
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>>
>>  include/linux/netdevice.h |   65 +++++++++++++++++++++++++++++++++++++++++++++
>>  net/core/dev.c            |   52 +++++++++++++++++++++++++++++++++++-
>>  2 files changed, 116 insertions(+), 1 deletions(-)
>>
>> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>> index 0f6b1c9..12fff42 100644
>> --- a/include/linux/netdevice.h
>> +++ b/include/linux/netdevice.h
>> @@ -646,6 +646,14 @@ struct xps_dev_maps {
>>      (nr_cpu_ids * sizeof(struct xps_map *)))
>>  #endif /* CONFIG_XPS */
>>  
>> +#define TC_MAX_QUEUE	16
>> +#define TC_BITMASK	15
>> +/* HW offloaded queuing disciplines txq count and offset maps */
>> +struct netdev_tc_txq {
>> +	u16 count;
>> +	u16 offset;
>> +};
>> +
>>  /*
>>   * This structure defines the management hooks for network devices.
>>   * The following hooks can be defined; unless noted otherwise, they are
>> @@ -756,6 +764,7 @@ struct xps_dev_maps {
>>   * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
>>   *			  struct nlattr *port[]);
>>   * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
>> + * void (*ndo_setup_tc)(struct net_device *dev, u8 tc)
> 
> ..., unsigned int txq) ?
> 
>>   */
>>  #define HAVE_NET_DEVICE_OPS
>>  struct net_device_ops {
>> @@ -814,6 +823,8 @@ struct net_device_ops {
>>  						   struct nlattr *port[]);
>>  	int			(*ndo_get_vf_port)(struct net_device *dev,
>>  						   int vf, struct sk_buff *skb);
>> +	int			(*ndo_setup_tc)(struct net_device *dev, u8 tc,
>> +						unsigned int txq);
> 
> ...
>> +/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
>> + * @dev: Network device
>> + * @txq: number of queues available
>> + *
>> + * If real_num_tx_queues is changed the tc mappings may no longer be
>> + * valid. To resolve this if the net_device supports ndo_setup_tc
>> + * call the ops routine with the new queue number. If the ops is not
>> + * available verify the tc mapping remains valid and if not NULL the
>> + * mapping. With no priorities mapping to this offset/count pair it
>> + * will no longer be used. In the worst case TC0 is invalid nothing
>> + * can be done so disable priority mappings.
>> + */
>> +void netif_setup_tc(struct net_device *dev, unsigned int txq)
>> +{
>> +	const struct net_device_ops *ops = dev->netdev_ops;
>> +
>> +	if (ops->ndo_setup_tc) {
>> +		ops->ndo_setup_tc(dev, dev->num_tc, txq);
>> +	} else {
>> +		int i;
>> +		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
>> +
>> +		/* If TC0 is invalidated disable TC mapping */
>> +		if (tc->offset + tc->count > txq) {
>> +			dev->num_tc = 0;
>> +			return;
>> +		}
>> +
>> +		/* Invalidated prio to tc mappings set to TC0 */
>> +		for (i = 1; i < TC_BITMASK + 1; i++) {
>> +			int q = netdev_get_prio_tc_map(dev, i);
> 
> (empty line)
> Btw, probably some warning should be logged on config change here.
> 

OK maybe I should see about making at least my local checkpatch script
look for this. Also added pr_warnings here.

>> +			tc = &dev->tc_to_txq[q];
>> +
>> +			if (tc->offset + tc->count > txq)
>> +				netdev_set_prio_tc_map(dev, i, 0);
>> +		}
>> +	}
>> +}
>> +
>>  /*
>>   * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
>>   * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
>> @@ -1614,6 +1653,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
>>  
>>  		if (txq < dev->real_num_tx_queues)
>>  			qdisc_reset_all_tx_gt(dev, txq);
>> +
>> +		if (dev->num_tc)
>> +			netif_setup_tc(dev, txq);
> 
> Should be before qdisc_reset_all_tx_gt (above).
> 
> Jarek P.

I will fix this. Thanks!


^ permalink raw reply

* Re: [GIT] Networking
From: Francois Romieu @ 2011-01-07 21:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: David Miller, Hayes Wang, Ben Hutchings, David Woodhouse, akpm,
	netdev, linux-kernel
In-Reply-To: <AANLkTi=7=_FyVsjA3vKEpg8RwfC=m-8QGfqa6ctxQday@mail.gmail.com>

Linus Torvalds <torvalds@linux-foundation.org> :
> On Thu, Jan 6, 2011 at 12:20 PM, David Miller <davem@davemloft.net> wrote:
> >
> > Plus the usual spattering of wireless, bluetooth, and wired driver
> > updates.
> 
> Grr.

Oops.

[...]
> [torvalds@i5 linux]$ git bisect good
> bca03d5f32c8ee9b5cfa1d32640a63fded6cb3c0 is the first bad commit
> commit bca03d5f32c8ee9b5cfa1d32640a63fded6cb3c0
> Author: françois romieu <romieu@fr.zoreil.com>
> Date:   Mon Jan 3 15:07:31 2011 +0000
> 
>     r8169: remove the firmware of RTL8111D.
> 
>     The binary file of the firmware is moved to linux-firmware repository.
>     The firmwares are rtl_nic/rtl8168d-1.fw and rtl_nic/rtl8168d-2.fw.
>     The driver goes along if the firmware couldn't be found. However, it
>     is suggested to be done with the suitable firmware.
> 
>     Some wrong PHY parameters are directly corrected in the driver.
> 
>     Simple firmware checking added per Ben Hutchings suggestion.
> 
>     Signed-off-by: Hayes Wang <hayeswang@realtek.com>
>     Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>
>     Cc: Ben Hutchings <benh@debian.org>
>     Signed-off-by: David S. Miller <davem@davemloft.net>
> 
> and the behavior is very broken: it just hangs at boot-time. No
> messages from the driver (certainly not any messages about missing
> firmware), no nothing. The thing is just hung.
>
> On a working setup, I have
> 
> 
>   [    1.595071] r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
>   [    1.595114] r8169 0000:01:00.0: PCI INT A -> GSI 19 (level, low) -> IRQ 19
>   [    1.595174] r8169 0000:01:00.0: setting latency timer to 64
>   [    1.595227] r8169 0000:01:00.0: irq 42 for MSI/MSI-X
>   [    1.595770] r8169 0000:01:00.0: eth0: RTL8168d/8111d at
> 0xffffc90000068000, e0:cb:4e:95:1a:d7, XID 083000c0 IRQ 42

>   ...
>   [    7.985917] r8169 0000:01:00.0: eth0: link up
>   [    7.987525] r8169 0000:01:00.0: eth0: link up
> 
> while on a non-working setup, I get that XID line, and after that a
> few other init routines continue to show up (probably just
> multi-threaded initcalls), but the box is dead.
> 
> Quite frankly, that commit looks broken anyway. It doesn't just switch
> to the firmware loader, it also seems to change other things (ie
> removed some mdio writes, added others).

It was described in the commit message as
[...]
    Some wrong PHY parameters are directly corrected in the driver.

> What's going on?

The firmware is supposed to be optional. The (relevant) 8168 have
been reported to work without it. I have tested it with my hardware
(RTL_GIGA_MAC_VER_26 (== your), RTL_GIGA_MAC_VER_25 + some others),
seen the "r8169 0000:02:00.0: eth0: unable to apply firmware patch"
message and noticed that the card was actually working. Though I did
not try a non-modular build, this was not exactly a five minutes
test.

I had no technical reason [*] to turn the firmware mandatory and screw
distro-maintainers who have decided that the firmware is not free enough.

[*] The small "I will regret it" voices in my head do not count as
    technical reasons.

[...]
> Not acceptable. I'm ok with an external firmware repository, but only
> if it _works_. Right now it doesn't. It just seems to cause silent
> failures: there were no warnings about missing firmware at ANY time.
> Not build-time, not run-time.

?

The driver is supposed to display a message at KERN_WARNING.

-- 
Ueimor

^ permalink raw reply

* Re: [PATCH] Export ACPI _DSM provided firmware instance number and string name to sysfs
From: Jesse Barnes @ 2011-01-07 22:27 UTC (permalink / raw)
  To: Narendra_K
  Cc: Matt_Domsch, linux-pci, linux-hotplug, netdev, Jordan_Hargrave,
	Charles_Rose, Vijay_Nijhawan
In-Reply-To: <20101223194927.GA2717@fedora14-r610.oslab.blr.amer.dell.com>

On Thu, 23 Dec 2010 11:24:36 -0800
<Narendra_K@Dell.com> wrote:

> On Thu, Dec 23, 2010 at 08:02:03PM +0530, Domsch, Matt wrote:
> > On Wed, Dec 22, 2010 at 08:42:39AM -0800, Narendra_K@Dell.com wrote:
> > > Hello,
> > > 
> > > This patch exports ACPI _DSM provided firmware instance number and
> > > string name to sysfs.
> > > 
> > > Please review -
> > 
> > There are now two different meanings for the 'index' file:
> > 
> > 1) SMBIOS-provided "type instance" value, which I've only seen in
> >    range [1..N] for N devices, monotonically stepwise increasing.
> > 
> > 2) ACPI-provided "index" value, which per spec only needs to be a
> >    "sort key", not starting at 0 or 1, and while monotonically
> >    increasing, not necessarily stepwise.  It's perfectly valid for the
> >    values to be (12, 16, 27, 29) if that's convenient for BIOS to
> >    generate.
> > 
> > Therefore, a consumer of this value (such as biosdevname) must know
> > which of the two it's dealing with, and either accept the value as-is,
> > or sort the value list.  While I suppose it could sort the value list
> > in either case, I'd prefer the ACPI value to be exposed in its own
> > file, perhaps 'acpi_index', to make this explicit rather than
> > implicit.
> > 
> > 'label' is fine for either case, with ACPI taking priority over
> > SMBIOS if both happen to be present.
> > 

Applied, thanks.

-- 
Jesse Barnes, Intel Open Source Technology Center

^ permalink raw reply

* Re: [net-next-2.6 PATCH v6 2/2] net_sched: implement a root container qdisc sch_mqprio
From: John Fastabend @ 2011-01-07 22:16 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: davem@davemloft.net, hadi@cyberus.ca, eric.dumazet@gmail.com,
	shemminger@vyatta.com, tgraf@infradead.org,
	bhutchings@solarflare.com, nhorman@tuxdriver.com,
	netdev@vger.kernel.org
In-Reply-To: <20110107212140.GA2050@del.dom.local>

On 1/7/2011 1:21 PM, Jarek Poplawski wrote:
> On Thu, Jan 06, 2011 at 07:12:16PM -0800, John Fastabend wrote:
>> This implements a mqprio queueing discipline that by default creates
>> a pfifo_fast qdisc per tx queue and provides the needed configuration
>> interface.
>>
>> Using the mqprio qdisc the number of tcs currently in use along
>> with the range of queues allotted to each class can be configured. By
>> default skbs are mapped to traffic classes using the skb priority.
>> This mapping is configurable.
>>
>> Configurable parameters,
>>
>> struct tc_mqprio_qopt {
>>         __u8    num_tc;
>>         __u8    prio_tc_map[TC_BITMASK + 1];
>>         __u8    hw;
>>         __u16   count[TC_MAX_QUEUE];
>>         __u16   offset[TC_MAX_QUEUE];
>> };
>>
>> Here the count/offset pairing give the queue alignment and the
>> prio_tc_map gives the mapping from skb->priority to tc.
>>
>> The hw bit determines if the hardware should configure the count
>> and offset values. If the hardware bit is set then the operation
>> will fail if the hardware does not implement the ndo_setup_tc
>> operation. This is to avoid undetermined states where the hardware
>> may or may not control the queue mapping. Also minimal bounds
>> checking is done on the count/offset to verify a queue does not
>> exceed num_tx_queues and that queue ranges do not overlap. Otherwise
>> it is left to user policy or hardware configuration to create
>> useful mappings.
>>
>> It is expected that hardware QOS schemes can be implemented by
>> creating appropriate mappings of queues in ndo_setup_tc().
>>
>> One expected use case is drivers will use the ndo_setup_tc to map
>> queue ranges onto 802.1Q traffic classes. This provides a generic
>> mechanism to map network traffic onto these traffic classes and
>> removes the need for lower layer drivers to know specifics about
>> traffic types.
>>
>> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
>> ---
>>
>>  include/linux/pkt_sched.h |   12 +
>>  net/sched/Kconfig         |   12 +
>>  net/sched/Makefile        |    1 
>>  net/sched/sch_generic.c   |    4 
>>  net/sched/sch_mqprio.c    |  415 +++++++++++++++++++++++++++++++++++++++++++++
>>  5 files changed, 444 insertions(+), 0 deletions(-)
>>  create mode 100644 net/sched/sch_mqprio.c
>>
>> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
>> index 2cfa4bc..776cd93 100644
>> --- a/include/linux/pkt_sched.h
>> +++ b/include/linux/pkt_sched.h
>> @@ -481,4 +481,16 @@ struct tc_drr_stats {
>>  	__u32	deficit;
>>  };
>>  
>> +/* MQPRIO */
>> +#define TC_QOPT_BITMASK 15
>> +#define TC_QOPT_MAX_QUEUE 16
>> +
>> +struct tc_mqprio_qopt {
>> +	__u8	num_tc;
>> +	__u8	prio_tc_map[TC_QOPT_BITMASK + 1];
>> +	__u8	hw;
>> +	__u16	count[TC_QOPT_MAX_QUEUE];
>> +	__u16	offset[TC_QOPT_MAX_QUEUE];
>> +};
> 
> ...
>> +static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
>> +{
>> +	int i, j;
>> +
>> +	/* Verify num_tc is not out of max range */
>> +	if (qopt->num_tc > TC_QOPT_MAX_QUEUE)
> 
> If these TC_QOPTs really couldn't be avoided you should probably check
> them with BUILD_BUG_ON() but use only TC_MAX_QUEUE/TC_BITMASK
> everywhere. Otherwise, it looks OK to me.
> 
> Jarek P.

I couldn't think of any better way. I'll add the BUILD_BUG_ON() macros.

Thanks 
John

^ permalink raw reply

* Re: [GIT] Networking
From: Francois Romieu @ 2011-01-07 21:55 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ben Hutchings, David Miller, Hayes Wang, David Woodhouse, akpm,
	netdev, linux-kernel
In-Reply-To: <AANLkTikwrHoOY4wvUp1qn-OT=EjZesSNR29US1ERc0o2@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 2017 bytes --]

Linus Torvalds <torvalds@linux-foundation.org> :
[...]
> I just confirmed that building it as a module works

I have just tried a non-modular build and it worked without firmware.

However:
[    3.508989] scsi2 : pata_sis
[    3.515602] scsi3 : pata_sis
[    3.522815] ata3: PATA max UDMA/133 cmd 0x1f0 ctl 0x3f6 bmdma 0xffa0 irq 14
[    3.537408] ata4: PATA max UDMA/133 cmd 0x170 ctl 0x376 bmdma 0xffa8 irq 15
[    3.552204] Fixed MDIO Bus: probed
[    3.559902] r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
[    3.571875] ACPI: PCI Interrupt Link [LNKE] enabled at IRQ 5
[    3.583869] PCI: setting IRQ 5 as level-triggered
[    3.583877] r8169 0000:02:00.0: PCI INT A -> Link[LNKE] -> GSI 5 (level, low) -> IRQ 5
[    3.600399] r8169 0000:02:00.0: enabling Mem-Wr-Inval
[    3.600440] r8169 0000:02:00.0: setting latency timer to 64
[    3.600453] r8169 0000:02:00.0: no MSI. Back to INTx.
[    3.611416] r8169 0000:02:00.0: eth0: RTL8168d/8111d at 0xf87ea000, 00:e0:4c:68:00:2c, XID 083000c0 IRQ 5

No firmware warning message ? 

[    3.842642] ata1: SATA link down (SStatus 0 SControl 300)
[    4.172056] ata2: SATA link up 1.5 Gbps (SStatus 113 SControl 300)
[    4.192318] ata2.00: ATA-7: SAMSUNG HD250HJ, FH100-05, max UDMA7
[    4.205162] ata2.00: 488397168 sectors, multi 16: LBA48 NCQ (depth 0/32)
[    4.240311] ata2.00: configured for UDMA/133
[    4.249869] scsi 1:0:0:0: Direct-Access     ATA      SAMSUNG HD250HJ  FH10 PQ: 0 ANSI: 5
[    4.267151] sd 1:0:0:0: [sda] 488397168 512-byte logical blocks: (250 GB/232 GiB)
[    4.283199] sd 1:0:0:0: Attached scsi generic sg0 type 0
[    4.295028] sd 1:0:0:0: [sda] Write Protect is off
[    4.305610] sd 1:0:0:0: [sda] Mode Sense: 00 3a 00 00
[    4.305637] sd 1:0:0:0: [sda] Write cache: enabled, read cache: enabled, doesn't support DPO or FUA
[    4.334873]  sda: sda1
[    4.340876] sd 1:0:0:0: [sda] Attached SCSI disk
[   63.968081] r8169 0000:02:00.0: eth0: unable to apply firmware patch

It's here. After a 60 seconds black-out.

-- 
Ueimor

[-- Attachment #2: dmesg.gz --]
[-- Type: application/x-gzip, Size: 14038 bytes --]

^ permalink raw reply

* Re: [net-next-2.6 PATCH v6 1/2] net: implement mechanism for HW based QOS
From: Jarek Poplawski @ 2011-01-07 21:46 UTC (permalink / raw)
  To: John Fastabend
  Cc: davem, hadi, eric.dumazet, shemminger, tgraf, bhutchings, nhorman,
	netdev
In-Reply-To: <20110107031211.2446.35715.stgit@jf-dev1-dcblab>

On Thu, Jan 06, 2011 at 07:12:11PM -0800, John Fastabend wrote:
> This patch provides a mechanism for lower layer devices to
> steer traffic using skb->priority to tx queues. This allows
> for hardware based QOS schemes to use the default qdisc without
> incurring the penalties related to global state and the qdisc
> lock. While reliably receiving skbs on the correct tx ring
> to avoid head of line blocking resulting from shuffling in
> the LLD. Finally, all the goodness from txq caching and xps/rps
> can still be leveraged.
> 
> Many drivers and hardware exist with the ability to implement
> QOS schemes in the hardware but currently these drivers tend
> to rely on firmware to reroute specific traffic, a driver
> specific select_queue or the queue_mapping action in the
> qdisc.
> 
> By using select_queue for this drivers need to be updated for
> each and every traffic type and we lose the goodness of much
> of the upstream work. Firmware solutions are inherently
> inflexible. And finally if admins are expected to build a
> qdisc and filter rules to steer traffic this requires knowledge
> of how the hardware is currently configured. The number of tx
> queues and the queue offsets may change depending on resources.
> Also this approach incurs all the overhead of a qdisc with filters.
> 
> With the mechanism in this patch users can set skb priority using
> expected methods ie setsockopt() or the stack can set the priority
> directly. Then the skb will be steered to the correct tx queues
> aligned with hardware QOS traffic classes. In the normal case with
> a single traffic class and all queues in this class everything
> works as is until the LLD enables multiple tcs.
> 
> To steer the skb we mask out the lower 4 bits of the priority
> and allow the hardware to configure up to 15 distinct classes
> of traffic. This is expected to be sufficient for most applications;
> at any rate it is more than the 8021Q spec designates and is
> equal to the number of prio bands currently implemented in
> the default qdisc.
> 
> This in conjunction with a userspace application such as
> lldpad can be used to implement 8021Q transmission selection
> algorithms one of these algorithms being the extended transmission
> selection algorithm currently being used for DCB.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
> 
>  include/linux/netdevice.h |   65 +++++++++++++++++++++++++++++++++++++++++++++
>  net/core/dev.c            |   52 +++++++++++++++++++++++++++++++++++-
>  2 files changed, 116 insertions(+), 1 deletions(-)
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 0f6b1c9..12fff42 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -646,6 +646,14 @@ struct xps_dev_maps {
>      (nr_cpu_ids * sizeof(struct xps_map *)))
>  #endif /* CONFIG_XPS */
>  
> +#define TC_MAX_QUEUE	16
> +#define TC_BITMASK	15
> +/* HW offloaded queuing disciplines txq count and offset maps */
> +struct netdev_tc_txq {
> +	u16 count;
> +	u16 offset;
> +};
> +
>  /*
>   * This structure defines the management hooks for network devices.
>   * The following hooks can be defined; unless noted otherwise, they are
> @@ -756,6 +764,7 @@ struct xps_dev_maps {
>   * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
>   *			  struct nlattr *port[]);
>   * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
> + * void (*ndo_setup_tc)(struct net_device *dev, u8 tc)

..., unsigned int txq) ?

>   */
>  #define HAVE_NET_DEVICE_OPS
>  struct net_device_ops {
> @@ -814,6 +823,8 @@ struct net_device_ops {
>  						   struct nlattr *port[]);
>  	int			(*ndo_get_vf_port)(struct net_device *dev,
>  						   int vf, struct sk_buff *skb);
> +	int			(*ndo_setup_tc)(struct net_device *dev, u8 tc,
> +						unsigned int txq);

...
> +/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
> + * @dev: Network device
> + * @txq: number of queues available
> + *
> + * If real_num_tx_queues is changed the tc mappings may no longer be
> + * valid. To resolve this if the net_device supports ndo_setup_tc
> + * call the ops routine with the new queue number. If the ops is not
> + * available verify the tc mapping remains valid and if not NULL the
> + * mapping. With no priorities mapping to this offset/count pair it
> + * will no longer be used. In the worst case TC0 is invalid nothing
> + * can be done so disable priority mappings.
> + */
> +void netif_setup_tc(struct net_device *dev, unsigned int txq)
> +{
> +	const struct net_device_ops *ops = dev->netdev_ops;
> +
> +	if (ops->ndo_setup_tc) {
> +		ops->ndo_setup_tc(dev, dev->num_tc, txq);
> +	} else {
> +		int i;
> +		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
> +
> +		/* If TC0 is invalidated disable TC mapping */
> +		if (tc->offset + tc->count > txq) {
> +			dev->num_tc = 0;
> +			return;
> +		}
> +
> +		/* Invalidated prio to tc mappings set to TC0 */
> +		for (i = 1; i < TC_BITMASK + 1; i++) {
> +			int q = netdev_get_prio_tc_map(dev, i);

(empty line)
Btw, probably some warning should be logged on config change here.

> +			tc = &dev->tc_to_txq[q];
> +
> +			if (tc->offset + tc->count > txq)
> +				netdev_set_prio_tc_map(dev, i, 0);
> +		}
> +	}
> +}
> +
>  /*
>   * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
>   * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
> @@ -1614,6 +1653,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
>  
>  		if (txq < dev->real_num_tx_queues)
>  			qdisc_reset_all_tx_gt(dev, txq);
> +
> +		if (dev->num_tc)
> +			netif_setup_tc(dev, txq);

Should be before qdisc_reset_all_tx_gt (above).

Jarek P.

^ permalink raw reply

* Re: [net-next-2.6 PATCH v6 2/2] net_sched: implement a root container qdisc sch_mqprio
From: Jarek Poplawski @ 2011-01-07 21:21 UTC (permalink / raw)
  To: John Fastabend
  Cc: davem, hadi, eric.dumazet, shemminger, tgraf, bhutchings, nhorman,
	netdev
In-Reply-To: <20110107031216.2446.35953.stgit@jf-dev1-dcblab>

On Thu, Jan 06, 2011 at 07:12:16PM -0800, John Fastabend wrote:
> This implements a mqprio queueing discipline that by default creates
> a pfifo_fast qdisc per tx queue and provides the needed configuration
> interface.
> 
> Using the mqprio qdisc the number of tcs currently in use along
> with the range of queues allotted to each class can be configured. By
> default skbs are mapped to traffic classes using the skb priority.
> This mapping is configurable.
> 
> Configurable parameters,
> 
> struct tc_mqprio_qopt {
>         __u8    num_tc;
>         __u8    prio_tc_map[TC_BITMASK + 1];
>         __u8    hw;
>         __u16   count[TC_MAX_QUEUE];
>         __u16   offset[TC_MAX_QUEUE];
> };
> 
> Here the count/offset pairing give the queue alignment and the
> prio_tc_map gives the mapping from skb->priority to tc.
> 
> The hw bit determines if the hardware should configure the count
> and offset values. If the hardware bit is set then the operation
> will fail if the hardware does not implement the ndo_setup_tc
> operation. This is to avoid undetermined states where the hardware
> may or may not control the queue mapping. Also minimal bounds
> checking is done on the count/offset to verify a queue does not
> exceed num_tx_queues and that queue ranges do not overlap. Otherwise
> it is left to user policy or hardware configuration to create
> useful mappings.
> 
> It is expected that hardware QOS schemes can be implemented by
> creating appropriate mappings of queues in ndo_setup_tc().
> 
> One expected use case is drivers will use the ndo_setup_tc to map
> queue ranges onto 802.1Q traffic classes. This provides a generic
> mechanism to map network traffic onto these traffic classes and
> removes the need for lower layer drivers to know specifics about
> traffic types.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
> ---
> 
>  include/linux/pkt_sched.h |   12 +
>  net/sched/Kconfig         |   12 +
>  net/sched/Makefile        |    1 
>  net/sched/sch_generic.c   |    4 
>  net/sched/sch_mqprio.c    |  415 +++++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 444 insertions(+), 0 deletions(-)
>  create mode 100644 net/sched/sch_mqprio.c
> 
> diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
> index 2cfa4bc..776cd93 100644
> --- a/include/linux/pkt_sched.h
> +++ b/include/linux/pkt_sched.h
> @@ -481,4 +481,16 @@ struct tc_drr_stats {
>  	__u32	deficit;
>  };
>  
> +/* MQPRIO */
> +#define TC_QOPT_BITMASK 15
> +#define TC_QOPT_MAX_QUEUE 16
> +
> +struct tc_mqprio_qopt {
> +	__u8	num_tc;
> +	__u8	prio_tc_map[TC_QOPT_BITMASK + 1];
> +	__u8	hw;
> +	__u16	count[TC_QOPT_MAX_QUEUE];
> +	__u16	offset[TC_QOPT_MAX_QUEUE];
> +};

...
> +static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
> +{
> +	int i, j;
> +
> +	/* Verify num_tc is not out of max range */
> +	if (qopt->num_tc > TC_QOPT_MAX_QUEUE)

If these TC_QOPTs really couldn't be avoided you should probably check
them with BUILD_BUG_ON() but use only TC_MAX_QUEUE/TC_BITMASK
everywhere. Otherwise, it looks OK to me.

Jarek P.

^ permalink raw reply

* [rfc] Creating drivers/net/ethernet
From: Joe Perches @ 2011-01-07 21:15 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Paul Gortmaker, Jan Engelhardt, Jeffrey Kirsher
In-Reply-To: <1294360199-9860-1-git-send-email-jeffrey.t.kirsher@intel.com>

Does anyone still think moving files around in drivers/net
would be sensible and a suitable candidate for 2.6.38?

http://vger.kernel.org/netconf2010_slides/netconf-jtk.pdf

Here's what I proposed.

http://www.spinics.net/lists/netdev/msg149717.html

I agree with Jan that the current 10/100, 1000, 10000 speed
selections mechanisms are less than ideal.

Perhaps it'd be worthwhile to remove it.


^ permalink raw reply

* Re: [PATCH] forcedeth: Do not use legacy PCI power management
From: Rafael J. Wysocki @ 2011-01-07 21:12 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, LKML, Linux-pm mailing list
In-Reply-To: <201101070049.16833.rjw@sisk.pl>

On Friday, January 07, 2011, Rafael J. Wysocki wrote:
> From: Rafael J. Wysocki <rjw@sisk.pl>
> 
> The forcedeth driver uses the legacy PCI power management, so it has
> to do PCI-specific things in its ->suspend() and ->resume() callbacks
> and some of them are not done correctly.
> 
> Convert forcedeth to the new PCI power management framework and make
> it let the PCI subsystem take care of all the PCI-specific aspects of
> device handling during system power transitions.
> 
> Tested with nVidia Corporation MCP55 Ethernet (rev a2).

This version of the patch contains a mistake (nv_shutdown should be #defined
as NULL if !CONFIG_PM, not nv_resume), please disregard it.

Fixed patch is appended.

Thanks,
Rafael


---
From: Rafael J. Wysocki <rjw@sisk.pl>
Subject: forcedeth: Do not use legacy PCI power management (v2)

The forcedeth driver uses the legacy PCI power management, so it has
to do PCI-specific things in its ->suspend() and ->resume() callbacks
and some of them are not done correctly.

Convert forcedeth to the new PCI power management framework and make
it let the PCI subsystem take care of all the PCI-specific aspects of
device handling during system power transitions.

Tested with nVidia Corporation MCP55 Ethernet (rev a2).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 drivers/net/forcedeth.c |   34 ++++++++++++----------------------
 1 file changed, 12 insertions(+), 22 deletions(-)

Index: linux-2.6/drivers/net/forcedeth.c
===================================================================
--- linux-2.6.orig/drivers/net/forcedeth.c
+++ linux-2.6/drivers/net/forcedeth.c
@@ -4082,6 +4082,7 @@ static int nv_set_wol(struct net_device
 		writel(flags, base + NvRegWakeUpFlags);
 		spin_unlock_irq(&np->lock);
 	}
+	device_set_wakeup_enable(&np->pci_dev->dev, np->wolenabled);
 	return 0;
 }
 
@@ -5643,14 +5644,10 @@ static int __devinit nv_probe(struct pci
 	/* set mac address */
 	nv_copy_mac_to_hw(dev);
 
-	/* Workaround current PCI init glitch:  wakeup bits aren't
-	 * being set from PCI PM capability.
-	 */
-	device_init_wakeup(&pci_dev->dev, 1);
-
 	/* disable WOL */
 	writel(0, base + NvRegWakeUpFlags);
 	np->wolenabled = 0;
+	device_set_wakeup_enable(&pci_dev->dev, false);
 
 	if (id->driver_data & DEV_HAS_POWER_CNTRL) {
 
@@ -5923,8 +5920,9 @@ static void __devexit nv_remove(struct p
 }
 
 #ifdef CONFIG_PM
-static int nv_suspend(struct pci_dev *pdev, pm_message_t state)
+static int nv_suspend(struct device *device)
 {
+	struct pci_dev *pdev = to_pci_dev(device);
 	struct net_device *dev = pci_get_drvdata(pdev);
 	struct fe_priv *np = netdev_priv(dev);
 	u8 __iomem *base = get_hwbase(dev);
@@ -5940,25 +5938,17 @@ static int nv_suspend(struct pci_dev *pd
 	for (i = 0;i <= np->register_size/sizeof(u32); i++)
 		np->saved_config_space[i] = readl(base + i*sizeof(u32));
 
-	pci_save_state(pdev);
-	pci_enable_wake(pdev, pci_choose_state(pdev, state), np->wolenabled);
-	pci_disable_device(pdev);
-	pci_set_power_state(pdev, pci_choose_state(pdev, state));
 	return 0;
 }
 
-static int nv_resume(struct pci_dev *pdev)
+static int nv_resume(struct device *device)
 {
+	struct pci_dev *pdev = to_pci_dev(device);
 	struct net_device *dev = pci_get_drvdata(pdev);
 	struct fe_priv *np = netdev_priv(dev);
 	u8 __iomem *base = get_hwbase(dev);
 	int i, rc = 0;
 
-	pci_set_power_state(pdev, PCI_D0);
-	pci_restore_state(pdev);
-	/* ack any pending wake events, disable PME */
-	pci_enable_wake(pdev, PCI_D0, 0);
-
 	/* restore non-pci configuration space */
 	for (i = 0;i <= np->register_size/sizeof(u32); i++)
 		writel(np->saved_config_space[i], base+i*sizeof(u32));
@@ -5977,6 +5967,9 @@ static int nv_resume(struct pci_dev *pde
 	return rc;
 }
 
+static SIMPLE_DEV_PM_OPS(nv_pm_ops, nv_suspend, nv_resume);
+#define NV_PM_OPS (&nv_pm_ops)
+
 static void nv_shutdown(struct pci_dev *pdev)
 {
 	struct net_device *dev = pci_get_drvdata(pdev);
@@ -6000,15 +5993,13 @@ static void nv_shutdown(struct pci_dev *
 	 * only put the device into D3 if we really go for poweroff.
 	 */
 	if (system_state == SYSTEM_POWER_OFF) {
-		if (pci_enable_wake(pdev, PCI_D3cold, np->wolenabled))
-			pci_enable_wake(pdev, PCI_D3hot, np->wolenabled);
+		pci_wake_from_d3(pdev, np->wolenabled);
 		pci_set_power_state(pdev, PCI_D3hot);
 	}
 }
 #else
-#define nv_suspend NULL
+#define NV_PM_OPS NULL
 #define nv_shutdown NULL
-#define nv_resume NULL
 #endif /* CONFIG_PM */
 
 static DEFINE_PCI_DEVICE_TABLE(pci_tbl) = {
@@ -6180,9 +6171,8 @@ static struct pci_driver driver = {
 	.id_table	= pci_tbl,
 	.probe		= nv_probe,
 	.remove		= __devexit_p(nv_remove),
-	.suspend	= nv_suspend,
-	.resume		= nv_resume,
 	.shutdown	= nv_shutdown,
+	.driver.pm	= NV_PM_OPS,
 };
 
 static int __init init_nic(void)

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox