Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH v4 net-next-2.6] netfilter: x_tables: dont block BH while reading counters
From: Eric Dumazet @ 2011-01-08 16:45 UTC (permalink / raw)
  To: Patrick McHardy, David Miller
  Cc: Jesper Dangaard Brouer, netfilter-devel, netdev,
	Stephen Hemminger
In-Reply-To: <1292646579.7894.42.camel@edumazet-laptop>

David,

I am resending this patch, sent 3 weeks ago; Patrick gave no answer.

I believe it should be included in linux-2.6.38 and stable kernels.

Some people found they had to change NIC RX ring sizes (from 1024 to
2048) in order not to miss frames, while the root cause of the problem
was this.

Quoting Jesper : "I can now hit the system with a pktgen at 128 bytes,
and see no drops/overruns while running iptables.  (This packet load at
128bytes is 822 kpps and 840Mbit/s) (iptables ruleset is the big chains:
20929 rules: 81239)."

Thanks

[PATCH v4] netfilter: x_tables: dont block BH while reading counters

Using "iptables -L" with a lot of rules causes too high a BH latency.
Jesper mentioned ~6 ms and was worried about frame drops.

Switch to a per_cpu seqlock scheme, so that taking a snapshot of
counters doesn't need to block BH (for this cpu, but also other cpus).

This adds two increments of the seqlock sequence per ipt_do_table() call;
it's a reasonable cost for allowing "iptables -L" to not block BH
processing.

Reported-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McHardy <kaber@trash.net>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Jesper Dangaard Brouer <hawk@comx.dk>
---
 include/linux/netfilter/x_tables.h |   10 +++---
 net/ipv4/netfilter/arp_tables.c    |   45 ++++++++-------------------
 net/ipv4/netfilter/ip_tables.c     |   45 ++++++++-------------------
 net/ipv6/netfilter/ip6_tables.c    |   45 ++++++++-------------------
 net/netfilter/x_tables.c           |    3 +
 5 files changed, 49 insertions(+), 99 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 742bec0..6712e71 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -472,7 +472,7 @@ extern void xt_free_table_info(struct xt_table_info *info);
  *  necessary for reading the counters.
  */
 struct xt_info_lock {
-	spinlock_t lock;
+	seqlock_t lock;
 	unsigned char readers;
 };
 DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
@@ -497,7 +497,7 @@ static inline void xt_info_rdlock_bh(void)
 	local_bh_disable();
 	lock = &__get_cpu_var(xt_info_locks);
 	if (likely(!lock->readers++))
-		spin_lock(&lock->lock);
+		write_seqlock(&lock->lock);
 }
 
 static inline void xt_info_rdunlock_bh(void)
@@ -505,7 +505,7 @@ static inline void xt_info_rdunlock_bh(void)
 	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
 
 	if (likely(!--lock->readers))
-		spin_unlock(&lock->lock);
+		write_sequnlock(&lock->lock);
 	local_bh_enable();
 }
 
@@ -516,12 +516,12 @@ static inline void xt_info_rdunlock_bh(void)
  */
 static inline void xt_info_wrlock(unsigned int cpu)
 {
-	spin_lock(&per_cpu(xt_info_locks, cpu).lock);
+	write_seqlock(&per_cpu(xt_info_locks, cpu).lock);
 }
 
 static inline void xt_info_wrunlock(unsigned int cpu)
 {
-	spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
+	write_sequnlock(&per_cpu(xt_info_locks, cpu).lock);
 }
 
 /*
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3fac340..e855fff 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -710,42 +710,25 @@ static void get_counters(const struct xt_table_info *t,
 	struct arpt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqbegin(lock);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqretry(lock, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -759,7 +742,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	 * about).
 	 */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1007,7 +990,7 @@ static int __do_replace(struct net *net, const char *name,
 	struct arpt_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a846d63..652efea 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -884,42 +884,25 @@ get_counters(const struct xt_table_info *t,
 	struct ipt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU.
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqbegin(lock);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqretry(lock, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i; /* macro does multi eval of i */
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -932,7 +915,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1203,7 +1186,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ipt_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 4555823..7d227c6 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -897,42 +897,25 @@ get_counters(const struct xt_table_info *t,
 	struct ip6t_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqbegin(lock);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqretry(lock, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -945,7 +928,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1216,7 +1199,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ip6t_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8046350..c942376 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1325,7 +1325,8 @@ static int __init xt_init(void)
 
 	for_each_possible_cpu(i) {
 		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
-		spin_lock_init(&lock->lock);
+
+		seqlock_init(&lock->lock);
 		lock->readers = 0;
 	}
 



^ permalink raw reply related

* Re: [PATCH V8 12/13] ptp: Added a clock driver for the IXP46x.
From: Krzysztof Halasa @ 2011-01-08 16:25 UTC (permalink / raw)
  To: Richard Cochran
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	Alan Cox, Arnd Bergmann, Christoph Lameter, David Miller,
	John Stultz, Peter Zijlstra, Rodolfo Giometti, Thomas Gleixner
In-Reply-To: <20110107170752.GB8666-7KxsofuKt4IfAd9E5cN8NEzG7cXyKsk/@public.gmane.org>

Richard Cochran <richardcochran-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> writes:

> The time stamp code clones the skb, but the LE version frees the skb
> too early. Perhaps we can move that dev_kfree_skb(skb) in the LE case
> to be the last statement in eth_xmit(). What do you think?

I think so. Or something similar.

> Do you mean, you don't like the constant on the left hand side?

Yes.

> Is that prohibited by CodingStyle or similar?

I don't think so. It's just a personal taste. I think it's based on
things learned in primary school, they teach to write (comparisons)
X = 4 instead of the other way around, and my brain seems to shock
a bit on the opposite.

> I got into the habit of writing it that way to prevent a typo like:
>
> 	if (irq = NO_IRQ)

I see. Unfortunately it doesn't prevent typos like this when the right
side isn't a constant. Anyway gcc warns about them, even when both sides
are variable.

>> Also I don't like the ixp_read/ixp_write() trivial macros. Why not
>> simply call __raw_readl() and __raw_writel()?
>
> Well, I have had the experience back in 2.4 days of having my drivers
> ruined by the changing IO macros in the kernel. The wrappers are
> supposed to help if that ever happens again. Seeing *two* leading
> underscores in the macro names certainly makes me nervous.

Well, these two underscores mainly mean it's arch-dependent, but so are
the ixp4xx drivers. Using the __raw_read* directly is the preferred
method (or, perhaps, in such case, it's the only way).

Actually, I was thinking about changing the macros some time ago, and it
may eventually happen. But we'll fix all the code using them then.
-- 
Krzysztof Halasa

^ permalink raw reply

* Re: [PATCH] net_sched: factorize qdisc stats handling
From: Eric Dumazet @ 2011-01-08 15:40 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: Stephen Hemminger, Changli Gao, David Miller, Fabio Checconi,
	netdev, Luigi Rizzo
In-Reply-To: <4D286823.8050707@gmail.com>

Le samedi 08 janvier 2011 à 14:35 +0100, Jarek Poplawski a écrit :

> I guess you can't use qdisc_pkt_len() without qdisc_enqueue_root().
> 

Indeed !

This TCQ_F_CAN_BYPASS thing is truly evil ;)

Hmm, this makes me think we could add TCQ_F_CAN_BYPASS on SFQ.

Thanks !



^ permalink raw reply

* Re: [PATCH] net_sched: factorize qdisc stats handling
From: Jarek Poplawski @ 2011-01-08 13:35 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, Changli Gao, David Miller, Fabio Checconi,
	netdev, Luigi Rizzo
In-Reply-To: <1294478789.2709.79.camel@edumazet-laptop>

Eric Dumazet wrote:
...
> -static inline void __qdisc_update_bstats(struct Qdisc *sch, unsigned int len)
> +
> +static inline void bstats_update(struct gnet_stats_basic_packed *bstats,
> +				 struct sk_buff *skb)
> +{
> +	bstats->bytes += qdisc_pkt_len(skb);
> +	bstats->packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
> +}
> +
> +static inline void qdisc_bstats_update(struct Qdisc *sch, struct sk_buff *skb)
>  {
> -	sch->bstats.bytes += len;
> -	sch->bstats.packets++;
> +	bstats_update(&sch->bstats, skb);
>  }
>  
>  static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
> @@ -437,7 +444,7 @@ static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
>  {
>  	__skb_queue_tail(list, skb);
>  	sch->qstats.backlog += qdisc_pkt_len(skb);
> -	__qdisc_update_bstats(sch, qdisc_pkt_len(skb));
> +	qdisc_bstats_update(sch, skb);
>  
>  	return NET_XMIT_SUCCESS;
>  }
> diff --git a/net/core/dev.c b/net/core/dev.c
> index a215269..ab60f58 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2301,7 +2301,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
>  		 */
>  		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
>  			skb_dst_force(skb);
> -		__qdisc_update_bstats(q, skb->len);
> +		qdisc_bstats_update(q, skb);
>  		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
>  			if (unlikely(contended)) {
>  				spin_unlock(&q->busylock);

I guess you can't use qdisc_pkt_len() without qdisc_enqueue_root().

Jarek P.

^ permalink raw reply

* [e100] Page allocation failure warning(?) in 2.6.36.3
From: Chris Rankin @ 2011-01-08 12:53 UTC (permalink / raw)
  To: e1000-devel; +Cc: netdev

Hi,

I've just booted 2.6.36.3 on my old router box (which contains one single e100 card, and one dual-port e100 card), and have discovered this rather scary message in the dmesg log:

e100: Intel(R) PRO/100 Network Driver, 3.5.24-k2-NAPI
e100: Copyright(c) 1999-2006 Intel Corporation
e100 0000:00:0f.0: PME# disabled
e100 0000:00:0f.0: eth0: addr 0xffbeb000, irq 10, MAC addr 00:90:27:76:d0:ec
e100 0000:01:04.0: PME# disabled
e100 0000:01:04.0: eth1: addr 0xff0fe000, irq 11, MAC addr 00:03:47:3b:29:5c
e100 0000:01:05.0: PME# disabled
e100 0000:01:05.0: eth2: addr 0xff0ff000, irq 10, MAC addr 00:03:47:3b:29:5d
...
device eth1 entered promiscuous mode
ADDRCONF(NETDEV_UP): eth1: link is not ready
e100 0000:01:04.0: eth1: NIC Link is Up 100 Mbps Full Duplex
ADDRCONF(NETDEV_CHANGE): eth1: link becomes ready
device eth2 entered promiscuous mode
ADDRCONF(NETDEV_UP): eth2: link is not ready
br0: port 1(eth1) entering learning state
br0: port 1(eth1) entering learning state
ifconfig: page allocation failure. order:6, mode:0x8020
Pid: 3716, comm: ifconfig Not tainted 2.6.36.3 #1
Call Trace:
 [<c104b2a9>] ? __alloc_pages_nodemask+0x477/0x4a6
 [<c106177d>] ? __slab_alloc+0x1eb/0x396
 [<c1004ca6>] ? dma_generic_alloc_coherent+0x4e/0xac
 [<c105fb5c>] ? dma_pool_alloc+0xe5/0x1d9
 [<c1004c58>] ? dma_generic_alloc_coherent+0x0/0xac
 [<c58f97f3>] ? e100_rx_alloc_skb+0x87/0x122 [e100]
 [<c58f9883>] ? e100_rx_alloc_skb+0x117/0x122 [e100]
 [<c58f98dc>] ? e100_alloc_cbs+0x4e/0xfa [e100]
 [<c58fb370>] ? e100_up+0x1b/0xf1 [e100]
 [<c58fb45d>] ? e100_open+0x17/0x3b [e100]
 [<c1121630>] ? __dev_open+0x7c/0xa0
 [<c11217ed>] ? __dev_change_flags+0x8b/0x100
 [<c11218c3>] ? dev_change_flags+0x10/0x3b
 [<c1159880>] ? devinet_ioctl+0x25a/0x532
 [<c11146d2>] ? sock_ioctl+0x1a8/0x1ca
 [<c111452a>] ? sock_ioctl+0x0/0x1ca
 [<c106e061>] ? do_vfs_ioctl+0x464/0x4a2
 [<c1014ce0>] ? do_page_fault+0x2d2/0x2ea
 [<c1014cc8>] ? do_page_fault+0x2ba/0x2ea
 [<c10636f6>] ? sys_faccessat+0x144/0x151
 [<c106e0cc>] ? sys_ioctl+0x2d/0x49
 [<c1177dd5>] ? syscall_call+0x7/0xb
Mem-Info:
DMA per-cpu:
CPU    0: hi:    0, btch:   1 usd:   0
Normal per-cpu:
CPU    0: hi:    6, btch:   1 usd:   0
active_anon:280 inactive_anon:808 isolated_anon:0
 active_file:1384 inactive_file:9672 isolated_file:0
 unevictable:0 dirty:77 writeback:0 unstable:0
 free:753 slab_reclaimable:499 slab_unreclaimable:1393
 mapped:375 shmem:643 pagetables:59 bounce:0
DMA free:1492kB min:248kB low:308kB high:372kB active_anon:0kB inactive_anon:12kB active_file:288kB inactive_file:12548kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15864kB mlocked:0kB dirty:48kB writeback:0kB mapped:28kB shmem:0kB slab_reclaimable:312kB slab_unreclaimable:884kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 47 47
Normal free:1520kB min:764kB low:952kB high:1144kB active_anon:1120kB inactive_anon:3220kB active_file:5248kB inactive_file:26140kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:48768kB mlocked:0kB dirty:260kB writeback:0kB mapped:1472kB shmem:2572kB slab_reclaimable:1684kB slab_unreclaimable:4688kB kernel_stack:176kB pagetables:236kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0 0
DMA: 51*4kB 23*8kB 3*16kB 3*32kB 7*64kB 4*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 1492kB
Normal: 260*4kB 26*8kB 1*16kB 0*32kB 0*64kB 0*128kB 1*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 1520kB
11715 total pagecache pages
16 pages in swap cache
Swap cache stats: add 44, delete 28, find 72/72
Free swap  = 2179532kB
Total swap = 2179596kB
16383 pages RAM
826 pages reserved
10736 pages shared
5361 pages non-shared
ADDRCONF(NETDEV_UP): eth0: link is not ready
e100 0000:00:0f.0: eth0: NIC Link is Up 100 Mbps Full Duplex
ADDRCONF(NETDEV_CHANGE): eth0: link becomes ready
br0: port 1(eth1) entering forwarding state

Should I be concerned, please? All three e100 devices still appear to be working, but something nasty seems to have happened anyway.

The lspci output for these devices is:
00:0f.0 0200: 8086:1229 (rev 08)
	Subsystem: 8086:000c
	Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV+ VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
	Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
	Latency: 72 (2000ns min, 14000ns max), Cache Line Size: 32 bytes
	Interrupt: pin A routed to IRQ 10
	Region 0: Memory at ffbeb000 (32-bit, non-prefetchable) [size=4K]
	Region 1: I/O ports at ef00 [size=64]
	Region 2: Memory at fef00000 (32-bit, non-prefetchable) [size=1M]
	[virtual] Expansion ROM at 04000000 [disabled] [size=1M]
	Capabilities: [dc] Power Management version 2
		Flags: PMEClk- DSI+ D1+ D2+ AuxCurrent=0mA PME(D0+,D1+,D2+,D3hot+,D3cold-)
		Status: D0 PME-Enable- DSel=0 DScale=2 PME-
	Kernel driver in use: e100
	Kernel modules: e100

01:04.0 0200: 8086:1229 (rev 05)
	Subsystem: 8086:10f0
	Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV+ VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
	Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
	Latency: 72 (2000ns min, 14000ns max), Cache Line Size: 32 bytes
	Interrupt: pin A routed to IRQ 11
	Region 0: Memory at ff0fe000 (32-bit, prefetchable) [size=4K]
	Region 1: I/O ports at fcc0 [size=32]
	Region 2: Memory at ff700000 (32-bit, non-prefetchable) [size=1M]
	[virtual] Expansion ROM at ff100000 [disabled] [size=1M]
	Capabilities: [dc] Power Management version 1
		Flags: PMEClk- DSI+ D1+ D2+ AuxCurrent=0mA PME(D0+,D1+,D2+,D3hot+,D3cold-)
		Status: D0 PME-Enable- DSel=0 DScale=0 PME-
	Kernel driver in use: e100
	Kernel modules: e100

01:05.0 0200: 8086:1229 (rev 05)
	Subsystem: 8086:10f0
	Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV+ VGASnoop- ParErr- Stepping- SERR+ FastB2B- DisINTx-
	Status: Cap+ 66MHz- UDF- FastB2B+ ParErr- DEVSEL=medium >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
	Latency: 72 (2000ns min, 14000ns max), Cache Line Size: 32 bytes
	Interrupt: pin A routed to IRQ 10
	Region 0: Memory at ff0ff000 (32-bit, prefetchable) [size=4K]
	Region 1: I/O ports at fce0 [size=32]
	Region 2: Memory at ff900000 (32-bit, non-prefetchable) [size=1M]
	[virtual] Expansion ROM at ff200000 [disabled] [size=1M]
	Capabilities: [dc] Power Management version 1
		Flags: PMEClk- DSI+ D1+ D2+ AuxCurrent=0mA PME(D0+,D1+,D2+,D3hot+,D3cold-)
		Status: D0 PME-Enable- DSel=0 DScale=0 PME-
	Kernel driver in use: e100
	Kernel modules: e100

Thanks,
Chris


      

------------------------------------------------------------------------------
Gaining the trust of online customers is vital for the success of any company
that requires sensitive data to be transmitted over the Web.   Learn how to 
best implement a security strategy that keeps consumers' information secure 
and instills the confidence they need to proceed with transactions.
http://p.sf.net/sfu/oracle-sfdevnl 
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel® Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply

* Re: [GIT] Networking
From: Francois Romieu @ 2011-01-08 12:17 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ben Hutchings, David Miller, Hayes Wang, David Woodhouse, akpm,
	netdev, linux-kernel
In-Reply-To: <20110108000901.GA2133@electric-eye.fr.zoreil.com>

r8169: delay phy init until device opens.

It works around the 60s firmware load failure timeout in the
non-modular case.

Signed-off-by: Francois Romieu <romieu@fr.zoreil.com>

diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 27a7c20..dd758cd 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -3069,15 +3069,6 @@ rtl8169_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 		rtl8168_driver_start(tp);
 	}
 
-	rtl8169_init_phy(dev, tp);
-
-	/*
-	 * Pretend we are using VLANs; This bypasses a nasty bug where
-	 * Interrupts stop flowing on high load on 8110SCd controllers.
-	 */
-	if (tp->mac_version == RTL_GIGA_MAC_VER_05)
-		RTL_W16(CPlusCmd, RTL_R16(CPlusCmd) | RxVlan);
-
 	device_set_wakeup_enable(&pdev->dev, tp->features & RTL_FEATURE_WOL);
 
 	if (pci_dev_run_wake(pdev))
@@ -3127,6 +3118,7 @@ static void __devexit rtl8169_remove_one(struct pci_dev *pdev)
 static int rtl8169_open(struct net_device *dev)
 {
 	struct rtl8169_private *tp = netdev_priv(dev);
+	void __iomem *ioaddr = tp->mmio_addr;
 	struct pci_dev *pdev = tp->pci_dev;
 	int retval = -ENOMEM;
 
@@ -3162,6 +3154,15 @@ static int rtl8169_open(struct net_device *dev)
 
 	napi_enable(&tp->napi);
 
+	rtl8169_init_phy(dev, tp);
+
+	/*
+	 * Pretend we are using VLANs; This bypasses a nasty bug where
+	 * Interrupts stop flowing on high load on 8110SCd controllers.
+	 */
+	if (tp->mac_version == RTL_GIGA_MAC_VER_05)
+		RTL_W16(CPlusCmd, RTL_R16(CPlusCmd) | RxVlan);
+
 	rtl_pll_power_up(tp);
 
 	rtl_hw_start(dev);
@@ -3171,7 +3172,7 @@ static int rtl8169_open(struct net_device *dev)
 	tp->saved_wolopts = 0;
 	pm_runtime_put_noidle(&pdev->dev);
 
-	rtl8169_check_link_status(dev, tp, tp->mmio_addr);
+	rtl8169_check_link_status(dev, tp, ioaddr);
 out:
 	return retval;
 

^ permalink raw reply related

* Re: [PATCH v2] net: ppp: use {get,put}_unaligned_be{16,32}
From: Jarek Poplawski @ 2011-01-08 10:33 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, xiaosuo, paulus, harvey.harrison, linux-ppp, netdev
In-Reply-To: <1294482005.2709.90.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le samedi 08 janvier 2011 à 11:04 +0100, Jarek Poplawski a écrit :
> 
>> Just for the record: I agree with Paul that current code is more readable.
>> This code still requires thinking about specific bytes and the patch mixes
>> it only with word access.
>>
>> Jarek P.
>>
>>> @@ -395,16 +396,14 @@ mppe_compress(void *arg, unsigned char *ibuf, unsigned char *obuf,
>>>  	 */
>>>  	obuf[0] = PPP_ADDRESS(ibuf);
>>>  	obuf[1] = PPP_CONTROL(ibuf);
>>> -	obuf[2] = PPP_COMP >> 8;	/* isize + MPPE_OVHD + 1 */
>>> -	obuf[3] = PPP_COMP;	/* isize + MPPE_OVHD + 2 */
>>> +	put_unaligned_be16(PPP_COMP, obuf + 2);
>>>  	obuf += PPP_HDRLEN;
> 
> Compilers are stupid not generating optimal code, so we should help them
> a bit.
> 
> Yes, I agree this is ugly Jarek and makes reading of this code a bit
> more complex, but this is a move we cannot stop. Number of functions,
> macros, etc... is exploding and we must follow the trend ;)
> 
> 41 c6 44 24 02 00       movb   $0x0,0x2(%r12)
> 41 c6 44 24 03 fd       movb   $0xfd,0x3(%r12)
> 
> After patch :
> 
> 66 41 c7 44 24 02 00 fd   movw   $0xfd00,0x2(%r12)

And that's why Paul wanted more justification, because readability
gain is questionable.

Jarek P.

^ permalink raw reply

* Re: [PATCH v2] net: ppp: use {get,put}_unaligned_be{16,32}
From: Eric Dumazet @ 2011-01-08 10:20 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: David Miller, xiaosuo, paulus, harvey.harrison, linux-ppp, netdev
In-Reply-To: <4D2836BB.6000600@gmail.com>

Le samedi 08 janvier 2011 à 11:04 +0100, Jarek Poplawski a écrit :

> Just for the record: I agree with Paul that current code is more readable.
> This code still requires thinking about specific bytes and the patch mixes
> it only with word access.
> 
> Jarek P.
> 
> > @@ -395,16 +396,14 @@ mppe_compress(void *arg, unsigned char *ibuf, unsigned char *obuf,
> >  	 */
> >  	obuf[0] = PPP_ADDRESS(ibuf);
> >  	obuf[1] = PPP_CONTROL(ibuf);
> > -	obuf[2] = PPP_COMP >> 8;	/* isize + MPPE_OVHD + 1 */
> > -	obuf[3] = PPP_COMP;	/* isize + MPPE_OVHD + 2 */
> > +	put_unaligned_be16(PPP_COMP, obuf + 2);
> >  	obuf += PPP_HDRLEN;

Compilers are stupid not generating optimal code, so we should help them
a bit.

Yes, I agree this is ugly Jarek and makes reading of this code a bit
more complex, but this is a move we cannot stop. Number of functions,
macros, etc... is exploding and we must follow the trend ;)

41 c6 44 24 02 00       movb   $0x0,0x2(%r12)
41 c6 44 24 03 fd       movb   $0xfd,0x3(%r12)

After patch :

66 41 c7 44 24 02 00 fd   movw   $0xfd00,0x2(%r12)




^ permalink raw reply

* Re: [PATCH v2] net: ppp: use {get,put}_unaligned_be{16,32}
From: Jarek Poplawski @ 2011-01-08 10:04 UTC (permalink / raw)
  To: David Miller; +Cc: xiaosuo, paulus, harvey.harrison, linux-ppp, netdev
In-Reply-To: <20110107.171534.193718114.davem@davemloft.net>

David Miller wrote:
> From: Changli Gao <xiaosuo@gmail.com>
> Date: Sat, 8 Jan 2011 08:43:01 +0800
> 
>> On Fri, Jan 7, 2011 at 11:01 AM, Paul Mackerras <paulus@samba.org> wrote:
>>> On Fri, Jan 07, 2011 at 07:37:36AM +0800, Changli Gao wrote:
>>>
>>>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>>>
>>> This patch description is inadequate.  It should tell us why you are
>>> making this change.  Does it result in smaller and/or faster code, and
>>> if so by how much on what sort of machine?  Do you think it makes the
>>> code clearer?  (I don't.)  Or is there some other motivation for this?
>>>
>>
>> Good designed APIs always make code clearer, smaller and faster. It is
>> obvious enough I think.
> 
> I have to say that every time I go read the header parsing code in the
> PPP driver, I absolutely regret it.
> 
> And Changli's patch fixes some of the readability problems.

Just for the record: I agree with Paul that current code is more readable.
This code still requires thinking about specific bytes and the patch mixes
it only with word access.

Jarek P.

> @@ -395,16 +396,14 @@ mppe_compress(void *arg, unsigned char *ibuf, unsigned char *obuf,
>  	 */
>  	obuf[0] = PPP_ADDRESS(ibuf);
>  	obuf[1] = PPP_CONTROL(ibuf);
> -	obuf[2] = PPP_COMP >> 8;	/* isize + MPPE_OVHD + 1 */
> -	obuf[3] = PPP_COMP;	/* isize + MPPE_OVHD + 2 */
> +	put_unaligned_be16(PPP_COMP, obuf + 2);
>  	obuf += PPP_HDRLEN;


^ permalink raw reply

* [PATCH] net_sched: factorize qdisc stats handling
From: Eric Dumazet @ 2011-01-08  9:26 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Changli Gao, David Miller, Fabio Checconi, netdev, Luigi Rizzo
In-Reply-To: <20110107200234.3f5e7ff8@nehalam>

Le vendredi 07 janvier 2011 à 20:02 -0800, Stephen Hemminger a écrit :
> On Sat, 8 Jan 2011 10:56:33 +0800
> Changli Gao <xiaosuo@gmail.com> wrote:
> 
> > > +       cl->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;  
> > 
> > Hmm, there is no other packets schedulers which account packets in
> > this way. Which one is better? I am not sure. And in this patch,
> > qstats.drops isn't maintained in the same way. Would these two be
> > consistent.
> 
> HTB uses this accounting.

Yes, but we should use generic helpers and avoid duplicating this kind
of magic here and there ;)


[PATCH] net_sched: factorize qdisc stats handling

HTB takes into account that an skb may be segmented (GSO) when updating
its stats. Generalize this to all schedulers.

They should use the qdisc_bstats_update() helper instead of manipulating
bstats.bytes and bstats.packets directly.

Add bstats_update() helper too for classes that use
gnet_stats_basic_packed fields.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/sch_generic.h |   15 +++++++++++----
 net/core/dev.c            |    2 +-
 net/sched/act_csum.c      |    3 +--
 net/sched/act_ipt.c       |    3 +--
 net/sched/act_mirred.c    |    3 +--
 net/sched/act_nat.c       |    3 +--
 net/sched/act_pedit.c     |    3 +--
 net/sched/act_police.c    |    3 +--
 net/sched/act_simple.c    |    3 +--
 net/sched/act_skbedit.c   |    3 +--
 net/sched/sch_atm.c       |    6 ++----
 net/sched/sch_cbq.c       |    6 ++----
 net/sched/sch_drr.c       |    8 ++------
 net/sched/sch_dsmark.c    |    3 +--
 net/sched/sch_hfsc.c      |    6 ++----
 net/sched/sch_htb.c       |   17 ++++++-----------
 net/sched/sch_ingress.c   |    3 +--
 net/sched/sch_multiq.c    |    3 +--
 net/sched/sch_netem.c     |    6 ++----
 net/sched/sch_prio.c      |    3 +--
 net/sched/sch_red.c       |    3 +--
 net/sched/sch_sfq.c       |    3 +--
 net/sched/sch_tbf.c       |    3 +--
 net/sched/sch_teql.c      |    3 +--
 24 files changed, 44 insertions(+), 70 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 0af57eb..389bbcb 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -426,10 +426,17 @@ static inline int qdisc_enqueue_root(struct sk_buff *skb, struct Qdisc *sch)
 	return qdisc_enqueue(skb, sch) & NET_XMIT_MASK;
 }
 
-static inline void __qdisc_update_bstats(struct Qdisc *sch, unsigned int len)
+
+static inline void bstats_update(struct gnet_stats_basic_packed *bstats,
+				 struct sk_buff *skb)
+{
+	bstats->bytes += qdisc_pkt_len(skb);
+	bstats->packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1;
+}
+
+static inline void qdisc_bstats_update(struct Qdisc *sch, struct sk_buff *skb)
 {
-	sch->bstats.bytes += len;
-	sch->bstats.packets++;
+	bstats_update(&sch->bstats, skb);
 }
 
 static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
@@ -437,7 +444,7 @@ static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch,
 {
 	__skb_queue_tail(list, skb);
 	sch->qstats.backlog += qdisc_pkt_len(skb);
-	__qdisc_update_bstats(sch, qdisc_pkt_len(skb));
+	qdisc_bstats_update(sch, skb);
 
 	return NET_XMIT_SUCCESS;
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index a215269..ab60f58 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2301,7 +2301,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		 */
 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
 			skb_dst_force(skb);
-		__qdisc_update_bstats(q, skb->len);
+		qdisc_bstats_update(q, skb);
 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
 			if (unlikely(contended)) {
 				spin_unlock(&q->busylock);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 67dc7ce..83ddfc0 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -508,8 +508,7 @@ static int tcf_csum(struct sk_buff *skb,
 
 	spin_lock(&p->tcf_lock);
 	p->tcf_tm.lastuse = jiffies;
-	p->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	p->tcf_bstats.packets++;
+	bstats_update(&p->tcf_bstats, skb);
 	action = p->tcf_action;
 	update_flags = p->update_flags;
 	spin_unlock(&p->tcf_lock);
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 8daef96..c2a7c20 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -209,8 +209,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
 	spin_lock(&ipt->tcf_lock);
 
 	ipt->tcf_tm.lastuse = jiffies;
-	ipt->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	ipt->tcf_bstats.packets++;
+	bstats_update(&ipt->tcf_bstats, skb);
 
 	/* yes, we have to worry about both in and out dev
 	 worry later - danger - this API seems to have changed
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 0c311be..d765067 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -165,8 +165,7 @@ static int tcf_mirred(struct sk_buff *skb, struct tc_action *a,
 
 	spin_lock(&m->tcf_lock);
 	m->tcf_tm.lastuse = jiffies;
-	m->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	m->tcf_bstats.packets++;
+	bstats_update(&m->tcf_bstats, skb);
 
 	dev = m->tcfm_dev;
 	if (!dev) {
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 186eb83..178a4bd 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -125,8 +125,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a,
 	egress = p->flags & TCA_NAT_FLAG_EGRESS;
 	action = p->tcf_action;
 
-	p->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	p->tcf_bstats.packets++;
+	bstats_update(&p->tcf_bstats, skb);
 
 	spin_unlock(&p->tcf_lock);
 
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index a0593c9..445bef7 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -187,8 +187,7 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
 bad:
 	p->tcf_qstats.overlimits++;
 done:
-	p->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	p->tcf_bstats.packets++;
+	bstats_update(&p->tcf_bstats, skb);
 	spin_unlock(&p->tcf_lock);
 	return p->tcf_action;
 }
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 7ebf743..e2f08b1 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -298,8 +298,7 @@ static int tcf_act_police(struct sk_buff *skb, struct tc_action *a,
 
 	spin_lock(&police->tcf_lock);
 
-	police->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	police->tcf_bstats.packets++;
+	bstats_update(&police->tcf_bstats, skb);
 
 	if (police->tcfp_ewma_rate &&
 	    police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 97e84f3..7287cff 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -42,8 +42,7 @@ static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result
 
 	spin_lock(&d->tcf_lock);
 	d->tcf_tm.lastuse = jiffies;
-	d->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	d->tcf_bstats.packets++;
+	bstats_update(&d->tcf_bstats, skb);
 
 	/* print policy string followed by _ then packet count
 	 * Example if this was the 3rd packet and the string was "hello"
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 66cbf4e..836f5fe 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -46,8 +46,7 @@ static int tcf_skbedit(struct sk_buff *skb, struct tc_action *a,
 
 	spin_lock(&d->tcf_lock);
 	d->tcf_tm.lastuse = jiffies;
-	d->tcf_bstats.bytes += qdisc_pkt_len(skb);
-	d->tcf_bstats.packets++;
+	bstats_update(&d->tcf_bstats, skb);
 
 	if (d->flags & SKBEDIT_F_PRIORITY)
 		skb->priority = d->priority;
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 2825407..943d733 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -422,10 +422,8 @@ drop: __maybe_unused
 		}
 		return ret;
 	}
-	sch->bstats.bytes += qdisc_pkt_len(skb);
-	sch->bstats.packets++;
-	flow->bstats.bytes += qdisc_pkt_len(skb);
-	flow->bstats.packets++;
+	qdisc_bstats_update(sch, skb);
+	bstats_update(&flow->bstats, skb);
 	/*
 	 * Okay, this may seem weird. We pretend we've dropped the packet if
 	 * it goes via ATM. The reason for this is that the outer qdisc
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index eb76315..c80d1c2 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -390,8 +390,7 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	ret = qdisc_enqueue(skb, cl->q);
 	if (ret == NET_XMIT_SUCCESS) {
 		sch->q.qlen++;
-		sch->bstats.packets++;
-		sch->bstats.bytes += qdisc_pkt_len(skb);
+		qdisc_bstats_update(sch, skb);
 		cbq_mark_toplevel(q, cl);
 		if (!cl->next_alive)
 			cbq_activate_class(cl);
@@ -650,8 +649,7 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
 		ret = qdisc_enqueue(skb, cl->q);
 		if (ret == NET_XMIT_SUCCESS) {
 			sch->q.qlen++;
-			sch->bstats.packets++;
-			sch->bstats.bytes += qdisc_pkt_len(skb);
+			qdisc_bstats_update(sch, skb);
 			if (!cl->next_alive)
 				cbq_activate_class(cl);
 			return 0;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index aa8b531..de55e64 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -351,7 +351,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct drr_sched *q = qdisc_priv(sch);
 	struct drr_class *cl;
-	unsigned int len;
 	int err;
 
 	cl = drr_classify(skb, sch, &err);
@@ -362,7 +361,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return err;
 	}
 
-	len = qdisc_pkt_len(skb);
 	err = qdisc_enqueue(skb, cl->qdisc);
 	if (unlikely(err != NET_XMIT_SUCCESS)) {
 		if (net_xmit_drop_count(err)) {
@@ -377,10 +375,8 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		cl->deficit = cl->quantum;
 	}
 
-	cl->bstats.packets++;
-	cl->bstats.bytes += len;
-	sch->bstats.packets++;
-	sch->bstats.bytes += len;
+	bstats_update(&cl->bstats, skb);
+	qdisc_bstats_update(sch, skb);
 
 	sch->q.qlen++;
 	return err;
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 1d295d6..60f4bdd 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -260,8 +260,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		return err;
 	}
 
-	sch->bstats.bytes += qdisc_pkt_len(skb);
-	sch->bstats.packets++;
+	qdisc_bstats_update(sch, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 069c62b..2e45791 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1599,10 +1599,8 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 	if (cl->qdisc->q.qlen == 1)
 		set_active(cl, qdisc_pkt_len(skb));
 
-	cl->bstats.packets++;
-	cl->bstats.bytes += qdisc_pkt_len(skb);
-	sch->bstats.packets++;
-	sch->bstats.bytes += qdisc_pkt_len(skb);
+	bstats_update(&cl->bstats, skb);
+	qdisc_bstats_update(sch, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 01b519d..984c1b0 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -569,15 +569,12 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		}
 		return ret;
 	} else {
-		cl->bstats.packets +=
-			skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
-		cl->bstats.bytes += qdisc_pkt_len(skb);
+		bstats_update(&cl->bstats, skb);
 		htb_activate(q, cl);
 	}
 
 	sch->q.qlen++;
-	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
-	sch->bstats.bytes += qdisc_pkt_len(skb);
+	qdisc_bstats_update(sch, skb);
 	return NET_XMIT_SUCCESS;
 }
 
@@ -648,12 +645,10 @@ static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
 				htb_add_to_wait_tree(q, cl, diff);
 		}
 
-		/* update byte stats except for leaves which are already updated */
-		if (cl->level) {
-			cl->bstats.bytes += bytes;
-			cl->bstats.packets += skb_is_gso(skb)?
-					skb_shinfo(skb)->gso_segs:1;
-		}
+		/* update basic stats except for leaves which are already updated */
+		if (cl->level)
+			bstats_update(&cl->bstats, skb);
+
 		cl = cl->parent;
 	}
 }
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index f10e34a..bce1665 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -63,8 +63,7 @@ static int ingress_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	result = tc_classify(skb, p->filter_list, &res);
 
-	sch->bstats.packets++;
-	sch->bstats.bytes += qdisc_pkt_len(skb);
+	qdisc_bstats_update(sch, skb);
 	switch (result) {
 	case TC_ACT_SHOT:
 		result = TC_ACT_SHOT;
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 32690de..21f13da 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -83,8 +83,7 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	ret = qdisc_enqueue(skb, qdisc);
 	if (ret == NET_XMIT_SUCCESS) {
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, skb);
 		sch->q.qlen++;
 		return NET_XMIT_SUCCESS;
 	}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index e5593c0..1c4bce8 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -240,8 +240,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	if (likely(ret == NET_XMIT_SUCCESS)) {
 		sch->q.qlen++;
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, skb);
 	} else if (net_xmit_drop_count(ret)) {
 		sch->qstats.drops++;
 	}
@@ -477,8 +476,7 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 		__skb_queue_after(list, skb, nskb);
 
 		sch->qstats.backlog += qdisc_pkt_len(nskb);
-		sch->bstats.bytes += qdisc_pkt_len(nskb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, nskb);
 
 		return NET_XMIT_SUCCESS;
 	}
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index b1c95bc..966158d 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -84,8 +84,7 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 
 	ret = qdisc_enqueue(skb, qdisc);
 	if (ret == NET_XMIT_SUCCESS) {
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, skb);
 		sch->q.qlen++;
 		return NET_XMIT_SUCCESS;
 	}
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index a67ba3c..a6009c5 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -94,8 +94,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 
 	ret = qdisc_enqueue(skb, child);
 	if (likely(ret == NET_XMIT_SUCCESS)) {
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, skb);
 		sch->q.qlen++;
 	} else if (net_xmit_drop_count(ret)) {
 		q->stats.pdrop++;
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index d54ac94..239ec53 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -403,8 +403,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 		slot->allot = q->scaled_quantum;
 	}
 	if (++sch->q.qlen <= q->limit) {
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, skb);
 		return NET_XMIT_SUCCESS;
 	}
 
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index 641a30d..77565e7 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -134,8 +134,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 	}
 
 	sch->q.qlen++;
-	sch->bstats.bytes += qdisc_pkt_len(skb);
-	sch->bstats.packets++;
+	qdisc_bstats_update(sch, skb);
 	return NET_XMIT_SUCCESS;
 }
 
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index 106479a..af9360d 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -83,8 +83,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 
 	if (q->q.qlen < dev->tx_queue_len) {
 		__skb_queue_tail(&q->q, skb);
-		sch->bstats.bytes += qdisc_pkt_len(skb);
-		sch->bstats.packets++;
+		qdisc_bstats_update(sch, skb);
 		return NET_XMIT_SUCCESS;
 	}
 



^ permalink raw reply related

* Re: [PATCH net-next-2.6 v3 1/1] can: c_can: Added support for Bosch C_CAN controller
From: Wolfgang Grandegger @ 2011-01-08  9:09 UTC (permalink / raw)
  To: Bhupesh Sharma
  Cc: Socketcan-core-0fE9KPoRgkgATYTw5x5z8w,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1294135195-9448-1-git-send-email-bhupesh.sharma-qxv4g6HH51o@public.gmane.org>

Hi Bhupesh,

the patch already looks quite good. Just a few more issues...

On 01/04/2011 10:59 AM, Bhupesh Sharma wrote:
> Bosch C_CAN controller is a full-CAN implementation which is compliant
> to CAN protocol version 2.0 part A and B. Bosch C_CAN user manual can be
> obtained from:
> http://www.semiconductors.bosch.de/pdf/Users_Manual_C_CAN.pdf
> 
> This patch adds the support for this controller.
> The following are the design choices made while writing the controller
> driver:
> 1. Only the Interface Register set IF1 is used in the current design.
> 2. Out of the 32 Message objects available, 16 are kept aside for RX
>    purposes and the rest for TX purposes.
> 3. NAPI implementation is such that both the TX and RX paths function
>    in polling mode.
> 
> Changes since V2:
> 1. Separately implemented a bus independent interface "c_can.c" and
>    a bus sensitive driver "c_can_platform.c". The bus sensitive driver
>    essentially caters to the details of registers mapping/arch differences
>    found on different SoCs.
> 2. Changed RX poll method to allow *in-order packet reception*.
> 3. Implemented LEC (last error code) as an enum.
> 4. Implemented CAN_CTRLMODE_BERR_REPORTING.
> 5. Corrected "quota" handling in RX poll routine.
> 6. Implemented and used priv->can.do_get_berr_counter.
> 7. Improved timeout-handling while programming IF command request
>    register.
> 8. Corrected register offset typecasting to allow the same to work on
>    64-bit systems.
> 
> Signed-off-by: Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>
> ---
>  drivers/net/can/Kconfig                |    2 +
>  drivers/net/can/Makefile               |    1 +
>  drivers/net/can/c_can/Kconfig          |   15 +
>  drivers/net/can/c_can/Makefile         |    8 +
>  drivers/net/can/c_can/c_can.c          |  960 ++++++++++++++++++++++++++++++++
>  drivers/net/can/c_can/c_can.h          |  235 ++++++++
>  drivers/net/can/c_can/c_can_platform.c |  210 +++++++
>  7 files changed, 1431 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/net/can/c_can/Kconfig
>  create mode 100644 drivers/net/can/c_can/Makefile
>  create mode 100644 drivers/net/can/c_can/c_can.c
>  create mode 100644 drivers/net/can/c_can/c_can.h
>  create mode 100644 drivers/net/can/c_can/c_can_platform.c
> 
> diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig
> index 9d9e453..50549b5 100644
> --- a/drivers/net/can/Kconfig
> +++ b/drivers/net/can/Kconfig
> @@ -86,6 +86,8 @@ source "drivers/net/can/mscan/Kconfig"
>  
>  source "drivers/net/can/sja1000/Kconfig"
>  
> +source "drivers/net/can/c_can/Kconfig"
> +
>  source "drivers/net/can/usb/Kconfig"
>  
>  config CAN_DEBUG_DEVICES
> diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile
> index 0057537..c3efeb3 100644
> --- a/drivers/net/can/Makefile
> +++ b/drivers/net/can/Makefile
> @@ -11,6 +11,7 @@ obj-y				+= usb/
>  
>  obj-$(CONFIG_CAN_SJA1000)	+= sja1000/
>  obj-$(CONFIG_CAN_MSCAN)		+= mscan/
> +obj-$(CONFIG_CAN_C_CAN)		+= c_can/
>  obj-$(CONFIG_CAN_AT91)		+= at91_can.o
>  obj-$(CONFIG_CAN_TI_HECC)	+= ti_hecc.o
>  obj-$(CONFIG_CAN_MCP251X)	+= mcp251x.o
> diff --git a/drivers/net/can/c_can/Kconfig b/drivers/net/can/c_can/Kconfig
> new file mode 100644
> index 0000000..ffb9773
> --- /dev/null
> +++ b/drivers/net/can/c_can/Kconfig
> @@ -0,0 +1,15 @@
> +menuconfig CAN_C_CAN
> +	tristate "Bosch C_CAN devices"
> +	depends on CAN_DEV && HAS_IOMEM
> +
> +if CAN_C_CAN
> +
> +config CAN_C_CAN_PLATFORM
> +	tristate "Generic Platform Bus based C_CAN driver"
> +	---help---
> +	  This driver adds support for the C_CAN chips connected to
> +	  the "platform bus" (Linux abstraction for directly to the
> +	  processor attached devices) which can be found on various
> +	  boards from ST Microelectronics (http://www.st.com)
> +	  like the SPEAr1310 and SPEAr320 evaluation boards.
> +endif

> diff --git a/drivers/net/can/c_can/Makefile b/drivers/net/can/c_can/Makefile
> new file mode 100644
> index 0000000..9273f6d
> --- /dev/null
> +++ b/drivers/net/can/c_can/Makefile
> @@ -0,0 +1,8 @@
> +#
> +#  Makefile for the Bosch C_CAN controller drivers.
> +#
> +
> +obj-$(CONFIG_CAN_C_CAN) += c_can.o
> +obj-$(CONFIG_CAN_C_CAN_PLATFORM) += c_can_platform.o
> +
> +ccflags-$(CONFIG_CAN_DEBUG_DEVICES) := -DDEBUG
> diff --git a/drivers/net/can/c_can/c_can.c b/drivers/net/can/c_can/c_can.c
> new file mode 100644
> index 0000000..206e650
> --- /dev/null
> +++ b/drivers/net/can/c_can/c_can.c
> @@ -0,0 +1,960 @@
> +/*
> + * CAN bus driver for Bosch C_CAN controller
> + *
> + * Copyright (C) 2010 ST Microelectronics
> + * Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>
> + *
> + * Borrowed heavily from the C_CAN driver originally written by:
> + * Copyright (C) 2007
> + * - Sascha Hauer, Marc Kleine-Budde, Pengutronix <s.hauer-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
> + * - Simon Kallweit, intefo AG <simon.kallweit-+G9qxTFKJT/tRgLqZ5aouw@public.gmane.org>
> + *
> + * TX and RX NAPI implementation has been borrowed from at91 CAN driver
> + * written by:
> + * Copyright
> + * (C) 2007 by Hans J. Koch <hjk-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org>
> + * (C) 2008, 2009 by Marc Kleine-Budde <kernel-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
> + *
> + * Bosch C_CAN controller is compliant to CAN protocol version 2.0 part A and B.
> + * Bosch C_CAN user manual can be obtained from:
> + * http://www.semiconductors.bosch.de/pdf/Users_Manual_C_CAN.pdf

Unfortunately, this link is not valid any more.

> + *
> + * This file is licensed under the terms of the GNU General Public
> + * License version 2. This program is licensed "as is" without any
> + * warranty of any kind, whether express or implied.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/version.h>
> +#include <linux/module.h>
> +#include <linux/interrupt.h>
> +#include <linux/delay.h>
> +#include <linux/netdevice.h>
> +#include <linux/if_arp.h>
> +#include <linux/if_ether.h>
> +#include <linux/list.h>
> +#include <linux/delay.h>
> +#include <linux/workqueue.h>

Do you need that include?

> +#include <linux/io.h>
> +#include <linux/platform_device.h>
> +#include <linux/clk.h>

...and the two above? They are related to platform code.

> +#include <linux/can.h>
> +#include <linux/can/dev.h>
> +#include <linux/can/error.h>
> +
> +#include "c_can.h"
> +
> +static struct can_bittiming_const c_can_bittiming_const = {
> +	.name = KBUILD_MODNAME,
> +	.tseg1_min = 2,		/* Time segment 1 = prop_seg + phase_seg1 */
> +	.tseg1_max = 16,
> +	.tseg2_min = 1,		/* Time segment 2 = phase_seg2 */
> +	.tseg2_max = 8,
> +	.sjw_max = 4,
> +	.brp_min = 1,
> +	.brp_max = 1024,	/* 6-bit BRP field + 4-bit BRPE field*/
> +	.brp_inc = 1,
> +};
> +
> +static inline int get_tx_next_msg_obj(const struct c_can_priv *priv)
> +{
> +	return (priv->tx_next & C_CAN_NEXT_MSG_OBJ_MASK) +
> +			C_CAN_MSG_OBJ_TX_FIRST;
> +}
> +
> +static inline int get_tx_echo_msg_obj(const struct c_can_priv *priv)
> +{
> +	return (priv->tx_echo & C_CAN_NEXT_MSG_OBJ_MASK) +
> +			C_CAN_MSG_OBJ_TX_FIRST;
> +}
> +
> +static u32 c_can_read_reg32(struct c_can_priv *priv, void *reg)
> +{
> +	u32 val = priv->read_reg(priv, reg);
> +	val |= ((u32) priv->read_reg(priv, reg + 2)) << 16;
> +	return val;
> +}
> +
> +void c_can_enable_all_interrupts(struct c_can_priv *priv,
> +						int enable)
> +{
> +	unsigned int cntrl_save = priv->read_reg(priv,
> +						&priv->reg_base->control);
> +
> +	if (enable)
> +		cntrl_save |= (CONTROL_SIE | CONTROL_EIE | CONTROL_IE);
> +	else
> +		cntrl_save &= ~(CONTROL_EIE | CONTROL_IE | CONTROL_SIE);
> +
> +	priv->write_reg(priv, &priv->reg_base->control, cntrl_save);
> +}
> +EXPORT_SYMBOL_GPL(c_can_enable_all_interrupts);

Do you really need to export that function? More later.

> +
> +static inline void c_can_object_get(struct net_device *dev,
> +					int iface, int objno, int mask)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	int count = MIN_TIMEOUT_VALUE;
> +
> +	/*
> +	 * As per specs, after writting the message object number in the
> +	 * IF command request register the transfer b/w interface
> +	 * register and message RAM must be complete in 6 CAN-CLK
> +	 * period.
> +	 */
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].com_mask,
> +			IFX_WRITE_LOW_16BIT(mask));
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].com_reg,
> +			IFX_WRITE_LOW_16BIT(objno + 1));
> +
> +	while (count) {
> +		if (!(priv->read_reg(priv,
> +					&priv->reg_base->ifreg[iface].com_reg) &
> +					IF_COMR_BUSY))
> +			break;

Could be shortened to:

	while (count && priv->read_reg(priv,
				&priv->reg_base->ifreg[iface].com_reg) &
				IF_COMR_BUSY)


> +		count--;
> +		udelay(1);
> +	}
> +
> +	if (!count)
> +		dev_err(dev->dev.parent, "timed out in object get\n");
> +}
> +
> +static inline void c_can_object_put(struct net_device *dev,
> +					int iface, int objno, int mask)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	int count = MIN_TIMEOUT_VALUE;
> +
> +	/*
> +	 * As per specs, after writting the message object number in the
> +	 * IF command request register the transfer b/w interface
> +	 * register and message RAM must be complete in 6 CAN-CLK
> +	 * period.
> +	 */
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].com_mask,
> +			(IF_COMM_WR | IFX_WRITE_LOW_16BIT(mask)));
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].com_reg,
> +			IFX_WRITE_LOW_16BIT(objno + 1));
> +
> +	while (count) {
> +		if (!(priv->read_reg(priv,
> +				&priv->reg_base->ifreg[iface].com_reg) &
> +				IF_COMR_BUSY))
> +			break;

Ditto. Also this is duplicated code. An (inline) function would make sense.

> +
> +		count--;
> +		udelay(1);
> +	}
> +
> +	if (!count)
> +		dev_err(dev->dev.parent, "timed out in object put\n");
> +}
> +
> +int c_can_write_msg_object(struct net_device *dev,
> +			int iface, struct can_frame *frame, int objno)
> +{
> +	int i;
> +	u16 flags = 0;
> +	unsigned int id;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	if (frame->can_id & CAN_EFF_FLAG) {
> +		id = frame->can_id & CAN_EFF_MASK;
> +		flags |= IF_ARB_MSGXTD;
> +	} else
> +		id = ((frame->can_id & CAN_SFF_MASK) << 18);
> +
> +	if (!(frame->can_id & CAN_RTR_FLAG))
> +		flags |= IF_ARB_TRANSMIT;
> +
> +	flags |= IF_ARB_MSGVAL;
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].arb1,
> +				IFX_WRITE_LOW_16BIT(id));
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].arb2, flags |
> +				IFX_WRITE_HIGH_16BIT(id));
> +
> +	for (i = 0; i < frame->can_dlc; i += 2) {
> +		priv->write_reg(priv, &priv->reg_base->ifreg[iface].data[i / 2],
> +				frame->data[i] | (frame->data[i + 1] << 8));
> +	}
> +
> +	return frame->can_dlc;
> +}
> +
> +static inline void c_can_mark_rx_msg_obj(struct net_device *dev,
> +						int iface, int ctrl_mask,
> +						int obj)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].msg_cntrl,
> +			ctrl_mask & ~(IF_MCONT_MSGLST | IF_MCONT_INTPND));
> +
> +	c_can_object_put(dev, iface, obj, IF_COMM_CONTROL);
> +

Please remove empty line above.

> +}
> +
> +static inline void c_can_activate_all_lower_rx_msg_obj(struct net_device *dev,
> +						int iface,
> +						int ctrl_mask)
> +{
> +	int i;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	for (i = 0; i < C_CAN_MSG_RX_LOW_LAST; i++) {
> +		priv->write_reg(priv, &priv->reg_base->ifreg[iface].msg_cntrl,
> +				ctrl_mask & ~(IF_MCONT_MSGLST |
> +					IF_MCONT_INTPND | IF_MCONT_NEWDAT));
> +		c_can_object_put(dev, iface, i + 1, IF_COMM_CONTROL);
> +	}
> +}
> +
> +static inline void c_can_activate_rx_msg_obj(struct net_device *dev,
> +						int iface, int ctrl_mask,
> +						int obj)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].msg_cntrl,
> +			ctrl_mask & ~(IF_MCONT_MSGLST |
> +				IF_MCONT_INTPND | IF_MCONT_NEWDAT));
> +
> +	c_can_object_put(dev, iface, obj, IF_COMM_CONTROL);

Ditto.

> +}
> +
> +static void c_can_handle_lost_msg_obj(struct net_device *dev,
> +					int iface, int objno)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	struct net_device_stats *stats = &dev->stats;
> +	struct sk_buff *skb;
> +	struct can_frame *frame;
> +
> +	dev_err(dev->dev.parent, "msg lost in buffer %d\n", objno);
> +
> +	c_can_object_get(dev, iface, objno, IF_COMM_ALL &
> +						~IF_COMM_TXRQST);
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].msg_cntrl,
> +			IF_MCONT_CLR_MSGLST);
> +
> +	c_can_object_put(dev, 0, objno, IF_COMM_CONTROL);
> +
> +	/* create an error msg */
> +	skb = alloc_can_err_skb(dev, &frame);
> +	if (unlikely(!skb))
> +		return;
> +
> +	frame->can_id |= CAN_ERR_CRTL;
> +	frame->data[1] = CAN_ERR_CRTL_RX_OVERFLOW;
> +	stats->rx_errors++;
> +	stats->rx_over_errors++;
> +
> +	netif_receive_skb(skb);
> +}
> +
> +static int c_can_read_msg_object(struct net_device *dev, int iface, int ctrl,
> +				int objno)
> +{
> +	u16 flags, data;
> +	int i;
> +	unsigned int val;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	struct net_device_stats *stats = &dev->stats;
> +	struct sk_buff *skb;
> +	struct can_frame *frame;
> +
> +	skb = alloc_can_skb(dev, &frame);
> +	if (!skb) {
> +		stats->rx_dropped++;
> +		return -ENOMEM;
> +	}
> +
> +	frame->can_dlc = get_can_dlc(ctrl & 0x0F);
> +
> +	for (i = 0; i < frame->can_dlc; i += 2) {
> +		data = priv->read_reg(priv,
> +				&priv->reg_base->ifreg[iface].data[i / 2]);
> +		frame->data[i] = data;
> +		frame->data[i + 1] = data >> 8;
> +	}
> +
> +	flags =	priv->read_reg(priv, &priv->reg_base->ifreg[iface].arb2);
> +	val = priv->read_reg(priv, &priv->reg_base->ifreg[iface].arb1) |
> +		(flags << 16);
> +
> +	if (flags & IF_ARB_MSGXTD)
> +		frame->can_id = (val & CAN_EFF_MASK) | CAN_EFF_FLAG;
> +	else
> +		frame->can_id = (val >> 18) & CAN_SFF_MASK;
> +
> +	if (flags & IF_ARB_TRANSMIT)
> +		frame->can_id |= CAN_RTR_FLAG;
> +
> +	netif_receive_skb(skb);
> +
> +	stats->rx_packets++;
> +	stats->rx_bytes += frame->can_dlc;
> +
> +	return 0;
> +}
> +
> +static void c_can_setup_receive_object(struct net_device *dev, int iface,
> +					int objno, unsigned int mask,
> +					unsigned int id, unsigned int mcont)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].mask1,
> +			IFX_WRITE_LOW_16BIT(mask));
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].mask2,
> +			IFX_WRITE_HIGH_16BIT(mask));
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].arb1,
> +			IFX_WRITE_LOW_16BIT(id));
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].arb2,
> +			(IF_ARB_MSGVAL | IFX_WRITE_HIGH_16BIT(id)));
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].msg_cntrl, mcont);
> +	c_can_object_put(dev, iface, objno, IF_COMM_ALL &
> +						~IF_COMM_TXRQST);

Should fit on one line.

> +
> +	dev_dbg(dev->dev.parent, "obj no:%d, msgval:0x%08x\n", objno,
> +			c_can_read_reg32(priv, &priv->reg_base->msgval1));

Please remove empty line above.

> +}
> +
> +static void c_can_inval_msg_object(struct net_device *dev, int iface, int objno)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].arb1, 0);
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].arb2, 0);
> +	priv->write_reg(priv, &priv->reg_base->ifreg[iface].msg_cntrl, 0);
> +
> +	c_can_object_put(dev, iface, objno,
> +				IF_COMM_ARB | IF_COMM_CONTROL);
> +
> +	dev_dbg(dev->dev.parent, "obj no:%d, msgval:0x%08x\n", objno,
> +			c_can_read_reg32(priv, &priv->reg_base->msgval1));

Ditto.

> +}
> +
> +static netdev_tx_t c_can_start_xmit(struct sk_buff *skb,
> +					struct net_device *dev)
> +{
> +	u32 val;
> +	u32 msg_obj_no;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	struct can_frame *frame = (struct can_frame *)skb->data;
> +
> +	if (can_dropped_invalid_skb(dev, skb))
> +		return NETDEV_TX_OK;
> +
> +	msg_obj_no = get_tx_next_msg_obj(priv);
> +
> +	/* prepare message object for transmission */
> +	val = c_can_write_msg_object(dev, 0, frame, msg_obj_no);
> +
> +	/* enable interrupt for this message object */
> +	priv->write_reg(priv, &priv->reg_base->ifreg[0].msg_cntrl,
> +			IF_MCONT_TXIE | IF_MCONT_TXRQST | IF_MCONT_EOB |
> +			(val & 0xf));
> +	c_can_object_put(dev, 0, msg_obj_no, IF_COMM_ALL);
> +
> +	can_put_echo_skb(skb, dev, msg_obj_no - C_CAN_MSG_OBJ_TX_FIRST);
> +
> +	priv->tx_next++;
> +	if ((priv->tx_next & C_CAN_NEXT_MSG_OBJ_MASK) == 0)
> +		netif_stop_queue(dev);
> +
> +	return NETDEV_TX_OK;
> +}
> +
> +static int c_can_set_bittiming(struct net_device *dev)
> +{
> +	unsigned int reg_btr, reg_brpe, ctrl_save;
> +	u8 brp, brpe, sjw, tseg1, tseg2;
> +	u32 ten_bit_brp;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	const struct can_bittiming *bt = &priv->can.bittiming;
> +
> +	/* c_can provides a 6-bit brp and 4-bit brpe fields */
> +	ten_bit_brp = bt->brp - 1;
> +	brp = ten_bit_brp & BTR_BRP_MASK;
> +	brpe = ten_bit_brp >> 6;
> +
> +	sjw = bt->sjw - 1;
> +	tseg1 = bt->prop_seg + bt->phase_seg1 - 1;
> +	tseg2 = bt->phase_seg2 - 1;
> +
> +	reg_btr = ((brp) | (sjw << BTR_SJW_SHIFT) | (tseg1 << BTR_TSEG1_SHIFT) |
> +			(tseg2 << BTR_TSEG2_SHIFT));

The outer brackets are not needed.

> +	reg_brpe = brpe & BRP_EXT_BRPE_MASK;
> +
> +	dev_info(dev->dev.parent,
> +		"setting BTR=%04x BRPE=%04x\n", reg_btr, reg_brpe);
> +
> +	ctrl_save = priv->read_reg(priv, &priv->reg_base->control);
> +	priv->write_reg(priv, &priv->reg_base->control,
> +			ctrl_save | CONTROL_CCE | CONTROL_INIT);
> +	priv->write_reg(priv, &priv->reg_base->btr, reg_btr);
> +	priv->write_reg(priv, &priv->reg_base->brp_ext, reg_brpe);
> +	priv->write_reg(priv, &priv->reg_base->control, ctrl_save);
> +
> +	return 0;
> +}
> +
> +/*
> + * Configure C_CAN message objects for Tx and Rx purposes:
> + * C_CAN provides a total of 32 message objects that can be configured
> + * either for Tx or Rx purposes. Here the first 16 message objects are used as
> + * a reception FIFO. The end of reception FIFO is signified by the EoB bit
> + * being SET. The remaining 16 message objects are kept aside for Tx purposes.
> + * See user guide document for further details on configuring message
> + * objects.
> + */
> +static void c_can_configure_msg_objects(struct net_device *dev)
> +{
> +	int i;
> +
> +	/* first invalidate all message objects */
> +	for (i = 0; i <= C_CAN_NO_OF_OBJECTS; i++)
> +		c_can_inval_msg_object(dev, 0, i);
> +
> +	/* setup receive message objects */
> +	for (i = C_CAN_MSG_OBJ_RX_FIRST + 1 ; i < C_CAN_MSG_OBJ_RX_LAST; i++)
> +		c_can_setup_receive_object(dev, 0, i, 0, 0,
> +			((IF_MCONT_RXIE | IF_MCONT_UMASK) & ~IF_MCONT_EOB));

Ditto.

> +	c_can_setup_receive_object(dev, 0, C_CAN_MSG_OBJ_RX_LAST, 0, 0,
> +			IF_MCONT_EOB | IF_MCONT_RXIE | IF_MCONT_UMASK);
> +}
> +
> +/*
> + * Configure C_CAN chip:
> + * - enable/disable auto-retransmission
> + * - set operating mode
> + * - configure message objects
> + */
> +static void c_can_chip_config(struct net_device *dev)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	if (priv->can.ctrlmode & CAN_CTRLMODE_ONE_SHOT)
> +		/* disable automatic retransmission */
> +		priv->write_reg(priv, &priv->reg_base->control,
> +				CONTROL_DISABLE_AR);
> +	else
> +		/* enable automatic retransmission */
> +		priv->write_reg(priv, &priv->reg_base->control,
> +				CONTROL_ENABLE_AR);
> +
> +	if (priv->can.ctrlmode & (CAN_CTRLMODE_LISTENONLY &
> +					CAN_CTRLMODE_LOOPBACK)) {
> +		/* loopback + silent mode : useful for hot self-test */
> +		priv->write_reg(priv, &priv->reg_base->control, (CONTROL_EIE |
> +				CONTROL_SIE | CONTROL_IE | CONTROL_TEST));

Outer brackets are not needed.

> +		priv->write_reg(priv, &priv->reg_base->test,
> +				(TEST_LBACK | TEST_SILENT));
> +	} else if (priv->can.ctrlmode & CAN_CTRLMODE_LOOPBACK) {
> +		/* loopback mode : useful for self-test function */
> +		priv->write_reg(priv, &priv->reg_base->control, (CONTROL_EIE |
> +				CONTROL_SIE | CONTROL_IE | CONTROL_TEST));

Ditto.

> +		priv->write_reg(priv, &priv->reg_base->test, TEST_LBACK);
> +	} else if (priv->can.ctrlmode & CAN_CTRLMODE_LISTENONLY) {
> +		/* silent mode : bus-monitoring mode */
> +		priv->write_reg(priv, &priv->reg_base->control, (CONTROL_EIE |
> +				CONTROL_SIE | CONTROL_IE | CONTROL_TEST));

Ditto.

> +		priv->write_reg(priv, &priv->reg_base->test, TEST_SILENT);
> +	} else
> +		/* normal mode*/
> +		priv->write_reg(priv, &priv->reg_base->control,
> +				(CONTROL_EIE | CONTROL_SIE | CONTROL_IE));

Ditto.

> +	/* configure message objects */
> +	c_can_configure_msg_objects(dev);
> +}
> +
> +static void c_can_start(struct net_device *dev)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	/* enable status change, error and module interrupts */
> +	c_can_enable_all_interrupts(priv, ENABLE_ALL_INTERRUPTS);
> +
> +	/* basic c_can configuration */
> +	c_can_chip_config(dev);
> +
> +	priv->can.state = CAN_STATE_ERROR_ACTIVE;
> +
> +	/* reset tx helper pointers */
> +	priv->tx_next = priv->tx_echo = 0;
> +}
> +
> +static void c_can_stop(struct net_device *dev)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	/* disable all interrupts */
> +	c_can_enable_all_interrupts(priv, DISABLE_ALL_INTERRUPTS);
> +
> +	/* set the state as STOPPED */
> +	priv->can.state = CAN_STATE_STOPPED;
> +}
> +
> +static int c_can_set_mode(struct net_device *dev, enum can_mode mode)
> +{
> +	switch (mode) {
> +	case CAN_MODE_START:
> +		c_can_start(dev);
> +		netif_wake_queue(dev);
> +		break;
> +	default:
> +		return -EOPNOTSUPP;
> +	}
> +
> +	return 0;
> +}
> +
> +static int c_can_get_berr_counter(const struct net_device *dev,
> +					struct can_berr_counter *bec)
> +{
> +	unsigned int reg_err_counter;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	reg_err_counter = priv->read_reg(priv, &priv->reg_base->error_counter);
> +	bec->rxerr = ((reg_err_counter & ERR_COUNTER_REC_MASK) >>
> +				ERR_COUNTER_REC_SHIFT);

You don't need the outer brackets.

> +	bec->txerr = (reg_err_counter & ERR_COUNTER_TEC_MASK);

Ditto.

> +	return 0;
> +}
> +
> +/*
> + * theory of operation:
> + *
> + * priv->tx_echo holds the number of the oldest can_frame put for
> + * transmission into the hardware, but not yet ACKed by the CAN tx
> + * complete IRQ.
> + *
> + * We iterate from priv->tx_echo to priv->tx_next and check if the
> + * packet has been transmitted, echo it back to the CAN framework.
> + * If we discover a not yet transmitted package, stop looking for more.
> + */
> +static void c_can_do_tx(struct net_device *dev)
> +{
> +	u32 val;
> +	u32 msg_obj_no;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	struct net_device_stats *stats = &dev->stats;
> +
> +	for (/* nix */; (priv->tx_next - priv->tx_echo) > 0; priv->tx_echo++) {
> +		msg_obj_no = get_tx_echo_msg_obj(priv);
> +		c_can_inval_msg_object(dev, 0, msg_obj_no);
> +		val = c_can_read_reg32(priv, &priv->reg_base->txrqst1);
> +		if (!(val & (1 << msg_obj_no))) {
> +			can_get_echo_skb(dev,
> +					msg_obj_no - C_CAN_MSG_OBJ_TX_FIRST);
> +			stats->tx_bytes += priv->read_reg(priv,
> +					&priv->reg_base->ifreg[0].msg_cntrl)
> +					& IF_MCONT_DLC_MASK;
> +			stats->tx_packets++;
> +		}
> +	}
> +
> +	/* restart queue if wrap-up or if queue stalled on last pkt */
> +	if (((priv->tx_next & C_CAN_NEXT_MSG_OBJ_MASK) != 0) ||
> +			((priv->tx_echo & C_CAN_NEXT_MSG_OBJ_MASK) == 0))
> +		netif_wake_queue(dev);
> +}
> +
> +/*
> + * theory of operation:
> + *
> + * c_can core saves a received CAN message into the first free message
> + * object it finds free (starting with the lowest). Bits NEWDAT and
> + * INTPND are set for this message object indicating that a new message
> + * has arrived. To work-around this issue, we keep two groups of message
> + * objects whose partitioning is defined by C_CAN_MSG_OBJ_RX_SPLIT.
> + *
> + * To ensure in-order frame reception we use the following
> + * approach while re-activating a message object to receive further
> + * frames:
> + * - if the current message object number is lower than
> + *   C_CAN_MSG_RX_LOW_LAST, do not clear the NEWDAT bit while clearing
> + *   the INTPND bit.
> + * - if the current message object number is equal to
> + *   C_CAN_MSG_RX_LOW_LAST then clear the NEWDAT bit of all lower
> + *   receive message objects.
> + * - if the current message object number is greater than
> + *   C_CAN_MSG_RX_LOW_LAST then clear the NEWDAT bit of
> + *   only this message object.
> + */
> +static int c_can_do_rx_poll(struct net_device *dev, int quota)
> +{
> +	u32 num_rx_pkts = 0;
> +	unsigned int msg_obj, msg_ctrl_save;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	u32 val = c_can_read_reg32(priv, &priv->reg_base->intpnd1);
> +
> +	for (msg_obj = C_CAN_MSG_OBJ_RX_FIRST;
> +			msg_obj <= C_CAN_MSG_OBJ_RX_LAST && quota > 0;
> +			msg_obj++) {
> +		if (val & (1 << msg_obj)) {
> +			c_can_object_get(dev, 0, msg_obj, IF_COMM_ALL &
> +					~IF_COMM_TXRQST);
> +			msg_ctrl_save = priv->read_reg(priv,
> +					&priv->reg_base->ifreg[0].msg_cntrl);
> +
> +			if (msg_ctrl_save & IF_MCONT_EOB)
> +				return num_rx_pkts;
> +
> +			if (msg_ctrl_save & IF_MCONT_MSGLST) {
> +				c_can_handle_lost_msg_obj(dev, 0, msg_obj);
> +				num_rx_pkts++;
> +				quota--;
> +				continue;
> +			}
> +
> +			if (!(msg_ctrl_save & IF_MCONT_NEWDAT))
> +				continue;
> +
> +			/* read the data from the message object */
> +			c_can_read_msg_object(dev, 0, msg_ctrl_save, msg_obj);
> +
> +			if (msg_obj < C_CAN_MSG_RX_LOW_LAST)
> +				c_can_mark_rx_msg_obj(dev, 0,
> +						msg_ctrl_save, msg_obj);
> +			else if (msg_obj > C_CAN_MSG_RX_LOW_LAST)
> +				/* activate this msg obj */
> +				c_can_activate_rx_msg_obj(dev, 0,
> +						msg_ctrl_save, msg_obj);
> +			else if (msg_obj == C_CAN_MSG_RX_LOW_LAST)
> +				/* activate all lower message objects */
> +				c_can_activate_all_lower_rx_msg_obj(dev,
> +						0, msg_ctrl_save);
> +
> +			num_rx_pkts++;
> +			quota--;
> +		}
> +		val = c_can_read_reg32(priv, &priv->reg_base->intpnd1);
> +	}
> +
> +	return num_rx_pkts;
> +}
> +
> +static inline int c_can_has_and_handle_berr(struct c_can_priv *priv)
> +{
> +	return (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) &&
> +		(priv->current_status & STATUS_LEC_MASK);
> +}
> +
> +static int c_can_err(struct net_device *dev,
> +				enum c_can_bus_error_types error_type,
> +				enum c_can_lec_type lec_type)
> +{
> +	unsigned int reg_err_counter;
> +	unsigned int rx_err_passive;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	struct net_device_stats *stats = &dev->stats;
> +	struct can_frame *cf;
> +	struct sk_buff *skb;
> +	struct can_berr_counter bec;
> +
> +	/* propogate the error condition to the CAN stack */
> +	skb = alloc_can_err_skb(dev, &cf);
> +	if (unlikely(!skb))
> +		return 0;
> +
> +	c_can_get_berr_counter(dev, &bec);
> +	reg_err_counter = priv->read_reg(priv, &priv->reg_base->error_counter);
> +	rx_err_passive = ((reg_err_counter & ERR_COUNTER_RP_MASK) >>
> +				ERR_COUNTER_RP_SHIFT);

Outer brackets?

> +	if (error_type & C_CAN_ERROR_WARNING) {
> +		/* error warning state */
> +		priv->can.can_stats.error_warning++;
> +		priv->can.state = CAN_STATE_ERROR_WARNING;
> +		cf->can_id |= CAN_ERR_CRTL;
> +		if (bec.rxerr > 96)
> +			cf->data[1] |= CAN_ERR_CRTL_RX_WARNING;
> +		if (bec.txerr > 96)
> +			cf->data[1] |= CAN_ERR_CRTL_TX_WARNING;
> +	}
> +	if (error_type & C_CAN_ERROR_PASSIVE) {
> +		/* error passive state */
> +		priv->can.can_stats.error_passive++;
> +		priv->can.state = CAN_STATE_ERROR_PASSIVE;
> +		cf->can_id |= CAN_ERR_CRTL;
> +		if (rx_err_passive)
> +			cf->data[1] |= CAN_ERR_CRTL_RX_PASSIVE;
> +		if (bec.txerr > 127)
> +			cf->data[1] |= CAN_ERR_CRTL_TX_PASSIVE;
> +	}
> +	if (error_type & C_CAN_BUS_OFF) {
> +		/* bus-off state */
> +		priv->can.state = CAN_STATE_BUS_OFF;
> +		cf->can_id |= CAN_ERR_BUSOFF;
> +		/* disable all interrupts in bus-off mode to ensure that
> +		 * the CPU is not hogged down
> +		 */

Please use the following style:

	/*
	 * Comment
 	 */

> +		c_can_enable_all_interrupts(priv, DISABLE_ALL_INTERRUPTS);
> +		can_bus_off(dev);
> +	}
> +
> +	/*
> +	 * check for 'last error code' which tells us the
> +	 * type of the last error to occur on the CAN bus
> +	 */
> +	switch (lec_type) {
> +		/* common for all type of bus errors */
> +		priv->can.can_stats.bus_error++;
> +		stats->rx_errors++;
> +		cf->can_id |= CAN_ERR_PROT | CAN_ERR_BUSERROR;
> +		cf->data[2] |= CAN_ERR_PROT_UNSPEC;

Are you sure that this part is ever executed? I wonder why the compiler does
not complain.

> +	case LEC_STUFF_ERROR:
> +		dev_dbg(dev->dev.parent, "stuff error\n");
> +		cf->data[2] |= CAN_ERR_PROT_STUFF;
> +		break;
> +
> +	case LEC_FORM_ERROR:
> +		dev_dbg(dev->dev.parent, "form error\n");
> +		cf->data[2] |= CAN_ERR_PROT_FORM;
> +		break;
> +
> +	case LEC_ACK_ERROR:
> +		dev_dbg(dev->dev.parent, "ack error\n");
> +		cf->data[2] |= (CAN_ERR_PROT_LOC_ACK |
> +				CAN_ERR_PROT_LOC_ACK_DEL);
> +		break;
> +
> +	case LEC_BIT1_ERROR:
> +		dev_dbg(dev->dev.parent, "bit1 error\n");
> +		cf->data[2] |= CAN_ERR_PROT_BIT1;
> +		break;
> +
> +	case LEC_BIT0_ERROR:
> +		dev_dbg(dev->dev.parent, "bit0 error\n");
> +		cf->data[2] |= CAN_ERR_PROT_BIT0;
> +		break;
> +
> +	case LEC_CRC_ERROR:
> +		dev_dbg(dev->dev.parent, "CRC error\n");
> +		cf->data[2] |= (CAN_ERR_PROT_LOC_CRC_SEQ |
> +				CAN_ERR_PROT_LOC_CRC_DEL);
> +		break;
> +	}
> +
> +	netif_receive_skb(skb);
> +	stats->rx_packets++;
> +	stats->rx_bytes += cf->can_dlc;
> +
> +	return 1;
> +}
> +
> +static int c_can_poll(struct napi_struct *napi, int quota)
> +{
> +	u16 irqstatus;
> +	int lec_type = 0;
> +	int work_done = 0;
> +	struct net_device *dev = napi->dev;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	enum c_can_bus_error_types error_type = C_CAN_NO_ERROR;
> +
> +	irqstatus = priv->read_reg(priv, &priv->reg_base->ir);
> +
> +	/* status events have the highest priority */
> +	if (irqstatus == STATUS_INTERRUPT) {
> +		priv->current_status = priv->read_reg(priv,
> +					&priv->reg_base->status);
> +
> +		/* handle Tx/Rx events */
> +		if (priv->current_status & STATUS_TXOK)
> +			priv->write_reg(priv, &priv->reg_base->status,
> +					(priv->current_status & ~STATUS_TXOK));

Outer brackets are not needed, here and in similar expressions below.

> +
> +		if (priv->current_status & STATUS_RXOK)
> +			priv->write_reg(priv, &priv->reg_base->status,
> +					(priv->current_status & ~STATUS_RXOK));
> +
> +		/* handle bus error events */
> +		if (priv->current_status & STATUS_EWARN) {
> +			dev_dbg(dev->dev.parent,
> +					"entered error warning state\n");
> +			error_type = C_CAN_ERROR_WARNING;
> +		}
> +		if ((priv->current_status & STATUS_EPASS) &&
> +				(!(priv->last_status & STATUS_EPASS))) {
> +			dev_dbg(dev->dev.parent,
> +					"entered error passive state\n");
> +			error_type = C_CAN_ERROR_PASSIVE;
> +		}
> +		if ((priv->current_status & STATUS_BOFF) &&
> +				(!(priv->last_status & STATUS_BOFF))) {
> +			dev_dbg(dev->dev.parent,
> +					"entered bus off state\n");
> +			error_type = C_CAN_BUS_OFF;
> +		}
> +
> +		/* handle bus recovery events */
> +		if ((!(priv->current_status & STATUS_EPASS)) &&
> +				(priv->last_status & STATUS_EPASS)) {
> +			dev_dbg(dev->dev.parent,
> +					"left error passive state\n");
> +			priv->can.state = CAN_STATE_ERROR_ACTIVE;
> +		}
> +		if ((!(priv->current_status & STATUS_BOFF)) &&
> +				(priv->last_status & STATUS_BOFF)) {
> +			dev_dbg(dev->dev.parent,
> +					"left bus off state\n");
> +			priv->can.state = CAN_STATE_ERROR_ACTIVE;
> +		}
> +
> +		priv->last_status = priv->current_status;
> +
> +		/* handle error on the bus */
> +		lec_type = c_can_has_and_handle_berr(priv);
> +		if (lec_type && (error_type != C_CAN_NO_ERROR))
> +			work_done += c_can_err(dev, error_type, lec_type);
> +	} else if ((irqstatus > C_CAN_MSG_OBJ_RX_FIRST) &&
> +			(irqstatus <= C_CAN_MSG_OBJ_RX_LAST)) {
> +		/* handle events corresponding to receive message objects */
> +		work_done += c_can_do_rx_poll(dev, (quota - work_done));
> +	} else if ((irqstatus > C_CAN_MSG_OBJ_TX_FIRST) &&
> +			(irqstatus <= C_CAN_MSG_OBJ_TX_LAST)) {
> +		/* handle events corresponding to transmit message objects */
> +		c_can_do_tx(dev);
> +	}
> +
> +	if (work_done < quota) {
> +		napi_complete(napi);
> +		/* enable all IRQs */
> +		c_can_enable_all_interrupts(priv, ENABLE_ALL_INTERRUPTS);
> +	}
> +
> +	return work_done;
> +}
> +
> +static irqreturn_t c_can_isr(int irq, void *dev_id)
> +{
> +	struct net_device *dev = (struct net_device *)dev_id;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	/* disable all interrupts and schedule the NAPI */
> +	c_can_enable_all_interrupts(priv, DISABLE_ALL_INTERRUPTS);
> +	napi_schedule(&priv->napi);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static int c_can_open(struct net_device *dev)
> +{
> +	int err;
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	/* open the can device */
> +	err = open_candev(dev);
> +	if (err) {
> +		dev_err(dev->dev.parent, "failed to open can device\n");
> +		return err;
> +	}
> +
> +	/* register interrupt handler */
> +	err = request_irq(dev->irq, &c_can_isr, priv->irq_flags, dev->name,
> +				dev);
> +	if (err < 0) {
> +		dev_err(dev->dev.parent, "failed to attach interrupt\n");

s/attach/request/ ?

> +		goto exit_irq_fail;
> +	}
> +
> +	/* start the c_can controller */
> +	c_can_start(dev);
> +
> +	napi_enable(&priv->napi);
> +	netif_start_queue(dev);
> +
> +	return 0;
> +
> +exit_irq_fail:
> +	close_candev(dev);
> +	return err;
> +}
> +
> +static int c_can_close(struct net_device *dev)
> +{
> +	struct c_can_priv *priv = netdev_priv(dev);
> +
> +	netif_stop_queue(dev);
> +	napi_disable(&priv->napi);
> +	c_can_stop(dev);
> +	free_irq(dev->irq, dev);
> +	close_candev(dev);
> +
> +	return 0;
> +}
> +
> +struct net_device *alloc_c_can_dev(void)
> +{
> +	struct net_device *dev;
> +	struct c_can_priv *priv;
> +
> +	dev = alloc_candev(sizeof(struct c_can_priv), C_CAN_MSG_OBJ_TX_NUM);
> +	if (!dev)
> +		return NULL;
> +
> +	priv = netdev_priv(dev);
> +	netif_napi_add(dev, &priv->napi, c_can_poll, C_CAN_NAPI_WEIGHT);
> +
> +	priv->dev = dev;
> +	priv->can.bittiming_const = &c_can_bittiming_const;
> +	priv->can.do_set_bittiming = c_can_set_bittiming;
> +	priv->can.do_set_mode = c_can_set_mode;
> +	priv->can.do_get_berr_counter = c_can_get_berr_counter;
> +	priv->can.ctrlmode_supported = CAN_CTRLMODE_ONE_SHOT |
> +					CAN_CTRLMODE_LOOPBACK |
> +					CAN_CTRLMODE_LISTENONLY |
> +					CAN_CTRLMODE_BERR_REPORTING;
> +
> +	return dev;
> +}
> +EXPORT_SYMBOL_GPL(alloc_c_can_dev);
> +
> +void free_c_can_dev(struct net_device *dev)
> +{
> +	free_candev(dev);
> +}
> +EXPORT_SYMBOL_GPL(free_c_can_dev);
> +
> +static const struct net_device_ops c_can_netdev_ops = {
> +	.ndo_open = c_can_open,
> +	.ndo_stop = c_can_close,
> +	.ndo_start_xmit = c_can_start_xmit,
> +};
> +
> +int register_c_can_dev(struct net_device *dev)
> +{
> +	dev->flags |= IFF_ECHO;	/* we support local echo */
> +	dev->netdev_ops = &c_can_netdev_ops;
> +
> +	return register_candev(dev);
> +}
> +EXPORT_SYMBOL_GPL(register_c_can_dev);
> +
> +void unregister_c_can_dev(struct net_device *dev)
> +{
> +	unregister_candev(dev);
> +}
> +EXPORT_SYMBOL_GPL(unregister_c_can_dev);
> +
> +MODULE_AUTHOR("Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>");
> +MODULE_LICENSE("GPL v2");
> +MODULE_DESCRIPTION("CAN bus driver for Bosch C_CAN controller");
> diff --git a/drivers/net/can/c_can/c_can.h b/drivers/net/can/c_can/c_can.h
> new file mode 100644
> index 0000000..fafc5e6
> --- /dev/null
> +++ b/drivers/net/can/c_can/c_can.h
> @@ -0,0 +1,235 @@
> +/*
> + * CAN bus driver for Bosch C_CAN controller
> + *
> + * Copyright (C) 2010 ST Microelectronics
> + * Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>
> + *
> + * Borrowed heavily from the C_CAN driver originally written by:
> + * Copyright (C) 2007
> + * - Sascha Hauer, Marc Kleine-Budde, Pengutronix <s.hauer-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
> + * - Simon Kallweit, intefo AG <simon.kallweit-+G9qxTFKJT/tRgLqZ5aouw@public.gmane.org>
> + *
> + * TX and RX NAPI implementation has been borrowed from at91 CAN driver
> + * written by:
> + * Copyright
> + * (C) 2007 by Hans J. Koch <hjk-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org>
> + * (C) 2008, 2009 by Marc Kleine-Budde <kernel-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
> + *
> + * Bosch C_CAN controller is compliant to CAN protocol version 2.0 part A and B.
> + * Bosch C_CAN user manual can be obtained from:
> + * http://www.semiconductors.bosch.de/pdf/Users_Manual_C_CAN.pdf
> + *
> + * This file is licensed under the terms of the GNU General Public
> + * License version 2. This program is licensed "as is" without any
> + * warranty of any kind, whether express or implied.
> + */
> +
> +#ifndef C_CAN_H
> +#define C_CAN_H
> +
> +/* control register */
> +#define CONTROL_TEST		BIT(7)
> +#define CONTROL_CCE		BIT(6)
> +#define CONTROL_DISABLE_AR	BIT(5)
> +#define CONTROL_ENABLE_AR	(0 << 5)
> +#define CONTROL_EIE		BIT(3)
> +#define CONTROL_SIE		BIT(2)
> +#define CONTROL_IE		BIT(1)
> +#define CONTROL_INIT		BIT(0)
> +
> +/* test register */
> +#define TEST_RX			BIT(7)
> +#define TEST_TX1		BIT(6)
> +#define TEST_TX2		BIT(5)
> +#define TEST_LBACK		BIT(4)
> +#define TEST_SILENT		BIT(3)
> +#define TEST_BASIC		BIT(2)
> +
> +/* status register */
> +#define STATUS_BOFF		BIT(7)
> +#define STATUS_EWARN		BIT(6)
> +#define STATUS_EPASS		BIT(5)
> +#define STATUS_RXOK		BIT(4)
> +#define STATUS_TXOK		BIT(3)
> +#define STATUS_LEC_MASK		0x07
> +
> +/* error counter register */
> +#define ERR_COUNTER_TEC_MASK	0xff
> +#define ERR_COUNTER_TEC_SHIFT	0
> +#define ERR_COUNTER_REC_SHIFT	8
> +#define ERR_COUNTER_REC_MASK	(0x7f << ERR_COUNTER_REC_SHIFT)
> +#define ERR_COUNTER_RP_SHIFT	15
> +#define ERR_COUNTER_RP_MASK	(0x1 << ERR_COUNTER_RP_SHIFT)
> +
> +/* bit-timing register */
> +#define BTR_BRP_MASK		0x3f
> +#define BTR_BRP_SHIFT		0
> +#define BTR_SJW_SHIFT		6
> +#define BTR_SJW_MASK		(0x3 << BTR_SJW_SHIFT)
> +#define BTR_TSEG1_SHIFT		8
> +#define BTR_TSEG1_MASK		(0xf << BTR_TSEG1_SHIFT)
> +#define BTR_TSEG2_SHIFT		12
> +#define BTR_TSEG2_MASK		(0x7 << BTR_TSEG2_SHIFT)
> +
> +/* brp extension register */
> +#define BRP_EXT_BRPE_MASK	0x0f
> +#define BRP_EXT_BRPE_SHIFT	0
> +
> +/* IFx command request */
> +#define IF_COMR_BUSY		BIT(15)
> +
> +/* IFx command mask */
> +#define IF_COMM_WR		BIT(7)
> +#define IF_COMM_MASK		BIT(6)
> +#define IF_COMM_ARB		BIT(5)
> +#define IF_COMM_CONTROL		BIT(4)
> +#define IF_COMM_CLR_INT_PND	BIT(3)
> +#define IF_COMM_TXRQST		BIT(2)
> +#define IF_COMM_DATAA		BIT(1)
> +#define IF_COMM_DATAB		BIT(0)
> +#define IF_COMM_ALL		(IF_COMM_MASK | IF_COMM_ARB | \
> +				IF_COMM_CONTROL | IF_COMM_TXRQST | \
> +				IF_COMM_DATAA | IF_COMM_DATAB)
> +
> +/* IFx arbitration */
> +#define IF_ARB_MSGVAL		BIT(15)
> +#define IF_ARB_MSGXTD		BIT(14)
> +#define IF_ARB_TRANSMIT		BIT(13)
> +
> +/* IFx message control */
> +#define IF_MCONT_NEWDAT		BIT(15)
> +#define IF_MCONT_MSGLST		BIT(14)
> +#define IF_MCONT_CLR_MSGLST	(0 << 14)
> +#define IF_MCONT_INTPND		BIT(13)
> +#define IF_MCONT_UMASK		BIT(12)
> +#define IF_MCONT_TXIE		BIT(11)
> +#define IF_MCONT_RXIE		BIT(10)
> +#define IF_MCONT_RMTEN		BIT(9)
> +#define IF_MCONT_TXRQST		BIT(8)
> +#define IF_MCONT_EOB		BIT(7)
> +#define IF_MCONT_DLC_MASK	0xf
> +
> +/*
> + * IFx register masks:
> + * allow easy operation on 16-bit registers when the
> + * argument is 32-bit instead
> + */
> +#define IFX_WRITE_LOW_16BIT(x)	((x) & 0xFFFF)
> +#define IFX_WRITE_HIGH_16BIT(x)	(((x) & 0xFFFF0000) >> 16)
> +
> +/* message object split */
> +#define C_CAN_NO_OF_OBJECTS	31
> +#define C_CAN_MSG_OBJ_RX_NUM	16
> +#define C_CAN_MSG_OBJ_TX_NUM	16
> +
> +#define C_CAN_MSG_OBJ_RX_FIRST	0
> +#define C_CAN_MSG_OBJ_RX_LAST	(C_CAN_MSG_OBJ_RX_FIRST + \
> +				C_CAN_MSG_OBJ_RX_NUM - 1)
> +
> +#define C_CAN_MSG_OBJ_TX_FIRST	(C_CAN_MSG_OBJ_RX_LAST + 1)
> +#define C_CAN_MSG_OBJ_TX_LAST	(C_CAN_MSG_OBJ_TX_FIRST + \
> +				C_CAN_MSG_OBJ_TX_NUM - 1)
> +
> +#define C_CAN_MSG_OBJ_RX_SPLIT	8
> +#define C_CAN_MSG_RX_LOW_LAST	(C_CAN_MSG_OBJ_RX_SPLIT - 1)
> +
> +#define C_CAN_NEXT_MSG_OBJ_MASK	(C_CAN_MSG_OBJ_TX_NUM - 1)
> +#define RECEIVE_OBJECT_BITS	0x0000ffff
> +
> +/* status interrupt */
> +#define STATUS_INTERRUPT	0x8000
> +
> +/* global interrupt masks */
> +#define ENABLE_ALL_INTERRUPTS	1
> +#define DISABLE_ALL_INTERRUPTS	0
> +
> +/* minimum timeout for checking BUSY status */
> +#define MIN_TIMEOUT_VALUE	6
> +
> +/* napi related */
> +#define C_CAN_NAPI_WEIGHT	C_CAN_MSG_OBJ_RX_NUM
> +
> +/* c_can IF registers */
> +struct c_can_if_regs {
> +	u16 com_reg;
> +	u16 com_mask;
> +	u16 mask1;
> +	u16 mask2;
> +	u16 arb1;
> +	u16 arb2;
> +	u16 msg_cntrl;
> +	u16 data[4];
> +	u16 _reserved[13];
> +};
> +
> +/* c_can hardware registers */
> +struct c_can_regs {
> +	u16 control;
> +	u16 status;
> +	u16 error_counter;
> +	u16 btr;
> +	u16 ir;
> +	u16 test;
> +	u16 brp_ext;
> +	u16 _reserved1;
> +	struct c_can_if_regs ifreg[2]; /* [0] = IF1 and [1] = IF2 */

Why not just "if" instead of "ifreg"? That would also nicely shorten
many long expressions.

> +	u16 _reserved2[8];
> +	u16 txrqst1;
> +	u16 txrqst2;
> +	u16 _reserved3[6];
> +	u16 newdat1;
> +	u16 newdat2;
> +	u16 _reserved4[6];
> +	u16 intpnd1;
> +	u16 intpnd2;
> +	u16 _reserved5[6];
> +	u16 msgval1;
> +	u16 msgval2;
> +	u16 _reserved6[6];
> +};

Above you use both, rather long and heavily abbreviated names, e.g.
"error_counter" vs. "ir". Something in between would be nice.

> +/* c_can lec values */
> +enum c_can_lec_type {
> +	LEC_STUFF_ERROR = 1,
> +	LEC_FORM_ERROR,
> +	LEC_ACK_ERROR,
> +	LEC_BIT1_ERROR,
> +	LEC_BIT0_ERROR,
> +	LEC_CRC_ERROR,
> +};
> +
> +/*
> + * c_can error types:
> + * Bus errors (BUS_OFF, ERROR_WARNING, ERROR_PASSIVE) are supported
> + */
> +enum c_can_bus_error_types {
> +	C_CAN_NO_ERROR = 0,
> +	C_CAN_BUS_OFF,
> +	C_CAN_ERROR_WARNING,
> +	C_CAN_ERROR_PASSIVE,
> +};
> +
> +/* c_can private data structure */
> +struct c_can_priv {
> +	struct can_priv can;	/* must be the first member */
> +	struct napi_struct napi;
> +	struct net_device *dev;
> +	int tx_object;
> +	int current_status;
> +	int last_status;
> +	u16 (*read_reg) (struct c_can_priv *priv, void *reg);
> +	void (*write_reg) (struct c_can_priv *priv, void *reg, u16 val);
> +	struct c_can_regs __iomem *reg_base;

s/reg_base/regs/ seems more logical to me. reg_base sounds like a "void *"
member. 

> +	unsigned long irq_flags; /* for request_irq() */
> +	unsigned int tx_next;
> +	unsigned int tx_echo;
> +	struct clk *clk;

clk is a platform-specific variable, e.g. a PCI-based driver will not need it.
Therefore a member "priv" would make sense. Also it would nicely shorten
many long expressions.

> +};
> +
> +void c_can_enable_all_interrupts(struct c_can_priv *priv, int enable);
> +struct net_device *alloc_c_can_dev(void);
> +void free_c_can_dev(struct net_device *dev);
> +int register_c_can_dev(struct net_device *dev);
> +void unregister_c_can_dev(struct net_device *dev);
> +
> +#endif /* C_CAN_H */
> diff --git a/drivers/net/can/c_can/c_can_platform.c b/drivers/net/can/c_can/c_can_platform.c
> new file mode 100644
> index 0000000..482a57e
> --- /dev/null
> +++ b/drivers/net/can/c_can/c_can_platform.c
> @@ -0,0 +1,210 @@
> +/*
> + * Platform CAN bus driver for Bosch C_CAN controller
> + *
> + * Copyright (C) 2010 ST Microelectronics
> + * Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>
> + *
> + * Borrowed heavily from the C_CAN driver originally written by:
> + * Copyright (C) 2007
> + * - Sascha Hauer, Marc Kleine-Budde, Pengutronix <s.hauer-bIcnvbaLZ9MEGnE8C9+IrQ@public.gmane.org>
> + * - Simon Kallweit, intefo AG <simon.kallweit-+G9qxTFKJT/tRgLqZ5aouw@public.gmane.org>
> + *
> + * Bosch C_CAN controller is compliant to CAN protocol version 2.0 part A and B.
> + * Bosch C_CAN user manual can be obtained from:
> + * http://www.semiconductors.bosch.de/pdf/Users_Manual_C_CAN.pdf
> + *
> + * This file is licensed under the terms of the GNU General Public
> + * License version 2. This program is licensed "as is" without any
> + * warranty of any kind, whether express or implied.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/version.h>
> +#include <linux/module.h>
> +#include <linux/interrupt.h>
> +#include <linux/delay.h>
> +#include <linux/netdevice.h>
> +#include <linux/if_arp.h>
> +#include <linux/if_ether.h>
> +#include <linux/list.h>
> +#include <linux/delay.h>
> +#include <linux/io.h>
> +#include <linux/platform_device.h>
> +#include <linux/clk.h>
> +
> +#include <linux/can/dev.h>
> +
> +#include "c_can.h"
> +
> +/*
> + * 16-bit c_can registers can be arranged differently in the memory
> + * architecture of different implementations. For example: 16-bit
> + * registers can be aligned to a 16-bit boundary or 32-bit boundary etc.
> + * Handle the same by providing a common read/write interface.
> + */
> +static u16 c_can_plat_read_reg_aligned_to_16bit(struct c_can_priv *priv,
> +						void *reg)
> +{
> +	return readw(reg);
> +}
> +
> +static void c_can_plat_write_reg_aligned_to_16bit(struct c_can_priv *priv,
> +						void *reg, u16 val)
> +{
> +	writew(val, reg);
> +}
> +
> +static u16 c_can_plat_read_reg_aligned_to_32bit(struct c_can_priv *priv,
> +						void *reg)
> +{
> +	return readw(reg + (long)reg - (long)priv->reg_base);
> +}
> +
> +static void c_can_plat_write_reg_aligned_to_32bit(struct c_can_priv *priv,
> +						void *reg, u16 val)
> +{
> +	writew(val, reg + (long)reg - (long)priv->reg_base);
> +}
> +
> +static int __devinit c_can_plat_probe(struct platform_device *pdev)
> +{
> +	int ret;
> +	void __iomem *addr;
> +	struct net_device *dev;
> +	struct c_can_priv *priv;
> +	struct resource *mem, *irq;
> +	struct clk *clk;
> +
> +	/* get the appropriate clk */
> +	clk = clk_get(&pdev->dev, NULL);
> +	if (IS_ERR(clk)) {
> +		dev_err(&pdev->dev, "no clock defined\n");
> +		ret = -ENODEV;
> +		goto exit;
> +	}
> +
> +	/* get the platform data */
> +	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> +	irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
> +	if (!mem || (irq <= 0)) {
> +		ret = -ENODEV;
> +		goto exit_free_clk;
> +	}
> +
> +	if (!request_mem_region(mem->start, resource_size(mem),
> +				KBUILD_MODNAME)) {
> +		dev_err(&pdev->dev, "resource unavailable\n");
> +		ret = -ENODEV;
> +		goto exit_free_clk;
> +	}
> +
> +	addr = ioremap(mem->start, resource_size(mem));
> +	if (!addr) {
> +		dev_err(&pdev->dev, "failed to map can port\n");
> +		ret = -ENOMEM;
> +		goto exit_release_mem;
> +	}
> +
> +	/* allocate the c_can device */
> +	dev = alloc_c_can_dev();
> +	if (!dev) {
> +		ret = -ENOMEM;
> +		goto exit_iounmap;
> +	}
> +
> +	priv = netdev_priv(dev);
> +
> +	dev->irq = irq->start;
> +	priv->irq_flags = irq->flags;
> +	priv->reg_base = addr;
> +	priv->can.clock.freq = clk_get_rate(clk);
> +	priv->clk = clk;
> +
> +	switch (mem->flags & IORESOURCE_MEM_TYPE_MASK) {
> +	case IORESOURCE_MEM_32BIT:
> +		priv->read_reg = c_can_plat_read_reg_aligned_to_32bit;
> +		priv->write_reg = c_can_plat_write_reg_aligned_to_32bit;
> +		break;
> +	case IORESOURCE_MEM_16BIT:
> +	default:
> +		priv->read_reg = c_can_plat_read_reg_aligned_to_16bit;
> +		priv->write_reg = c_can_plat_write_reg_aligned_to_16bit;
> +		break;
> +	}
> +
> +	platform_set_drvdata(pdev, dev);
> +	SET_NETDEV_DEV(dev, &pdev->dev);
> +
> +	ret = register_c_can_dev(dev);
> +	if (ret) {
> +		dev_err(&pdev->dev, "registering %s failed (err=%d)\n",
> +			KBUILD_MODNAME, ret);
> +		goto exit_free_device;
> +	}
> +
> +	dev_info(&pdev->dev, "%s device registered (reg_base=%p, irq=%d)\n",
> +		 KBUILD_MODNAME, priv->reg_base, dev->irq);
> +	return 0;
> +
> +exit_free_device:
> +	platform_set_drvdata(pdev, NULL);
> +	free_c_can_dev(dev);
> +exit_iounmap:
> +	iounmap(addr);
> +exit_release_mem:
> +	release_mem_region(mem->start, resource_size(mem));
> +exit_free_clk:
> +	clk_put(clk);
> +exit:
> +	dev_err(&pdev->dev, "probe failed\n");
> +
> +	return ret;
> +}
> +
> +static int __devexit c_can_plat_remove(struct platform_device *pdev)
> +{
> +	struct net_device *dev = platform_get_drvdata(pdev);
> +	struct c_can_priv *priv = netdev_priv(dev);
> +	struct resource *mem;
> +
> +	/* disable all interrupts */
> +	c_can_enable_all_interrupts(priv, DISABLE_ALL_INTERRUPTS);

To avoid exporting that function, couldn't it be done at the beginning of
unregister_c_can_dev()?

> +
> +	unregister_c_can_dev(dev);
> +	platform_set_drvdata(pdev, NULL);
> +
> +	free_c_can_dev(dev);
> +	iounmap(priv->reg_base);
> +
> +	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> +	release_mem_region(mem->start, resource_size(mem));
> +
> +	clk_put(priv->clk);
> +
> +	return 0;
> +}
> +
> +static struct platform_driver c_can_plat_driver = {
> +	.driver = {
> +		.name = KBUILD_MODNAME,
> +		.owner = THIS_MODULE,
> +	},
> +	.probe = c_can_plat_probe,
> +	.remove = __devexit_p(c_can_plat_remove),
> +};
> +
> +static int __init c_can_plat_init(void)
> +{
> +	return platform_driver_register(&c_can_plat_driver);
> +}
> +module_init(c_can_plat_init);
> +
> +static void __exit c_can_plat_exit(void)
> +{
> +	platform_driver_unregister(&c_can_plat_driver);
> +}
> +module_exit(c_can_plat_exit);
> +
> +MODULE_AUTHOR("Bhupesh Sharma <bhupesh.sharma-qxv4g6HH51o@public.gmane.org>");
> +MODULE_LICENSE("GPL v2");
> +MODULE_DESCRIPTION("Platform CAN bus driver for Bosch C_CAN controller");

Thanks for your contribution.

Wolfgang.

^ permalink raw reply

* [RFC v3 PATCH] m68knommu: added dm9000 support
From: Angelo Dureghello @ 2011-01-08  9:08 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-m68k

This patch allows the dm9000 network chip to be used with an m68knommu
big-endian CPU. From the data-bus wiring point of view, the CPU data bus
connected to the dm9000 chip should be byte-swapped in hardware, by
crossing the byte lanes (D0:7 to D24:31, etc.).
In any case, an option has also been added to swap the bytes in the driver,
for boards where the CPU has been wired straight (D0:D31) to the dm9000.

Signed-off-by: Angelo Dureghello <angelo70@gmail.com>

---
--- linux/drivers/net/Kconfig.orig	2011-01-05 17:11:37.992376124 +0100
+++ linux/drivers/net/Kconfig	2011-01-08 09:53:48.231300064 +0100
@@ -960,7 +960,7 @@ config TI_DAVINCI_EMAC
 
 config DM9000
 	tristate "DM9000 support"
-	depends on ARM || BLACKFIN || MIPS
+	depends on COLDFIRE || ARM || BLACKFIN || MIPS
 	select CRC32
 	select MII
 	---help---
@@ -986,6 +986,14 @@ config DM9000_FORCE_SIMPLE_PHY_POLL
 	  costly MII PHY reads. Note, this will not work if the chip is
 	  operating with an external PHY.
 
+config DM9000_32BIT_SW_SWAP
+	bool "Software byte swap for 32 bit data bus"
+	depends on DM9000 && COLDFIRE
+	---help---
+	  This option makes the dm9000 driver byte-swap data in software,
+	  for boards where the big-endian CPU is wired straight to the
+	  dm9000 32-bit data bus.
+
 config ENC28J60
 	tristate "ENC28J60 support"
 	depends on EXPERIMENTAL && SPI && NET_ETHERNET

--- linux/drivers/net/dm9000.c.orig	2010-12-30 23:19:39.747836070 +0100
+++ linux/drivers/net/dm9000.c	2011-01-08 09:54:28.543551323 +0100
@@ -158,9 +158,17 @@ dm9000_reset(board_info_t * db)
 	dev_dbg(db->dev, "resetting device\n");
 
 	/* RESET device */
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(DM9000_NCR, db->io_addr);
+#else
 	writeb(DM9000_NCR, db->io_addr);
+#endif
 	udelay(200);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(NCR_RST, db->io_data);
+#else
 	writeb(NCR_RST, db->io_data);
+#endif
 	udelay(200);
 }
 
@@ -170,8 +178,13 @@ dm9000_reset(board_info_t * db)
 static u8
 ior(board_info_t * db, int reg)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg, db->io_addr);
+	return (u8)readl(db->io_data);
+#else
 	writeb(reg, db->io_addr);
 	return readb(db->io_data);
+#endif
 }
 
 /*
@@ -181,43 +194,72 @@ ior(board_info_t * db, int reg)
 static void
 iow(board_info_t * db, int reg, int value)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg, db->io_addr);
+	writel(value, db->io_data);
+#else
 	writeb(reg, db->io_addr);
 	writeb(value, db->io_data);
+#endif
 }
 
 /* routines for sending block to chip */
 
 static void dm9000_outblk_8bit(void __iomem *reg, void *data, int count)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writesbsw(reg, data, count);
+#else
 	writesb(reg, data, count);
+#endif
 }
 
 static void dm9000_outblk_16bit(void __iomem *reg, void *data, int count)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writeswsw(reg, data, (count+1) >> 1);
+#else
 	writesw(reg, data, (count+1) >> 1);
+#endif
 }
 
 static void dm9000_outblk_32bit(void __iomem *reg, void *data, int count)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writeslsw(reg, data, (count+3) >> 2);
+#else
 	writesl(reg, data, (count+3) >> 2);
+#endif
 }
 
 /* input block from chip to memory */
 
 static void dm9000_inblk_8bit(void __iomem *reg, void *data, int count)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	readsbsw(reg, data, count);
+#else
 	readsb(reg, data, count);
+#endif
 }
 
 
 static void dm9000_inblk_16bit(void __iomem *reg, void *data, int count)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	readswsw(reg, data, (count+1) >> 1);
+#else
 	readsw(reg, data, (count+1) >> 1);
+#endif
 }
 
 static void dm9000_inblk_32bit(void __iomem *reg, void *data, int count)
 {
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	readslsw(reg, data, (count+3) >> 2);
+#else
 	readsl(reg, data, (count+3) >> 2);
+#endif
 }
 
 /* dump block from chip to null */
@@ -863,8 +905,12 @@ static void dm9000_timeout(struct net_de
 	netif_wake_queue(dev);
 
 	/* Restore previous register address */
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg_save, db->io_addr);
+#else
 	writeb(reg_save, db->io_addr);
-	spin_unlock_irqrestore(&db->lock, flags);
+#endif
+	spin_unlock_irqrestore(&db->lock,flags);
 }
 
 static void dm9000_send_packet(struct net_device *dev,
@@ -908,7 +954,11 @@ dm9000_start_xmit(struct sk_buff *skb, s
 	spin_lock_irqsave(&db->lock, flags);
 
 	/* Move data to DM9000 TX RAM */
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+   writel(DM9000_MWCMD, db->io_addr);
+#else
 	writeb(DM9000_MWCMD, db->io_addr);
+#endif	
 
 	(db->outblk)(db->io_data, skb->data, skb->len);
 	dev->stats.tx_bytes += skb->len;
@@ -981,7 +1031,11 @@ dm9000_rx(struct net_device *dev)
 		ior(db, DM9000_MRCMDX);	/* Dummy read */
 
 		/* Get most updated data */
-		rxbyte = readb(db->io_data);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+      rxbyte = (u8)readl(db->io_data);
+#else
+      rxbyte = readb(db->io_data);
+#endif
 
 		/* Status check: this byte must be 0 or 1 */
 		if (rxbyte & DM9000_PKT_ERR) {
@@ -996,7 +1050,12 @@ dm9000_rx(struct net_device *dev)
 
 		/* A packet ready now  & Get status/length */
 		GoodPacket = true;
+
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+		writel(DM9000_MRCMD, db->io_addr);
+#else
 		writeb(DM9000_MRCMD, db->io_addr);
+#endif
 
 		(db->inblk)(db->io_data, &rxhdr, sizeof(rxhdr));
 
@@ -1085,7 +1144,11 @@ static irqreturn_t dm9000_interrupt(int
 	spin_lock_irqsave(&db->lock, flags);
 
 	/* Save previous register address */
-	reg_save = readb(db->io_addr);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+   reg_save = (u8)readl(db->io_addr);
+#else
+   reg_save = readb(db->io_addr);
+#endif
 
 	/* Disable all interrupts */
 	iow(db, DM9000_IMR, IMR_PAR);
@@ -1116,7 +1179,11 @@ static irqreturn_t dm9000_interrupt(int
 	iow(db, DM9000_IMR, db->imr_all);
 
 	/* Restore previous register address */
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg_save, db->io_addr);
+#else
 	writeb(reg_save, db->io_addr);
+#endif
 
 	spin_unlock_irqrestore(&db->lock, flags);
 
@@ -1237,7 +1304,11 @@ dm9000_phy_read(struct net_device *dev,
 	spin_lock_irqsave(&db->lock,flags);
 
 	/* Save previous register address */
-	reg_save = readb(db->io_addr);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+   reg_save = (u8)readl(db->io_addr);
+#else
+   reg_save = readb(db->io_addr);
+#endif
 
 	/* Fill the phyxcer register into REG_0C */
 	iow(db, DM9000_EPAR, DM9000_PHY | reg);
@@ -1250,7 +1321,11 @@ dm9000_phy_read(struct net_device *dev,
 	dm9000_msleep(db, 1);		/* Wait read complete */
 
 	spin_lock_irqsave(&db->lock,flags);
-	reg_save = readb(db->io_addr);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+   reg_save = (u8)readl(db->io_addr);
+#else
+   reg_save = readb(db->io_addr);
+#endif
 
 	iow(db, DM9000_EPCR, 0x0);	/* Clear phyxcer read command */
 
@@ -1258,7 +1333,11 @@ dm9000_phy_read(struct net_device *dev,
 	ret = (ior(db, DM9000_EPDRH) << 8) | ior(db, DM9000_EPDRL);
 
 	/* restore the previous address */
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg_save, db->io_addr);
+#else
 	writeb(reg_save, db->io_addr);
+#endif
 	spin_unlock_irqrestore(&db->lock,flags);
 
 	mutex_unlock(&db->addr_lock);
@@ -1284,7 +1363,11 @@ dm9000_phy_write(struct net_device *dev,
 	spin_lock_irqsave(&db->lock,flags);
 
 	/* Save previous register address */
-	reg_save = readb(db->io_addr);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+   reg_save = (u8)readl(db->io_addr);
+#else
+   reg_save = readb(db->io_addr);
+#endif
 
 	/* Fill the phyxcer register into REG_0C */
 	iow(db, DM9000_EPAR, DM9000_PHY | reg);
@@ -1295,18 +1378,30 @@ dm9000_phy_write(struct net_device *dev,
 
 	iow(db, DM9000_EPCR, EPCR_EPOS | EPCR_ERPRW);	/* Issue phyxcer write command */
 
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg_save, db->io_addr);
+#else
 	writeb(reg_save, db->io_addr);
+#endif
 	spin_unlock_irqrestore(&db->lock, flags);
 
 	dm9000_msleep(db, 1);		/* Wait write complete */
 
 	spin_lock_irqsave(&db->lock,flags);
-	reg_save = readb(db->io_addr);
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+   reg_save = (u8)readl(db->io_addr);
+#else
+   reg_save = readb(db->io_addr);
+#endif
 
 	iow(db, DM9000_EPCR, 0x0);	/* Clear phyxcer write command */
 
 	/* restore the previous address */
+#ifdef CONFIG_DM9000_32BIT_SW_SWAP
+	writel(reg_save, db->io_addr);
+#else
 	writeb(reg_save, db->io_addr);
+#endif
 
 	spin_unlock_irqrestore(&db->lock, flags);
 	mutex_unlock(&db->addr_lock);

--- linux/arch/m68k/include/asm/io_no.h.orig	2011-01-08 09:53:16.835301417 +0100
+++ linux/arch/m68k/include/asm/io_no.h	2011-01-08 09:53:18.523299757 +0100
@@ -47,6 +47,90 @@ static inline unsigned int _swapl(volati
 #define writew(b,addr) (void)((*(volatile unsigned short *) (addr)) = (b))
 #define writel(b,addr) (void)((*(volatile unsigned int *) (addr)) = (b))
 
+static inline void writesb (void __iomem *reg, void *data, int count)
+{
+	unsigned char *p = (unsigned char*) data;
+
+	while (count--) writeb(*p++, reg);
+}
+
+static inline void writesbsw (void __iomem *reg, void *data, int count)
+{
+	unsigned char *p = (unsigned char *) data;
+
+	while (count--) writel((int)(*p++), reg);
+}
+
+static inline void writesw (void __iomem *reg, void *data, int count)
+{
+   unsigned short *p = (unsigned short*) data;
+
+   while (count--) writew(*p++, reg);
+}
+
+static inline void writeswsw (void __iomem *reg, void *data, int count)
+{
+   unsigned short *p = (unsigned short *) data;
+
+   while (count--) writel((int)(_swapw(*p++)), reg);
+}
+
+static inline void writesl (void __iomem *reg, void *data, int count)
+{
+   unsigned long *p = (unsigned long*) data;
+
+   while (count--) writel(*p++, reg);
+}
+
+static inline void writeslsw (void __iomem *reg, void *data, int count)
+{
+   unsigned long *p = (unsigned long *) data;
+
+   while (count--) writel((int)(_swapl(*p++)), reg);
+}
+
+static inline void readsb (void __iomem *reg, void *data, int count)
+{
+   unsigned char *p = (unsigned char *) data;
+
+   while (count--) *p++ = readb(reg);
+}
+
+static inline void readsbsw (void __iomem *reg, void *data, int count)
+{
+   unsigned char *p = (unsigned char *) data;
+
+   while (count--) *p++ = (unsigned char)readl(reg);
+}
+
+static inline void readsw (void __iomem *reg, void *data, int count)
+{
+   unsigned short *p = (unsigned short *) data;
+
+   while (count--) *p++ = readb(reg);
+}
+
+static inline void readswsw (void __iomem *reg, void *data, int count)
+{
+   unsigned short *p = (unsigned short *) data;
+
+   while (count--) *p++ = _swapw((unsigned short)readw(reg));
+}
+
+static inline void readsl (void __iomem *reg, void *data, int count)
+{
+   unsigned long *p = (unsigned long *) data;
+
+   while (count--) *p++ = readb(reg);
+}
+
+static inline void readslsw (void __iomem *reg, void *data, int count)
+{
+   unsigned long *p = (unsigned long *) data;
+
+   while (count--) *p++ = _swapl(readl(reg));
+}
+
 #define __raw_readb readb
 #define __raw_readw readw
 #define __raw_readl readl

^ permalink raw reply

* Re: [PATCH v2] net: ppp: use {get,put}_unaligned_be{16,32}
From: David Miller @ 2011-01-08  4:37 UTC (permalink / raw)
  To: paulus; +Cc: xiaosuo, harvey.harrison, linux-ppp, netdev
In-Reply-To: <20110108031320.GA28926@brick.ozlabs.ibm.com>

From: Paul Mackerras <paulus@samba.org>
Date: Sat, 8 Jan 2011 14:13:20 +1100

> On Fri, Jan 07, 2011 at 05:15:34PM -0800, David Miller wrote:
> 
>> I have to say that every time I go read the header parsing code in the
>> PPP driver, I absolutely regret it.
>> 
>> And Changli's patch fixes some of the readability problems.
> 
> It's up to you whether you merge the patch or not, but surely you
> agree it needs more than a zero-line description?

It's entirely sufficient to me.

He de-open-coded {get,put}_unaligned_be{16,32}() and when open-coding
is eliminated in this way a commit message of "Use {helper function
foo}." is more than enough.

^ permalink raw reply

* Re: [RFC] sched: QFQ - quick fair queue scheduler
From: David Miller @ 2011-01-08  4:34 UTC (permalink / raw)
  To: xiaosuo; +Cc: shemminger, dada1, fabio, netdev, rizzo
In-Reply-To: <AANLkTi=pqZ3CwLTAZnhd-cyQNj8OSeBHsP_bFiH3hJ-_@mail.gmail.com>

Changli, please do not quote an entire patch just to comment upon
one specific portion of that patch.  Quote only the hunks of the
patch you actually want to talk about.

When you quote the entire patch, it wastes bandwidth, and makes it
harder for people to scan around to see your feedback.

I think people who do this have no idea how much pain they cause
for every single person reading their postings.  Nor do they
realize that this makes their feedback get unread completely by
many people.

Please, never do this again.  You contribute far too much for this
to become a habit.

Thank you.

^ permalink raw reply

* [PATCH 2/2] sky2: convert to new VLAN model (v0.2)
From: Stephen Hemminger @ 2011-01-08  4:13 UTC (permalink / raw)
  To: Jesse Gross, David Miller; +Cc: netdev
In-Reply-To: <AANLkTikwGQFByOZGgCCjTJySPa8QYndZ903CFmOkS1Ha@mail.gmail.com>

This converts sky2 to new VLAN offload flags control via ethtool.
It also allows for transmit offload of vlan tagged frames which
was not possible before.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
Changed the setting of vlan_features in this version to keep
non-offload settings (GRO|HIGHDMA) even if vlan offload is not
enabled.

--- a/drivers/net/sky2.c	2011-01-07 20:06:03.082168965 -0800
+++ b/drivers/net/sky2.c	2011-01-07 20:09:06.006180327 -0800
@@ -46,10 +46,6 @@
 
 #include <asm/irq.h>
 
-#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
-#define SKY2_VLAN_TAG_USED 1
-#endif
-
 #include "sky2.h"
 
 #define DRV_NAME		"sky2"
@@ -1326,39 +1322,34 @@ static int sky2_ioctl(struct net_device
 	return err;
 }
 
-#ifdef SKY2_VLAN_TAG_USED
-static void sky2_set_vlan_mode(struct sky2_hw *hw, u16 port, bool onoff)
-{
-	if (onoff) {
-		sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T),
-			     RX_VLAN_STRIP_ON);
-		sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T),
-			     TX_VLAN_TAG_ON);
-	} else {
-		sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T),
-			     RX_VLAN_STRIP_OFF);
-		sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T),
-			     TX_VLAN_TAG_OFF);
-	}
-}
+#define NETIF_F_ALL_VLAN (NETIF_F_HW_VLAN_TX|NETIF_F_HW_VLAN_RX)
 
-static void sky2_vlan_rx_register(struct net_device *dev, struct vlan_group *grp)
+static void sky2_vlan_mode(struct net_device *dev)
 {
 	struct sky2_port *sky2 = netdev_priv(dev);
 	struct sky2_hw *hw = sky2->hw;
 	u16 port = sky2->port;
 
-	netif_tx_lock_bh(dev);
-	napi_disable(&hw->napi);
+	if (dev->features & NETIF_F_HW_VLAN_RX)
+		sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T),
+			     RX_VLAN_STRIP_ON);
+	else
+		sky2_write32(hw, SK_REG(port, RX_GMF_CTRL_T),
+			     RX_VLAN_STRIP_OFF);
 
-	sky2->vlgrp = grp;
-	sky2_set_vlan_mode(hw, port, grp != NULL);
+	dev->vlan_features = dev->features &~ NETIF_F_ALL_VLAN;
+	if (dev->features & NETIF_F_HW_VLAN_TX)
+		sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T),
+			     TX_VLAN_TAG_ON);
+	else {
+		sky2_write32(hw, SK_REG(port, TX_GMF_CTRL_T),
+			     TX_VLAN_TAG_OFF);
 
-	sky2_read32(hw, B0_Y2_SP_LISR);
-	napi_enable(&hw->napi);
-	netif_tx_unlock_bh(dev);
+		/* Can't do transmit offload of vlan without hw vlan */
+		dev->vlan_features &= ~(NETIF_F_TSO | NETIF_F_SG
+					| NETIF_F_ALL_CSUM);
+	}
 }
-#endif
 
 /* Amount of required worst case padding in rx buffer */
 static inline unsigned sky2_rx_pad(const struct sky2_hw *hw)
@@ -1635,9 +1626,7 @@ static void sky2_hw_up(struct sky2_port
 	sky2_prefetch_init(hw, txqaddr[port], sky2->tx_le_map,
 			   sky2->tx_ring_size - 1);
 
-#ifdef SKY2_VLAN_TAG_USED
-	sky2_set_vlan_mode(hw, port, sky2->vlgrp != NULL);
-#endif
+	sky2_vlan_mode(sky2->netdev);
 
 	sky2_rx_start(sky2);
 }
@@ -1780,7 +1769,7 @@ static netdev_tx_t sky2_xmit_frame(struc
 	}
 
 	ctrl = 0;
-#ifdef SKY2_VLAN_TAG_USED
+
 	/* Add VLAN tag, can piggyback on LRGLEN or ADDR64 */
 	if (vlan_tx_tag_present(skb)) {
 		if (!le) {
@@ -1792,7 +1781,6 @@ static netdev_tx_t sky2_xmit_frame(struc
 		le->length = cpu_to_be16(vlan_tx_tag_get(skb));
 		ctrl |= INS_VLAN;
 	}
-#endif
 
 	/* Handle TCP checksum offload */
 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
@@ -2432,11 +2420,8 @@ static struct sk_buff *sky2_receive(stru
 	struct sk_buff *skb = NULL;
 	u16 count = (status & GMR_FS_LEN) >> 16;
 
-#ifdef SKY2_VLAN_TAG_USED
-	/* Account for vlan tag */
-	if (sky2->vlgrp && (status & GMR_FS_VLAN))
-		count -= VLAN_HLEN;
-#endif
+	if (status & GMR_FS_VLAN)
+		count -= VLAN_HLEN;	/* Account for vlan tag */
 
 	netif_printk(sky2, rx_status, KERN_DEBUG, dev,
 		     "rx slot %u status 0x%x len %d\n",
@@ -2504,17 +2489,9 @@ static inline void sky2_tx_done(struct n
 static inline void sky2_skb_rx(const struct sky2_port *sky2,
 			       u32 status, struct sk_buff *skb)
 {
-#ifdef SKY2_VLAN_TAG_USED
-	u16 vlan_tag = be16_to_cpu(sky2->rx_tag);
-	if (sky2->vlgrp && (status & GMR_FS_VLAN)) {
-		if (skb->ip_summed == CHECKSUM_NONE)
-			vlan_hwaccel_receive_skb(skb, sky2->vlgrp, vlan_tag);
-		else
-			vlan_gro_receive(&sky2->hw->napi, sky2->vlgrp,
-					 vlan_tag, skb);
-		return;
-	}
-#endif
+	if (status & GMR_FS_VLAN)
+		__vlan_hwaccel_put_tag(skb, be16_to_cpu(sky2->rx_tag));
+
 	if (skb->ip_summed == CHECKSUM_NONE)
 		netif_receive_skb(skb);
 	else
@@ -2631,7 +2608,6 @@ static int sky2_status_intr(struct sky2_
 				goto exit_loop;
 			break;
 
-#ifdef SKY2_VLAN_TAG_USED
 		case OP_RXVLAN:
 			sky2->rx_tag = length;
 			break;
@@ -2639,7 +2615,6 @@ static int sky2_status_intr(struct sky2_
 		case OP_RXCHKSVLAN:
 			sky2->rx_tag = length;
 			/* fall through */
-#endif
 		case OP_RXCHKS:
 			if (likely(sky2->flags & SKY2_FLAG_RX_CHECKSUM))
 				sky2_rx_checksum(sky2, status);
@@ -3042,6 +3017,10 @@ static int __devinit sky2_init(struct sk
 			| SKY2_HW_NEW_LE
 			| SKY2_HW_AUTO_TX_SUM
 			| SKY2_HW_ADV_POWER_CTL;
+
+		/* The status workaround conflicts with VLAN tag detection. */
+		if (hw->chip_rev == CHIP_REV_YU_FE2_A0)
+			hw->flags |= SKY2_HW_VLAN_BROKEN;
 		break;
 
 	case CHIP_ID_YUKON_SUPR:
@@ -4237,15 +4216,28 @@ static int sky2_set_eeprom(struct net_de
 static int sky2_set_flags(struct net_device *dev, u32 data)
 {
 	struct sky2_port *sky2 = netdev_priv(dev);
-	u32 supported =
-		(sky2->hw->flags & SKY2_HW_RSS_BROKEN) ? 0 : ETH_FLAG_RXHASH;
+	unsigned long old_feat = dev->features;
+	u32 supported = 0;
 	int rc;
 
+	if (!(sky2->hw->flags & SKY2_HW_RSS_BROKEN))
+		supported |= ETH_FLAG_RXHASH;
+
+	if (!(sky2->hw->flags & SKY2_HW_VLAN_BROKEN))
+		supported |= ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN;
+
+	printk(KERN_DEBUG "sky2 set_flags: supported %x data %x\n",
+	       supported, data);
+
 	rc = ethtool_op_set_flags(dev, data, supported);
 	if (rc)
 		return rc;
 
-	rx_set_rss(dev);
+	if ((old_feat ^ dev->features) & NETIF_F_RXHASH)
+		rx_set_rss(dev);
+
+	if ((old_feat ^ dev->features) & NETIF_F_ALL_VLAN)
+		sky2_vlan_mode(dev);
 
 	return 0;
 }
@@ -4281,6 +4273,7 @@ static const struct ethtool_ops sky2_eth
 	.get_sset_count = sky2_get_sset_count,
 	.get_ethtool_stats = sky2_get_ethtool_stats,
 	.set_flags	= sky2_set_flags,
+	.get_flags	= ethtool_op_get_flags,
 };
 
 #ifdef CONFIG_SKY2_DEBUG
@@ -4562,9 +4555,6 @@ static const struct net_device_ops sky2_
 	.ndo_change_mtu		= sky2_change_mtu,
 	.ndo_tx_timeout		= sky2_tx_timeout,
 	.ndo_get_stats64	= sky2_get_stats,
-#ifdef SKY2_VLAN_TAG_USED
-	.ndo_vlan_rx_register	= sky2_vlan_rx_register,
-#endif
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= sky2_netpoll,
 #endif
@@ -4580,9 +4570,6 @@ static const struct net_device_ops sky2_
 	.ndo_change_mtu		= sky2_change_mtu,
 	.ndo_tx_timeout		= sky2_tx_timeout,
 	.ndo_get_stats64	= sky2_get_stats,
-#ifdef SKY2_VLAN_TAG_USED
-	.ndo_vlan_rx_register	= sky2_vlan_rx_register,
-#endif
   },
 };
 
@@ -4633,7 +4620,8 @@ static __devinit struct net_device *sky2
 	sky2->port = port;
 
 	dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG
-		| NETIF_F_TSO  | NETIF_F_GRO;
+		| NETIF_F_TSO | NETIF_F_GRO;
+
 	if (highmem)
 		dev->features |= NETIF_F_HIGHDMA;
 
@@ -4641,13 +4629,8 @@ static __devinit struct net_device *sky2
 	if (!(hw->flags & SKY2_HW_RSS_BROKEN))
 		dev->features |= NETIF_F_RXHASH;
 
-#ifdef SKY2_VLAN_TAG_USED
-	/* The workaround for FE+ status conflicts with VLAN tag detection. */
-	if (!(sky2->hw->chip_id == CHIP_ID_YUKON_FE_P &&
-	      sky2->hw->chip_rev == CHIP_REV_YU_FE2_A0)) {
+	if (!(hw->flags & SKY2_HW_VLAN_BROKEN))
 		dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
-	}
-#endif
 
 	/* read the mac address */
 	memcpy_fromio(dev->dev_addr, hw->regs + B2_MAC_1 + port * 8, ETH_ALEN);
--- a/drivers/net/sky2.h	2011-01-07 20:05:59.982101189 -0800
+++ b/drivers/net/sky2.h	2011-01-07 20:06:03.094169226 -0800
@@ -2236,11 +2236,8 @@ struct sky2_port {
 	u16		     rx_pending;
 	u16		     rx_data_size;
 	u16		     rx_nfrags;
-
-#ifdef SKY2_VLAN_TAG_USED
 	u16		     rx_tag;
-	struct vlan_group    *vlgrp;
-#endif
+
 	struct {
 		unsigned long last;
 		u32	mac_rp;
@@ -2284,6 +2281,7 @@ struct sky2_hw {
 #define SKY2_HW_AUTO_TX_SUM	0x00000040	/* new IP decode for Tx */
 #define SKY2_HW_ADV_POWER_CTL	0x00000080	/* additional PHY power regs */
 #define SKY2_HW_RSS_BROKEN	0x00000100
+#define SKY2_HW_VLAN_BROKEN     0x00000200
 
 	u8	     	     chip_id;
 	u8		     chip_rev;

^ permalink raw reply

* Re: [RFC] sched: QFQ - quick fair queue scheduler
From: Stephen Hemminger @ 2011-01-08  4:02 UTC (permalink / raw)
  To: Changli Gao
  Cc: David Miller, Eric Dumazet, Fabio Checconi, netdev, Luigi Rizzo
In-Reply-To: <AANLkTi=pqZ3CwLTAZnhd-cyQNj8OSeBHsP_bFiH3hJ-_@mail.gmail.com>

On Sat, 8 Jan 2011 10:56:33 +0800
Changli Gao <xiaosuo@gmail.com> wrote:

> > +       cl->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;  
> 
> Hmm, there is no other packets schedulers which account packets in
> this way. Which one is better? I am not sure. And in this patch,
> qstats.drops isn't maintained in the same way. Would these two be
> consistent.

HTB uses this accounting.


^ permalink raw reply

* Re: [PATCH v2] net: ppp: use {get,put}_unaligned_be{16,32}
From: Paul Mackerras @ 2011-01-08  3:13 UTC (permalink / raw)
  To: David Miller; +Cc: xiaosuo, harvey.harrison, linux-ppp, netdev
In-Reply-To: <20110107.171534.193718114.davem@davemloft.net>

On Fri, Jan 07, 2011 at 05:15:34PM -0800, David Miller wrote:

> I have to say that every time I go read the header parsing code in the
> PPP driver, I absolutely regret it.
> 
> And Changli's patch fixes some of the readability problems.

It's up to you whether you merge the patch or not, but surely you
agree it needs more than a zero-line description?

Paul.

^ permalink raw reply

* Re: [RFC] sched: QFQ - quick fair queue scheduler
From: Changli Gao @ 2011-01-08  2:56 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, Eric Dumazet, Fabio Checconi, netdev, Luigi Rizzo
In-Reply-To: <20110106195614.20dbc402@nehalam>

On Fri, Jan 7, 2011 at 11:56 AM, Stephen Hemminger
<shemminger@vyatta.com> wrote:
> This is an implementation of the Quick Fair Queue scheduler developed
> by Fabio Checconi and Luigi Rizzo. The same algorithm is already implemented in ipfw
> in FreeBSD. Fabio had an earlier version developed on Linux, I just
> did some cleanup, and backporting of FreeBSD version.
>
> For more information see web page: http://info.iet.unipi.it/~luigi/qfq/
> and Google tech talk: http://www.youtube.com/watch?v=r8vBmybeKlE
>
> This is for inspection at this point, barely tested.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
> ---
> Patch against net-next-2.6.
> Configuration may get patch fuzz because of testing CHOKe in
> same tree.
>
>  include/linux/pkt_sched.h |   14
>  net/sched/Kconfig         |   11
>  net/sched/Makefile        |    1
>  net/sched/sch_qfq.c       | 1012 ++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 1038 insertions(+)
>
> --- a/include/linux/pkt_sched.h 2011-01-05 09:01:33.268032043 -0800
> +++ b/include/linux/pkt_sched.h 2011-01-05 23:17:20.637390255 -0800
> @@ -481,4 +481,18 @@ struct tc_drr_stats {
>        __u32   deficit;
>  };
>
> +/* QFQ */
> +enum {
> +       TCA_QFQ_WEIGHT,
> +       TCA_QFQ_LMAX,
> +       __TCA_QFQ_MAX
> +};
> +
> +#define TCA_QFQ_MAX    (__TCA_QFQ_MAX - 1)
> +
> +struct tc_qfq_stats {
> +       __u32 weight;
> +       __u32 lmax;
> +};
> +
>  #endif
> --- a/net/sched/Kconfig 2011-01-05 09:01:33.280032462 -0800
> +++ b/net/sched/Kconfig 2011-01-05 23:17:20.637390255 -0800
> @@ -216,6 +216,17 @@ config NET_SCH_CHOKE
>          To compile this code as a module, choose M here: the
>          module will be called sch_choke.
>
> +config NET_SCH_QFQ
> +        tristate "Quick Fair Queueing Scheduler (QFQ)"
> +       help
> +         Say Y here if you want to use the Quick Fair Queueing Scheduler (QFQ)
> +         packet scheduling algorithm.
> +
> +         To compile this driver as a module, choose M here: the module
> +         will be called sch_qfq.
> +
> +         If unsure, say N.
> +
>  config NET_SCH_INGRESS
>        tristate "Ingress Qdisc"
>        depends on NET_CLS_ACT
> --- a/net/sched/Makefile        2011-01-05 09:01:33.284032598 -0800
> +++ b/net/sched/Makefile        2011-01-05 23:17:20.645389829 -0800
> @@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)  += sch_mult
>  obj-$(CONFIG_NET_SCH_ATM)      += sch_atm.o
>  obj-$(CONFIG_NET_SCH_NETEM)    += sch_netem.o
>  obj-$(CONFIG_NET_SCH_DRR)      += sch_drr.o
> +obj-$(CONFIG_NET_SCH_QFQ)      += sch_qfq.o
>  obj-$(CONFIG_NET_SCH_CHOKE)    += sch_choke.o
>  obj-$(CONFIG_NET_CLS_U32)      += cls_u32.o
>  obj-$(CONFIG_NET_CLS_ROUTE4)   += cls_route.o
> --- /dev/null   1970-01-01 00:00:00.000000000 +0000
> +++ b/net/sched/sch_qfq.c       2011-01-06 12:51:28.498280327 -0800
> @@ -0,0 +1,1125 @@
> +/*
> + * net/sched/sch_qfq.c         Quick Fair Queueing Scheduler.
> + *
> + * Copyright (c) 2009 Fabio Checconi, Luigi Rizzo, and Paolo Valente.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * version 2 as published by the Free Software Foundation.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <linux/bitops.h>
> +#include <linux/errno.h>
> +#include <linux/netdevice.h>
> +#include <linux/pkt_sched.h>
> +#include <net/sch_generic.h>
> +#include <net/pkt_sched.h>
> +#include <net/pkt_cls.h>
> +
> +/*  Quick Fair Queueing
> +    ===================
> +
> +    Sources:
> +    Fabio Checconi and Scuola Superiore and S. Anna
> +    and Paolo Valente and Luigi Riz "QFQ: Efficient Packet Scheduling
> +    with Tight Bandwidth Distribution Guarantees", SIGCOMM 2010
> +
> +    See also:
> +    http://retis.sssup.it/~fabio/linux/qfq/
> + */
> +
> +/*
> +
> +  Virtual time computations.
> +
> +  S, F and V are all computed in fixed point arithmetic with
> +  FRAC_BITS decimal bits.
> +
> +  QFQ_MAX_INDEX is the maximum index allowed for a group. We need
> +       one bit per index.
> +  QFQ_MAX_WSHIFT is the maximum power of two supported as a weight.
> +
> +  The layout of the bits is as below:
> +
> +                   [ MTU_SHIFT ][      FRAC_BITS    ]
> +                   [ MAX_INDEX    ][ MIN_SLOT_SHIFT ]
> +                                ^.__grp->index = 0
> +                                *.__grp->slot_shift
> +
> +  where MIN_SLOT_SHIFT is derived by difference from the others.
> +
> +  The max group index corresponds to Lmax/w_min, where
> +  Lmax=1<<MTU_SHIFT, w_min = 1 .
> +  From this, and knowing how many groups (MAX_INDEX) we want,
> +  we can derive the shift corresponding to each group.
> +
> +  Because we often need to compute
> +       F = S + len/w_i  and V = V + len/wsum
> +  instead of storing w_i store the value
> +       inv_w = (1<<FRAC_BITS)/w_i
> +  so we can do F = S + len * inv_w * wsum.
> +  We use W_TOT in the formulas so we can easily move between
> +  static and adaptive weight sum.
> +
> +  The per-scheduler-instance data contain all the data structures
> +  for the scheduler: bitmaps and bucket lists.
> +
> + */
> +
> +/*
> + * Maximum number of consecutive slots occupied by backlogged classes
> + * inside a group.
> + */
> +#define QFQ_MAX_SLOTS  32
> +
> +/*
> + * Shifts used for class<->group mapping.  We allow class weights that are
> + * in the range [1, 2^MAX_WSHIFT], and we try to map each class i to the
> + * group with the smallest index that can support the L_i / r_i configured
> + * for the class.
> + *
> + * grp->index is the index of the group; and grp->slot_shift
> + * is the shift for the corresponding (scaled) sigma_i.
> + */
> +#define QFQ_MAX_INDEX          19
> +#define QFQ_MAX_WSHIFT         16
> +
> +#define        QFQ_MAX_WEIGHT          (1<<QFQ_MAX_WSHIFT)
> +#define QFQ_MAX_WSUM           (2*QFQ_MAX_WEIGHT)
> +
> +#define FRAC_BITS              30      /* fixed point arithmetic */
> +#define ONE_FP                 (1UL << FRAC_BITS)
> +#define IWSUM                  (ONE_FP/QFQ_MAX_WSUM)
> +
> +#define QFQ_MTU_SHIFT          11
> +#define QFQ_MIN_SLOT_SHIFT     (FRAC_BITS + QFQ_MTU_SHIFT - QFQ_MAX_INDEX)
> +
> +/*
> + * Possible group states.  These values are used as indexes for the bitmaps
> + * array of struct qfq_queue.
> + */
> +enum qfq_state { ER, IR, EB, IB, QFQ_MAX_STATE };
> +
> +struct qfq_group;
> +
> +struct qfq_class {
> +       struct Qdisc_class_common common;
> +
> +       unsigned int refcnt;
> +       unsigned int filter_cnt;
> +
> +       struct gnet_stats_basic_packed bstats;
> +       struct gnet_stats_queue qstats;
> +       struct gnet_stats_rate_est rate_est;
> +       struct Qdisc *qdisc;
> +
> +       struct qfq_class *next; /* Link for the slot list. */
> +       u64 S, F;               /* flow timestamps (exact) */
> +
> +       /* group we belong to. In principle we would need the index,
> +        * which is log_2(lmax/weight), but we never reference it
> +        * directly, only the group.
> +        */
> +       struct qfq_group *grp;
> +
> +       /* these are copied from the flowset. */
> +       u32     inv_w;          /* ONE_FP/weight */
> +       u32     lmax;           /* Max packet size for this flow. */
> +};
> +
> +struct qfq_group {
> +       uint64_t S, F;                  /* group timestamps (approx). */
> +       unsigned int slot_shift;        /* Slot shift. */
> +       unsigned int index;             /* Group index. */
> +       unsigned int front;             /* Index of the front slot. */
> +       unsigned long full_slots;       /* non-empty slots */
> +
> +       /* Array of RR lists of active classes. */
> +       struct qfq_class *slots[QFQ_MAX_SLOTS];
> +};
> +
> +struct qfq_sched {
> +       struct tcf_proto *filter_list;
> +       struct Qdisc_class_hash clhash;
> +
> +       uint64_t        V;              /* Precise virtual time. */
> +       u32 wsum;                       /* weight sum */
> +
> +       unsigned long bitmaps[QFQ_MAX_STATE];       /* Group bitmaps. */
> +       struct qfq_group groups[QFQ_MAX_INDEX + 1]; /* The groups. */
> +};
> +
> +static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct Qdisc_class_common *clc;
> +
> +       clc = qdisc_class_find(&q->clhash, classid);
> +       if (clc == NULL)
> +               return NULL;
> +       return container_of(clc, struct qfq_class, common);
> +}
> +
> +static void qfq_purge_queue(struct qfq_class *cl)
> +{
> +       unsigned int len = cl->qdisc->q.qlen;
> +
> +       qdisc_reset(cl->qdisc);
> +       qdisc_tree_decrease_qlen(cl->qdisc, len);
> +}
> +
> +static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
> +       [TCA_QFQ_WEIGHT] = { .type = NLA_U32 },
> +       [TCA_QFQ_LMAX] = { .type = NLA_U32 },
> +};
> +
> +/*
> + * Calculate a flow index, given its weight and maximum packet length.
> + * index = log_2(maxlen/weight) but we need to apply the scaling.
> + * This is used only once at flow creation.
> + */
> +static int qfq_calc_index(u32 inv_w, unsigned int maxlen)
> +{
> +       u64 slot_size = (u64)maxlen *inv_w;
> +       unsigned long size_map;
> +       int index = 0;
> +
> +       size_map = slot_size >> QFQ_MIN_SLOT_SHIFT;
> +       if (!size_map)
> +               goto out;
> +
> +       index = __fls(size_map) + 1;    /* basically a log_2 */
> +       index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));
> +
> +       if (index < 0)
> +               index = 0;
> +out:
> +       pr_debug("qfq calc_index: W = %lu, L = %u, I = %d\n",
> +                (unsigned long) ONE_FP/inv_w, maxlen, index);
> +
> +       return index;
> +}
> +
> +static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
> +                           struct nlattr **tca, unsigned long *arg)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_class *cl = (struct qfq_class *)*arg;
> +       struct nlattr *tb[TCA_QFQ_MAX + 1];
> +       u32 weight, lmax, inv_w;
> +       int i, err;
> +
> +       if (tca[TCA_OPTIONS] == NULL)
> +               return -EINVAL;
> +
> +       err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy);
> +       if (err < 0)
> +               return err;
> +
> +       if (tb[TCA_QFQ_WEIGHT]) {
> +               weight = nla_get_u32(tb[TCA_QFQ_WEIGHT]);
> +               if (!weight || weight > (1UL << QFQ_MAX_WSHIFT)) {
> +                       pr_notice("qfq: invalid weight %u\n", weight);
> +                       return -EINVAL;
> +               }
> +       } else
> +               weight = 1;
> +
> +       inv_w = ONE_FP / weight;
> +       weight = ONE_FP / inv_w;
> +       if (q->wsum + weight > QFQ_MAX_WSUM) {
> +               pr_notice("qfq: total weight out of range (%u + %u)\n",
> +                         weight, q->wsum);
> +               return -EINVAL;
> +       }
> +
> +       if (tb[TCA_QFQ_LMAX]) {
> +               lmax = nla_get_u32(tb[TCA_QFQ_LMAX]);
> +               if (!lmax || lmax > (1UL << QFQ_MTU_SHIFT)) {
> +                       pr_notice("qfq: invalid max length %u\n", lmax);
> +                       return -EINVAL;
> +               }
> +       } else
> +               lmax = 1UL << QFQ_MTU_SHIFT;
> +
> +       if (cl != NULL) {
> +               if (tca[TCA_RATE]) {
> +                       err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
> +                                                   qdisc_root_sleeping_lock(sch),
> +                                                   tca[TCA_RATE]);
> +                       if (err)
> +                               return err;
> +               }
> +
> +               sch_tree_lock(sch);
> +               if (tb[TCA_QFQ_WEIGHT]) {
> +                       q->wsum = weight - ONE_FP / cl->inv_w;
> +                       cl->inv_w = inv_w;
> +               }
> +               sch_tree_unlock(sch);
> +
> +               return 0;
> +       }
> +
> +       cl = kzalloc(sizeof(struct qfq_class), GFP_KERNEL);
> +       if (cl == NULL)
> +               return -ENOBUFS;
> +
> +       cl->refcnt = 1;
> +       cl->common.classid = classid;
> +       cl->lmax = lmax;
> +       cl->inv_w = inv_w;
> +       i = qfq_calc_index(cl->inv_w, cl->lmax);
> +
> +       cl->grp = &q->groups[i];
> +       q->wsum += weight;
> +
> +       cl->qdisc = qdisc_create_dflt(sch->dev_queue,
> +                                     &pfifo_qdisc_ops, classid);
> +       if (cl->qdisc == NULL)
> +               cl->qdisc = &noop_qdisc;
> +
> +       if (tca[TCA_RATE]) {
> +               err = gen_new_estimator(&cl->bstats, &cl->rate_est,
> +                                       qdisc_root_sleeping_lock(sch),
> +                                       tca[TCA_RATE]);
> +               if (err) {
> +                       qdisc_destroy(cl->qdisc);
> +                       kfree(cl);
> +                       return err;
> +               }
> +       }
> +
> +       sch_tree_lock(sch);
> +       qdisc_class_hash_insert(&q->clhash, &cl->common);
> +       sch_tree_unlock(sch);
> +
> +       qdisc_class_hash_grow(sch, &q->clhash);
> +
> +       *arg = (unsigned long)cl;
> +       return 0;
> +}
> +
> +static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
> +{
> +       struct qfq_sched *q = (struct qfq_sched *)sch;
> +
> +       if (cl->inv_w) {
> +               q->wsum -= ONE_FP / cl->inv_w;
> +               cl->inv_w = 0;
> +       }
> +
> +       gen_kill_estimator(&cl->bstats, &cl->rate_est);
> +       qdisc_destroy(cl->qdisc);
> +       kfree(cl);
> +}
> +
> +static int qfq_delete_class(struct Qdisc *sch, unsigned long arg)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +
> +       if (cl->filter_cnt > 0)
> +               return -EBUSY;
> +
> +       sch_tree_lock(sch);
> +
> +       qfq_purge_queue(cl);
> +       qdisc_class_hash_remove(&q->clhash, &cl->common);
> +
> +       if (--cl->refcnt == 0)
> +               qfq_destroy_class(sch, cl);
> +
> +       sch_tree_unlock(sch);
> +       return 0;
> +}
> +
> +static unsigned long qfq_get_class(struct Qdisc *sch, u32 classid)
> +{
> +       struct qfq_class *cl = qfq_find_class(sch, classid);
> +
> +       if (cl != NULL)
> +               cl->refcnt++;
> +
> +       return (unsigned long)cl;
> +}
> +
> +static void qfq_put_class(struct Qdisc *sch, unsigned long arg)
> +{
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +
> +       if (--cl->refcnt == 0)
> +               qfq_destroy_class(sch, cl);
> +}
> +
> +static struct tcf_proto **qfq_tcf_chain(struct Qdisc *sch, unsigned long cl)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +
> +       if (cl)
> +               return NULL;
> +
> +       return &q->filter_list;
> +}
> +
> +static unsigned long qfq_bind_tcf(struct Qdisc *sch, unsigned long parent,
> +                                 u32 classid)
> +{
> +       struct qfq_class *cl = qfq_find_class(sch, classid);
> +
> +       if (cl != NULL)
> +               cl->filter_cnt++;
> +
> +       return (unsigned long)cl;
> +}
> +
> +static void qfq_unbind_tcf(struct Qdisc *sch, unsigned long arg)
> +{
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +
> +       cl->filter_cnt--;
> +}
> +
> +static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
> +                          struct Qdisc *new, struct Qdisc **old)
> +{
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +
> +       if (new == NULL) {
> +               new = qdisc_create_dflt(sch->dev_queue,
> +                                       &pfifo_qdisc_ops, cl->common.classid);
> +               if (new == NULL)
> +                       new = &noop_qdisc;
> +       }
> +
> +       sch_tree_lock(sch);
> +       qfq_purge_queue(cl);
> +       *old = cl->qdisc;
> +       cl->qdisc = new;
> +       sch_tree_unlock(sch);
> +       return 0;
> +}
> +
> +static struct Qdisc *qfq_class_leaf(struct Qdisc *sch, unsigned long arg)
> +{
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +
> +       return cl->qdisc;
> +}
> +
> +static int qfq_dump_class(struct Qdisc *sch, unsigned long arg,
> +                         struct sk_buff *skb, struct tcmsg *tcm)
> +{
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +       struct nlattr *nest;
> +
> +       tcm->tcm_parent = TC_H_ROOT;
> +       tcm->tcm_handle = cl->common.classid;
> +       tcm->tcm_info   = cl->qdisc->handle;
> +
> +       nest = nla_nest_start(skb, TCA_OPTIONS);
> +       if (nest == NULL)
> +               goto nla_put_failure;
> +       NLA_PUT_U32(skb, TCA_QFQ_WEIGHT, ONE_FP/cl->inv_w);
> +       NLA_PUT_U32(skb, TCA_QFQ_LMAX, cl->lmax);
> +       return nla_nest_end(skb, nest);
> +
> +nla_put_failure:
> +       nla_nest_cancel(skb, nest);
> +       return -EMSGSIZE;
> +}
> +
> +static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
> +                               struct gnet_dump *d)
> +{
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +       struct tc_qfq_stats xstats;
> +
> +       memset(&xstats, 0, sizeof(xstats));
> +
> +       xstats.weight = ONE_FP/cl->inv_w;
> +       xstats.lmax = cl->lmax;
> +
> +       if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
> +           gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
> +           gnet_stats_copy_queue(d, &cl->qdisc->qstats) < 0)
> +               return -1;
> +
> +       return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
> +}
> +
> +static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_class *cl;
> +       struct hlist_node *n;
> +       unsigned int i;
> +
> +       if (arg->stop)
> +               return;
> +
> +       for (i = 0; i < q->clhash.hashsize; i++) {
> +               hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
> +                       if (arg->count < arg->skip) {
> +                               arg->count++;
> +                               continue;
> +                       }
> +                       if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
> +                               arg->stop = 1;
> +                               return;
> +                       }
> +                       arg->count++;
> +               }
> +       }
> +}
> +
> +static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch,
> +                                     int *qerr)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_class *cl;
> +       struct tcf_result res;
> +       int result;
> +
> +       if (TC_H_MAJ(skb->priority ^ sch->handle) == 0) {
> +               cl = qfq_find_class(sch, skb->priority);
> +               if (cl != NULL)
> +                       return cl;
> +       }
> +
> +       *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
> +       result = tc_classify(skb, q->filter_list, &res);
> +       if (result >= 0) {
> +#ifdef CONFIG_NET_CLS_ACT
> +               switch (result) {
> +               case TC_ACT_QUEUED:
> +               case TC_ACT_STOLEN:
> +                       *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
> +               case TC_ACT_SHOT:
> +                       return NULL;
> +               }
> +#endif
> +               cl = (struct qfq_class *)res.class;
> +               if (cl == NULL)
> +                       cl = qfq_find_class(sch, res.classid);
> +               return cl;
> +       }
> +
> +       return NULL;
> +}
> +
> +/* Generic comparison function, handling wraparound. */
> +static inline int qfq_gt(u64 a, u64 b)
> +{
> +       return (s64)(a - b) > 0;
> +}
> +
> +/* Round a precise timestamp to its slotted value. */
> +static inline u64 qfq_round_down(u64 ts, unsigned int shift)
> +{
> +       return ts & ~((1ULL << shift) - 1);
> +}
> +
> +/* return the pointer to the group with lowest index in the bitmap */
> +static inline struct qfq_group *qfq_ffs(struct qfq_sched *q,
> +                                       unsigned long bitmap)
> +{
> +       int index = __ffs(bitmap); // zero-based
> +       return &q->groups[index];
> +}
> +/* Calculate a mask to mimic what would be ffs_from(). */
> +static inline unsigned long mask_from(unsigned long bitmap, int from)
> +{
> +       return bitmap & ~((1UL << from) - 1);
> +}
> +
> +/*
> + * The state computation relies on ER=0, IR=1, EB=2, IB=3
> + * First compute eligibility comparing grp->S, q->V,
> + * then check if someone is blocking us and possibly add EB
> + */
> +static int qfq_calc_state(struct qfq_sched *q, const struct qfq_group *grp)
> +{
> +       /* if S > V we are not eligible */
> +       unsigned int state = qfq_gt(grp->S, q->V);
> +       unsigned long mask = mask_from(q->bitmaps[ER], grp->index);
> +       struct qfq_group *next;
> +
> +       if (mask) {
> +               next = qfq_ffs(q, mask);
> +               if (qfq_gt(grp->F, next->F))
> +                       state |= EB;
> +       }
> +
> +       return state;
> +}
> +
> +
> +/*
> + * In principle
> + *     q->bitmaps[dst] |= q->bitmaps[src] & mask;
> + *     q->bitmaps[src] &= ~mask;
> + * but we should make sure that src != dst
> + */
> +static inline void qfq_move_groups(struct qfq_sched *q, unsigned long mask,
> +                                  int src, int dst)
> +{
> +       q->bitmaps[dst] |= q->bitmaps[src] & mask;
> +       q->bitmaps[src] &= ~mask;
> +}
> +
> +static void qfq_unblock_groups(struct qfq_sched *q, int index, u64 old_F)
> +{
> +       unsigned long mask = mask_from(q->bitmaps[ER], index + 1);
> +       struct qfq_group *next;
> +
> +       if (mask) {
> +               next = qfq_ffs(q, mask);
> +               if (!qfq_gt(next->F, old_F))
> +                       return;
> +       }
> +
> +       mask = (1UL << index) - 1;
> +       qfq_move_groups(q, mask, EB, ER);
> +       qfq_move_groups(q, mask, IB, IR);
> +}
> +
> +/*
> + * perhaps
> + *
> +       old_V ^= q->V;
> +       old_V >>= QFQ_MIN_SLOT_SHIFT;
> +       if (old_V) {
> +               ...
> +       }
> + *
> + */
> +static void qfq_make_eligible(struct qfq_sched *q, u64 old_V)
> +{
> +       unsigned long vslot = q->V >> QFQ_MIN_SLOT_SHIFT;
> +       unsigned long old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;
> +
> +       if (vslot != old_vslot) {
> +               unsigned long mask = (1UL << fls(vslot ^ old_vslot)) - 1;
> +               qfq_move_groups(q, mask, IR, ER);
> +               qfq_move_groups(q, mask, IB, EB);
> +       }
> +}
> +
> +/*
> + * XXX we should make sure that slot becomes less than 32.
> + * This is guaranteed by the input values.
> + * roundedS is always cl->S rounded on grp->slot_shift bits.
> + */
> +static void qfq_slot_insert(struct qfq_group *grp, struct qfq_class *cl,
> +                                  u64 roundedS)
> +{
> +       u64 slot = (roundedS - grp->S) >> grp->slot_shift;
> +       unsigned int i = (grp->front + slot) % QFQ_MAX_SLOTS;
> +
> +       cl->next = grp->slots[i];
> +       grp->slots[i] = cl;
> +       __set_bit(slot, &grp->full_slots);
> +}
> +
> +/*
> + * remove the entry from the slot
> + */
> +static void qfq_front_slot_remove(struct qfq_group *grp)
> +{
> +       struct qfq_class **h = &grp->slots[grp->front];
> +
> +       *h = (*h)->next;
> +       if (!*h)
> +               __clear_bit(0, &grp->full_slots);
> +}
> +
> +/*
> + * Returns the first full queue in a group. As a side effect,
> + * adjust the bucket list so the first non-empty bucket is at
> + * position 0 in full_slots.
> + */
> +static struct qfq_class *qfq_slot_scan(struct qfq_group *grp)
> +{
> +       unsigned int i;
> +
> +       pr_debug("qfq slot_scan: grp %u full %#lx\n",
> +                grp->index, grp->full_slots);
> +
> +       if (!grp->full_slots)
> +               return NULL;
> +
> +       i = __ffs(grp->full_slots);  /* zero based */
> +       if (i > 0) {
> +               grp->front = (grp->front + i) % QFQ_MAX_SLOTS;
> +               grp->full_slots >>= i;
> +       }
> +
> +       return grp->slots[grp->front];
> +}
> +
> +/*
> + * adjust the bucket list. When the start time of a group decreases,
> + * we move the index down (modulo QFQ_MAX_SLOTS) so we don't need to
> + * move the objects. The mask of occupied slots must be shifted
> + * because we use ffs() to find the first non-empty slot.
> + * This covers decreases in the group's start time, but what about
> + * increases of the start time ?
> + * Here too we should make sure that i is less than 32
> + */
> +static void qfq_slot_rotate(struct qfq_group *grp, u64 roundedS)
> +{
> +       unsigned int i = (grp->S - roundedS) >> grp->slot_shift;
> +
> +       grp->full_slots <<= i;
> +       grp->front = (grp->front - i) % QFQ_MAX_SLOTS;
> +}
> +
> +static void qfq_update_eligible(struct qfq_sched *q, u64 old_V)
> +{
> +       struct qfq_group *grp;
> +       unsigned long ineligible;
> +
> +       ineligible = q->bitmaps[IR] | q->bitmaps[IB];
> +       if (ineligible) {
> +               if (!q->bitmaps[ER]) {
> +                       grp = qfq_ffs(q, ineligible);
> +                       if (qfq_gt(grp->S, q->V))
> +                               q->V = grp->S;
> +               }
> +               qfq_make_eligible(q, old_V);
> +       }
> +}
> +
> +/* What is length of next packet in queue (0 if queue is empty) */
> +static unsigned int qdisc_peek_len(struct Qdisc *sch)
> +{
> +       struct sk_buff *skb;
> +
> +       skb = sch->ops->peek(sch);
> +       return skb ? qdisc_pkt_len(skb) : 0;
> +}
> +
> +/*
> + * Updates the class, returns true if also the group needs to be updated.
> + */
> +static bool qfq_update_class(struct qfq_group *grp, struct qfq_class *cl)
> +{
> +       unsigned int len = qdisc_peek_len(cl->qdisc);
> +
> +       cl->S = cl->F;
> +       if (!len)
> +               qfq_front_slot_remove(grp);     /* queue is empty */
> +       else {
> +               u64 roundedS;
> +
> +               cl->F = cl->S + (u64)len * cl->inv_w;
> +               roundedS = qfq_round_down(cl->S, grp->slot_shift);
> +               if (roundedS == grp->S)
> +                       return false;
> +
> +               qfq_front_slot_remove(grp);
> +               qfq_slot_insert(grp, cl, roundedS);
> +       }
> +
> +       return true;
> +}
> +
> +static struct sk_buff *qfq_dequeue(struct Qdisc *sch)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_group *grp;
> +       struct qfq_class *cl;
> +       struct sk_buff *skb;
> +       unsigned int len;
> +       u64 old_V;
> +
> +       if (!q->bitmaps[ER])
> +               return NULL;
> +
> +       grp = qfq_ffs(q, q->bitmaps[ER]);
> +
> +       cl = grp->slots[grp->front];
> +       skb = qdisc_dequeue_peeked(cl->qdisc);
> +       if (!skb) {
> +               WARN_ONCE(1, "qfq_dequeue: non-workconserving leaf\n");
> +               return NULL;
> +       }
> +
> +       sch->q.qlen--;
> +
> +       old_V = q->V;
> +       len = qdisc_pkt_len(skb);
> +       q->V += (u64)len * IWSUM;
> +       pr_debug("qfq enqueue: len %u F %lld now %lld\n",
> +                len, (unsigned long long) cl->F, (unsigned long long) q->V);
> +
> +       if (qfq_update_class(grp, cl)) {
> +               u64 old_F = grp->F;
> +
> +               cl = qfq_slot_scan(grp);
> +               if (!cl)
> +                       __clear_bit(grp->index, &q->bitmaps[ER]);
> +               else {
> +                       u64 roundedS = qfq_round_down(cl->S, grp->slot_shift);
> +                       unsigned int s;
> +
> +                       if (grp->S == roundedS)
> +                               goto skip_unblock;
> +                       grp->S = roundedS;
> +                       grp->F = roundedS + (2ULL << grp->slot_shift);
> +                       __clear_bit(grp->index, &q->bitmaps[ER]);
> +                       s = qfq_calc_state(q, grp);
> +                       __set_bit(grp->index, &q->bitmaps[s]);
> +               }
> +
> +               qfq_unblock_groups(q, grp->index, old_F);
> +       }
> +
> +skip_unblock:
> +       qfq_update_eligible(q, old_V);
> +
> +       return skb;
> +}
> +
> +/*
> + * Assign a reasonable start time for a new flow k in group i.
> + * Admissible values for \hat(F) are multiples of \sigma_i
> + * no greater than V+\sigma_i . Larger values mean that
> + * we had a wraparound so we consider the timestamp to be stale.
> + *
> + * If F is not stale and F >= V then we set S = F.
> + * Otherwise we should assign S = V, but this may violate
> + * the ordering in ER. So, if we have groups in ER, set S to
> + * the F_j of the first group j which would be blocking us.
> + * We are guaranteed not to move S backward because
> + * otherwise our group i would still be blocked.
> + */
> +static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
> +{
> +       unsigned long mask;
> +       uint32_t limit, roundedF;
> +       int slot_shift = cl->grp->slot_shift;
> +
> +       roundedF = qfq_round_down(cl->F, slot_shift);
> +       limit = qfq_round_down(q->V, slot_shift) + (1UL << slot_shift);
> +
> +       if (!qfq_gt(cl->F, q->V) || qfq_gt(roundedF, limit)) {
> +               /* timestamp was stale */
> +               mask = mask_from(q->bitmaps[ER], cl->grp->index);
> +               if (mask) {
> +                       struct qfq_group *next = qfq_ffs(q, mask);
> +                       if (qfq_gt(roundedF, next->F)) {
> +                               cl->S = next->F;
> +                               return;
> +                       }
> +               }
> +               cl->S = q->V;
> +       } else { /* timestamp is not stale */
> +               cl->S = cl->F;
> +       }
> +}
> +
> +static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_group *grp;
> +       struct qfq_class *cl;
> +       unsigned int len;
> +       int err;
> +       u64 roundedS;
> +       int s;
> +
> +       cl = qfq_classify(skb, sch, &err);
> +       if (cl == NULL || cl->qdisc->q.qlen > 80) {
> +               if (err & __NET_XMIT_BYPASS)
> +                       sch->qstats.drops++;
> +               kfree_skb(skb);
> +               return err;
> +       }
> +
> +       len = qdisc_pkt_len(skb);
> +       err = qdisc_enqueue(skb, cl->qdisc);
> +       if (unlikely(err != NET_XMIT_SUCCESS)) {
> +               if (net_xmit_drop_count(err)) {
> +                       cl->qstats.drops++;
> +                       sch->qstats.drops++;
> +               }
> +               return err;
> +       }
> +
> +       cl->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;

Hmm, no other packet scheduler accounts packets in this way. Which
one is better? I am not sure. Also, in this patch, qstats.drops isn't
maintained in the same way. Shouldn't these two be consistent?

> +       cl->bstats.bytes += qdisc_pkt_len(skb);
> +
> +       sch->q.qlen++;
> +       sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
> +       sch->bstats.bytes += qdisc_pkt_len(skb);
> +
> +       if (qdisc_peek_head(sch) != skb)
> +               return err;

I suspect that it is wrong.

Here is the pseudocode from the paper:

5  if (flow.queue.head != pkt)
6      return; // Flow already backlogged, we are done

So the correct code should be:
    if (qdisc_peek_head(cl->qdisc) != skb)
           return err;

However, we can't assume the cl->qdisc is work conserving, so the code
should be:
   if (cl->qdisc->q.qlen > 1)
          return err;

> +
> +       /* If reach this point, queue q was idle */
> +       grp = cl->grp;
> +       qfq_update_start(q, cl);
> +
> +       /* compute new finish time and rounded start. */
> +       cl->F = cl->S + (u64)qdisc_pkt_len(skb) * cl->inv_w;
> +       roundedS = qfq_round_down(cl->S, grp->slot_shift);
> +
> +       /*
> +        * insert cl in the correct bucket.
> +        * If cl->S >= grp->S we don't need to adjust the
> +        * bucket list and simply go to the insertion phase.
> +        * Otherwise grp->S is decreasing, we must make room
> +        * in the bucket list, and also recompute the group state.
> +        * Finally, if there were no flows in this group and nobody
> +        * was in ER make sure to adjust V.
> +        */
> +       if (grp->full_slots) {
> +               if (!qfq_gt(grp->S, cl->S))
> +                       goto skip_update;
> +
> +               /* create a slot for this cl->S */
> +               qfq_slot_rotate(grp, roundedS);
> +               /* group was surely ineligible, remove */
> +               __clear_bit(grp->index, &q->bitmaps[IR]);
> +               __clear_bit(grp->index, &q->bitmaps[IB]);
> +       } else if (!q->bitmaps[ER] && qfq_gt(roundedS, q->V))
> +               q->V = roundedS;
> +
> +       grp->S = roundedS;
> +       grp->F = roundedS + (2ULL << grp->slot_shift);
> +       s = qfq_calc_state(q, grp);
> +       __set_bit(grp->index, &q->bitmaps[s]);
> +
> +       pr_debug("qfq enqueue: new state %d %#lx S %lld F %lld V %lld\n",
> +                s, q->bitmaps[s],
> +                (unsigned long long) cl->S,
> +                (unsigned long long) cl->F,
> +                (unsigned long long) q->V);
> +
> +skip_update:
> +       qfq_slot_insert(grp, cl, roundedS);
> +
> +       return err;
> +}
> +
> +
> +static void qfq_slot_remove(struct qfq_sched *q, struct qfq_group *grp,
> +                           struct qfq_class *cl, struct qfq_class **pprev)
> +{
> +       unsigned int i, offset;
> +       u64 roundedS;
> +
> +       roundedS = qfq_round_down(cl->S, grp->slot_shift);
> +       offset = (roundedS - grp->S) >> grp->slot_shift;
> +       i = (grp->front + offset) % QFQ_MAX_SLOTS;
> +
> +       if (!pprev) {
> +               pprev = &grp->slots[i];
> +               while (*pprev && *pprev != cl)
> +                       pprev = &(*pprev)->next;
> +       }
> +
> +       *pprev = cl->next;
> +       if (!grp->slots[i])
> +               __clear_bit(offset, &grp->full_slots);
> +}
> +
> +/*
> + * called to forcibly destroy a queue.
> + * If the queue is not in the front bucket, or if it has
> + * other queues in the front bucket, we can simply remove
> + * the queue with no other side effects.
> + * Otherwise we must propagate the event up.
> + */
> +static void qfq_deactivate_class(struct qfq_sched *q, struct qfq_class *cl,
> +                                struct qfq_class **pprev)
> +{
> +       struct qfq_group *grp = cl->grp;
> +       unsigned long mask;
> +       u64 roundedS;
> +       int s;
> +
> +       cl->F = cl->S;
> +       qfq_slot_remove(q, grp, cl, pprev);
> +
> +       if (!grp->full_slots) {
> +               __clear_bit(grp->index, &q->bitmaps[IR]);
> +               __clear_bit(grp->index, &q->bitmaps[EB]);
> +               __clear_bit(grp->index, &q->bitmaps[IB]);
> +
> +               if (test_bit(grp->index, &q->bitmaps[ER]) &&
> +                   !(q->bitmaps[ER] & ~((1UL << grp->index) - 1))) {
> +                       mask = q->bitmaps[ER] & ((1UL << grp->index) - 1);
> +                       if (mask)
> +                               mask = ~((1UL << __fls(mask)) - 1);
> +                       else
> +                               mask = ~0UL;
> +                       qfq_move_groups(q, mask, EB, ER);
> +                       qfq_move_groups(q, mask, IB, IR);
> +               }
> +               __clear_bit(grp->index, &q->bitmaps[ER]);
> +       } else if (!grp->slots[grp->front]) {
> +               cl = qfq_slot_scan(grp);
> +               roundedS = qfq_round_down(cl->S, grp->slot_shift);
> +               if (grp->S != roundedS) {
> +                       __clear_bit(grp->index, &q->bitmaps[ER]);
> +                       __clear_bit(grp->index, &q->bitmaps[IR]);
> +                       __clear_bit(grp->index, &q->bitmaps[EB]);
> +                       __clear_bit(grp->index, &q->bitmaps[IB]);
> +                       grp->S = roundedS;
> +                       grp->F = roundedS + (2ULL << grp->slot_shift);
> +                       s = qfq_calc_state(q, grp);
> +                       __set_bit(grp->index, &q->bitmaps[s]);
> +               }
> +       }
> +
> +       qfq_update_eligible(q, q->V);
> +}
> +
> +static void qfq_qlen_notify(struct Qdisc *sch, unsigned long arg)
> +{
> +       struct qfq_sched *q = (struct qfq_sched *)sch;
> +       struct qfq_class *cl = (struct qfq_class *)arg;
> +
> +       if (cl->qdisc->q.qlen == 0)
> +               qfq_deactivate_class(q, cl, NULL);
> +}
> +
> +static unsigned int qfq_drop(struct Qdisc *sch)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_group *grp;
> +       struct qfq_class *cl, **pp;
> +       unsigned int i, j, len;
> +
> +       for (i = 0; i <= QFQ_MAX_INDEX; i++) {
> +               grp = &q->groups[i];
> +               for (j = 0; j < QFQ_MAX_SLOTS; j++) {
> +                       for (pp = &grp->slots[j]; *pp; pp = &(*pp)->next) {
> +                               cl = *pp;
> +                               if (!cl->qdisc->ops->drop)
> +                                       continue;
> +
> +                               len = cl->qdisc->ops->drop(cl->qdisc);
> +                               if (len > 0) {
> +                                       sch->q.qlen--;
> +                                       if (!cl->qdisc->q.qlen)
> +                                               qfq_deactivate_class(q, cl, pp);
> +
> +                                       return len;
> +                               }
> +                       }
> +               }
> +       }
> +
> +       return 0;
> +}
> +
> +static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_group *grp;
> +       int i, err;
> +
> +       err = qdisc_class_hash_init(&q->clhash);
> +       if (err < 0)
> +               return err;
> +
> +       for (i = 0; i <= QFQ_MAX_INDEX; i++) {
> +               grp = &q->groups[i];
> +               grp->index = i;
> +       }
> +
> +       return 0;
> +}
> +
> +static void qfq_reset_qdisc(struct Qdisc *sch)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_group *grp;
> +       struct qfq_class *cl, **pp;
> +       struct hlist_node *n;
> +       unsigned int i, j;
> +
> +       for (i = 0; i <= QFQ_MAX_INDEX; i++) {
> +               grp = &q->groups[i];
> +               for (j = 0; j < QFQ_MAX_SLOTS; j++) {
> +                       for (pp = &grp->slots[j]; *pp; pp = &(*pp)->next) {
> +                               cl = *pp;
> +                               if (cl->qdisc->q.qlen)
> +                                       qfq_deactivate_class(q, cl, pp);
> +                       }
> +               }
> +       }
> +
> +       for (i = 0; i < q->clhash.hashsize; i++) {
> +               hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
> +                       qdisc_reset(cl->qdisc);
> +       }
> +       sch->q.qlen = 0;
> +}
> +
> +static void qfq_destroy_qdisc(struct Qdisc *sch)
> +{
> +       struct qfq_sched *q = qdisc_priv(sch);
> +       struct qfq_class *cl;
> +       struct hlist_node *n, *next;
> +       unsigned int i;
> +
> +       tcf_destroy_chain(&q->filter_list);
> +
> +       for (i = 0; i < q->clhash.hashsize; i++) {
> +               hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
> +                                         common.hnode)
> +                       qfq_destroy_class(sch, cl);
> +       }
> +       qdisc_class_hash_destroy(&q->clhash);
> +}
> +
> +static const struct Qdisc_class_ops qfq_class_ops = {
> +       .change         = qfq_change_class,
> +       .delete         = qfq_delete_class,
> +       .get            = qfq_get_class,
> +       .put            = qfq_put_class,
> +       .tcf_chain      = qfq_tcf_chain,
> +       .bind_tcf       = qfq_bind_tcf,
> +       .unbind_tcf     = qfq_unbind_tcf,
> +       .graft          = qfq_graft_class,
> +       .leaf           = qfq_class_leaf,
> +       .qlen_notify    = qfq_qlen_notify,
> +       .dump           = qfq_dump_class,
> +       .dump_stats     = qfq_dump_class_stats,
> +       .walk           = qfq_walk,
> +};
> +
> +static struct Qdisc_ops qfq_qdisc_ops __read_mostly = {
> +       .cl_ops         = &qfq_class_ops,
> +       .id             = "qfq",
> +       .priv_size      = sizeof(struct qfq_sched),
> +       .enqueue        = qfq_enqueue,
> +       .dequeue        = qfq_dequeue,
> +       .peek           = qdisc_peek_dequeued,
> +       .drop           = qfq_drop,
> +       .init           = qfq_init_qdisc,
> +       .reset          = qfq_reset_qdisc,
> +       .destroy        = qfq_destroy_qdisc,
> +       .owner          = THIS_MODULE,
> +};
> +
> +static int __init qfq_init(void)
> +{
> +       return register_qdisc(&qfq_qdisc_ops);
> +}
> +
> +static void __exit qfq_exit(void)
> +{
> +       unregister_qdisc(&qfq_qdisc_ops);
> +}
> +
> +module_init(qfq_init);
> +module_exit(qfq_exit);
> +MODULE_LICENSE("GPL");
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>



-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH v2] net: ppp: use {get,put}_unaligned_be{16,32}
From: David Miller @ 2011-01-08  1:15 UTC (permalink / raw)
  To: xiaosuo; +Cc: paulus, harvey.harrison, linux-ppp, netdev
In-Reply-To: <AANLkTim=1GCah0qp8HJK31LWXPb5vAZYp+d2BTQM+Q+B@mail.gmail.com>

From: Changli Gao <xiaosuo@gmail.com>
Date: Sat, 8 Jan 2011 08:43:01 +0800

> On Fri, Jan 7, 2011 at 11:01 AM, Paul Mackerras <paulus@samba.org> wrote:
>> On Fri, Jan 07, 2011 at 07:37:36AM +0800, Changli Gao wrote:
>>
>>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>>
>> This patch description is inadequate.  It should tell us why you are
>> making this change.  Does it result in smaller and/or faster code, and
>> if so by how much on what sort of machine?  Do you think it makes the
>> code clearer?  (I don't.)  Or is there some other motivation for this?
>>
> 
> Well-designed APIs always make code clearer, smaller and faster. I
> think that is obvious enough.

I have to say that every time I go read the header parsing code in the
PPP driver, I absolutely regret it.

And Changli's patch fixes some of the readability problems.

^ permalink raw reply

* Re: [PATCH v2] net: ppp: use {get,put}_unaligned_be{16,32}
From: Changli Gao @ 2011-01-08  0:43 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: David S. Miller, Harvey Harrison, linux-ppp, netdev
In-Reply-To: <20110107030145.GA8021@brick.ozlabs.ibm.com>

On Fri, Jan 7, 2011 at 11:01 AM, Paul Mackerras <paulus@samba.org> wrote:
> On Fri, Jan 07, 2011 at 07:37:36AM +0800, Changli Gao wrote:
>
>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>
> This patch description is inadequate.  It should tell us why you are
> making this change.  Does it result in smaller and/or faster code, and
> if so by how much on what sort of machine?  Do you think it makes the
> code clearer?  (I don't.)  Or is there some other motivation for this?
>

Well-designed APIs always make code clearer, smaller and faster. I
think that is obvious enough.

The names of the functions imply the endianness, like comments. On
some MIPS architectures which support unaligned load and store
instructions, the APIs result in smaller and faster code.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [GIT] Networking
From: Francois Romieu @ 2011-01-08  0:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ben Hutchings, David Miller, Hayes Wang, David Woodhouse, akpm,
	netdev, linux-kernel
In-Reply-To: <AANLkTinANp+Jp8TPKzii6iZLUFpMyrXhmxYr42nZrXQU@mail.gmail.com>

Linus Torvalds <torvalds@linux-foundation.org> :
[...]
> Hmm. I never even waited for 60 seconds. Maybe my boot would have
> continued after the delay.

The 60-second delay is defined here:

drivers/base/firmware_class.c
[...]
static int loading_timeout = 60;        /* In seconds */

It can be read and set through sysfs.

I'll try moving the request-firmware dependent stuff to
device-open time in drivers/net/r8169.c tomorrow morning. It's Friday.

-- 
Ueimor

^ permalink raw reply

* Re: [net-next-2.6 PATCH v7 2/2] net_sched: implement a root container qdisc sch_mqprio
From: Jarek Poplawski @ 2011-01-07 23:28 UTC (permalink / raw)
  To: John Fastabend
  Cc: davem, hadi, eric.dumazet, shemminger, tgraf, bhutchings, nhorman,
	netdev
In-Reply-To: <20110107224549.19830.3961.stgit@jf-dev1-dcblab>

On Fri, Jan 07, 2011 at 02:45:49PM -0800, John Fastabend wrote:
> This implements a mqprio queueing discipline that by default creates
> a pfifo_fast qdisc per tx queue and provides the needed configuration
> interface.
> 
> Using the mqprio qdisc the number of tcs currently in use along
> with the range of queues allotted to each class can be configured. By
> default skbs are mapped to traffic classes using the skb priority.
> This mapping is configurable.
> 
> Configurable parameters,
> 
> struct tc_mqprio_qopt {
>         __u8    num_tc;
>         __u8    prio_tc_map[TC_BITMASK + 1];
>         __u8    hw;
>         __u16   count[TC_MAX_QUEUE];
>         __u16   offset[TC_MAX_QUEUE];
> };
> 
> Here the count/offset pairing give the queue alignment and the
> prio_tc_map gives the mapping from skb->priority to tc.
> 
> The hw bit determines if the hardware should configure the count
> and offset values. If the hardware bit is set then the operation
> will fail if the hardware does not implement the ndo_setup_tc
> operation. This is to avoid undetermined states where the hardware
> may or may not control the queue mapping. Also minimal bounds
> checking is done on the count/offset to verify a queue does not
> exceed num_tx_queues and that queue ranges do not overlap. Otherwise
> it is left to user policy or hardware configuration to create
> useful mappings.
> 
> It is expected that hardware QOS schemes can be implemented by
> creating appropriate mappings of queues in ndo_setup_tc().
> 
> One expected use case is drivers will use the ndo_setup_tc to map
> queue ranges onto 802.1Q traffic classes. This provides a generic
> mechanism to map network traffic onto these traffic classes and
> removes the need for lower layer drivers to know specifics about
> traffic types.
> 
> Signed-off-by: John Fastabend <john.r.fastabend@intel.com>

Acked-by: Jarek Poplawski <jarkao2@gmail.com>

^ permalink raw reply

* [net-next-2.6 PATCH v7 2/2] net_sched: implement a root container qdisc sch_mqprio
From: John Fastabend @ 2011-01-07 22:45 UTC (permalink / raw)
  To: davem
  Cc: jarkao2, hadi, eric.dumazet, shemminger, tgraf, bhutchings,
	nhorman, netdev
In-Reply-To: <20110107224543.19830.74009.stgit@jf-dev1-dcblab>

This implements a mqprio queueing discipline that by default creates
a pfifo_fast qdisc per tx queue and provides the needed configuration
interface.

Using the mqprio qdisc the number of tcs currently in use along
with the range of queues allotted to each class can be configured. By
default skbs are mapped to traffic classes using the skb priority.
This mapping is configurable.

Configurable parameters,

struct tc_mqprio_qopt {
        __u8    num_tc;
        __u8    prio_tc_map[TC_BITMASK + 1];
        __u8    hw;
        __u16   count[TC_MAX_QUEUE];
        __u16   offset[TC_MAX_QUEUE];
};

Here the count/offset pairing gives the queue alignment and the
prio_tc_map gives the mapping from skb->priority to tc.

The hw bit determines if the hardware should configure the count
and offset values. If the hardware bit is set then the operation
will fail if the hardware does not implement the ndo_setup_tc
operation. This is to avoid undetermined states where the hardware
may or may not control the queue mapping. Also minimal bounds
checking is done on the count/offset to verify a queue does not
exceed num_tx_queues and that queue ranges do not overlap. Otherwise
it is left to user policy or hardware configuration to create
useful mappings.

It is expected that hardware QOS schemes can be implemented by
creating appropriate mappings of queues in ndo_setup_tc().

One expected use case is drivers will use the ndo_setup_tc to map
queue ranges onto 802.1Q traffic classes. This provides a generic
mechanism to map network traffic onto these traffic classes and
removes the need for lower layer drivers to know specifics about
traffic types.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 include/linux/pkt_sched.h |   12 +
 net/sched/Kconfig         |   12 +
 net/sched/Makefile        |    1 
 net/sched/sch_generic.c   |    4 
 net/sched/sch_mqprio.c    |  418 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 447 insertions(+), 0 deletions(-)
 create mode 100644 net/sched/sch_mqprio.c

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index 2cfa4bc..776cd93 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -481,4 +481,16 @@ struct tc_drr_stats {
 	__u32	deficit;
 };
 
+/* MQPRIO */
+#define TC_QOPT_BITMASK 15
+#define TC_QOPT_MAX_QUEUE 16
+
+struct tc_mqprio_qopt {
+	__u8	num_tc;
+	__u8	prio_tc_map[TC_QOPT_BITMASK + 1];
+	__u8	hw;
+	__u16	count[TC_QOPT_MAX_QUEUE];
+	__u16	offset[TC_QOPT_MAX_QUEUE];
+};
+
 #endif
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index a36270a..f52f5eb 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -205,6 +205,18 @@ config NET_SCH_DRR
 
 	  If unsure, say N.
 
+config NET_SCH_MQPRIO
+	tristate "Multi-queue priority scheduler (MQPRIO)"
+	help
+	  Say Y here if you want to use the Multi-queue Priority scheduler.
+	  This scheduler allows QOS to be offloaded on NICs that have support
+	  for offloading QOS schedulers.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called sch_mqprio.
+
+	  If unsure, say N.
+
 config NET_SCH_INGRESS
 	tristate "Ingress Qdisc"
 	depends on NET_CLS_ACT
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 960f5db..26ce681 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ)	+= sch_multiq.o
 obj-$(CONFIG_NET_SCH_ATM)	+= sch_atm.o
 obj-$(CONFIG_NET_SCH_NETEM)	+= sch_netem.o
 obj-$(CONFIG_NET_SCH_DRR)	+= sch_drr.o
+obj-$(CONFIG_NET_SCH_MQPRIO)	+= sch_mqprio.o
 obj-$(CONFIG_NET_CLS_U32)	+= cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)	+= cls_route.o
 obj-$(CONFIG_NET_CLS_FW)	+= cls_fw.o
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 34dc598..723b278 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -540,6 +540,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
 	.dump		=	pfifo_fast_dump,
 	.owner		=	THIS_MODULE,
 };
+EXPORT_SYMBOL(pfifo_fast_ops);
 
 struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
 			  struct Qdisc_ops *ops)
@@ -674,6 +675,7 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
 
 	return oqdisc;
 }
+EXPORT_SYMBOL(dev_graft_qdisc);
 
 static void attach_one_default_qdisc(struct net_device *dev,
 				     struct netdev_queue *dev_queue,
@@ -761,6 +763,7 @@ void dev_activate(struct net_device *dev)
 		dev_watchdog_up(dev);
 	}
 }
+EXPORT_SYMBOL(dev_activate);
 
 static void dev_deactivate_queue(struct net_device *dev,
 				 struct netdev_queue *dev_queue,
@@ -840,6 +843,7 @@ void dev_deactivate(struct net_device *dev)
 	list_add(&dev->unreg_list, &single);
 	dev_deactivate_many(&single);
 }
+EXPORT_SYMBOL(dev_deactivate);
 
 static void dev_init_scheduler_queue(struct net_device *dev,
 				     struct netdev_queue *dev_queue,
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
new file mode 100644
index 0000000..705bdfa
--- /dev/null
+++ b/net/sched/sch_mqprio.c
@@ -0,0 +1,418 @@
+/*
+ * net/sched/sch_mqprio.c
+ *
+ * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+
+struct mqprio_sched {
+	struct Qdisc		**qdiscs;
+	int hw_owned;
+};
+
+static void mqprio_destroy(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	unsigned int ntx;
+
+	if (!priv->qdiscs)
+		return;
+
+	for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
+		qdisc_destroy(priv->qdiscs[ntx]);
+
+	if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
+		dev->netdev_ops->ndo_setup_tc(dev, 0, dev->real_num_tx_queues);
+	else
+		netdev_set_num_tc(dev, 0);
+
+	kfree(priv->qdiscs);
+}
+
+static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
+{
+	int i, j;
+
+	/* Verify num_tc is not out of max range */
+	if (qopt->num_tc > TC_MAX_QUEUE)
+		return -EINVAL;
+
+	/* Verify priority mapping uses valid tcs */
+	for (i = 0; i < TC_BITMASK + 1; i++) {
+		if (qopt->prio_tc_map[i] >= qopt->num_tc)
+			return -EINVAL;
+	}
+
+	/* net_device does not support requested operation */
+	if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
+		return -EINVAL;
+
+	/* if hw owned qcount and qoffset are taken from LLD so
+	 * no reason to verify them here
+	 */
+	if (qopt->hw)
+		return 0;
+
+	for (i = 0; i < qopt->num_tc; i++) {
+		unsigned int last = qopt->offset[i] + qopt->count[i];
+
+		/* Verify the queue count is in tx range being equal to the
+		 * real_num_tx_queues indicates the last queue is in use.
+		 */
+		if (qopt->offset[i] >= dev->real_num_tx_queues ||
+		    !qopt->count[i] ||
+		    last > dev->real_num_tx_queues)
+			return -EINVAL;
+
+		/* Verify that the offset and counts do not overlap */
+		for (j = i + 1; j < qopt->num_tc; j++) {
+			if (last > qopt->offset[j])
+				return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	struct netdev_queue *dev_queue;
+	struct Qdisc *qdisc;
+	int i, err = -EOPNOTSUPP;
+	struct tc_mqprio_qopt *qopt = NULL;
+
+	BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
+	BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
+
+	if (sch->parent != TC_H_ROOT)
+		return -EOPNOTSUPP;
+
+	if (!netif_is_multiqueue(dev))
+		return -EOPNOTSUPP;
+
+	if (nla_len(opt) < sizeof(*qopt))
+		return -EINVAL;
+
+	qopt = nla_data(opt);
+	if (mqprio_parse_opt(dev, qopt))
+		return -EINVAL;
+
+	/* pre-allocate qdisc, attachment can't fail */
+	priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
+			       GFP_KERNEL);
+	if (priv->qdiscs == NULL) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		dev_queue = netdev_get_tx_queue(dev, i);
+		qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
+					  TC_H_MAKE(TC_H_MAJ(sch->handle),
+						    TC_H_MIN(i + 1)));
+		if (qdisc == NULL) {
+			err = -ENOMEM;
+			goto err;
+		}
+		qdisc->flags |= TCQ_F_CAN_BYPASS;
+		priv->qdiscs[i] = qdisc;
+	}
+
+	/* If the mqprio options indicate that hardware should own
+	 * the queue mapping then run ndo_setup_tc otherwise use the
+	 * supplied and verified mapping
+	 */
+	if (qopt->hw) {
+		priv->hw_owned = 1;
+		err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc,
+						    dev->real_num_tx_queues);
+		if (err)
+			goto err;
+	} else {
+		netdev_set_num_tc(dev, qopt->num_tc);
+		for (i = 0; i < qopt->num_tc; i++)
+			netdev_set_tc_queue(dev, i,
+					    qopt->count[i], qopt->offset[i]);
+	}
+
+	/* Always use supplied priority mappings */
+	for (i = 0; i < TC_BITMASK + 1; i++)
+		netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);
+
+	sch->flags |= TCQ_F_MQROOT;
+	return 0;
+
+err:
+	mqprio_destroy(sch);
+	return err;
+}
+
+static void mqprio_attach(struct Qdisc *sch)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	struct Qdisc *qdisc;
+	unsigned int ntx;
+
+	/* Attach underlying qdisc */
+	for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+		qdisc = priv->qdiscs[ntx];
+		qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc);
+		if (qdisc)
+			qdisc_destroy(qdisc);
+	}
+	kfree(priv->qdiscs);
+	priv->qdiscs = NULL;
+}
+
+static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
+					     unsigned long cl)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);
+
+	if (ntx >= dev->num_tx_queues)
+		return NULL;
+	return netdev_get_tx_queue(dev, ntx);
+}
+
+static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+		    struct Qdisc **old)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+	if (!dev_queue)
+		return -EINVAL;
+
+	if (dev->flags & IFF_UP)
+		dev_deactivate(dev);
+
+	*old = dev_graft_qdisc(dev_queue, new);
+
+	if (dev->flags & IFF_UP)
+		dev_activate(dev);
+
+	return 0;
+}
+
+static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	unsigned char *b = skb_tail_pointer(skb);
+	struct tc_mqprio_qopt opt;
+	struct Qdisc *qdisc;
+	unsigned int i;
+
+	sch->q.qlen = 0;
+	memset(&sch->bstats, 0, sizeof(sch->bstats));
+	memset(&sch->qstats, 0, sizeof(sch->qstats));
+
+	for (i = 0; i < dev->num_tx_queues; i++) {
+		qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+		spin_lock_bh(qdisc_lock(qdisc));
+		sch->q.qlen		+= qdisc->q.qlen;
+		sch->bstats.bytes	+= qdisc->bstats.bytes;
+		sch->bstats.packets	+= qdisc->bstats.packets;
+		sch->qstats.qlen	+= qdisc->qstats.qlen;
+		sch->qstats.backlog	+= qdisc->qstats.backlog;
+		sch->qstats.drops	+= qdisc->qstats.drops;
+		sch->qstats.requeues	+= qdisc->qstats.requeues;
+		sch->qstats.overlimits	+= qdisc->qstats.overlimits;
+		spin_unlock_bh(qdisc_lock(qdisc));
+	}
+
+	opt.num_tc = netdev_get_num_tc(dev);
+	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
+	opt.hw = priv->hw_owned;
+
+	for (i = 0; i < netdev_get_num_tc(dev); i++) {
+		opt.count[i] = dev->tc_to_txq[i].count;
+		opt.offset[i] = dev->tc_to_txq[i].offset;
+	}
+
+	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
+
+	return skb->len;
+nla_put_failure:
+	nlmsg_trim(skb, b);
+	return -1;
+}
+
+static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
+{
+	struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+	if (!dev_queue)
+		return NULL;
+
+	return dev_queue->qdisc_sleeping;
+}
+
+static unsigned long mqprio_get(struct Qdisc *sch, u32 classid)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned int ntx = TC_H_MIN(classid);
+
+	if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev))
+		return 0;
+	return ntx;
+}
+
+static void mqprio_put(struct Qdisc *sch, unsigned long cl)
+{
+}
+
+static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
+			 struct sk_buff *skb, struct tcmsg *tcm)
+{
+	struct net_device *dev = qdisc_dev(sch);
+
+	if (cl <= netdev_get_num_tc(dev)) {
+		tcm->tcm_parent = TC_H_ROOT;
+		tcm->tcm_info = 0;
+	} else {
+		int i;
+		struct netdev_queue *dev_queue;
+
+		dev_queue = mqprio_queue_get(sch, cl);
+		tcm->tcm_parent = 0;
+		for (i = 0; i < netdev_get_num_tc(dev); i++) {
+			struct netdev_tc_txq tc = dev->tc_to_txq[i];
+			int q_idx = cl - netdev_get_num_tc(dev);
+
+			if (q_idx > tc.offset &&
+			    q_idx <= tc.offset + tc.count) {
+				tcm->tcm_parent =
+					TC_H_MAKE(TC_H_MAJ(sch->handle),
+						  TC_H_MIN(i + 1));
+				break;
+			}
+		}
+		tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
+	}
+	tcm->tcm_handle |= TC_H_MIN(cl);
+	return 0;
+}
+
+static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+			       struct gnet_dump *d)
+{
+	struct net_device *dev = qdisc_dev(sch);
+
+	if (cl <= netdev_get_num_tc(dev)) {
+		int i;
+		struct Qdisc *qdisc;
+		struct gnet_stats_queue qstats = {0};
+		struct gnet_stats_basic_packed bstats = {0};
+		struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];
+
+		/* Drop lock here it will be reclaimed before touching
+		 * statistics this is required because the d->lock we
+		 * hold here is the lock on dev_queue->qdisc_sleeping
+		 * also acquired below.
+		 */
+		spin_unlock_bh(d->lock);
+
+		for (i = tc.offset; i < tc.offset + tc.count; i++) {
+			qdisc = netdev_get_tx_queue(dev, i)->qdisc;
+			spin_lock_bh(qdisc_lock(qdisc));
+			bstats.bytes      += qdisc->bstats.bytes;
+			bstats.packets    += qdisc->bstats.packets;
+			qstats.qlen       += qdisc->qstats.qlen;
+			qstats.backlog    += qdisc->qstats.backlog;
+			qstats.drops      += qdisc->qstats.drops;
+			qstats.requeues   += qdisc->qstats.requeues;
+			qstats.overlimits += qdisc->qstats.overlimits;
+			spin_unlock_bh(qdisc_lock(qdisc));
+		}
+		/* Reclaim root sleeping lock before completing stats */
+		spin_lock_bh(d->lock);
+		if (gnet_stats_copy_basic(d, &bstats) < 0 ||
+		    gnet_stats_copy_queue(d, &qstats) < 0)
+			return -1;
+	} else {
+		struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
+
+		sch = dev_queue->qdisc_sleeping;
+		sch->qstats.qlen = sch->q.qlen;
+		if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
+		    gnet_stats_copy_queue(d, &sch->qstats) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+	struct net_device *dev = qdisc_dev(sch);
+	unsigned long ntx;
+
+	if (arg->stop)
+		return;
+
+	/* Walk hierarchy with a virtual class per tc */
+	arg->count = arg->skip;
+	for (ntx = arg->skip;
+	     ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
+	     ntx++) {
+		if (arg->fn(sch, ntx + 1, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+		arg->count++;
+	}
+}
+
+static const struct Qdisc_class_ops mqprio_class_ops = {
+	.graft		= mqprio_graft,
+	.leaf		= mqprio_leaf,
+	.get		= mqprio_get,
+	.put		= mqprio_put,
+	.walk		= mqprio_walk,
+	.dump		= mqprio_dump_class,
+	.dump_stats	= mqprio_dump_class_stats,
+};
+
+struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
+	.cl_ops		= &mqprio_class_ops,
+	.id		= "mqprio",
+	.priv_size	= sizeof(struct mqprio_sched),
+	.init		= mqprio_init,
+	.destroy	= mqprio_destroy,
+	.attach		= mqprio_attach,
+	.dump		= mqprio_dump,
+	.owner		= THIS_MODULE,
+};
+
+static int __init mqprio_module_init(void)
+{
+	return register_qdisc(&mqprio_qdisc_ops);
+}
+
+static void __exit mqprio_module_exit(void)
+{
+	unregister_qdisc(&mqprio_qdisc_ops);
+}
+
+module_init(mqprio_module_init);
+module_exit(mqprio_module_exit);
+
+MODULE_LICENSE("GPL");


^ permalink raw reply related

* [net-next-2.6 PATCH v7 1/2] net: implement mechanism for HW based QOS
From: John Fastabend @ 2011-01-07 22:45 UTC (permalink / raw)
  To: davem
  Cc: jarkao2, hadi, eric.dumazet, shemminger, tgraf, bhutchings,
	nhorman, netdev

This patch provides a mechanism for lower layer devices to
steer traffic using skb->priority to tx queues. This allows
for hardware based QOS schemes to use the default qdisc without
incurring the penalties related to global state and the qdisc
lock, while reliably receiving skbs on the correct tx ring
to avoid head-of-line blocking resulting from shuffling in
the LLD. Finally, all the goodness from txq caching and xps/rps
can still be leveraged.

Many drivers and hardware exist with the ability to implement
QOS schemes in the hardware but currently these drivers tend
to rely on firmware to reroute specific traffic, a driver
specific select_queue or the queue_mapping action in the
qdisc.

By using select_queue for this drivers need to be updated for
each and every traffic type and we lose the goodness of much
of the upstream work. Firmware solutions are inherently
inflexible. And finally if admins are expected to build a
qdisc and filter rules to steer traffic this requires knowledge
of how the hardware is currently configured. The number of tx
queues and the queue offsets may change depending on resources.
Also this approach incurs all the overhead of a qdisc with filters.

With the mechanism in this patch users can set skb priority using
expected methods ie setsockopt() or the stack can set the priority
directly. Then the skb will be steered to the correct tx queues
aligned with hardware QOS traffic classes. In the normal case with
a single traffic class and all queues in this class everything
works as is until the LLD enables multiple tcs.

To steer the skb we mask out the lower 4 bits of the priority
and allow the hardware to configure up to 15 distinct classes
of traffic. This is expected to be sufficient for most applications;
at any rate it is more than the 8021Q spec designates and is
equal to the number of prio bands currently implemented in
the default qdisc.

This in conjunction with a userspace application such as
lldpad can be used to implement 8021Q transmission selection
algorithms one of these algorithms being the extended transmission
selection algorithm currently being used for DCB.

Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
---

 include/linux/netdevice.h |   65 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/dev.c            |   61 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 125 insertions(+), 1 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0f6b1c9..b1dbbed 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -646,6 +646,14 @@ struct xps_dev_maps {
     (nr_cpu_ids * sizeof(struct xps_map *)))
 #endif /* CONFIG_XPS */
 
+#define TC_MAX_QUEUE	16
+#define TC_BITMASK	15
+/* HW offloaded queuing disciplines txq count and offset maps */
+struct netdev_tc_txq {
+	u16 count;
+	u16 offset;
+};
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -756,6 +764,7 @@ struct xps_dev_maps {
  * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
  *			  struct nlattr *port[]);
  * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
+ * int (*ndo_setup_tc)(struct net_device *dev, u8 tc, unsigned int txq)
  */
 #define HAVE_NET_DEVICE_OPS
 struct net_device_ops {
@@ -814,6 +823,8 @@ struct net_device_ops {
 						   struct nlattr *port[]);
 	int			(*ndo_get_vf_port)(struct net_device *dev,
 						   int vf, struct sk_buff *skb);
+	int			(*ndo_setup_tc)(struct net_device *dev, u8 tc,
+						unsigned int txq);
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	int			(*ndo_fcoe_enable)(struct net_device *dev);
 	int			(*ndo_fcoe_disable)(struct net_device *dev);
@@ -1146,6 +1157,9 @@ struct net_device {
 	/* Data Center Bridging netlink ops */
 	const struct dcbnl_rtnl_ops *dcbnl_ops;
 #endif
+	u8 num_tc;
+	struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
+	u8 prio_tc_map[TC_BITMASK + 1];
 
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	/* max exchange id for FCoE LRO by ddp */
@@ -1162,6 +1176,57 @@ struct net_device {
 #define	NETDEV_ALIGN		32
 
 static inline
+int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio)
+{
+	return dev->prio_tc_map[prio & TC_BITMASK];
+}
+
+static inline
+int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
+{
+	if (tc >= dev->num_tc)
+		return -EINVAL;
+
+	dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK;
+	return 0;
+}
+
+static inline
+void netdev_reset_tc(struct net_device *dev)
+{
+	dev->num_tc = 0;
+	memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
+	memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
+}
+
+static inline
+int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
+{
+	if (tc >= dev->num_tc)
+		return -EINVAL;
+
+	dev->tc_to_txq[tc].count = count;
+	dev->tc_to_txq[tc].offset = offset;
+	return 0;
+}
+
+static inline
+int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
+{
+	if (num_tc > TC_MAX_QUEUE)
+		return -EINVAL;
+
+	dev->num_tc = num_tc;
+	return 0;
+}
+
+static inline
+int netdev_get_num_tc(struct net_device *dev)
+{
+	return dev->num_tc;
+}
+
+static inline
 struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
 					 unsigned int index)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index a215269..7c9b1aa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1593,6 +1593,54 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 	rcu_read_unlock();
 }
 
+/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this if the net_device supports ndo_setup_tc
+ * call the ops routine with the new queue number. If the ops is not
+ * available verify the tc mapping remains valid and if not NULL the
+ * mapping. With no priorities mapping to this offset/count pair it
+ * will no longer be used. In the worst case TC0 is invalid nothing
+ * can be done so disable priority mappings.
+ */
+void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (ops->ndo_setup_tc) {
+		ops->ndo_setup_tc(dev, dev->num_tc, txq);
+	} else {
+		int i;
+		struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+		/* If TC0 is invalidated disable TC mapping */
+		if (tc->offset + tc->count > txq) {
+			pr_warning("Number of in use tx queues changed "
+				   "invalidating tc mappings. Priority "
+				   "traffic classification disabled!\n");
+			dev->num_tc = 0;
+			return;
+		}
+
+		/* Invalidated prio to tc mappings set to TC0 */
+		for (i = 1; i < TC_BITMASK + 1; i++) {
+			int q = netdev_get_prio_tc_map(dev, i);
+
+			tc = &dev->tc_to_txq[q];
+			if (tc->offset + tc->count > txq) {
+				pr_warning("Number of in use tx queues "
+					   "changed. Priority %i to tc "
+					   "mapping %i is no longer valid "
+					   "setting map to 0\n",
+					   i, q);
+				netdev_set_prio_tc_map(dev, i, 0);
+			}
+		}
+	}
+}
+
 /*
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -1612,6 +1660,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 		if (rc)
 			return rc;
 
+		if (dev->num_tc)
+			netif_setup_tc(dev, txq);
+
 		if (txq < dev->real_num_tx_queues)
 			qdisc_reset_all_tx_gt(dev, txq);
 	}
@@ -2165,6 +2216,8 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
 		  unsigned int num_tx_queues)
 {
 	u32 hash;
+	u16 qoffset = 0;
+	u16 qcount = num_tx_queues;
 
 	if (skb_rx_queue_recorded(skb)) {
 		hash = skb_get_rx_queue(skb);
@@ -2173,13 +2226,19 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
 		return hash;
 	}
 
+	if (dev->num_tc) {
+		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+		qoffset = dev->tc_to_txq[tc].offset;
+		qcount = dev->tc_to_txq[tc].count;
+	}
+
 	if (skb->sk && skb->sk->sk_hash)
 		hash = skb->sk->sk_hash;
 	else
 		hash = (__force u16) skb->protocol ^ skb->rxhash;
 	hash = jhash_1word(hash, hashrnd);
 
-	return (u16) (((u64) hash * num_tx_queues) >> 32);
+	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
 }
 EXPORT_SYMBOL(__skb_tx_hash);
 


^ permalink raw reply related

* Re: [GIT] Networking
From: Linus Torvalds @ 2011-01-07 22:48 UTC (permalink / raw)
  To: Francois Romieu
  Cc: Ben Hutchings, David Miller, Hayes Wang, David Woodhouse, akpm,
	netdev, linux-kernel
In-Reply-To: <20110107215505.GA1892@electric-eye.fr.zoreil.com>

On Fri, Jan 7, 2011 at 1:55 PM, Francois Romieu <romieu@fr.zoreil.com> wrote:
> Linus Torvalds <torvalds@linux-foundation.org> :
> [...]
>> I just confirmed that building it as a module works
>
> I have just tried a non-modular build and it worked without firmware.
>
...
> [    4.340876] sd 1:0:0:0: [sda] Attached SCSI disk
> [   63.968081] r8169 0000:02:00.0: eth0: unable to apply firmware patch
>
> It's here. After a 60 seconds black-out.

Hmm. I never even waited for 60 seconds. Maybe my boot would have
continued after the delay.

                    Linus

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox