Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH] rps: add flow director support
From: Tom Herbert @ 2010-04-12 13:34 UTC (permalink / raw)
  To: Changli Gao; +Cc: David S. Miller, netdev
In-Reply-To: <1271022140-3917-1-git-send-email-xiaosuo@gmail.com>

On Sun, Apr 11, 2010 at 2:42 PM, Changli Gao <xiaosuo@gmail.com> wrote:
> add rps flow director support
>
> with rps flow director, users can do weighted packet dispatching among CPUs.
> For example, CPU0:CPU1 is 1:3 for eth0's rx-0:
>
"Flow director" is a misnomer here in that it has no per flow
awareness, that is what RFS provides.  Please use a different name.

>  localhost linux # echo 4 > /sys/class/net/eth0/queues/rx-0/rps_flows
>  localhost linux # echo 0 > /sys/class/net/eth0/queues/rx-0/rps_flow_0
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_1
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_2
>  localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_flow_3
>
It might be better to put this in its own directory and also do it per
CPU instead of hash entry.  This should result in a lot fewer entries
and I'm not sure how you would deal with holes in the hash table for
unspecified entries.  Also, it would be nice not to have to specify a
number of entries.  Maybe something like:

localhost linux # echo 1 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/0
localhost linux # echo 3 > /sys/class/net/eth0/queues/rx-0/rps_cpu_map/1

To specify CPU 0 with weight 1, CPU 1 with weight 3.

> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
> ----
>  net/core/net-sysfs.c |  176 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 172 insertions(+), 4 deletions(-)
> diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
> index 1e7fdd6..d904610 100644
> --- a/net/core/net-sysfs.c
> +++ b/net/core/net-sysfs.c
> @@ -511,6 +511,109 @@ static struct sysfs_ops rx_queue_sysfs_ops = {
>        .store = rx_queue_attr_store,
>  };
>
> +static DEFINE_MUTEX(rps_map_lock);
> +
> +static ssize_t show_rps_flow(struct netdev_rx_queue *queue,
> +                            struct rx_queue_attribute *attribute, char *buf)
> +{
> +       unsigned long flowid;
> +       struct rps_map *map;
> +       u16 cpu;
> +
> +       strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
> +       rcu_read_lock();
> +       map = rcu_dereference(queue->rps_map);
> +       if (map && flowid < map->len)
> +               cpu = map->cpus[flowid];
> +       else
> +               cpu = 0;
> +       rcu_read_unlock();
> +       return sprintf(buf, "%hu\n", cpu);
> +}
> +
> +static ssize_t store_rps_flow(struct netdev_rx_queue *queue,
> +                             struct rx_queue_attribute *attribute,
> +                             const char *buf, size_t len)
> +{
> +       unsigned long flowid, cpu;
> +       struct rps_map *map;
> +
> +       if (!capable(CAP_NET_ADMIN))
> +               return -EPERM;
> +
> +       if (strict_strtoul(buf, 0, &cpu))
> +               return -EINVAL;
> +       strict_strtoul(attribute->attr.name + strlen("rps_flow_"), 10, &flowid);
> +
> +       mutex_lock(&rps_map_lock);
> +       map = queue->rps_map;
> +       if (map && flowid < map->len)
> +               map->cpus[flowid] = cpu;
> +       mutex_unlock(&rps_map_lock);
> +
> +       return len;
> +}
> +
> +static struct rx_queue_attribute **rps_flow_attribute;
> +static int rps_flow_attribute_size;
> +
> +/* must be called with rps_map_lock locked */
> +static int update_rps_flow_files(struct kobject *kobj,
> +                                struct rps_map *old_map, struct rps_map *map)
> +{
> +       int i;
> +       int old_map_len = old_map ? old_map->len : 0;
> +       int map_len = map ? map->len : 0;
> +
> +       if (old_map_len >= map_len) {
> +               for (i = map_len; i < old_map_len; i++)
> +                       sysfs_remove_file(kobj, &rps_flow_attribute[i]->attr);
> +               return 0;
> +       }
> +
> +       if (map_len > rps_flow_attribute_size) {
> +               struct rx_queue_attribute **attrs;
> +               char name[sizeof("rps_flow_4294967295")];
> +               char *pname;
> +
> +               attrs = krealloc(rps_flow_attribute, map_len * sizeof(void *),
> +                                GFP_KERNEL);
> +               if (attrs == NULL)
> +                       return -ENOMEM;
> +               rps_flow_attribute = attrs;
> +               for (i = rps_flow_attribute_size; i < map_len; i++) {
> +                       rps_flow_attribute[i] = kmalloc(sizeof(**attrs),
> +                                                       GFP_KERNEL);
> +                       if (rps_flow_attribute[i] == NULL)
> +                               break;
> +                       sprintf(name, "rps_flow_%d", i);
> +                       pname = kstrdup(name, GFP_KERNEL);
> +                       if (pname == NULL) {
> +                               kfree(rps_flow_attribute[i]);
> +                               break;
> +                       }
> +                       rps_flow_attribute[i]->attr.name = pname;
> +                       rps_flow_attribute[i]->attr.mode = S_IRUGO | S_IWUSR;
> +                       rps_flow_attribute[i]->show = show_rps_flow;
> +                       rps_flow_attribute[i]->store = store_rps_flow;
> +               }
> +               rps_flow_attribute_size = i;
> +               if (i != map_len)
> +                       return -ENOMEM;
> +       }
> +
> +       for (i = old_map_len; i < map_len; i++) {
> +               if (sysfs_create_file(kobj, &rps_flow_attribute[i]->attr)) {
> +                       while (--i >= old_map_len)
> +                               sysfs_remove_file(kobj,
> +                                                 &rps_flow_attribute[i]->attr);
> +                       return -ENOMEM;
> +               }
> +       }
> +
> +       return 0;
> +}
> +
>  static ssize_t show_rps_map(struct netdev_rx_queue *queue,
>                            struct rx_queue_attribute *attribute, char *buf)
>  {
> @@ -555,7 +658,6 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>        struct rps_map *old_map, *map;
>        cpumask_var_t mask;
>        int err, cpu, i;
> -       static DEFINE_SPINLOCK(rps_map_lock);
>
>        if (!capable(CAP_NET_ADMIN))
>                return -EPERM;
> @@ -588,10 +690,15 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>                map = NULL;
>        }
>
> -       spin_lock(&rps_map_lock);
> +       mutex_lock(&rps_map_lock);
>        old_map = queue->rps_map;
> -       rcu_assign_pointer(queue->rps_map, map);
> -       spin_unlock(&rps_map_lock);
> +       err = update_rps_flow_files(&queue->kobj, old_map, map);
> +       if (!err)
> +               rcu_assign_pointer(queue->rps_map, map);
> +       mutex_unlock(&rps_map_lock);
> +
> +       if (err)
> +               return err;
>
>        if (old_map)
>                call_rcu(&old_map->rcu, rps_map_release);
> @@ -603,8 +710,69 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
>  static struct rx_queue_attribute rps_cpus_attribute =
>        __ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
>
> +static ssize_t show_rps_flows(struct netdev_rx_queue *queue,
> +               struct rx_queue_attribute *attribute, char *buf)
> +{
> +       struct rps_map *map;
> +       unsigned int len;
> +
> +       rcu_read_lock();
> +       map = rcu_dereference(queue->rps_map);
> +       len = map ? map->len : 0;
> +       rcu_read_unlock();
> +       return sprintf(buf, "%u\n", len);
> +}
> +
> +static ssize_t store_rps_flows(struct netdev_rx_queue *queue,
> +                              struct rx_queue_attribute *attribute,
> +                              const char *buf, size_t len)
> +{
> +       struct rps_map *old_map, *map;
> +       unsigned long flows;
> +       int err;
> +
> +       if (!capable(CAP_NET_ADMIN))
> +               return -EPERM;
> +
> +       if (strict_strtoul(buf, 0, &flows))
> +               return -EINVAL;
> +       if (flows != 0) {
> +               map = kzalloc(max_t(unsigned, RPS_MAP_SIZE(flows),
> +                                   L1_CACHE_BYTES), GFP_KERNEL);
> +               if (map == NULL)
> +                       return -ENOMEM;
> +               map->len = flows;
> +       } else {
> +               map = NULL;
> +       }
> +
> +       mutex_lock(&rps_map_lock);
> +       old_map = queue->rps_map;
> +       err = update_rps_flow_files(&queue->kobj, old_map, map);
> +       if (!err) {
> +               if (old_map && map)
> +                       memcpy(map->cpus, old_map->cpus,
> +                              sizeof(map->cpus[0]) *
> +                              min_t(unsigned int, flows, old_map->len));
> +               rcu_assign_pointer(queue->rps_map, map);
> +       }
> +       mutex_unlock(&rps_map_lock);
> +
> +       if (err)
> +               return err;
> +
> +       if (old_map)
> +               call_rcu(&old_map->rcu, rps_map_release);
> +
> +       return len;
> +}
> +
> +static struct rx_queue_attribute rps_flows_attribute =
> +       __ATTR(rps_flows, S_IRUGO | S_IWUSR, show_rps_flows, store_rps_flows);
> +
>  static struct attribute *rx_queue_default_attrs[] = {
>        &rps_cpus_attribute.attr,
> +       &rps_flows_attribute.attr,
>        NULL
>  };
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: forcedeth driver hangs under heavy load
From: stephen mulcahy @ 2010-04-12 13:19 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Ben Hutchings, Ayaz Abdulla, 572201
In-Reply-To: <4BC31AA0.5070006@gmail.com>

stephen mulcahy wrote:
>> Are both way non functional (RX and TX), or only one side ?
> 
> Whats the best way of testing this? (tcpdump listening on both hosts and 
> then running pings between the systems?)


stephen mulcahy wrote:
 >> Are both way non functional (RX and TX), or only one side ?
 >
 > Whats the best way of testing this? (tcpdump listening on both hosts and
 > then running pings between the systems?)

On one of the nodes that is in the malfunctioning state (node05), I ran

ssh node20

and grabbed the following output from running tcpdump on node20

root@node20:~# tcpdump host node20 and node05
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 96 bytes
14:12:59.612626 IP node05.webstar.cnet.36295 > node20.ssh: Flags [S], 
seq 3677858646, win 5840, options [mss 1460,sackOK,TS val 1599534 ecr 
0,nop,wscale 7], length 0
14:12:59.612656 IP node20.ssh > node05.webstar.cnet.36295: Flags [S.], 
seq 3610575850, ack 3677858647, win 5792, options [mss 1460,sackOK,TS 
val 1598775 ecr 1599534,nop,wscale 7], length 0
14:12:59.612718 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], 
ack 1, win 46, options [nop,nop,TS val 1599534 ecr 1598775], length 0
14:12:59.617434 IP node20.ssh > node05.webstar.cnet.36295: Flags [P.], 
seq 1:33, ack 1, win 46, options [nop,nop,TS val 1598776 ecr 1599534], 
length 32
14:12:59.617522 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], 
ack 33, win 46, options [nop,nop,TS val 1599535 ecr 1598776], length 0
14:12:59.617609 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1599535 ecr 1598776], 
length 32
14:12:59.820434 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 4294936586:4294936618, ack 2620194849, win 46, options [nop,nop,TS 
val 1599586 ecr 1598776], length 32
14:13:00.229069 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 4294961734:4294961766, ack 3928358945, win 46, options [nop,nop,TS 
val 1599688 ecr 1598776], length 32
14:13:01.044396 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 4294964167:4294964199, ack 410320929, win 46, options [nop,nop,TS 
val 1599892 ecr 1598776], length 32
14:13:02.676308 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1600300 ecr 1598776], 
length 32
14:13:05.940804 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 17294:17326, ack 3045851169, win 46, options [nop,nop,TS val 1601116 
ecr 1598776], length 32
14:13:12.468484 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 17294:17326, ack 3045851169, win 46, options [nop,nop,TS val 1602748 
ecr 1598776], length 32
14:13:23.846891 IP node20.ssh > node05.webstar.cnet.36084: Flags [F.], 
seq 2093054475, ack 2175389538, win 46, options [nop,nop,TS val 1604834 
ecr 1575591], length 0
14:13:23.847278 IP node05.webstar.cnet.36084 > node20.ssh: Flags [R], 
seq 2175389538, win 0, length 0
14:13:25.523850 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1606012 ecr 1598776], 
length 32
14:13:50.127509 IP node20.ssh > node05.webstar.cnet.36143: Flags [F.], 
seq 2526196657, ack 2590340885, win 46, options [nop,nop,TS val 1611404 
ecr 1582161], length 0
14:13:50.127879 IP node05.webstar.cnet.36143 > node20.ssh: Flags [R], 
seq 2590340885, win 0, length 0
14:13:51.633934 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 4294963190:4294963222, ack 9830433, win 46, options [nop,nop,TS val 
1612540 ecr 1598776], length 32
14:13:55.125525 ARP, Request who-has node05.webstar.cnet tell node20, 
length 28
14:13:55.125886 ARP, Reply node05.webstar.cnet is-at 00:30:48:ce:dc:02 
(oui Unknown), length 46
14:14:43.855380 IP node05.webstar.cnet.36295 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1625596 ecr 1598776], 
length 32
14:14:48.855143 ARP, Request who-has node20 tell node05.webstar.cnet, 
length 46
14:14:48.855469 ARP, Reply node20 is-at 00:30:48:ce:de:34 (oui Unknown), 
length 28
14:14:59.617675 IP node20.ssh > node05.webstar.cnet.36295: Flags [F.], 
seq 33, ack 1, win 46, options [nop,nop,TS val 1628777 ecr 1599535], 
length 0
14:14:59.618202 IP node05.webstar.cnet.36295 > node20.ssh: Flags [FP.], 
seq 4294959654:4294960446, ack 3930456098, win 46, options [nop,nop,TS 
val 1629536 ecr 1628777], length 792
14:14:59.821527 IP node20.ssh > node05.webstar.cnet.36295: Flags [F.], 
seq 33, ack 1, win 46, options [nop,nop,TS val 1628828 ecr 1599535], 
length 0
14:14:59.821598 IP node05.webstar.cnet.36295 > node20.ssh: Flags [.], 
ack 34, win 46, options [nop,nop,TS val 1629587 ecr 1628828,nop,nop,sack 
1 {33:34}], length 0
^C^
27 packets captured
31 packets received by filter
0 packets dropped by kernel


I then did ifdown and ifup on node05 and again ran

ssh node20

and grabbed the following output from running tcpdump on node20

root@node20:~# tcpdump host node20 and node05
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 96 bytes
14:15:50.626410 IP node05.webstar.cnet.36690 > node20.ssh: Flags [S], 
seq 2044900531, win 5840, options [mss 1460,sackOK,TS val 1642289 ecr 
0,nop,wscale 7], length 0
14:15:50.626441 IP node20.ssh > node05.webstar.cnet.36690: Flags [S.], 
seq 1976694445, ack 2044900532, win 5792, options [mss 1460,sackOK,TS 
val 1641529 ecr 1642289,nop,wscale 7], length 0
14:15:50.626482 IP node05.webstar.cnet.36690 > node20.ssh: Flags [.], 
ack 1, win 46, options [nop,nop,TS val 1642289 ecr 1641529], length 0
14:15:50.631138 IP node20.ssh > node05.webstar.cnet.36690: Flags [P.], 
seq 1:33, ack 1, win 46, options [nop,nop,TS val 1641530 ecr 1642289], 
length 32
14:15:50.631218 IP node05.webstar.cnet.36690 > node20.ssh: Flags [.], 
ack 33, win 46, options [nop,nop,TS val 1642290 ecr 1641530], length 0
14:15:50.631267 IP node05.webstar.cnet.36690 > node20.ssh: Flags [P.], 
seq 1:33, ack 33, win 46, options [nop,nop,TS val 1642290 ecr 1641530], 
length 32
14:15:50.631281 IP node20.ssh > node05.webstar.cnet.36690: Flags [.], 
ack 33, win 46, options [nop,nop,TS val 1641530 ecr 1642290], length 0
14:15:50.631367 IP node05.webstar.cnet.36690 > node20.ssh: Flags [P.], 
seq 33:825, ack 33, win 46, options [nop,nop,TS val 1642290 ecr 
1641530], length 792
14:15:50.631376 IP node20.ssh > node05.webstar.cnet.36690: Flags [.], 
ack 825, win 58, options [nop,nop,TS val 1641530 ecr 1642290], length 0
14:15:50.631808 IP node20.ssh > node05.webstar.cnet.36690: Flags [P.], 
seq 33:817, ack 825, win 58, options [nop,nop,TS val 1641530 ecr 
1642290], length 784
14:15:50.631950 IP node05.webstar.cnet.36690 > node20.ssh: Flags [P.], 
seq 825:849, ack 817, win 58, options [nop,nop,TS val 1642290 ecr 
1641530], length 24
14:15:50.633353 IP node20.ssh > node05.webstar.cnet.36690: Flags [P.], 
seq 817:969, ack 849, win 58, options [nop,nop,TS val 1641530 ecr 
1642290], length 152
14:15:50.633932 IP node05.webstar.cnet.36690 > node20.ssh: Flags [P.], 
seq 849:993, ack 969, win 71, options [nop,nop,TS val 1642291 ecr 
1641530], length 144
14:15:50.637998 IP node20.ssh > node05.webstar.cnet.36690: Flags [P.], 
seq 969:1689, ack 993, win 70, options [nop,nop,TS val 1641532 ecr 
1642291], length 720
14:15:50.676465 IP node05.webstar.cnet.36690 > node20.ssh: Flags [.], 
ack 1689, win 83, options [nop,nop,TS val 1642302 ecr 1641532], length 0
14:16:09.776134 IP node05.webstar.cnet.49671 > node20.50060: Flags [S], 
seq 2348078217, win 5840, options [mss 1460,sackOK,TS val 1647077 ecr 
0,nop,wscale 7], length 0
14:16:09.776498 IP node20.50060 > node05.webstar.cnet.49671: Flags [R.], 
seq 0, ack 2348078218, win 0, length 0
^C
17 packets captured
21 packets received by filter
0 packets dropped by kernel


Does that help?

^ permalink raw reply

* Re: forcedeth driver hangs under heavy load
From: stephen mulcahy @ 2010-04-12 13:05 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Ben Hutchings, Ayaz Abdulla, 572201
In-Reply-To: <1271076426.16881.21.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le lundi 12 avril 2010 à 13:39 +0100, stephen mulcahy a écrit :
> I am not sure I understand. Are you saying that using 2.6.30-2-amd64
> kernel also makes your forcedeth adapter being not functional ?

Hi Eric,

If I run my tests with the 2.6.30-2-amd64 kernel the network doesn't 
malfunction.

If I run my tests with the 2.6.32-3-amd64 kernel the network does 
malfunction.

If I take the forcedeth.ko module from the 2.6.30-2-amd64 kernel and 
drop that into /lib/modules/2.6.32-3-amd64/kernel/drivers/net/ and then 
reboot to 2.6.32-3-amd64 and rerun my tests - the network does malfunction.

> Are both way non functional (RX and TX), or only one side ?

Whats the best way of testing this? (tcpdump listening on both hosts and 
then running pings between the systems?)

-stephen

^ permalink raw reply

* Bug#572201: forcedeth driver hangs under heavy load
From: Eric Dumazet @ 2010-04-12 12:47 UTC (permalink / raw)
  To: stephen mulcahy; +Cc: netdev, Ben Hutchings, Ayaz Abdulla, 572201
In-Reply-To: <4BC31486.1090603@gmail.com>

Le lundi 12 avril 2010 à 13:39 +0100, stephen mulcahy a écrit :
> stephen mulcahy wrote:
> > It doesn't - further testing over the weekend saw 6 of 45 machines drop 
> > off the network with this problem. Nothing in dmesg or system logs. 
> > Happy to run more tests if someone can advise on what should be run.
> 
> I also just tried using the 2.6.30-2-amd64 (Debian) forcedeth kernel 
> module while running the 2.6.32-3-amd64 (Debian) kernel and experienced 
> the same symptoms.
> 
> Not sure if thats any help.
> 

I am not sure I understand. Are you saying that using 2.6.30-2-amd64
kernel also makes your forcedeth adapter being not functional ?

Are both way non functional (RX and TX), or only one side ?






-- 
To UNSUBSCRIBE, email to debian-bugs-dist-REQUEST@lists.debian.org
with a subject of "unsubscribe". Trouble? Contact listmaster@lists.debian.org

^ permalink raw reply

* Bug#572201: forcedeth driver hangs under heavy load
From: stephen mulcahy @ 2010-04-12 12:39 UTC (permalink / raw)
  To: netdev; +Cc: Ben Hutchings, Eric Dumazet, Ayaz Abdulla, 572201
In-Reply-To: <4BC2EF88.3060203@atlanticlinux.ie>

stephen mulcahy wrote:
> It doesn't - further testing over the weekend saw 6 of 45 machines drop 
> off the network with this problem. Nothing in dmesg or system logs. 
> Happy to run more tests if someone can advise on what should be run.

I also just tried using the 2.6.30-2-amd64 (Debian) forcedeth kernel 
module while running the 2.6.32-3-amd64 (Debian) kernel and experienced 
the same symptoms.

Not sure if thats any help.

-stephen

^ permalink raw reply

* [PATCH] iproute2: add option to build m_xt as a tc module.
From: Andreas Henriksson @ 2010-04-12 11:55 UTC (permalink / raw)
  To: shemminger; +Cc: netdev

Add TC_CONFIG_XT_MODULE option that can be added
either to Config (after ./configure) or as an argument to "make".

This will build the xt module (action ipt) of tc as a
shared object that is linked at runtime by tc if used,
rather then built into tc.

This is similar to how the atm qdisc support
is handled (q_atm.so).

Signed-off-by: Andreas Henriksson <andreas@fatal.se>

---

The reason for this is simply to be able to avoid
the tc binary from being linked to libxtables. This way
distributions who ship binary packages can
avoid a dependency on the iptables package by
ignoring m_xt.so in the dependency analysis
and let actual users of the tc arguments "action ipt"
make sure they have iptables installed.
(See http://bugs.debian.org/576953 )

This was not a problem with the old/deprecated
m_ipt module which did runtime linking of
the iptables library.

Having the split inside tc, rather then between tc and the required
library, is preferred. This way we'll notice at build-time
when the required library breaks API/ABI rather
then having to rely on people that uses the functionality
to report back when the ABI is broken.
(We've learned this the hard way in debian after many
angry bugreports.)

I've had jamal pre-review this and he didn't see any
problems with this.


diff --git a/tc/Makefile b/tc/Makefile
index 805c108..3af33cf 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -43,10 +43,18 @@ TCMODULES += em_cmp.o
 TCMODULES += em_u32.o
 TCMODULES += em_meta.o
 
+TCSO :=
+ifeq ($(TC_CONFIG_ATM),y)
+  TCSO += q_atm.so
+endif
 
 ifeq ($(TC_CONFIG_XT),y)
-  TCMODULES += m_xt.o
-  LDLIBS += -lxtables
+  ifeq ($(TC_CONFIG_XT_MODULE),y)
+    TCSO += m_xt.so
+  else
+    TCMODULES += m_xt.o
+    LDLIBS += -lxtables
+  endif
 else
   ifeq ($(TC_CONFIG_XT_OLD),y)
     TCMODULES += m_xt_old.o
@@ -81,11 +89,6 @@ ifneq ($(IPT_LIB_DIR),)
 	CFLAGS += -DIPT_LIB_DIR=\"$(IPT_LIB_DIR)\"
 endif
 
-TCSO :=
-ifeq ($(TC_CONFIG_ATM),y)
-  TCSO += q_atm.so
-endif
-
 YACC := bison
 LEX := flex
 
@@ -114,6 +117,9 @@ clean:
 q_atm.so: q_atm.c
 	$(CC) $(CFLAGS) $(LDFLAGS) -shared -fpic -o q_atm.so q_atm.c -latm
 
+m_xt.so: m_xt.c
+	$(CC) $(CFLAGS) $(LDFLAGS) -shared -fpic -o m_xt.so m_xt.c -lxtables
+
 %.yacc.c: %.y
 	$(YACC) $(YACCFLAGS) -o $@ $<
 

^ permalink raw reply related

* Re: [v3 Patch 2/3] bridge: make bridge support netpoll
From: Eric Dumazet @ 2010-04-12 10:38 UTC (permalink / raw)
  To: Cong Wang
  Cc: Stephen Hemminger, linux-kernel, netdev, bridge, Andy Gospodarek,
	Neil Horman, Jeff Moyer, Matt Mackall, bonding-devel,
	Jay Vosburgh, David Miller
In-Reply-To: <4BC2F7E2.7020309@redhat.com>

Le lundi 12 avril 2010 à 18:37 +0800, Cong Wang a écrit :
> Stephen Hemminger wrote:
> > There is no protection on dev->priv_flags for SMP access.
> > It would better bit value in dev->state if you are using it as control flag.
> > 
> > Then you could use 
> > 			if (unlikely(test_and_clear_bit(__IN_NETPOLL, &skb->dev->state)))
> > 				netpoll_send_skb(...)
> > 
> > 
> 
> Hmm, I think we can't use ->state here, it is not for this kind of purpose,
> according to its comments.
> 
> Also, I find other usages of IFF_XXX flags of ->priv_flags are also using
> &, | to set or clear the flags. So there must be some other things preventing
> the race...

Yes, its RTNL that protects priv_flags changes, hopefully...

^ permalink raw reply

* Re: [v3 Patch 2/3] bridge: make bridge support netpoll
From: Cong Wang @ 2010-04-12 10:37 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: linux-kernel, netdev, bridge, Andy Gospodarek, Neil Horman,
	Jeff Moyer, Matt Mackall, bonding-devel, Jay Vosburgh,
	David Miller
In-Reply-To: <20100408083710.2b61ee44@nehalam>

Stephen Hemminger wrote:
>> Index: linux-2.6/net/bridge/br_forward.c
>> ===================================================================
>> --- linux-2.6.orig/net/bridge/br_forward.c
>> +++ linux-2.6/net/bridge/br_forward.c
>> @@ -15,6 +15,7 @@
>>  #include <linux/slab.h>
>>  #include <linux/kernel.h>
>>  #include <linux/netdevice.h>
>> +#include <linux/netpoll.h>
>>  #include <linux/skbuff.h>
>>  #include <linux/if_vlan.h>
>>  #include <linux/netfilter_bridge.h>
>> @@ -50,7 +51,13 @@ int br_dev_queue_push_xmit(struct sk_buf
>>  		else {
>>  			skb_push(skb, ETH_HLEN);
>>  
>> -			dev_queue_xmit(skb);
>> +#ifdef CONFIG_NET_POLL_CONTROLLER
>> +			if (skb->dev->priv_flags & IFF_IN_NETPOLL) {
>> +				netpoll_send_skb(skb->dev->npinfo->netpoll, skb);
>> +				skb->dev->priv_flags &= ~IFF_IN_NETPOLL;
>> +			} else
>> +#endif
> 
> There is no protection on dev->priv_flags for SMP access.
> It would better bit value in dev->state if you are using it as control flag.
> 
> Then you could use 
> 			if (unlikely(test_and_clear_bit(__IN_NETPOLL, &skb->dev->state)))
> 				netpoll_send_skb(...)
> 
> 

Hmm, I think we can't use ->state here, it is not for this kind of purpose,
according to its comments.

Also, I find other usages of IFF_XXX flags of ->priv_flags are also using
&, | to set or clear the flags. So there must be some other things preventing
the race...


Thanks.

^ permalink raw reply

* Re: [Patch 1/3] sysctl: refactor integer handling proc code
From: Alexey Dobriyan @ 2010-04-12 10:18 UTC (permalink / raw)
  To: Amerigo Wang
  Cc: linux-kernel, Octavian Purdila, Eric Dumazet, penguin-kernel,
	netdev, Neil Horman, ebiederm, David Miller
In-Reply-To: <20100412100754.5302.99552.sendpatchset@localhost.localdomain>

On Mon, Apr 12, 2010 at 06:04:04AM -0400, Amerigo Wang wrote:
> As we are about to add another integer handling proc function a little
> bit of cleanup is in order: add a few helper functions to improve code
> readability and decrease code duplication.
> 
> In the process a bug is also fixed: if the user specifies a number
> with more then 20 digits it will be interpreted as two integers
> (e.g. 10000...13 will be interpreted as 100.... and 13).

ULONG_MAX is not 22 digits always.

The fix is to not rely on simple_strtoul()

I guess it's time to finally remove it. :-(

Also, it's better to copy_from user stuff once.
Without looking at non-trivial users, one page should be enough.

> Behavior for EFAULT handling was changed as well. Previous to this
> patch, when an EFAULT error occurred in the middle of a write
> operation, although some of the elements were set, that was not
> acknowledged to the user (by shorting the write and returning the
> number of bytes accepted). EFAULT is now treated just like any other
> errors by acknowledging the amount of bytes accepted.

> +static int proc_skip_wspace(char __user **buf, size_t *size)
> +{
> +	char c;
> +
> +	while (*size) {
> +		if (get_user(c, *buf))
> +			return -EFAULT;
> +		if (!isspace(c))
> +			break;
> +		(*size)--;
> +		(*buf)++;
> +	}
> +
> +	return 0;
> +}

yeah, copy_from_user once, so we won't have this.

> +static bool isanyof(char c, const char *v, unsigned len)

A what?
this is memchr()

> +{
> +	int i;
> +
> +	if (!len)
> +		return false;
> +
> +	for (i = 0; i < len; i++)
> +		if (c == v[i])
> +			break;
> +	if (i == len)
> +		return false;
> +
> +	return true;
> +}
> +
> +#define TMPBUFLEN 22
> +/**
> + * proc_get_long - reads an ASCII formated integer from a user buffer
> + *
> + * @buf - user buffer
> + * @size - size of the user buffer
> + * @val - this is where the number will be stored
> + * @neg - set to %TRUE if number is negative
> + * @perm_tr - a vector which contains the allowed trailers
> + * @perm_tr_len - size of the perm_tr vector
> + * @tr - pointer to store the trailer character
> + *
> + * In case of success 0 is returned and buf and size are updated with
> + * the amount of bytes read. If tr is non NULL and a trailing
> + * character exist (size is non zero after returning from this
> + * function) tr is updated with the trailing character.
> + */
> +static int proc_get_long(char __user **buf, size_t *size,
> +			  unsigned long *val, bool *neg,
> +			  const char *perm_tr, unsigned perm_tr_len, char *tr)
> +{
> +	int len;
> +	char *p, tmp[TMPBUFLEN];
> +
> +	if (!*size)
> +		return -EINVAL;
> +
> +	len = *size;
> +	if (len > TMPBUFLEN-1)
> +		len = TMPBUFLEN-1;
> +
> +	if (copy_from_user(tmp, *buf, len))
> +		return -EFAULT;
> +
> +	tmp[len] = 0;
> +	p = tmp;
> +	if (*p == '-' && *size > 1) {
> +		*neg = 1;
> +		p++;
> +	} else
> +		*neg = 0;
> +	if (!isdigit(*p))
> +		return -EINVAL;
> +
> +	*val = simple_strtoul(p, &p, 0);
> +
> +	len = p - tmp;
> +
> +	/* We don't know if the next char is whitespace thus we may accept
> +	 * invalid integers (e.g. 1234...a) or two integers instead of one
> +	 * (e.g. 123...1). So lets not allow such large numbers. */
> +	if (len == TMPBUFLEN - 1)
> +		return -EINVAL;
> +
> +	if (len < *size && perm_tr_len && !isanyof(*p, perm_tr, perm_tr_len))
> +		return -EINVAL;
> +
> +	if (tr && (len < *size))
> +		*tr = *p;
> +
> +	*buf += len;
> +	*size -= len;
> +
> +	return 0;
> +}

^ permalink raw reply

* Re: [Patch 1/3] sysctl: refactor integer handling proc code
From: Alexey Dobriyan @ 2010-04-12 10:18 UTC (permalink / raw)
  To: Amerigo Wang
  Cc: linux-kernel, Octavian Purdila, Eric Dumazet, penguin-kernel,
	netdev, Neil Horman, ebiederm, David Miller
In-Reply-To: <20100412100754.5302.99552.sendpatchset@localhost.localdomain>

On Mon, Apr 12, 2010 at 06:04:04AM -0400, Amerigo Wang wrote:
> As we are about to add another integer handling proc function a little
> bit of cleanup is in order: add a few helper functions to improve code
> readability and decrease code duplication.
> 
> In the process a bug is also fixed: if the user specifies a number
> with more then 20 digits it will be interpreted as two integers
> (e.g. 10000...13 will be interpreted as 100.... and 13).

ULONG_MAX is not 22 digits always.

The fix is to not rely on simple_strtoul()

I guess it's time to finally remove it. :-(

Also, it's better to copy_from user stuff once.
Without looking at non-trivial users, one page should be enough.

> Behavior for EFAULT handling was changed as well. Previous to this
> patch, when an EFAULT error occurred in the middle of a write
> operation, although some of the elements were set, that was not
> acknowledged to the user (by shorting the write and returning the
> number of bytes accepted). EFAULT is now treated just like any other
> errors by acknowledging the amount of bytes accepted.

> +static int proc_skip_wspace(char __user **buf, size_t *size)
> +{
> +	char c;
> +
> +	while (*size) {
> +		if (get_user(c, *buf))
> +			return -EFAULT;
> +		if (!isspace(c))
> +			break;
> +		(*size)--;
> +		(*buf)++;
> +	}
> +
> +	return 0;
> +}

yeah, copy_from_user once, so we won't have this.

> +static bool isanyof(char c, const char *v, unsigned len)

A what?
this is memchr()

> +{
> +	int i;
> +
> +	if (!len)
> +		return false;
> +
> +	for (i = 0; i < len; i++)
> +		if (c == v[i])
> +			break;
> +	if (i == len)
> +		return false;
> +
> +	return true;
> +}
> +
> +#define TMPBUFLEN 22
> +/**
> + * proc_get_long - reads an ASCII formated integer from a user buffer
> + *
> + * @buf - user buffer
> + * @size - size of the user buffer
> + * @val - this is where the number will be stored
> + * @neg - set to %TRUE if number is negative
> + * @perm_tr - a vector which contains the allowed trailers
> + * @perm_tr_len - size of the perm_tr vector
> + * @tr - pointer to store the trailer character
> + *
> + * In case of success 0 is returned and buf and size are updated with
> + * the amount of bytes read. If tr is non NULL and a trailing
> + * character exist (size is non zero after returning from this
> + * function) tr is updated with the trailing character.
> + */
> +static int proc_get_long(char __user **buf, size_t *size,
> +			  unsigned long *val, bool *neg,
> +			  const char *perm_tr, unsigned perm_tr_len, char *tr)
> +{
> +	int len;
> +	char *p, tmp[TMPBUFLEN];
> +
> +	if (!*size)
> +		return -EINVAL;
> +
> +	len = *size;
> +	if (len > TMPBUFLEN-1)
> +		len = TMPBUFLEN-1;
> +
> +	if (copy_from_user(tmp, *buf, len))
> +		return -EFAULT;
> +
> +	tmp[len] = 0;
> +	p = tmp;
> +	if (*p == '-' && *size > 1) {
> +		*neg = 1;
> +		p++;
> +	} else
> +		*neg = 0;
> +	if (!isdigit(*p))
> +		return -EINVAL;
> +
> +	*val = simple_strtoul(p, &p, 0);
> +
> +	len = p - tmp;
> +
> +	/* We don't know if the next char is whitespace thus we may accept
> +	 * invalid integers (e.g. 1234...a) or two integers instead of one
> +	 * (e.g. 123...1). So lets not allow such large numbers. */
> +	if (len == TMPBUFLEN - 1)
> +		return -EINVAL;
> +
> +	if (len < *size && perm_tr_len && !isanyof(*p, perm_tr, perm_tr_len))
> +		return -EINVAL;
> +
> +	if (tr && (len < *size))
> +		*tr = *p;
> +
> +	*buf += len;
> +	*size -= len;
> +
> +	return 0;
> +}

^ permalink raw reply

* [Patch 3/3] net: reserve ports for applications using fixed port numbers
From: Amerigo Wang @ 2010-04-12 10:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, Amerigo Wang, David Miller, ebiederm
In-Reply-To: <20100412100744.5302.92442.sendpatchset@localhost.localdomain>

From: Octavian Purdila <opurdila@ixiacom.com>

This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which
allows users to reserve ports for third-party applications.

The reserved ports will not be used by automatic port assignments
(e.g. when calling connect() or bind() with port number 0). Explicit
port allocation behavior is unchanged.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---

Index: linux-2.6/Documentation/networking/ip-sysctl.txt
===================================================================
--- linux-2.6.orig/Documentation/networking/ip-sysctl.txt
+++ linux-2.6/Documentation/networking/ip-sysctl.txt
@@ -588,6 +588,37 @@ ip_local_port_range - 2 INTEGERS
 	(i.e. by default) range 1024-4999 is enough to issue up to
 	2000 connections per second to systems supporting timestamps.
 
+ip_local_reserved_ports - list of comma separated ranges
+	Specify the ports which are reserved for known third-party
+	applications. These ports will not be used by automatic port
+	assignments (e.g. when calling connect() or bind() with port
+	number 0). Explicit port allocation behavior is unchanged.
+
+	The format used for both input and output is a comma separated
+	list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and
+	10). Writing to the file will clear all previously reserved
+	ports and update the current list with the one given in the
+	input.
+
+	Note that ip_local_port_range and ip_local_reserved_ports
+	settings are independent and both are considered by the kernel
+	when determining which ports are available for automatic port
+	assignments.
+
+	You can reserve ports which are not in the current
+	ip_local_port_range, e.g.:
+
+	$ cat /proc/sys/net/ipv4/ip_local_port_range
+	32000	61000
+	$ cat /proc/sys/net/ipv4/ip_local_reserved_ports
+	8080,9148
+
+	although this is redundant. However such a setting is useful
+	if later the port range is changed to a value that will
+	include the reserved ports.
+
+	Default: Empty
+
 ip_nonlocal_bind - BOOLEAN
 	If set, allows processes to bind() to non-local IP addresses,
 	which can be quite useful - but may break some applications.
Index: linux-2.6/drivers/infiniband/core/cma.c
===================================================================
--- linux-2.6.orig/drivers/infiniband/core/cma.c
+++ linux-2.6/drivers/infiniband/core/cma.c
@@ -1980,6 +1980,8 @@ retry:
 	/* FIXME: add proper port randomization per like inet_csk_get_port */
 	do {
 		ret = idr_get_new_above(ps, bind_list, next_port, &port);
+		if (!ret && inet_is_reserved_local_port(port))
+			ret = -EAGAIN;
 	} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
 
 	if (ret)
@@ -2995,11 +2997,19 @@ static void cma_remove_one(struct ib_dev
 static int __init cma_init(void)
 {
 	int ret, low, high, remaining;
+	int tries = 10;
 
-	get_random_bytes(&next_port, sizeof next_port);
 	inet_get_local_port_range(&low, &high);
+again:
+	get_random_bytes(&next_port, sizeof next_port);
 	remaining = (high - low) + 1;
 	next_port = ((unsigned int) next_port % remaining) + low;
+	if (inet_is_reserved_local_port(next_port)) {
+		if (tries--)
+			goto again;
+		else
+			return -EBUSY;
+	}
 
 	cma_wq = create_singlethread_workqueue("rdma_cm");
 	if (!cma_wq)
Index: linux-2.6/include/net/ip.h
===================================================================
--- linux-2.6.orig/include/net/ip.h
+++ linux-2.6/include/net/ip.h
@@ -184,6 +184,12 @@ extern struct local_ports {
 } sysctl_local_ports;
 extern void inet_get_local_port_range(int *low, int *high);
 
+extern unsigned long *sysctl_local_reserved_ports;
+static inline int inet_is_reserved_local_port(int port)
+{
+	return test_bit(port, sysctl_local_reserved_ports);
+}
+
 extern int sysctl_ip_default_ttl;
 extern int sysctl_ip_nonlocal_bind;
 
Index: linux-2.6/net/ipv4/af_inet.c
===================================================================
--- linux-2.6.orig/net/ipv4/af_inet.c
+++ linux-2.6/net/ipv4/af_inet.c
@@ -1552,9 +1552,13 @@ static int __init inet_init(void)
 
 	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
 
+	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+	if (!sysctl_local_reserved_ports)
+		goto out;
+
 	rc = proto_register(&tcp_prot, 1);
 	if (rc)
-		goto out;
+		goto out_free_reserved_ports;
 
 	rc = proto_register(&udp_prot, 1);
 	if (rc)
@@ -1653,6 +1657,8 @@ out_unregister_udp_proto:
 	proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
 	proto_unregister(&tcp_prot);
+out_free_reserved_ports:
+	kfree(sysctl_local_reserved_ports);
 	goto out;
 }
 
Index: linux-2.6/net/ipv4/inet_connection_sock.c
===================================================================
--- linux-2.6.orig/net/ipv4/inet_connection_sock.c
+++ linux-2.6/net/ipv4/inet_connection_sock.c
@@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __
 	.range = { 32768, 61000 },
 };
 
+unsigned long *sysctl_local_reserved_ports;
+EXPORT_SYMBOL(sysctl_local_reserved_ports);
+
 void inet_get_local_port_range(int *low, int *high)
 {
 	unsigned seq;
@@ -108,6 +111,8 @@ again:
 
 		smallest_size = -1;
 		do {
+			if (inet_is_reserved_local_port(rover))
+				goto next_nolock;
 			head = &hashinfo->bhash[inet_bhashfn(net, rover,
 					hashinfo->bhash_size)];
 			spin_lock(&head->lock);
@@ -130,6 +135,7 @@ again:
 			break;
 		next:
 			spin_unlock(&head->lock);
+		next_nolock:
 			if (++rover > high)
 				rover = low;
 		} while (--remaining > 0);
Index: linux-2.6/net/ipv4/inet_hashtables.c
===================================================================
--- linux-2.6.orig/net/ipv4/inet_hashtables.c
+++ linux-2.6/net/ipv4/inet_hashtables.c
@@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_time
 		local_bh_disable();
 		for (i = 1; i <= remaining; i++) {
 			port = low + (i + offset) % remaining;
+			if (inet_is_reserved_local_port(port))
+				continue;
 			head = &hinfo->bhash[inet_bhashfn(net, port,
 					hinfo->bhash_size)];
 			spin_lock(&head->lock);
Index: linux-2.6/net/ipv4/sysctl_net_ipv4.c
===================================================================
--- linux-2.6.orig/net/ipv4/sysctl_net_ipv4.c
+++ linux-2.6/net/ipv4/sysctl_net_ipv4.c
@@ -299,6 +299,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= ipv4_local_port_range,
 	},
+	{
+		.procname	= "ip_local_reserved_ports",
+		.data		= NULL, /* initialized in sysctl_ipv4_init */
+		.maxlen		= 65536,
+		.mode		= 0644,
+		.proc_handler	= proc_do_large_bitmap,
+	},
 #ifdef CONFIG_IP_MULTICAST
 	{
 		.procname	= "igmp_max_memberships",
@@ -736,6 +743,16 @@ static __net_initdata struct pernet_oper
 static __init int sysctl_ipv4_init(void)
 {
 	struct ctl_table_header *hdr;
+	struct ctl_table *i;
+
+	for (i = ipv4_table; i->procname; i++) {
+		if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
+			i->data = sysctl_local_reserved_ports;
+			break;
+		}
+	}
+	if (!i->procname)
+		return -EINVAL;
 
 	hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
 	if (hdr == NULL)
Index: linux-2.6/net/ipv4/udp.c
===================================================================
--- linux-2.6.orig/net/ipv4/udp.c
+++ linux-2.6/net/ipv4/udp.c
@@ -233,7 +233,8 @@ int udp_lib_get_port(struct sock *sk, un
 			 */
 			do {
 				if (low <= snum && snum <= high &&
-				    !test_bit(snum >> udptable->log, bitmap))
+				    !test_bit(snum >> udptable->log, bitmap) &&
+				    !inet_is_reserved_local_port(snum))
 					goto found;
 				snum += rand;
 			} while (snum != first);
Index: linux-2.6/net/sctp/socket.c
===================================================================
--- linux-2.6.orig/net/sctp/socket.c
+++ linux-2.6/net/sctp/socket.c
@@ -5436,6 +5436,8 @@ static long sctp_get_port_local(struct s
 			rover++;
 			if ((rover < low) || (rover > high))
 				rover = low;
+			if (inet_is_reserved_local_port(rover))
+				continue;
 			index = sctp_phashfn(rover);
 			head = &sctp_port_hashtable[index];
 			sctp_spin_lock(&head->lock);

^ permalink raw reply

* [Patch 2/3] sysctl: add proc_do_large_bitmap
From: Amerigo Wang @ 2010-04-12 10:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, Amerigo Wang, ebiederm, David Miller
In-Reply-To: <20100412100744.5302.92442.sendpatchset@localhost.localdomain>

From: Octavian Purdila <opurdila@ixiacom.com>

The new function can be used to read/write large bitmaps via /proc. A
comma separated range format is used for compact output and input
(e.g. 1,3-4,10-10).

Writing into the file will first reset the bitmap then update it
based on the given input.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---

Index: linux-2.6/include/linux/sysctl.h
===================================================================
--- linux-2.6.orig/include/linux/sysctl.h
+++ linux-2.6/include/linux/sysctl.h
@@ -980,6 +980,8 @@ extern int proc_doulongvec_minmax(struct
 				  void __user *, size_t *, loff_t *);
 extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
 				      void __user *, size_t *, loff_t *);
+extern int proc_do_large_bitmap(struct ctl_table *, int,
+				void __user *, size_t *, loff_t *);
 
 /*
  * Register a set of sysctl names by calling register_sysctl_table
Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -2072,6 +2072,23 @@ static bool isanyof(char c, const char *
 	return true;
 }
 
+static int proc_skip_anyof(char __user **buf, size_t *size,
+			   const char *v, unsigned len)
+{
+	char c;
+
+	while (*size) {
+		if (get_user(c, *buf))
+			return -EFAULT;
+		if (!isanyof(c, v, len))
+			break;
+		(*size)--;
+		(*buf)++;
+	}
+
+	return 0;
+}
+
 #define TMPBUFLEN 22
 /**
  * proc_get_long - reads an ASCII formated integer from a user buffer
@@ -2663,6 +2680,135 @@ static int proc_do_cad_pid(struct ctl_ta
 	return 0;
 }
 
+/**
+ * proc_do_large_bitmap - read/write from/to a large bitmap
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * The bitmap is stored at table->data and the bitmap length (in bits)
+ * in table->maxlen.
+ *
+ * We use a range comma separated format (e.g. 1,3-4,10-10) so that
+ * large bitmaps may be represented in a compact manner. Writing into
+ * the file will clear the bitmap then update it with the given input.
+ *
+ * Returns 0 on success.
+ */
+int proc_do_large_bitmap(struct ctl_table *table, int write,
+			 void __user *_buffer, size_t *lenp, loff_t *ppos)
+{
+	int err = 0;
+	bool first = 1;
+	size_t left = *lenp;
+	unsigned long bitmap_len = table->maxlen;
+	char __user *buffer = (char __user *) _buffer;
+	unsigned long *bitmap = (unsigned long *) table->data;
+	unsigned long *tmp_bitmap = NULL;
+	char tr_a[] = { '-', ',', '\n', 0 }, tr_b[] = { ',', '\n', 0 }, c;
+	char tr_end[] = { '\n', 0 };
+
+
+	if (!bitmap_len || !left || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (write) {
+		tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
+				     GFP_KERNEL);
+		if (!tmp_bitmap)
+			return -ENOMEM;
+		err = proc_skip_anyof(&buffer, &left, tr_end, sizeof(tr_end));
+		while (!err && left) {
+			unsigned long val_a, val_b;
+			bool neg;
+
+			err = proc_get_long(&buffer, &left, &val_a, &neg, tr_a,
+					     sizeof(tr_a), &c);
+			if (err)
+				break;
+			if (val_a >= bitmap_len || neg) {
+				err = -EINVAL;
+				break;
+			}
+
+			val_b = val_a;
+			if (left) {
+				buffer++;
+				left--;
+			}
+
+			if (c == '-') {
+				err = proc_get_long(&buffer, &left, &val_b,
+						     &neg, tr_b, sizeof(tr_b),
+						     &c);
+				if (err)
+					break;
+				if (val_b >= bitmap_len || neg ||
+				    val_a > val_b) {
+					err = -EINVAL;
+					break;
+				}
+				if (left) {
+					buffer++;
+					left--;
+				}
+			}
+
+			while (val_a <= val_b)
+				set_bit(val_a++, tmp_bitmap);
+
+			first = 0;
+			err = proc_skip_anyof(&buffer, &left, tr_end,
+					      sizeof(tr_end));
+		}
+	} else {
+		unsigned long bit_a, bit_b = 0;
+
+		while (left) {
+			bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
+			if (bit_a >= bitmap_len)
+				break;
+			bit_b = find_next_zero_bit(bitmap, bitmap_len,
+						   bit_a + 1) - 1;
+
+			err = proc_put_long(&buffer, &left, bit_a, 0, first,
+					     ',');
+			if (err)
+				break;
+			if (bit_a != bit_b) {
+				err = proc_put_char(&buffer, &left, '-');
+				if (err)
+					break;
+				err = proc_put_long(&buffer, &left, bit_b, 0,
+						     1, 0);
+				if (err)
+					break;
+			}
+
+			first = 0; bit_b++;
+		}
+		if (!err)
+			err = proc_put_char(&buffer, &left, '\n');
+	}
+
+	if (!err) {
+		if (write)
+			memcpy(bitmap, tmp_bitmap,
+			       BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
+		kfree(tmp_bitmap);
+		*lenp -= left;
+		*ppos += *lenp;
+		return 0;
+	} else {
+		kfree(tmp_bitmap);
+		return err;
+	}
+}
+
 #else /* CONFIG_PROC_FS */
 
 int proc_dostring(struct ctl_table *table, int write,

^ permalink raw reply

* [Patch 1/3] sysctl: refactor integer handling proc code
From: Amerigo Wang @ 2010-04-12 10:04 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, ebiederm, David Miller, Amerigo Wang
In-Reply-To: <20100412100744.5302.92442.sendpatchset@localhost.localdomain>


From: Octavian Purdila <opurdila@ixiacom.com>

As we are about to add another integer handling proc function a little
bit of cleanup is in order: add a few helper functions to improve code
readability and decrease code duplication.

In the process a bug is also fixed: if the user specifies a number
with more then 20 digits it will be interpreted as two integers
(e.g. 10000...13 will be interpreted as 100.... and 13).

Behavior for EFAULT handling was changed as well. Previous to this
patch, when an EFAULT error occurred in the middle of a write
operation, although some of the elements were set, that was not
acknowledged to the user (by shorting the write and returning the
number of bytes accepted). EFAULT is now treated just like any other
errors by acknowledging the amount of bytes accepted.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---

Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -2040,8 +2040,148 @@ int proc_dostring(struct ctl_table *tabl
 			       buffer, lenp, ppos);
 }
 
+static int proc_skip_wspace(char __user **buf, size_t *size)
+{
+	char c;
+
+	while (*size) {
+		if (get_user(c, *buf))
+			return -EFAULT;
+		if (!isspace(c))
+			break;
+		(*size)--;
+		(*buf)++;
+	}
+
+	return 0;
+}
+
+static bool isanyof(char c, const char *v, unsigned len)
+{
+	int i;
+
+	if (!len)
+		return false;
+
+	for (i = 0; i < len; i++)
+		if (c == v[i])
+			break;
+	if (i == len)
+		return false;
+
+	return true;
+}
+
+#define TMPBUFLEN 22
+/**
+ * proc_get_long - reads an ASCII formated integer from a user buffer
+ *
+ * @buf - user buffer
+ * @size - size of the user buffer
+ * @val - this is where the number will be stored
+ * @neg - set to %TRUE if number is negative
+ * @perm_tr - a vector which contains the allowed trailers
+ * @perm_tr_len - size of the perm_tr vector
+ * @tr - pointer to store the trailer character
+ *
+ * In case of success 0 is returned and buf and size are updated with
+ * the amount of bytes read. If tr is non NULL and a trailing
+ * character exist (size is non zero after returning from this
+ * function) tr is updated with the trailing character.
+ */
+static int proc_get_long(char __user **buf, size_t *size,
+			  unsigned long *val, bool *neg,
+			  const char *perm_tr, unsigned perm_tr_len, char *tr)
+{
+	int len;
+	char *p, tmp[TMPBUFLEN];
+
+	if (!*size)
+		return -EINVAL;
+
+	len = *size;
+	if (len > TMPBUFLEN-1)
+		len = TMPBUFLEN-1;
+
+	if (copy_from_user(tmp, *buf, len))
+		return -EFAULT;
+
+	tmp[len] = 0;
+	p = tmp;
+	if (*p == '-' && *size > 1) {
+		*neg = 1;
+		p++;
+	} else
+		*neg = 0;
+	if (!isdigit(*p))
+		return -EINVAL;
+
+	*val = simple_strtoul(p, &p, 0);
+
+	len = p - tmp;
+
+	/* We don't know if the next char is whitespace thus we may accept
+	 * invalid integers (e.g. 1234...a) or two integers instead of one
+	 * (e.g. 123...1). So lets not allow such large numbers. */
+	if (len == TMPBUFLEN - 1)
+		return -EINVAL;
+
+	if (len < *size && perm_tr_len && !isanyof(*p, perm_tr, perm_tr_len))
+		return -EINVAL;
+
+	if (tr && (len < *size))
+		*tr = *p;
+
+	*buf += len;
+	*size -= len;
+
+	return 0;
+}
+
+/**
+ * proc_put_long - coverts an integer to a decimal ASCII formated string
+ *
+ * @buf - the user buffer
+ * @size - the size of the user buffer
+ * @val - the integer to be converted
+ * @neg - sign of the number, %TRUE for negative
+ * @first - if %FALSE will insert a separator character before the number
+ * @separator - the separator character
+ *
+ * In case of success 0 is returned and buf and size are updated with
+ * the amount of bytes read.
+ */
+static int proc_put_long(char __user **buf, size_t *size, unsigned long val,
+			  bool neg, bool first, char separator)
+{
+	int len;
+	char tmp[TMPBUFLEN], *p = tmp;
+
+	if (!first)
+		*p++ = separator;
+	sprintf(p, "%s%lu", neg ? "-" : "", val);
+	len = strlen(tmp);
+	if (len > *size)
+		len = *size;
+	if (copy_to_user(*buf, tmp, len))
+		return -EFAULT;
+	*size -= len;
+	*buf += len;
+	return 0;
+}
+#undef TMPBUFLEN
+
+static int proc_put_char(char __user **buf, size_t *size, char c)
+{
+	if (*size) {
+		if (put_user(c, *buf))
+			return -EFAULT;
+		(*size)--, (*buf)++;
+	}
+	return 0;
+}
 
-static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
 				 int *valp,
 				 int write, void *data)
 {
@@ -2050,7 +2190,7 @@ static int do_proc_dointvec_conv(int *ne
 	} else {
 		int val = *valp;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			*lvalp = (unsigned long)-val;
 		} else {
 			*negp = 0;
@@ -2060,20 +2200,18 @@ static int do_proc_dointvec_conv(int *ne
 	return 0;
 }
 
+static const char proc_wspace_sep[] = { ' ', '\t', '\n', 0 };
+
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
-		  int write, void __user *buffer,
+		  int write, void __user *_buffer,
 		  size_t *lenp, loff_t *ppos,
-		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
 		  void *data)
 {
-#define TMPBUFLEN 21
-	int *i, vleft, first = 1, neg;
-	unsigned long lval;
-	size_t left, len;
-	
-	char buf[TMPBUFLEN], *p;
-	char __user *s = buffer;
+	int *i, vleft, first = 1, err = 0;
+	size_t left;
+	char __user *buffer = (char __user *) _buffer;
 	
 	if (!tbl_data || !table->maxlen || !*lenp ||
 	    (*ppos && !write)) {
@@ -2089,88 +2227,48 @@ static int __do_proc_dointvec(void *tbl_
 		conv = do_proc_dointvec_conv;
 
 	for (; left && vleft--; i++, first=0) {
-		if (write) {
-			while (left) {
-				char c;
-				if (get_user(c, s))
-					return -EFAULT;
-				if (!isspace(c))
-					break;
-				left--;
-				s++;
-			}
-			if (!left)
-				break;
-			neg = 0;
-			len = left;
-			if (len > sizeof(buf) - 1)
-				len = sizeof(buf) - 1;
-			if (copy_from_user(buf, s, len))
-				return -EFAULT;
-			buf[len] = 0;
-			p = buf;
-			if (*p == '-' && left > 1) {
-				neg = 1;
-				p++;
-			}
-			if (*p < '0' || *p > '9')
-				break;
-
-			lval = simple_strtoul(p, &p, 0);
+		unsigned long lval;
+		bool neg;
 
-			len = p-buf;
-			if ((len < left) && *p && !isspace(*p))
+		if (write) {
+			err = proc_skip_wspace(&buffer, &left);
+			if (err)
+				return err;
+			err = proc_get_long(&buffer, &left, &lval, &neg,
+					     proc_wspace_sep,
+					     sizeof(proc_wspace_sep), NULL);
+			if (err)
 				break;
-			s += len;
-			left -= len;
-
-			if (conv(&neg, &lval, i, 1, data))
+			if (conv(&neg, &lval, i, 1, data)) {
+				err = -EINVAL;
 				break;
+			}
 		} else {
-			p = buf;
-			if (!first)
-				*p++ = '\t';
-	
-			if (conv(&neg, &lval, i, 0, data))
+			if (conv(&neg, &lval, i, 0, data)) {
+				err = -EINVAL;
 				break;
-
-			sprintf(p, "%s%lu", neg ? "-" : "", lval);
-			len = strlen(buf);
-			if (len > left)
-				len = left;
-			if(copy_to_user(s, buf, len))
-				return -EFAULT;
-			left -= len;
-			s += len;
-		}
-	}
-
-	if (!write && !first && left) {
-		if(put_user('\n', s))
-			return -EFAULT;
-		left--, s++;
-	}
-	if (write) {
-		while (left) {
-			char c;
-			if (get_user(c, s++))
-				return -EFAULT;
-			if (!isspace(c))
+			}
+			err = proc_put_long(&buffer, &left, lval, neg, first,
+					     '\t');
+			if (err)
 				break;
-			left--;
 		}
 	}
+
+	if (!write && !first && left && !err)
+		err = proc_put_char(&buffer, &left, '\n');
+	if (write && !err)
+		err = proc_skip_wspace(&buffer, &left);
 	if (write && first)
-		return -EINVAL;
+		return err ? : -EINVAL;
 	*lenp -= left;
 	*ppos += *lenp;
 	return 0;
-#undef TMPBUFLEN
 }
 
 static int do_proc_dointvec(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos,
-		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
 		  void *data)
 {
@@ -2238,8 +2336,8 @@ struct do_proc_dointvec_minmax_conv_para
 	int *max;
 };
 
-static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, 
-					int *valp, 
+static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
+					int *valp,
 					int write, void *data)
 {
 	struct do_proc_dointvec_minmax_conv_param *param = data;
@@ -2252,7 +2350,7 @@ static int do_proc_dointvec_minmax_conv(
 	} else {
 		int val = *valp;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			*lvalp = (unsigned long)-val;
 		} else {
 			*negp = 0;
@@ -2290,17 +2388,15 @@ int proc_dointvec_minmax(struct ctl_tabl
 }
 
 static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write,
-				     void __user *buffer,
+				     void __user *_buffer,
 				     size_t *lenp, loff_t *ppos,
 				     unsigned long convmul,
 				     unsigned long convdiv)
 {
-#define TMPBUFLEN 21
-	unsigned long *i, *min, *max, val;
-	int vleft, first=1, neg;
-	size_t len, left;
-	char buf[TMPBUFLEN], *p;
-	char __user *s = buffer;
+	unsigned long *i, *min, *max;
+	int vleft, first = 1, err = 0;
+	size_t left;
+	char __user *buffer = (char __user *) _buffer;
 	
 	if (!data || !table->maxlen || !*lenp ||
 	    (*ppos && !write)) {
@@ -2315,82 +2411,42 @@ static int __do_proc_doulongvec_minmax(v
 	left = *lenp;
 	
 	for (; left && vleft--; i++, min++, max++, first=0) {
+		unsigned long val;
+
 		if (write) {
-			while (left) {
-				char c;
-				if (get_user(c, s))
-					return -EFAULT;
-				if (!isspace(c))
-					break;
-				left--;
-				s++;
-			}
-			if (!left)
-				break;
-			neg = 0;
-			len = left;
-			if (len > TMPBUFLEN-1)
-				len = TMPBUFLEN-1;
-			if (copy_from_user(buf, s, len))
-				return -EFAULT;
-			buf[len] = 0;
-			p = buf;
-			if (*p == '-' && left > 1) {
-				neg = 1;
-				p++;
-			}
-			if (*p < '0' || *p > '9')
-				break;
-			val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
-			len = p-buf;
-			if ((len < left) && *p && !isspace(*p))
+			bool neg;
+
+			err = proc_skip_wspace(&buffer, &left);
+			if (err)
+				return err;
+			err = proc_get_long(&buffer, &left, &val, &neg,
+					     proc_wspace_sep,
+					     sizeof(proc_wspace_sep), NULL);
+			if (err)
 				break;
 			if (neg)
-				val = -val;
-			s += len;
-			left -= len;
-
-			if(neg)
 				continue;
 			if ((min && val < *min) || (max && val > *max))
 				continue;
 			*i = val;
 		} else {
-			p = buf;
-			if (!first)
-				*p++ = '\t';
-			sprintf(p, "%lu", convdiv * (*i) / convmul);
-			len = strlen(buf);
-			if (len > left)
-				len = left;
-			if(copy_to_user(s, buf, len))
-				return -EFAULT;
-			left -= len;
-			s += len;
-		}
-	}
-
-	if (!write && !first && left) {
-		if(put_user('\n', s))
-			return -EFAULT;
-		left--, s++;
-	}
-	if (write) {
-		while (left) {
-			char c;
-			if (get_user(c, s++))
-				return -EFAULT;
-			if (!isspace(c))
+			val = convdiv * (*i) / convmul;
+			err = proc_put_long(&buffer, &left, val, 0, first,
+					     '\t');
+			if (err)
 				break;
-			left--;
 		}
 	}
+
+	if (!write && !first && left && !err)
+		err = proc_put_char(&buffer, &left, '\n');
+	if (write && !err)
+		err = proc_skip_wspace(&buffer, &left);
 	if (write && first)
-		return -EINVAL;
+		return err ? : -EINVAL;
 	*lenp -= left;
 	*ppos += *lenp;
 	return 0;
-#undef TMPBUFLEN
 }
 
 static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
@@ -2451,7 +2507,7 @@ int proc_doulongvec_ms_jiffies_minmax(st
 }
 
 
-static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
 					 int *valp,
 					 int write, void *data)
 {
@@ -2463,7 +2519,7 @@ static int do_proc_dointvec_jiffies_conv
 		int val = *valp;
 		unsigned long lval;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			lval = (unsigned long)-val;
 		} else {
 			*negp = 0;
@@ -2474,7 +2530,7 @@ static int do_proc_dointvec_jiffies_conv
 	return 0;
 }
 
-static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
 						int *valp,
 						int write, void *data)
 {
@@ -2486,7 +2542,7 @@ static int do_proc_dointvec_userhz_jiffi
 		int val = *valp;
 		unsigned long lval;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			lval = (unsigned long)-val;
 		} else {
 			*negp = 0;
@@ -2497,7 +2553,7 @@ static int do_proc_dointvec_userhz_jiffi
 	return 0;
 }
 
-static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
 					    int *valp,
 					    int write, void *data)
 {
@@ -2507,7 +2563,7 @@ static int do_proc_dointvec_ms_jiffies_c
 		int val = *valp;
 		unsigned long lval;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			lval = (unsigned long)-val;
 		} else {
 			*negp = 0;

^ permalink raw reply

* [Patch v8 0/3] net: reserve ports for applications using fixed port numbers
From: Amerigo Wang @ 2010-04-12 10:03 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, ebiederm, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, Amerigo Wang, David Miller

Changes from the previous version:
- Rename proc_{get,put}_ulong to proc_{get,put}_long();
- Fix potential dead loop problems in cma code.

------------->

This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which
allows users to reserve ports for third-party applications.

The reserved ports will not be used by automatic port assignments
(e.g. when calling connect() or bind() with port number 0). Explicit
port allocation behavior is unchanged.

There are still some miss behaviors with regard to proc parsing in odd
invalid cases (for "40000\0-40001" all is acknowledged but only 40000
is accepted) but they are not easy to fix without changing the current
"acknowledge how much we accepted" behavior.

Because of that and because the same issues are present in the
existing proc_dointvec code as well I don't think its worth holding
the actual feature (port reservation) after such petty error recovery
issues.

^ permalink raw reply

* Bug#572201: forcedeth driver hangs under heavy load
From: stephen mulcahy @ 2010-04-12 10:01 UTC (permalink / raw)
  To: netdev; +Cc: Ben Hutchings, Eric Dumazet, Ayaz Abdulla, 572201
In-Reply-To: <1270942606.6179.64.camel@localhost>

Ben Hutchings wrote:
> Stephen Mulcahy reported a regression in forcedeth at
> <http://bugs.debian.org/572201>.  The system information and some
> diagnostic information can be found there.  Anyone able to help?

Incidentally, I also tried the 2.6.33.2 kernel with 
CONFIG_FORCEDETH_NAPI set to "y" to see if that made a difference.

It doesn't - further testing over the weekend saw 6 of 45 machines drop 
off the network with this problem. Nothing in dmesg or system logs. 
Happy to run more tests if someone can advise on what should be run.

-stephen

-- 
Stephen Mulcahy     Atlantic Linux         http://www.atlanticlinux.ie
Registered in Ireland, no. 376591 (144 Ros Caoin, Roscam, Galway)

^ permalink raw reply

* Re: NULL pointer dereference panic in stable (2.6.33.2), amd64
From: Eric Dumazet @ 2010-04-12  9:31 UTC (permalink / raw)
  To: Krishna Kumar2; +Cc: David Miller, netdev, Denys Fedorysychenko
In-Reply-To: <OF802D1BE7.C2709B61-ON65257703.00235EF5-65257703.002B5F38@in.ibm.com>

Le lundi 12 avril 2010 à 13:24 +0530, Krishna Kumar2 a écrit :

> If the dst got changed between call to vlan_dev_hwaccel_hard_start_xmit
> and it's call to dev_queue_xmit, that change to dst should have reset
> sk_tx_queue_mapping to -1 by calling sk_tx_queue_clear (assuming that I
> have changed in all paths, eg __sk_dst_reset), and thus result in a new
> mapping in dev_pick_tx. Would the patch hide the actual bug where we do
> not clear sk_tx_queue_mapping, eg __sk_dst_set does it? I agree the
> patch will fix the panic, but this check could be removed if the code
> which changes the dst is fixed to clear the mapping. I could check that
> if you think this assumption is correct.
> 

I believe you focus on another problem. I am not saying we dont have
another bug (forgetting to reset sk_dst_cache somewhere).

I am only saying that when we want to cache the queue number on a given
socket, we have to make sure current packet routing decision was taken
on same dst_entries than current and future ones. Denys hit the problem
because of long delays caused by traffic shaping.

So the cache renew must be safe, which I tried to fix.

You are saying that cache invalidation might be missing from some paths.
I dont think so because I took an extensive look at these spots when
working on yet another RCU conversion two days ago (sk_dst_lock becomes
a spinlock). This was fresh in my mind, this is why I probably found
Denys problem origin so quickly ;)




^ permalink raw reply

* Re: [RFC] random SYN drops causing connect() delays
From: Thomas Graf @ 2010-04-12  8:39 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20100412080633.GA27418@bombadil.infradead.org>

On Mon, Apr 12, 2010 at 04:06:33AM -0400, Thomas Graf wrote:
>  - While the issue is appearing, the acceptq seems to be overflowing. Both
>    LISTENOVERFLOWS and LISTENDROPS are increasing although not by the exact
>    number of delay occurences. inetdiag reports sk_max_ack_backlog to be 0
>    therefore one possibility that comes to mind is that sk_ack_backlog
>    underflows due to a race.

Forget about the underflow thought, inetdiag was reporting falsely.
sk_max_ack_backlog is set to 128 as it should and the listen overflow
happens normally. Still the fact remains that while the issue is appearing
listen overflows are counted.

^ permalink raw reply

* [PATCH net-next-2.6] be2net: clarify promiscuous cmd with a comment
From: Sathya Perla @ 2010-04-12  8:35 UTC (permalink / raw)
  To: netdev; +Cc: pugs

The promiscous cmd config code gives an impression that
setting a port to promisc mode will unset the other port.
This is not the case and is clarified with a comment.

Signed-off-by: Sathya Perla <sathyap@serverengines.com>
---
 drivers/net/benet/be_cmds.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/drivers/net/benet/be_cmds.c b/drivers/net/benet/be_cmds.c
index da87930..e79bf8b 100644
--- a/drivers/net/benet/be_cmds.c
+++ b/drivers/net/benet/be_cmds.c
@@ -1113,6 +1113,10 @@ int be_cmd_promiscuous_config(struct be_adapter *adapter, u8 port_num, bool en)
 	be_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_ETH,
 		OPCODE_ETH_PROMISCUOUS, sizeof(*req));
 
+	/* In FW versions X.102.149/X.101.487 and later,
+	 * the port setting associated only with the
+	 * issuing pci function will take effect
+	 */
 	if (port_num)
 		req->port1_promiscuous = en;
 	else
-- 
1.6.3.3


^ permalink raw reply related

* [RFC] random SYN drops causing connect() delays
From: Thomas Graf @ 2010-04-12  8:06 UTC (permalink / raw)
  To: netdev

Hello,

I have been tracking down an issue commonly referred to as the 3-sec
connect() delay. It exists since recent 2.6.x kernels and has never
been fixed even though it disappeared in recent releases unless
sched_child_runs_first is set to 1 again.

What happens is that if a client attemps to open many connections to
a socket with only minimal delay inbetween attemps some SYNs are
randomly dropped on the server side causing the client to resend after
the 3 sec TCP timeout and thus causing connect()s to be randomly delayed.

Steps to reproduce:
 1. Compile reproducer attached below
 2. run ./test_delay 127.0.0.1 22 10000 0 > log
 3. awk -F: '{if ($2>2990) print $1 $2;}' log
 4. all listed connection attemps will have been delayed for >3s

Facts:
 - Issue can be reproduced over loopback or real networks.
 - Enabling SO_LINGER on the client side will make the issue disappear!!
 - While the issue is appearing, the acceptq seems to be overflowing. Both
   LISTENOVERFLOWS and LISTENDROPS are increasing although not by the exact
   number of delay occurences. inetdiag reports sk_max_ack_backlog to be 0
   therefore one possibility that comes to mind is that sk_ack_backlog
   underflows due to a race.
 - The issue disappeared in recent kernels, I bisected it down to the following
   commit:
	commit 2bba22c50b06abe9fd0d23933b1e64d35b419262
	Author: Mike Galbraith <efault@gmx.de>
	Date:   Wed Sep 9 15:41:37 2009 +0200

	    sched: Turn off child_runs_first
	    
	    Set child_runs_first default to off.

   Setting kernel.sched_child_runs_first=1 makes the isssue reappear in recent
   kernels.  This hardens the theory of a race condition.
 - It looks like that the issue can only be reproduced if the server
   socket sends out data immediately after the connection has been established
   but I cannot proof this theory.

I will continue to look into the sk_ack_backlog underflow theory but would
appreciate any comments or theories.

Thanks,

Reproducer:

#include <sys/socket.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <netdb.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <time.h>
#include <sys/time.h>

int main(int argc, char *argv[])
{

        int sock,i;  
        struct timeval tim;
        double start,end;  

        struct hostent *host;
        struct sockaddr_in server_addr, local;
	socklen_t len = sizeof(local);

        char* hostname;
        int port, count, delay;

        if( argc < 3 ){
           printf("Usage:\n\t%s host port [count=1000] [delay=0]\n",argv[0]);
           return 1;
        }

        hostname = argv[1];
        port = atoi(argv[2]);

        if( argc > 3 )
           count = atoi(argv[3]);
	else
           count = 1000;

        if( argc > 4 )
           delay = atoi(argv[4]);
	else
           delay = 0;

        host = gethostbyname(hostname);

        server_addr.sin_family = AF_INET;     
        server_addr.sin_port = htons(port);   
        server_addr.sin_addr = *((struct in_addr *)host->h_addr);
        bzero(&(server_addr.sin_zero),8); 

        for(i=0; i< count; i=i+1){
          gettimeofday(&tim, NULL);
          start=tim.tv_sec*1000+(tim.tv_usec/1000);

          if ((sock = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
            perror("Socket");
            exit(1);
          }

          if (connect(sock, (struct sockaddr *)&server_addr,
                    sizeof(struct sockaddr)) == -1) 
          {
            perror("Connect");
            exit(1);
          }

	  getsockname(sock, (struct sockaddr *) &local, &len);
          close(sock);

          gettimeofday(&tim, NULL);
          end=tim.tv_sec*1000+(tim.tv_usec/1000);
          printf("[%d] %u-> Time to open socket (clock): %d\n",
	  	i, ntohs(local.sin_port), (int)(end - start));
	  usleep(delay*1000);
        }
/*
        printf("Time to open socket (ms): %d\n", ((end - start)*1000)/CLOCKS_PER_SEC);
*/

        return 0;
}



^ permalink raw reply

* Re: NULL pointer dereference panic in stable (2.6.33.2), amd64
From: Krishna Kumar2 @ 2010-04-12  7:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Denys Fedorysychenko
In-Reply-To: <1271052111.2078.168.camel@edumazet-laptop>

Hi Eric,

Thanks for your patch, just one question on it though:

Eric Dumazet <eric.dumazet@gmail.com> wrote on 04/12/2010 11:31:51 AM:

> > When route changes, I think my patch had reset sk->sk_tx_queue_mapping
> > by calling sk_tx_queue_clear. I don't know if I missed any path where
> > the route changes and sk_dst_reset() was not called.
> >
>
> Problem is when you reset sk->sk_tx_queue_mapping at the very moment
> route (or destination) changes, we might have old packets queued in tx
> queues, of the old ethernet device (eth0 : multi queue compatable)

> 2) Application does a sendmsg() or connect() call and sk->sk_dst_cache
> is rebuild, it points to a dst_entry referring a new device (eth1 : non
> multiqueue)
>
> 3) When one old packet finally is transmitted, we do :
>
>    queue_index = 1; // any value > 0
>
>    if (sk && sk->sk_dst_cache)
>       sk_tx_queue_set(sk, queue_index); // remember a >0 value
>
> 4) application does a sendmsg(), enqueues a new skb on eth1
>
> 5) We re-enter dev_pick_tx(), and consider cached value in 3) is valid.
>    we pick a non existent txq for eth1 device.
>
> 6) We crash.
>
> > The following might be better to prove the panic is due to this, since
> > your suggestion will hide a panic that happens somewhat rare (according
> > to Denys):
> >
> >       if (sk_tx_queue_recorded(sk)) {
> >             queue_index = sk_tx_queue_get(sk);
> > +           queue_index = dev_cap_txqueue(dev, queue_index);
> >       } else {
> >
>
> Sure, but I thought I was clear enough to prove this commit was wrong,
> and we have to find a fix.

If the dst got changed between call to vlan_dev_hwaccel_hard_start_xmit
and it's call to dev_queue_xmit, that change to dst should have reset
sk_tx_queue_mapping to -1 by calling sk_tx_queue_clear (assuming that I
have changed in all paths, eg __sk_dst_reset), and thus result in a new
mapping in dev_pick_tx. Would the patch hide the actual bug where we do
not clear sk_tx_queue_mapping, eg __sk_dst_set does it? I agree the
patch will fix the panic, but this check could be removed if the code
which changes the dst is fixed to clear the mapping. I could check that
if you think this assumption is correct.

Thanks,

- KK


^ permalink raw reply

* [PATCH 2.6.34-git] 8139too: fix Coding Styles
From: Javier Blanco de Torres (Neurowork) @ 2010-04-12  7:36 UTC (permalink / raw)
  To: netdev

[-- Attachment #1: Type: text/plain, Size: 290 bytes --]

Fixed coding styles in the 8139too net driver.

Signed-off-by: Javier Blanco de Torres <jblanco@neurowork.net>
Signed-off-by: Alejandro Sánchez Acosta <asanchez@neurowork.net>

-- 
Javier Blanco
Community Manager
jblanco@neurowork.net

Neurowork > http://www.neurowork.net
(+34) 916 851 242

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 8139too-patch-fix-coding-style.patch --]
[-- Type: text/x-patch; name="8139too-patch-fix-coding-style.patch", Size: 62768 bytes --]

diff --git a/drivers/net/8139too.c b/drivers/net/8139too.c
index a03d291..305491a 100644
--- a/drivers/net/8139too.c
+++ b/drivers/net/8139too.c
@@ -11,7 +11,7 @@
 
 	-----<snip>-----
 
-        	Written 1997-2001 by Donald Becker.
+		Written 1997-2001 by Donald Becker.
 		This software may be used and distributed according to the
 		terms of the GNU General Public License (GPL), incorporated
 		herein by reference.  Drivers based on or derived from this
@@ -117,8 +117,8 @@
 
 /* Default Message level */
 #define RTL8139_DEF_MSG_ENABLE   (NETIF_MSG_DRV   | \
-                                 NETIF_MSG_PROBE  | \
-                                 NETIF_MSG_LINK)
+				NETIF_MSG_PROBE  | \
+				NETIF_MSG_LINK)
 
 
 /* define to 1, 2 or 3 to enable copious debugging info */
@@ -132,10 +132,10 @@
 #  define assert(expr) do {} while (0)
 #else
 #  define assert(expr) \
-        if (unlikely(!(expr))) {				\
+	if (unlikely(!(expr))) {				\
 		pr_err("Assertion failed! %s,%s,%s,line=%d\n",	\
 		       #expr, __FILE__, __func__, __LINE__);	\
-        }
+	}
 #endif
 
 
@@ -149,7 +149,7 @@ static int full_duplex[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1};
 #ifdef CONFIG_8139TOO_PIO
 static int use_io = 1;
 #else
-static int use_io = 0;
+static int use_io;
 #endif
 
 /* Maximum number of multicast addresses to filter (vs. Rx-all-multicast).
@@ -192,7 +192,8 @@ static int debug = -1;
    Threshold is bytes transferred to chip before transmission starts. */
 #define TX_FIFO_THRESH 256	/* In bytes, rounded down to 32 byte units. */
 
-/* The following settings are log_2(bytes)-4:  0 == 16 bytes .. 6==1024, 7==end of packet. */
+/* The following settings are log_2(bytes)-4: */
+/* 0 == 16 bytes .. 6==1024, 7==end of packet. */
 #define RX_FIFO_THRESH	7	/* Rx buffer level before first PCI xfer.  */
 #define RX_DMA_BURST	7	/* Maximum PCI burst, '6' is 1024 */
 #define TX_DMA_BURST	6	/* Maximum PCI burst, '6' is 1024 */
@@ -217,7 +218,7 @@ enum {
 #define RTL8129_CAPS	HAS_MII_XCVR
 #define RTL8139_CAPS	(HAS_CHIP_XCVR|HAS_LNK_CHNG)
 
-typedef enum {
+enum {
 	RTL8139 = 0,
 	RTL8129,
 } board_t;
@@ -272,7 +273,7 @@ static DEFINE_PCI_DEVICE_TABLE(rtl8139_pci_tbl) = {
 
 	{0,}
 };
-MODULE_DEVICE_TABLE (pci, rtl8139_pci_tbl);
+MODULE_DEVICE_TABLE(pci, rtl8139_pci_tbl);
 
 static struct {
 	const char str[ETH_GSTRING_LEN];
@@ -385,12 +386,12 @@ enum rx_mode_bits {
 
 /* Bits in TxConfig. */
 enum tx_config_bits {
-        /* Interframe Gap Time. Only TxIFG96 doesn't violate IEEE 802.3 */
-        TxIFGShift	= 24,
-        TxIFG84		= (0 << TxIFGShift), /* 8.4us / 840ns (10 / 100Mbps) */
-        TxIFG88		= (1 << TxIFGShift), /* 8.8us / 880ns (10 / 100Mbps) */
-        TxIFG92		= (2 << TxIFGShift), /* 9.2us / 920ns (10 / 100Mbps) */
-        TxIFG96		= (3 << TxIFGShift), /* 9.6us / 960ns (10 / 100Mbps) */
+	/* Interframe Gap Time. Only TxIFG96 doesn't violate IEEE 802.3 */
+	TxIFGShift	= 24,
+	TxIFG84		= (0 << TxIFGShift), /* 8.4us / 840ns (10 / 100Mbps) */
+	TxIFG88		= (1 << TxIFGShift), /* 8.8us / 880ns (10 / 100Mbps) */
+	TxIFG92		= (2 << TxIFGShift), /* 9.2us / 920ns (10 / 100Mbps) */
+	TxIFG96		= (3 << TxIFGShift), /* 9.6us / 960ns (10 / 100Mbps) */
 
 	TxLoopBack	= (1 << 18) | (1 << 17), /* enable loopback test mode */
 	TxCRC		= (1 << 16),	/* DISABLE Tx pkt CRC append */
@@ -417,14 +418,14 @@ enum Config1Bits {
 
 /* Bits in Config3 */
 enum Config3Bits {
-	Cfg3_FBtBEn   	= (1 << 0), /* 1	= Fast Back to Back */
-	Cfg3_FuncRegEn	= (1 << 1), /* 1	= enable CardBus Function registers */
-	Cfg3_CLKRUN_En	= (1 << 2), /* 1	= enable CLKRUN */
-	Cfg3_CardB_En 	= (1 << 3), /* 1	= enable CardBus registers */
-	Cfg3_LinkUp   	= (1 << 4), /* 1	= wake up on link up */
-	Cfg3_Magic    	= (1 << 5), /* 1	= wake up on Magic Packet (tm) */
-	Cfg3_PARM_En  	= (1 << 6), /* 0	= software can set twister parameters */
-	Cfg3_GNTSel   	= (1 << 7), /* 1	= delay 1 clock from PCI GNT signal */
+	Cfg3_FBtBEn	= (1 << 0), /* 1 = Fast Back to Back */
+	Cfg3_FuncRegEn	= (1 << 1), /* 1 = enable CardBus Function registers */
+	Cfg3_CLKRUN_En	= (1 << 2), /* 1 = enable CLKRUN */
+	Cfg3_LinkUp	= (1 << 4), /* 1 = wake up on link up */
+	Cfg3_Magic	= (1 << 5), /* 1 = wake up on Magic Packet (tm) */
+	Cfg3_PARM_En	= (1 << 6), /* 0 = software can set twister*/
+						/*parameters */
+	Cfg3_GNTSel	= (1 << 7), /* 1 = delay 1 clock from PCI GNT signal */
 };
 
 /* Bits in Config4 */
@@ -434,13 +435,13 @@ enum Config4Bits {
 
 /* Bits in Config5 */
 enum Config5Bits {
-	Cfg5_PME_STS   	= (1 << 0), /* 1	= PCI reset resets PME_Status */
-	Cfg5_LANWake   	= (1 << 1), /* 1	= enable LANWake signal */
-	Cfg5_LDPS      	= (1 << 2), /* 0	= save power when link is down */
-	Cfg5_FIFOAddrPtr= (1 << 3), /* Realtek internal SRAM testing */
-	Cfg5_UWF        = (1 << 4), /* 1 = accept unicast wakeup frame */
-	Cfg5_MWF        = (1 << 5), /* 1 = accept multicast wakeup frame */
-	Cfg5_BWF        = (1 << 6), /* 1 = accept broadcast wakeup frame */
+	Cfg5_PME_STS	= (1 << 0), /* 1= PCI reset resets PME_Status */
+	Cfg5_LANWake	= (1 << 1), /* 1= enable LANWake signal */
+	Cfg5_LDPS	= (1 << 2), /* 0= save power when link is down */
+	Cfg5_FIFOAddrPtr = (1 << 3), /* Realtek internal SRAM testing */
+	Cfg5_UWF	= (1 << 4), /* 1 = accept unicast wakeup frame */
+	Cfg5_MWF	= (1 << 5), /* 1 = accept multicast wakeup frame */
+	Cfg5_BWF	= (1 << 6), /* 1 = accept broadcast wakeup frame */
 };
 
 enum RxConfigBits {
@@ -477,7 +478,7 @@ enum Cfg9346Bits {
 	Cfg9346_Unlock	= 0xC0,
 };
 
-typedef enum {
+enum {
 	CH_8139	= 0,
 	CH_8139_K,
 	CH_8139A,
@@ -542,8 +543,8 @@ static const struct {
 
 	{ "RTL-8100",
 	  HW_REVID(1, 1, 1, 1, 0, 1, 0),
- 	  HasLWake,
- 	},
+	HasLWake,
+	},
 
 	{ "RTL-8100B/8139D",
 	  HW_REVID(1, 1, 1, 0, 1, 0, 1),
@@ -588,9 +589,9 @@ struct rtl8139_private {
 				/* Twister tune state. */
 	char			twistie, twist_row, twist_col;
 
-	unsigned int		watchdog_fired : 1;
-	unsigned int		default_port : 4; /* Last dev->if_port value. */
-	unsigned int		have_thread : 1;
+	unsigned int		watchdog_fired:1;
+	unsigned int		default_port:4; /* Last dev->if_port value. */
+	unsigned int		have_thread:1;
 
 	spinlock_t		lock;
 	spinlock_t		rx_lock;
@@ -606,8 +607,8 @@ struct rtl8139_private {
 	unsigned long		fifo_copy_timeout;
 };
 
-MODULE_AUTHOR ("Jeff Garzik <jgarzik@pobox.com>");
-MODULE_DESCRIPTION ("RealTek RTL-8139 Fast Ethernet driver");
+MODULE_AUTHOR("Jeff Garzik <jgarzik@pobox.com>");
+MODULE_DESCRIPTION("RealTek RTL-8139 Fast Ethernet driver");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRV_VERSION);
 
@@ -617,52 +618,56 @@ module_param(multicast_filter_limit, int, 0);
 module_param_array(media, int, NULL, 0);
 module_param_array(full_duplex, int, NULL, 0);
 module_param(debug, int, 0);
-MODULE_PARM_DESC (debug, "8139too bitmapped message enable number");
-MODULE_PARM_DESC (multicast_filter_limit, "8139too maximum number of filtered multicast addresses");
-MODULE_PARM_DESC (media, "8139too: Bits 4+9: force full duplex, bit 5: 100Mbps");
-MODULE_PARM_DESC (full_duplex, "8139too: Force full duplex for board(s) (1)");
-
-static int read_eeprom (void __iomem *ioaddr, int location, int addr_len);
-static int rtl8139_open (struct net_device *dev);
-static int mdio_read (struct net_device *dev, int phy_id, int location);
-static void mdio_write (struct net_device *dev, int phy_id, int location,
+MODULE_PARM_DESC(debug, "8139too bitmapped message enable number");
+MODULE_PARM_DESC(multicast_filter_limit, "8139too maximum number "
+					"of filtered multicast addresses");
+MODULE_PARM_DESC(media, "8139too: Bits 4+9: force full duplex, bit 5: 100Mbps");
+MODULE_PARM_DESC(full_duplex, "8139too: Force full duplex for board(s) (1)");
+
+static int read_eeprom(void __iomem *ioaddr, int location, int addr_len);
+static int rtl8139_open(struct net_device *dev);
+static int mdio_read(struct net_device *dev, int phy_id, int location);
+static void mdio_write(struct net_device *dev, int phy_id, int location,
 			int val);
 static void rtl8139_start_thread(struct rtl8139_private *tp);
-static void rtl8139_tx_timeout (struct net_device *dev);
-static void rtl8139_init_ring (struct net_device *dev);
-static netdev_tx_t rtl8139_start_xmit (struct sk_buff *skb,
+static void rtl8139_tx_timeout(struct net_device *dev);
+static void rtl8139_init_ring(struct net_device *dev);
+static netdev_tx_t rtl8139_start_xmit(struct sk_buff *skb,
 				       struct net_device *dev);
 #ifdef CONFIG_NET_POLL_CONTROLLER
 static void rtl8139_poll_controller(struct net_device *dev);
 #endif
 static int rtl8139_set_mac_address(struct net_device *dev, void *p);
 static int rtl8139_poll(struct napi_struct *napi, int budget);
-static irqreturn_t rtl8139_interrupt (int irq, void *dev_instance);
-static int rtl8139_close (struct net_device *dev);
-static int netdev_ioctl (struct net_device *dev, struct ifreq *rq, int cmd);
-static struct net_device_stats *rtl8139_get_stats (struct net_device *dev);
-static void rtl8139_set_rx_mode (struct net_device *dev);
-static void __set_rx_mode (struct net_device *dev);
-static void rtl8139_hw_start (struct net_device *dev);
-static void rtl8139_thread (struct work_struct *work);
+static irqreturn_t rtl8139_interrupt(int irq, void *dev_instance);
+static int rtl8139_close(struct net_device *dev);
+static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
+static struct net_device_stats *rtl8139_get_stats(struct net_device *dev);
+static void rtl8139_set_rx_mode(struct net_device *dev);
+static void __set_rx_mode(struct net_device *dev);
+static void rtl8139_hw_start(struct net_device *dev);
+static void rtl8139_thread(struct work_struct *work);
 static void rtl8139_tx_timeout_task(struct work_struct *work);
 static const struct ethtool_ops rtl8139_ethtool_ops;
 
 /* write MMIO register, with flush */
 /* Flush avoids rtl8139 bug w/ posted MMIO writes */
-#define RTL_W8_F(reg, val8)	do { iowrite8 ((val8), ioaddr + (reg)); ioread8 (ioaddr + (reg)); } while (0)
-#define RTL_W16_F(reg, val16)	do { iowrite16 ((val16), ioaddr + (reg)); ioread16 (ioaddr + (reg)); } while (0)
-#define RTL_W32_F(reg, val32)	do { iowrite32 ((val32), ioaddr + (reg)); ioread32 (ioaddr + (reg)); } while (0)
+#define RTL_W8_F(reg, val8)	do { iowrite8((val8), "
+		"ioaddr + (reg)); ioread8(ioaddr + (reg)); } while (0)
+#define RTL_W16_F(reg, val16)	do { iowrite16((val16), "
+		"ioaddr + (reg)); ioread16(ioaddr + (reg)); } while (0)
+#define RTL_W32_F(reg, val32)	do { iowrite32((val32), "
+		"ioaddr + (reg)); ioread32(ioaddr + (reg)); } while (0)
 
 /* write MMIO register */
-#define RTL_W8(reg, val8)	iowrite8 ((val8), ioaddr + (reg))
-#define RTL_W16(reg, val16)	iowrite16 ((val16), ioaddr + (reg))
-#define RTL_W32(reg, val32)	iowrite32 ((val32), ioaddr + (reg))
+#define RTL_W8(reg, val8)	iowrite8((val8), ioaddr + (reg))
+#define RTL_W16(reg, val16)	iowrite16((val16), ioaddr + (reg))
+#define RTL_W32(reg, val32)	iowrite32((val32), ioaddr + (reg))
 
 /* read MMIO register */
-#define RTL_R8(reg)		ioread8 (ioaddr + (reg))
-#define RTL_R16(reg)		ioread16 (ioaddr + (reg))
-#define RTL_R32(reg)		((unsigned long) ioread32 (ioaddr + (reg)))
+#define RTL_R8(reg)ioread8(ioaddr + (reg))
+#define RTL_R16(reg)ioread16(ioaddr + (reg))
+#define RTL_R32(reg)((unsigned long) ioread32(ioaddr + (reg)))
 
 
 static const u16 rtl8139_intr_mask =
@@ -700,44 +705,44 @@ static const unsigned int rtl8139_rx_config =
 static const unsigned int rtl8139_tx_config =
 	TxIFG96 | (TX_DMA_BURST << TxDMAShift) | (TX_RETRY << TxRetryShift);
 
-static void __rtl8139_cleanup_dev (struct net_device *dev)
+static void __rtl8139_cleanup_dev(struct net_device *dev)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 	struct pci_dev *pdev;
 
-	assert (dev != NULL);
-	assert (tp->pci_dev != NULL);
+	assert(dev != NULL);
+	assert(tp->pci_dev != NULL);
 	pdev = tp->pci_dev;
 
 	if (tp->mmio_addr)
-		pci_iounmap (pdev, tp->mmio_addr);
+		pci_iounmap(pdev, tp->mmio_addr);
 
 	/* it's ok to call this even if we have no regions to free */
-	pci_release_regions (pdev);
+	pci_release_regions(pdev);
 
 	free_netdev(dev);
-	pci_set_drvdata (pdev, NULL);
+	pci_set_drvdata(pdev, NULL);
 }
 
 
-static void rtl8139_chip_reset (void __iomem *ioaddr)
+static void rtl8139_chip_reset(void __iomem *ioaddr)
 {
 	int i;
 
 	/* Soft reset the chip. */
-	RTL_W8 (ChipCmd, CmdReset);
+	RTL_W8(ChipCmd, CmdReset);
 
 	/* Check that the chip has finished the reset. */
 	for (i = 1000; i > 0; i--) {
 		barrier();
-		if ((RTL_R8 (ChipCmd) & CmdReset) == 0)
+		if ((RTL_R8(ChipCmd) & CmdReset) == 0)
 			break;
-		udelay (10);
+		udelay(10);
 	}
 }
 
 
-static __devinit struct net_device * rtl8139_init_board (struct pci_dev *pdev)
+static __devinit struct net_device *rtl8139_init_board(struct pci_dev *pdev)
 {
 	void __iomem *ioaddr;
 	struct net_device *dev;
@@ -749,10 +754,10 @@ static __devinit struct net_device * rtl8139_init_board (struct pci_dev *pdev)
 	unsigned long mmio_start, mmio_end, mmio_flags, mmio_len;
 	u32 version;
 
-	assert (pdev != NULL);
+	assert(pdev != NULL);
 
 	/* dev and priv zeroed in alloc_etherdev */
-	dev = alloc_etherdev (sizeof (*tp));
+	dev = alloc_etherdev(sizeof(*tp));
 	if (dev == NULL) {
 		dev_err(&pdev->dev, "Unable to alloc new net device\n");
 		return ERR_PTR(-ENOMEM);
@@ -763,19 +768,19 @@ static __devinit struct net_device * rtl8139_init_board (struct pci_dev *pdev)
 	tp->pci_dev = pdev;
 
 	/* enable device (incl. PCI PM wakeup and hotplug setup) */
-	rc = pci_enable_device (pdev);
+	rc = pci_enable_device(pdev);
 	if (rc)
 		goto err_out;
 
-	pio_start = pci_resource_start (pdev, 0);
-	pio_end = pci_resource_end (pdev, 0);
-	pio_flags = pci_resource_flags (pdev, 0);
-	pio_len = pci_resource_len (pdev, 0);
+	pio_start = pci_resource_start(pdev, 0);
+	pio_end = pci_resource_end(pdev, 0);
+	pio_flags = pci_resource_flags(pdev, 0);
+	pio_len = pci_resource_len(pdev, 0);
 
-	mmio_start = pci_resource_start (pdev, 1);
-	mmio_end = pci_resource_end (pdev, 1);
-	mmio_flags = pci_resource_flags (pdev, 1);
-	mmio_len = pci_resource_len (pdev, 1);
+	mmio_start = pci_resource_start(pdev, 1);
+	mmio_end = pci_resource_end(pdev, 1);
+	mmio_flags = pci_resource_flags(pdev, 1);
+	mmio_len = pci_resource_len(pdev, 1);
 
 	/* set this immediately, we need to know before
 	 * we talk to the chip directly */
@@ -786,37 +791,41 @@ retry:
 	if (use_io) {
 		/* make sure PCI base addr 0 is PIO */
 		if (!(pio_flags & IORESOURCE_IO)) {
-			dev_err(&pdev->dev, "region #0 not a PIO resource, aborting\n");
+			dev_err(&pdev->dev, "region #0 not a "
+					"PIO resource, aborting\n");
 			rc = -ENODEV;
 			goto err_out;
 		}
 		/* check for weird/broken PCI region reporting */
 		if (pio_len < RTL_MIN_IO_SIZE) {
-			dev_err(&pdev->dev, "Invalid PCI I/O region size(s), aborting\n");
+			dev_err(&pdev->dev, "Invalid PCI I/O "
+					"region size(s), aborting\n");
 			rc = -ENODEV;
 			goto err_out;
 		}
 	} else {
 		/* make sure PCI base addr 1 is MMIO */
 		if (!(mmio_flags & IORESOURCE_MEM)) {
-			dev_err(&pdev->dev, "region #1 not an MMIO resource, aborting\n");
+			dev_err(&pdev->dev, "region #1 not an MMIO "
+						"resource, aborting\n");
 			rc = -ENODEV;
 			goto err_out;
 		}
 		if (mmio_len < RTL_MIN_IO_SIZE) {
-			dev_err(&pdev->dev, "Invalid PCI mem region size(s), aborting\n");
+			dev_err(&pdev->dev, "Invalid PCI mem region "
+						"size(s), aborting\n");
 			rc = -ENODEV;
 			goto err_out;
 		}
 	}
 
-	rc = pci_request_regions (pdev, DRV_NAME);
+	rc = pci_request_regions(pdev, DRV_NAME);
 	if (rc)
 		goto err_out;
 	disable_dev_on_err = 1;
 
 	/* enable PCI bus-mastering */
-	pci_set_master (pdev);
+	pci_set_master(pdev);
 
 	if (use_io) {
 		ioaddr = pci_iomap(pdev, 0, 0);
@@ -842,26 +851,27 @@ retry:
 	tp->mmio_addr = ioaddr;
 
 	/* Bring old chips out of low-power mode. */
-	RTL_W8 (HltClk, 'R');
+	RTL_W8(HltClk, 'R');
 
 	/* check for missing/broken hardware */
-	if (RTL_R32 (TxConfig) == 0xFFFFFFFF) {
+	if (RTL_R32(TxConfig) == 0xFFFFFFFF) {
 		dev_err(&pdev->dev, "Chip not responding, ignoring board\n");
 		rc = -EIO;
 		goto err_out;
 	}
 
 	/* identify chip attached to board */
-	version = RTL_R32 (TxConfig) & HW_REVID_MASK;
-	for (i = 0; i < ARRAY_SIZE (rtl_chip_info); i++)
+	version = RTL_R32(TxConfig) & HW_REVID_MASK;
+	for (i = 0; i < ARRAY_SIZE(rtl_chip_info); i++)
 		if (version == rtl_chip_info[i].version) {
 			tp->chipset = i;
 			goto match;
 		}
 
-	/* if unknown chip, assume array element #0, original RTL-8139 in this case */
+	/* if unknown chip, assume array element #0,
+				original RTL-8139 in this case */
 	dev_dbg(&pdev->dev, "unknown chip version, assuming RTL-8139\n");
-	dev_dbg(&pdev->dev, "TxConfig = 0x%lx\n", RTL_R32 (TxConfig));
+	dev_dbg(&pdev->dev, "TxConfig = 0x%lx\n", RTL_R32(TxConfig));
 	tp->chipset = 0;
 
 match:
@@ -869,40 +879,40 @@ match:
 		 version, i, rtl_chip_info[i].name);
 
 	if (tp->chipset >= CH_8139B) {
-		u8 new_tmp8 = tmp8 = RTL_R8 (Config1);
+		u8 new_tmp8 = tmp8 = RTL_R8(Config1);
 		pr_debug("PCI PM wakeup\n");
 		if ((rtl_chip_info[tp->chipset].flags & HasLWake) &&
 		    (tmp8 & LWAKE))
 			new_tmp8 &= ~LWAKE;
 		new_tmp8 |= Cfg1_PM_Enable;
 		if (new_tmp8 != tmp8) {
-			RTL_W8 (Cfg9346, Cfg9346_Unlock);
-			RTL_W8 (Config1, tmp8);
-			RTL_W8 (Cfg9346, Cfg9346_Lock);
+			RTL_W8(Cfg9346, Cfg9346_Unlock);
+			RTL_W8(Config1, tmp8);
+			RTL_W8(Cfg9346, Cfg9346_Lock);
 		}
 		if (rtl_chip_info[tp->chipset].flags & HasLWake) {
-			tmp8 = RTL_R8 (Config4);
+			tmp8 = RTL_R8(Config4);
 			if (tmp8 & LWPTN) {
-				RTL_W8 (Cfg9346, Cfg9346_Unlock);
-				RTL_W8 (Config4, tmp8 & ~LWPTN);
-				RTL_W8 (Cfg9346, Cfg9346_Lock);
+				RTL_W8(Cfg9346, Cfg9346_Unlock);
+				RTL_W8(Config4, tmp8 & ~LWPTN);
+				RTL_W8(Cfg9346, Cfg9346_Lock);
 			}
 		}
 	} else {
 		pr_debug("Old chip wakeup\n");
-		tmp8 = RTL_R8 (Config1);
+		tmp8 = RTL_R8(Config1);
 		tmp8 &= ~(SLEEP | PWRDN);
-		RTL_W8 (Config1, tmp8);
+		RTL_W8(Config1, tmp8);
 	}
 
-	rtl8139_chip_reset (ioaddr);
+	rtl8139_chip_reset(ioaddr);
 
 	return dev;
 
 err_out:
-	__rtl8139_cleanup_dev (dev);
+	__rtl8139_cleanup_dev(dev);
 	if (disable_dev_on_err)
-		pci_disable_device (pdev);
+		pci_disable_device(pdev);
 	return ERR_PTR(rc);
 }
 
@@ -912,17 +922,13 @@ static const struct net_device_ops rtl8139_netdev_ops = {
 	.ndo_get_stats		= rtl8139_get_stats,
 	.ndo_change_mtu		= eth_change_mtu,
 	.ndo_validate_addr	= eth_validate_addr,
-	.ndo_set_mac_address 	= rtl8139_set_mac_address,
-	.ndo_start_xmit		= rtl8139_start_xmit,
-	.ndo_set_multicast_list	= rtl8139_set_rx_mode,
-	.ndo_do_ioctl		= netdev_ioctl,
-	.ndo_tx_timeout		= rtl8139_tx_timeout,
+
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= rtl8139_poll_controller,
 #endif
 };
 
-static int __devinit rtl8139_init_one (struct pci_dev *pdev,
+static int __devinit rtl8139_init_one(struct pci_dev *pdev,
 				       const struct pci_device_id *ent)
 {
 	struct net_device *dev = NULL;
@@ -931,8 +937,8 @@ static int __devinit rtl8139_init_one (struct pci_dev *pdev,
 	void __iomem *ioaddr;
 	static int board_idx = -1;
 
-	assert (pdev != NULL);
-	assert (ent != NULL);
+	assert(pdev != NULL);
+	assert(ent != NULL);
 
 	board_idx++;
 
@@ -948,10 +954,12 @@ static int __devinit rtl8139_init_one (struct pci_dev *pdev,
 #endif
 
 	if (pdev->vendor == PCI_VENDOR_ID_REALTEK &&
-	    pdev->device == PCI_DEVICE_ID_REALTEK_8139 && pdev->revision >= 0x20) {
+	    pdev->device == PCI_DEVICE_ID_REALTEK_8139 && "
+					"pdev->revision >= 0x20) {
 		dev_info(&pdev->dev,
-			   "This (id %04x:%04x rev %02x) is an enhanced 8139C+ chip, use 8139cp\n",
-		       	   pdev->vendor, pdev->device, pdev->revision);
+			   "This (id %04x:%04x rev %02x) is an "
+					"enhanced 8139C+ chip, use 8139cp\n",
+		pdev->vendor, pdev->device, pdev->revision);
 		return -ENODEV;
 	}
 
@@ -963,21 +971,21 @@ static int __devinit rtl8139_init_one (struct pci_dev *pdev,
 		use_io = 1;
 	}
 
-	dev = rtl8139_init_board (pdev);
+	dev = rtl8139_init_board(pdev);
 	if (IS_ERR(dev))
 		return PTR_ERR(dev);
 
-	assert (dev != NULL);
+	assert(dev != NULL);
 	tp = netdev_priv(dev);
 	tp->dev = dev;
 
 	ioaddr = tp->mmio_addr;
-	assert (ioaddr != NULL);
+	assert(ioaddr != NULL);
 
-	addr_len = read_eeprom (ioaddr, 0, 8) == 0x8129 ? 8 : 6;
+	addr_len = read_eeprom(ioaddr, 0, 8) == 0x8129 ? 8 : 6;
 	for (i = 0; i < 3; i++)
 		((__le16 *) (dev->dev_addr))[i] =
-		    cpu_to_le16(read_eeprom (ioaddr, i + 7, addr_len));
+		    cpu_to_le16(read_eeprom(ioaddr, i + 7, addr_len));
 	memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
 
 	/* The Rtl8139-specific entries in the device structure. */
@@ -1002,8 +1010,8 @@ static int __devinit rtl8139_init_one (struct pci_dev *pdev,
 	tp->mmio_addr = ioaddr;
 	tp->msg_enable =
 		(debug < 0 ? RTL8139_DEF_MSG_ENABLE : ((1 << debug) - 1));
-	spin_lock_init (&tp->lock);
-	spin_lock_init (&tp->rx_lock);
+	spin_lock_init(&tp->lock);
+	spin_lock_init(&tp->rx_lock);
 	INIT_DELAYED_WORK(&tp->thread, rtl8139_thread);
 	tp->mii.dev = dev;
 	tp->mii.mdio_read = mdio_read;
@@ -1014,10 +1022,11 @@ static int __devinit rtl8139_init_one (struct pci_dev *pdev,
 	/* dev is fully set up and ready to use now */
 	pr_debug("about to register device named %s (%p)...\n",
 		 dev->name, dev);
-	i = register_netdev (dev);
-	if (i) goto err_out;
+	i = register_netdev(dev);
+	if (i)
+		goto err_out;
 
-	pci_set_drvdata (pdev, dev);
+	pci_set_drvdata(pdev, dev);
 
 	netdev_info(dev, "%s at 0x%lx, %pM, IRQ %d\n",
 		    board_info[ent->driver_data].name,
@@ -1037,12 +1046,14 @@ static int __devinit rtl8139_init_one (struct pci_dev *pdev,
 			if (mii_status != 0xffff  &&  mii_status != 0x0000) {
 				u16 advertising = mdio_read(dev, phy, 4);
 				tp->phys[phy_idx++] = phy;
-				netdev_info(dev, "MII transceiver %d status 0x%04x advertising %04x\n",
+				netdev_info(dev, "MII transceiver %d "
+					"status 0x%04x advertising %04x\n",
 					    phy, mii_status, advertising);
 			}
 		}
 		if (phy_idx == 0) {
-			netdev_info(dev, "No MII transceivers found! Assuming SYM transceiver\n");
+			netdev_info(dev, "No MII transceivers found! "
+						"Assuming SYM transceiver\n");
 			tp->phys[0] = 32;
 		}
 	} else
@@ -1071,35 +1082,36 @@ static int __devinit rtl8139_init_one (struct pci_dev *pdev,
 			    (option & 0x20 ? 100 : 10),
 			    (option & 0x10 ? "full" : "half"));
 		mdio_write(dev, tp->phys[0], 0,
-				   ((option & 0x20) ? 0x2000 : 0) | 	/* 100Mbps? */
-				   ((option & 0x10) ? 0x0100 : 0)); /* Full duplex? */
+			((option & 0x20) ? 0x2000 : 0) |	/* 100Mbps? */
+				   ((option & 0x10) ? 0x0100 : 0));
+							/* Full duplex? */
 	}
 
 	/* Put the chip into low-power mode. */
 	if (rtl_chip_info[tp->chipset].flags & HasHltClk)
-		RTL_W8 (HltClk, 'H');	/* 'R' would leave the clock running. */
+		RTL_W8(HltClk, 'H');	/* 'R' would leave the clock running. */
 
 	return 0;
 
 err_out:
-	__rtl8139_cleanup_dev (dev);
-	pci_disable_device (pdev);
+	__rtl8139_cleanup_dev(dev);
+	pci_disable_device(pdev);
 	return i;
 }
 
 
-static void __devexit rtl8139_remove_one (struct pci_dev *pdev)
+static void __devexit rtl8139_remove_one(struct pci_dev *pdev)
 {
-	struct net_device *dev = pci_get_drvdata (pdev);
+	struct net_device *dev = pci_get_drvdata(pdev);
 
-	assert (dev != NULL);
+	assert(dev != NULL);
 
 	flush_scheduled_work();
 
-	unregister_netdev (dev);
+	unregister_netdev(dev);
 
-	__rtl8139_cleanup_dev (dev);
-	pci_disable_device (pdev);
+	__rtl8139_cleanup_dev(dev);
+	pci_disable_device(pdev);
 }
 
 
@@ -1125,40 +1137,41 @@ static void __devexit rtl8139_remove_one (struct pci_dev *pdev)
 #define EE_READ_CMD		(6)
 #define EE_ERASE_CMD	(7)
 
-static int __devinit read_eeprom (void __iomem *ioaddr, int location, int addr_len)
+static int __devinit read_eeprom(void __iomem *ioaddr,
+					int location, int addr_len)
 {
 	int i;
 	unsigned retval = 0;
 	int read_cmd = location | (EE_READ_CMD << addr_len);
 
-	RTL_W8 (Cfg9346, EE_ENB & ~EE_CS);
-	RTL_W8 (Cfg9346, EE_ENB);
-	eeprom_delay ();
+	RTL_W8(Cfg9346, EE_ENB & ~EE_CS);
+	RTL_W8(Cfg9346, EE_ENB);
+	eeprom_delay();
 
 	/* Shift the read command bits out. */
 	for (i = 4 + addr_len; i >= 0; i--) {
 		int dataval = (read_cmd & (1 << i)) ? EE_DATA_WRITE : 0;
-		RTL_W8 (Cfg9346, EE_ENB | dataval);
-		eeprom_delay ();
-		RTL_W8 (Cfg9346, EE_ENB | dataval | EE_SHIFT_CLK);
-		eeprom_delay ();
+		RTL_W8(Cfg9346, EE_ENB | dataval);
+		eeprom_delay();
+		RTL_W8(Cfg9346, EE_ENB | dataval | EE_SHIFT_CLK);
+		eeprom_delay();
 	}
-	RTL_W8 (Cfg9346, EE_ENB);
-	eeprom_delay ();
+	RTL_W8(Cfg9346, EE_ENB);
+	eeprom_delay();
 
 	for (i = 16; i > 0; i--) {
-		RTL_W8 (Cfg9346, EE_ENB | EE_SHIFT_CLK);
-		eeprom_delay ();
+		RTL_W8(Cfg9346, EE_ENB | EE_SHIFT_CLK);
+		eeprom_delay();
 		retval =
-		    (retval << 1) | ((RTL_R8 (Cfg9346) & EE_DATA_READ) ? 1 :
+		    (retval << 1) | ((RTL_R8(Cfg9346) & EE_DATA_READ) ? 1 :
 				     0);
-		RTL_W8 (Cfg9346, EE_ENB);
-		eeprom_delay ();
+		RTL_W8(Cfg9346, EE_ENB);
+		eeprom_delay();
 	}
 
 	/* Terminate the EEPROM access. */
-	RTL_W8 (Cfg9346, ~EE_CS);
-	eeprom_delay ();
+	RTL_W8(Cfg9346, ~EE_CS);
+	eeprom_delay();
 
 	return retval;
 }
@@ -1193,20 +1206,20 @@ static const char mii_2_8139_map[8] = {
 
 #ifdef CONFIG_8139TOO_8129
 /* Syncronize the MII management interface by shifting 32 one bits out. */
-static void mdio_sync (void __iomem *ioaddr)
+static void mdio_sync(void __iomem *ioaddr)
 {
 	int i;
 
 	for (i = 32; i >= 0; i--) {
-		RTL_W8 (Config4, MDIO_WRITE1);
-		mdio_delay ();
-		RTL_W8 (Config4, MDIO_WRITE1 | MDIO_CLK);
-		mdio_delay ();
+		RTL_W8(Config4, MDIO_WRITE1);
+		mdio_delay();
+		RTL_W8(Config4, MDIO_WRITE1 | MDIO_CLK);
+		mdio_delay();
 	}
 }
 #endif
 
-static int mdio_read (struct net_device *dev, int phy_id, int location)
+static int mdio_read(struct net_device *dev, int phy_id, int location)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 	int retval = 0;
@@ -1219,28 +1232,29 @@ static int mdio_read (struct net_device *dev, int phy_id, int location)
 	if (phy_id > 31) {	/* Really a 8139.  Use internal registers. */
 		void __iomem *ioaddr = tp->mmio_addr;
 		return location < 8 && mii_2_8139_map[location] ?
-		    RTL_R16 (mii_2_8139_map[location]) : 0;
+		    RTL_R16(mii_2_8139_map[location]) : 0;
 	}
 
 #ifdef CONFIG_8139TOO_8129
-	mdio_sync (ioaddr);
+	mdio_sync(ioaddr);
 	/* Shift the read command bits out. */
 	for (i = 15; i >= 0; i--) {
 		int dataval = (mii_cmd & (1 << i)) ? MDIO_DATA_OUT : 0;
 
-		RTL_W8 (Config4, MDIO_DIR | dataval);
-		mdio_delay ();
-		RTL_W8 (Config4, MDIO_DIR | dataval | MDIO_CLK);
-		mdio_delay ();
+		RTL_W8(Config4, MDIO_DIR | dataval);
+		mdio_delay();
+		RTL_W8(Config4, MDIO_DIR | dataval | MDIO_CLK);
+		mdio_delay();
 	}
 
 	/* Read the two transition, 16 data, and wire-idle bits. */
 	for (i = 19; i > 0; i--) {
-		RTL_W8 (Config4, 0);
-		mdio_delay ();
-		retval = (retval << 1) | ((RTL_R8 (Config4) & MDIO_DATA_IN) ? 1 : 0);
-		RTL_W8 (Config4, MDIO_CLK);
-		mdio_delay ();
+		RTL_W8(Config4, 0);
+		mdio_delay();
+		retval = (retval << 1) | ((RTL_R8(Config4) & MDIO_DATA_IN) ? "
+									"1 : 0);
+		RTL_W8(Config4, MDIO_CLK);
+		mdio_delay();
 	}
 #endif
 
@@ -1248,57 +1262,59 @@ static int mdio_read (struct net_device *dev, int phy_id, int location)
 }
 
 
-static void mdio_write (struct net_device *dev, int phy_id, int location,
+static void mdio_write(struct net_device *dev, int phy_id, int location,
 			int value)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 #ifdef CONFIG_8139TOO_8129
 	void __iomem *ioaddr = tp->mmio_addr;
-	int mii_cmd = (0x5002 << 16) | (phy_id << 23) | (location << 18) | value;
+	int mii_cmd = (0x5002 << 16) | (phy_id << 23) | (location << 18) |
+	value;
 	int i;
 #endif
 
 	if (phy_id > 31) {	/* Really a 8139.  Use internal registers. */
 		void __iomem *ioaddr = tp->mmio_addr;
 		if (location == 0) {
-			RTL_W8 (Cfg9346, Cfg9346_Unlock);
-			RTL_W16 (BasicModeCtrl, value);
-			RTL_W8 (Cfg9346, Cfg9346_Lock);
+			RTL_W8(Cfg9346, Cfg9346_Unlock);
+			RTL_W16(BasicModeCtrl, value);
+			RTL_W8(Cfg9346, Cfg9346_Lock);
 		} else if (location < 8 && mii_2_8139_map[location])
-			RTL_W16 (mii_2_8139_map[location], value);
+			RTL_W16(mii_2_8139_map[location], value);
 		return;
 	}
 
 #ifdef CONFIG_8139TOO_8129
-	mdio_sync (ioaddr);
+	mdio_sync(ioaddr);
 
 	/* Shift the command bits out. */
 	for (i = 31; i >= 0; i--) {
 		int dataval =
 		    (mii_cmd & (1 << i)) ? MDIO_WRITE1 : MDIO_WRITE0;
-		RTL_W8 (Config4, dataval);
-		mdio_delay ();
-		RTL_W8 (Config4, dataval | MDIO_CLK);
-		mdio_delay ();
+		RTL_W8(Config4, dataval);
+		mdio_delay();
+		RTL_W8(Config4, dataval | MDIO_CLK);
+		mdio_delay();
 	}
 	/* Clear out extra bits. */
 	for (i = 2; i > 0; i--) {
-		RTL_W8 (Config4, 0);
-		mdio_delay ();
-		RTL_W8 (Config4, MDIO_CLK);
-		mdio_delay ();
+		RTL_W8(Config4, 0);
+		mdio_delay();
+		RTL_W8(Config4, MDIO_CLK);
+		mdio_delay();
 	}
 #endif
 }
 
 
-static int rtl8139_open (struct net_device *dev)
+static int rtl8139_open(struct net_device *dev)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 	int retval;
 	void __iomem *ioaddr = tp->mmio_addr;
 
-	retval = request_irq (dev->irq, rtl8139_interrupt, IRQF_SHARED, dev->name, dev);
+	retval = request_irq(dev->irq, rtl8139_interrupt,
+	IRQF_SHARED, dev->name, dev);
 	if (retval)
 		return retval;
 
@@ -1325,15 +1341,15 @@ static int rtl8139_open (struct net_device *dev)
 	tp->mii.full_duplex = tp->mii.force_media;
 	tp->tx_flag = (TX_FIFO_THRESH << 11) & 0x003f0000;
 
-	rtl8139_init_ring (dev);
-	rtl8139_hw_start (dev);
-	netif_start_queue (dev);
+	rtl8139_init_ring(dev);
+	rtl8139_hw_start(dev);
+	netif_start_queue(dev);
 
 	netif_dbg(tp, ifup, dev,
 		  "%s() ioaddr %#llx IRQ %d GP Pins %02x %s-duplex\n",
 		  __func__,
-		  (unsigned long long)pci_resource_start (tp->pci_dev, 1),
-		  dev->irq, RTL_R8 (MediaStatus),
+		  (unsigned long long)pci_resource_start(tp->pci_dev, 1),
+		  dev->irq, RTL_R8(MediaStatus),
 		  tp->mii.full_duplex ? "full" : "half");
 
 	rtl8139_start_thread(tp);
@@ -1342,17 +1358,16 @@ static int rtl8139_open (struct net_device *dev)
 }
 
 
-static void rtl_check_media (struct net_device *dev, unsigned int init_media)
+static void rtl_check_media(struct net_device *dev, unsigned int init_media)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 
-	if (tp->phys[0] >= 0) {
+	if (tp->phys[0] >= 0)
 		mii_check_media(&tp->mii, netif_msg_link(tp), init_media);
-	}
 }
 
 /* Start the hardware at open or resume. */
-static void rtl8139_hw_start (struct net_device *dev)
+static void rtl8139_hw_start(struct net_device *dev)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 	void __iomem *ioaddr = tp->mmio_addr;
@@ -1361,65 +1376,66 @@ static void rtl8139_hw_start (struct net_device *dev)
 
 	/* Bring old chips out of low-power mode. */
 	if (rtl_chip_info[tp->chipset].flags & HasHltClk)
-		RTL_W8 (HltClk, 'R');
+		RTL_W8(HltClk, 'R');
 
-	rtl8139_chip_reset (ioaddr);
+	rtl8139_chip_reset(ioaddr);
 
 	/* unlock Config[01234] and BMCR register writes */
-	RTL_W8_F (Cfg9346, Cfg9346_Unlock);
+	RTL_W8_F(Cfg9346, Cfg9346_Unlock);
 	/* Restore our idea of the MAC address. */
-	RTL_W32_F (MAC0 + 0, le32_to_cpu (*(__le32 *) (dev->dev_addr + 0)));
-	RTL_W32_F (MAC0 + 4, le16_to_cpu (*(__le16 *) (dev->dev_addr + 4)));
+	RTL_W32_F(MAC0 + 0, le32_to_cpu(*(__le32 *) (dev->dev_addr + 0)));
+	RTL_W32_F(MAC0 + 4, le16_to_cpu(*(__le16 *) (dev->dev_addr + 4)));
 
 	tp->cur_rx = 0;
 
 	/* init Rx ring buffer DMA address */
-	RTL_W32_F (RxBuf, tp->rx_ring_dma);
+	RTL_W32_F(RxBuf, tp->rx_ring_dma);
 
 	/* Must enable Tx/Rx before setting transfer thresholds! */
-	RTL_W8 (ChipCmd, CmdRxEnb | CmdTxEnb);
+	RTL_W8(ChipCmd, CmdRxEnb | CmdTxEnb);
 
 	tp->rx_config = rtl8139_rx_config | AcceptBroadcast | AcceptMyPhys;
-	RTL_W32 (RxConfig, tp->rx_config);
-	RTL_W32 (TxConfig, rtl8139_tx_config);
+	RTL_W32(RxConfig, tp->rx_config);
+	RTL_W32(TxConfig, rtl8139_tx_config);
 
-	rtl_check_media (dev, 1);
+	rtl_check_media(dev, 1);
 
 	if (tp->chipset >= CH_8139B) {
 		/* Disable magic packet scanning, which is enabled
 		 * when PM is enabled in Config1.  It can be reenabled
 		 * via ETHTOOL_SWOL if desired.  */
-		RTL_W8 (Config3, RTL_R8 (Config3) & ~Cfg3_Magic);
+		RTL_W8(Config3, RTL_R8(Config3) & ~Cfg3_Magic);
 	}
 
 	netdev_dbg(dev, "init buffer addresses\n");
 
 	/* Lock Config[01234] and BMCR register writes */
-	RTL_W8 (Cfg9346, Cfg9346_Lock);
+	RTL_W8(Cfg9346, Cfg9346_Lock);
 
 	/* init Tx buffer DMA addresses */
 	for (i = 0; i < NUM_TX_DESC; i++)
-		RTL_W32_F (TxAddr0 + (i * 4), tp->tx_bufs_dma + (tp->tx_buf[i] - tp->tx_bufs));
+		RTL_W32_F(TxAddr0 + (i * 4),
+		 tp->tx_bufs_dma + (tp->tx_buf[i] - tp->tx_bufs));
 
-	RTL_W32 (RxMissed, 0);
+	RTL_W32(RxMissed, 0);
 
-	rtl8139_set_rx_mode (dev);
+	rtl8139_set_rx_mode(dev);
 
 	/* no early-rx interrupts */
-	RTL_W16 (MultiIntr, RTL_R16 (MultiIntr) & MultiIntrClear);
+	RTL_W16(MultiIntr, RTL_R16(MultiIntr) & MultiIntrClear);
 
 	/* make sure RxTx has started */
-	tmp = RTL_R8 (ChipCmd);
+	tmp = RTL_R8(ChipCmd);
 	if ((!(tmp & CmdRxEnb)) || (!(tmp & CmdTxEnb)))
-		RTL_W8 (ChipCmd, CmdRxEnb | CmdTxEnb);
+		RTL_W8(ChipCmd, CmdRxEnb | CmdTxEnb);
 
 	/* Enable all known interrupts by setting the interrupt mask. */
-	RTL_W16 (IntrMask, rtl8139_intr_mask);
+	RTL_W16(IntrMask, rtl8139_intr_mask);
 }
 
 
 /* Initialize the Rx and Tx rings, along with various 'dev' bits. */
-static void rtl8139_init_ring (struct net_device *dev)
+static void rtl8139_init_ring(struct net_device *dev)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 	int i;
@@ -1437,7 +1453,7 @@ static void rtl8139_init_ring (struct net_device *dev)
 static int next_tick = 3 * HZ;
 
 #ifndef CONFIG_8139TOO_TUNE_TWISTER
-static inline void rtl8139_tune_twister (struct net_device *dev,
+static inline void rtl8139_tune_twister(struct net_device *dev,
 				  struct rtl8139_private *tp) {}
 #else
 enum TwisterParamVals {
@@ -1453,7 +1469,7 @@ static const unsigned long param[4][4] = {
 	{0xbb39de43, 0xbb39ce43, 0xbb39ce83, 0xbb39ce83}
 };
 
-static void rtl8139_tune_twister (struct net_device *dev,
+static void rtl8139_tune_twister(struct net_device *dev,
 				  struct rtl8139_private *tp)
 {
 	int linkcase;
@@ -1465,23 +1481,24 @@ static void rtl8139_tune_twister (struct net_device *dev,
 	 */
 	switch (tp->twistie) {
 	case 1:
-		if (RTL_R16 (CSCR) & CSCR_LinkOKBit) {
+		if (RTL_R16(CSCR) & CSCR_LinkOKBit) {
 			/* We have link beat, let us tune the twister. */
-			RTL_W16 (CSCR, CSCR_LinkDownOffCmd);
+			RTL_W16(CSCR, CSCR_LinkDownOffCmd);
 			tp->twistie = 2;	/* Change to state 2. */
 			next_tick = HZ / 10;
 		} else {
-			/* Just put in some reasonable defaults for when beat returns. */
-			RTL_W16 (CSCR, CSCR_LinkDownCmd);
-			RTL_W32 (FIFOTMS, 0x20);	/* Turn on cable test mode. */
-			RTL_W32 (PARA78, PARA78_default);
-			RTL_W32 (PARA7c, PARA7c_default);
+			/* Just put in some reasonable defaults for*/
+			/* when beat returns. */
+			RTL_W16(CSCR, CSCR_LinkDownCmd);
+			RTL_W32(FIFOTMS, 0x20);	/* Turn on cable test mode. */
+			RTL_W32(PARA78, PARA78_default);
+			RTL_W32(PARA7c, PARA7c_default);
 			tp->twistie = 0;	/* Bail from future actions. */
 		}
 		break;
 	case 2:
 		/* Read how long it took to hear the echo. */
-		linkcase = RTL_R16 (CSCR) & CSCR_LinkStatusBits;
+		linkcase = RTL_R16(CSCR) & CSCR_LinkStatusBits;
 		if (linkcase == 0x7000)
 			tp->twist_row = 3;
 		else if (linkcase == 0x3000)
@@ -1497,8 +1514,8 @@ static void rtl8139_tune_twister (struct net_device *dev,
 	case 3:
 		/* Put out four tuning parameters, one per 100msec. */
 		if (tp->twist_col == 0)
-			RTL_W16 (FIFOTMS, 0);
-		RTL_W32 (PARA7c, param[(int) tp->twist_row]
+			RTL_W16(FIFOTMS, 0);
+		RTL_W32(PARA7c, param[(int) tp->twist_row]
 			 [(int) tp->twist_col]);
 		next_tick = HZ / 10;
 		if (++tp->twist_col >= 4) {
@@ -1510,22 +1527,22 @@ static void rtl8139_tune_twister (struct net_device *dev,
 		break;
 	case 4:
 		/* Special case for long cables: check for mistune. */
-		if ((RTL_R16 (CSCR) &
+		if ((RTL_R16(CSCR) &
 		     CSCR_LinkStatusBits) == 0x7000) {
 			tp->twistie = 0;
 			break;
 		} else {
-			RTL_W32 (PARA7c, 0xfb38de03);
+			RTL_W32(PARA7c, 0xfb38de03);
 			tp->twistie = 5;
 			next_tick = HZ / 10;
 		}
 		break;
 	case 5:
 		/* Retune for shorter cable (column 2). */
-		RTL_W32 (FIFOTMS, 0x20);
-		RTL_W32 (PARA78, PARA78_default);
-		RTL_W32 (PARA7c, PARA7c_default);
-		RTL_W32 (FIFOTMS, 0x00);
+		RTL_W32(FIFOTMS, 0x20);
+		RTL_W32(PARA78, PARA78_default);
+		RTL_W32(PARA7c, PARA7c_default);
+		RTL_W32(FIFOTMS, 0x00);
 		tp->twist_row = 2;
 		tp->twist_col = 0;
 		tp->twistie = 3;
@@ -1539,13 +1556,13 @@ static void rtl8139_tune_twister (struct net_device *dev,
 }
 #endif /* CONFIG_8139TOO_TUNE_TWISTER */
 
-static inline void rtl8139_thread_iter (struct net_device *dev,
+static inline void rtl8139_thread_iter(struct net_device *dev,
 				 struct rtl8139_private *tp,
 				 void __iomem *ioaddr)
 {
 	int mii_lpa;
 
-	mii_lpa = mdio_read (dev, tp->phys[0], MII_LPA);
+	mii_lpa = mdio_read(dev, tp->phys[0], MII_LPA);
 
 	if (!tp->mii.force_media && mii_lpa != 0xffff) {
 		int duplex = ((mii_lpa & LPA_100FULL) ||
@@ -1554,23 +1571,27 @@ static inline void rtl8139_thread_iter (struct net_device *dev,
 			tp->mii.full_duplex = duplex;
 
 			if (mii_lpa) {
-				netdev_info(dev, "Setting %s-duplex based on MII #%d link partner ability of %04x\n",
-					    tp->mii.full_duplex ? "full" : "half",
+				netdev_info(dev, "Setting %s-duplex based "
+				"on MII #%d link partner ability of %04x\n",
+					    tp->mii.full_duplex ? "full""
+							" : "half",
 					    tp->phys[0], mii_lpa);
 			} else {
-				netdev_info(dev, "media is unconnected, link down, or incompatible connection\n");
+				netdev_info(dev, "media is unconnected, "
+					"link down, or incompatible "
+							"connection\n");
 			}
 #if 0
-			RTL_W8 (Cfg9346, Cfg9346_Unlock);
-			RTL_W8 (Config1, tp->mii.full_duplex ? 0x60 : 0x20);
-			RTL_W8 (Cfg9346, Cfg9346_Lock);
+			RTL_W8(Cfg9346, Cfg9346_Unlock);
+			RTL_W8(Config1, tp->mii.full_duplex ? 0x60 : 0x20);
+			RTL_W8(Cfg9346, Cfg9346_Lock);
 #endif
 		}
 	}
 
 	next_tick = HZ * 60;
 
-	rtl8139_tune_twister (dev, tp);
+	rtl8139_tune_twister(dev, tp);
 
 	netdev_dbg(dev, "Media selection tick, Link partner %04x\n",
 		   RTL_R16(NWayLPAR));
@@ -1580,7 +1601,7 @@ static inline void rtl8139_thread_iter (struct net_device *dev,
 		   RTL_R8(Config0), RTL_R8(Config1));
 }
 
-static void rtl8139_thread (struct work_struct *work)
+static void rtl8139_thread(struct work_struct *work)
 {
 	struct rtl8139_private *tp =
 		container_of(work, struct rtl8139_private, thread.work);
@@ -1601,7 +1622,7 @@ static void rtl8139_thread (struct work_struct *work)
 	if (tp->have_thread)
 		schedule_delayed_work(&tp->thread, thr_delay);
 out_unlock:
-	rtnl_unlock ();
+	rtnl_unlock();
 }
 
 static void rtl8139_start_thread(struct rtl8139_private *tp)
@@ -1618,7 +1639,7 @@ static void rtl8139_start_thread(struct rtl8139_private *tp)
 	schedule_delayed_work(&tp->thread, next_tick);
 }
 
-static inline void rtl8139_tx_clear (struct rtl8139_private *tp)
+static inline void rtl8139_tx_clear(struct rtl8139_private *tp)
 {
 	tp->cur_tx = 0;
 	tp->dirty_tx = 0;
@@ -1626,7 +1647,7 @@ static inline void rtl8139_tx_clear (struct rtl8139_private *tp)
 	/* XXX account for unsent Tx packets in tp->stats.tx_dropped */
 }
 
-static void rtl8139_tx_timeout_task (struct work_struct *work)
+static void rtl8139_tx_timeout_task(struct work_struct *work)
 {
 	struct rtl8139_private *tp =
 		container_of(work, struct rtl8139_private, thread.work);
@@ -1650,28 +1671,28 @@ static void rtl8139_tx_timeout_task (struct work_struct *work)
 	tp->xstats.tx_timeouts++;
 
 	/* disable Tx ASAP, if not already */
-	tmp8 = RTL_R8 (ChipCmd);
+	tmp8 = RTL_R8(ChipCmd);
 	if (tmp8 & CmdTxEnb)
-		RTL_W8 (ChipCmd, CmdRxEnb);
+		RTL_W8(ChipCmd, CmdRxEnb);
 
 	spin_lock_bh(&tp->rx_lock);
 	/* Disable interrupts by clearing the interrupt mask. */
-	RTL_W16 (IntrMask, 0x0000);
+	RTL_W16(IntrMask, 0x0000);
 
 	/* Stop a shared interrupt from scavenging while we are. */
 	spin_lock_irq(&tp->lock);
-	rtl8139_tx_clear (tp);
+	rtl8139_tx_clear(tp);
 	spin_unlock_irq(&tp->lock);
 
 	/* ...and finally, reset everything */
 	if (netif_running(dev)) {
-		rtl8139_hw_start (dev);
-		netif_wake_queue (dev);
+		rtl8139_hw_start(dev);
+		netif_wake_queue(dev);
 	}
 	spin_unlock_bh(&tp->rx_lock);
 }
 
-static void rtl8139_tx_timeout (struct net_device *dev)
+static void rtl8139_tx_timeout(struct net_device *dev)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 
@@ -1682,7 +1703,7 @@ static void rtl8139_tx_timeout (struct net_device *dev)
 	}
 }
 
-static netdev_tx_t rtl8139_start_xmit (struct sk_buff *skb,
+static netdev_tx_t rtl8139_start_xmit(struct sk_buff *skb,
 					     struct net_device *dev)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
@@ -1713,7 +1734,7 @@ static netdev_tx_t rtl8139_start_xmit (struct sk_buff *skb,
 	 * to make sure that the device sees the updated data.
 	 */
 	wmb();
-	RTL_W32_F (TxStatus0 + (entry * sizeof (u32)),
+	RTL_W32_F(TxStatus0 + (entry * sizeof(u32)),
 		   tp->tx_flag | max(len, (unsigned int)ETH_ZLEN));
 
 	dev->trans_start = jiffies;
@@ -1721,7 +1742,7 @@ static netdev_tx_t rtl8139_start_xmit (struct sk_buff *skb,
 	tp->cur_tx++;
 
 	if ((tp->cur_tx - NUM_TX_DESC) == tp->dirty_tx)
-		netif_stop_queue (dev);
+		netif_stop_queue(dev);
 	spin_unlock_irqrestore(&tp->lock, flags);
 
 	netif_dbg(tp, tx_queued, dev, "Queued Tx packet size %u to slot %d\n",
@@ -1731,14 +1752,14 @@ static netdev_tx_t rtl8139_start_xmit (struct sk_buff *skb,
 }
 
 
-static void rtl8139_tx_interrupt (struct net_device *dev,
+static void rtl8139_tx_interrupt(struct net_device *dev,
 				  struct rtl8139_private *tp,
 				  void __iomem *ioaddr)
 {
 	unsigned long dirty_tx, tx_left;
 
-	assert (dev != NULL);
-	assert (ioaddr != NULL);
+	assert(dev != NULL);
+	assert(ioaddr != NULL);
 
 	dirty_tx = tp->dirty_tx;
 	tx_left = tp->cur_tx - dirty_tx;
@@ -1746,7 +1767,7 @@ static void rtl8139_tx_interrupt (struct net_device *dev,
 		int entry = dirty_tx % NUM_TX_DESC;
 		int txstatus;
 
-		txstatus = RTL_R32 (TxStatus0 + (entry * sizeof (u32)));
+		txstatus = RTL_R32(TxStatus0 + (entry * sizeof(u32)));
 
 		if (!(txstatus & (TxStatOK | TxUnderrun | TxAborted)))
 			break;	/* It still hasn't been Txed */
@@ -1754,13 +1775,14 @@ static void rtl8139_tx_interrupt (struct net_device *dev,
 		/* Note: TxCarrierLost is always asserted at 100mbps. */
 		if (txstatus & (TxOutOfWindow | TxAborted)) {
 			/* There was an major error, log it. */
-			netif_dbg(tp, tx_err, dev, "Transmit error, Tx status %08x\n",
+			netif_dbg(tp, tx_err, dev, "Transmit error, "
+							"Tx status %08x\n",
 				  txstatus);
 			dev->stats.tx_errors++;
 			if (txstatus & TxAborted) {
 				dev->stats.tx_aborted_errors++;
-				RTL_W32 (TxConfig, TxClearAbt);
-				RTL_W16 (IntrStatus, TxErr);
+				RTL_W32(TxConfig, TxClearAbt);
+				RTL_W16(IntrStatus, TxErr);
 				wmb();
 			}
 			if (txstatus & TxCarrierLost)
@@ -1795,13 +1817,13 @@ static void rtl8139_tx_interrupt (struct net_device *dev,
 	if (tp->dirty_tx != dirty_tx) {
 		tp->dirty_tx = dirty_tx;
 		mb();
-		netif_wake_queue (dev);
+		netif_wake_queue(dev);
 	}
 }
 
 
 /* TODO: clean this up!  Rx reset need not be this intensive */
-static void rtl8139_rx_err (u32 rx_status, struct net_device *dev,
+static void rtl8139_rx_err(u32 rx_status, struct net_device *dev,
 			    struct rtl8139_private *tp, void __iomem *ioaddr)
 {
 	u8 tmp8;
@@ -1814,7 +1836,8 @@ static void rtl8139_rx_err (u32 rx_status, struct net_device *dev,
 	dev->stats.rx_errors++;
 	if (!(rx_status & RxStatusOK)) {
 		if (rx_status & RxTooLong) {
-			netdev_dbg(dev, "Oversized Ethernet frame, status %04x!\n",
+			netdev_dbg(dev, "Oversized Ethernet frame, "
+							"status %04x!\n",
 				   rx_status);
 			/* A.C.: The chip hangs here. */
 		}
@@ -1829,20 +1852,20 @@ static void rtl8139_rx_err (u32 rx_status, struct net_device *dev,
 	}
 
 #ifndef CONFIG_8139_OLD_RX_RESET
-	tmp8 = RTL_R8 (ChipCmd);
-	RTL_W8 (ChipCmd, tmp8 & ~CmdRxEnb);
-	RTL_W8 (ChipCmd, tmp8);
-	RTL_W32 (RxConfig, tp->rx_config);
+	tmp8 = RTL_R8(ChipCmd);
+	RTL_W8(ChipCmd, tmp8 & ~CmdRxEnb);
+	RTL_W8(ChipCmd, tmp8);
+	RTL_W32(RxConfig, tp->rx_config);
 	tp->cur_rx = 0;
 #else
 	/* Reset the receiver, based on RealTek recommendation. (Bug?) */
 
 	/* disable receive */
-	RTL_W8_F (ChipCmd, CmdTxEnb);
+	RTL_W8_F(ChipCmd, CmdTxEnb);
 	tmp_work = 200;
 	while (--tmp_work > 0) {
 		udelay(1);
-		tmp8 = RTL_R8 (ChipCmd);
+		tmp8 = RTL_R8(ChipCmd);
 		if (!(tmp8 & CmdRxEnb))
 			break;
 	}
@@ -1851,9 +1874,9 @@ static void rtl8139_rx_err (u32 rx_status, struct net_device *dev,
 	/* restart receive */
 	tmp_work = 200;
 	while (--tmp_work > 0) {
-		RTL_W8_F (ChipCmd, CmdRxEnb | CmdTxEnb);
+		RTL_W8_F(ChipCmd, CmdRxEnb | CmdTxEnb);
 		udelay(1);
-		tmp8 = RTL_R8 (ChipCmd);
+		tmp8 = RTL_R8(ChipCmd);
 		if ((tmp8 & CmdRxEnb) && (tmp8 & CmdTxEnb))
 			break;
 	}
@@ -1861,24 +1884,24 @@ static void rtl8139_rx_err (u32 rx_status, struct net_device *dev,
 		netdev_warn(dev, "tx/rx enable wait too long\n");
 
 	/* and reinitialize all rx related registers */
-	RTL_W8_F (Cfg9346, Cfg9346_Unlock);
+	RTL_W8_F(Cfg9346, Cfg9346_Unlock);
 	/* Must enable Tx/Rx before setting transfer thresholds! */
-	RTL_W8 (ChipCmd, CmdRxEnb | CmdTxEnb);
+	RTL_W8(ChipCmd, CmdRxEnb | CmdTxEnb);
 
 	tp->rx_config = rtl8139_rx_config | AcceptBroadcast | AcceptMyPhys;
-	RTL_W32 (RxConfig, tp->rx_config);
+	RTL_W32(RxConfig, tp->rx_config);
 	tp->cur_rx = 0;
 
 	netdev_dbg(dev, "init buffer addresses\n");
 
 	/* Lock Config[01234] and BMCR register writes */
-	RTL_W8 (Cfg9346, Cfg9346_Lock);
+	RTL_W8(Cfg9346, Cfg9346_Lock);
 
 	/* init Rx ring buffer DMA address */
-	RTL_W32_F (RxBuf, tp->rx_ring_dma);
+	RTL_W32_F(RxBuf, tp->rx_ring_dma);
 
 	/* A.C.: Reset the multicast list. */
-	__set_rx_mode (dev);
+	__set_rx_mode(dev);
 #endif
 }
 
@@ -1901,7 +1924,7 @@ static void rtl8139_isr_ack(struct rtl8139_private *tp)
 	void __iomem *ioaddr = tp->mmio_addr;
 	u16 status;
 
-	status = RTL_R16 (IntrStatus) & RxAckBits;
+	status = RTL_R16(IntrStatus) & RxAckBits;
 
 	/* Clear out errors and receive interrupts */
 	if (likely(status != 0)) {
@@ -1910,7 +1933,7 @@ static void rtl8139_isr_ack(struct rtl8139_private *tp)
 			if (status & RxFIFOOver)
 				tp->dev->stats.rx_fifo_errors++;
 		}
-		RTL_W16_F (IntrStatus, RxAckBits);
+		RTL_W16_F(IntrStatus, RxAckBits);
 	}
 }
 
@@ -1923,12 +1946,13 @@ static int rtl8139_rx(struct net_device *dev, struct rtl8139_private *tp,
 	unsigned int cur_rx = tp->cur_rx;
 	unsigned int rx_size = 0;
 
-	netdev_dbg(dev, "In %s(), current %04x BufAddr %04x, free to %04x, Cmd %02x\n",
+	netdev_dbg(dev, "In %s(), current %04x BufAddr %04x, free to %04x, "
+								"Cmd %02x\n",
 		   __func__, (u16)cur_rx,
 		   RTL_R16(RxBufAddr), RTL_R16(RxBufPtr), RTL_R8(ChipCmd));
 
 	while (netif_running(dev) && received < budget &&
-	       (RTL_R8 (ChipCmd) & RxBufEmpty) == 0) {
+	       (RTL_R8(ChipCmd) & RxBufEmpty) == 0) {
 		u32 ring_offset = cur_rx % RX_BUF_LEN;
 		u32 rx_status;
 		unsigned int pkt_size;
@@ -1937,11 +1961,12 @@ static int rtl8139_rx(struct net_device *dev, struct rtl8139_private *tp,
 		rmb();
 
 		/* read size+status of next frame from DMA ring buffer */
-		rx_status = le32_to_cpu (*(__le32 *) (rx_ring + ring_offset));
+		rx_status = le32_to_cpu(*(__le32 *) (rx_ring + ring_offset));
 		rx_size = rx_status >> 16;
 		pkt_size = rx_size - 4;
 
-		netif_dbg(tp, rx_status, dev, "%s() status %04x, size %04x, cur %04x\n",
+		netif_dbg(tp, rx_status, dev, "%s() status %04x, size %04x, "
+								"cur %04x\n",
 			  __func__, rx_status, rx_size, cur_rx);
 #if RTL8139_DEBUG > 2
 		print_dump_hex(KERN_DEBUG, "Frame contents: ",
@@ -1977,7 +2002,7 @@ no_early_rx:
 		if (unlikely((rx_size > (MAX_ETH_FRAME_SIZE+4)) ||
 			     (rx_size < 8) ||
 			     (!(rx_status & RxStatusOK)))) {
-			rtl8139_rx_err (rx_status, dev, tp, ioaddr);
+			rtl8139_rx_err(rx_status, dev, tp, ioaddr);
 			received = -1;
 			goto out;
 		}
@@ -1990,25 +2015,27 @@ no_early_rx:
 #if RX_BUF_IDX == 3
 			wrap_copy(skb, rx_ring, ring_offset+4, pkt_size);
 #else
-			skb_copy_to_linear_data (skb, &rx_ring[ring_offset + 4], pkt_size);
+			skb_copy_to_linear_data(skb, &rx_ring[ring_offset + 4],
+								 pkt_size);
 #endif
-			skb_put (skb, pkt_size);
+			skb_put(skb, pkt_size);
 
-			skb->protocol = eth_type_trans (skb, dev);
+			skb->protocol = eth_type_trans(skb, dev);
 
 			dev->stats.rx_bytes += pkt_size;
 			dev->stats.rx_packets++;
 
-			netif_receive_skb (skb);
+			netif_receive_skb(skb);
 		} else {
 			if (net_ratelimit())
-				netdev_warn(dev, "Memory squeeze, dropping packet\n");
+				netdev_warn(dev, "Memory squeeze, "
+						"dropping packet\n");
 			dev->stats.rx_dropped++;
 		}
 		received++;
 
 		cur_rx = (cur_rx + rx_size + 4 + 3) & ~3;
-		RTL_W16 (RxBufPtr, (u16) (cur_rx - 16));
+		RTL_W16(RxBufPtr, (u16) (cur_rx - 16));
 
 		rtl8139_isr_ack(tp);
 	}
@@ -2016,7 +2043,8 @@ no_early_rx:
 	if (unlikely(!received || rx_size == 0xfff0))
 		rtl8139_isr_ack(tp);
 
-	netdev_dbg(dev, "Done %s(), current %04x BufAddr %04x, free to %04x, Cmd %02x\n",
+	netdev_dbg(dev, "Done %s(), current %04x BufAddr %04x, free to %04x, "
+								"Cmd %02x\n",
 		   __func__, cur_rx,
 		   RTL_R16(RxBufAddr), RTL_R16(RxBufPtr), RTL_R8(ChipCmd));
 
@@ -2034,20 +2062,20 @@ out:
 }
 
 
-static void rtl8139_weird_interrupt (struct net_device *dev,
+static void rtl8139_weird_interrupt(struct net_device *dev,
 				     struct rtl8139_private *tp,
 				     void __iomem *ioaddr,
 				     int status, int link_changed)
 {
 	netdev_dbg(dev, "Abnormal interrupt, status %08x\n", status);
 
-	assert (dev != NULL);
-	assert (tp != NULL);
-	assert (ioaddr != NULL);
+	assert(dev != NULL);
+	assert(tp != NULL);
+	assert(ioaddr != NULL);
 
 	/* Update the error count. */
-	dev->stats.rx_missed_errors += RTL_R32 (RxMissed);
-	RTL_W32 (RxMissed, 0);
+	dev->stats.rx_missed_errors += RTL_R32(RxMissed);
+	RTL_W32(RxMissed, 0);
 
 	if ((status & RxUnderrun) && link_changed &&
 	    (tp->drv_flags & HAS_LNK_CHNG)) {
@@ -2064,8 +2092,8 @@ static void rtl8139_weird_interrupt (struct net_device *dev,
 		dev->stats.rx_fifo_errors++;
 	if (status & PCIErr) {
 		u16 pci_cmd_status;
-		pci_read_config_word (tp->pci_dev, PCI_STATUS, &pci_cmd_status);
-		pci_write_config_word (tp->pci_dev, PCI_STATUS, pci_cmd_status);
+		pci_read_config_word(tp->pci_dev, PCI_STATUS, &pci_cmd_status);
+		pci_write_config_word(tp->pci_dev, PCI_STATUS, pci_cmd_status);
 
 		netdev_err(dev, "PCI Bus error %04x\n", pci_cmd_status);
 	}
@@ -2073,7 +2101,8 @@ static void rtl8139_weird_interrupt (struct net_device *dev,
 
 static int rtl8139_poll(struct napi_struct *napi, int budget)
 {
-	struct rtl8139_private *tp = container_of(napi, struct rtl8139_private, napi);
+	struct rtl8139_private *tp = container_of(napi, struct rtl8139_private,
+									 napi);
 	struct net_device *dev = tp->dev;
 	void __iomem *ioaddr = tp->mmio_addr;
 	int work_done;
@@ -2101,7 +2130,7 @@ static int rtl8139_poll(struct napi_struct *napi, int budget)
 
 /* The interrupt handler does all of the Rx thread work and cleans up
    after the Tx thread. */
-static irqreturn_t rtl8139_interrupt (int irq, void *dev_instance)
+static irqreturn_t rtl8139_interrupt(int irq, void *dev_instance)
 {
 	struct net_device *dev = (struct net_device *) dev_instance;
 	struct rtl8139_private *tp = netdev_priv(dev);
@@ -2110,8 +2139,8 @@ static irqreturn_t rtl8139_interrupt (int irq, void *dev_instance)
 	int link_changed = 0; /* avoid bogus "uninit" warning */
 	int handled = 0;
 
-	spin_lock (&tp->lock);
-	status = RTL_R16 (IntrStatus);
+	spin_lock(&tp->lock);
+	status = RTL_R16(IntrStatus);
 
 	/* shared irq? */
 	if (unlikely((status & rtl8139_intr_mask) == 0))
@@ -2125,40 +2154,40 @@ static irqreturn_t rtl8139_interrupt (int irq, void *dev_instance)
 
 	/* close possible race's with dev_close */
 	if (unlikely(!netif_running(dev))) {
-		RTL_W16 (IntrMask, 0);
+		RTL_W16(IntrMask, 0);
 		goto out;
 	}
 
 	/* Acknowledge all of the current interrupt sources ASAP, but
 	   an first get an additional status bit from CSCR. */
 	if (unlikely(status & RxUnderrun))
-		link_changed = RTL_R16 (CSCR) & CSCR_LinkChangeBit;
+		link_changed = RTL_R16(CSCR) & CSCR_LinkChangeBit;
 
 	ackstat = status & ~(RxAckBits | TxErr);
 	if (ackstat)
-		RTL_W16 (IntrStatus, ackstat);
+		RTL_W16(IntrStatus, ackstat);
 
 	/* Receive packets are processed by poll routine.
 	   If not running start it now. */
-	if (status & RxAckBits){
+	if (status & RxAckBits) {
 		if (napi_schedule_prep(&tp->napi)) {
-			RTL_W16_F (IntrMask, rtl8139_norx_intr_mask);
+			RTL_W16_F(IntrMask, rtl8139_norx_intr_mask);
 			__napi_schedule(&tp->napi);
 		}
 	}
 
 	/* Check uncommon events with one test. */
 	if (unlikely(status & (PCIErr | PCSTimeout | RxUnderrun | RxErr)))
-		rtl8139_weird_interrupt (dev, tp, ioaddr,
+		rtl8139_weird_interrupt(dev, tp, ioaddr,
 					 status, link_changed);
 
 	if (status & (TxOK | TxErr)) {
-		rtl8139_tx_interrupt (dev, tp, ioaddr);
+		rtl8139_tx_interrupt(dev, tp, ioaddr);
 		if (status & TxErr)
-			RTL_W16 (IntrStatus, TxErr);
+			RTL_W16(IntrStatus, TxErr);
 	}
  out:
-	spin_unlock (&tp->lock);
+	spin_unlock(&tp->lock);
 
 	netdev_dbg(dev, "exiting interrupt, intr_status=%#4.4x\n",
 		   RTL_R16(IntrStatus));
@@ -2201,7 +2230,7 @@ static int rtl8139_set_mac_address(struct net_device *dev, void *p)
 	return 0;
 }
 
-static int rtl8139_close (struct net_device *dev)
+static int rtl8139_close(struct net_device *dev)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 	void __iomem *ioaddr = tp->mmio_addr;
@@ -2210,26 +2239,27 @@ static int rtl8139_close (struct net_device *dev)
 	netif_stop_queue(dev);
 	napi_disable(&tp->napi);
 
-	netif_dbg(tp, ifdown, dev, "Shutting down ethercard, status was 0x%04x\n",
-		  RTL_R16(IntrStatus));
+	netif_dbg(tp, ifdown, dev, "Shutting down ethercard,"
+					" status was 0x%04x\n",
+					RTL_R16(IntrStatus));
 
-	spin_lock_irqsave (&tp->lock, flags);
+	spin_lock_irqsave(&tp->lock, flags);
 
 	/* Stop the chip's Tx and Rx DMA processes. */
-	RTL_W8 (ChipCmd, 0);
+	RTL_W8(ChipCmd, 0);
 
 	/* Disable interrupts by clearing the interrupt mask. */
-	RTL_W16 (IntrMask, 0);
+	RTL_W16(IntrMask, 0);
 
 	/* Update the error counts. */
-	dev->stats.rx_missed_errors += RTL_R32 (RxMissed);
-	RTL_W32 (RxMissed, 0);
+	dev->stats.rx_missed_errors += RTL_R32(RxMissed);
+	RTL_W32(RxMissed, 0);
 
-	spin_unlock_irqrestore (&tp->lock, flags);
+	spin_unlock_irqrestore(&tp->lock, flags);
 
-	free_irq (dev->irq, dev);
+	free_irq(dev->irq, dev);
 
-	rtl8139_tx_clear (tp);
+	rtl8139_tx_clear(tp);
 
 	dma_free_coherent(&tp->pci_dev->dev, RX_BUF_TOT_LEN,
 			  tp->rx_ring, tp->rx_ring_dma);
@@ -2239,10 +2269,10 @@ static int rtl8139_close (struct net_device *dev)
 	tp->tx_bufs = NULL;
 
 	/* Green! Put the chip in low-power mode. */
-	RTL_W8 (Cfg9346, Cfg9346_Unlock);
+	RTL_W8(Cfg9346, Cfg9346_Unlock);
 
 	if (rtl_chip_info[tp->chipset].flags & HasHltClk)
-		RTL_W8 (HltClk, 'H');	/* 'R' would leave the clock running. */
+		RTL_W8(HltClk, 'H');	/* 'R' would leave the clock running. */
 
 	return 0;
 }
@@ -2258,8 +2288,8 @@ static void rtl8139_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 
 	spin_lock_irq(&tp->lock);
 	if (rtl_chip_info[tp->chipset].flags & HasLWake) {
-		u8 cfg3 = RTL_R8 (Config3);
-		u8 cfg5 = RTL_R8 (Config5);
+		u8 cfg3 = RTL_R8(Config3);
+		u8 cfg5 = RTL_R8(Config5);
 
 		wol->supported = WAKE_PHY | WAKE_MAGIC
 			| WAKE_UCAST | WAKE_MCAST | WAKE_BCAST;
@@ -2300,16 +2330,16 @@ static int rtl8139_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 		return -EINVAL;
 
 	spin_lock_irq(&tp->lock);
-	cfg3 = RTL_R8 (Config3) & ~(Cfg3_LinkUp | Cfg3_Magic);
+	cfg3 = RTL_R8(Config3) & ~(Cfg3_LinkUp | Cfg3_Magic);
 	if (wol->wolopts & WAKE_PHY)
 		cfg3 |= Cfg3_LinkUp;
 	if (wol->wolopts & WAKE_MAGIC)
 		cfg3 |= Cfg3_Magic;
-	RTL_W8 (Cfg9346, Cfg9346_Unlock);
-	RTL_W8 (Config3, cfg3);
-	RTL_W8 (Cfg9346, Cfg9346_Lock);
+	RTL_W8(Cfg9346, Cfg9346_Unlock);
+	RTL_W8(Config3, cfg3);
+	RTL_W8(Cfg9346, Cfg9346_Lock);
 
-	cfg5 = RTL_R8 (Config5) & ~(Cfg5_UWF | Cfg5_MWF | Cfg5_BWF);
+	cfg5 = RTL_R8(Config5) & ~(Cfg5_UWF | Cfg5_MWF | Cfg5_BWF);
 	/* (KON)FIXME: These are untested.  We may have to set the
 	   CRC0, Wakeup0 and LSBCRC0 registers too, but I have no
 	   documentation.  */
@@ -2319,13 +2349,14 @@ static int rtl8139_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
 		cfg5 |= Cfg5_MWF;
 	if (wol->wolopts & WAKE_BCAST)
 		cfg5 |= Cfg5_BWF;
-	RTL_W8 (Config5, cfg5);	/* need not unlock via Cfg9346 */
+	RTL_W8(Config5, cfg5);	/* need not unlock via Cfg9346 */
 	spin_unlock_irq(&tp->lock);
 
 	return 0;
 }
 
-static void rtl8139_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+static void rtl8139_get_drvinfo(struct net_device *dev,
+				struct ethtool_drvinfo *info)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 	strcpy(info->driver, DRV_NAME);
@@ -2387,7 +2418,8 @@ static int rtl8139_get_regs_len(struct net_device *dev)
 	return tp->regs_len;
 }
 
-static void rtl8139_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *regbuf)
+static void rtl8139_get_regs(struct net_device *dev, struct ethtool_regs *regs,
+								 void *regbuf)
 {
 	struct rtl8139_private *tp;
 
@@ -2413,7 +2445,8 @@ static int rtl8139_get_sset_count(struct net_device *dev, int sset)
 	}
 }
 
-static void rtl8139_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data)
+static void rtl8139_get_ethtool_stats(struct net_device *dev,
+			struct ethtool_stats *stats, u64 *data)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 
@@ -2461,17 +2494,17 @@ static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 }
 
 
-static struct net_device_stats *rtl8139_get_stats (struct net_device *dev)
+static struct net_device_stats *rtl8139_get_stats(struct net_device *dev)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 	void __iomem *ioaddr = tp->mmio_addr;
 	unsigned long flags;
 
 	if (netif_running(dev)) {
-		spin_lock_irqsave (&tp->lock, flags);
-		dev->stats.rx_missed_errors += RTL_R32 (RxMissed);
-		RTL_W32 (RxMissed, 0);
-		spin_unlock_irqrestore (&tp->lock, flags);
+		spin_lock_irqsave(&tp->lock, flags);
+		dev->stats.rx_missed_errors += RTL_R32(RxMissed);
+		RTL_W32(RxMissed, 0);
+		spin_unlock_irqrestore(&tp->lock, flags);
 	}
 
 	return &dev->stats;
@@ -2480,7 +2513,7 @@ static struct net_device_stats *rtl8139_get_stats (struct net_device *dev)
 /* Set or clear the multicast filter for this adaptor.
    This routine is not state sensitive and need not be SMP locked. */
 
-static void __set_rx_mode (struct net_device *dev)
+static void __set_rx_mode(struct net_device *dev)
 {
 	struct rtl8139_private *tp = netdev_priv(dev);
 	void __iomem *ioaddr = tp->mmio_addr;
@@ -2507,7 +2540,8 @@ static void __set_rx_mode (struct net_device *dev)
 		rx_mode = AcceptBroadcast | AcceptMyPhys;
 		mc_filter[1] = mc_filter[0] = 0;
 		netdev_for_each_mc_addr(mclist, dev) {
-			int bit_nr = ether_crc(ETH_ALEN, mclist->dmi_addr) >> 26;
+			int bit_nr = ether_crc(ETH_ALEN,
+			mclist->dmi_addr) >> 26;
 
 			mc_filter[bit_nr >> 5] |= 1 << (bit_nr & 31);
 			rx_mode |= AcceptMulticast;
@@ -2517,68 +2551,68 @@ static void __set_rx_mode (struct net_device *dev)
 	/* We can safely update without stopping the chip. */
 	tmp = rtl8139_rx_config | rx_mode;
 	if (tp->rx_config != tmp) {
-		RTL_W32_F (RxConfig, tmp);
+		RTL_W32_F(RxConfig, tmp);
 		tp->rx_config = tmp;
 	}
-	RTL_W32_F (MAR0 + 0, mc_filter[0]);
-	RTL_W32_F (MAR0 + 4, mc_filter[1]);
+	RTL_W32_F(MAR0 + 0, mc_filter[0]);
+	RTL_W32_F(MAR0 + 4, mc_filter[1]);
 }
 
-static void rtl8139_set_rx_mode (struct net_device *dev)
+static void rtl8139_set_rx_mode(struct net_device *dev)
 {
 	unsigned long flags;
 	struct rtl8139_private *tp = netdev_priv(dev);
 
-	spin_lock_irqsave (&tp->lock, flags);
+	spin_lock_irqsave(&tp->lock, flags);
 	__set_rx_mode(dev);
-	spin_unlock_irqrestore (&tp->lock, flags);
+	spin_unlock_irqrestore(&tp->lock, flags);
 }
 
 #ifdef CONFIG_PM
 
-static int rtl8139_suspend (struct pci_dev *pdev, pm_message_t state)
+static int rtl8139_suspend(struct pci_dev *pdev, pm_message_t state)
 {
-	struct net_device *dev = pci_get_drvdata (pdev);
+	struct net_device *dev = pci_get_drvdata(pdev);
 	struct rtl8139_private *tp = netdev_priv(dev);
 	void __iomem *ioaddr = tp->mmio_addr;
 	unsigned long flags;
 
-	pci_save_state (pdev);
+	pci_save_state(pdev);
 
-	if (!netif_running (dev))
+	if (!netif_running(dev))
 		return 0;
 
-	netif_device_detach (dev);
+	netif_device_detach(dev);
 
-	spin_lock_irqsave (&tp->lock, flags);
+	spin_lock_irqsave(&tp->lock, flags);
 
 	/* Disable interrupts, stop Tx and Rx. */
-	RTL_W16 (IntrMask, 0);
-	RTL_W8 (ChipCmd, 0);
+	RTL_W16(IntrMask, 0);
+	RTL_W8(ChipCmd, 0);
 
 	/* Update the error counts. */
-	dev->stats.rx_missed_errors += RTL_R32 (RxMissed);
-	RTL_W32 (RxMissed, 0);
+	dev->stats.rx_missed_errors += RTL_R32(RxMissed);
+	RTL_W32(RxMissed, 0);
 
-	spin_unlock_irqrestore (&tp->lock, flags);
+	spin_unlock_irqrestore(&tp->lock, flags);
 
-	pci_set_power_state (pdev, PCI_D3hot);
+	pci_set_power_state(pdev, PCI_D3hot);
 
 	return 0;
 }
 
 
-static int rtl8139_resume (struct pci_dev *pdev)
+static int rtl8139_resume(struct pci_dev *pdev)
 {
-	struct net_device *dev = pci_get_drvdata (pdev);
+	struct net_device *dev = pci_get_drvdata(pdev);
 
-	pci_restore_state (pdev);
-	if (!netif_running (dev))
+	pci_restore_state(pdev);
+	if (!netif_running(dev))
 		return 0;
-	pci_set_power_state (pdev, PCI_D0);
-	rtl8139_init_ring (dev);
-	rtl8139_hw_start (dev);
-	netif_device_attach (dev);
+	pci_set_power_state(pdev, PCI_D0);
+	rtl8139_init_ring(dev);
+	rtl8139_hw_start(dev);
+	netif_device_attach(dev);
 	return 0;
 }
 
@@ -2597,7 +2631,7 @@ static struct pci_driver rtl8139_pci_driver = {
 };
 
 
-static int __init rtl8139_init_module (void)
+static int __init rtl8139_init_module(void)
 {
 	/* when we're a module, we always print a version message,
 	 * even if no 8139 board is found.
@@ -2610,11 +2644,11 @@ static int __init rtl8139_init_module (void)
 }
 
 
-static void __exit rtl8139_cleanup_module (void)
+static void __exit rtl8139_cleanup_module(void)
 {
-	pci_unregister_driver (&rtl8139_pci_driver);
+	pci_unregister_driver(&rtl8139_pci_driver);
 }
 
 
 module_init(rtl8139_init_module);
-module_exit(rtl8139_cleanup_module);
+Module_exit(rtl8139_cleanup_module);

^ permalink raw reply related

* Re: NULL pointer dereference panic in stable (2.6.33.2), amd64
From: David Miller @ 2010-04-12  7:36 UTC (permalink / raw)
  To: eric.dumazet; +Cc: krkumar2, netdev, nuclearcat
In-Reply-To: <1271056697.16881.7.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 12 Apr 2010 09:18:17 +0200

> Here the patch I cooked, this is a stable candidate, once tested and
> acknowledged.
> 
> [PATCH] net: dev_pick_tx() fix
> 
> When dev_pick_tx() caches tx queue_index on a socket, we must check
> socket dst_entry matches skb one, or risk a crash later, as reported by
> Denys Fedorysychenko, if old packets are in flight during a route
> change, involving devices with different number of queues.
> 
> Bug introduced by commit a4ee3ce3
> (net: Use sk_tx_queue_mapping for connected sockets)
> 
> Reported-by: Denys Fedorysychenko <nuclearcat@nuclearcat.com>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Looks good Eric.  I'll integrate this into net-2.6 and also queue it
up for -stable in a day or two unless some problem is discovered.

Thanks!

^ permalink raw reply

* Re: NULL pointer dereference panic in stable (2.6.33.2), amd64
From: Eric Dumazet @ 2010-04-12  7:18 UTC (permalink / raw)
  To: Krishna Kumar2; +Cc: David Miller, netdev, Denys Fedorysychenko
In-Reply-To: <1271052111.2078.168.camel@edumazet-laptop>

Le lundi 12 avril 2010 à 08:01 +0200, Eric Dumazet a écrit :

> Problem is when you reset sk->sk_tx_queue_mapping at the very moment
> route (or destination) changes, we might have old packets queued in tx
> queues, of the old ethernet device (eth0 : multi queue compatable)
> 
> 2) Application does a sendmsg() or connect() call and sk->sk_dst_cache
> is rebuild, it points to a dst_entry referring a new device (eth1 : non
> multiqueue)
> 
> 3) When one old packet finally is transmitted, we do :
> 
> 	queue_index = 1; // any value > 0
> 
> 	if (sk && sk->sk_dst_cache)
> 		sk_tx_queue_set(sk, queue_index); // remember a >0 value
> 
> 4) application does a sendmsg(), enqueues a new skb on eth1
> 
> 5) We re-enter dev_pick_tx(), and consider cached value in 3) is valid.
>    we pick a non existent txq for eth1 device.
> 
> 6) We crash.
> 
> 
> > The following might be better to prove the panic is due to this, since
> > your suggestion will hide a panic that happens somewhat rare (according
> > to Denys):
> > 
> >       if (sk_tx_queue_recorded(sk)) {
> >             queue_index = sk_tx_queue_get(sk);
> > +           queue_index = dev_cap_txqueue(dev, queue_index);
> >       } else {
> > 
> 
> Sure, but I thought I was clear enough to prove this commit was wrong,
> and we have to find a fix.

Here the patch I cooked, this is a stable candidate, once tested and
acknowledged.

[PATCH] net: dev_pick_tx() fix

When dev_pick_tx() caches tx queue_index on a socket, we must check
socket dst_entry matches skb one, or risk a crash later, as reported by
Denys Fedorysychenko, if old packets are in flight during a route
change, involving devices with different number of queues.

Bug introduced by commit a4ee3ce3
(net: Use sk_tx_queue_mapping for connected sockets)

Reported-by: Denys Fedorysychenko <nuclearcat@nuclearcat.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/core/dev.c b/net/core/dev.c
index 1c8a0ce..92584bf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1989,8 +1989,12 @@ static struct netdev_queue *dev_pick_tx(struct net_device *dev,
 			if (dev->real_num_tx_queues > 1)
 				queue_index = skb_tx_hash(dev, skb);
 
-			if (sk && sk->sk_dst_cache)
-				sk_tx_queue_set(sk, queue_index);
+			if (sk) {
+				struct dst_entry *dst = rcu_dereference(sk->sk_dst_cache);
+
+				if (dst && skb_dst(skb) == dst)
+					sk_tx_queue_set(sk, queue_index);
+			}
 		}
 	}
 



^ permalink raw reply related

* Re: [Patch 3/3] net: reserve ports for applications using fixed port numbers
From: Tetsuo Handa @ 2010-04-12  7:06 UTC (permalink / raw)
  To: amwang
  Cc: linux-kernel, opurdila, eric.dumazet, netdev, nhorman, davem,
	ebiederm
In-Reply-To: <4BC2C34B.50109@redhat.com>

Cong Wang wrote:
> Tetsuo Handa wrote:
> >> Index: linux-2.6/net/sctp/socket.c
> >> ===================================================================
> >> --- linux-2.6.orig/net/sctp/socket.c
> >> +++ linux-2.6/net/sctp/socket.c
> >> @@ -5436,6 +5436,8 @@ static long sctp_get_port_local(struct s
> >>  			rover++;
> >>  			if ((rover < low) || (rover > high))
> >>  				rover = low;
> >> +			if (inet_is_reserved_local_port(rover))
> >> +				continue;
> > 
> > This one needs to be
> > 
> > 			if (inet_is_reserved_local_port(rover))
> > 				goto next_nolock;
> > 
> >>  			index = sctp_phashfn(rover);
> >>  			head = &sctp_port_hashtable[index];
> >>  			sctp_spin_lock(&head->lock);
> > 
> >  next:
> > 			sctp_spin_unlock(&head->lock);
> > +next_nolock:
> > 		} while (--remaining > 0);
> > 
> > otherwise, it will loop forever if all ports were reserved.
> 
> Sorry, doesn't 'continue' jump to exactly where 'next_nolock' is??
> Or I am missing something?

My misreading, sorry.

^ permalink raw reply

* Re: [Patch 3/3] net: reserve ports for applications using fixed port numbers
From: Cong Wang @ 2010-04-12  6:52 UTC (permalink / raw)
  To: Tetsuo Handa
  Cc: linux-kernel, opurdila, eric.dumazet, netdev, nhorman, davem,
	ebiederm
In-Reply-To: <201004092221.GEE32059.FHMOFVLJOFSOtQ@I-love.SAKURA.ne.jp>

Tetsuo Handa wrote:
> Hello.
> 
> Amerigo Wang wrote:
>> Index: linux-2.6/drivers/infiniband/core/cma.c
>> ===================================================================
>> --- linux-2.6.orig/drivers/infiniband/core/cma.c
>> +++ linux-2.6/drivers/infiniband/core/cma.c
>> @@ -1980,6 +1980,8 @@ retry:
>>  	/* FIXME: add proper port randomization per like inet_csk_get_port */
>>  	do {
>>  		ret = idr_get_new_above(ps, bind_list, next_port, &port);
>> +		if (inet_is_reserved_local_port(port))
>> +			ret = -EAGAIN;
> 
> You should not overwrite ret with -EAGAIN when idr_get_new_above() returned
> -ENOSPC. I don't know about idr, thus I don't know whether
> 
> 		if (!ret && inet_is_reserved_local_port(port))
> 			ret = -EAGAIN;
> 
> is correct or not.

Hmm, good catch! I think it is correct.



> 
>>  	} while ((ret == -EAGAIN) && idr_pre_get(ps, GFP_KERNEL));
>>  
>>  	if (ret)
>> @@ -2996,10 +2998,13 @@ static int __init cma_init(void)
>>  {
>>  	int ret, low, high, remaining;
>>  
>> -	get_random_bytes(&next_port, sizeof next_port);
>>  	inet_get_local_port_range(&low, &high);
>> +again:
>> +	get_random_bytes(&next_port, sizeof next_port);
>>  	remaining = (high - low) + 1;
>>  	next_port = ((unsigned int) next_port % remaining) + low;
>> +	if (inet_is_reserved_local_port(next_port))
>> +		goto again;
>>  
> 
> You should not unconditionally "goto again;".
> If all ports were reserved, it will loop forever (CPU stalls).
> 

Yeah, how about:

	int tries = 10;
	...
again:
	...
	if (inet_is_reserved_local_port(next_port)) {
		if (tries--)
			goto again;
		else
			return -EBUSY;
	}

?


>>  	cma_wq = create_singlethread_workqueue("rdma_cm");
>>  	if (!cma_wq)
> 
> 
>> Index: linux-2.6/net/sctp/socket.c
>> ===================================================================
>> --- linux-2.6.orig/net/sctp/socket.c
>> +++ linux-2.6/net/sctp/socket.c
>> @@ -5436,6 +5436,8 @@ static long sctp_get_port_local(struct s
>>  			rover++;
>>  			if ((rover < low) || (rover > high))
>>  				rover = low;
>> +			if (inet_is_reserved_local_port(rover))
>> +				continue;
> 
> This one needs to be
> 
> 			if (inet_is_reserved_local_port(rover))
> 				goto next_nolock;
> 
>>  			index = sctp_phashfn(rover);
>>  			head = &sctp_port_hashtable[index];
>>  			sctp_spin_lock(&head->lock);
> 
>  next:
> 			sctp_spin_unlock(&head->lock);
> +next_nolock:
> 		} while (--remaining > 0);
> 
> otherwise, it will loop forever if all ports were reserved.

Sorry, doesn't 'continue' jump to exactly where 'next_nolock' is??
Or I am missing something?


Thanks for your review!

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox