Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 9/9] sysctl: ipv6: share ip6_ctl_table, ipv6_icmp_table and ipv6_route_table between nets
From: Lucian Adrian Grijincu @ 2011-02-25 18:52 UTC (permalink / raw)
  To: David S. Miller, Alexey Dobriyan, Eric W. Biederman,
	Octavian Purdila, netdev
  Cc: Lucian Adrian Grijincu
In-Reply-To: <1298659961-23863-1-git-send-email-lucian.grijincu@gmail.com>

This patch includes another implementation of the patch from [1]. This
patch will not apply cleanly if that one has been applied.

[1] http://thread.gmane.org/gmane.linux.network/187273

Signed-off-by: Lucian Adrian Grijincu <lucian.grijincu@gmail.com>
---
 include/net/ipv6.h         |    6 +---
 net/ipv6/icmp.c            |   17 +-----------
 net/ipv6/route.c           |   54 +++++++++++----------------------------
 net/ipv6/sysctl_net_ipv6.c |   61 ++++++--------------------------------------
 4 files changed, 27 insertions(+), 111 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 96e50e0..1526ed6 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -652,11 +652,9 @@ static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; }
 #endif
 
 #ifdef CONFIG_SYSCTL
-extern ctl_table ipv6_route_table_template[];
-extern ctl_table ipv6_icmp_table_template[];
+extern ctl_table ipv6_route_table[];
+extern ctl_table ipv6_icmp_table[];
 
-extern struct ctl_table *ipv6_icmp_sysctl_init(struct net *net);
-extern struct ctl_table *ipv6_route_sysctl_init(struct net *net);
 extern int ipv6_sysctl_register(void);
 extern void ipv6_sysctl_unregister(void);
 extern int ipv6_static_sysctl_register(void);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 03e62f9..924cb36 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -938,29 +938,16 @@ int icmpv6_err_convert(u8 type, u8 code, int *err)
 EXPORT_SYMBOL(icmpv6_err_convert);
 
 #ifdef CONFIG_SYSCTL
-ctl_table ipv6_icmp_table_template[] = {
+ctl_table ipv6_icmp_table[] = {
 	{
 		.procname	= "ratelimit",
 		.data		= &init_net.ipv6.sysctl.icmpv6_time,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_ms_jiffies,
+		.proc_handler	= (proc_handler *) netns_proc_dointvec_ms_jiffies,
 	},
 	{ },
 };
 
-struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
-{
-	struct ctl_table *table;
-
-	table = kmemdup(ipv6_icmp_table_template,
-			sizeof(ipv6_icmp_table_template),
-			GFP_KERNEL);
-
-	if (table)
-		table[0].data = &net->ipv6.sysctl.icmpv6_time;
-
-	return table;
-}
 #endif
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a998db6..29e05ca 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2553,11 +2553,11 @@ static const struct file_operations rt6_stats_seq_fops = {
 
 #ifdef CONFIG_SYSCTL
 
-static
-int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
-			      void __user *buffer, size_t *lenp, loff_t *ppos)
+static int netns_ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
+					   void __user *buffer, size_t *lenp,
+					   loff_t *ppos, void *cookie)
 {
-	struct net *net = current->nsproxy->net_ns;
+	struct net *net = (struct net *) cookie;
 	int delay = net->ipv6.sysctl.flush_delay;
 	if (write) {
 		proc_dointvec(ctl, write, buffer, lenp, ppos);
@@ -2567,103 +2567,79 @@ int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
 		return -EINVAL;
 }
 
-ctl_table ipv6_route_table_template[] = {
+ctl_table ipv6_route_table[] = {
 	{
 		.procname	=	"flush",
 		.data		=	&init_net.ipv6.sysctl.flush_delay,
 		.maxlen		=	sizeof(int),
 		.mode		=	0200,
-		.proc_handler	=	ipv6_sysctl_rtcache_flush
+		.proc_handler	=	(proc_handler *) netns_ipv6_sysctl_rtcache_flush
 	},
 	{
 		.procname	=	"gc_thresh",
 		.data		=	&ip6_dst_ops_template.gc_thresh,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec,
+		.proc_handler	=	(proc_handler *) netns_proc_dointvec,
 	},
 	{
 		.procname	=	"max_size",
 		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec,
+		.proc_handler	=	(proc_handler *) netns_proc_dointvec,
 	},
 	{
 		.procname	=	"gc_min_interval",
 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec_jiffies,
+		.proc_handler	=	(proc_handler *) netns_proc_dointvec_jiffies,
 	},
 	{
 		.procname	=	"gc_timeout",
 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec_jiffies,
+		.proc_handler	=	(proc_handler *) netns_proc_dointvec_jiffies,
 	},
 	{
 		.procname	=	"gc_interval",
 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec_jiffies,
+		.proc_handler	=	(proc_handler *) netns_proc_dointvec_jiffies,
 	},
 	{
 		.procname	=	"gc_elasticity",
 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec,
+		.proc_handler	=	(proc_handler *) netns_proc_dointvec,
 	},
 	{
 		.procname	=	"mtu_expires",
 		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec_jiffies,
+		.proc_handler	=	(proc_handler *) netns_proc_dointvec_jiffies,
 	},
 	{
 		.procname	=	"min_adv_mss",
 		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec,
+		.proc_handler	=	(proc_handler *) netns_proc_dointvec,
 	},
 	{
 		.procname	=	"gc_min_interval_ms",
 		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
 		.maxlen		=	sizeof(int),
 		.mode		=	0644,
-		.proc_handler	=	proc_dointvec_ms_jiffies,
+		.proc_handler	=	(proc_handler *) netns_proc_dointvec_ms_jiffies,
 	},
 	{ }
 };
-
-struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
-{
-	struct ctl_table *table;
-
-	table = kmemdup(ipv6_route_table_template,
-			sizeof(ipv6_route_table_template),
-			GFP_KERNEL);
-
-	if (table) {
-		table[0].data = &net->ipv6.sysctl.flush_delay;
-		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
-		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
-		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
-		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
-		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
-		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
-		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
-		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
-		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
-	}
-
-	return table;
-}
 #endif
 
 static int __net_init ip6_route_net_init(struct net *net)
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 7cb65ef..cd15483 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -17,25 +17,25 @@
 
 static struct ctl_table empty[1];
 
-static ctl_table ipv6_table_template[] = {
+static ctl_table ipv6_table[] = {
 	{
 		.procname	= "route",
 		.maxlen		= 0,
 		.mode		= 0555,
-		.child		= ipv6_route_table_template
+		.child		= ipv6_route_table
 	},
 	{
 		.procname	= "icmp",
 		.maxlen		= 0,
 		.mode		= 0555,
-		.child		= ipv6_icmp_table_template
+		.child		= ipv6_icmp_table
 	},
 	{
 		.procname	= "bindv6only",
 		.data		= &init_net.ipv6.sysctl.bindv6only,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.proc_handler	= (proc_handler *) netns_proc_dointvec
 	},
 	{
 		.procname	= "neigh",
@@ -66,62 +66,17 @@ EXPORT_SYMBOL_GPL(net_ipv6_ctl_path);
 
 static int __net_init ipv6_sysctl_net_init(struct net *net)
 {
-	struct ctl_table *ipv6_table;
-	struct ctl_table *ipv6_route_table;
-	struct ctl_table *ipv6_icmp_table;
-	int err;
-
-	err = -ENOMEM;
-	ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template),
-			     GFP_KERNEL);
-	if (!ipv6_table)
-		goto out;
-
-	ipv6_route_table = ipv6_route_sysctl_init(net);
-	if (!ipv6_route_table)
-		goto out_ipv6_table;
-	ipv6_table[0].child = ipv6_route_table;
-
-	ipv6_icmp_table = ipv6_icmp_sysctl_init(net);
-	if (!ipv6_icmp_table)
-		goto out_ipv6_route_table;
-	ipv6_table[1].child = ipv6_icmp_table;
-
-	ipv6_table[2].data = &net->ipv6.sysctl.bindv6only;
-
-	net->ipv6.sysctl.table = register_net_sysctl_table(net, net_ipv6_ctl_path,
-							   ipv6_table);
+	net->ipv6.sysctl.table = register_net_sysctl_table(net,
+				   net_ipv6_ctl_path, ipv6_table);
 	if (!net->ipv6.sysctl.table)
-		goto out_ipv6_icmp_table;
-
-	err = 0;
-out:
-	return err;
+		return -ENOMEM;
 
-out_ipv6_icmp_table:
-	kfree(ipv6_icmp_table);
-out_ipv6_route_table:
-	kfree(ipv6_route_table);
-out_ipv6_table:
-	kfree(ipv6_table);
-	goto out;
+	return 0;
 }
 
 static void __net_exit ipv6_sysctl_net_exit(struct net *net)
 {
-	struct ctl_table *ipv6_table;
-	struct ctl_table *ipv6_route_table;
-	struct ctl_table *ipv6_icmp_table;
-
-	ipv6_table = net->ipv6.sysctl.table->ctl_table_arg;
-	ipv6_route_table = ipv6_table[0].child;
-	ipv6_icmp_table = ipv6_table[1].child;
-
 	unregister_net_sysctl_table(net->ipv6.sysctl.table);
-
-	kfree(ipv6_table);
-	kfree(ipv6_route_table);
-	kfree(ipv6_icmp_table);
 }
 
 static struct pernet_operations ipv6_sysctl_net_ops = {
-- 
1.7.4.rc1.7.g2cf08.dirty


^ permalink raw reply related

* Re: [Bugme-new] [Bug 29712] New: Bonding Driver(version : 3.5.0) - Problem with ARP monitoring in active backup mode
From: David Miller @ 2011-02-25 18:54 UTC (permalink / raw)
  To: Harsha.R02
  Cc: brian.haley, akpm, bugzilla-daemon, bugme-daemon, netdev, fubar
In-Reply-To: <E351E450E8B9F54684A699D42DC5ADF210062FA2@MPBAGVEX02.corp.mphasis.com>

From: "Harsha R02" <Harsha.R02@mphasis.com>
Date: Fri, 25 Feb 2011 18:14:32 +0530

> Attached patch resolves the issue. Failover happened back to primary
> when it was up again in both the point to point and switch
> configuration.
> 
> Please let us know if this change can be included.

Please don't base64 encode your patches, that makes them harder
to read for some people.  It's just plain text.

^ permalink raw reply

* Re: [PATCH] sysctl: ipv6: use correct net in ipv6_sysctl_rtcache_flush
From: David Miller @ 2011-02-25 19:02 UTC (permalink / raw)
  To: daniel.lezcano; +Cc: lucian.grijincu, netdev, benjamin.thery
In-Reply-To: <4D67CB11.4020801@free.fr>

From: Daniel Lezcano <daniel.lezcano@free.fr>
Date: Fri, 25 Feb 2011 16:30:25 +0100

> On 02/25/2011 06:48 AM, Lucian Adrian Grijincu wrote:
>> Before this patch issuing these commands:
>>
>>    fd = open("/proc/sys/net/ipv6/route/flush")
>>    unshare(CLONE_NEWNET)
>>    write(fd, "stuff")
>>
>> would flush the newly created net, not the original one.
>>
>> The equivalent ipv4 code is correct (stores the net inside ->extra1).
>> ---
> 
> Acked-by: Daniel Lezcano <daniel.lezcano@free.fr>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH] don't allow CAP_NET_ADMIN to load non-netdev kernel modules
From: Vasiliy Kulikov @ 2011-02-25 19:02 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, linux-kernel, kuznet, pekkas, jmorris, yoshfuji, kaber,
	eric.dumazet, therbert, xiaosuo, jesse, Kees Cook, Eugene Teo,
	Dan Rosenberg, Andrew Morton
In-Reply-To: <20110225.104720.71110261.davem@davemloft.net>

On Fri, Feb 25, 2011 at 10:47 -0800, David Miller wrote:
> From: Vasiliy Kulikov <segoon@openwall.com>
> Date: Fri, 25 Feb 2011 18:14:14 +0300
> 
> > Since a8f80e8ff94ecba629542d9b4b5f5a8ee3eb565c any process with
> > CAP_NET_ADMIN may load any module from /lib/modules/.  This doesn't mean
> > that CAP_NET_ADMIN is a superset of CAP_SYS_MODULE as modules are limited
> > to /lib/modules/**.  However, CAP_NET_ADMIN capability shouldn't allow
> > anybody to load any module not related to networking.
> 
> Why go through this naming change, which does break things, instead of
> simply adding a capability mask tag or similar to modules somehow.  You
> could stick it into a special elf section or similar.
>
> Doesn't that make tons more sense than this?

This is not "simply", adding special section for a single workaround
seems like an overkill for me - this touches the core (modules'
internals), which is not related to the initial CAP_* problem at all.

I'd be happy with not breaking anything, but I don't see any acceptable
solution.


Thanks,

-- 
Vasiliy Kulikov
http://www.openwall.com - bringing security into open computing environments

^ permalink raw reply

* Re: [Bugme-new] [Bug 29712] New: Bonding Driver(version : 3.5.0) - Problem with ARP monitoring in active backup mode
From: Jay Vosburgh @ 2011-02-25 19:02 UTC (permalink / raw)
  To: Harsha R02
  Cc: Brian Haley, Andrew Morton, bugzilla-daemon, bugme-daemon, netdev
In-Reply-To: <E351E450E8B9F54684A699D42DC5ADF20C6F1D4A@MPBAGVEX02.corp.mphasis.com>

Harsha R02 <Harsha.R02@mphasis.com> wrote:

>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 40fb5ee..0413917 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -3020,11 +3020,16 @@ static void bond_ab_arp_probe(struct bonding *bond)
>                       bond->curr_active_slave->dev->name);
>        if (bond->curr_active_slave) {
>+                if((bond->curr_active_slave != bond->primary_slave) &&
>+                   (IS_UP(bond->primary_slave->dev)))
>+                        goto failover;
>+
>                bond_arp_send_all(bond, bond->curr_active_slave);
>                read_unlock(&bond->curr_slave_lock);
>                return;
>        }
>+failover:
>        read_unlock(&bond->curr_slave_lock);
>        /* if we don't have a curr_active_slave, search for the next available

	I'm not sure this is the proper place to put the "failover:"
label, as it will go through the "search for any peer" logic that's
normally used when there are no available slaves.  That will likely take
longer than simply switching to the primary.

	It should be possible to simply call bond_change_active_slave
with the appropriate arguments; did you try this?

	-J


>-------------------------------------------------------------------------------
>From: Harsha R02
>Sent: Fri 2/25/2011 6:14 PM
>To: Brian Haley; Andrew Morton
>Cc: bugzilla-daemon@bugzilla.kernel.org; bugme-daemon@bugzilla.kernel.org;
>netdev@vger.kernel.org; Jay Vosburgh
>Subject: RE: [Bugme-new] [Bug 29712] New: Bonding Driver(version : 3.5.0) -
>Problem with ARP monitoring in active backup mode
>
>Attached patch resolves the issue. Failover happened back to primary when it
>was up again in both the point to point and switch configuration.
>
>Please let us know if this change can be included.
>
>Thanks,
>
>- Harsha
>
>-----Original Message-----
>From: Brian Haley [mailto:brian.haley@hp.com]
>Sent: Friday, February 25, 2011 9:12 AM
>To: Andrew Morton
>Cc: Harsha R02; bugzilla-daemon@bugzilla.kernel.org;
>bugme-daemon@bugzilla.kernel.org; netdev@vger.kernel.org; Jay Vosburgh
>Subject: Re: [Bugme-new] [Bug 29712] New: Bonding Driver(version : 3.5.0) -
>Problem with ARP monitoring in active backup mode
>
>On 02/24/2011 05:51 PM, Andrew Morton wrote:
>> (switched to email.  Please respond via emailed reply-to-all, not via the
>> bugzilla web interface).
>>
>> On Wed, 23 Feb 2011 10:41:34 GMT
>> bugzilla-daemon@bugzilla.kernel.org wrote:
>>
>>> https://bugzilla.kernel.org/show_bug.cgi?id=29712
>>>
>>>            Summary: Bonding Driver(version : 3.5.0) - Problem with ARP
>>>                     monitoring in active backup mode
>>>            Product: Drivers
>>>            Version: 2.5
>>>     Kernel Version: 2.6.32
>>
>> That's a paleolithic kernel you have there.  This problem might have
>> been fixed already.  Can you test a more recent kernel?
>
>I can add some more info since I originally looked at the problem.  This
>happens on 2.6.38 as well, and on this 2.6.32 kernel with a backported
>3.7.0 bonding driver (with the primary_reselect option).  Harsha has a
>prototype patch that's being tested, but wanted to log the bug to see
>if one of the bonding maintainers had a better solution.
>
>I'll let him respond as I'm now out of the loop...
>
>Thanks,
>
>-Brian

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* Re: [PATCH] don't allow CAP_NET_ADMIN to load non-netdev kernel modules
From: David Miller @ 2011-02-25 19:05 UTC (permalink / raw)
  To: segoon
  Cc: netdev, linux-kernel, kuznet, pekkas, jmorris, yoshfuji, kaber,
	eric.dumazet, therbert, xiaosuo, jesse, kees.cook, eugene,
	dan.j.rosenberg, akpm
In-Reply-To: <20110225190205.GA4541@albatros>

From: Vasiliy Kulikov <segoon@openwall.com>
Date: Fri, 25 Feb 2011 22:02:05 +0300

> On Fri, Feb 25, 2011 at 10:47 -0800, David Miller wrote:
>> From: Vasiliy Kulikov <segoon@openwall.com>
>> Date: Fri, 25 Feb 2011 18:14:14 +0300
>> 
>> > Since a8f80e8ff94ecba629542d9b4b5f5a8ee3eb565c any process with
>> > CAP_NET_ADMIN may load any module from /lib/modules/.  This doesn't mean
>> > that CAP_NET_ADMIN is a superset of CAP_SYS_MODULE as modules are limited
>> > to /lib/modules/**.  However, CAP_NET_ADMIN capability shouldn't allow
>> > anybody to load any module not related to networking.
>> 
>> Why go through this naming change, which does break things, instead of
>> simply adding a capability mask tag or similar to modules somehow.  You
>> could stick it into a special elf section or similar.
>>
>> Doesn't that make tons more sense than this?
> 
> This is not "simply", adding special section for a single workaround
> seems like an overkill for me - this touches the core (modules'
> internals), which is not related to the initial CAP_* problem at all.
> 
> I'd be happy with not breaking anything, but I don't see any acceptable
> solution.

I think it's warranted given that it allows us to avoid breaking things.

I don't understand why there is resistance in response to the first idea
I've seen proposed that actually allows us to fix the problem and not
break anything at the same time.

That seems silly.

^ permalink raw reply

* Re: [PATCH] don't allow CAP_NET_ADMIN to load non-netdev kernel modules
From: Ben Hutchings @ 2011-02-25 19:07 UTC (permalink / raw)
  To: David Miller
  Cc: segoon, netdev, linux-kernel, kuznet, pekkas, jmorris, yoshfuji,
	kaber, eric.dumazet, therbert, xiaosuo, jesse, kees.cook, eugene,
	dan.j.rosenberg, akpm
In-Reply-To: <20110225.110529.39178636.davem@davemloft.net>

On Fri, 2011-02-25 at 11:05 -0800, David Miller wrote:
> From: Vasiliy Kulikov <segoon@openwall.com>
> Date: Fri, 25 Feb 2011 22:02:05 +0300
> 
> > On Fri, Feb 25, 2011 at 10:47 -0800, David Miller wrote:
> >> From: Vasiliy Kulikov <segoon@openwall.com>
> >> Date: Fri, 25 Feb 2011 18:14:14 +0300
> >> 
> >> > Since a8f80e8ff94ecba629542d9b4b5f5a8ee3eb565c any process with
> >> > CAP_NET_ADMIN may load any module from /lib/modules/.  This doesn't mean
> >> > that CAP_NET_ADMIN is a superset of CAP_SYS_MODULE as modules are limited
> >> > to /lib/modules/**.  However, CAP_NET_ADMIN capability shouldn't allow
> >> > anybody to load any module not related to networking.
> >> 
> >> Why go through this naming change, which does break things, instead of
> >> simply adding a capability mask tag or similar to modules somehow.  You
> >> could stick it into a special elf section or similar.
> >>
> >> Doesn't that make tons more sense than this?
> > 
> > This is not "simply", adding special section for a single workaround
> > seems like an overkill for me - this touches the core (modules'
> > internals), which is not related to the initial CAP_* problem at all.
> > 
> > I'd be happy with not breaking anything, but I don't see any acceptable
> > solution.
> 
> I think it's warranted given that it allows us to avoid breaking things.
> 
> I don't understand why there is resistance in response to the first idea
> I've seen proposed that actually allows us to fix the problem and not
> break anything at the same time.
> 
> That seems silly.

You realise that module loading doesn't actually run in the context of
request_module(), right?

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: pull request: wireless-next-2.6 2011-02-22
From: David Miller @ 2011-02-25 19:15 UTC (permalink / raw)
  To: linville-2XuSBdqkA4R54TAoqtyWWQ
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, padovan-Y3ZbgMPKUGA34EUeqzHoZw
In-Reply-To: <20110224.224344.104068328.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

From: David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Date: Thu, 24 Feb 2011 22:43:44 -0800 (PST)

> From: "John W. Linville" <linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>
> Date: Tue, 22 Feb 2011 16:52:30 -0500
> 
>> Here is the latest batch of wireless bits intended for 2.6.39.  It seems
>> I neglected to send a pull request last week, so this one is a bit big
>> -- I apologize!
>> 
>> This includes a rather large batch of bluetooth bits by way of Gustavo.
>> It looks like a variety of bits, including some code refactoring, some
>> protocol support enhancements, some bugfixes, etc. -- nothing too
>> unusual.
>> 
>> Other items of interest include a new driver from Realtek, some ssb
>> support enhancements, and the usual sort of updates for mac80211 and a
>> variety of drivers.  Also included is a wireless-2.6 pull to resolve
>> some build breakage.
>> 
>> Please let me know if there are problems!
> 
> Pulled, thanks a lot John.

John a few things:

1) I had to add some vmalloc.h includes to fix the build on sparc64,
   see commit b08cd667c4b6641c4d16a3f87f4550f81a6d69ac in net-next-2.6

2) Something is screwy with the bluetooth config options now.

   I have an allmodconfig tree, and when I run "make oldconfig" after
   this pull, BT_L2CAP and BT_SCO both prompt me, claiming that they
   can only be built statically.

   I give it 'y' just to make it happen, for both, and afterwards no
   matter how many times I rerun "make oldconfig" I keep seeing things
   like this in my build:

scripts/kconfig/conf --silentoldconfig Kconfig
include/config/auto.conf:986:warning: symbol value 'm' invalid for BT_SCO
include/config/auto.conf:3156:warning: symbol value 'm' invalid for BT_L2CAP

   First, what the heck is going on here?  Second, why the heck can't these
   non-trivial pieces of code be built modular any more?

   You can't make something "bool", have it depend on something that
   might be modular, and then build it into what could in fact be a
   module.  That's exactly what the bluetooth stuff seems to be doing
   now.

   I suspect commit 642745184f82688eb3ef0cdfaa4ba632055be9af

Thanks.

^ permalink raw reply

* Re: [PATCH] don't allow CAP_NET_ADMIN to load non-netdev kernel modules
From: David Miller @ 2011-02-25 19:16 UTC (permalink / raw)
  To: bhutchings
  Cc: segoon, netdev, linux-kernel, kuznet, pekkas, jmorris, yoshfuji,
	kaber, eric.dumazet, therbert, xiaosuo, jesse, kees.cook, eugene,
	dan.j.rosenberg, akpm
In-Reply-To: <1298660879.2554.23.camel@bwh-desktop>

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 25 Feb 2011 19:07:59 +0000

> You realise that module loading doesn't actually run in the context of
> request_module(), right?

Why is that a barrier?  We could simply pass a capability mask into
request_module if necessary.

It's an implementation detail, and not a deterrent to my suggested
scheme.

^ permalink raw reply

* Re: SO_REUSEPORT - can it be done in kernel?
From: Rick Jones @ 2011-02-25 19:18 UTC (permalink / raw)
  To: Thomas Graf; +Cc: Tom Herbert, Bill Sommerfeld, Daniel Baluta, netdev
In-Reply-To: <20110225125644.GA9763@canuck.infradead.org>

On Fri, 2011-02-25 at 07:56 -0500, Thomas Graf wrote:
> On Thu, Jan 27, 2011 at 01:32:25PM -0800, Tom Herbert wrote:
> > Yes, we are still planning this.  The UDP implementation for my
> > earlier patch should be usable to try for DNS/UDP-- this is in fact
> > where we saw a major performance gain.  Eric Dumazet had some nice
> > improvements that should probably be looked at also.
> 
> I can confirm this.
> 
> Serious scalability issues have been reported on a 12 core system
> running bind 9.7-2. The system was only able to deliver ~110K queries
> per second.
> 
> Using your SO_REUSEPORT patch and a modified bind that uses it, the same
> system is able to deliver ~650K queries per second while maxing out
> all cores completely.

I think the idea is goodness, but will ask, was the (first) bottleneck
actually in the kernel, or was it in bind itself?  I've seen
single-instance, single-byte burst-mode netperf TCP_RR do in excess of
300K transactions per second (with TCP_NODELAY set) on an X5560 core.

ftp://ftp.netperf.org/netperf/misc/dl380g6_X5560_rhel54_ad386_cxgb3_1.4.1.2_b2b_to_same_agg_1500mtu_20100513-2.csv

and that was with now ancient RHEL5.4 bits...  yes, there is a bit of
apples, oranges and kumquats but still, I am wondering if this didn't
also "work around" some internal BIND scaling issues as well.

rick jones

> 
> Tom, Bill: do you have a timeline for merging this? Especially the
> UDP bits?
> 
> -Thomas
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: SO_REUSEPORT - can it be done in kernel?
From: David Miller @ 2011-02-25 19:20 UTC (permalink / raw)
  To: rick.jones2; +Cc: tgraf, therbert, wsommerfeld, daniel.baluta, netdev
In-Reply-To: <1298661495.14113.152.camel@tardy>

From: Rick Jones <rick.jones2@hp.com>
Date: Fri, 25 Feb 2011 11:18:15 -0800

> and that was with now ancient RHEL5.4 bits...  yes, there is a bit of
> apples, oranges and kumquats but still, I am wondering if this didn't
> also "work around" some internal BIND scaling issues as well.

I think this is fundamentally a bind problem as well.

^ permalink raw reply

* Re: [PATCH net-next 0/6] Phonet: small pipe protocol fixes
From: David Miller @ 2011-02-25 19:20 UTC (permalink / raw)
  To: netdev, remi.denis-courmont; +Cc: ofono
In-Reply-To: <201102251113.41620.remi.denis-courmont@nokia.com>

From: "Rémi Denis-Courmont" <remi.denis-courmont@nokia.com>
Date: Fri, 25 Feb 2011 11:13:41 +0200

> This patch series cleans up and fixes a number of small bits in the
> Phonet pipe code, especially the experimental pipe controller. Once
> these small bits are sorted out, I will try to fix the controller
> protocol implementation proper so that we do not need the
> compile-time (experimental) flag anymore.

All applied thanks.

If you want to start using GIT to push phonet changes to me, frankly I
would welcome that :-)

^ permalink raw reply

* Re: SO_REUSEPORT - can it be done in kernel?
From: Eric Dumazet @ 2011-02-25 19:21 UTC (permalink / raw)
  To: rick.jones2
  Cc: Thomas Graf, Tom Herbert, Bill Sommerfeld, Daniel Baluta, netdev
In-Reply-To: <1298661495.14113.152.camel@tardy>

Le vendredi 25 février 2011 à 11:18 -0800, Rick Jones a écrit :

> I think the idea is goodness, but will ask, was the (first) bottleneck
> actually in the kernel, or was it in bind itself?  I've seen
> single-instance, single-byte burst-mode netperf TCP_RR do in excess of
> 300K transactions per second (with TCP_NODELAY set) on an X5560 core.
> 
> ftp://ftp.netperf.org/netperf/misc/dl380g6_X5560_rhel54_ad386_cxgb3_1.4.1.2_b2b_to_same_agg_1500mtu_20100513-2.csv
> 
> and that was with now ancient RHEL5.4 bits...  yes, there is a bit of
> apples, oranges and kumquats but still, I am wondering if this didn't
> also "work around" some internal BIND scaling issues as well.
> 

A single core can probably give 300K transactions.

But if you use several cores, accessing a single socket (the one bound
on port 53), then performance drops because of false sharing,
locking....

^ permalink raw reply

* Re: [PATCH net-next 0/6] Phonet: small pipe protocol fixes
From: David Miller @ 2011-02-25 19:24 UTC (permalink / raw)
  To: netdev, remi.denis-courmont; +Cc: ofono
In-Reply-To: <20110225.112055.260096987.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Fri, 25 Feb 2011 11:20:55 -0800 (PST)

> From: "Rémi Denis-Courmont" <remi.denis-courmont@nokia.com>
> Date: Fri, 25 Feb 2011 11:13:41 +0200
> 
>> This patch series cleans up and fixes a number of small bits in the
>> Phonet pipe code, especially the experimental pipe controller. Once
>> these small bits are sorted out, I will try to fix the controller
>> protocol implementation proper so that we do not need the
>> compile-time (experimental) flag anymore.
> 
> All applied thanks.
> 
> If you want to start using GIT to push phonet changes to me, frankly I
> would welcome that :-)

BTW, I had to add the following patch to fix a build warning:

--------------------
phonet: Protect pipe_do_remove() with appropriate ifdefs.

It is only used when CONFIG_PHONET_PIPECTRLR is not set.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/phonet/pep.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index b8c31fc..875e86c 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -849,6 +849,7 @@ drop:
 	return err;
 }
 
+#ifndef CONFIG_PHONET_PIPECTRLR
 static int pipe_do_remove(struct sock *sk)
 {
 	struct pep_sock *pn = pep_sk(sk);
@@ -870,6 +871,7 @@ static int pipe_do_remove(struct sock *sk)
 
 	return pn_skb_send(sk, skb, NULL);
 }
+#endif
 
 /* associated socket ceases to exist */
 static void pep_sock_close(struct sock *sk, long timeout)
-- 
1.7.4.1


^ permalink raw reply related

* Re: [PATCH] don't allow CAP_NET_ADMIN to load non-netdev kernel modules
From: Ben Hutchings @ 2011-02-25 19:30 UTC (permalink / raw)
  To: David Miller
  Cc: segoon, netdev, linux-kernel, kuznet, pekkas, jmorris, yoshfuji,
	kaber, eric.dumazet, therbert, xiaosuo, jesse, kees.cook, eugene,
	dan.j.rosenberg, akpm
In-Reply-To: <20110225.111606.115927805.davem@davemloft.net>

On Fri, 2011-02-25 at 11:16 -0800, David Miller wrote:
> From: Ben Hutchings <bhutchings@solarflare.com>
> Date: Fri, 25 Feb 2011 19:07:59 +0000
> 
> > You realise that module loading doesn't actually run in the context of
> > request_module(), right?
> 
> Why is that a barrier?  We could simply pass a capability mask into
> request_module if necessary.
> 
> It's an implementation detail, and not a deterrent to my suggested
> scheme.

It's not an implementation detail.  modprobe currently runs with full
capabilities; your proposal requires its capabilities to be limited to
those of the capabilities of the process that triggered the
request_module() (plus, presumably, CAP_SYS_MODULE).

Now modprobe doesn't have CAP_DAC_OVERRIDE and can't read modprobe
configuration files that belong to users other than root.

It doesn't have CAP_SYS_MKNOD so it can't run hooks that call mknod.

etc.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [RFC] be2net: add rxhash support
From: Eric Dumazet @ 2011-02-25 19:32 UTC (permalink / raw)
  To: Ajit Khaparde; +Cc: netdev
In-Reply-To: <1298658096.2659.101.camel@edumazet-laptop>

Le vendredi 25 février 2011 à 19:21 +0100, Eric Dumazet a écrit :
> Le vendredi 25 février 2011 à 11:44 -0600, Ajit Khaparde a écrit :
> > -----
> > [PATCH net-next] be2net: add rxhash support
> > 
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> > Signed-off-by: Ajit Khaparde <ajit.khaparde@emulex.com>
> > ---
> >  drivers/net/benet/be_main.c |   11 +++++++++++
> >  1 files changed, 11 insertions(+), 0 deletions(-)
> > 
> > diff --git a/drivers/net/benet/be_main.c b/drivers/net/benet/be_main.c
> > index 26f9c56..8c4b782 100644
> > --- a/drivers/net/benet/be_main.c
> > +++ b/drivers/net/benet/be_main.c
> > @@ -1038,6 +1038,10 @@ static void be_rx_compl_process(struct be_adapter *adapter,
> >  
> >  	skb->truesize = skb->len + sizeof(struct sk_buff);
> >  	skb->protocol = eth_type_trans(skb, adapter->netdev);
> > +	if (adapter->netdev->features & NETIF_F_RXHASH)
> > +		skb->rxhash = AMAP_GET_BITS(struct amap_eth_rx_compl,
> > +					rsshash, rxcp);
> > +
> >  
> >  	vlanf = AMAP_GET_BITS(struct amap_eth_rx_compl, vtp, rxcp);
> >  	vtm = AMAP_GET_BITS(struct amap_eth_rx_compl, vtm, rxcp);
> > @@ -1099,6 +1103,10 @@ static void be_rx_compl_process_gro(struct be_adapter *adapter,
> >  		return;
> >  	}
> >  
> > +	if (adapter->netdev->features & NETIF_F_RXHASH)
> > +		skb->rxhash = AMAP_GET_BITS(struct amap_eth_rx_compl,
> > +						rsshash, rxcp);
> > +
> >  	remaining = pkt_size;
> >  	for (i = 0, j = -1; i < num_rcvd; i++) {
> >  		page_info = get_rx_page_info(adapter, rxo, rxq_idx);
> > @@ -2619,6 +2627,9 @@ static void be_netdev_init(struct net_device *netdev)
> >  		NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
> >  		NETIF_F_GRO | NETIF_F_TSO6;
> >  
> > +	if (be_multi_rxq(adapter))
> > +		netdev->features |= NETIF_F_RXHASH;
> > +
> >  	netdev->vlan_features |= NETIF_F_SG | NETIF_F_TSO |
> >  		NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
> >  
> 
> 
> I added some traces, and I am not sure it's OK:
> 
> With one active tcp flow, I got different rxhash values :
> 
> [ 1064.674253] rxhash=bbd37952 rsshp=1 bank=1
> [ 1064.738104] rxhash=37acd31d rsshp=1 bank=1
> [ 1064.741684] rxhash=bbd37952 rsshp=1 bank=1
> [ 1064.874283] rxhash=bbd37952 rsshp=1 bank=1
> [ 1064.940201] rxhash=bbd37952 rsshp=1 bank=1
> [ 1064.955278] rxhash=b668ace2 rsshp=1 bank=1
> [ 1065.080028] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.153360] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.293164] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.401862] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.460506] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.519980] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.650160] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.717585] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.730909] rxhash=37acd31d rsshp=1 bank=1
> [ 1065.840350] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.900704] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.931526] rxhash=b668ace2 rsshp=1 bank=1
> [ 1066.503657] rxhash=bbd37952 rsshp=1 bank=1
> [ 1066.570138] rxhash=bbd37952 rsshp=1 bank=1
> 
> How is it possible ?
> 
> (I have a VLAN config on top of a bonding)
> 
> 

Also, Ajit, we need something to allow ethtool -K rxhash {on|off}

Something like (completely untested)

diff --git a/drivers/net/benet/be_ethtool.c b/drivers/net/benet/be_ethtool.c
index 6e5e433..540c1ea 100644
--- a/drivers/net/benet/be_ethtool.c
+++ b/drivers/net/benet/be_ethtool.c
@@ -712,6 +712,11 @@ be_read_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom,
 	return status;
 }
 
+static int be_set_flags(struct net_device *dev, u32 data)
+{
+	return ethtool_op_set_flags(dev, data, ETH_FLAG_RXHASH);
+}
+
 const struct ethtool_ops be_ethtool_ops = {
 	.get_settings = be_get_settings,
 	.get_drvinfo = be_get_drvinfo,
@@ -739,4 +744,5 @@ const struct ethtool_ops be_ethtool_ops = {
 	.get_ethtool_stats = be_get_ethtool_stats,
 	.flash_device = be_do_flash,
 	.self_test = be_self_test,
+	.set_flags = be_set_flags,
 };



^ permalink raw reply related

* Re: pull request: wireless-next-2.6 2011-02-22
From: Gustavo F. Padovan @ 2011-02-25 19:36 UTC (permalink / raw)
  To: David Miller
  Cc: linville-2XuSBdqkA4R54TAoqtyWWQ,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	linux-bluetooth-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20110225.111500.59674472.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>

Hi David,

* David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org> [2011-02-25 11:15:00 -0800]:

> From: David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
> Date: Thu, 24 Feb 2011 22:43:44 -0800 (PST)
> 
> > From: "John W. Linville" <linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org>
> > Date: Tue, 22 Feb 2011 16:52:30 -0500
> > 
> >> Here is the latest batch of wireless bits intended for 2.6.39.  It seems
> >> I neglected to send a pull request last week, so this one is a bit big
> >> -- I apologize!
> >> 
> >> This includes a rather large batch of bluetooth bits by way of Gustavo.
> >> It looks like a variety of bits, including some code refactoring, some
> >> protocol support enhancements, some bugfixes, etc. -- nothing too
> >> unusual.
> >> 
> >> Other items of interest include a new driver from Realtek, some ssb
> >> support enhancements, and the usual sort of updates for mac80211 and a
> >> variety of drivers.  Also included is a wireless-2.6 pull to resolve
> >> some build breakage.
> >> 
> >> Please let me know if there are problems!
> > 
> > Pulled, thanks a lot John.
> 
> John a few things:
> 
> 1) I had to add some vmalloc.h includes to fix the build on sparc64,
>    see commit b08cd667c4b6641c4d16a3f87f4550f81a6d69ac in net-next-2.6
> 
> 2) Something is screwey with the bluetooth config options now.
> 
>    I have an allmodconfig tree, and when I run "make oldconfig" after
>    this pull, BT_L2CAP and BT_SCO both prompt me, claiming that they
>    can only be built statically.
> 
>    I give it 'y' just to make it happen, for both, and afterwards no
>    matter how many times I rerun "make oldconfig" I keep seeing things
>    like this in my build:
> 
> scripts/kconfig/conf --silentoldconfig Kconfig
> include/config/auto.conf:986:warning: symbol value 'm' invalid for BT_SCO
> include/config/auto.conf:3156:warning: symbol value 'm' invalid for BT_L2CAP
> 
>    First, what the heck is going on here?  Second, why the heck can't these
>    non-trivial pieces of code be built modular any more?

We now have L2CAP and SCO built-in in the main bluetooth.ko module.

> 
>    You can't make something "bool", have it depend on something that
>    might be modular, and then build it into what could in fact be a
>    module.  That's exactly what the bluetooth stuff seems to be doing
>    now.

Seems I did the Kconfig change wrong, I'll fix it ASAP and send it to you
guys.

-- 
Gustavo F. Padovan
http://profusion.mobi

^ permalink raw reply

* Re: [RFC] be2net: add rxhash support
From: Ajit Khaparde @ 2011-02-25 19:36 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev

> -----Original Message-----
> From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> Sent: Friday, February 25, 2011 1:33 PM
> To: Khaparde, Ajit
> Cc: netdev@vger.kernel.org
> Subject: Re: [RFC] be2net: add rxhash support

> I added some traces, and I am not sure its OK :
> 
> With one active tcp flow, I got different rxhash values :
> 
> [ 1064.674253] rxhash=bbd37952 rsshp=1 bank=1
> [ 1064.738104] rxhash=37acd31d rsshp=1 bank=1
> [ 1064.741684] rxhash=bbd37952 rsshp=1 bank=1
> [ 1064.874283] rxhash=bbd37952 rsshp=1 bank=1
> [ 1064.940201] rxhash=bbd37952 rsshp=1 bank=1
> [ 1064.955278] rxhash=b668ace2 rsshp=1 bank=1
> [ 1065.080028] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.153360] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.293164] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.401862] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.460506] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.519980] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.650160] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.717585] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.730909] rxhash=37acd31d rsshp=1 bank=1
> [ 1065.840350] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.900704] rxhash=bbd37952 rsshp=1 bank=1
> [ 1065.931526] rxhash=b668ace2 rsshp=1 bank=1
> [ 1066.503657] rxhash=bbd37952 rsshp=1 bank=1
> [ 1066.570138] rxhash=bbd37952 rsshp=1 bank=1
> 
> How is it possible ?
> 
> (I have a VLAN config on top of a bonding)
> 
I'm looking at this..
There is no switch involved in your test, just back to back?

> 
> Also, Ajit, we need something to allow ethtool -K rxhash {on|off}
> 
> Something like (completely untested)

Yes. That is in the works.

Thanks
-Ajit

^ permalink raw reply

* Re: [PATCH] don't allow CAP_NET_ADMIN to load non-netdev kernel modules
From: David Miller @ 2011-02-25 19:43 UTC (permalink / raw)
  To: bhutchings
  Cc: segoon, netdev, linux-kernel, kuznet, pekkas, jmorris, yoshfuji,
	kaber, eric.dumazet, therbert, xiaosuo, jesse, kees.cook, eugene,
	dan.j.rosenberg, akpm
In-Reply-To: <1298662216.2554.33.camel@bwh-desktop>

From: Ben Hutchings <bhutchings@solarflare.com>
Date: Fri, 25 Feb 2011 19:30:16 +0000

> On Fri, 2011-02-25 at 11:16 -0800, David Miller wrote:
>> From: Ben Hutchings <bhutchings@solarflare.com>
>> Date: Fri, 25 Feb 2011 19:07:59 +0000
>> 
>> > You realise that module loading doesn't actually run in the context of
>> > request_module(), right?
>> 
>> Why is that a barrier?  We could simply pass a capability mask into
>> request_module if necessary.
>> 
>> It's an implementation detail, and not a deterrent to my suggested
>> scheme.
> 
> It's not an implementation detail.  modprobe currently runs with full
> capabilities; your proposal requires its capabilities to be limited to
> those of the capabilities of the process that triggered the
> request_module() (plus, presumably, CAP_SYS_MODULE).

The idea was that the kernel will be the entity that will inspect the
elf sections and validate the capability bits, not the userspace
module loader.

Surely if we can pass an arbitrary string out to the loading
process as part of the module loading context, we can pass along
capability bits as well.

^ permalink raw reply

* Re: [RFC] be2net: add rxhash support
From: Eric Dumazet @ 2011-02-25 19:45 UTC (permalink / raw)
  To: Ajit Khaparde; +Cc: netdev
In-Reply-To: <20110225193640.GA11411@akhaparde-VBox>

Le vendredi 25 février 2011 à 13:36 -0600, Ajit Khaparde a écrit :
> > -----Original Message-----
> > From: Eric Dumazet [mailto:eric.dumazet@gmail.com]
> > 
> > How is it possible ?
> > 
> > (I have a VLAN config on top of a bonding)
> > 
> I'm looking at this..
> There is no switch involved in your test, just back to back?
> 

There is a switch.

Machines are HP ProLiant BL460c G7

But why do you ask ?



^ permalink raw reply

* Re: SO_REUSEPORT - can it be done in kernel?
From: Tom Herbert @ 2011-02-25 19:51 UTC (permalink / raw)
  To: Tom Herbert, Bill Sommerfeld, Daniel Baluta, netdev; +Cc: Thomas Graf
In-Reply-To: <20110225125644.GA9763@canuck.infradead.org>

> Using your SO_REUSEPORT patch and a modified bind using it. The same
> system is able to deliver ~650K queries per seconds while maxing out
> all cores completely.
>
Nice data point.

> Tom, Bill: do you have a timeline for merging this? Especially the
> UDP bits?
>
Bill has been working on the TCP implementation which is requiring
some fairly major surgery on the listener connections in syn-rcvd
state, this is ongoing.

On the UDP side, I believe the patch is functional, but as Eric
pointed out it probably could be further optimized.  I'll split out
the UDP bits into a separate patch and post that...

Tom

> -Thomas
>

^ permalink raw reply

* Re: [PATCH] cxgb{3,4}: streamline Kconfig options
From: Dimitris Michailidis @ 2011-02-25 19:51 UTC (permalink / raw)
  To: Jan Beulich; +Cc: David Miller, divy, linux-kbuild, netdev
In-Reply-To: <4D66B45B.2050604@chelsio.com>

Dimitris Michailidis wrote:
> Jan Beulich wrote:
> 
>> As to that INET vs NET dependency - is it possible that the
>> network drivers really just need NET, but the iSCSI ones need
>> INET? In which case the only common dependency would be
>> PCI - certainly not worth a custom helper option.
> 
> Reading the commit message that introduced CHELSIO_T3_DEPENDS, it talks 
> of hidden dependencies that select does not see.  I am not sure which 
> exactly but since it's been a few years since that commit I'll try to 
> see what the situation is today without the *_DEPENDS symbols and let 
> you know.

I looked into this and found that with the current Kconfig the iSCSI driver 
does not appear in the SCSI menu until one first enables NETDEVICES and 
NETDEV_10000 in the network driver menu.  It appears that the *_DEPENDS 
symbols were added to capture dependencies on such symbols within the 
network driver Kconfig, besides the dependencies the driver's entry listed 
explicitly.

The patch below removes *T4*_DEPENDS and the network drivers' unnecessary 
dependency on INET, and updates the iSCSI driver's entry so it is visible 
without requiring any net driver options to be enabled first and has 
adequate selects to be able to build the net driver (this part is adapted 
from bnx2i's Kconfig entry).  I still need to do the T3 part of this and 
check that there isn't a conflict with the current scsi tree.  Just for 
review at this time.

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 0382332..0d314d5 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -2619,14 +2619,9 @@ config CHELSIO_T3
  	  To compile this driver as a module, choose M here: the module
  	  will be called cxgb3.

-config CHELSIO_T4_DEPENDS
-	tristate
-	depends on PCI && INET
-	default y
-
  config CHELSIO_T4
  	tristate "Chelsio Communications T4 Ethernet support"
-	depends on CHELSIO_T4_DEPENDS
+	depends on PCI
  	select FW_LOADER
  	select MDIO
  	help
@@ -2644,14 +2639,9 @@ config CHELSIO_T4
  	  To compile this driver as a module choose M here; the module
  	  will be called cxgb4.

-config CHELSIO_T4VF_DEPENDS
-	tristate
-	depends on PCI && INET
-	default y
-
  config CHELSIO_T4VF
  	tristate "Chelsio Communications T4 Virtual Function Ethernet support"
-	depends on CHELSIO_T4VF_DEPENDS
+	depends on PCI
  	help
  	  This driver supports Chelsio T4-based gigabit and 10Gb Ethernet
  	  adapters with PCI-E SR-IOV Virtual Functions.
diff --git a/drivers/scsi/cxgbi/cxgb4i/Kconfig 
b/drivers/scsi/cxgbi/cxgb4i/Kconfig
index bb94b39..d5302c2 100644
--- a/drivers/scsi/cxgbi/cxgb4i/Kconfig
+++ b/drivers/scsi/cxgbi/cxgb4i/Kconfig
@@ -1,6 +1,8 @@
  config SCSI_CXGB4_ISCSI
  	tristate "Chelsio T4 iSCSI support"
-	depends on CHELSIO_T4_DEPENDS
+	depends on PCI && INET
+	select NETDEVICES
+	select NETDEV_10000
  	select CHELSIO_T4
  	select SCSI_ISCSI_ATTRS
  	---help---

^ permalink raw reply related

* Fwd: Re: via-rhine -- VT6105M and checksum offloading
From: Jan Ceuleers @ 2011-02-25 19:50 UTC (permalink / raw)
  To: netdev, Benjamin LaHaise

[-- Attachment #1: Type: text/plain, Size: 760 bytes --]

Not a diff, but let's see where it leads

-------- Original Message --------
Subject: Re: via-rhine -- VT6105M and checksum offloading
Date: Fri, 25 Feb 2011 20:32:12 +0100
From: Roger Luethi <rl@hellgate.ch>
To: Jan Ceuleers <jan.ceuleers@computer.org>

On Fri, 25 Feb 2011 19:35:28 +0100, Jan Ceuleers wrote:
> Can you post that, preferably rebased to net-next? Even if Benjamin
> doesn't get 'round to implementing all of the improvements Dave proposes
> perhaps Dave will be clement enough to apply it as-is if it proves to be
> a net positive?

I should really take another look at the driver and check what I missed wrt
NETIF_F_GRO. But here's what I have -- it applies cleanly against net-next
(driver not tested with current net-next, though).

Roger


[-- Attachment #2: via-rhine.c --]
[-- Type: text/x-c, Size: 69922 bytes --]

/* via-rhine.c: A Linux Ethernet device driver for VIA Rhine family chips. */
/*
	Written 1998-2001 by Donald Becker.

	Current Maintainer: Roger Luethi <rl@hellgate.ch>

	This software may be used and distributed according to the terms of
	the GNU General Public License (GPL), incorporated herein by reference.
	Drivers based on or derived from this code fall under the GPL and must
	retain the authorship, copyright and license notice.  This file is not
	a complete program and may only be used when the entire operating
	system is licensed under the GPL.

	This driver is designed for the VIA VT86C100A Rhine-I.
	It also works with the Rhine-II (6102) and Rhine-III (6105/6105L/6105LOM
	and management NIC 6105M).

	The author may be reached as becker@scyld.com, or C/O
	Scyld Computing Corporation
	410 Severn Ave., Suite 210
	Annapolis MD 21403


	This driver contains some changes from the original Donald Becker
	version. He may or may not be interested in bug reports on this
	code. You can find his versions at:
	http://www.scyld.com/network/via-rhine.html
	[link no longer provides useful info -jgarzik]

*/

#define DRV_NAME	"via-rhine"
#define DRV_VERSION	"1.5.0"
#define DRV_RELDATE	"2010-10-09"


/* A few user-configurable values.
   These may be modified when a driver module is loaded. */

static int debug = 1;	/* 1 normal messages, 0 quiet .. 7 verbose. */
static int max_interrupt_work = 20;

/* Set the copy breakpoint for the copy-only-tiny-frames scheme.
   Setting to > 1518 effectively disables this feature. */
#if defined(__alpha__) || defined(__arm__) || defined(__hppa__) || \
	defined(CONFIG_SPARC) || defined(__ia64__) ||		   \
	defined(__sh__) || defined(__mips__)
static int rx_copybreak = 1518;
#else
static int rx_copybreak;
#endif

/* Work-around for broken BIOSes: they are unable to get the chip back out of
   power state D3 so PXE booting fails. bootparam(7): via-rhine.avoid_D3=1
   Declared as bool because it is registered with
   module_param(avoid_D3, bool, 0): the variable backing a "bool" module
   parameter should itself be a bool, not an int. */
static bool avoid_D3;

/*
 * In case you are looking for 'options[]' or 'full_duplex[]', they
 * are gone. Use ethtool(8) instead.
 */

/* Maximum number of multicast addresses to filter (vs. rx-all-multicast).
   The Rhine has a 64 element 8390-like hash table. */
static const int multicast_filter_limit = 32;


/* Operational parameters that are set at compile time. */

/* Keep the ring sizes a power of two for compile efficiency.
   The compiler will convert <unsigned>'%'<2^N> into a bit mask.
   Making the Tx ring too large decreases the effectiveness of channel
   bonding and packet priority.
   There are no ill effects from too-large receive rings. */
#define TX_RING_SIZE	16
#define TX_QUEUE_LEN	10	/* Limit ring entries actually used. */
#define RX_RING_SIZE	64

/* Operational parameters that usually are not changed. */

/* Time in jiffies before concluding the transmitter is hung. */
#define TX_TIMEOUT	(2*HZ)

#define PKT_BUF_SZ	1536	/* Size of each temporary Rx buffer.*/

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/timer.h>
#include <linux/errno.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/mii.h>
#include <linux/ethtool.h>
#include <linux/crc32.h>
#include <linux/if_vlan.h>
#include <linux/bitops.h>
#include <linux/workqueue.h>
#include <asm/processor.h>	/* Processor type for cache alignment. */
#include <asm/io.h>
#include <asm/irq.h>
#include <asm/uaccess.h>
#include <linux/dmi.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>

/* These identify the driver base version and may not be removed. */
static const char version[] __devinitconst =
	KERN_INFO DRV_NAME ".c:v1.10-LK" DRV_VERSION " " DRV_RELDATE
	" Written by Donald Becker\n";

/* This driver was written to use PCI memory space. Some early versions
   of the Rhine may only work correctly with I/O space accesses. */
#ifdef CONFIG_VIA_RHINE_MMIO
#define USE_MMIO
#else
#endif

MODULE_AUTHOR("Donald Becker <becker@scyld.com>");
MODULE_DESCRIPTION("VIA Rhine PCI Fast Ethernet driver");
MODULE_LICENSE("GPL");

module_param(max_interrupt_work, int, 0);
module_param(debug, int, 0);
module_param(rx_copybreak, int, 0);
module_param(avoid_D3, bool, 0);
MODULE_PARM_DESC(max_interrupt_work, "VIA Rhine maximum events handled per interrupt");
MODULE_PARM_DESC(debug, "VIA Rhine debug level (0-7)");
MODULE_PARM_DESC(rx_copybreak, "VIA Rhine copy breakpoint for copy-only-tiny-frames");
MODULE_PARM_DESC(avoid_D3, "Avoid power state D3 (work-around for broken BIOSes)");

#define MCAM_SIZE	32
#define VCAM_SIZE	32

/*
		Theory of Operation

I. Board Compatibility

This driver is designed for the VIA VT86C100A Rhine-I PCI Fast Ethernet
controller; it also works with the Rhine-II and Rhine-III chips.

II. Board-specific settings

Boards with this chip are functional only in a bus-master PCI slot.

Many operational settings are loaded from the EEPROM to the Config word at
offset 0x78. For most of these settings, this driver assumes that they are
correct.
If this driver is compiled to use PCI memory space operations the EEPROM
must be configured to enable memory ops.

III. Driver operation

IIIa. Ring buffers

This driver uses two statically allocated fixed-size descriptor lists
formed into rings by a branch from the final descriptor to the beginning of
the list. The ring sizes are set at compile time by RX/TX_RING_SIZE.

IIIb/c. Transmit/Receive Structure

This driver attempts to use a zero-copy receive and transmit scheme.

Alas, all data buffers are required to start on a 32 bit boundary, so
the driver must often copy transmit packets into bounce buffers.

The driver allocates full frame size skbuffs for the Rx ring buffers at
open() time and passes the skb->data field to the chip as receive data
buffers. When an incoming frame is less than RX_COPYBREAK bytes long,
a fresh skbuff is allocated and the frame is copied to the new skbuff.
When the incoming frame is larger, the skbuff is passed directly up the
protocol stack. Buffers consumed this way are replaced by newly allocated
skbuffs in the last phase of rhine_rx().

The RX_COPYBREAK value is chosen to trade-off the memory wasted by
using a full-sized skbuff for small frames vs. the copying costs of larger
frames. New boards are typically used in generously configured machines
and the underfilled buffers have negligible impact compared to the benefit of
a single allocation size, so the default value of zero results in never
copying packets. When copying is done, the cost is usually mitigated by using
a combined copy/checksum routine. Copying also preloads the cache, which is
most useful with small frames.

Since the VIA chips are only able to transfer data to buffers on 32 bit
boundaries, the IP header at offset 14 in an ethernet frame isn't
longword aligned for further processing. Copying these unaligned buffers
has the beneficial effect of 16-byte aligning the IP header.

IIId. Synchronization

The driver runs as two independent, single-threaded flows of control. One
is the send-packet routine, which enforces single-threaded use by the
netdev_priv(dev)->lock spinlock. The other thread is the interrupt handler,
which is single threaded by the hardware and interrupt handling software.

The send packet thread has partial control over the Tx ring. It locks the
netdev_priv(dev)->lock whenever it's queuing a Tx packet. If the next slot in
the ring is not available it stops the transmit queue by
calling netif_stop_queue.

The interrupt handler has exclusive control over the Rx ring and records stats
from the Tx ring. After reaping the stats, it marks the Tx queue entry as
empty by incrementing the dirty_tx mark. If at least half of the entries in
the Rx ring are available the transmit queue is woken up if it was stopped.

IV. Notes

IVb. References

Preliminary VT86C100A manual from http://www.via.com.tw/
http://www.scyld.com/expert/100mbps.html
http://www.scyld.com/expert/NWay.html
ftp://ftp.via.com.tw/public/lan/Products/NIC/VT86C100A/Datasheet/VT86C100A03.pdf
ftp://ftp.via.com.tw/public/lan/Products/NIC/VT6102/Datasheet/VT6102_021.PDF


IVc. Errata

The VT86C100A manual is not reliable information.
The 3043 chip does not handle unaligned transmit or receive buffers, resulting
in significant performance degradation for bounce buffer copies on transmit
and unaligned IP headers on receive.
The chip does not pad to minimum transmit length.

*/


/* This table drives the PCI probe routines. It's mostly boilerplate in all
   of the drivers, and will likely be provided by some future kernel.
   Note the matching code -- the first table entry matches all 56** cards but
   the second only the 1234 card.
*/

/*
 * Revision codes of the known chip variants, listed in ascending order.
 * NOTE(review): presumably compared against the PCI revision ID at probe
 * time -- the code that uses these values is not visible here; confirm.
 */
enum rhine_revs {
	VT86C100A	= 0x00,
	VTunknown0	= 0x20,
	VT6102		= 0x40,
	VT8231		= 0x50,	/* Integrated MAC */
	VT8233		= 0x60,	/* Integrated MAC */
	VT8235		= 0x74,	/* Integrated MAC */
	VT8237		= 0x78,	/* Integrated MAC */
	VTunknown1	= 0x7C,
	VT6105		= 0x80,
	VT6105_B0	= 0x83,
	VT6105L		= 0x8A,
	VT6107		= 0x8C,
	VTunknown2	= 0x8E,
	VT6105M		= 0x90,	/* Management adapter */
	VT6115		= 0xA0,
};

/*
 * Flags for hardware features/quirks (hw_flags)
 * bits 0-6	ethtool WOL options: WAKE_PHY etc.
 * bits 0-7	used for cfg_flags, too
 */
#define RHINE_WOL_BITS	0x7f		/* ethtool WOL options */
#define RHINE_RX_CSUM	(1 <<  7)	/* Rx checksum support */
#define RHINE_TX_CSUM	(1 <<  8)	/* Tx checksum support */
#define RHINE_WOL	(1 <<  9)	/* Wake-On-LAN support */
#define RHINE_6_PATS	(1 << 10)	/* 6 instead of 4 patterns for WOL */
#define RHINE_VLAN	(1 << 11)	/* VLAN support */
#define RHINE_FORCE_RST	(1 << 12)	/* rhine_chip_reset() writes 0x40 to MiscCmd if reset stalls */
#define RHINE_WB_RACE	(1 << 13)	/* Tx Status Writeback Error possible; get_intr_status() then also reads IntrStatus2 */
#define RHINE_I		(1 << 14)	/* See comment below */
/*
 * RHINE_I: VT86C100A (aka Rhine-I) uses different bits to enable
 * MMIO as well as for the collision counter and the Tx FIFO underflow
 * indicator. In addition, Tx and Rx buffers need to be 4 byte aligned.
 */

/*
 * Beware of PCI posted writes.
 * IOSYNC performs a dummy 8-bit read of the StationAddr register and
 * discards the result -- presumably so that preceding posted writes have
 * reached the device before execution continues.
 * The macro expands to a use of a variable named `ioaddr`, which must
 * therefore be in scope wherever IOSYNC is used.
 */
#define IOSYNC	do { ioread8(ioaddr + StationAddr); } while (0)

static DEFINE_PCI_DEVICE_TABLE(rhine_pci_tbl) = {
	{ 0x1106, 0x3043, PCI_ANY_ID, PCI_ANY_ID, },	/* VT86C100A */
	{ 0x1106, 0x3065, PCI_ANY_ID, PCI_ANY_ID, },	/* VT6102 */
	{ 0x1106, 0x3106, PCI_ANY_ID, PCI_ANY_ID, },	/* 6105{,L,LOM} */
	{ 0x1106, 0x3053, PCI_ANY_ID, PCI_ANY_ID, },	/* VT6105M */
	{ }	/* terminate list */
};
MODULE_DEVICE_TABLE(pci, rhine_pci_tbl);


/* Offsets to the device registers. */
enum register_offsets {
	StationAddr=0x00, RxConfig=0x06, TxConfig=0x07, ChipCmd=0x08,
	ChipCmd1=0x09, TQWake=0x0A,
	IntrStatus=0x0C, IntrEnable=0x0E,
	MulticastFilter0=0x10, MulticastFilter1=0x14,
	RxRingPtr=0x18, TxRingPtr=0x1C, GFIFOTest=0x54,
	MIIPhyAddr=0x6C, MIIStatus=0x6D, PCIBusConfig=0x6E, PCIBusConfig1=0x6F,
	MIICmd=0x70, MIIRegAddr=0x71, MIIData=0x72, MACRegEEcsr=0x74,
	ConfigA=0x78, ConfigB=0x79, ConfigC=0x7A, ConfigD=0x7B,
	RxMissed=0x7C, RxCRCErrs=0x7E, MiscCmd=0x81,
	StickyHW=0x83, IntrStatus2=0x84,
	CamMask=0x88, CamCon=0x92, CamAddr=0x93,
	WOLcrSet=0xA0, PwcfgSet=0xA1, WOLcgSet=0xA3, WOLcrClr=0xA4,
	WOLcrClr1=0xA6, WOLcgClr=0xA7,
	PwrcsrSet=0xA8, PwrcsrSet1=0xA9, PwrcsrClr=0xAC, PwrcsrClr1=0xAD,
};

/* Bits in ConfigD */
enum backoff_bits {
	BackOptional=0x01, BackModify=0x02,
	BackCaptureEffect=0x04, BackRandom=0x08
};

/* Bits in the TxConfig (TCR) register */
enum tcr_bits {
	TCR_PQEN=0x01,
	TCR_LB0=0x02,		/* loopback[0] */
	TCR_LB1=0x04,		/* loopback[1] */
	TCR_OFSET=0x08,
	TCR_RTGOPT=0x10,
	TCR_RTFT0=0x20,
	TCR_RTFT1=0x40,
	TCR_RTSF=0x80,
};

/* Bits in the CamCon (CAMC) register */
enum camcon_bits {
	CAMC_CAMEN=0x01,
	CAMC_VCAMSL=0x02,
	CAMC_CAMWR=0x04,
	CAMC_CAMRD=0x08,
};

/* Bits in the PCIBusConfig1 (BCR1) register */
enum bcr1_bits {
	BCR1_POT0=0x01,
	BCR1_POT1=0x02,
	BCR1_POT2=0x04,
	BCR1_CTFT0=0x08,
	BCR1_CTFT1=0x10,
	BCR1_CTSF=0x20,
	BCR1_TXQNOBK=0x40,	/* for VT6105 */
	BCR1_VIDFR=0x80,	/* for VT6105 */
	BCR1_MED0=0x40,		/* for VT6102 */
	BCR1_MED1=0x80,		/* for VT6102 */
};

#ifdef USE_MMIO
/*
 * Registers used to check that MMIO accesses return the same values as
 * the regular ("reg") register accesses. Only built when USE_MMIO is
 * defined.
 * NOTE(review): the trailing 0 looks like a list terminator -- the code
 * that walks this table is not visible here; confirm.
 */
static const int mmio_verify_registers[] = {
	RxConfig, TxConfig, IntrEnable, ConfigA, ConfigB, ConfigC, ConfigD,
	0
};
#endif

/*
 * Bits in the interrupt status/mask registers.
 *
 * Bits 0-15 come from the 16-bit IntrStatus register. When RHINE_WB_RACE
 * is set, get_intr_status() additionally ORs the 8-bit IntrStatus2
 * register, shifted left by 16, into the same word; IntrTxDescRace is
 * bit 3 of IntrStatus2 in that combined layout (0x08 << 16).
 *
 * Composite values:
 *   IntrTxUnderrun      = 0x0200 | 0x0010 (two bits)
 *   IntrNormalSummary   = IntrRxDone | IntrTxDone
 *   IntrAbnormalSummary = IntrRxWakeUp | IntrLinkChange | 0x0200 |
 *                         IntrPCIErr | IntrRxEmpty
 *   IntrTxErrSummary    = IntrTxDescRace | IntrTxAborted |
 *                         IntrTxUnderrun | IntrTxError
 */
enum intr_status_bits {
	IntrRxDone=0x0001, IntrRxErr=0x0004, IntrRxEmpty=0x0020,
	IntrTxDone=0x0002, IntrTxError=0x0008, IntrTxUnderrun=0x0210,
	IntrPCIErr=0x0040,
	IntrStatsMax=0x0080, IntrRxEarly=0x0100,
	IntrRxOverflow=0x0400, IntrRxDropped=0x0800, IntrRxNoBuf=0x1000,
	IntrTxAborted=0x2000, IntrLinkChange=0x4000,
	IntrRxWakeUp=0x8000,
	IntrNormalSummary=0x0003, IntrAbnormalSummary=0xC260,
	IntrTxDescRace=0x080000,	/* mapped from IntrStatus2 */
	IntrTxErrSummary=0x082218,
};

/*
 * Bits in WOLcrSet/WOLcrClr and PwrcsrSet/PwrcsrClr.
 * Note that WOLbmcast (0x30) is not a bit of its own: numerically it
 * equals WOLucast | WOLmagic. rhine_power_init() compares the saved wake
 * status for exact equality against each of these values.
 */
enum wol_bits {
	WOLucast	= 0x10,
	WOLmagic	= 0x20,
	WOLbmcast	= 0x30,
	WOLlnkon	= 0x40,
	WOLlnkoff	= 0x80,
};

/*
 * The Rx and Tx buffer descriptors.
 * Both consist of four little-endian 32-bit words (__le32).
 */
struct rx_desc {
	__le32 rx_status;	/* RSR0_*, RSR1_* and RSR_OWN bits */
	__le32 desc_length; /* Chain flag, Buffer/frame length */
	__le32 addr;		/* presumably the bus address of the data buffer -- confirm */
	__le32 next_desc;	/* presumably the link to the next descriptor in the ring -- confirm */
};
struct tx_desc {
	__le32 tx_status;	/* TSR_OWN bit */
	__le32 desc_length; /* Chain flag, Tx Config, Frame length */
	__le32 addr;		/* presumably the bus address of the data buffer -- confirm */
	__le32 next_desc;	/* presumably the link to the next descriptor in the ring -- confirm */
};

/* Initial value for tx_desc.desc_length, Buffer size goes to bits 0-10 */
#define TXDESC		0x00e08000

/*
 * rx_status bits
 * RSR0_ERR is the union of RSR0_RERR, RSR0_CRC, RSR0_FAE, RSR0_FOV and
 * RSR0_BUFF; it does not include RSR0_FRAG.
 */
#define RSR0_RERR	0x0001
#define RSR0_CRC	0x0002
#define RSR0_FAE	0x0004
#define RSR0_FOV	0x0008
#define RSR0_FRAG	0x0040
#define RSR0_BUFF	0x0080
#define RSR0_ERR	0x008F

/* RSR1_WHOLE_PKT is RSR1_EDP | RSR1_STP. */
#define RSR1_EDP	0x0100 /* End of packet */
#define RSR1_STP	0x0200 /* Start of packet */
#define RSR1_WHOLE_PKT	0x0300 /* Buffer contains complete packet */
#define RSR1_RXOK	0x8000 /* Rx OK */

#define RSR_OWN		0x80000000

/* Rx desc_length bits */
#define PQSTS_IPOK	0x200000 /* IP checksum validation OK */
#define PQSTS_TUOK	0x100000 /* TCP/UDP checksum validation OK */
#define PQSTS_IPKT	0x080000 /* Received an IP packet */
#define PQSTS_TCPKT	0x040000 /* Received a TCP packet */
#define PQSTS_UDPKT	0x020000 /* Received a UDP packet */
#define PQSTS_TAG	0x010000 /* Received a tagged packet */

/* tx_status bits */
#define TSR_OWN		0x80000000


/*
 * Tx desc_length bits
 * These share the TCR_ prefix with enum tcr_bits, but they are bits of
 * the Tx descriptor's desc_length word, not of the TxConfig register.
 */
#define TCR_TCPCK	0x100000  /* Request TCP checksum calculation */
#define TCR_UDPCK	0x080000  /* Request UDP checksum calculation */
#define TCR_IPCK	0x040000  /* Request IP checksum calculation */
#define TCR_TAG		0x020000  /* Request 802.1Q tag insertion */

/*
 * Bits in ChipCmd.
 * NOTE(review): the Cmd1* names reuse bit values already taken by Cmd*
 * names (e.g. Cmd1EarlyRx == CmdInit == 0x01). rhine_chip_reset() writes
 * Cmd1Reset to the ChipCmd1 register, so the Cmd1* values are presumably
 * all bits of ChipCmd1 rather than ChipCmd -- confirm for the others.
 */
enum chip_cmd_bits {
	CmdInit=0x01, CmdStart=0x02, CmdStop=0x04, CmdRxOn=0x08,
	CmdTxOn=0x10, Cmd1TxDemand=0x20, CmdRxDemand=0x40,
	Cmd1EarlyRx=0x01, Cmd1EarlyTx=0x02, Cmd1FDuplex=0x04,
	Cmd1NoTxPoll=0x08, Cmd1Reset=0x80,
};

/*
 * Per-device driver state. Functions in this file obtain it from a
 * struct net_device with netdev_priv(dev).
 */
struct rhine_private {
	/* Bit mask for configured VLAN ids */
	unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];

	/* Descriptor rings */
	struct rx_desc *rx_ring;
	struct tx_desc *tx_ring;
	dma_addr_t rx_ring_dma;
	dma_addr_t tx_ring_dma;

	/* The addresses of receive-in-place skbuffs. */
	struct sk_buff *rx_skbuff[RX_RING_SIZE];
	dma_addr_t rx_skbuff_dma[RX_RING_SIZE];

	/* The saved address of a sent-in-place packet/buffer, for later free(). */
	struct sk_buff *tx_skbuff[TX_RING_SIZE];
	dma_addr_t tx_skbuff_dma[TX_RING_SIZE];

	/* Tx bounce buffers (Rhine-I only) */
	unsigned char *tx_buf[TX_RING_SIZE];
	unsigned char *tx_bufs;
	dma_addr_t tx_bufs_dma;

	struct pci_dev *pdev;
	long pioaddr;
	struct net_device *dev;
	struct napi_struct napi;
	/* Taken by the send-packet path (see "IIId. Synchronization" above) */
	spinlock_t lock;
	struct work_struct reset_task;

	/* Frequently used values: keep some adjacent for cache effect. */
	u32 hw_flags;	/* flags for hardware features/quirks */
	u32 cfg_flags;	/* status (enabled/disabled) for some hw features */
	struct rx_desc *rx_head_desc;
	unsigned int cur_rx, dirty_rx;	/* Producer/consumer ring indices */
	unsigned int cur_tx, dirty_tx;
	unsigned int rx_buf_sz;		/* Based on MTU+slack. */

	u8 tx_thresh, rx_thresh;

	struct mii_if_info mii_if;
	/* Register base; register_offsets values are added to it for I/O */
	void __iomem *base;
};

/*
 * Bit helpers for 8-bit (BYTE), 16-bit (WORD) and 32-bit (DWORD)
 * registers at address p:
 *   *_REG_BITS_ON(x, p)      read p, OR in x, write the result back
 *   *_REG_BITS_IS_ON(x, p)   nonzero if any bit of x is set in p
 *   *_REG_BITS_OFF(x, p)     read p, clear the bits of x, write back
 *   *_REG_BITS_SET(x, m, p)  read p, clear the bits of mask m, OR in x,
 *                            write back
 * The modifying variants are a separate read followed by a write, and
 * they evaluate p twice, so p must not have side effects.
 */
#define BYTE_REG_BITS_ON(x, p)      do { iowrite8((ioread8((p))|(x)), (p)); } while (0)
#define WORD_REG_BITS_ON(x, p)      do { iowrite16((ioread16((p))|(x)), (p)); } while (0)
#define DWORD_REG_BITS_ON(x, p)     do { iowrite32((ioread32((p))|(x)), (p)); } while (0)

#define BYTE_REG_BITS_IS_ON(x, p)   (ioread8((p)) & (x))
#define WORD_REG_BITS_IS_ON(x, p)   (ioread16((p)) & (x))
#define DWORD_REG_BITS_IS_ON(x, p)  (ioread32((p)) & (x))

#define BYTE_REG_BITS_OFF(x, p)     do { iowrite8(ioread8((p)) & (~(x)), (p)); } while (0)
#define WORD_REG_BITS_OFF(x, p)     do { iowrite16(ioread16((p)) & (~(x)), (p)); } while (0)
#define DWORD_REG_BITS_OFF(x, p)    do { iowrite32(ioread32((p)) & (~(x)), (p)); } while (0)

#define BYTE_REG_BITS_SET(x, m, p)   do { iowrite8((ioread8((p)) & (~(m)))|(x), (p)); } while (0)
#define WORD_REG_BITS_SET(x, m, p)   do { iowrite16((ioread16((p)) & (~(m)))|(x), (p)); } while (0)
#define DWORD_REG_BITS_SET(x, m, p)  do { iowrite32((ioread32((p)) & (~(m)))|(x), (p)); } while (0)

static int  mdio_read(struct net_device *dev, int phy_id, int location);
static void mdio_write(struct net_device *dev, int phy_id, int location, int value);
static int  rhine_open(struct net_device *dev);
static void rhine_reset_task(struct work_struct *work);
static void rhine_tx_timeout(struct net_device *dev);
static netdev_tx_t rhine_start_tx(struct sk_buff *skb,
				  struct net_device *dev);
static irqreturn_t rhine_interrupt(int irq, void *dev_instance);
static void rhine_tx(struct net_device *dev);
static int rhine_rx(struct net_device *dev, int limit);
static void rhine_error(struct net_device *dev, int intr_status);
static void rhine_set_rx_mode(struct net_device *dev);
static struct net_device_stats *rhine_get_stats(struct net_device *dev);
static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
static const struct ethtool_ops netdev_ethtool_ops;
static int  rhine_close(struct net_device *dev);
static void rhine_shutdown (struct pci_dev *pdev);
static void rhine_vlan_rx_add_vid(struct net_device *dev, unsigned short vid);
static void rhine_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid);
static void rhine_set_cam(void __iomem *ioaddr, int idx, u8 *addr);
static void rhine_set_vlan_cam(void __iomem *ioaddr, int idx, u8 *addr);
static void rhine_set_cam_mask(void __iomem *ioaddr, u32 mask);
static void rhine_set_vlan_cam_mask(void __iomem *ioaddr, u32 mask);
static void rhine_init_cam_filter(struct net_device *dev);
static void rhine_update_vcam(struct net_device *dev);

/*
 * Busy-poll until @condition evaluates to true, giving up after it has
 * been evaluated 1024 times.
 *
 * @condition is re-evaluated on every poll, so it should be an expression
 * that re-reads the hardware state. The macro does not report a timeout:
 * callers cannot tell whether the condition was eventually met.
 *
 * When the global debug level is above 1 and more than 512 polls were
 * needed, the number of polls used is logged together with the call site.
 *
 * The poll counter has a macro-private name so that a condition which
 * mentions a caller variable called `i` refers to that variable, not to
 * the counter.
 */
#define RHINE_WAIT_FOR(condition) do {					\
	int rhine_wait_left_ = 1024;					\
	while (!(condition) && --rhine_wait_left_)			\
		;							\
	if (debug > 1 && rhine_wait_left_ < 512)			\
		printk(KERN_INFO "%s: %4d cycles used @ %s:%d\n",	\
				DRV_NAME, 1024 - rhine_wait_left_,	\
				__func__, __LINE__);			\
} while (0)

static inline u32 get_intr_status(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;
	u32 intr_status;

	intr_status = ioread16(ioaddr + IntrStatus);
	/* On Rhine-II, Bit 3 indicates Tx descriptor write-back race. */
	if (rp->hw_flags & RHINE_WB_RACE)
		intr_status |= ioread8(ioaddr + IntrStatus2) << 16;
	return intr_status;
}

/*
 * Put the power-management registers into a known state and, when a
 * wake-up event is latched in the status registers, log what woke the
 * system.  Chips without RHINE_WOL are left untouched.
 */
static void rhine_power_init(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;
	const int six_pats = (rp->hw_flags & RHINE_6_PATS) != 0;
	const char *reason;
	u16 wolstat;

	if (!(rp->hw_flags & RHINE_WOL))
		return;

	/* Make sure chip is in power state D0 */
	iowrite8(ioread8(ioaddr + StickyHW) & 0xFC, ioaddr + StickyHW);

	/* Disable "force PME-enable" */
	iowrite8(0x80, ioaddr + WOLcgClr);

	/*
	 * Clear power-event config bits (WOL); more recent cards can manage
	 * two additional patterns, held in a second register.
	 */
	iowrite8(0xFF, ioaddr + WOLcrClr);
	if (six_pats)
		iowrite8(0x03, ioaddr + WOLcrClr1);

	/* Latch the power-event status bits before wiping them */
	wolstat = ioread8(ioaddr + PwrcsrSet);
	if (six_pats)
		wolstat |= (ioread8(ioaddr + PwrcsrSet1) & 0x03) << 8;

	/* Clear power-event status bits */
	iowrite8(0xFF, ioaddr + PwrcsrClr);
	if (six_pats)
		iowrite8(0x03, ioaddr + PwrcsrClr1);

	if (!wolstat)
		return;

	/* Only an exact match on a single event is given a name */
	if (wolstat == WOLmagic)
		reason = "Magic packet";
	else if (wolstat == WOLlnkon)
		reason = "Link went up";
	else if (wolstat == WOLlnkoff)
		reason = "Link went down";
	else if (wolstat == WOLucast)
		reason = "Unicast packet";
	else if (wolstat == WOLbmcast)
		reason = "Multicast/broadcast packet";
	else
		reason = "Unknown";

	printk(KERN_INFO "%s: Woke system up. Reason: %s.\n",
	       DRV_NAME, reason);
}

/*
 * Soft-reset the chip by writing Cmd1Reset to ChipCmd1.
 *
 * If the reset bit still reads back as set, a message is logged, chips
 * flagged RHINE_FORCE_RST additionally get 0x40 written to MiscCmd, and
 * the bit is polled with RHINE_WAIT_FOR.  Nothing is returned; the outcome
 * is only logged when debug > 1.
 */
static void rhine_chip_reset(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;

	iowrite8(Cmd1Reset, ioaddr + ChipCmd1);
	IOSYNC;

	if (ioread8(ioaddr + ChipCmd1) & Cmd1Reset) {
		printk(KERN_INFO "%s: Reset not complete yet. "
			"Trying harder.\n", DRV_NAME);

		/* Force reset */
		if (rp->hw_flags & RHINE_FORCE_RST)
			iowrite8(0x40, ioaddr + MiscCmd);

		/* Reset can take somewhat longer (rare) */
		RHINE_WAIT_FOR(!(ioread8(ioaddr + ChipCmd1) & Cmd1Reset));
	}

	/* Re-read the reset bit to report whether it finally cleared */
	if (debug > 1)
		printk(KERN_INFO "%s: Reset %s.\n", dev->name,
			(ioread8(ioaddr + ChipCmd1) & Cmd1Reset) ?
			"failed" : "succeeded");
}

#ifdef USE_MMIO
/*
 * Switch on memory-mapped register access using port I/O at @pioaddr:
 * bit 0x20 of ConfigA is set on Rhine-I, bit 0x80 of ConfigD on every
 * other chip.  The remaining bits of the register are preserved.
 */
static void enable_mmio(long pioaddr, u32 hw_flags)
{
	long port;
	int bit;

	if (hw_flags & RHINE_I) {
		/* More recent docs say that this bit is reserved ... */
		port = pioaddr + ConfigA;
		bit = 0x20;
	} else {
		port = pioaddr + ConfigD;
		bit = 0x80;
	}

	outb(inb(port) | bit, port);
}
#endif

/*
 * Loads bytes 0x00-0x05, 0x6E-0x6F, 0x78-0x7B from EEPROM
 * (plus 0x6C for Rhine-I/II)
 *
 * The reload is triggered through port I/O at @pioaddr; the follow-up
 * ConfigA tweak goes through the mapped register base in rp->base.
 */
static void __devinit rhine_reload_eeprom(long pioaddr, struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;

	/* Start the reload and poll until bit 0x20 reads back clear */
	outb(0x20, pioaddr + MACRegEEcsr);
	RHINE_WAIT_FOR(!(inb(pioaddr + MACRegEEcsr) & 0x20));

#ifdef USE_MMIO
	/*
	 * Reloading from EEPROM overwrites ConfigA-D, so we must re-enable
	 * MMIO. If reloading EEPROM was done first this could be avoided, but
	 * it is not known if that still works with the "win98-reboot" problem.
	 */
	enable_mmio(pioaddr, rp->hw_flags);
#endif

	/* Turn off EEPROM-controlled wake-up (magic packet) */
	if (rp->hw_flags & RHINE_WOL)
		iowrite8(ioread8(ioaddr + ConfigA) & 0xFC, ioaddr + ConfigA);

}

#ifdef CONFIG_NET_POLL_CONTROLLER
/*
 * Poll-controller hook: invoke the interrupt handler directly while the
 * device's IRQ line is disabled.
 */
static void rhine_poll(struct net_device *dev)
{
	const unsigned int irq = dev->irq;

	disable_irq(irq);
	rhine_interrupt(irq, dev);
	enable_irq(irq);
}
#endif

/*
 * NAPI poll callback.  Processes up to @budget received frames through
 * rhine_rx() and returns how many were handled.  When the budget was not
 * used up, polling is completed and the full interrupt mask (including
 * the Rx sources) is written back to IntrEnable.
 */
static int rhine_napipoll(struct napi_struct *napi, int budget)
{
	struct rhine_private *rp = container_of(napi, struct rhine_private, napi);
	int work_done = rhine_rx(rp->dev, budget);

	/* Budget exhausted: stay in polling mode */
	if (work_done >= budget)
		return work_done;

	napi_complete(napi);

	iowrite16(IntrRxDone | IntrRxErr | IntrRxEmpty| IntrRxOverflow |
		  IntrRxDropped | IntrRxNoBuf | IntrTxAborted |
		  IntrTxDone | IntrTxError | IntrTxUnderrun |
		  IntrPCIErr | IntrStatsMax | IntrLinkChange,
		  rp->base + IntrEnable);

	return work_done;
}

/*
 * Probe-time hardware bring-up: reset the chip, give Rhine-I a 5 ms
 * pause, then reload the EEPROM-controlled registers.
 */
static void __devinit rhine_hw_init(struct net_device *dev, long pioaddr)
{
	struct rhine_private *priv = netdev_priv(dev);
	const int is_rhine_i = (priv->hw_flags & RHINE_I) != 0;

	/* Wipe out whatever configuration was left behind */
	rhine_chip_reset(dev);

	/* Rhine-I has to settle before the EEPROM may be reloaded */
	if (is_rhine_i)
		msleep(5);

	/* The soft reset cleared the EEPROM-controlled bytes; fetch them again */
	rhine_reload_eeprom(pioaddr, dev);
}

/*
 * net_device callbacks for the Rhine; installed as dev->netdev_ops in
 * rhine_init_one().  MTU change, address validation and MAC address
 * setting use the generic Ethernet helpers.
 */
static const struct net_device_ops rhine_netdev_ops = {
	.ndo_open		 = rhine_open,
	.ndo_stop		 = rhine_close,
	.ndo_start_xmit		 = rhine_start_tx,
	.ndo_get_stats		 = rhine_get_stats,
	.ndo_set_multicast_list	 = rhine_set_rx_mode,
	.ndo_change_mtu		 = eth_change_mtu,
	.ndo_validate_addr	 = eth_validate_addr,
	.ndo_set_mac_address 	 = eth_mac_addr,
	.ndo_do_ioctl		 = netdev_ioctl,
	.ndo_tx_timeout 	 = rhine_tx_timeout,
	.ndo_vlan_rx_add_vid	 = rhine_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid	 = rhine_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
	/* Only present when the kernel is built with poll-controller support */
	.ndo_poll_controller	 = rhine_poll,
#endif
};

/*
 * PCI probe routine: bring up one Rhine adapter.
 *
 * Derives the chip generation and feature flags from the PCI revision,
 * enables the device, maps its registers (BAR 1 when built with USE_MMIO,
 * BAR 0 otherwise), resets the chip, reads the station address, registers
 * the net_device and finally probes the PHY.
 *
 * Returns 0 on success or a negative errno.  On failure the register
 * mapping, the PCI regions and the net_device are released through the
 * err_out_* labels, in reverse order of acquisition.
 */
static int __devinit rhine_init_one(struct pci_dev *pdev,
				    const struct pci_device_id *ent)
{
	struct net_device *dev;
	struct rhine_private *rp;
	int i, rc;
	u32 hw_flags, cfg_flags;
	long pioaddr;
	long memaddr;
	void __iomem *ioaddr;
	int io_size, phy_id;
	const char *name;
#ifdef USE_MMIO
	int bar = 1;
#else
	int bar = 0;
#endif

/* when built into the kernel, we only print version if device is found */
#ifndef MODULE
	static int printed_version;
	if (!printed_version++)
		printk(version);
#endif

	/* Defaults; refined below according to the PCI revision */
	io_size = 256;
	phy_id = 0;
	hw_flags = 0;
	cfg_flags = 0;
	name = "Rhine";
	if (pdev->revision < VTunknown0) {
		hw_flags = RHINE_I;
		io_size = 128;
	}
	else if (pdev->revision >= VT6102) {
		hw_flags = RHINE_WOL | RHINE_FORCE_RST;
		hw_flags |= WAKE_PHY | WAKE_MAGIC | WAKE_UCAST | WAKE_MCAST |
			WAKE_BCAST;	/* Some wake modes are untested */

		if (pdev->revision < VT6105) {
			name = "Rhine II";
			hw_flags |= RHINE_WB_RACE;	/* Rhine-II exclusive */
		}
		else {
			phy_id = 1;	/* Integrated PHY, phy_id fixed to 1 */
			if (pdev->revision >= VT6105_B0)
				hw_flags |= RHINE_6_PATS;
			if (pdev->revision >= VT6105M) {
				name = "Rhine III (Management Adapter)";
				hw_flags |= RHINE_RX_CSUM | RHINE_VLAN;
				cfg_flags |= RHINE_RX_CSUM;
				if (pdev->revision < VT6115) {
					hw_flags |= RHINE_TX_CSUM;
				}
			}
			else
				name = "Rhine III";
		}
	}

	rc = pci_enable_device(pdev);
	if (rc)
		goto err_out;

	/* this should always be supported */
	rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
	if (rc) {
		printk(KERN_ERR "32-bit PCI DMA addresses not supported by "
		       "the card!?\n");
		goto err_out;
	}

	/* sanity check: both BARs must be large enough for the register file */
	if ((pci_resource_len(pdev, 0) < io_size) ||
	    (pci_resource_len(pdev, 1) < io_size)) {
		rc = -EIO;
		printk(KERN_ERR "Insufficient PCI resources, aborting\n");
		goto err_out;
	}

	pioaddr = pci_resource_start(pdev, 0);
	memaddr = pci_resource_start(pdev, 1);

	pci_set_master(pdev);

	dev = alloc_etherdev(sizeof(struct rhine_private));
	if (!dev) {
		rc = -ENOMEM;
		printk(KERN_ERR "alloc_etherdev failed\n");
		goto err_out;
	}
	SET_NETDEV_DEV(dev, &pdev->dev);

	rp = netdev_priv(dev);
	rp->dev = dev;
	rp->hw_flags = hw_flags;
	rp->cfg_flags = cfg_flags;
	rp->pioaddr = pioaddr;
	rp->pdev = pdev;

	rc = pci_request_regions(pdev, DRV_NAME);
	if (rc)
		goto err_out_free_netdev;

	ioaddr = pci_iomap(pdev, bar, io_size);
	if (!ioaddr) {
		rc = -EIO;
		printk(KERN_ERR "ioremap failed for device %s, region 0x%X "
		       "@ 0x%lX\n", pci_name(pdev), io_size, memaddr);
		goto err_out_free_res;
	}

#ifdef USE_MMIO
	enable_mmio(pioaddr, hw_flags);

	/* Check that selected MMIO registers match the PIO ones */
	i = 0;
	while (mmio_verify_registers[i]) {
		int reg = mmio_verify_registers[i++];
		unsigned char a = inb(pioaddr+reg);
		unsigned char b = readb(ioaddr+reg);
		if (a != b) {
			rc = -EIO;
			printk(KERN_ERR "MMIO do not match PIO [%02x] "
			       "(%02x != %02x)\n", reg, a, b);
			goto err_out_unmap;
		}
	}
#endif /* USE_MMIO */

	dev->base_addr = (unsigned long)ioaddr;
	rp->base = ioaddr;

	/* Get chip registers into a sane state */
	rhine_power_init(dev);
	rhine_hw_init(dev, pioaddr);

	/* The station address is available in the registers after the reload */
	for (i = 0; i < 6; i++)
		dev->dev_addr[i] = ioread8(ioaddr + StationAddr + i);
	memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	if (!is_valid_ether_addr(dev->perm_addr)) {
		rc = -EIO;
		printk(KERN_ERR "Invalid MAC address\n");
		goto err_out_unmap;
	}

	/* For Rhine-I/II, phy_id is loaded from EEPROM */
	if (!phy_id)
		phy_id = ioread8(ioaddr + 0x6C);

	dev->irq = pdev->irq;

	spin_lock_init(&rp->lock);
	INIT_WORK(&rp->reset_task, rhine_reset_task);

	rp->mii_if.dev = dev;
	rp->mii_if.mdio_read = mdio_read;
	rp->mii_if.mdio_write = mdio_write;
	rp->mii_if.phy_id_mask = 0x1f;
	rp->mii_if.reg_num_mask = 0x1f;

	/* The chip-specific entries in the device structure. */
	dev->netdev_ops = &rhine_netdev_ops;
	dev->ethtool_ops = &netdev_ethtool_ops;
	dev->watchdog_timeo = TX_TIMEOUT;

	netif_napi_add(dev, &rp->napi, rhine_napipoll, 64);

	if (rp->hw_flags & RHINE_I)
		dev->features |= NETIF_F_SG|NETIF_F_HW_CSUM;

	if (rp->hw_flags & RHINE_VLAN)
		dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX |
		NETIF_F_HW_VLAN_FILTER;

	if (rp->hw_flags & RHINE_TX_CSUM)
		dev->features |= NETIF_F_IP_CSUM;

	/* dev->name not defined before register_netdev()! */
	rc = register_netdev(dev);
	if (rc)
		goto err_out_unmap;

	printk(KERN_INFO "%s: VIA %s at 0x%lx, %pM, IRQ %d.\n",
	       dev->name, name,
#ifdef USE_MMIO
	       memaddr,
#else
	       (long)ioaddr,
#endif
	       dev->dev_addr, pdev->irq);

	pci_set_drvdata(pdev, dev);

	{
		u16 mii_cmd;
		int mii_status = mdio_read(dev, phy_id, 1);
		/* Take the PHY out of isolate mode */
		mii_cmd = mdio_read(dev, phy_id, MII_BMCR) & ~BMCR_ISOLATE;
		mdio_write(dev, phy_id, MII_BMCR, mii_cmd);
		/* All-ones or all-zeroes status is treated as "no PHY here" */
		if (mii_status != 0xffff && mii_status != 0x0000) {
			rp->mii_if.advertising = mdio_read(dev, phy_id, 4);
			printk(KERN_INFO "%s: MII PHY found at address "
			       "%d, status 0x%4.4x advertising %4.4x "
			       "Link %4.4x.\n", dev->name, phy_id,
			       mii_status, rp->mii_if.advertising,
			       mdio_read(dev, phy_id, 5));

			/* set IFF_RUNNING */
			if (mii_status & BMSR_LSTATUS)
				netif_carrier_on(dev);
			else
				netif_carrier_off(dev);

		}
	}
	rp->mii_if.phy_id = phy_id;
	if (debug > 1 && avoid_D3)
		printk(KERN_INFO "%s: No D3 power state at shutdown.\n",
		       dev->name);

	return 0;

err_out_unmap:
	pci_iounmap(pdev, ioaddr);
err_out_free_res:
	pci_release_regions(pdev);
err_out_free_netdev:
	free_netdev(dev);
err_out:
	return rc;
}

static int alloc_ring(struct net_device* dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void *ring;
	dma_addr_t ring_dma;

	ring = pci_alloc_consistent(rp->pdev,
				    RX_RING_SIZE * sizeof(struct rx_desc) +
				    TX_RING_SIZE * sizeof(struct tx_desc),
				    &ring_dma);
	if (!ring) {
		printk(KERN_ERR "Could not allocate DMA memory.\n");
		return -ENOMEM;
	}
	if (rp->hw_flags & RHINE_I) {
		rp->tx_bufs = pci_alloc_consistent(rp->pdev,
						   PKT_BUF_SZ * TX_RING_SIZE,
						   &rp->tx_bufs_dma);
		if (rp->tx_bufs == NULL) {
			pci_free_consistent(rp->pdev,
				    RX_RING_SIZE * sizeof(struct rx_desc) +
				    TX_RING_SIZE * sizeof(struct tx_desc),
				    ring, ring_dma);
			return -ENOMEM;
		}
	}

	rp->rx_ring = ring;
	rp->tx_ring = ring + RX_RING_SIZE * sizeof(struct rx_desc);
	rp->rx_ring_dma = ring_dma;
	rp->tx_ring_dma = ring_dma + RX_RING_SIZE * sizeof(struct rx_desc);

	return 0;
}

static void free_ring(struct net_device* dev)
{
	struct rhine_private *rp = netdev_priv(dev);

	pci_free_consistent(rp->pdev,
			    RX_RING_SIZE * sizeof(struct rx_desc) +
			    TX_RING_SIZE * sizeof(struct tx_desc),
			    rp->rx_ring, rp->rx_ring_dma);
	rp->tx_ring = NULL;

	if (rp->tx_bufs)
		pci_free_consistent(rp->pdev, PKT_BUF_SZ * TX_RING_SIZE,
				    rp->tx_bufs, rp->tx_bufs_dma);

	rp->tx_bufs = NULL;

}

/*
 * Initialise the Rx descriptor ring and attach a freshly allocated,
 * DMA-mapped skb to each slot.
 *
 * Buffer size is PKT_BUF_SZ for MTUs up to 1500, otherwise MTU + 32.
 * If an skb allocation fails, filling stops at that slot; the remaining
 * descriptors keep rx_status 0 and a NULL skb.
 */
static void alloc_rbufs(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	dma_addr_t next;
	int i;

	rp->dirty_rx = rp->cur_rx = 0;

	rp->rx_buf_sz = (dev->mtu <= 1500 ? PKT_BUF_SZ : dev->mtu + 32);
	rp->rx_head_desc = &rp->rx_ring[0];
	next = rp->rx_ring_dma;

	/* Init the ring entries */
	for (i = 0; i < RX_RING_SIZE; i++) {
		rp->rx_ring[i].rx_status = 0;
		rp->rx_ring[i].desc_length = cpu_to_le32(rp->rx_buf_sz);
		/* Each descriptor points at the bus address of its successor */
		next += sizeof(struct rx_desc);
		rp->rx_ring[i].next_desc = cpu_to_le32(next);
		rp->rx_skbuff[i] = NULL;
	}
	/* Mark the last entry as wrapping the ring. */
	rp->rx_ring[i-1].next_desc = cpu_to_le32(rp->rx_ring_dma);

	/* Fill in the Rx buffers.  Handle allocation failure gracefully. */
	for (i = 0; i < RX_RING_SIZE; i++) {
		struct sk_buff *skb = netdev_alloc_skb(dev, rp->rx_buf_sz);
		rp->rx_skbuff[i] = skb;
		if (skb == NULL)
			break;
		skb->dev = dev;                 /* Mark as being used by this device. */

		/* NOTE(review): the mapping result is not checked for errors */
		rp->rx_skbuff_dma[i] =
			pci_map_single(rp->pdev, skb->data, rp->rx_buf_sz,
				       PCI_DMA_FROMDEVICE);

		rp->rx_ring[i].addr = cpu_to_le32(rp->rx_skbuff_dma[i]);
		/* Setting RSR_OWN last makes the filled slot available to the chip */
		rp->rx_ring[i].rx_status = cpu_to_le32(RSR_OWN);
	}
	/*
	 * i is the number of slots filled.  The unsigned wrap-around makes
	 * cur_rx - dirty_rx equal to the number of slots still lacking a
	 * buffer (0 when the whole ring was filled).
	 */
	rp->dirty_rx = (unsigned int)(i - RX_RING_SIZE);
}

static void free_rbufs(struct net_device* dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	int i;

	/* Free all the skbuffs in the Rx queue. */
	for (i = 0; i < RX_RING_SIZE; i++) {
		rp->rx_ring[i].rx_status = 0;
		rp->rx_ring[i].addr = cpu_to_le32(0xBADF00D0); /* An invalid address. */
		if (rp->rx_skbuff[i]) {
			pci_unmap_single(rp->pdev,
					 rp->rx_skbuff_dma[i],
					 rp->rx_buf_sz, PCI_DMA_FROMDEVICE);
			dev_kfree_skb(rp->rx_skbuff[i]);
		}
		rp->rx_skbuff[i] = NULL;
	}
}

/*
 * Reset the Tx ring to empty: zero the producer/consumer counters, clear
 * every descriptor, chain the descriptors into a circle and, on Rhine-I,
 * point each slot's tx_buf at its PKT_BUF_SZ slice of rp->tx_bufs.
 */
static void alloc_tbufs(struct net_device* dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	const int bounce = (rp->hw_flags & RHINE_I) != 0;
	dma_addr_t next_dma = rp->tx_ring_dma;
	int slot;

	rp->dirty_tx = rp->cur_tx = 0;

	for (slot = 0; slot < TX_RING_SIZE; slot++) {
		/* Bus address of the descriptor following this one */
		next_dma += sizeof(struct tx_desc);

		rp->tx_skbuff[slot] = NULL;
		rp->tx_ring[slot].tx_status = 0;
		rp->tx_ring[slot].desc_length = cpu_to_le32(TXDESC);
		rp->tx_ring[slot].next_desc = cpu_to_le32(next_dma);
		if (bounce)
			rp->tx_buf[slot] = &rp->tx_bufs[slot * PKT_BUF_SZ];
	}

	/* The last descriptor links back to the first one */
	rp->tx_ring[TX_RING_SIZE - 1].next_desc = cpu_to_le32(rp->tx_ring_dma);
}

static void free_tbufs(struct net_device* dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	int i;

	for (i = 0; i < TX_RING_SIZE; i++) {
		rp->tx_ring[i].tx_status = 0;
		rp->tx_ring[i].desc_length = cpu_to_le32(TXDESC);
		rp->tx_ring[i].addr = cpu_to_le32(0xBADF00D0); /* An invalid address. */
		if (rp->tx_skbuff[i]) {
			if (rp->tx_skbuff_dma[i]) {
				pci_unmap_single(rp->pdev,
						 rp->tx_skbuff_dma[i],
						 rp->tx_skbuff[i]->len,
						 PCI_DMA_TODEVICE);
			}
			dev_kfree_skb(rp->tx_skbuff[i]);
		}
		rp->tx_skbuff[i] = NULL;
		rp->tx_buf[i] = NULL;
	}
}

static void rhine_check_media(struct net_device *dev, unsigned int init_media)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;

	mii_check_media(&rp->mii_if, debug, init_media);

	if (rp->mii_if.full_duplex)
	    iowrite8(ioread8(ioaddr + ChipCmd1) | Cmd1FDuplex,
		   ioaddr + ChipCmd1);
	else
	    iowrite8(ioread8(ioaddr + ChipCmd1) & ~Cmd1FDuplex,
		   ioaddr + ChipCmd1);
	if (debug > 1)
		printk(KERN_INFO "%s: force_media %d, carrier %d\n", dev->name,
			rp->mii_if.force_media, netif_carrier_ok(dev));
}

/*
 * Called after the force_media setting may have changed.
 * With forced media (autoneg off) the link is assumed to be up, so the
 * carrier is switched on if it is not already; otherwise the MII library
 * is asked to update the carrier state.
 */
static void rhine_set_carrier(struct mii_if_info *mii)
{
	struct net_device *dev = mii->dev;

	if (!mii->force_media)
		rhine_check_media(dev, 0);
	else if (!netif_carrier_ok(dev))
		netif_carrier_on(dev);

	if (debug > 1)
		printk(KERN_INFO "%s: force_media %d, carrier %d\n",
		       dev->name, mii->force_media,
		       netif_carrier_ok(dev));
}

/**
 * rhine_set_cam - set CAM multicast filters
 * @ioaddr: register block of this Rhine
 * @idx: multicast CAM index [0..MCAM_SIZE-1]
 * @addr: multicast address (6 bytes)
 *
 * Load one multicast address into the multicast CAM entry selected by
 * @idx.  The entry only takes part in filtering once its bit is set via
 * rhine_set_cam_mask() -- presumably; confirm against the datasheet.
 */
static void rhine_set_cam(void __iomem *ioaddr, int idx, u8 *addr)
{
	int i;

	/* Enable CAM access (multicast CAM, CAMC_VCAMSL not set) */
	iowrite8(CAMC_CAMEN, ioaddr + CamCon);
	wmb();

	/* Paranoid -- idx out of range should never happen */
	idx &= (MCAM_SIZE - 1);

	iowrite8((u8) idx, ioaddr + CamAddr);

	/* With CAM access enabled the address bytes go through MulticastFilter0.. */
	for (i = 0; i < 6; i++, addr++)
		iowrite8(*addr, ioaddr + MulticastFilter0 + i);
	udelay(10);
	wmb();

	/* Strobe the write, give the chip 10us, then leave CAM access mode */
	iowrite8(CAMC_CAMWR | CAMC_CAMEN, ioaddr + CamCon);
	udelay(10);

	iowrite8(0, ioaddr + CamCon);
}

/**
 * rhine_set_vlan_cam - set CAM VLAN filters
 * @ioaddr: register block of this Rhine
 * @idx: VLAN CAM index [0..VCAM_SIZE-1]
 * @addr: VLAN ID (2 bytes)
 *
 * Load one VLAN ID into the VLAN CAM entry selected by @idx.
 * @addr is dereferenced as a u16, so it must point at a 16-bit value in
 * CPU byte order.
 */
static void rhine_set_vlan_cam(void __iomem *ioaddr, int idx, u8 *addr)
{
	/* Enable CAM access with the VLAN CAM selected */
	iowrite8(CAMC_CAMEN | CAMC_VCAMSL, ioaddr + CamCon);
	wmb();

	/* Paranoid -- idx out of range should never happen */
	idx &= (VCAM_SIZE - 1);

	iowrite8((u8) idx, ioaddr + CamAddr);

	/* The VLAN ID is written 6 bytes past MulticastFilter0 */
	iowrite16(*((u16 *) addr), ioaddr + MulticastFilter0 + 6);
	udelay(10);
	wmb();

	/*
	 * NOTE(review): the write strobe does not include CAMC_VCAMSL, unlike
	 * the select at the top -- confirm against the datasheet.
	 */
	iowrite8(CAMC_CAMWR | CAMC_CAMEN, ioaddr + CamCon);
	udelay(10);

	iowrite8(0, ioaddr + CamCon);
}

/**
 * rhine_set_cam_mask - set multicast CAM mask
 * @ioaddr: register block of this Rhine
 * @mask: multicast CAM mask
 *
 * Mask sets multicast filters active/inactive.  The 32-bit @mask is
 * written to CamMask while CAM access is enabled for the multicast CAM.
 */
static void rhine_set_cam_mask(void __iomem *ioaddr, u32 mask)
{
	/* enable CAM access (multicast CAM, CAMC_VCAMSL not set) */
	iowrite8(CAMC_CAMEN, ioaddr + CamCon);
	wmb();

	/* write mask */
	iowrite32(mask, ioaddr + CamMask);

	/* disable CAMEN */
	iowrite8(0, ioaddr + CamCon);
}

/**
 * rhine_set_vlan_cam_mask - set VLAN CAM mask
 * @ioaddr: register block of this Rhine
 * @mask: VLAN CAM mask
 *
 * Mask sets VLAN filters active/inactive.  The 32-bit @mask is written to
 * CamMask while CAM access is enabled with the VLAN CAM selected.
 */
static void rhine_set_vlan_cam_mask(void __iomem *ioaddr, u32 mask)
{
	/* enable CAM access with the VLAN CAM selected */
	iowrite8(CAMC_CAMEN | CAMC_VCAMSL, ioaddr + CamCon);
	wmb();

	/* write mask */
	iowrite32(mask, ioaddr + CamMask);

	/* disable CAMEN */
	iowrite8(0, ioaddr + CamCon);
}

/**
 * rhine_init_cam_filter - initialize CAM filters
 * @dev: network device
 *
 * Initialize (disable) hardware VLAN and multicast support on this
 * Rhine: both CAM masks are cleared, TCR_PQEN is set in TxConfig and
 * BCR1_VIDFR is cleared in PCIBusConfig1.
 */
static void rhine_init_cam_filter(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;

	/* Disable all CAMs */
	rhine_set_vlan_cam_mask(ioaddr, 0);
	rhine_set_cam_mask(ioaddr, 0);

	/*
	 * disable hardware VLAN support
	 * NOTE(review): TCR_PQEN is switched ON here while BCR1_VIDFR is
	 * switched off -- confirm that setting PQEN matches "disable".
	 */
	BYTE_REG_BITS_ON(TCR_PQEN, ioaddr + TxConfig);
	BYTE_REG_BITS_OFF(BCR1_VIDFR, ioaddr + PCIBusConfig1);
}

/**
 * rhine_update_vcam - update VLAN CAM filters
 * @dev: network device
 *
 * Update VLAN CAM filters to match configuration change: every VLAN ID
 * set in rp->active_vlans is loaded into the next VLAN CAM slot (at most
 * VCAM_SIZE of them; further IDs are ignored), then the VLAN CAM mask is
 * written with one bit per slot that was loaded.
 *
 * The callers in this file invoke it with rp->lock held.
 */
static void rhine_update_vcam(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;
	u16 vid;
	u32 vCAMmask = 0;	/* 32 vCAMs (6105M and better) */
	unsigned int i = 0;

	for_each_set_bit(vid, rp->active_vlans, VLAN_N_VID) {
		rhine_set_vlan_cam(ioaddr, i, (u8 *)&vid);
		/* Unsigned constant: a signed 1 << 31 would overflow int */
		vCAMmask |= 1U << i;
		if (++i >= VCAM_SIZE)
			break;
	}
	rhine_set_vlan_cam_mask(ioaddr, vCAMmask);
}

static void rhine_vlan_rx_add_vid(struct net_device *dev, unsigned short vid)
{
	struct rhine_private *rp = netdev_priv(dev);

	spin_lock_irq(&rp->lock);
	set_bit(vid, rp->active_vlans);
	rhine_update_vcam(dev);
	spin_unlock_irq(&rp->lock);
}

static void rhine_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid)
{
	struct rhine_private *rp = netdev_priv(dev);

	spin_lock_irq(&rp->lock);
	clear_bit(vid, rp->active_vlans);
	rhine_update_vcam(dev);
	spin_unlock_irq(&rp->lock);
}

/*
 * Program the chip for operation after a reset: station address, bus and
 * FIFO configuration, ring base addresses, Rx mode and (on RHINE_VLAN
 * chips) the CAM filters.  Then enable NAPI, unmask the interrupt sources,
 * start the Tx/Rx engines and run an initial media check.
 *
 * Called from rhine_open() and rhine_reset_task(), in both cases right
 * after rhine_chip_reset().
 */
static void init_registers(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;
	int i;

	for (i = 0; i < 6; i++)
		iowrite8(dev->dev_addr[i], ioaddr + StationAddr + i);

	/* Initialize other registers. */
	iowrite16(0x0006, ioaddr + PCIBusConfig);	/* Tune configuration??? */
	/* Configure initial FIFO thresholds. */
	iowrite8(0x20, ioaddr + TxConfig);
	rp->tx_thresh = 0x20;
	rp->rx_thresh = 0x60;		/* Written in rhine_set_rx_mode(). */

	/* Tell the chip where the descriptor rings start (bus addresses) */
	iowrite32(rp->rx_ring_dma, ioaddr + RxRingPtr);
	iowrite32(rp->tx_ring_dma, ioaddr + TxRingPtr);

	rhine_set_rx_mode(dev);

	if (rp->hw_flags & RHINE_VLAN)
		rhine_init_cam_filter(dev);

	/* NAPI is enabled before the interrupt sources are unmasked below */
	napi_enable(&rp->napi);

	/* Enable interrupts by setting the interrupt mask. */
	iowrite16(IntrRxDone | IntrRxErr | IntrRxEmpty| IntrRxOverflow |
	       IntrRxDropped | IntrRxNoBuf | IntrTxAborted |
	       IntrTxDone | IntrTxError | IntrTxUnderrun |
	       IntrPCIErr | IntrStatsMax | IntrLinkChange,
	       ioaddr + IntrEnable);

	/* 16-bit write: low byte goes to ChipCmd, Cmd1NoTxPoll lands in the high byte */
	iowrite16(CmdStart | CmdTxOn | CmdRxOn | (Cmd1NoTxPoll << 8),
	       ioaddr + ChipCmd);
	rhine_check_media(dev, 1);
}

/*
 * Enable MII link status auto-polling (required for IntrLinkChange).
 * Sequence: clear MIICmd, select MII_BMSR, write 0x80 to MIICmd, wait for
 * bit 0x20 of MIIRegAddr, then write MII_BMSR | 0x40 to MIIRegAddr.
 */
static void rhine_enable_linkmon(void __iomem *ioaddr)
{
	iowrite8(0, ioaddr + MIICmd);
	iowrite8(MII_BMSR, ioaddr + MIIRegAddr);
	iowrite8(0x80, ioaddr + MIICmd);

	/* Bounded poll; a timeout is not reported */
	RHINE_WAIT_FOR((ioread8(ioaddr + MIIRegAddr) & 0x20));

	iowrite8(MII_BMSR | 0x40, ioaddr + MIIRegAddr);
}

/*
 * Disable MII link status auto-polling (required for MDIO access).
 * MIICmd is cleared first.  Rhine-I then needs an extra on/off cycle with
 * a 1 ms busy-wait in between; other chips just poll bit 0x80 of
 * MIIRegAddr.  On return MIICmd has been left at 0.
 */
static void rhine_disable_linkmon(void __iomem *ioaddr, u32 hw_flags)
{
	iowrite8(0, ioaddr + MIICmd);

	if (hw_flags & RHINE_I) {
		iowrite8(0x01, ioaddr + MIIRegAddr);	// MII_BMSR

		/* Can be called from ISR. Evil. */
		mdelay(1);

		/* 0x80 must be set immediately before turning it off */
		iowrite8(0x80, ioaddr + MIICmd);

		RHINE_WAIT_FOR(ioread8(ioaddr + MIIRegAddr) & 0x20);

		/* Heh. Now clear 0x80 again. */
		iowrite8(0, ioaddr + MIICmd);
	}
	else
		RHINE_WAIT_FOR(ioread8(ioaddr + MIIRegAddr) & 0x80);
}

/* Read and write over the MII Management Data I/O (MDIO) interface. */

/*
 * Read PHY register @regnum of the PHY at address @phy_id.
 * Link auto-polling is switched off for the access and switched back on
 * afterwards.  Returns the 16-bit value found in MIIData.
 * NOTE(review): the wait for completion is bounded and a timeout is not
 * detected, so the returned value may be stale in that case.
 */
static int mdio_read(struct net_device *dev, int phy_id, int regnum)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;
	int result;

	rhine_disable_linkmon(ioaddr, rp->hw_flags);

	/* rhine_disable_linkmon already cleared MIICmd */
	iowrite8(phy_id, ioaddr + MIIPhyAddr);
	iowrite8(regnum, ioaddr + MIIRegAddr);
	iowrite8(0x40, ioaddr + MIICmd);		/* Trigger read */
	/* Wait for the read bit to clear again */
	RHINE_WAIT_FOR(!(ioread8(ioaddr + MIICmd) & 0x40));
	result = ioread16(ioaddr + MIIData);

	rhine_enable_linkmon(ioaddr);
	return result;
}

/*
 * Write @value to PHY register @regnum of the PHY at address @phy_id.
 * Link auto-polling is switched off for the access and switched back on
 * afterwards.  The wait for completion is bounded; a timeout is not
 * reported.
 */
static void mdio_write(struct net_device *dev, int phy_id, int regnum, int value)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;

	rhine_disable_linkmon(ioaddr, rp->hw_flags);

	/* rhine_disable_linkmon already cleared MIICmd */
	iowrite8(phy_id, ioaddr + MIIPhyAddr);
	iowrite8(regnum, ioaddr + MIIRegAddr);
	iowrite16(value, ioaddr + MIIData);
	iowrite8(0x20, ioaddr + MIICmd);		/* Trigger write */
	/* Wait for the write bit to clear again */
	RHINE_WAIT_FOR(!(ioread8(ioaddr + MIICmd) & 0x20));

	rhine_enable_linkmon(ioaddr);
}

/*
 * ndo_open callback: request the (shared) interrupt, allocate the
 * descriptor rings and buffers, reset and program the chip, and start the
 * transmit queue.
 *
 * Returns 0 on success, or the error from request_irq() / alloc_ring().
 * If alloc_ring() fails the interrupt is released again.
 */
static int rhine_open(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;
	int rc;

	rc = request_irq(rp->pdev->irq, rhine_interrupt, IRQF_SHARED, dev->name,
			dev);
	if (rc)
		return rc;

	if (debug > 1)
		printk(KERN_DEBUG "%s: rhine_open() irq %d.\n",
		       dev->name, rp->pdev->irq);

	rc = alloc_ring(dev);
	if (rc) {
		free_irq(rp->pdev->irq, dev);
		return rc;
	}
	/* alloc_rbufs() tolerates skb allocation failure; it returns nothing */
	alloc_rbufs(dev);
	alloc_tbufs(dev);
	rhine_chip_reset(dev);
	init_registers(dev);
	if (debug > 2)
		printk(KERN_DEBUG "%s: Done rhine_open(), status %4.4x "
		       "MII status: %4.4x.\n",
		       dev->name, ioread16(ioaddr + ChipCmd),
		       mdio_read(dev, rp->mii_if.phy_id, MII_BMSR));

	netif_start_queue(dev);

	return 0;
}

/*
 * Work item scheduled by rhine_tx_timeout(): rebuild both rings and
 * reinitialise the hardware.
 *
 * The device IRQ and NAPI are disabled first; the ring rebuild and chip
 * reset then run under rp->lock with bottom halves off.  init_registers()
 * re-enables NAPI.  Afterwards the IRQ is re-enabled, one tx_error is
 * accounted and the transmit queue is woken.
 */
static void rhine_reset_task(struct work_struct *work)
{
	struct rhine_private *rp = container_of(work, struct rhine_private,
						reset_task);
	struct net_device *dev = rp->dev;

	/* protect against concurrent rx interrupts */
	disable_irq(rp->pdev->irq);

	napi_disable(&rp->napi);

	spin_lock_bh(&rp->lock);

	/* clear all descriptors */
	free_tbufs(dev);
	free_rbufs(dev);
	alloc_tbufs(dev);
	alloc_rbufs(dev);

	/* Reinitialize the hardware. */
	rhine_chip_reset(dev);
	init_registers(dev);

	spin_unlock_bh(&rp->lock);
	enable_irq(rp->pdev->irq);

	dev->trans_start = jiffies; /* prevent tx timeout */
	dev->stats.tx_errors++;
	netif_wake_queue(dev);
}

/*
 * ndo_tx_timeout callback: log the interrupt status and the PHY's BMSR,
 * then defer the actual recovery to rhine_reset_task() via the
 * rp->reset_task work item.
 */
static void rhine_tx_timeout(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;

	printk(KERN_WARNING "%s: Transmit timed out, status %4.4x, PHY status "
	       "%4.4x, resetting...\n",
	       dev->name, ioread16(ioaddr + IntrStatus),
	       mdio_read(dev, rp->mii_if.phy_id, MII_BMSR));

	schedule_work(&rp->reset_task);
}

/*
 * ndo_start_xmit callback: queue one frame in the next Tx descriptor.
 *
 * The frame is padded to ETH_ZLEN.  On Rhine-I, frames that are not 4-byte
 * aligned, are fragmented or still need a checksum are copied into the
 * slot's pre-allocated tx_buf; all other frames are DMA-mapped in place.
 * VLAN tag and checksum-offload requests are encoded in the descriptor.
 * Ownership is handed to the chip under rp->lock and the transmitter is
 * poked.  The queue is stopped once TX_QUEUE_LEN frames are outstanding.
 *
 * Always returns NETDEV_TX_OK; frames that cannot be sent are dropped.
 */
static netdev_tx_t rhine_start_tx(struct sk_buff *skb,
				  struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;
	unsigned entry;
	unsigned long flags;

	/* Caution: the write order is important here, set the field
	   with the "ownership" bits last. */

	/* Calculate the next Tx descriptor entry. */
	entry = rp->cur_tx % TX_RING_SIZE;

	/*
	 * Nothing to send if padding failed (skb_padto() is documented to
	 * free the skb itself on failure -- hence no free here).
	 */
	if (skb_padto(skb, ETH_ZLEN))
		return NETDEV_TX_OK;

	rp->tx_skbuff[entry] = skb;

	if ((rp->hw_flags & RHINE_I) &&
	    (((unsigned long)skb->data & 3) || skb_shinfo(skb)->nr_frags != 0 || skb->ip_summed == CHECKSUM_PARTIAL)) {
		/* Must use alignment buffer. */
		if (skb->len > PKT_BUF_SZ) {
			/* packet too long, drop it */
			dev_kfree_skb(skb);
			rp->tx_skbuff[entry] = NULL;
			dev->stats.tx_dropped++;
			return NETDEV_TX_OK;
		}

		/* Padding is not copied and so must be redone. */
		skb_copy_and_csum_dev(skb, rp->tx_buf[entry]);
		if (skb->len < ETH_ZLEN)
			memset(rp->tx_buf[entry] + skb->len, 0,
			       ETH_ZLEN - skb->len);
		/* Zero handle: tells the cleanup paths there is nothing to unmap */
		rp->tx_skbuff_dma[entry] = 0;
		rp->tx_ring[entry].addr = cpu_to_le32(rp->tx_bufs_dma +
						      (rp->tx_buf[entry] -
						       rp->tx_bufs));
	} else {
		/* NOTE(review): the mapping result is not checked for errors */
		rp->tx_skbuff_dma[entry] =
			pci_map_single(rp->pdev, skb->data, skb->len,
				       PCI_DMA_TODEVICE);
		rp->tx_ring[entry].addr = cpu_to_le32(rp->tx_skbuff_dma[entry]);
	}

	rp->tx_ring[entry].desc_length =
		cpu_to_le32(TXDESC | (skb->len >= ETH_ZLEN ? skb->len : ETH_ZLEN));

	if (unlikely(vlan_tx_tag_present(skb))) {
		/* The tag travels in the upper 16 bits of tx_status */
		rp->tx_ring[entry].tx_status = cpu_to_le32((vlan_tx_tag_get(skb)) << 16);
		/* request tagging */
		rp->tx_ring[entry].desc_length |= cpu_to_le32(TCR_TAG);
	}
	else
		rp->tx_ring[entry].tx_status = 0;

	if ((dev->features & NETIF_F_IP_CSUM) &&
	    (skb->ip_summed == CHECKSUM_PARTIAL)) {
		/* Header is interpreted as IPv4 here */
		struct iphdr *ip = ip_hdr(skb);
		if (ip->protocol == IPPROTO_TCP)
			rp->tx_ring[entry].desc_length |=
				cpu_to_le32(TCR_TCPCK);
		else if (ip->protocol == IPPROTO_UDP)
			rp->tx_ring[entry].desc_length |=
				cpu_to_le32(TCR_UDPCK);
		rp->tx_ring[entry].desc_length |= cpu_to_le32(TCR_IPCK);
	}

	/* lock eth irq */
	spin_lock_irqsave(&rp->lock, flags);
	/* Descriptor contents must be visible before ownership changes hands */
	wmb();
	rp->tx_ring[entry].tx_status |= cpu_to_le32(TSR_OWN);
	wmb();

	rp->cur_tx++;

	/* Non-x86 Todo: explicitly flush cache lines here. */

	if (vlan_tx_tag_present(skb))
		/* Tx queues are bits 7-0 (first Tx queue: bit 7) */
		BYTE_REG_BITS_ON(1 << 7, ioaddr + TQWake);

	/* Wake the potentially-idle transmit channel */
	iowrite8(ioread8(ioaddr + ChipCmd1) | Cmd1TxDemand,
	       ioaddr + ChipCmd1);
	IOSYNC;

	/* Ring is considered full at TX_QUEUE_LEN outstanding frames */
	if (rp->cur_tx == rp->dirty_tx + TX_QUEUE_LEN)
		netif_stop_queue(dev);

	spin_unlock_irqrestore(&rp->lock, flags);

	if (debug > 4) {
		printk(KERN_DEBUG "%s: Transmit frame #%d queued in slot %d.\n",
		       dev->name, rp->cur_tx-1, entry);
	}
	return NETDEV_TX_OK;
}

/*
 * Interrupt handler.  Loops while get_intr_status() reports pending
 * sources (at most max_interrupt_work + 1 passes): acknowledges them,
 * hands Rx work to NAPI with the Rx interrupt sources masked, cleans up
 * completed transmissions via rhine_tx() and forwards abnormal events to
 * rhine_error().
 *
 * Returns IRQ_HANDLED if at least one source was pending, else IRQ_NONE.
 */
static irqreturn_t rhine_interrupt(int irq, void *dev_instance)
{
	struct net_device *dev = dev_instance;
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;
	u32 intr_status;
	int boguscnt = max_interrupt_work;
	int handled = 0;

	while ((intr_status = get_intr_status(dev))) {
		handled = 1;

		/* Acknowledge all of the current interrupt sources ASAP. */
		if (intr_status & IntrTxDescRace)
			iowrite8(0x08, ioaddr + IntrStatus2);
		iowrite16(intr_status & 0xffff, ioaddr + IntrStatus);
		IOSYNC;

		if (debug > 4)
			printk(KERN_DEBUG "%s: Interrupt, status %8.8x.\n",
			       dev->name, intr_status);

		if (intr_status & (IntrRxDone | IntrRxErr | IntrRxDropped |
				   IntrRxWakeUp | IntrRxEmpty | IntrRxNoBuf)) {
			/*
			 * Leave only the non-Rx sources enabled; rhine_napipoll()
			 * restores the full mask when it completes.
			 */
			iowrite16(IntrTxAborted |
				  IntrTxDone | IntrTxError | IntrTxUnderrun |
				  IntrPCIErr | IntrStatsMax | IntrLinkChange,
				  ioaddr + IntrEnable);

			napi_schedule(&rp->napi);
		}

		if (intr_status & (IntrTxErrSummary | IntrTxDone)) {
			if (intr_status & IntrTxErrSummary) {
				/* Avoid scavenging before Tx engine turned off */
				RHINE_WAIT_FOR(!(ioread8(ioaddr+ChipCmd) & CmdTxOn));
				if (debug > 2 &&
				    ioread8(ioaddr+ChipCmd) & CmdTxOn)
					printk(KERN_WARNING "%s: "
					       "rhine_interrupt() Tx engine "
					       "still on.\n", dev->name);
			}
			rhine_tx(dev);
		}

		/* Abnormal error summary/uncommon events handlers. */
		if (intr_status & (IntrPCIErr | IntrLinkChange |
				   IntrStatsMax | IntrTxError | IntrTxAborted |
				   IntrTxUnderrun | IntrTxDescRace))
			rhine_error(dev, intr_status);

		/* Bound the time spent in the handler */
		if (--boguscnt < 0) {
			printk(KERN_WARNING "%s: Too much work at interrupt, "
			       "status=%#8.8x.\n",
			       dev->name, intr_status);
			break;
		}
	}

	if (debug > 3)
		printk(KERN_DEBUG "%s: exiting interrupt, status=%8.8x.\n",
		       dev->name, ioread16(ioaddr + IntrStatus));
	return IRQ_RETVAL(handled);
}

/*
 * Tx completion; logically part of the interrupt handler, but isolated
 * for clarity.
 *
 * Under rp->lock, walks the descriptors from dirty_tx towards cur_tx and
 * stops at the first one still owned by the chip.  For every finished
 * descriptor the statistics are updated, the buffer is unmapped (if it
 * was mapped) and the skb is freed.  A descriptor that failed with a
 * FIFO-type error is handed back to the chip for another attempt and the
 * walk stops there.  The queue is woken once fewer than TX_QUEUE_LEN - 4
 * frames are outstanding.
 */
static void rhine_tx(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	int txstatus = 0, entry = rp->dirty_tx % TX_RING_SIZE;

	spin_lock(&rp->lock);

	/* find and cleanup dirty tx descriptors */
	while (rp->dirty_tx != rp->cur_tx) {
		txstatus = le32_to_cpu(rp->tx_ring[entry].tx_status);
		if (debug > 6)
			printk(KERN_DEBUG "Tx scavenge %d status %8.8x.\n",
			       entry, txstatus);
		/* Still owned by the chip: nothing further has completed */
		if (txstatus & TSR_OWN)
			break;
		/* Bit 15 set: the transmission ended with an error */
		if (txstatus & 0x8000) {
			if (debug > 1)
				printk(KERN_DEBUG "%s: Transmit error, "
				       "Tx status %8.8x.\n",
				       dev->name, txstatus);
			dev->stats.tx_errors++;
			if (txstatus & 0x0400)
				dev->stats.tx_carrier_errors++;
			if (txstatus & 0x0200)
				dev->stats.tx_window_errors++;
			if (txstatus & 0x0100)
				dev->stats.tx_aborted_errors++;
			if (txstatus & 0x0080)
				dev->stats.tx_heartbeat_errors++;
			if (((rp->hw_flags & RHINE_I) && txstatus & 0x0002) ||
			    (txstatus & 0x0800) || (txstatus & 0x1000)) {
				dev->stats.tx_fifo_errors++;
				/* Give the descriptor back to the chip unchanged otherwise */
				rp->tx_ring[entry].tx_status = cpu_to_le32(TSR_OWN);
				break; /* Keep the skb - we try again */
			}
			/* Transmitter restarted in 'abnormal' handler. */
		} else {
			/* The collision count sits at a different bit offset on Rhine-I */
			if (rp->hw_flags & RHINE_I)
				dev->stats.collisions += (txstatus >> 3) & 0x0F;
			else
				dev->stats.collisions += txstatus & 0x0F;
			if (debug > 6)
				printk(KERN_DEBUG "collisions: %1.1x:%1.1x\n",
				       (txstatus >> 3) & 0xF,
				       txstatus & 0xF);
			dev->stats.tx_bytes += rp->tx_skbuff[entry]->len;
			dev->stats.tx_packets++;
		}
		/* Free the original skb. */
		if (rp->tx_skbuff_dma[entry]) {
			pci_unmap_single(rp->pdev,
					 rp->tx_skbuff_dma[entry],
					 rp->tx_skbuff[entry]->len,
					 PCI_DMA_TODEVICE);
		}
		dev_kfree_skb_irq(rp->tx_skbuff[entry]);
		rp->tx_skbuff[entry] = NULL;
		entry = (++rp->dirty_tx) % TX_RING_SIZE;
	}
	/* Restart the queue only after some headroom has been regained */
	if ((rp->cur_tx - rp->dirty_tx) < TX_QUEUE_LEN - 4)
		netif_wake_queue(dev);

	spin_unlock(&rp->lock);
}

/**
 * rhine_get_vlan_tci - fetch the VLAN TCI the chip stored behind a frame
 * @skb: socket buffer whose data area holds the received frame
 * @data_size: number of bytes used in the buffer, CRC included
 *
 * With hardware tag extraction active, a tagged frame is followed by its
 * 802.1Q header (2 bytes TPID, then 2 bytes TCI), placed at the next 4-byte
 * boundary after the CRC.  The TCI is returned in host byte order.
 */
static inline u16 rhine_get_vlan_tci(struct sk_buff *skb, int data_size)
{
	/* Round the used length up to the next multiple of four. */
	int tag_offset = (data_size + 3) & ~3;
	/* Step over the 2-byte TPID to reach the TCI. */
	u8 *tci = (u8 *)skb->data + tag_offset + 2;

	return ntohs(*(u16 *)tci);
}

/*
 * Set skb->ip_summed to CHECKSUM_UNNECESSARY when the descriptor status
 * permits it.
 *
 * Nothing is changed when RSR0_FRAG is set in rx_status.  Otherwise the
 * status word kept in desc_length must have both PQSTS_IPKT and PQSTS_IPOK
 * set; if it additionally flags PQSTS_TCPKT or PQSTS_UDPKT, PQSTS_TUOK is
 * required as well.  In every other case ip_summed is left as the caller
 * set it.
 */
static inline void rhine_rx_csum(struct rx_desc *rd, struct sk_buff *skb)
{
	u32 status;

	if (rd->rx_status & cpu_to_le32(RSR0_FRAG))
		return;

	/*
	 * The descriptor field is little-endian; convert it to CPU order
	 * before testing host-order flag constants.
	 */
	status = le32_to_cpu(rd->desc_length);
	if ((status & PQSTS_IPKT) && (status & PQSTS_IPOK)) {
		if (((status & PQSTS_TCPKT) || (status & PQSTS_UDPKT)) &&
		    (!(status & PQSTS_TUOK)))
			return;
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
}

/*
 * Process up to limit frames from receive ring.
 *
 * Walks the ring starting at rp->rx_head_desc until a descriptor with
 * RSR_OWN set is met or limit descriptors have been handled.  Good frames
 * are passed to netif_receive_skb(); errored ones only update dev->stats.
 * Afterwards every descriptor between rp->dirty_rx and rp->cur_rx is given
 * a buffer again (if it lost one) and marked with RSR_OWN.
 *
 * Returns the number of descriptors handled by the first loop, errored
 * frames included.
 */
static int rhine_rx(struct net_device *dev, int limit)
{
	struct rhine_private *rp = netdev_priv(dev);
	int count;
	int entry = rp->cur_rx % RX_RING_SIZE;

	if (debug > 4) {
		printk(KERN_DEBUG "%s: rhine_rx(), entry %d status %8.8x.\n",
		       dev->name, entry,
		       le32_to_cpu(rp->rx_head_desc->rx_status));
	}

	/* If EOP is set on the next entry, it's a new packet. Send it up. */
	for (count = 0; count < limit; ++count) {
		struct rx_desc *desc = rp->rx_head_desc;
		u32 desc_status = le32_to_cpu(desc->rx_status);
		u32 desc_length = le32_to_cpu(desc->desc_length);
		/* The upper 16 bits of the status word carry the frame length. */
		int data_size = desc_status >> 16;

		/* Stop at the first descriptor still flagged RSR_OWN. */
		if (desc_status & RSR_OWN)
			break;

		if (debug > 4)
			printk(KERN_DEBUG "rhine_rx() status is %8.8x.\n",
			       desc_status);

		/* Anything other than "whole packet, no error" is only counted. */
		if ((desc_status & (RSR1_WHOLE_PKT | RSR0_ERR)) != RSR1_WHOLE_PKT) {
			if ((desc_status & RSR1_WHOLE_PKT) != RSR1_WHOLE_PKT) {
				printk(KERN_WARNING "%s: Oversized Ethernet "
				       "frame spanned multiple buffers, entry "
				       "%#x length %d status %8.8x!\n",
				       dev->name, entry, data_size,
				       desc_status);
				printk(KERN_WARNING "%s: Oversized Ethernet "
				       "frame %p vs %p.\n", dev->name,
				       rp->rx_head_desc, &rp->rx_ring[entry]);
				dev->stats.rx_length_errors++;
			} else if (desc_status & RSR0_ERR) {
				/* There was a error. */
				if (debug > 2)
					printk(KERN_DEBUG "rhine_rx() Rx "
					       "error was %8.8x.\n",
					       desc_status);
				dev->stats.rx_errors++;
				if (desc_status & 0x0030)
					dev->stats.rx_length_errors++;
				if (desc_status & 0x0048)
					dev->stats.rx_fifo_errors++;
				if (desc_status & 0x0004)
					dev->stats.rx_frame_errors++;
				if (desc_status & 0x0002) {
					/* this can also be updated outside the interrupt handler */
					spin_lock(&rp->lock);
					dev->stats.rx_crc_errors++;
					spin_unlock(&rp->lock);
				}
			}
		} else {
			struct sk_buff *skb = NULL;
			/* Length should omit the CRC */
			int pkt_len = data_size - 4;
			u16 vlan_tci = 0;

			/* Check if the packet is long enough to accept without
			   copying to a minimally-sized skbuff. */
			if (pkt_len < rx_copybreak)
				skb = netdev_alloc_skb_ip_align(dev, pkt_len);
			if (skb) {
				/*
				 * Short frame and a fresh skb was available:
				 * copy the data out and leave the ring buffer
				 * in its slot.
				 */
				pci_dma_sync_single_for_cpu(rp->pdev,
							    rp->rx_skbuff_dma[entry],
							    rp->rx_buf_sz,
							    PCI_DMA_FROMDEVICE);

				skb_copy_to_linear_data(skb,
						 rp->rx_skbuff[entry]->data,
						 pkt_len);
				skb_put(skb, pkt_len);
				pci_dma_sync_single_for_device(rp->pdev,
							       rp->rx_skbuff_dma[entry],
							       rp->rx_buf_sz,
							       PCI_DMA_FROMDEVICE);
			} else {
				/*
				 * Either the frame is not below rx_copybreak
				 * or the allocation failed: pass the ring
				 * buffer itself up and clear the slot so the
				 * refill loop below allocates a new one.
				 */
				skb = rp->rx_skbuff[entry];
				if (skb == NULL) {
					printk(KERN_ERR "%s: Inconsistent Rx "
					       "descriptor chain.\n",
					       dev->name);
					/* Leaves the loop without advancing cur_rx. */
					break;
				}
				rp->rx_skbuff[entry] = NULL;
				skb_put(skb, pkt_len);
				pci_unmap_single(rp->pdev,
						 rp->rx_skbuff_dma[entry],
						 rp->rx_buf_sz,
						 PCI_DMA_FROMDEVICE);
			}

			/* The TCI is read here, before eth_type_trans() is called on the skb. */
			if (unlikely(desc_length & PQSTS_TAG))
				vlan_tci = rhine_get_vlan_tci(skb, data_size);

			skb->protocol = eth_type_trans(skb, dev);

			/* Default to no checksum help; rhine_rx_csum() may upgrade it. */
			skb->ip_summed = CHECKSUM_NONE;
			if (rp->cfg_flags & RHINE_RX_CSUM)
				rhine_rx_csum(desc, skb);

			if (unlikely(desc_length & PQSTS_TAG))
				__vlan_hwaccel_put_tag(skb, vlan_tci);
			netif_receive_skb(skb);
			dev->stats.rx_bytes += pkt_len;
			dev->stats.rx_packets++;
		}
		/* Advance to the next ring slot. */
		entry = (++rp->cur_rx) % RX_RING_SIZE;
		rp->rx_head_desc = &rp->rx_ring[entry];
	}

	/* Refill the Rx ring buffers. */
	for (; rp->cur_rx - rp->dirty_rx > 0; rp->dirty_rx++) {
		struct sk_buff *skb;
		entry = rp->dirty_rx % RX_RING_SIZE;
		if (rp->rx_skbuff[entry] == NULL) {
			skb = netdev_alloc_skb(dev, rp->rx_buf_sz);
			rp->rx_skbuff[entry] = skb;
			if (skb == NULL)
				break;	/* Better luck next round. */
			skb->dev = dev;	/* Mark as being used by this device. */
			/* NOTE(review): the result of pci_map_single() is not checked for a mapping error here. */
			rp->rx_skbuff_dma[entry] =
				pci_map_single(rp->pdev, skb->data,
					       rp->rx_buf_sz,
					       PCI_DMA_FROMDEVICE);
			rp->rx_ring[entry].addr = cpu_to_le32(rp->rx_skbuff_dma[entry]);
		}
		/* Mark the descriptor with RSR_OWN again (the flag tested at the top of the Rx loop). */
		rp->rx_ring[entry].rx_status = cpu_to_le32(RSR_OWN);
	}

	return count;
}

/*
 * Clears the "tally counters" for CRC errors and missed frames(?).
 * It has been reported that some chips need a write of 0 to clear
 * these, for others the counters are set to 1 when written to and
 * instead cleared when read. So we clear them both ways ...
 *
 * @ioaddr: mapped register base of the chip.
 */
static inline void clear_tally_counters(void __iomem *ioaddr)
{
	/* Clear-by-write variant: a single 32-bit write of zero at RxMissed. */
	iowrite32(0, ioaddr + RxMissed);
	/* Clear-by-read variant: read both 16-bit counters and discard the values. */
	ioread16(ioaddr + RxCRCErrs);
	ioread16(ioaddr + RxMissed);
}

/*
 * Restart the transmitter at the oldest unreclaimed Tx descriptor
 * (rp->dirty_tx).  If the interrupt status already shows another Tx error,
 * nothing is written to the chip and only a warning is logged.
 */
static void rhine_restart_tx(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;
	int entry = rp->dirty_tx % TX_RING_SIZE;
	u32 intr_status;

	/*
	 * If new errors occurred, we need to sort them out before doing Tx.
	 * In that case the ISR will be back here RSN anyway.
	 */
	intr_status = get_intr_status(dev);

	if (intr_status & IntrTxErrSummary) {
		/* This should never happen */
		if (debug > 1)
			printk(KERN_WARNING "%s: rhine_restart_tx() "
			       "Another error occured %8.8x.\n",
			       dev->name, intr_status);
		return;
	}

	/* We know better than the chip where it should continue. */
	iowrite32(rp->tx_ring_dma + entry * sizeof(struct tx_desc),
		  ioaddr + TxRingPtr);

	iowrite8(ioread8(ioaddr + ChipCmd) | CmdTxOn, ioaddr + ChipCmd);

	/* Tx queues are bits 7-0 (first Tx queue: bit 7) */
	if (rp->tx_ring[entry].desc_length & cpu_to_le32(TCR_TAG))
		BYTE_REG_BITS_ON(1 << 7, ioaddr + TQWake);

	iowrite8(ioread8(ioaddr + ChipCmd1) | Cmd1TxDemand, ioaddr + ChipCmd1);
	IOSYNC;
}

/*
 * Handle the "abnormal" interrupt causes contained in intr_status.
 *
 * Runs entirely under rp->lock.  Deals with link changes, tally counter
 * overflow and the various Tx error conditions, restarts the transmitter
 * when any Tx error bit is set, and logs any status bit it does not know.
 */
static void rhine_error(struct net_device *dev, int intr_status)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;

	spin_lock(&rp->lock);

	if (intr_status & IntrLinkChange)
		rhine_check_media(dev, 0);
	if (intr_status & IntrStatsMax) {
		/* Fold the hardware tally counters into dev->stats, then clear them. */
		dev->stats.rx_crc_errors += ioread16(ioaddr + RxCRCErrs);
		dev->stats.rx_missed_errors += ioread16(ioaddr + RxMissed);
		clear_tally_counters(ioaddr);
	}
	if (intr_status & IntrTxAborted) {
		if (debug > 1)
			printk(KERN_INFO "%s: Abort %8.8x, frame dropped.\n",
			       dev->name, intr_status);
	}
	if (intr_status & IntrTxUnderrun) {
		/* Raise the Tx threshold by 0x20 per underrun while it is below 0xE0. */
		if (rp->tx_thresh < 0xE0)
			BYTE_REG_BITS_SET((rp->tx_thresh += 0x20), 0x80, ioaddr + TxConfig);
		if (debug > 1)
			printk(KERN_INFO "%s: Transmitter underrun, Tx "
			       "threshold now %2.2x.\n",
			       dev->name, rp->tx_thresh);
	}
	if (intr_status & IntrTxDescRace) {
		if (debug > 2)
			printk(KERN_INFO "%s: Tx descriptor write-back race.\n",
			       dev->name);
	}
	/* A Tx error with none of the specific cause bits set gets the same threshold bump. */
	if ((intr_status & IntrTxError) &&
	    (intr_status & (IntrTxAborted |
	     IntrTxUnderrun | IntrTxDescRace)) == 0) {
		if (rp->tx_thresh < 0xE0) {
			BYTE_REG_BITS_SET((rp->tx_thresh += 0x20), 0x80, ioaddr + TxConfig);
		}
		if (debug > 1)
			printk(KERN_INFO "%s: Unspecified error. Tx "
			       "threshold now %2.2x.\n",
			       dev->name, rp->tx_thresh);
	}
	/* Any of the Tx error causes requires the transmitter to be restarted. */
	if (intr_status & (IntrTxAborted | IntrTxUnderrun | IntrTxDescRace |
			   IntrTxError))
		rhine_restart_tx(dev);

	/* Report status bits that none of the branches above account for. */
	if (intr_status & ~(IntrLinkChange | IntrStatsMax | IntrTxUnderrun |
			    IntrTxError | IntrTxAborted | IntrNormalSummary |
			    IntrTxDescRace)) {
		if (debug > 1)
			printk(KERN_ERR "%s: Something Wicked happened! "
			       "%8.8x.\n", dev->name, intr_status);
	}

	spin_unlock(&rp->lock);
}

static struct net_device_stats *rhine_get_stats(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;
	unsigned long flags;

	spin_lock_irqsave(&rp->lock, flags);
	dev->stats.rx_crc_errors += ioread16(ioaddr + RxCRCErrs);
	dev->stats.rx_missed_errors += ioread16(ioaddr + RxMissed);
	clear_tally_counters(ioaddr);
	spin_unlock_irqrestore(&rp->lock, flags);

	return &dev->stats;
}

/*
 * Program the receive filter from dev->flags and the multicast list.
 *
 * - IFF_PROMISC: rx_mode 0x1C and an all-ones multicast hash.
 * - more than multicast_filter_limit addresses, or IFF_ALLMULTI: all-ones
 *   multicast hash.
 * - RHINE_VLAN hardware: up to MCAM_SIZE addresses are loaded into the
 *   multicast CAM and the matching mask is written.
 * - otherwise: a 64-bit hash filter is built from the top six bits of the
 *   Ethernet CRC of each address.
 *
 * On RHINE_VLAN hardware, VLAN ID filtering (BCR1_VIDFR) is switched off in
 * promiscuous mode and on otherwise.
 */
static void rhine_set_rx_mode(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;
	u32 mc_filter[2];	/* Multicast hash filter */
	u8 rx_mode = 0x0C;	/* Note: 0x02=accept runt, 0x01=accept errs */
	struct netdev_hw_addr *ha;

	if (dev->flags & IFF_PROMISC) {		/* Set promiscuous. */
		rx_mode = 0x1C;
		iowrite32(0xffffffff, ioaddr + MulticastFilter0);
		iowrite32(0xffffffff, ioaddr + MulticastFilter1);
	} else if ((netdev_mc_count(dev) > multicast_filter_limit) ||
		   (dev->flags & IFF_ALLMULTI)) {
		/* Too many to match, or accept all multicasts. */
		iowrite32(0xffffffff, ioaddr + MulticastFilter0);
		iowrite32(0xffffffff, ioaddr + MulticastFilter1);
	} else if (rp->hw_flags & RHINE_VLAN) {
		int i = 0;
		u32 mCAMmask = 0;	/* 32 mCAMs (6105M and better) */
		netdev_for_each_mc_addr(ha, dev) {
			if (i == MCAM_SIZE)
				break;
			rhine_set_cam(ioaddr, i, ha->addr);
			/* unsigned shift: i may reach 31, which would overflow a signed 1 */
			mCAMmask |= 1U << i;
			i++;
		}
		rhine_set_cam_mask(ioaddr, mCAMmask);
	} else {
		memset(mc_filter, 0, sizeof(mc_filter));
		netdev_for_each_mc_addr(ha, dev) {
			/* Top six CRC bits select one of the 64 hash bits. */
			int bit_nr = ether_crc(ETH_ALEN, ha->addr) >> 26;

			/* unsigned shift: bit 31 must not be reached via a signed 1 */
			mc_filter[bit_nr >> 5] |= 1U << (bit_nr & 31);
		}
		iowrite32(mc_filter[0], ioaddr + MulticastFilter0);
		iowrite32(mc_filter[1], ioaddr + MulticastFilter1);
	}
	/* enable/disable VLAN receive filtering */
	if (rp->hw_flags & RHINE_VLAN) {
		if (dev->flags & IFF_PROMISC)
			BYTE_REG_BITS_OFF(BCR1_VIDFR, ioaddr + PCIBusConfig1);
		else
			BYTE_REG_BITS_ON(BCR1_VIDFR, ioaddr + PCIBusConfig1);
	}
	BYTE_REG_BITS_ON(rx_mode, ioaddr + RxConfig);
}

/*
 * ethtool get_drvinfo handler: report driver name, driver version and the
 * PCI bus address of the device.  Copies are bounded by the size of each
 * destination field so an over-long source string cannot overrun it.
 */
static void netdev_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
	struct rhine_private *rp = netdev_priv(dev);

	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
	strlcpy(info->bus_info, pci_name(rp->pdev), sizeof(info->bus_info));
}

/*
 * ethtool get_settings handler: fill cmd via mii_ethtool_gset() while
 * holding rp->lock, and hand back that helper's return code.
 */
static int netdev_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct rhine_private *priv = netdev_priv(dev);
	int err;

	spin_lock_irq(&priv->lock);
	err = mii_ethtool_gset(&priv->mii_if, cmd);
	spin_unlock_irq(&priv->lock);

	return err;
}

/*
 * ethtool set_settings handler: apply cmd via mii_ethtool_sset() while
 * holding rp->lock, then refresh the carrier state once the lock has been
 * dropped.  Returns the result of mii_ethtool_sset().
 */
static int netdev_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
	struct rhine_private *priv = netdev_priv(dev);
	int err;

	spin_lock_irq(&priv->lock);
	err = mii_ethtool_sset(&priv->mii_if, cmd);
	spin_unlock_irq(&priv->lock);

	rhine_set_carrier(&priv->mii_if);

	return err;
}

/* ethtool nway_reset handler: forwards to mii_nway_restart() and returns its result. */
static int netdev_nway_reset(struct net_device *dev)
{
	struct rhine_private *priv = netdev_priv(dev);
	int rc = mii_nway_restart(&priv->mii_if);

	return rc;
}

/* ethtool get_link handler: returns whatever mii_link_ok() reports for this device. */
static u32 netdev_get_link(struct net_device *dev)
{
	struct rhine_private *priv = netdev_priv(dev);
	u32 link = mii_link_ok(&priv->mii_if);

	return link;
}

/*
 * ethtool get_msglevel handler: returns the driver-wide debug level.
 * The value is a single file-scope variable, not per-device state, so dev
 * is unused.
 */
static u32 netdev_get_msglevel(struct net_device *dev)
{
	return debug;
}

/*
 * ethtool set_msglevel handler: stores value in the driver-wide debug
 * level.  dev is unused, so the change is not limited to one device.
 */
static void netdev_set_msglevel(struct net_device *dev, u32 value)
{
	debug = value;
}

/*
 * ethtool get_wol handler: report the wake-up options the hardware supports
 * (hw_flags) and those currently selected (cfg_flags), both limited to
 * RHINE_WOL_BITS.  wol is left untouched when no wake-up bit is supported.
 */
static void rhine_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
{
	struct rhine_private *priv = netdev_priv(dev);

	if ((priv->hw_flags & RHINE_WOL_BITS) == 0)
		return;

	spin_lock_irq(&priv->lock);
	wol->supported = priv->hw_flags & RHINE_WOL_BITS;
	wol->wolopts = priv->cfg_flags & RHINE_WOL_BITS;
	spin_unlock_irq(&priv->lock);
}

/*
 * ethtool set_wol handler: replace the wake-up bits in cfg_flags with
 * wol->wolopts, keeping every other cfg_flags bit.
 *
 * Returns -EINVAL if an option outside the supported set
 * (hw_flags & RHINE_WOL_BITS) is requested, 0 otherwise.
 */
static int rhine_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
{
	struct rhine_private *priv = netdev_priv(dev);

	if (wol->wolopts & ~(priv->hw_flags & RHINE_WOL_BITS))
		return -EINVAL;

	spin_lock_irq(&priv->lock);
	priv->cfg_flags = (priv->cfg_flags & ~RHINE_WOL_BITS) | wol->wolopts;
	spin_unlock_irq(&priv->lock);

	return 0;
}

/* ethtool get_rx_csum handler: 1 if RHINE_RX_CSUM is set in cfg_flags, else 0. */
static u32 rhine_get_rx_csum(struct net_device *dev)
{
	struct rhine_private *priv = netdev_priv(dev);

	return !!(priv->cfg_flags & RHINE_RX_CSUM);
}

/*
 * ethtool set_rx_csum handler: set or clear RHINE_RX_CSUM in cfg_flags.
 *
 * When hw_flags lacks RHINE_RX_CSUM, a request to enable fails with -EINVAL
 * and a request to disable succeeds without changing anything.
 */
static int rhine_set_rx_csum(struct net_device *dev, u32 data)
{
	struct rhine_private *priv = netdev_priv(dev);

	/* No hardware support: only "off" is acceptable. */
	if ((priv->hw_flags & RHINE_RX_CSUM) == 0)
		return data ? -EINVAL : 0;

	if (!data)
		priv->cfg_flags &= ~RHINE_RX_CSUM;
	else
		priv->cfg_flags |= RHINE_RX_CSUM;

	return 0;
}

/*
 * ethtool set_tx_csum handler.
 *
 * When hw_flags lacks RHINE_TX_CSUM, a request to enable fails with -EINVAL
 * and a request to disable succeeds as a no-op.  Otherwise the request is
 * delegated to ethtool_op_set_tx_csum() and its result is returned rather
 * than being discarded.
 */
static int rhine_set_tx_csum(struct net_device *dev, u32 data)
{
	struct rhine_private *rp = netdev_priv(dev);

	if (!(rp->hw_flags & RHINE_TX_CSUM)) {
		if (data != 0)
			return -EINVAL;
		return 0;
	}

	return ethtool_op_set_tx_csum(dev, data);
}

/*
 * ethtool operations table for this driver.  All handlers are defined in
 * this file except get_tx_csum, which uses the generic
 * ethtool_op_get_tx_csum helper.
 */
static const struct ethtool_ops netdev_ethtool_ops = {
	.get_drvinfo		= netdev_get_drvinfo,
	.get_settings		= netdev_get_settings,
	.set_settings		= netdev_set_settings,
	.nway_reset		= netdev_nway_reset,
	.get_link		= netdev_get_link,
	.get_msglevel		= netdev_get_msglevel,
	.set_msglevel		= netdev_set_msglevel,
	.get_wol		= rhine_get_wol,
	.set_wol		= rhine_set_wol,
	.get_rx_csum		= rhine_get_rx_csum,
	.set_rx_csum		= rhine_set_rx_csum,
	.get_tx_csum		= ethtool_op_get_tx_csum,
	.set_tx_csum		= rhine_set_tx_csum,
};

/*
 * ioctl handler: pass MII requests to generic_mii_ioctl() under rp->lock and
 * refresh the carrier state afterwards.
 *
 * Returns -EINVAL if the interface is not running, otherwise the result of
 * generic_mii_ioctl().
 */
static int netdev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
	struct rhine_private *priv = netdev_priv(dev);
	int err;

	if (!netif_running(dev))
		return -EINVAL;

	spin_lock_irq(&priv->lock);
	err = generic_mii_ioctl(&priv->mii_if, if_mii(rq), cmd, NULL);
	spin_unlock_irq(&priv->lock);

	rhine_set_carrier(&priv->mii_if);

	return err;
}

/*
 * Bring the interface down: stop NAPI, the reset work item and the Tx
 * queue, quiesce the chip under rp->lock, then release the IRQ and free the
 * Rx/Tx buffers and the descriptor ring.  Always returns 0.
 */
static int rhine_close(struct net_device *dev)
{
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;

	/* Stop all software activity before touching the hardware. */
	napi_disable(&rp->napi);
	cancel_work_sync(&rp->reset_task);
	netif_stop_queue(dev);

	spin_lock_irq(&rp->lock);

	if (debug > 1)
		printk(KERN_DEBUG "%s: Shutting down ethercard, "
		       "status was %4.4x.\n",
		       dev->name, ioread16(ioaddr + ChipCmd));

	/* Switch to loopback mode to avoid hardware races. */
	iowrite8(rp->tx_thresh | 0x02, ioaddr + TxConfig);

	/* Disable interrupts by clearing the interrupt mask. */
	iowrite16(0x0000, ioaddr + IntrEnable);

	/* Stop the chip's Tx and Rx processes. */
	iowrite16(CmdStop, ioaddr + ChipCmd);

	spin_unlock_irq(&rp->lock);

	/* The IRQ is released first, then the buffers and the ring are freed. */
	free_irq(rp->pdev->irq, dev);
	free_rbufs(dev);
	free_tbufs(dev);
	free_ring(dev);

	return 0;
}


/*
 * PCI remove callback: unregister the net device, unmap the register window
 * and release the PCI regions, free the net_device, disable the PCI device
 * and clear the driver data pointer.
 */
static void __devexit rhine_remove_one(struct pci_dev *pdev)
{
	struct net_device *dev = pci_get_drvdata(pdev);
	struct rhine_private *rp = netdev_priv(dev);

	unregister_netdev(dev);

	/* rp lives inside dev, so rp->base must be used before free_netdev(). */
	pci_iounmap(pdev, rp->base);
	pci_release_regions(pdev);

	free_netdev(dev);
	pci_disable_device(pdev);
	pci_set_drvdata(pdev, NULL);
}

static void rhine_shutdown (struct pci_dev *pdev)
{
	struct net_device *dev = pci_get_drvdata(pdev);
	struct rhine_private *rp = netdev_priv(dev);
	void __iomem *ioaddr = rp->base;

	if (!(rp->hw_flags & RHINE_WOL))
		return; /* Nothing to do for non-WOL adapters */

	rhine_power_init(dev);

	/* Make sure we use pattern 0, 1 and not 4, 5 */
	if (rp->hw_flags & RHINE_6_PATS)
		iowrite8(0x04, ioaddr + WOLcgClr);

	if (rp->cfg_flags & WAKE_MAGIC) {
		iowrite8(WOLmagic, ioaddr + WOLcrSet);
		/*
		 * Turn EEPROM-controlled wake-up back on -- some hardware may
		 * not cooperate otherwise.
		 */
		iowrite8(ioread8(ioaddr + ConfigA) | 0x03, ioaddr + ConfigA);
	}

	if (rp->cfg_flags & (WAKE_BCAST | WAKE_MCAST))
		iowrite8(WOLbmcast, ioaddr + WOLcgSet);

	if (rp->cfg_flags & WAKE_PHY)
		iowrite8(WOLlnkon | WOLlnkoff, ioaddr + WOLcrSet);

	if (rp->cfg_flags & WAKE_UCAST)
		iowrite8(WOLucast, ioaddr + WOLcrSet);

	if (rp->cfg_flags) {
		/* Enable legacy WOL (for old motherboards) */
		iowrite8(0x01, ioaddr + PwcfgSet);
		iowrite8(ioread8(ioaddr + StickyHW) | 0x04, ioaddr + StickyHW);
	}

	/* Hit power state D3 (sleep) */
	if (!avoid_D3)
		iowrite8(ioread8(ioaddr + StickyHW) | 0x03, ioaddr + StickyHW);

	/* TODO: Check use of pci_enable_wake() */

}

#ifdef CONFIG_PM
/*
 * PCI suspend callback.  Does nothing for an interface that is not running.
 * Otherwise stops NAPI, detaches the net device, saves PCI state, runs
 * rhine_shutdown() under rp->lock and releases the IRQ.  Always returns 0;
 * the state argument is not used.
 */
static int rhine_suspend(struct pci_dev *pdev, pm_message_t state)
{
	struct net_device *dev = pci_get_drvdata(pdev);
	struct rhine_private *rp = netdev_priv(dev);
	unsigned long flags;

	if (!netif_running(dev))
		return 0;

	napi_disable(&rp->napi);

	netif_device_detach(dev);
	pci_save_state(pdev);

	/* rhine_shutdown() takes no lock itself; hold rp->lock around it. */
	spin_lock_irqsave(&rp->lock, flags);
	rhine_shutdown(pdev);
	spin_unlock_irqrestore(&rp->lock, flags);

	free_irq(dev->irq, dev);
	return 0;
}

/*
 * PCI resume callback.  Does nothing for an interface that is not running.
 * Otherwise re-requests the IRQ, moves the device to D0, restores PCI state,
 * rebuilds the Tx/Rx buffers and reprograms the chip under rp->lock, then
 * re-attaches the net device.  Always returns 0.
 */
static int rhine_resume(struct pci_dev *pdev)
{
	struct net_device *dev = pci_get_drvdata(pdev);
	struct rhine_private *rp = netdev_priv(dev);
	unsigned long flags;
	int ret;

	if (!netif_running(dev))
		return 0;

	/* NOTE(review): a request_irq() failure is only logged; resume carries on regardless. */
	if (request_irq(dev->irq, rhine_interrupt, IRQF_SHARED, dev->name, dev))
		printk(KERN_ERR "via-rhine %s: request_irq failed\n", dev->name);

	/* ret is used for the debug message only; a failure does not abort the resume. */
	ret = pci_set_power_state(pdev, PCI_D0);
	if (debug > 1)
		printk(KERN_INFO "%s: Entering power state D0 %s (%d).\n",
			dev->name, ret ? "failed" : "succeeded", ret);

	pci_restore_state(pdev);

	spin_lock_irqsave(&rp->lock, flags);
#ifdef USE_MMIO
	enable_mmio(rp->pioaddr, rp->hw_flags);
#endif
	rhine_power_init(dev);
	/* Drop the old buffers and set up fresh ones before reprogramming the chip. */
	free_tbufs(dev);
	free_rbufs(dev);
	alloc_tbufs(dev);
	alloc_rbufs(dev);
	init_registers(dev);
	spin_unlock_irqrestore(&rp->lock, flags);

	netif_device_attach(dev);

	return 0;
}
#endif /* CONFIG_PM */

/*
 * PCI driver description registered by rhine_init().  The suspend/resume
 * hooks are only present when CONFIG_PM is enabled.
 */
static struct pci_driver rhine_driver = {
	.name		= DRV_NAME,
	.id_table	= rhine_pci_tbl,
	.probe		= rhine_init_one,
	.remove		= __devexit_p(rhine_remove_one),
#ifdef CONFIG_PM
	.suspend	= rhine_suspend,
	.resume		= rhine_resume,
#endif /* CONFIG_PM */
	.shutdown =	rhine_shutdown,
};

/*
 * DMI matches checked by rhine_init(); a hit makes the driver set avoid_D3.
 * Each entry is matched on BIOS vendor and BIOS version.
 */
static struct dmi_system_id __initdata rhine_dmi_table[] = {
	{
		.ident = "EPIA-M",
		.matches = {
			DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."),
			DMI_MATCH(DMI_BIOS_VERSION, "6.00 PG"),
		},
	},
	{
		.ident = "KV7",
		.matches = {
			DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
			DMI_MATCH(DMI_BIOS_VERSION, "6.00 PG"),
		},
	},
	{ NULL }	/* terminating entry */
};

/*
 * Module entry point: print the version banner when built as a module,
 * force avoid_D3 on systems listed in rhine_dmi_table, and register the PCI
 * driver.  Returns the result of pci_register_driver().
 */
static int __init rhine_init(void)
{
/* when a module, this is printed whether or not devices are found in probe */
#ifdef MODULE
	printk(version);
#endif
	if (dmi_check_system(rhine_dmi_table)) {
		/* these BIOSes fail at PXE boot if chip is in D3 */
		avoid_D3 = 1;
		printk(KERN_WARNING "%s: Broken BIOS detected, avoid_D3 "
				    "enabled.\n",
		       DRV_NAME);
	}
	/* No DMI match: only report when avoid_D3 was already set by other means. */
	else if (avoid_D3)
		printk(KERN_INFO "%s: avoid_D3 set.\n", DRV_NAME);

	return pci_register_driver(&rhine_driver);
}


/* Module exit point: unregister the PCI driver registered by rhine_init(). */
static void __exit rhine_cleanup(void)
{
	pci_unregister_driver(&rhine_driver);
}


/* Hook rhine_init()/rhine_cleanup() up as the module's load and unload handlers. */
module_init(rhine_init);
module_exit(rhine_cleanup);


^ permalink raw reply

* Re: [PATCH] don't allow CAP_NET_ADMIN to load non-netdev kernel modules
From: Ben Hutchings @ 2011-02-25 19:53 UTC (permalink / raw)
  To: David Miller
  Cc: segoon, netdev, linux-kernel, kuznet, pekkas, jmorris, yoshfuji,
	kaber, eric.dumazet, therbert, xiaosuo, jesse, kees.cook, eugene,
	dan.j.rosenberg, akpm
In-Reply-To: <20110225.114351.28809001.davem@davemloft.net>

On Fri, 2011-02-25 at 11:43 -0800, David Miller wrote:
> From: Ben Hutchings <bhutchings@solarflare.com>
> Date: Fri, 25 Feb 2011 19:30:16 +0000
> 
> > On Fri, 2011-02-25 at 11:16 -0800, David Miller wrote:
> >> From: Ben Hutchings <bhutchings@solarflare.com>
> >> Date: Fri, 25 Feb 2011 19:07:59 +0000
> >> 
> >> > You realise that module loading doesn't actually run in the context of
> >> > request_module(), right?
> >> 
> >> Why is that a barrier?  We could simply pass a capability mask into
> >> request_module if necessary.
> >> 
> >> It's an implementation detail, and not a deterrent to my suggested
> >> scheme.
> > 
> > It's not an implementation detail.  modprobe currently runs with full
> > capabilities; your proposal requires its capabilities to be limited to
> > those of the capabilities of the process that triggered the
> > request_module() (plus, presumably, CAP_SYS_MODULE).
> 
> The idea was that the kernel will be the entity that will inspect the
> elf sections and validate the capability bits, not the userspace
> module loader.

Yes, I understand that.

> Surely if we can pass an arbitrary string out to the loading
> process as part of the module loading context, we can pass along
> capability bits as well.

If you want insert_module() to be able to deny loading some modules
based on the capabilities of the process calling request_module() then
you either have to *reduce* the capabilities given to modprobe or create
some extra process state, separate from the usual capability state,
specifically for this purpose.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: pull request: wireless-next-2.6 2011-02-22
From: John W. Linville @ 2011-02-25 19:48 UTC (permalink / raw)
  To: David Miller; +Cc: linux-wireless, linux-bluetooth, netdev, padovan
In-Reply-To: <20110225.111500.59674472.davem@davemloft.net>

On Fri, Feb 25, 2011 at 11:15:00AM -0800, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Thu, 24 Feb 2011 22:43:44 -0800 (PST)

> > Pulled, thanks a lot John.
> 
> John a few things:
> 
> 1) I had to add some vmalloc.h includes to fix the build on sparc64,
>    see commit b08cd667c4b6641c4d16a3f87f4550f81a6d69ac in net-next-2.6

I have a patch in my tree for that -- seems they hit it on ARM as well.

> 2) Something is screwy with the bluetooth config options now.
> 
>    I have an allmodconfig tree, and when I run "make oldconfig" after
>    this pull, BT_L2CAP and BT_SCO both prompt me, claiming that they
>    can only be built statically.
> 
>    I give it 'y' just to make it happen, for both, and afterwards no
>    matter how many times I rerun "make oldconfig" I keep seeing things
>    like this in my build:
> 
> scripts/kconfig/conf --silentoldconfig Kconfig
> include/config/auto.conf:986:warning: symbol value 'm' invalid for BT_SCO
> include/config/auto.conf:3156:warning: symbol value 'm' invalid for BT_L2CAP
> 
>    First, what the heck is going on here?  Second, why the heck can't these
>    non-trivial pieces of code be built modular any more?
> 
>    You can't make something "bool", have it depend on something that
>    might be modular, and then build it into what could in fact be a
>    module.  That's exactly what the bluetooth stuff seems to be doing
>    now.
> 
>    I suspect commit 642745184f82688eb3ef0cdfaa4ba632055be9af
> 
> Thanks.

Sorry, I overlooked that.  Hopefully Gustavo will figure it out quickly.

Thanks,

John
-- 
John W. Linville		Someday the world will need a hero, and you
linville@tuxdriver.com			might be all we have.  Be ready.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox