Netdev List
 help / color / mirror / Atom feed
* Re: [net-2.6 PATCH] e1000e: enable/disable ASPM L0s and L1 and ERT according to hardware errata
From: David Miller @ 2010-04-29 19:04 UTC (permalink / raw)
  To: bruce.w.allan; +Cc: anton, jeffrey.t.kirsher, netdev, gospo, mjg
In-Reply-To: <8DD2590731AB5D4C9DBF71A877482A9062274501@orsmsx509.amr.corp.intel.com>

From: "Allan, Bruce W" <bruce.w.allan@intel.com>
Date: Thu, 29 Apr 2010 10:19:56 -0700

> Your patch is probably the correct thing to do but I'm not all that
> familiar with the ppc64 architecture.  Would you please provide the
> output of 'lspci -t' and 'lspci -vvv -xxx'.

You're not guarenteed for there to be a pci_dev backing the top-level
host controller, at the very least.  Some platforms don't even implement
the PCI config space for the host controller, whilst on others access
to them is protected by the hypervisor.

So you can't go poking around the PCI host controller registers
unconditionally.

The same OOPS probably would happen on Sparc64 in some configurations
too.  Although all of my PCI-E slots do have PCI-E express switch port
nodes, so maybe it wouldn't trigger here.


^ permalink raw reply

* Re: [PATCH] bonding: fix arp_validate on bonds inside a bridge
From: Jay Vosburgh @ 2010-04-29 18:57 UTC (permalink / raw)
  To: Jiri Bohac; +Cc: bonding-devel, netdev
In-Reply-To: <20100429163921.GA4228@midget.suse.cz>

Jiri Bohac <jbohac@suse.cz> wrote:

>On Wed, Apr 28, 2010 at 12:05:11PM -0700, Jay Vosburgh wrote:
>> Jiri Bohac <jbohac@suse.cz> wrote:
>> >--- a/include/linux/netdevice.h
>> >+++ b/include/linux/netdevice.h
>> >@@ -2055,6 +2055,8 @@ static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
>> > 	}
>> > }
>> >
>> >+extern void (*bond_handle_arp_hook)(struct sk_buff *skb);
>> 
>> 	Should this be inside the the skb_bond_should_drop function to
>> limit its scope?  Just wondering if that's a little tidier.
>
>No, this needs to be global, so that the bonding module can
>initialize it with the correct address.
>
>
>> >--- a/net/core/dev.c
>> >+++ b/net/core/dev.c
>> >@@ -2314,6 +2314,9 @@ static inline int deliver_skb(struct sk_buff *skb,
>> > 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
>> > }
>> >
>> >+void (*bond_handle_arp_hook)(struct sk_buff *skb);
>> >+EXPORT_SYMBOL_GPL(bond_handle_arp_hook);
>> 
>> 	Should this be hidden by
>> 
>> #if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
>> 
>> 	with some parallel changes to skb_bond_should_drop so it
>> vanishes if bonding is not configured?  Granted, distros will all turn
>> it on anyway, but the embedded size might be a bit smaller.
>
>Sure, good idea. I put the whole skb_bonding_should_drop function
>in an #ifdef, which will actually speed up the rx path if
>CONFIG_BONDING is not set. A new version follows:

	This doesn't apply to the current net-next-2.6 (because
skb_bond_should_drop was pulled out of line a few weeks ago); can you
update the patch?

	-J

>----
>bonding with arp_validate does not currently work when the
>bonding master is part of a bridge. This is because
>bond_arp_rcv() is registered as a packet type handler for ARP,
>but before netif_receive_skb() processes the ptype_base hash
>table, handle_bridge() is called and changes the skb->dev to
>point to the bridge device.
>
>This patch makes bonding_should_drop() call the bonding ARP
>handler directly if a IFF_MASTER_NEEDARP flag is set on the
>bonding master.  bond_register_arp() now only needs to set the
>IFF_MASTER_NEEDARP flag.
>
>We no longer need special ARP handling for inactive slaves, hence
>IFF_SLAVE_NEEDARP is not needed.
>
>skb_reset_network_header() and skb_reset_transport_header() need
>to be called before the call to bonding_should_drop() because
>bond_handle_arp() needs the offsets initialized.
>
>As a side-effect, skb_bond_should_drop is #defined as 0
>when CONFIG_BONDING is not set.
>
>Signed-off-by: Jiri Bohac <jbohac@suse.cz>
>
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 0075514..cafd404 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -1940,8 +1940,7 @@ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
> 	}
>
> 	slave_dev->priv_flags &= ~(IFF_MASTER_8023AD | IFF_MASTER_ALB |
>-				   IFF_SLAVE_INACTIVE | IFF_BONDING |
>-				   IFF_SLAVE_NEEDARP);
>+				   IFF_SLAVE_INACTIVE | IFF_BONDING);
>
> 	kfree(slave);
>
>@@ -2612,11 +2611,12 @@ static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32
> 	}
> }
>
>-static int bond_arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
>+static void bond_handle_arp(struct sk_buff *skb)
> {
> 	struct arphdr *arp;
> 	struct slave *slave;
> 	struct bonding *bond;
>+	struct net_device *dev = skb->dev->master, *orig_dev = skb->dev;
> 	unsigned char *arp_ptr;
> 	__be32 sip, tip;
>
>@@ -2637,9 +2637,8 @@ static int bond_arp_rcv(struct sk_buff *skb, struct net_device *dev, struct pack
> 	bond = netdev_priv(dev);
> 	read_lock(&bond->lock);
>
>-	pr_debug("bond_arp_rcv: bond %s skb->dev %s orig_dev %s\n",
>-		 bond->dev->name, skb->dev ? skb->dev->name : "NULL",
>-		 orig_dev ? orig_dev->name : "NULL");
>+	pr_debug("bond_handle_arp: bond: %s, master: %s, slave: %s\n",
>+		bond->dev->name, dev->name, orig_dev->name);
>
> 	slave = bond_get_slave_by_dev(bond, orig_dev);
> 	if (!slave || !slave_do_arp_validate(bond, slave))
>@@ -2684,8 +2683,7 @@ static int bond_arp_rcv(struct sk_buff *skb, struct net_device *dev, struct pack
> out_unlock:
> 	read_unlock(&bond->lock);
> out:
>-	dev_kfree_skb(skb);
>-	return NET_RX_SUCCESS;
>+	return;
> }
>
> /*
>@@ -3567,23 +3565,12 @@ static void bond_unregister_lacpdu(struct bonding *bond)
>
> void bond_register_arp(struct bonding *bond)
> {
>-	struct packet_type *pt = &bond->arp_mon_pt;
>-
>-	if (pt->type)
>-		return;
>-
>-	pt->type = htons(ETH_P_ARP);
>-	pt->dev = bond->dev;
>-	pt->func = bond_arp_rcv;
>-	dev_add_pack(pt);
>+	bond->dev->priv_flags |= IFF_MASTER_NEEDARP;
> }
>
> void bond_unregister_arp(struct bonding *bond)
> {
>-	struct packet_type *pt = &bond->arp_mon_pt;
>-
>-	dev_remove_pack(pt);
>-	pt->type = 0;
>+	bond->dev->priv_flags &= ~IFF_MASTER_NEEDARP;
> }
>
> /*---------------------------- Hashing Policies -----------------------------*/
>@@ -5041,6 +5028,7 @@ static int __init bonding_init(void)
> 	register_netdevice_notifier(&bond_netdev_notifier);
> 	register_inetaddr_notifier(&bond_inetaddr_notifier);
> 	bond_register_ipv6_notifier();
>+	bond_handle_arp_hook = bond_handle_arp;
> out:
> 	return res;
> err:
>@@ -5061,6 +5049,7 @@ static void __exit bonding_exit(void)
>
> 	rtnl_link_unregister(&bond_link_ops);
> 	unregister_pernet_subsys(&bond_net_ops);
>+	bond_handle_arp_hook = NULL;
> }
>
> module_init(bonding_init);
>diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
>index 257a7a4..57adfe5 100644
>--- a/drivers/net/bonding/bonding.h
>+++ b/drivers/net/bonding/bonding.h
>@@ -212,7 +212,6 @@ struct bonding {
> 	struct   bond_params params;
> 	struct   list_head vlan_list;
> 	struct   vlan_group *vlgrp;
>-	struct   packet_type arp_mon_pt;
> 	struct   workqueue_struct *wq;
> 	struct   delayed_work mii_work;
> 	struct   delayed_work arp_work;
>@@ -292,14 +291,12 @@ static inline void bond_set_slave_inactive_flags(struct slave *slave)
> 	if (!bond_is_lb(bond))
> 		slave->state = BOND_STATE_BACKUP;
> 	slave->dev->priv_flags |= IFF_SLAVE_INACTIVE;
>-	if (slave_do_arp_validate(bond, slave))
>-		slave->dev->priv_flags |= IFF_SLAVE_NEEDARP;
> }
>
> static inline void bond_set_slave_active_flags(struct slave *slave)
> {
> 	slave->state = BOND_STATE_ACTIVE;
>-	slave->dev->priv_flags &= ~(IFF_SLAVE_INACTIVE | IFF_SLAVE_NEEDARP);
>+	slave->dev->priv_flags &= ~IFF_SLAVE_INACTIVE;
> }
>
> static inline void bond_set_master_3ad_flags(struct bonding *bond)
>diff --git a/include/linux/if.h b/include/linux/if.h
>index 3a9f410..84ab2c8 100644
>--- a/include/linux/if.h
>+++ b/include/linux/if.h
>@@ -63,7 +63,7 @@
> #define IFF_MASTER_8023AD	0x8	/* bonding master, 802.3ad. 	*/
> #define IFF_MASTER_ALB	0x10		/* bonding master, balance-alb.	*/
> #define IFF_BONDING	0x20		/* bonding master or slave	*/
>-#define IFF_SLAVE_NEEDARP 0x40		/* need ARPs for validation	*/
>+#define IFF_MASTER_NEEDARP 0x40		/* need ARPs for validation	*/
> #define IFF_ISATAP	0x80		/* ISATAP interface (RFC4214)	*/
> #define IFF_MASTER_ARPMON 0x100		/* bonding master, ARP mon in use */
> #define IFF_WAN_HDLC	0x200		/* WAN HDLC device		*/
>diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>index fa8b476..1ad9b71 100644
>--- a/include/linux/netdevice.h
>+++ b/include/linux/netdevice.h
>@@ -2055,6 +2055,9 @@ static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
> 	}
> }
>
>+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
>+extern void (*bond_handle_arp_hook)(struct sk_buff *skb);
>+
> /* On bonding slaves other than the currently active slave, suppress
>  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
>  * ARP on active-backup slaves with arp_validate enabled.
>@@ -2076,11 +2079,13 @@ static inline int skb_bond_should_drop(struct sk_buff *skb,
> 			skb_bond_set_mac_by_master(skb, master);
> 		}
>
>-		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
>-			if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
>-			    skb->protocol == __cpu_to_be16(ETH_P_ARP))
>-				return 0;
>+		/* pass ARP frames directly to bonding
>+		   before bridging or other hooks change them */
>+		if ((master->priv_flags & IFF_MASTER_NEEDARP) &&
>+		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
>+			bond_handle_arp_hook(skb);
>
>+		if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
> 			if (master->priv_flags & IFF_MASTER_ALB) {
> 				if (skb->pkt_type != PACKET_BROADCAST &&
> 				    skb->pkt_type != PACKET_MULTICAST)
>@@ -2095,6 +2100,9 @@ static inline int skb_bond_should_drop(struct sk_buff *skb,
> 	}
> 	return 0;
> }
>+#else
>+#define skb_bond_should_drop(a, b) 0
>+#endif
>
> extern struct pernet_operations __net_initdata loopback_net_ops;
>
>diff --git a/net/core/dev.c b/net/core/dev.c
>index f769098..f9821f1 100644
>--- a/net/core/dev.c
>+++ b/net/core/dev.c
>@@ -2314,6 +2314,11 @@ static inline int deliver_skb(struct sk_buff *skb,
> 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
> }
>
>+#if defined(CONFIG_BONDING) || defined(CONFIG_BONDING_MODULE)
>+void (*bond_handle_arp_hook)(struct sk_buff *skb);
>+EXPORT_SYMBOL_GPL(bond_handle_arp_hook);
>+#endif
>+
> #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
>
> #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
>@@ -2507,6 +2512,10 @@ int netif_receive_skb(struct sk_buff *skb)
> 	if (!skb->skb_iif)
> 		skb->skb_iif = skb->dev->ifindex;
>
>+	skb_reset_network_header(skb);
>+	skb_reset_transport_header(skb);
>+	skb->mac_len = skb->network_header - skb->mac_header;
>+
> 	null_or_orig = NULL;
> 	orig_dev = skb->dev;
> 	master = ACCESS_ONCE(orig_dev->master);
>@@ -2519,10 +2528,6 @@ int netif_receive_skb(struct sk_buff *skb)
>
> 	__get_cpu_var(netdev_rx_stat).total++;
>
>-	skb_reset_network_header(skb);
>-	skb_reset_transport_header(skb);
>-	skb->mac_len = skb->network_header - skb->mac_header;
>-
> 	pt_prev = NULL;
>
> 	rcu_read_lock();
>
>
>Thanks,
>
>-- 
>Jiri Bohac <jbohac@suse.cz>
>SUSE Labs, SUSE CZ
>

---
	-Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com

^ permalink raw reply

* Re: patch sysfs-implement-sysfs-tagged-directory-support.patch added to gregkh-2.6 tree
From: Greg KH @ 2010-04-30 15:58 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Tejun Heo, Eric W. Biederman, bcrl, benjamin.thery, cornelia.huck,
	eric.dumazet, kay.sievers, netdev
In-Reply-To: <20100430154320.GC13977@us.ibm.com>

On Fri, Apr 30, 2010 at 10:43:21AM -0500, Serge E. Hallyn wrote:
> Quoting Tejun Heo (tj@kernel.org):
> > Hello,
> > 
> > On 04/30/2010 04:29 PM, Serge E. Hallyn wrote:
> > > Hmm, but looking back over the previous thread (Mar 31) I guess you
> > > mean more in-line comments around the callbacks, presumably things
> > > like class_dir_child_ns_type() and struct kobj_ns_type_operations
> > > members?
> > 
> > In-line.  What they're, how they're supposed to be used, which calling
> > context is expected, what can be returned and so on.
> > 
> > > It sounds like what you'd really like is to have any explicit
> > > mention to namespaces pulled out of drivers/base (layering as you
> > > keep saying)?  But will there be a use for this outside of
> > > namespaces?  Does trying to anticipate that fall into the category
> > > of over-abstraction?
> > 
> > I wouldn't mind limited amount of layering exceptions as long as
> > they're clearly documented.  What I'm primarily worried about is not
> > the possibility of other users but more the obfuscation of the whole
> > sysfs-kobject-driver model thing which is already overly abstracted
> > and obfuscated (at least it seems to me that way).
> > 
> > NS needs tagged support in the driver model which in itself is fine
> > and I also understand that from someone who's primarily working on NS,
> > adding a bit on top of the whole thing wouldn't seem like much of a
> > problem.  To me it seems like worsening a problem which is already
> > pretty bad.  I hope you could understand my POV too.
> 
> I do.  I can take a stab monday at pushing a cloned version of Eric's
> tree with comments added, if Eric doesn't have time.  (Or a patch on
> top of Greg's tree)

On top of Greg's tree please :)

thanks,

greg k-h

^ permalink raw reply

* Re: NULL pointer dereference panic in stable (2.6.33.2), amd64
From: Denys Fedorysychenko @ 2010-04-29 10:50 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, krkumar2, netdev
In-Reply-To: <1271327830.16881.2370.camel@edumazet-laptop>

On Thursday 15 April 2010 13:37:10 Eric Dumazet wrote:
> Le jeudi 15 avril 2010 à 12:11 +0300, Denys Fedorysychenko a écrit :
> > Btw i have application using tun.
> 
> Could you add following sanity test to catch the error ?
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index fa8b476..b67274a 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -988,6 +988,7 @@ static inline
>  struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
>  					 unsigned int index)
>  {
> +	WARN_ON(index >= dev->num_tx_queues);
>  	return &dev->_tx[index];
>  }
> 
Very sorry for being late, just i found way to stabilize kernel for me and to 
solve my personal life issues. It took 2 weeks...
I will try this patch, but i'm sure i dont have multiqueue card there.

I had shaper on eth0.33 and eth0 so i disable all offloading except 
checksumming there. Recently i found  that on some interfaces (where is no 
shaper) gso left on. And yes, probably some traffic was queued, then route 
changed, and maybe it went from gso off interface to gso on... 
It can be just pure luck, that i dont have crashes anymore, but maybe trick is 
in gso...

^ permalink raw reply

* Re: patch sysfs-implement-sysfs-tagged-directory-support.patch added to gregkh-2.6 tree
From: Serge E. Hallyn @ 2010-04-30 15:43 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Eric W. Biederman, Greg KH, bcrl, benjamin.thery, cornelia.huck,
	eric.dumazet, kay.sievers, netdev
In-Reply-To: <4BDAF5B0.3080801@kernel.org>

Quoting Tejun Heo (tj@kernel.org):
> Hello,
> 
> On 04/30/2010 04:29 PM, Serge E. Hallyn wrote:
> > Hmm, but looking back over the previous thread (Mar 31) I guess you
> > mean more in-line comments around the callbacks, presumably things
> > like class_dir_child_ns_type() and struct kobj_ns_type_operations
> > members?
> 
> In-line.  What they're, how they're supposed to be used, which calling
> context is expected, what can be returned and so on.
> 
> > It sounds like what you'd really like is to have any explicit
> > mention to namespaces pulled out of drivers/base (layering as you
> > keep saying)?  But will there be a use for this outside of
> > namespaces?  Does trying to anticipate that fall into the category
> > of over-abstraction?
> 
> I wouldn't mind limited amount of layering exceptions as long as
> they're clearly documented.  What I'm primarily worried about is not
> the possibility of other users but more the obfuscation of the whole
> sysfs-kobject-driver model thing which is already overly abstracted
> and obfuscated (at least it seems to me that way).
> 
> NS needs tagged support in the driver model which in itself is fine
> and I also understand that from someone who's primarily working on NS,
> adding a bit on top of the whole thing wouldn't seem like much of a
> problem.  To me it seems like worsening a problem which is already
> pretty bad.  I hope you could understand my POV too.

I do.  I can take a stab monday at pushing a cloned version of Eric's
tree with comments added, if Eric doesn't have time.  (Or a patch on
top of Greg's tree)

thanks,
-serge

^ permalink raw reply

* Re: [PATCH linux-next v2 1/2] irq: Add CPU mask affinity hint
From: Peter P Waskiewicz Jr @ 2010-04-30 16:41 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: davem@davemloft.net, arjan@linux.jf.intel.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <alpine.LFD.2.00.1004301249540.2951@localhost.localdomain>

On Fri, 30 Apr 2010, Thomas Gleixner wrote:

> On Fri, 30 Apr 2010, Peter P Waskiewicz Jr wrote:
>
>> This patch adds a cpumask affinity hint to the irq_desc
>> structure, along with a registration function and a read-only
>> proc entry for each interrupt.
>>
>> This affinity_hint handle for each interrupt can be used by
>> underlying drivers that need a better mechanism to control
>> interrupt affinity.  The underlying driver can register a
>> cpumask for the interrupt, which will allow the driver to
>> provide the CPU mask for the interrupt to anything that
>> requests it.  The intent is to extend the userspace daemon,
>> irqbalance, to help hint to it a preferred CPU mask to balance
>> the interrupt into.
>>
>> Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
>> ---
>>
>>  include/linux/interrupt.h |   13 +++++++++++++
>>  include/linux/irq.h       |    1 +
>>  kernel/irq/manage.c       |   28 ++++++++++++++++++++++++++++
>>  kernel/irq/proc.c         |   33 +++++++++++++++++++++++++++++++++
>>  4 files changed, 75 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
>> index 75f3f00..9c9ea2a 100644
>> --- a/include/linux/interrupt.h
>> +++ b/include/linux/interrupt.h
>> @@ -209,6 +209,9 @@ extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
>>  extern int irq_can_set_affinity(unsigned int irq);
>>  extern int irq_select_affinity(unsigned int irq);
>>
>> +extern int irq_register_affinity_hint(unsigned int irq,
>> +                                      const struct cpumask *m);
>
> I think we can do with a single funtion irq_set_affinity_hint() and
> let the caller set the pointer to NULL.

That works too.  I like it.  :-)

>
>>
>> +int irq_register_affinity_hint(unsigned int irq, const struct cpumask *m)
>> +{
>> +	struct irq_desc *desc = irq_to_desc(irq);
>> +	unsigned long flags;
>
>  desc needs to be checked. It might be NULL !

Doh, good point!

>
>> +
>> +	raw_spin_lock_irqsave(&desc->lock, flags);
>> +	desc->affinity_hint = m;
>> +	raw_spin_unlock_irqrestore(&desc->lock, flags);
>> +
>> +	return 0;
>> +}
>> +EXPORT_SYMBOL(irq_register_affinity_hint);
>
>  EXPORT_SYMBOL_GPL please

Will do.

>
>> +static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
>> +{
>> +	struct irq_desc *desc = irq_to_desc((long)m->private);
>> +	unsigned long flags;
>> +	int ret = -EINVAL;
>> +
>> +	raw_spin_lock_irqsave(&desc->lock, flags);
>> +	if (desc->affinity_hint) {
>> +		seq_cpumask(m, desc->affinity_hint);
>
>  Please make a local copy under desc->mask and do the seq_cpumask()
>  stuff on the local copy outside of desc->lock

Will do.

Cheers,

-PJ

^ permalink raw reply

* Re: [PATCH linux-next v2 1/2] irq: Add CPU mask affinity hint
From: Thomas Gleixner @ 2010-04-30 10:53 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr; +Cc: davem, arjan, netdev, linux-kernel
In-Reply-To: <20100430085150.4630.46790.stgit@ppwaskie-hc2.jf.intel.com>

On Fri, 30 Apr 2010, Peter P Waskiewicz Jr wrote:

> This patch adds a cpumask affinity hint to the irq_desc
> structure, along with a registration function and a read-only
> proc entry for each interrupt.
> 
> This affinity_hint handle for each interrupt can be used by
> underlying drivers that need a better mechanism to control
> interrupt affinity.  The underlying driver can register a
> cpumask for the interrupt, which will allow the driver to
> provide the CPU mask for the interrupt to anything that
> requests it.  The intent is to extend the userspace daemon,
> irqbalance, to help hint to it a preferred CPU mask to balance
> the interrupt into.
> 
> Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
> ---
> 
>  include/linux/interrupt.h |   13 +++++++++++++
>  include/linux/irq.h       |    1 +
>  kernel/irq/manage.c       |   28 ++++++++++++++++++++++++++++
>  kernel/irq/proc.c         |   33 +++++++++++++++++++++++++++++++++
>  4 files changed, 75 insertions(+), 0 deletions(-)
> 
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index 75f3f00..9c9ea2a 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -209,6 +209,9 @@ extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
>  extern int irq_can_set_affinity(unsigned int irq);
>  extern int irq_select_affinity(unsigned int irq);
>  
> +extern int irq_register_affinity_hint(unsigned int irq,
> +                                      const struct cpumask *m);

I think we can do with a single funtion irq_set_affinity_hint() and
let the caller set the pointer to NULL.

>  
> +int irq_register_affinity_hint(unsigned int irq, const struct cpumask *m)
> +{
> +	struct irq_desc *desc = irq_to_desc(irq);
> +	unsigned long flags;

  desc needs to be checked. It might be NULL !

> +
> +	raw_spin_lock_irqsave(&desc->lock, flags);
> +	desc->affinity_hint = m;
> +	raw_spin_unlock_irqrestore(&desc->lock, flags);
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(irq_register_affinity_hint);

  EXPORT_SYMBOL_GPL please

> +static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
> +{
> +	struct irq_desc *desc = irq_to_desc((long)m->private);
> +	unsigned long flags;
> +	int ret = -EINVAL;
> +
> +	raw_spin_lock_irqsave(&desc->lock, flags);
> +	if (desc->affinity_hint) {
> +		seq_cpumask(m, desc->affinity_hint);

  Please make a local copy under desc->mask and do the seq_cpumask()
  stuff on the local copy outside of desc->lock

Thanks,

	tglx

^ permalink raw reply

* [PATCH linux-next v2 2/2] ixgbe: Example usage of the new IRQ affinity_hint callback
From: Peter P Waskiewicz Jr @ 2010-04-30  8:52 UTC (permalink / raw)
  To: tglx, davem, arjan; +Cc: netdev, linux-kernel
In-Reply-To: <20100430085150.4630.46790.stgit@ppwaskie-hc2.jf.intel.com>

This patch uses the new IRQ affinity_hint callback mechanism.
It serves purely as an example of how a low-level driver can
utilize this new interface.

An official ixgbe patch will be pushed through netdev once the
IRQ patches have been accepted and merged.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
---

 drivers/net/ixgbe/ixgbe.h      |    2 ++
 drivers/net/ixgbe/ixgbe_main.c |   20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index 79c35ae..c220b9f 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -32,6 +32,7 @@
 #include <linux/pci.h>
 #include <linux/netdevice.h>
 #include <linux/aer.h>
+#include <linux/cpumask.h>
 
 #include "ixgbe_type.h"
 #include "ixgbe_common.h"
@@ -236,6 +237,7 @@ struct ixgbe_q_vector {
 	u8 tx_itr;
 	u8 rx_itr;
 	u32 eitr;
+	cpumask_var_t affinity_mask;
 };
 
 /* Helper macros to switch between ints/sec and what the register uses.
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 1b1419c..28f9d6b 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -1083,6 +1083,16 @@ static void ixgbe_configure_msix(struct ixgbe_adapter *adapter)
 			q_vector->eitr = adapter->rx_eitr_param;
 
 		ixgbe_write_eitr(q_vector);
+
+		/*
+		 * Allocate the affinity_hint cpumask, assign the mask for
+		 * this vector, and register our affinity_hint for this irq.
+		 */
+		if (!alloc_cpumask_var(&q_vector->affinity_mask, GFP_KERNEL))
+			return;
+		cpumask_set_cpu(v_idx, q_vector->affinity_mask);
+		irq_register_affinity_hint(adapter->msix_entries[v_idx].vector,
+		                           q_vector->affinity_mask);
 	}
 
 	if (adapter->hw.mac.type == ixgbe_mac_82598EB)
@@ -3218,7 +3228,7 @@ void ixgbe_down(struct ixgbe_adapter *adapter)
 	struct ixgbe_hw *hw = &adapter->hw;
 	u32 rxctrl;
 	u32 txdctl;
-	int i, j;
+	int i, j, num_q_vectors = adapter->num_msix_vectors - NON_Q_VECTORS;
 
 	/* signal that we are down to the interrupt handler */
 	set_bit(__IXGBE_DOWN, &adapter->state);
@@ -3251,6 +3261,14 @@ void ixgbe_down(struct ixgbe_adapter *adapter)
 
 	ixgbe_napi_disable_all(adapter);
 
+	for (i = 0; i < num_q_vectors; i++) {
+		struct ixgbe_q_vector *q_vector = adapter->q_vector[i];
+		/* unregister the affinity_hint */
+		irq_unregister_affinity_hint(adapter->msix_entries[i].vector);
+		/* release the CPU mask memory */
+		free_cpumask_var(q_vector->affinity_mask);
+	}
+
 	clear_bit(__IXGBE_SFP_MODULE_NOT_FOUND, &adapter->state);
 	del_timer_sync(&adapter->sfp_timer);
 	del_timer_sync(&adapter->watchdog_timer);

^ permalink raw reply related

* [PATCH linux-next v2 1/2] irq: Add CPU mask affinity hint
From: Peter P Waskiewicz Jr @ 2010-04-30  8:51 UTC (permalink / raw)
  To: tglx, davem, arjan; +Cc: netdev, linux-kernel

This patch adds a cpumask affinity hint to the irq_desc
structure, along with a registration function and a read-only
proc entry for each interrupt.

This affinity_hint handle for each interrupt can be used by
underlying drivers that need a better mechanism to control
interrupt affinity.  The underlying driver can register a
cpumask for the interrupt, which will allow the driver to
provide the CPU mask for the interrupt to anything that
requests it.  The intent is to extend the userspace daemon,
irqbalance, to help hint to it a preferred CPU mask to balance
the interrupt into.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
---

 include/linux/interrupt.h |   13 +++++++++++++
 include/linux/irq.h       |    1 +
 kernel/irq/manage.c       |   28 ++++++++++++++++++++++++++++
 kernel/irq/proc.c         |   33 +++++++++++++++++++++++++++++++++
 4 files changed, 75 insertions(+), 0 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 75f3f00..9c9ea2a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -209,6 +209,9 @@ extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 extern int irq_can_set_affinity(unsigned int irq);
 extern int irq_select_affinity(unsigned int irq);
 
+extern int irq_register_affinity_hint(unsigned int irq,
+                                      const struct cpumask *m);
+extern int irq_unregister_affinity_hint(unsigned int irq);
 #else /* CONFIG_SMP */
 
 static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
@@ -223,6 +226,16 @@ static inline int irq_can_set_affinity(unsigned int irq)
 
 static inline int irq_select_affinity(unsigned int irq)  { return 0; }
 
+static inline int irq_register_affinity_hint(unsigned int irq,
+                                             const struct cpumask *m)
+{
+	return -EINVAL;
+}
+
+static inline int irq_unregister_affinity_hint(unsigned int irq);
+{
+	return -EINVAL;
+}
 #endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */
 
 #ifdef CONFIG_GENERIC_HARDIRQS
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 707ab12..83b16d7 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -206,6 +206,7 @@ struct irq_desc {
 	struct proc_dir_entry	*dir;
 #endif
 	const char		*name;
+	struct cpumask		*affinity_hint;
 } ____cacheline_internodealigned_in_smp;
 
 extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 704e488..bce7e38 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,31 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	return 0;
 }
 
+int irq_register_affinity_hint(unsigned int irq, const struct cpumask *m)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	desc->affinity_hint = m;
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(irq_register_affinity_hint);
+
+int irq_unregister_affinity_hint(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	desc->affinity_hint = NULL;
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(irq_unregister_affinity_hint);
 #ifndef CONFIG_AUTO_IRQ_AFFINITY
 /*
  * Generic version of the affinity autoselector.
@@ -916,6 +941,9 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 			desc->chip->disable(irq);
 	}
 
+	/* make sure affinity_hint is cleaned up */
+	desc->affinity_hint = NULL;
+
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 
 	unregister_handler_proc(irq, action);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7a6eb04..8b85f77 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -32,6 +32,23 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
+{
+	struct irq_desc *desc = irq_to_desc((long)m->private);
+	unsigned long flags;
+	int ret = -EINVAL;
+
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	if (desc->affinity_hint) {
+		seq_cpumask(m, desc->affinity_hint);
+		seq_putc(m, '\n');
+		ret = 0;
+	}
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+	return ret;
+}
+
 #ifndef is_affinity_mask_valid
 #define is_affinity_mask_valid(val) 1
 #endif
@@ -84,6 +101,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file)
 	return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
 }
 
+static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
+}
+
 static const struct file_operations irq_affinity_proc_fops = {
 	.open		= irq_affinity_proc_open,
 	.read		= seq_read,
@@ -92,6 +114,13 @@ static const struct file_operations irq_affinity_proc_fops = {
 	.write		= irq_affinity_proc_write,
 };
 
+static const struct file_operations irq_affinity_hint_proc_fops = {
+	.open		= irq_affinity_hint_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static int default_affinity_show(struct seq_file *m, void *v)
 {
 	seq_cpumask(m, irq_default_affinity);
@@ -231,6 +260,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 	/* create /proc/irq/<irq>/smp_affinity */
 	proc_create_data("smp_affinity", 0600, desc->dir,
 			 &irq_affinity_proc_fops, (void *)(long)irq);
+
+	/* create /proc/irq/<irq>/affinity_hint */
+	proc_create_data("affinity_hint", 0400, desc->dir,
+			 &irq_affinity_hint_proc_fops, (void *)(long)irq);
 #endif
 
 	proc_create_data("spurious", 0444, desc->dir,

^ permalink raw reply related

* [Patch 3/3] net: reserve ports for applications using fixed port numbers
From: Amerigo Wang @ 2010-04-30  8:25 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, Amerigo Wang, David Miller, adobriyan, ebiederm
In-Reply-To: <20100430082912.5630.82405.sendpatchset@localhost.localdomain>


(Dropped the infiniband part, because Tetsuo modified the related code,
I will send a separate patch for it once this is accepted.)

This patch introduces /proc/sys/net/ipv4/ip_local_reserved_ports which
allows users to reserve ports for third-party applications.

The reserved ports will not be used by automatic port assignments
(e.g. when calling connect() or bind() with port number 0). Explicit
port allocation behavior is unchanged.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---

Index: linux-2.6/Documentation/networking/ip-sysctl.txt
===================================================================
--- linux-2.6.orig/Documentation/networking/ip-sysctl.txt
+++ linux-2.6/Documentation/networking/ip-sysctl.txt
@@ -588,6 +588,37 @@ ip_local_port_range - 2 INTEGERS
 	(i.e. by default) range 1024-4999 is enough to issue up to
 	2000 connections per second to systems supporting timestamps.
 
+ip_local_reserved_ports - list of comma separated ranges
+	Specify the ports which are reserved for known third-party
+	applications. These ports will not be used by automatic port
+	assignments (e.g. when calling connect() or bind() with port
+	number 0). Explicit port allocation behavior is unchanged.
+
+	The format used for both input and output is a comma separated
+	list of ranges (e.g. "1,2-4,10-10" for ports 1, 2, 3, 4 and
+	10). Writing to the file will clear all previously reserved
+	ports and update the current list with the one given in the
+	input.
+
+	Note that ip_local_port_range and ip_local_reserved_ports
+	settings are independent and both are considered by the kernel
+	when determining which ports are available for automatic port
+	assignments.
+
+	You can reserve ports which are not in the current
+	ip_local_port_range, e.g.:
+
+	$ cat /proc/sys/net/ipv4/ip_local_port_range
+	32000	61000
+	$ cat /proc/sys/net/ipv4/ip_local_reserved_ports
+	8080,9148
+
+	although this is redundant. However such a setting is useful
+	if later the port range is changed to a value that will
+	include the reserved ports.
+
+	Default: Empty
+
 ip_nonlocal_bind - BOOLEAN
 	If set, allows processes to bind() to non-local IP addresses,
 	which can be quite useful - but may break some applications.
Index: linux-2.6/include/net/ip.h
===================================================================
--- linux-2.6.orig/include/net/ip.h
+++ linux-2.6/include/net/ip.h
@@ -184,6 +184,12 @@ extern struct local_ports {
 } sysctl_local_ports;
 extern void inet_get_local_port_range(int *low, int *high);
 
+extern unsigned long *sysctl_local_reserved_ports;
+static inline int inet_is_reserved_local_port(int port)
+{
+	return test_bit(port, sysctl_local_reserved_ports);
+}
+
 extern int sysctl_ip_default_ttl;
 extern int sysctl_ip_nonlocal_bind;
 
Index: linux-2.6/net/ipv4/af_inet.c
===================================================================
--- linux-2.6.orig/net/ipv4/af_inet.c
+++ linux-2.6/net/ipv4/af_inet.c
@@ -1552,9 +1552,13 @@ static int __init inet_init(void)
 
 	BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
 
+	sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+	if (!sysctl_local_reserved_ports)
+		goto out;
+
 	rc = proto_register(&tcp_prot, 1);
 	if (rc)
-		goto out;
+		goto out_free_reserved_ports;
 
 	rc = proto_register(&udp_prot, 1);
 	if (rc)
@@ -1653,6 +1657,8 @@ out_unregister_udp_proto:
 	proto_unregister(&udp_prot);
 out_unregister_tcp_proto:
 	proto_unregister(&tcp_prot);
+out_free_reserved_ports:
+	kfree(sysctl_local_reserved_ports);
 	goto out;
 }
 
Index: linux-2.6/net/ipv4/inet_connection_sock.c
===================================================================
--- linux-2.6.orig/net/ipv4/inet_connection_sock.c
+++ linux-2.6/net/ipv4/inet_connection_sock.c
@@ -37,6 +37,9 @@ struct local_ports sysctl_local_ports __
 	.range = { 32768, 61000 },
 };
 
+unsigned long *sysctl_local_reserved_ports;
+EXPORT_SYMBOL(sysctl_local_reserved_ports);
+
 void inet_get_local_port_range(int *low, int *high)
 {
 	unsigned seq;
@@ -112,6 +115,8 @@ again:
 
 		smallest_size = -1;
 		do {
+			if (inet_is_reserved_local_port(rover))
+				goto next_nolock;
 			head = &hashinfo->bhash[inet_bhashfn(net, rover,
 					hashinfo->bhash_size)];
 			spin_lock(&head->lock);
@@ -136,6 +141,7 @@ again:
 			break;
 		next:
 			spin_unlock(&head->lock);
+		next_nolock:
 			if (++rover > high)
 				rover = low;
 		} while (--remaining > 0);
Index: linux-2.6/net/ipv4/inet_hashtables.c
===================================================================
--- linux-2.6.orig/net/ipv4/inet_hashtables.c
+++ linux-2.6/net/ipv4/inet_hashtables.c
@@ -456,6 +456,8 @@ int __inet_hash_connect(struct inet_time
 		local_bh_disable();
 		for (i = 1; i <= remaining; i++) {
 			port = low + (i + offset) % remaining;
+			if (inet_is_reserved_local_port(port))
+				continue;
 			head = &hinfo->bhash[inet_bhashfn(net, port,
 					hinfo->bhash_size)];
 			spin_lock(&head->lock);
Index: linux-2.6/net/ipv4/sysctl_net_ipv4.c
===================================================================
--- linux-2.6.orig/net/ipv4/sysctl_net_ipv4.c
+++ linux-2.6/net/ipv4/sysctl_net_ipv4.c
@@ -299,6 +299,13 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= ipv4_local_port_range,
 	},
+	{
+		.procname	= "ip_local_reserved_ports",
+		.data		= NULL, /* initialized in sysctl_ipv4_init */
+		.maxlen		= 65536,
+		.mode		= 0644,
+		.proc_handler	= proc_do_large_bitmap,
+	},
 #ifdef CONFIG_IP_MULTICAST
 	{
 		.procname	= "igmp_max_memberships",
@@ -736,6 +743,16 @@ static __net_initdata struct pernet_oper
 static __init int sysctl_ipv4_init(void)
 {
 	struct ctl_table_header *hdr;
+	struct ctl_table *i;
+
+	for (i = ipv4_table; i->procname; i++) {
+		if (strcmp(i->procname, "ip_local_reserved_ports") == 0) {
+			i->data = sysctl_local_reserved_ports;
+			break;
+		}
+	}
+	if (!i->procname)
+		return -EINVAL;
 
 	hdr = register_sysctl_paths(net_ipv4_ctl_path, ipv4_table);
 	if (hdr == NULL)
Index: linux-2.6/net/ipv4/udp.c
===================================================================
--- linux-2.6.orig/net/ipv4/udp.c
+++ linux-2.6/net/ipv4/udp.c
@@ -233,7 +233,8 @@ int udp_lib_get_port(struct sock *sk, un
 			 */
 			do {
 				if (low <= snum && snum <= high &&
-				    !test_bit(snum >> udptable->log, bitmap))
+				    !test_bit(snum >> udptable->log, bitmap) &&
+				    !inet_is_reserved_local_port(snum))
 					goto found;
 				snum += rand;
 			} while (snum != first);
Index: linux-2.6/net/sctp/socket.c
===================================================================
--- linux-2.6.orig/net/sctp/socket.c
+++ linux-2.6/net/sctp/socket.c
@@ -5436,6 +5436,8 @@ static long sctp_get_port_local(struct s
 			rover++;
 			if ((rover < low) || (rover > high))
 				rover = low;
+			if (inet_is_reserved_local_port(rover))
+				continue;
 			index = sctp_phashfn(rover);
 			head = &sctp_port_hashtable[index];
 			sctp_spin_lock(&head->lock);

^ permalink raw reply

* [Patch 1/3] sysctl: refactor integer handling proc code
From: Amerigo Wang @ 2010-04-30  8:25 UTC (permalink / raw)
  To: linux-kernel
  Cc: Octavian Purdila, Eric Dumazet, penguin-kernel, netdev,
	Neil Horman, ebiederm, David Miller, adobriyan, Amerigo Wang
In-Reply-To: <20100430082912.5630.82405.sendpatchset@localhost.localdomain>

(Based on Octavian's work, and I modified a lot.)

As we are about to add another integer handling proc function a little
bit of cleanup is in order: add a few helper functions to improve code
readability and decrease code duplication.

In the process a bug is also fixed: if the user specifies a number
with more then 20 digits it will be interpreted as two integers
(e.g. 10000...13 will be interpreted as 100.... and 13).

Behavior for EFAULT handling was changed as well. Previous to this
patch, when an EFAULT error occurred in the middle of a write
operation, although some of the elements were set, that was not
acknowledged to the user (by shorting the write and returning the
number of bytes accepted). EFAULT is now treated just like any other
errors by acknowledging the amount of bytes accepted.

Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---

Index: linux-2.6/kernel/sysctl.c
===================================================================
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -2040,8 +2040,122 @@ int proc_dostring(struct ctl_table *tabl
 			       buffer, lenp, ppos);
 }
 
+static size_t proc_skip_spaces(char **buf)
+{
+	size_t ret;
+	char *tmp = skip_spaces(*buf);
+	ret = tmp - *buf;
+	*buf = tmp;
+	return ret;
+}
+
+#define TMPBUFLEN 22
+/**
+ * proc_get_long - reads an ASCII formated integer from a user buffer
+ *
+ * @buf - a kernel buffer
+ * @size - size of the kernel buffer
+ * @val - this is where the number will be stored
+ * @neg - set to %TRUE if number is negative
+ * @perm_tr - a vector which contains the allowed trailers
+ * @perm_tr_len - size of the perm_tr vector
+ * @tr - pointer to store the trailer character
+ *
+ * In case of success 0 is returned and buf and size are updated with
+ * the amount of bytes read. If tr is non NULL and a trailing
+ * character exist (size is non zero after returning from this
+ * function) tr is updated with the trailing character.
+ */
+static int proc_get_long(char **buf, size_t *size,
+			  unsigned long *val, bool *neg,
+			  const char *perm_tr, unsigned perm_tr_len, char *tr)
+{
+	int len;
+	char *p, tmp[TMPBUFLEN];
+
+	if (!*size)
+		return -EINVAL;
+
+	len = *size;
+	if (len > TMPBUFLEN-1)
+		len = TMPBUFLEN-1;
+
+	memcpy(tmp, *buf, len);
+
+	tmp[len] = 0;
+	p = tmp;
+	if (*p == '-' && *size > 1) {
+		*neg = 1;
+		p++;
+	} else
+		*neg = 0;
+	if (!isdigit(*p))
+		return -EINVAL;
+
+	*val = simple_strtoul(p, &p, 0);
 
-static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
+	len = p - tmp;
+
+	/* We don't know if the next char is whitespace thus we may accept
+	 * invalid integers (e.g. 1234...a) or two integers instead of one
+	 * (e.g. 123...1). So lets not allow such large numbers. */
+	if (len == TMPBUFLEN - 1)
+		return -EINVAL;
+
+	if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
+		return -EINVAL;
+
+	if (tr && (len < *size))
+		*tr = *p;
+
+	*buf += len;
+	*size -= len;
+
+	return 0;
+}
+
+/**
+ * proc_put_long - coverts an integer to a decimal ASCII formated string
+ *
+ * @buf - the user buffer
+ * @size - the size of the user buffer
+ * @val - the integer to be converted
+ * @neg - sign of the number, %TRUE for negative
+ *
+ * In case of success 0 is returned and buf and size are updated with
+ * the amount of bytes read.
+ */
+static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
+			  bool neg)
+{
+	int len;
+	char tmp[TMPBUFLEN], *p = tmp;
+
+	sprintf(p, "%s%lu", neg ? "-" : "", val);
+	len = strlen(tmp);
+	if (len > *size)
+		len = *size;
+	if (copy_to_user(*buf, tmp, len))
+		return -EFAULT;
+	*size -= len;
+	*buf += len;
+	return 0;
+}
+#undef TMPBUFLEN
+
+static int proc_put_char(void __user **buf, size_t *size, char c)
+{
+	if (*size) {
+		char __user **buffer = (char __user **)buf;
+		if (put_user(c, *buffer))
+			return -EFAULT;
+		(*size)--, (*buffer)++;
+		*buf = *buffer;
+	}
+	return 0;
+}
+
+static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
 				 int *valp,
 				 int write, void *data)
 {
@@ -2050,7 +2164,7 @@ static int do_proc_dointvec_conv(int *ne
 	} else {
 		int val = *valp;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			*lvalp = (unsigned long)-val;
 		} else {
 			*negp = 0;
@@ -2060,23 +2174,21 @@ static int do_proc_dointvec_conv(int *ne
 	return 0;
 }
 
+static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
+
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 		  int write, void __user *buffer,
 		  size_t *lenp, loff_t *ppos,
-		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
 		  void *data)
 {
-#define TMPBUFLEN 21
-	int *i, vleft, first = 1, neg;
-	unsigned long lval;
-	size_t left, len;
+	int *i, vleft, first = 1, err = 0;
+	unsigned long page = 0;
+	size_t left;
+	char *kbuf;
 	
-	char buf[TMPBUFLEN], *p;
-	char __user *s = buffer;
-	
-	if (!tbl_data || !table->maxlen || !*lenp ||
-	    (*ppos && !write)) {
+	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
 		*lenp = 0;
 		return 0;
 	}
@@ -2088,89 +2200,69 @@ static int __do_proc_dointvec(void *tbl_
 	if (!conv)
 		conv = do_proc_dointvec_conv;
 
+	if (write) {
+		if (left > PAGE_SIZE - 1)
+			left = PAGE_SIZE - 1;
+		page = __get_free_page(GFP_TEMPORARY);
+		kbuf = (char *) page;
+		if (!kbuf)
+			return -ENOMEM;
+		if (copy_from_user(kbuf, buffer, left)) {
+			err = -EFAULT;
+			goto free;
+		}
+		kbuf[left] = 0;
+	}
+
 	for (; left && vleft--; i++, first=0) {
-		if (write) {
-			while (left) {
-				char c;
-				if (get_user(c, s))
-					return -EFAULT;
-				if (!isspace(c))
-					break;
-				left--;
-				s++;
-			}
-			if (!left)
-				break;
-			neg = 0;
-			len = left;
-			if (len > sizeof(buf) - 1)
-				len = sizeof(buf) - 1;
-			if (copy_from_user(buf, s, len))
-				return -EFAULT;
-			buf[len] = 0;
-			p = buf;
-			if (*p == '-' && left > 1) {
-				neg = 1;
-				p++;
-			}
-			if (*p < '0' || *p > '9')
-				break;
+		unsigned long lval;
+		bool neg;
 
-			lval = simple_strtoul(p, &p, 0);
+		if (write) {
+			left -= proc_skip_spaces(&kbuf);
 
-			len = p-buf;
-			if ((len < left) && *p && !isspace(*p))
+			err = proc_get_long(&kbuf, &left, &lval, &neg,
+					     proc_wspace_sep,
+					     sizeof(proc_wspace_sep), NULL);
+			if (err)
 				break;
-			s += len;
-			left -= len;
-
-			if (conv(&neg, &lval, i, 1, data))
+			if (conv(&neg, &lval, i, 1, data)) {
+				err = -EINVAL;
 				break;
+			}
 		} else {
-			p = buf;
+			if (conv(&neg, &lval, i, 0, data)) {
+				err = -EINVAL;
+				break;
+			}
 			if (!first)
-				*p++ = '\t';
-	
-			if (conv(&neg, &lval, i, 0, data))
+				err = proc_put_char(&buffer, &left, '\t');
+			if (err)
+				break;
+			err = proc_put_long(&buffer, &left, lval, neg);
+			if (err)
 				break;
-
-			sprintf(p, "%s%lu", neg ? "-" : "", lval);
-			len = strlen(buf);
-			if (len > left)
-				len = left;
-			if(copy_to_user(s, buf, len))
-				return -EFAULT;
-			left -= len;
-			s += len;
 		}
 	}
 
-	if (!write && !first && left) {
-		if(put_user('\n', s))
-			return -EFAULT;
-		left--, s++;
-	}
+	if (!write && !first && left && !err)
+		err = proc_put_char(&buffer, &left, '\n');
+	if (write && !err)
+		left -= proc_skip_spaces(&kbuf);
+free:
 	if (write) {
-		while (left) {
-			char c;
-			if (get_user(c, s++))
-				return -EFAULT;
-			if (!isspace(c))
-				break;
-			left--;
-		}
+		free_page(page);
+		if (first)
+			return err ? : -EINVAL;
 	}
-	if (write && first)
-		return -EINVAL;
 	*lenp -= left;
 	*ppos += *lenp;
-	return 0;
-#undef TMPBUFLEN
+	return err;
 }
 
 static int do_proc_dointvec(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos,
-		  int (*conv)(int *negp, unsigned long *lvalp, int *valp,
+		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
 			      int write, void *data),
 		  void *data)
 {
@@ -2238,8 +2330,8 @@ struct do_proc_dointvec_minmax_conv_para
 	int *max;
 };
 
-static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, 
-					int *valp, 
+static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
+					int *valp,
 					int write, void *data)
 {
 	struct do_proc_dointvec_minmax_conv_param *param = data;
@@ -2252,7 +2344,7 @@ static int do_proc_dointvec_minmax_conv(
 	} else {
 		int val = *valp;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			*lvalp = (unsigned long)-val;
 		} else {
 			*negp = 0;
@@ -2295,102 +2387,78 @@ static int __do_proc_doulongvec_minmax(v
 				     unsigned long convmul,
 				     unsigned long convdiv)
 {
-#define TMPBUFLEN 21
-	unsigned long *i, *min, *max, val;
-	int vleft, first=1, neg;
-	size_t len, left;
-	char buf[TMPBUFLEN], *p;
-	char __user *s = buffer;
-	
-	if (!data || !table->maxlen || !*lenp ||
-	    (*ppos && !write)) {
+	unsigned long *i, *min, *max;
+	int vleft, first = 1, err = 0;
+	unsigned long page = 0;
+	size_t left;
+	char *kbuf;
+
+	if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
 		*lenp = 0;
 		return 0;
 	}
-	
+
 	i = (unsigned long *) data;
 	min = (unsigned long *) table->extra1;
 	max = (unsigned long *) table->extra2;
 	vleft = table->maxlen / sizeof(unsigned long);
 	left = *lenp;
-	
+
+	if (write) {
+		if (left > PAGE_SIZE - 1)
+			left = PAGE_SIZE - 1;
+		page = __get_free_page(GFP_TEMPORARY);
+		kbuf = (char *) page;
+		if (!kbuf)
+			return -ENOMEM;
+		if (copy_from_user(kbuf, buffer, left)) {
+			err = -EFAULT;
+			goto free;
+		}
+		kbuf[left] = 0;
+	}
+
 	for (; left && vleft--; i++, min++, max++, first=0) {
+		unsigned long val;
+
 		if (write) {
-			while (left) {
-				char c;
-				if (get_user(c, s))
-					return -EFAULT;
-				if (!isspace(c))
-					break;
-				left--;
-				s++;
-			}
-			if (!left)
-				break;
-			neg = 0;
-			len = left;
-			if (len > TMPBUFLEN-1)
-				len = TMPBUFLEN-1;
-			if (copy_from_user(buf, s, len))
-				return -EFAULT;
-			buf[len] = 0;
-			p = buf;
-			if (*p == '-' && left > 1) {
-				neg = 1;
-				p++;
-			}
-			if (*p < '0' || *p > '9')
-				break;
-			val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
-			len = p-buf;
-			if ((len < left) && *p && !isspace(*p))
+			bool neg;
+
+			left -= proc_skip_spaces(&kbuf);
+
+			err = proc_get_long(&kbuf, &left, &val, &neg,
+					     proc_wspace_sep,
+					     sizeof(proc_wspace_sep), NULL);
+			if (err)
 				break;
 			if (neg)
-				val = -val;
-			s += len;
-			left -= len;
-
-			if(neg)
 				continue;
 			if ((min && val < *min) || (max && val > *max))
 				continue;
 			*i = val;
 		} else {
-			p = buf;
+			val = convdiv * (*i) / convmul;
 			if (!first)
-				*p++ = '\t';
-			sprintf(p, "%lu", convdiv * (*i) / convmul);
-			len = strlen(buf);
-			if (len > left)
-				len = left;
-			if(copy_to_user(s, buf, len))
-				return -EFAULT;
-			left -= len;
-			s += len;
+				err = proc_put_char(&buffer, &left, '\t');
+			err = proc_put_long(&buffer, &left, val, false);
+			if (err)
+				break;
 		}
 	}
 
-	if (!write && !first && left) {
-		if(put_user('\n', s))
-			return -EFAULT;
-		left--, s++;
-	}
+	if (!write && !first && left && !err)
+		err = proc_put_char(&buffer, &left, '\n');
+	if (write && !err)
+		left -= proc_skip_spaces(&kbuf);
+free:
 	if (write) {
-		while (left) {
-			char c;
-			if (get_user(c, s++))
-				return -EFAULT;
-			if (!isspace(c))
-				break;
-			left--;
-		}
+		free_page(page);
+		if (first)
+			return err ? : -EINVAL;
 	}
-	if (write && first)
-		return -EINVAL;
 	*lenp -= left;
 	*ppos += *lenp;
-	return 0;
-#undef TMPBUFLEN
+	return err;
 }
 
 static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
@@ -2451,7 +2519,7 @@ int proc_doulongvec_ms_jiffies_minmax(st
 }
 
 
-static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
 					 int *valp,
 					 int write, void *data)
 {
@@ -2463,7 +2531,7 @@ static int do_proc_dointvec_jiffies_conv
 		int val = *valp;
 		unsigned long lval;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			lval = (unsigned long)-val;
 		} else {
 			*negp = 0;
@@ -2474,7 +2542,7 @@ static int do_proc_dointvec_jiffies_conv
 	return 0;
 }
 
-static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
 						int *valp,
 						int write, void *data)
 {
@@ -2486,7 +2554,7 @@ static int do_proc_dointvec_userhz_jiffi
 		int val = *valp;
 		unsigned long lval;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			lval = (unsigned long)-val;
 		} else {
 			*negp = 0;
@@ -2497,7 +2565,7 @@ static int do_proc_dointvec_userhz_jiffi
 	return 0;
 }
 
-static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
+static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
 					    int *valp,
 					    int write, void *data)
 {
@@ -2507,7 +2575,7 @@ static int do_proc_dointvec_ms_jiffies_c
 		int val = *valp;
 		unsigned long lval;
 		if (val < 0) {
-			*negp = -1;
+			*negp = 1;
 			lval = (unsigned long)-val;
 		} else {
 			*negp = 0;

^ permalink raw reply

* Re: 2.6.34-rc5-git7 (plus all patches) -- another suspicious rcu_dereference_check() usage.
From: Paul E. McKenney @ 2010-04-30  0:48 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Miles Lane, Vivek Goyal, Eric Paris, Lai Jiangshan, Ingo Molnar,
	Peter Zijlstra, LKML, nauman, netdev, Jens Axboe, Gui Jianfeng,
	Li Zefan, Johannes Berg, shemminger
In-Reply-To: <20100428204403.GT2540@linux.vnet.ibm.com>

On Wed, Apr 28, 2010 at 01:44:03PM -0700, Paul E. McKenney wrote:
> On Wed, Apr 28, 2010 at 10:18:43PM +0200, Eric Dumazet wrote:
> > Le mercredi 28 avril 2010 à 13:09 -0700, Paul E. McKenney a écrit :
> > > On Wed, Apr 28, 2010 at 09:38:11PM +0200, Eric Dumazet wrote:
> > > > Le mercredi 28 avril 2010 à 10:54 -0700, Paul E. McKenney a écrit :
> > > > > On Mon, Apr 26, 2010 at 08:51:06PM -0400, Miles Lane wrote:
> > > > > > This one occurred during the wakeup from suspend to RAM.
> > > > > > 
> > > > > > [  984.724697] [ INFO: suspicious rcu_dereference_check() usage. ]
> > > > > > [  984.724700] ---------------------------------------------------
> > > > > > [  984.724703] include/linux/fdtable.h:88 invoked
> > > > > > rcu_dereference_check() without protection!
> > > > > > [  984.724706]
> > > > > > [  984.724707] other info that might help us debug this:
> > > > > > [  984.724708]
> > > > > > [  984.724711]
> > > > > > [  984.724711] rcu_scheduler_active = 1, debug_locks = 1
> > > > > > [  984.724714] no locks held by dbus-daemon/4680.
> > > > > > [  984.724717]
> > > > > > [  984.724717] stack backtrace:
> > > > > > [  984.724721] Pid: 4680, comm: dbus-daemon Not tainted 2.6.34-rc5-git7 #33
> > > > > > [  984.724724] Call Trace:
> > > > > > [  984.724734]  [<ffffffff81074556>] lockdep_rcu_dereference+0x9d/0xa6
> > > > > > [  984.724740]  [<ffffffff810fc785>] fcheck_files+0xb1/0xc9
> > > > > > [  984.724745]  [<ffffffff810fc7f5>] fget_light+0x35/0xab
> > > > > > [  984.724751]  [<ffffffff81433e1b>] ? sock_poll_wait+0x13/0x18
> > > > > > [  984.724755]  [<ffffffff81433e39>] ? unix_poll+0x19/0x95
> > > > > > [  984.724762]  [<ffffffff8110aa95>] do_sys_poll+0x1ff/0x3e5
> > > > > > [  984.724766]  [<ffffffff8110a19e>] ? __pollwait+0x0/0xc7
> > > > > > [  984.724771]  [<ffffffff8110a265>] ? pollwake+0x0/0x4f
> > > > > > [  984.724776]  [<ffffffff8110a265>] ? pollwake+0x0/0x4f
> > > > > > [  984.724780]  [<ffffffff8110a265>] ? pollwake+0x0/0x4f
> > > > > > [  984.724784]  [<ffffffff8110a265>] ? pollwake+0x0/0x4f
> > > > > > [  984.724788]  [<ffffffff8110a265>] ? pollwake+0x0/0x4f
> > > > > > [  984.724793]  [<ffffffff8110a265>] ? pollwake+0x0/0x4f
> > > > > > [  984.724797]  [<ffffffff8110a265>] ? pollwake+0x0/0x4f
> > > > > > [  984.724802]  [<ffffffff8110a265>] ? pollwake+0x0/0x4f
> > > > > > [  984.724806]  [<ffffffff8110a265>] ? pollwake+0x0/0x4f
> > > > > > [  984.724812]  [<ffffffff8110ae0f>] sys_poll+0x50/0xbb
> > > > > > [  984.724818]  [<ffffffff81009d82>] system_call_fastpath+0x16/0x1b
> > > > > 
> > > > > Hmmm...  I am not convinced that this is a false positive.  Couldn't
> > > > > there be a multi-threaded process where one thread is invoking poll()
> > > > > on a UNIX socket just as another thread is calling close() on it?
> > > > > 
> > > > > The current fcheck_files() logic requires that the caller either (1) be in
> > > > > an RCU read-side critical section, (2) hold ->files_lock, or (3) passing
> > > > > in a files_struct with ->count equal to 1 (initialization or cleanup).
> > > > > 
> > > > > So I don't feel comfortable just slapping an RCU read-side critical
> > > > > section around this one, at least not unless someone who understands
> > > > > the locking says that doing so is OK.
> > > > > 
> > > > > 		
> > > > 
> > > > Its a single threaded program.
> > > > 
> > > > So fget_light() calls fcheck_files(files, fd); without rcu lock,
> > > > but some /proc/pid/fd/... user temporarly raised files->count just
> > > > before we perform the condition check.
> > > 
> > > So I should add a single-threaded check.  My first thought was to use
> > > current_is_single_threaded(), but the bit about scanning the full list
> > > of processes does give me pause.  However, thread_group_empty() looks
> > > like a much lighter-weight alternative.
> > > 
> > > I believe that it is possible for a pair of single-threaded processes
> > > to share a file descriptor, but that should not be a problem, as both
> > > of them would need to close it for it to go away.
> > > 
> > > But what happens if someone does a clone() with CLONE_FILES, as some
> > > of the AIO stuff seems to do?  Won't that allow one of the resulting
> > > processes to close the file for both of them, even though both are
> > > otherwise single-threaded?  And the ->count seems to be the only
> > > distinction between these two cases.
> > > 
> > > And AIO does CLONE_VM as well as CLONE_FILES, but that seems to mean that
> > > the check must scan the processes with current_is_single_threaded().
> > > Besides which, a user could invoke clone() with only CLONE_FILES
> > > specified, right?
> > > 
> > > Or am I just confused here?

Not confused, just stupid.  You would think that after almost 20 years
doing parallel programming, I would understand that concurrent debugging
assertions usually cannot be exact.  In this case, I really don't want
false positives, so I must accept some false negatives.  -That- I should
be able to do without adding runtime overhead when !PROVE_RCU.

I will put together a patch, test it, and send it out.

							Thanx, Paul

> > If a program is mono threaded, and doing a fget_light() syscall, it
> > cannot possibly do a clone() in // ;)
> 
> The sequence of events that I am worried about is as follows:
> 
> 1.	Single-threaded process does clone(CLONE_FILES).  The
> 	result is a pair of single-threaded processes that share
> 	file descriptors.
> 
> 2.	One of these processes does files_fdtable(i) at the same
> 	time as the other process closes file descriptor i.
> 
> So, clone and -then- do fget_light().
> 
> > If we want to be picky, we could add a user provided condition, aka "we
> > are sure we are allowed to do this because we are the owner of the files
> > struct".
> 
> But yes, if I understand your trick below, the race conditions from
> the above sequence of events would simply force the processes off
> of the fget_light() path, which should be OK.
> 
> 						Thanx, Paul
> 
> > diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
> > index 6da962c..027f5e1 100644
> > --- a/drivers/char/tty_io.c
> > +++ b/drivers/char/tty_io.c
> > @@ -2694,7 +2694,7 @@ void __do_SAK(struct tty_struct *tty)
> >  			spin_lock(&p->files->file_lock);
> >  			fdt = files_fdtable(p->files);
> >  			for (i = 0; i < fdt->max_fds; i++) {
> > -				filp = fcheck_files(p->files, i);
> > +				filp = fcheck_files(p->files, i, false);
> >  				if (!filp)
> >  					continue;
> >  				if (filp->f_op->read == tty_read &&
> > diff --git a/fs/fcntl.c b/fs/fcntl.c
> > index 452d02f..dabf4d8 100644
> > --- a/fs/fcntl.c
> > +++ b/fs/fcntl.c
> > @@ -119,7 +119,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
> >  		int retval = oldfd;
> > 
> >  		rcu_read_lock();
> > -		if (!fcheck_files(files, oldfd))
> > +		if (!fcheck_files(files, oldfd, false))
> >  			retval = -EBADF;
> >  		rcu_read_unlock();
> >  		return retval;
> > diff --git a/fs/file_table.c b/fs/file_table.c
> > index 32d12b7..2865f72 100644
> > --- a/fs/file_table.c
> > +++ b/fs/file_table.c
> > @@ -274,7 +274,7 @@ struct file *fget(unsigned int fd)
> >  	struct files_struct *files = current->files;
> > 
> >  	rcu_read_lock();
> > -	file = fcheck_files(files, fd);
> > +	file = fcheck_files(files, fd, false);
> >  	if (file) {
> >  		if (!atomic_long_inc_not_zero(&file->f_count)) {
> >  			/* File object ref couldn't be taken */
> > @@ -303,10 +303,10 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
> > 
> >  	*fput_needed = 0;
> >  	if (likely((atomic_read(&files->count) == 1))) {
> > -		file = fcheck_files(files, fd);
> > +		file = fcheck_files(files, fd, true);
> >  	} else {
> >  		rcu_read_lock();
> > -		file = fcheck_files(files, fd);
> > +		file = fcheck_files(files, fd, false);
> >  		if (file) {
> >  			if (atomic_long_inc_not_zero(&file->f_count))
> >  				*fput_needed = 1;
> > diff --git a/fs/proc/base.c b/fs/proc/base.c
> > index 8418fcc..0e89448 100644
> > --- a/fs/proc/base.c
> > +++ b/fs/proc/base.c
> > @@ -1716,7 +1716,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
> >  		 * hold ->file_lock.
> >  		 */
> >  		spin_lock(&files->file_lock);
> > -		file = fcheck_files(files, fd);
> > +		file = fcheck_files(files, fd, false);
> >  		if (file) {
> >  			if (path) {
> >  				*path = file->f_path;
> > @@ -1755,7 +1755,7 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
> >  		files = get_files_struct(task);
> >  		if (files) {
> >  			rcu_read_lock();
> > -			if (fcheck_files(files, fd)) {
> > +			if (fcheck_files(files, fd, false)) {
> >  				rcu_read_unlock();
> >  				put_files_struct(files);
> >  				if (task_dumpable(task)) {
> > @@ -1813,7 +1813,7 @@ static struct dentry *proc_fd_instantiate(struct inode *dir,
> >  	 * hold ->file_lock.
> >  	 */
> >  	spin_lock(&files->file_lock);
> > -	file = fcheck_files(files, fd);
> > +	file = fcheck_files(files, fd, false);
> >  	if (!file)
> >  		goto out_unlock;
> >  	if (file->f_mode & FMODE_READ)
> > @@ -1899,7 +1899,7 @@ static int proc_readfd_common(struct file * filp, void * dirent,
> >  				char name[PROC_NUMBUF];
> >  				int len;
> > 
> > -				if (!fcheck_files(files, fd))
> > +				if (!fcheck_files(files, fd, false))
> >  					continue;
> >  				rcu_read_unlock();
> > 
> > diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
> > index 013dc52..76423ad 100644
> > --- a/include/linux/fdtable.h
> > +++ b/include/linux/fdtable.h
> > @@ -57,11 +57,12 @@ struct files_struct {
> >  	struct file * fd_array[NR_OPEN_DEFAULT];
> >  };
> > 
> > -#define rcu_dereference_check_fdtable(files, fdtfd) \
> > +#define rcu_dereference_check_fdtable(files, fdtfd, cond) \
> >  	(rcu_dereference_check((fdtfd), \
> >  			       rcu_read_lock_held() || \
> >  			       lockdep_is_held(&(files)->file_lock) || \
> > -			       atomic_read(&(files)->count) == 1))
> > +			       atomic_read(&(files)->count) == 1 || \
> > +			       cond))
> > 
> >  #define files_fdtable(files) \
> >  		(rcu_dereference_check_fdtable((files), (files)->fdt))
> > @@ -79,13 +80,13 @@ static inline void free_fdtable(struct fdtable *fdt)
> >  	call_rcu(&fdt->rcu, free_fdtable_rcu);
> >  }
> > 
> > -static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
> > +static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd, bool cond)
> >  {
> >  	struct file * file = NULL;
> >  	struct fdtable *fdt = files_fdtable(files);
> > 
> >  	if (fd < fdt->max_fds)
> > -		file = rcu_dereference_check_fdtable(files, fdt->fd[fd]);
> > +		file = rcu_dereference_check_fdtable(files, fdt->fd[fd], cond);
> >  	return file;
> >  }
> > 
> > 
> > 

^ permalink raw reply

* [patch 03/13] eeprom_93cx6: Add data direction control.
From: Ben Dooks @ 2010-04-29 23:16 UTC (permalink / raw)
  To: netdev; +Cc: tristram.ha, support, Wolfram Sang, Jean Delvare, Linux Kernel
In-Reply-To: <20100429231621.015936077@fluff.org.uk>

[-- Attachment #1: 93c-eeprom-add-direction.patch --]
[-- Type: text/plain, Size: 2336 bytes --]

Some devices need to know if the data is to be output or read, so add a
data direction into the eeprom structure to tell the driver whether the
data line should be driven.

The user in this case is the Micrel KS8851 which has a direction
control for the EEPROM data line and thus needs to know whether
to drive it (writing) or to tristate it for receiving.

Signed-off-by: Ben Dooks <ben@simtec.co.uk>
Cc: Wolfram Sang <w.sang@pengutronix.de>
Cc: Jean Delvare <khali@linux-fr.org>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>

--
 drivers/misc/eeprom/eeprom_93cx6.c |    3 +++
 include/linux/eeprom_93cx6.h       |    2 ++
 2 files changed, 5 insertions(+)

Index: b/drivers/misc/eeprom/eeprom_93cx6.c
===================================================================
--- a/drivers/misc/eeprom/eeprom_93cx6.c	2009-10-06 15:35:38.000000000 +0100
+++ b/drivers/misc/eeprom/eeprom_93cx6.c	2009-10-06 15:51:18.000000000 +0100
@@ -70,6 +70,7 @@ static void eeprom_93cx6_startup(struct 
 	eeprom->reg_data_out = 0;
 	eeprom->reg_data_clock = 0;
 	eeprom->reg_chip_select = 1;
+	eeprom->drive_data = 1;
 	eeprom->register_write(eeprom);
 
 	/*
@@ -108,6 +109,7 @@ static void eeprom_93cx6_write_bits(stru
 	 */
 	eeprom->reg_data_in = 0;
 	eeprom->reg_data_out = 0;
+	eeprom->drive_data = 1;
 
 	/*
 	 * Start writing all bits.
@@ -147,6 +149,7 @@ static void eeprom_93cx6_read_bits(struc
 	 */
 	eeprom->reg_data_in = 0;
 	eeprom->reg_data_out = 0;
+	eeprom->drive_data = 0;
 
 	/*
 	 * Start reading all bits.
Index: b/include/linux/eeprom_93cx6.h
===================================================================
--- a/include/linux/eeprom_93cx6.h	2009-10-06 15:35:38.000000000 +0100
+++ b/include/linux/eeprom_93cx6.h	2009-10-06 15:51:18.000000000 +0100
@@ -45,6 +45,7 @@
  * @register_write(struct eeprom_93cx6 *eeprom): handler to
  * write to the eeprom register by using all reg_* fields.
  * @width: eeprom width, should be one of the PCI_EEPROM_WIDTH_* defines
+ * @drive_data: Set if we're driving the data line.
  * @reg_data_in: register field to indicate data input
  * @reg_data_out: register field to indicate data output
  * @reg_data_clock: register field to set the data clock
@@ -61,6 +62,7 @@ struct eeprom_93cx6 {
 
 	int width;
 
+	char drive_data;
 	char reg_data_in;
 	char reg_data_out;
 	char reg_data_clock;

^ permalink raw reply

* Re: [PATCH RFC: linux-next 1/2] irq: Add CPU mask affinity hint callback framework
From: Peter P Waskiewicz Jr @ 2010-04-29 20:28 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: davem@davemloft.net, arjan@linux.jf.intel.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <alpine.LFD.2.00.1004292121310.2951@localhost.localdomain>

On Thu, 2010-04-29 at 12:48 -0700, Thomas Gleixner wrote:
> B1;2005;0cPeter,
> 
> On Thu, 29 Apr 2010, Peter P Waskiewicz Jr wrote:
> > On Wed, 2010-04-28 at 09:45 -0700, Thomas Gleixner wrote:
> > > So you need a reference to your device, so what about the following:
> > > 
> > > struct irq_affinity_hint;
> > > 
> > > struct irq_affinity_hint {
> > >        unsigned int (*callback)(unsigned int irq, struct irq_affinity_hint *hint,
> > > 				cpumask_var_t *mask);
> > > }
> > > 
> > > Now you embed that struct into your device private data structure and
> > > you get the reference to it back in the callback function. No extra
> > > kmalloc/kfree, less code.
> > 
> > Good idea!  I'll roll that into my new version.
> 
> Thinking more about it, I wonder whether you have a cpu_mask in your
> driver/device private data anyway. I bet you have :)

Well, at this point we don't, but nothing says we can't.

> So it should be sufficient to set a pointer to that cpu_mask in
> irq_desc and get rid of the callback completely.

So "register" would just assign the pointer, and "unregister" would make
sure to NULL the mask pointer out.  I like it.  It'll sure clean things
up too.

> Any access to desc->affinity_hint needs to be protected by desc->lock.
> For setting the pointer to a real mask resp. NULL that's fine. The
> copy which you need to do in the proc-read function is not going to
> introduce huge latencies either.

Right.

> Thanks,
> 
> 	tglx

Thanks for the additional inputs.  Patches coming soon.

-PJ

^ permalink raw reply

* Re: [PATCH RFC: linux-next 1/2] irq: Add CPU mask affinity hint callback framework
From: Thomas Gleixner @ 2010-04-29 19:48 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr
  Cc: davem@davemloft.net, arjan@linux.jf.intel.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org
In-Reply-To: <1272563946.9614.1.camel@localhost>

B1;2005;0cPeter,

On Thu, 29 Apr 2010, Peter P Waskiewicz Jr wrote:
> On Wed, 2010-04-28 at 09:45 -0700, Thomas Gleixner wrote:
> > So you need a reference to your device, so what about the following:
> > 
> > struct irq_affinity_hint;
> > 
> > struct irq_affinity_hint {
> >        unsigned int (*callback)(unsigned int irq, struct irq_affinity_hint *hint,
> > 				cpumask_var_t *mask);
> > }
> > 
> > Now you embed that struct into your device private data structure and
> > you get the reference to it back in the callback function. No extra
> > kmalloc/kfree, less code.
> 
> Good idea!  I'll roll that into my new version.

Thinking more about it, I wonder whether you have a cpu_mask in your
driver/device private data anyway. I bet you have :)

So it should be sufficient to set a pointer to that cpu_mask in
irq_desc and get rid of the callback completely.

Any access to desc->affinity_hint needs to be protected by desc->lock.
For setting the pointer to a real mask resp. NULL that's fine. The
copy which you need to do in the proc-read function is not going to
introduce huge latencies either.

Thanks,

	tglx

^ permalink raw reply

* Re: [PATCHv7] add mergeable buffers support to vhost_net
From: Michael S. Tsirkin @ 2010-04-29 14:12 UTC (permalink / raw)
  To: David L Stevens; +Cc: netdev, xiaohui.xin, kvm, virtualization
In-Reply-To: <1272488232.11307.4.camel@w-dls.beaverton.ibm.com>

On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> This patch adds mergeable receive buffer support to vhost_net.
> 
> Signed-off-by: David L Stevens <dlstevens@us.ibm.com>

You can find the latest version on the following net-next based tree:
git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost

-- 
MST

^ permalink raw reply

* Re: linux-next: manual merge of the rr tree with the net tree
From: Michael S. Tsirkin @ 2010-04-29 14:06 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Stephen Rothwell, linux-next, linux-kernel, David Miller, netdev
In-Reply-To: <201004292223.36270.rusty@rustcorp.com.au>

On Thu, Apr 29, 2010 at 10:23:36PM +0930, Rusty Russell wrote:
> On Thu, 29 Apr 2010 01:24:57 am Michael S. Tsirkin wrote:
> > On Tue, Apr 27, 2010 at 07:09:13AM +0300, Michael S. Tsirkin wrote:
> > > On Tue, Apr 27, 2010 at 11:58:52AM +1000, Stephen Rothwell wrote:
> > > > Hi Rusty,
> > > > 
> > > > Today's linux-next merge of the rr tree got a conflict in
> > > > drivers/net/virtio_net.c between commit
> > > > 5e01d2f91df62be4d6f282149bc2a8858992ceca ("virtio-net: move sg off
> > > > stack") from the net tree and commit
> > > > 7f62a724a65f864d84f50857bbfd36c240155c8f ("virtio_net: use virtqueue_xxx
> > > > wrappers") from the rr tree.
> > > > 
> > > > I fixed it up (see below) and can carry the fix as necessary.
> > > 
> > > Hmm, Rusty, do you intend for the patches to go through netdev this
> > > time? If you do, it might be simplest to just ask Dave to merge
> > > them in net-next-2.6 now. I can prepare and send them if you like.
> 
> Naah, they hit too many random virtio drivers.  It's pretty simple to keep
> them separate, and once the new API is in place I'll send to Dave to merge.
> It could take two cycles, but there's no huge rush.
> 
> Cheers,
> Rusty.

Well, I'm just thinking of a way to make working on virtio-net less painful.
Maybe you could just put a copy of the two patches above on your tree?

-- 
MST

^ permalink raw reply

* Re: [PATCHv7] add mergeable buffers support to vhost_net
From: Michael S. Tsirkin @ 2010-04-29 13:45 UTC (permalink / raw)
  To: David L Stevens; +Cc: netdev, kvm, virtualization
In-Reply-To: <1272488232.11307.4.camel@w-dls.beaverton.ibm.com>

On Wed, Apr 28, 2010 at 01:57:12PM -0700, David L Stevens wrote:
> This patch adds mergeable receive buffer support to vhost_net.
> 
> Signed-off-by: David L Stevens <dlstevens@us.ibm.com>

I have applied this, thanks very much!
I have also applied some tweaks on top,
please take a look.

Thanks,
MSt

commit 2809e94f5f26d89dc5232aaec753ffda95c4d95e
Author: Michael S. Tsirkin <mst@redhat.com>
Date:   Thu Apr 29 16:18:08 2010 +0300

    vhost-net: minor tweaks in mergeable buffer code
    
    Applies the following tweaks on top of mergeable buffers patch:
    1. vhost_get_desc_n assumes that all desriptors are 'in' only.
       It's also unlikely to be useful for any vhost frontend
       besides vhost_net, so just move it to net.c, and rename
       get_rx_bufs for brevity.
    
    2. for rx, we called iov_length within vhost_get_desc_n
       (now get_rx_bufs) already, so we don't
       need an extra call to iov_length to avoid overflow anymore.
       Accordingly, copy_iovec_hdr can return void now.
    
    3. for rx, do some further code tweaks:
       do not assign len = err as we check that err == len
       handle data length in a way similar to how we handle
       header length: datalen -> sock_len, len -> vhost_len.
       add sock_hlen as a local variable, for symmetry with vhost_hlen.

    4. add some likely/unlikely annotations
    
    Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index d61d945..85519b4 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -74,9 +74,9 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to,
 	}
 	return seg;
 }
-/* Copy iovec entries for len bytes from iovec. Return segments used. */
-static int copy_iovec_hdr(const struct iovec *from, struct iovec *to,
-			  size_t len, int iovcount)
+/* Copy iovec entries for len bytes from iovec. */
+static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
+			   size_t len, int iovcount)
 {
 	int seg = 0;
 	size_t size;
@@ -89,7 +89,6 @@ static int copy_iovec_hdr(const struct iovec *from, struct iovec *to,
 		++to;
 		++seg;
 	}
-	return seg;
 }
 
 /* Caller must have TX VQ lock */
@@ -204,7 +203,7 @@ static void handle_tx(struct vhost_net *net)
 	unuse_mm(net->dev.mm);
 }
 
-static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk)
+static int peek_head_len(struct vhost_virtqueue *vq, struct sock *sk)
 {
 	struct sk_buff *head;
 	int len = 0;
@@ -212,17 +211,70 @@ static int vhost_head_len(struct vhost_virtqueue *vq, struct sock *sk)
 	lock_sock(sk);
 	head = skb_peek(&sk->sk_receive_queue);
 	if (head)
-		len = head->len + vq->sock_hlen;
+		len = head->len;
 	release_sock(sk);
 	return len;
 }
 
+/* This is a multi-buffer version of vhost_get_desc, that works if
+ *	vq has read descriptors only.
+ * @vq		- the relevant virtqueue
+ * @datalen	- data length we'll be reading
+ * @iovcount	- returned count of io vectors we fill
+ * @log		- vhost log
+ * @log_num	- log offset
+ *	returns number of buffer heads allocated, negative on error
+ */
+static int get_rx_bufs(struct vhost_virtqueue *vq,
+		       struct vring_used_elem *heads,
+		       int datalen,
+		       unsigned *iovcount,
+		       struct vhost_log *log,
+		       unsigned *log_num)
+{
+	unsigned int out, in;
+	int seg = 0;
+	int headcount = 0;
+	unsigned d;
+	int r;
+
+	while (datalen > 0) {
+		if (unlikely(headcount >= VHOST_NET_MAX_SG)) {
+			r = -ENOBUFS;
+			goto err;
+		}
+		d = vhost_get_desc(vq->dev, vq, vq->iov + seg,
+				   ARRAY_SIZE(vq->iov) - seg, &out,
+				   &in, log, log_num);
+		if (d == vq->num) {
+			r = 0;
+			goto err;
+		}
+		if (unlikely(out || in <= 0)) {
+			vq_err(vq, "unexpected descriptor format for RX: "
+				"out %d, in %d\n", out, in);
+			r = -EINVAL;
+			goto err;
+		}
+		heads[headcount].id = d;
+		heads[headcount].len = iov_length(vq->iov + seg, in);
+		datalen -= heads[headcount].len;
+		++headcount;
+		seg += in;
+	}
+	*iovcount = seg;
+	return headcount;
+err:
+	vhost_discard_desc(vq, headcount);
+	return r;
+}
+
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
 static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
-	unsigned in, log, s;
+	unsigned uninitialized_var(in), log;
 	struct vhost_log *vq_log;
 	struct msghdr msg = {
 		.msg_name = NULL,
@@ -238,9 +290,10 @@ static void handle_rx(struct vhost_net *net)
 		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
 	};
 
-	size_t len, total_len = 0;
-	int err, headcount, datalen;
-	size_t vhost_hlen;
+	size_t total_len = 0;
+	int err, headcount;
+	size_t vhost_hlen, sock_hlen;
+	size_t vhost_len, sock_len;
 	struct socket *sock = rcu_dereference(vq->private_data);
 	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
 		return;
@@ -249,14 +302,16 @@ static void handle_rx(struct vhost_net *net)
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
 	vhost_hlen = vq->vhost_hlen;
+	sock_hlen = vq->sock_hlen;
 
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
 
-	while ((datalen = vhost_head_len(vq, sock->sk))) {
-		headcount = vhost_get_desc_n(vq, vq->heads,
-					     datalen + vhost_hlen,
-					     &in, vq_log, &log);
+	while ((sock_len = peek_head_len(vq, sock->sk))) {
+		sock_len += sock_hlen;
+		vhost_len = sock_len + vhost_hlen;
+		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
+					&in, vq_log, &log);
 		if (headcount < 0)
 			break;
 		/* OK, now we need to know about added descriptors. */
@@ -272,34 +327,25 @@ static void handle_rx(struct vhost_net *net)
 			break;
 		}
 		/* We don't need to be notified again. */
-		if (vhost_hlen)
+		if (unlikely((vhost_hlen)))
 			/* Skip header. TODO: support TSO. */
-			s = move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
+			move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
 		else
-			s = copy_iovec_hdr(vq->iov, vq->hdr, vq->sock_hlen, in);
+			copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
 		msg.msg_iovlen = in;
-		len = iov_length(vq->iov, in);
-		/* Sanity check */
-		if (!len) {
-			vq_err(vq, "Unexpected header len for RX: "
-			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), vhost_hlen);
-			break;
-		}
 		err = sock->ops->recvmsg(NULL, sock, &msg,
-					 len, MSG_DONTWAIT | MSG_TRUNC);
+					 sock_len, MSG_DONTWAIT | MSG_TRUNC);
 		/* TODO: Check specific error and bomb out unless EAGAIN? */
 		if (err < 0) {
 			vhost_discard_desc(vq, headcount);
 			break;
 		}
-		if (err != datalen) {
+		if (err != sock_len) {
 			pr_err("Discarded rx packet: "
-			       " len %d, expected %zd\n", err, datalen);
+			       " len %d, expected %zd\n", err, sock_len);
 			vhost_discard_desc(vq, headcount);
 			continue;
 		}
-		len = err;
 		if (vhost_hlen &&
 		    memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
 				      vhost_hlen)) {
@@ -311,17 +357,16 @@ static void handle_rx(struct vhost_net *net)
 		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
 		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
 				      offsetof(typeof(hdr), num_buffers),
-				      sizeof(hdr.num_buffers))) {
+				      sizeof hdr.num_buffers)) {
 			vq_err(vq, "Failed num_buffers write");
 			vhost_discard_desc(vq, headcount);
 			break;
 		}
-		len += vhost_hlen;
 		vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
 					    headcount);
 		if (unlikely(vq_log))
-			vhost_log_write(vq, vq_log, log, len);
-		total_len += len;
+			vhost_log_write(vq, vq_log, log, vhost_len);
+		total_len += vhost_len;
 		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 			vhost_poll_queue(&vq->poll);
 			break;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 8ef5e3f..4b49991 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -862,53 +862,6 @@ static unsigned get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 	return 0;
 }
 
-/* This is a multi-buffer version of vhost_get_desc
- * @vq		- the relevant virtqueue
- * datalen	- data length we'll be reading
- * @iovcount	- returned count of io vectors we fill
- * @log		- vhost log
- * @log_num	- log offset
- *	returns number of buffer heads allocated, negative on error
- */
-int vhost_get_desc_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
-		     int datalen, unsigned *iovcount, struct vhost_log *log,
-		     unsigned int *log_num)
-{
-	unsigned int out, in;
-	int seg = 0;
-	int headcount = 0;
-	int r;
-
-	while (datalen > 0) {
-		if (headcount >= VHOST_NET_MAX_SG) {
-			r = -ENOBUFS;
-			goto err;
-		}
-		heads[headcount].id = vhost_get_desc(vq->dev, vq, vq->iov + seg,
-					      ARRAY_SIZE(vq->iov) - seg, &out,
-					      &in, log, log_num);
-		if (heads[headcount].id == vq->num) {
-			r = 0;
-			goto err;
-		}
-		if (out || in <= 0) {
-			vq_err(vq, "unexpected descriptor format for RX: "
-				"out %d, in %d\n", out, in);
-			r = -EINVAL;
-			goto err;
-		}
-		heads[headcount].len = iov_length(vq->iov + seg, in);
-		datalen -= heads[headcount].len;
-		++headcount;
-		seg += in;
-	}
-	*iovcount = seg;
-	return headcount;
-err:
-	vhost_discard_desc(vq, headcount);
-	return r;
-}
-
 /* This looks in the virtqueue and for the first available buffer, and converts
  * it to an iovec for convenient access.  Since descriptors consist of some
  * number of output then some number of input descriptors, it's actually two
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 08d740a..4c9809e 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -122,9 +122,6 @@ long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg);
 int vhost_vq_access_ok(struct vhost_virtqueue *vq);
 int vhost_log_access_ok(struct vhost_dev *);
 
-int vhost_get_desc_n(struct vhost_virtqueue *, struct vring_used_elem *heads,
-		     int datalen, unsigned int *iovcount, struct vhost_log *log,
-		     unsigned int *log_num);
 unsigned vhost_get_desc(struct vhost_dev *, struct vhost_virtqueue *,
 			   struct iovec iov[], unsigned int iov_count,
 			   unsigned int *out_num, unsigned int *in_num,

-- 
MST

^ permalink raw reply related

* Re: linux-next: manual merge of the rr tree with the net tree
From: Rusty Russell @ 2010-04-29 12:53 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Stephen Rothwell, linux-next, linux-kernel, David Miller, netdev
In-Reply-To: <20100428155457.GB7467@redhat.com>

On Thu, 29 Apr 2010 01:24:57 am Michael S. Tsirkin wrote:
> On Tue, Apr 27, 2010 at 07:09:13AM +0300, Michael S. Tsirkin wrote:
> > On Tue, Apr 27, 2010 at 11:58:52AM +1000, Stephen Rothwell wrote:
> > > Hi Rusty,
> > > 
> > > Today's linux-next merge of the rr tree got a conflict in
> > > drivers/net/virtio_net.c between commit
> > > 5e01d2f91df62be4d6f282149bc2a8858992ceca ("virtio-net: move sg off
> > > stack") from the net tree and commit
> > > 7f62a724a65f864d84f50857bbfd36c240155c8f ("virtio_net: use virtqueue_xxx
> > > wrappers") from the rr tree.
> > > 
> > > I fixed it up (see below) and can carry the fix as necessary.
> > 
> > Hmm, Rusty, do you intend for the patches to go through netdev this
> > time? If you do, it might be simplest to just ask Dave to merge
> > them in net-next-2.6 now. I can prepare and send them if you like.

Naah, they hit too many random virtio drivers.  It's pretty simple to keep
them separate, and once the new API is in place I'll send to Dave to merge.
It could take two cycles, but there's no huge rush.

Cheers,
Rusty.

^ permalink raw reply

* Re: [PATCH] ARM: dmabounce: fix partial sync in dma_sync_single_* API
From: Russell King - ARM Linux @ 2010-04-29 11:34 UTC (permalink / raw)
  To: FUJITA Tomonori; +Cc: netdev, davem, linux-arm-kernel, linux-kernel
In-Reply-To: <20100428075137J.fujita.tomonori@lab.ntt.co.jp>

> Ping?
> 
> Is this going to be merged via the arm tree?

Ping all you want; I've only just got back online after a week of no
internet connectivity thanks to the carrier being unable to fix the
problem (the faulty line is still faulty.)

I'm now going to be catching up over the course of the next week, so
please be patient.

^ permalink raw reply

* Re: [patch] iwl: cleanup: remove unneeded error handling
From: Dan Carpenter @ 2010-04-29  9:29 UTC (permalink / raw)
  To: Zhu Yi
  Cc: Ortiz, Samuel, Intel Linux Wireless, John W. Linville,
	Andrew Morton, Alexey Dobriyan, linux-wireless@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	kernel-janitors@vger.kernel.org
In-Reply-To: <1272507134.14773.4339.camel@debian>

On Thu, Apr 29, 2010 at 10:12:14AM +0800, Zhu Yi wrote:
> 
> Thanks. But looks like you missed the one in if_sdio_debugfs_init().
> 

Yup.

> I don't think we even need to check -ENODEV ourselves because if
> DEBUG_FS is not compiled in, all the debugfs utility functions will
> become no-op.
> 

Good point.

> Signed-off-by: Dan Carpenter <error27@gmail.com>

Probably this could get changed to just "Reported-by" but I'm happy to
take credit for other peoples work.  :)

> Signed-off-by: Zhu Yi <yi.zhu@intel.com>
> 

Acked-by: Dan Carpenter <error27@gmail.com>

regards,
dan carpenter

^ permalink raw reply

* Re: [PATCH 0/3] [RFC] ptp: IEEE 1588 clock support
From: Wolfgang Grandegger @ 2010-04-29  8:08 UTC (permalink / raw)
  To: Richard Cochran; +Cc: netdev-u79uwXL29TY76Z2rM5mHXA, devicetree-discuss
In-Reply-To: <20100429065422.GA5803-7KxsofuKt4IfAd9E5cN8NEzG7cXyKsk/@public.gmane.org>

Richard Cochran wrote:
> On Wed, Apr 28, 2010 at 04:31:35PM +0200, Wolfgang Grandegger wrote:
>> That's because some 1588_PPS related bits are not yet setup in the
>> platform code of mainline kernel.
> 
> So did you get it working?

Yes, it works now :-). With

  master: ptpd -b eth1 -y 0 -a 3,12 -p
  slave : ptpd -b eth1 -y 0 -a 3,12

I see a PPS jitter of approximately +-100ns on the oscilloscopes. That's
the same value I get with the Freescale's old PTP 1588 implementation.

> I am reworking this patch set to post again, but perhaps you might
> take a look at the patch below. It configures the gianfar PTP clock

I will comment on your previous patches later today.

> parameters via the device tree.
> 
> Richard
> 
> [PATCH] ptp: gianfar clock uses device tree parameters
> Signed-off-by: Richard Cochran <richard.cochran-3mrvs1K0uXizZXS1Dc/lvw@public.gmane.org>
> ---
>  arch/powerpc/boot/dts/mpc8313erdb.dts |   14 +++++
>  arch/powerpc/boot/dts/p2020ds.dts     |   13 ++++
>  arch/powerpc/boot/dts/p2020rdb.dts    |   14 +++++
>  drivers/net/gianfar_ptp.c             |  102 ++++++++++++++++++++++----------
>  4 files changed, 111 insertions(+), 32 deletions(-)
> 
> diff --git a/arch/powerpc/boot/dts/mpc8313erdb.dts b/arch/powerpc/boot/dts/mpc8313erdb.dts
> index 183f2aa..b760aee 100644
> --- a/arch/powerpc/boot/dts/mpc8313erdb.dts
> +++ b/arch/powerpc/boot/dts/mpc8313erdb.dts
> @@ -208,6 +208,20 @@
>  			sleep = <&pmc 0x00300000>;
>  		};
>  
> +		ptp_clock@24E00 {
> +			device_type = "ptp_clock";
> +			model = "eTSEC";
> +			reg = <0x24E00 0xB0>;
> +			interrupts = <0x0C 2 0x0D 2>;

The interrupt number is usually specified in decimal notation.

> +			interrupt-parent = < &ipic >;
> +			tclk_period = <10>;
> +			tmr_prsc    = <100>;
> +			tmr_add     = <0x999999A4>;
> +			cksel       = <0x1>;
> +			tmr_fiper1  = <0x3B9AC9F6>;
> +			tmr_fiper2  = <0x00018696>;
> +		};

You should use the prefix "fsl," for the MPC-specific properties, at
least. A few of the values could be calculated from the clock frequency.
OF people usually prefer human readable values, if feasible, e.g.

        ptp-clock-source = "sys";
        ptp-clock-source = "ext";
	ptp-clock-frequency = "100000000";

Not sure it that works for other PTP hardware but it would be nice to
have a generic set of properties. I have added a CC to the device-tree
discuss mailing for further feedback.

> +
>  		enet0: ethernet@24000 {
>  			#address-cells = <1>;
>  			#size-cells = <1>;
> diff --git a/arch/powerpc/boot/dts/p2020ds.dts b/arch/powerpc/boot/dts/p2020ds.dts
> index 1101914..1dcf790 100644
> --- a/arch/powerpc/boot/dts/p2020ds.dts
> +++ b/arch/powerpc/boot/dts/p2020ds.dts
> @@ -336,6 +336,19 @@
>  			phy_type = "ulpi";
>  		};
>  
> +		ptp_clock@24E00 {
> +			device_type = "ptp_clock";
> +			model = "eTSEC";
> +			reg = <0x24E00 0xB0>;
> +			interrupts = <0x0C 2 0x0D 2>;
> +			interrupt-parent = < &mpic >;
> +			tclk_period = <5>;
> +			tmr_prsc = <200>;
> +			tmr_add = <0xCCCCCCCD>;
> +			tmr_fiper1 = <0x3B9AC9FB>;
> +			tmr_fiper2 = <0x0001869B>;
> +		};
> +
>  		enet0: ethernet@24000 {
>  			#address-cells = <1>;
>  			#size-cells = <1>;
> diff --git a/arch/powerpc/boot/dts/p2020rdb.dts b/arch/powerpc/boot/dts/p2020rdb.dts
> index da4cb0d..ba61e8e 100644
> --- a/arch/powerpc/boot/dts/p2020rdb.dts
> +++ b/arch/powerpc/boot/dts/p2020rdb.dts
> @@ -396,6 +396,20 @@
>  			phy_type = "ulpi";
>  		};
>  
> +		ptp_clock@24E00 {
> +			device_type = "ptp_clock";
> +			model = "eTSEC";
> +			reg = <0x24E00 0xB0>;
> +			interrupts = <0x0C 2 0x0D 2>;
> +			interrupt-parent = < &mpic >;
> +			tclk_period = <5>;
> +			tmr_prsc = <200>;
> +			tmr_add = <0xCCCCCCCD>;
> +			cksel = <1>;
> +			tmr_fiper1 = <0x3B9AC9FB>;
> +			tmr_fiper2 = <0x0001869B>;
> +		};
> +
>  		enet0: ethernet@24000 {
>  			#address-cells = <1>;
>  			#size-cells = <1>;
> diff --git a/drivers/net/gianfar_ptp.c b/drivers/net/gianfar_ptp.c
> index eed3246..ed6234c 100644
> --- a/drivers/net/gianfar_ptp.c
> +++ b/drivers/net/gianfar_ptp.c
> @@ -22,6 +22,8 @@
>  #include <linux/init.h>
>  #include <linux/kernel.h>
>  #include <linux/module.h>
> +#include <linux/of.h>
> +#include <linux/of_platform.h>
>  #include <linux/timex.h>
>  #include <asm/io.h>
>  
> @@ -29,29 +31,16 @@
>  
>  #include "gianfar_ptp_reg.h"
>  
> -/*
> - *
> - * TODO - get the following from device tree
> - *
> - */
> -#define TMR_BASE_KERNEL	0xe0024e00 // CONFIG_PPC_85xx 0xffe24e00
> -#define TIMER_OSC	166666666
> -#define TCLK_PERIOD	10
> -#define NOMINAL_FREQ	100000000
> -#define DEF_TMR_PRSC	100
> -#define DEF_TMR_ADD	0x999999A4
> -#define DEFAULT_CKSEL   1
> -
>  #define REG_SIZE	(4 + TMR_ETTS2_L)
>  
>  struct etsects {
>  	void *regs;
> -	u32 timer_osc;    /* Hz */
>  	u32 tclk_period;  /* nanoseconds */
> -	s64 nominal_freq; /* Hz */
>  	u32 tmr_prsc;
>  	u32 tmr_add;
>  	u32 cksel;
> +	u32 tmr_fiper1;
> +	u32 tmr_fiper2;
>  };
>  
>  /* Private globals */
> @@ -111,8 +100,8 @@ static void set_fipers(struct etsects *etsects)
>  
>  	reg_write(etsects, TMR_CTRL,   tmr_ctrl & (~TE));
>  	reg_write(etsects, TMR_PRSC,   etsects->tmr_prsc);
> -	reg_write(etsects, TMR_FIPER1, 0x3B9AC9F6);
> -	reg_write(etsects, TMR_FIPER2, 0x00018696);
> +	reg_write(etsects, TMR_FIPER1, etsects->tmr_fiper1);
> +	reg_write(etsects, TMR_FIPER2, etsects->tmr_fiper2);
>  	set_alarm(etsects);
>  	reg_write(etsects, TMR_CTRL,   tmr_ctrl|TE);
>  }
> @@ -213,34 +202,51 @@ struct ptp_clock_info ptp_gianfar_caps = {
>  	.enable		= ptp_gianfar_enable,
>  };
>  
> -/* module operations */
> +/* OF device tree */
>  
> -static void __exit ptp_gianfar_exit(void)
> +static int get_of_u32(struct device_node *node, char *str, u32 *val)
>  {
> -	ptp_clock_unregister(&ptp_gianfar_caps);
> -	iounmap(the_clock.regs);
> +	int plen;
> +	const u32 *prop = of_get_property(node, str, &plen);
> +
> +	if (!prop || plen != sizeof(*prop))
> +		return -1;
> +	*val = *prop;
> +	return 0;
>  }
>  
> -static int __init ptp_gianfar_init(void)
> +static int gianfar_ptp_probe(struct of_device* dev,
> +			     const struct of_device_id *match)
>  {
> +	u64 addr, size;
> +	struct device_node *node = dev->node;
>  	struct etsects *etsects = &the_clock;
>  	struct timespec now;
> -	phys_addr_t reg_addr = TMR_BASE_KERNEL;
> -	unsigned long reg_size = REG_SIZE;
> +	phys_addr_t reg_addr;
> +	unsigned long reg_size;
>  	u32 tmr_ctrl;
>  	int err;
>  
> +	if (get_of_u32(node, "tclk_period", &etsects->tclk_period) ||
> +	    get_of_u32(node, "tmr_prsc", &etsects->tmr_prsc) ||
> +	    get_of_u32(node, "tmr_add", &etsects->tmr_add) ||
> +	    get_of_u32(node, "cksel", &etsects->cksel) ||
> +	    get_of_u32(node, "tmr_fiper1", &etsects->tmr_fiper1) ||
> +	    get_of_u32(node, "tmr_fiper2", &etsects->tmr_fiper2))
> +		return -ENODEV;
> +
> +	addr = of_translate_address(node, of_get_address(node, 0, &size, NULL));
> +	reg_addr = addr;
> +	reg_size = size;
> +	if (reg_size < REG_SIZE) {
> +		pr_warning("device tree reg range %lu too small\n", reg_size);
> +		reg_size = REG_SIZE;
> +	}
>  	etsects->regs = ioremap(reg_addr, reg_size);
>  	if (!etsects->regs) {
>  		pr_err("ioremap ptp registers failed\n");
>  		return -EINVAL;
>  	}
> -	etsects->timer_osc = TIMER_OSC;
> -	etsects->tclk_period = TCLK_PERIOD;
> -	etsects->nominal_freq = NOMINAL_FREQ;
> -	etsects->tmr_prsc = DEF_TMR_PRSC;
> -	etsects->tmr_add = DEF_TMR_ADD;
> -	etsects->cksel = DEFAULT_CKSEL;
>  
>  	tmr_ctrl =
>  	  (etsects->tclk_period & TCLK_PERIOD_MASK) << TCLK_PERIOD_SHIFT |
> @@ -252,8 +258,8 @@ static int __init ptp_gianfar_init(void)
>  	reg_write(etsects, TMR_CTRL,   tmr_ctrl);
>  	reg_write(etsects, TMR_ADD,    etsects->tmr_add);
>  	reg_write(etsects, TMR_PRSC,   etsects->tmr_prsc);
> -	reg_write(etsects, TMR_FIPER1, 0x3B9AC9F6);
> -	reg_write(etsects, TMR_FIPER2, 0x00018696);
> +	reg_write(etsects, TMR_FIPER1, etsects->tmr_fiper1);
> +	reg_write(etsects, TMR_FIPER2, etsects->tmr_fiper2);
>  	set_alarm(etsects);
>  	reg_write(etsects, TMR_CTRL,   tmr_ctrl|FS|RTPE|TE);
>  
> @@ -261,6 +267,38 @@ static int __init ptp_gianfar_init(void)
>  	return err;
>  }
>  
> +static int gianfar_ptp_remove(struct of_device* dev)
> +{
> +	ptp_clock_unregister(&ptp_gianfar_caps);
> +	iounmap(the_clock.regs);
> +	return 0;
> +}
> +
> +static struct of_device_id match_table[] = {
> +	{ .type = "ptp_clock" },
> +	{},
> +};
> +
> +static struct of_platform_driver gianfar_ptp_driver = {
> +	.name        = "gianfar_ptp",
> +	.match_table = match_table,
> +	.owner       = THIS_MODULE,
> +	.probe       = gianfar_ptp_probe,
> +	.remove      = gianfar_ptp_remove,
> +};
> +
> +/* module operations */
> +
> +static void __exit ptp_gianfar_exit(void)
> +{
> +	of_unregister_platform_driver(&gianfar_ptp_driver);
> +}
> +
> +static int __init ptp_gianfar_init(void)
> +{
> +	return of_register_platform_driver(&gianfar_ptp_driver);
> +}
> +
>  subsys_initcall(ptp_gianfar_init);
>  module_exit(ptp_gianfar_exit);

Wolfgang.

^ permalink raw reply

* Re: linux-next: manual merge of the tty tree with the net tree
From: Greg KH @ 2010-04-29  6:13 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: linux-next, linux-kernel, Sjur Braendeland, David Miller, netdev,
	Alan Cox
In-Reply-To: <20100429155655.479e0982.sfr@canb.auug.org.au>

On Thu, Apr 29, 2010 at 03:56:55PM +1000, Stephen Rothwell wrote:
> Hi Greg,
> 
> Today's linux-next merge of the tty tree got a conflict in
> include/linux/tty.h between commit
> 9b27105b4a44c54bf91ecd7d0315034ae75684f7 ("net-caif-driver: add CAIF
> serial driver (ldisc)") from the net tree and commit
> 39658c58ebb52fe7a3488fd520aa89c75721fd20 ("tty: n_gsm line discipline")
> from the tty tree.
> 
> I used number 21 for the n_gsm line discipline (see below) and 30 for
> NR_LDISCS and can carry this fix as necessary.

That fix looks fine, thanks,

greg k-h

^ permalink raw reply

* linux-next: manual merge of the tty tree with the net tree
From: Stephen Rothwell @ 2010-04-29  5:56 UTC (permalink / raw)
  To: Greg KH
  Cc: linux-next, linux-kernel, Sjur Braendeland, David Miller, netdev,
	Alan Cox

Hi Greg,

Today's linux-next merge of the tty tree got a conflict in
include/linux/tty.h between commit
9b27105b4a44c54bf91ecd7d0315034ae75684f7 ("net-caif-driver: add CAIF
serial driver (ldisc)") from the net tree and commit
39658c58ebb52fe7a3488fd520aa89c75721fd20 ("tty: n_gsm line discipline")
from the tty tree.

I used number 21 for the n_gsm line discipline (see below) and 30 for
NR_LDISCS and can carry this fix as necessary.
-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au

diff --cc include/linux/tty.h
index bb44fa9,bd5acb7..0000000
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@@ -47,7 -47,8 +47,8 @@@
  #define N_SLCAN		17	/* Serial / USB serial CAN Adaptors */
  #define N_PPS		18	/* Pulse per Second */
  #define N_V253		19	/* Codec control over voice modem */
 -#define N_GSM0710	20	/* GSM 0710 Mux */
 -
 +#define N_CAIF		20      /* CAIF protocol for talking to modems */
++#define N_GSM0710	21	/* GSM 0710 Mux */
  
  /*
   * This character is the same as _POSIX_VDISABLE: it cannot be used as

^ permalink raw reply

* Re: [patch] iwl: cleanup: remove unneeded error handling
From: Zhu Yi @ 2010-04-29  2:12 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Ortiz, Samuel, Intel Linux Wireless, John W. Linville,
	Andrew Morton, Alexey Dobriyan, linux-wireless@vger.kernel.org,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	kernel-janitors@vger.kernel.org
In-Reply-To: <20100428090115.GF29093@bicker>

On Wed, 2010-04-28 at 17:01 +0800, Dan Carpenter wrote:
> This is just a cleanup and doesn't change how the code works.
> 
> debugfs_create_dir() and debugfs_create_file() return an error pointer 
> (-ENODEV) if CONFIG_DEBUG_FS is not enabled, otherwise if an error occurs
> they return NULL.  This is how they are implemented and what it says in 
> the DebugFS documentation.  DebugFS can not be compiled as a module.  
> 
> As a result, we only need to check for error pointers and particularly 
> -ENODEV one time to know that DebugFS is enabled.  This patch keeps the 
> first check for error pointers and removes the rest. 
> 
> The other reason for this patch, is that it silences some Smatch warnings.
> Smatch sees the condition "(result != -ENODEV)" and assumes that it's 
> possible for "result" to equal -ENODEV.  If it were possible it would lead
> to an error pointer dereference.  But since it's not, we can just remove
> the check.
> 
> Signed-off-by: Dan Carpenter <error27@gmail.com>

Thanks. But looks like you missed the one in if_sdio_debugfs_init().

I don't think we even need to check -ENODEV ourselves because if
DEBUG_FS is not compiled in, all the debugfs utility functions will
become no-op.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Zhu Yi <yi.zhu@intel.com>

diff --git a/drivers/net/wireless/iwmc3200wifi/bus.h b/drivers/net/wireless/iwmc3200wifi/bus.h
index 836663e..62edd58 100644
--- a/drivers/net/wireless/iwmc3200wifi/bus.h
+++ b/drivers/net/wireless/iwmc3200wifi/bus.h
@@ -31,7 +31,7 @@ struct iwm_if_ops {
 	int (*disable)(struct iwm_priv *iwm);
 	int (*send_chunk)(struct iwm_priv *iwm, u8* buf, int count);
 
-	int (*debugfs_init)(struct iwm_priv *iwm, struct dentry *parent_dir);
+	void (*debugfs_init)(struct iwm_priv *iwm, struct dentry *parent_dir);
 	void (*debugfs_exit)(struct iwm_priv *iwm);
 
 	const char *umac_name;
diff --git a/drivers/net/wireless/iwmc3200wifi/debug.h b/drivers/net/wireless/iwmc3200wifi/debug.h
index e35c9b6..f98bf12 100644
--- a/drivers/net/wireless/iwmc3200wifi/debug.h
+++ b/drivers/net/wireless/iwmc3200wifi/debug.h
@@ -113,10 +113,10 @@ struct iwm_debugfs {
 };
 
 #ifdef CONFIG_IWM_DEBUG
-int iwm_debugfs_init(struct iwm_priv *iwm);
+void iwm_debugfs_init(struct iwm_priv *iwm);
 void iwm_debugfs_exit(struct iwm_priv *iwm);
 #else
-static inline int iwm_debugfs_init(struct iwm_priv *iwm)
+static inline void iwm_debugfs_init(struct iwm_priv *iwm)
 {
 	return 0;
 }
diff --git a/drivers/net/wireless/iwmc3200wifi/debugfs.c b/drivers/net/wireless/iwmc3200wifi/debugfs.c
index 7244413..53b0b77 100644
--- a/drivers/net/wireless/iwmc3200wifi/debugfs.c
+++ b/drivers/net/wireless/iwmc3200wifi/debugfs.c
@@ -48,12 +48,11 @@ static struct {
 
 #define add_dbg_module(dbg, name, id, initlevel) 	\
 do {							\
-	struct dentry *d;				\
 	dbg.dbg_module[id] = (initlevel);		\
-	d = debugfs_create_x8(name, 0600, dbg.dbgdir,	\
-			     &(dbg.dbg_module[id]));	\
-	if (!IS_ERR(d))					\
-		dbg.dbg_module_dentries[id] = d;        \
+	dbg.dbg_module_dentries[id] =			\
+		debugfs_create_x8(name, 0600,		\
+				dbg.dbgdir,		\
+				&(dbg.dbg_module[id]));	\
 } while (0)
 
 static int iwm_debugfs_u32_read(void *data, u64 *val)
@@ -423,89 +422,29 @@ static const struct file_operations iwm_debugfs_fw_err_fops = {
 	.read =		iwm_debugfs_fw_err_read,
 };
 
-int iwm_debugfs_init(struct iwm_priv *iwm)
+void iwm_debugfs_init(struct iwm_priv *iwm)
 {
-	int i, result;
-	char devdir[16];
+	int i;
 
 	iwm->dbg.rootdir = debugfs_create_dir(KBUILD_MODNAME, NULL);
-	result = PTR_ERR(iwm->dbg.rootdir);
-	if (!result || IS_ERR(iwm->dbg.rootdir)) {
-		if (result == -ENODEV) {
-			IWM_ERR(iwm, "DebugFS (CONFIG_DEBUG_FS) not "
-				"enabled in kernel config\n");
-			result = 0;	/* No debugfs support */
-		}
-		IWM_ERR(iwm, "Couldn't create rootdir: %d\n", result);
-		goto error;
-	}
-
-	snprintf(devdir, sizeof(devdir), "%s", wiphy_name(iwm_to_wiphy(iwm)));
-
-	iwm->dbg.devdir = debugfs_create_dir(devdir, iwm->dbg.rootdir);
-	result = PTR_ERR(iwm->dbg.devdir);
-	if (IS_ERR(iwm->dbg.devdir) && (result != -ENODEV)) {
-		IWM_ERR(iwm, "Couldn't create devdir: %d\n", result);
-		goto error;
-	}
-
+	iwm->dbg.devdir = debugfs_create_dir(wiphy_name(iwm_to_wiphy(iwm)),
+					     iwm->dbg.rootdir);
 	iwm->dbg.dbgdir = debugfs_create_dir("debug", iwm->dbg.devdir);
-	result = PTR_ERR(iwm->dbg.dbgdir);
-	if (IS_ERR(iwm->dbg.dbgdir) && (result != -ENODEV)) {
-		IWM_ERR(iwm, "Couldn't create dbgdir: %d\n", result);
-		goto error;
-	}
-
 	iwm->dbg.rxdir = debugfs_create_dir("rx", iwm->dbg.devdir);
-	result = PTR_ERR(iwm->dbg.rxdir);
-	if (IS_ERR(iwm->dbg.rxdir) && (result != -ENODEV)) {
-		IWM_ERR(iwm, "Couldn't create rx dir: %d\n", result);
-		goto error;
-	}
-
 	iwm->dbg.txdir = debugfs_create_dir("tx", iwm->dbg.devdir);
-	result = PTR_ERR(iwm->dbg.txdir);
-	if (IS_ERR(iwm->dbg.txdir) && (result != -ENODEV)) {
-		IWM_ERR(iwm, "Couldn't create tx dir: %d\n", result);
-		goto error;
-	}
-
 	iwm->dbg.busdir = debugfs_create_dir("bus", iwm->dbg.devdir);
-	result = PTR_ERR(iwm->dbg.busdir);
-	if (IS_ERR(iwm->dbg.busdir) && (result != -ENODEV)) {
-		IWM_ERR(iwm, "Couldn't create bus dir: %d\n", result);
-		goto error;
-	}
-
-	if (iwm->bus_ops->debugfs_init) {
-		result = iwm->bus_ops->debugfs_init(iwm, iwm->dbg.busdir);
-		if (result < 0) {
-			IWM_ERR(iwm, "Couldn't create bus entry: %d\n", result);
-			goto error;
-		}
-	}
-
+	if (iwm->bus_ops->debugfs_init)
+		iwm->bus_ops->debugfs_init(iwm, iwm->dbg.busdir);
 
 	iwm->dbg.dbg_level = IWM_DL_NONE;
 	iwm->dbg.dbg_level_dentry =
 		debugfs_create_file("level", 0200, iwm->dbg.dbgdir, iwm,
 				    &fops_iwm_dbg_level);
-	result = PTR_ERR(iwm->dbg.dbg_level_dentry);
-	if (IS_ERR(iwm->dbg.dbg_level_dentry) && (result != -ENODEV)) {
-		IWM_ERR(iwm, "Couldn't create dbg_level: %d\n", result);
-		goto error;
-	}
-
 
 	iwm->dbg.dbg_modules = IWM_DM_DEFAULT;
 	iwm->dbg.dbg_modules_dentry =
 		debugfs_create_file("modules", 0200, iwm->dbg.dbgdir, iwm,
 				    &fops_iwm_dbg_modules);
-	result = PTR_ERR(iwm->dbg.dbg_modules_dentry);
-	if (IS_ERR(iwm->dbg.dbg_modules_dentry) && (result != -ENODEV)) {
-		IWM_ERR(iwm, "Couldn't create dbg_modules: %d\n", result);
-		goto error;
-	}
 
 	for (i = 0; i < __IWM_DM_NR; i++)
 		add_dbg_module(iwm->dbg, iwm_debug_module[i].name,
@@ -514,44 +453,15 @@ int iwm_debugfs_init(struct iwm_priv *iwm)
 	iwm->dbg.txq_dentry = debugfs_create_file("queues", 0200,
 						  iwm->dbg.txdir, iwm,
 						  &iwm_debugfs_txq_fops);
-	result = PTR_ERR(iwm->dbg.txq_dentry);
-	if (IS_ERR(iwm->dbg.txq_dentry) && (result != -ENODEV)) {
-		IWM_ERR(iwm, "Couldn't create tx queue: %d\n", result);
-		goto error;
-	}
-
 	iwm->dbg.tx_credit_dentry = debugfs_create_file("credits", 0200,
 						   iwm->dbg.txdir, iwm,
 						   &iwm_debugfs_tx_credit_fops);
-	result = PTR_ERR(iwm->dbg.tx_credit_dentry);
-	if (IS_ERR(iwm->dbg.tx_credit_dentry) && (result != -ENODEV)) {
-		IWM_ERR(iwm, "Couldn't create tx credit: %d\n", result);
-		goto error;
-	}
-
 	iwm->dbg.rx_ticket_dentry = debugfs_create_file("tickets", 0200,
 						  iwm->dbg.rxdir, iwm,
 						  &iwm_debugfs_rx_ticket_fops);
-	result = PTR_ERR(iwm->dbg.rx_ticket_dentry);
-	if (IS_ERR(iwm->dbg.rx_ticket_dentry) && (result != -ENODEV)) {
-		IWM_ERR(iwm, "Couldn't create rx ticket: %d\n", result);
-		goto error;
-	}
-
 	iwm->dbg.fw_err_dentry = debugfs_create_file("last_fw_err", 0200,
 						     iwm->dbg.dbgdir, iwm,
 						     &iwm_debugfs_fw_err_fops);
-	result = PTR_ERR(iwm->dbg.fw_err_dentry);
-	if (IS_ERR(iwm->dbg.fw_err_dentry) && (result != -ENODEV)) {
-		IWM_ERR(iwm, "Couldn't create last FW err: %d\n", result);
-		goto error;
-	}
-
-
-	return 0;
-
- error:
-	return result;
 }
 
 void iwm_debugfs_exit(struct iwm_priv *iwm)
diff --git a/drivers/net/wireless/iwmc3200wifi/sdio.c b/drivers/net/wireless/iwmc3200wifi/sdio.c
index 1eafd6d..1acea37 100644
--- a/drivers/net/wireless/iwmc3200wifi/sdio.c
+++ b/drivers/net/wireless/iwmc3200wifi/sdio.c
@@ -366,21 +366,13 @@ static const struct file_operations iwm_debugfs_sdio_fops = {
 	.read =		iwm_debugfs_sdio_read,
 };
 
-static int if_sdio_debugfs_init(struct iwm_priv *iwm, struct dentry *parent_dir)
+static void if_sdio_debugfs_init(struct iwm_priv *iwm, struct dentry *parent_dir)
 {
-	int result;
 	struct iwm_sdio_priv *hw = iwm_to_if_sdio(iwm);
 
 	hw->cccr_dentry = debugfs_create_file("cccr", 0200,
 					      parent_dir, iwm,
 					      &iwm_debugfs_sdio_fops);
-	result = PTR_ERR(hw->cccr_dentry);
-	if (IS_ERR(hw->cccr_dentry) && (result != -ENODEV)) {
-		IWM_ERR(iwm, "Couldn't create CCCR entry: %d\n", result);
-		return result;
-	}
-
-	return 0;
 }
 
 static void if_sdio_debugfs_exit(struct iwm_priv *iwm)
@@ -440,11 +432,7 @@ static int iwm_sdio_probe(struct sdio_func *func,
 	hw = iwm_private(iwm);
 	hw->iwm = iwm;
 
-	ret = iwm_debugfs_init(iwm);
-	if (ret < 0) {
-		IWM_ERR(iwm, "Debugfs registration failed\n");
-		goto if_free;
-	}
+	iwm_debugfs_init(iwm);
 
 	sdio_set_drvdata(func, hw);
 
@@ -473,7 +461,6 @@ static int iwm_sdio_probe(struct sdio_func *func,
 	destroy_workqueue(hw->isr_wq);
  debugfs_exit:
 	iwm_debugfs_exit(iwm);
- if_free:
 	iwm_if_free(iwm);
 	return ret;
 }

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox