Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
From: Michael S. Tsirkin @ 2010-05-31 15:22 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml,
	kvm@vger.kernel.org, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen
In-Reply-To: <4C02C961.9050606@kernel.org>

On Sun, May 30, 2010 at 10:24:01PM +0200, Tejun Heo wrote:
> Replace vhost_workqueue with per-vhost kthread.  Other than callback
> argument change from struct work_struct * to struct vhost_poll *,
> there's no visible change to vhost_poll_*() interface.

I would prefer a substructure vhost_work, even just to make
the code easier to review and compare to workqueue.c.

> This conversion is to make each vhost use a dedicated kthread so that
> resource control via cgroup can be applied.
> 
> Partially based on Sridhar Samudrala's patch.
> 
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Sridhar Samudrala <samudrala.sridhar@gmail.com>
> ---
> Okay, here is three patch series to convert vhost to use per-vhost
> kthread, add cgroup_attach_task_current_cg() and apply it to the vhost
> kthreads.  The conversion is mostly straight forward although flush is
> slightly tricky.
> 
> The problem is that I have no idea how to test this.

It's a 3 step process:

1. 
Install qemu-kvm under fc13, or build recent one from source,
get it from here:
git://git.kernel.org/pub/scm/virt/kvm/qemu-kvm.git

2. install guest under it:
qemu-img create -f qcow2 disk.qcow2 100G
Now get some image (e.g. fedora 13 in fc13.iso)
and install guest:
qemu-kvm -enable-kvm -m 1G -cdrom fc13.iso -drive file=disk.qcow2


3. set up networking. I usually simply do host to guest 
on a special subnet for testing purposes:

Set up a bridge named mstbr0:

ifconfig mstbr0 down
brctl delbr mstbr0
brctl addbr mstbr0
brctl setfd mstbr0 0
ifconfig mstbr0 11.0.0.1

cat > ifup << EOF
#!/bin/sh -x
/sbin/ifconfig msttap0 0.0.0.0 up
brctl addif mstbr0 msttap0
EOF


qemu-kvm -enable-kvm -m 1G -cdrom fc13.iso -drive file=disk.qcow2
 -net nic,model=virtio,netdev=foo -netdev
tap,id=foo,ifname=msttap0,script=/home/mst/ifup,downscript=no,vhost=on

after you set up the guest, log into it and
ifconfig eth0 11.0.0.2

You should now be able to ping guest to host and back.
Use something like netperf to stress the connection.
Close qemu with kill -9 and unload module to test flushing code.



> Index: work/drivers/vhost/vhost.c
> ===================================================================
> --- work.orig/drivers/vhost/vhost.c
> +++ work/drivers/vhost/vhost.c

...

> @@ -125,10 +139,50 @@ static void vhost_vq_reset(struct vhost_
>  	vq->log_ctx = NULL;
>  }
> 
> +static int vhost_poller(void *data)
> +{
> +	struct vhost_dev *dev = data;
> +	struct vhost_poll *poll;
> +
> +repeat:
> +	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> +
> +	if (kthread_should_stop()) {
> +		__set_current_state(TASK_RUNNING);
> +		return 0;
> +	}
> +
> +	poll = NULL;
> +	spin_lock(&dev->poller_lock);
> +	if (!list_empty(&dev->poll_list)) {
> +		poll = list_first_entry(&dev->poll_list,
> +					struct vhost_poll, node);
> +		list_del_init(&poll->node);
> +	}
> +	spin_unlock(&dev->poller_lock);
> +
> +	if (poll) {
> +		__set_current_state(TASK_RUNNING);
> +		poll->fn(poll);
> +		smp_wmb();	/* paired with rmb in vhost_poll_flush() */
> +		poll->done_seq = poll->queue_seq;
> +		wake_up_all(&poll->done);


This seems to add wakeups on data path, which uses spinlocks etc.
OTOH workqueue.c adds a special barrier
entry which only does a wakeup when needed.
Right?

> +	} else
> +		schedule();
> +
> +	goto repeat;
> +}
> +
>  long vhost_dev_init(struct vhost_dev *dev,
>  		    struct vhost_virtqueue *vqs, int nvqs)
>  {
> +	struct task_struct *poller;
>  	int i;
> +
> +	poller = kthread_create(vhost_poller, dev, "vhost-%d", current->pid);
> +	if (IS_ERR(poller))
> +		return PTR_ERR(poller);
> +
>  	dev->vqs = vqs;
>  	dev->nvqs = nvqs;
>  	mutex_init(&dev->mutex);
> @@ -136,6 +190,9 @@ long vhost_dev_init(struct vhost_dev *de
>  	dev->log_file = NULL;
>  	dev->memory = NULL;
>  	dev->mm = NULL;
> +	spin_lock_init(&dev->poller_lock);
> +	INIT_LIST_HEAD(&dev->poll_list);
> +	dev->poller = poller;
> 
>  	for (i = 0; i < dev->nvqs; ++i) {
>  		dev->vqs[i].dev = dev;
> @@ -143,8 +200,7 @@ long vhost_dev_init(struct vhost_dev *de
>  		vhost_vq_reset(dev, dev->vqs + i);
>  		if (dev->vqs[i].handle_kick)
>  			vhost_poll_init(&dev->vqs[i].poll,
> -					dev->vqs[i].handle_kick,
> -					POLLIN);
> +					dev->vqs[i].handle_kick, POLLIN, dev);
>  	}
>  	return 0;
>  }
> @@ -217,6 +273,8 @@ void vhost_dev_cleanup(struct vhost_dev
>  	if (dev->mm)
>  		mmput(dev->mm);
>  	dev->mm = NULL;
> +
> +	kthread_stop(dev->poller);
>  }
> 
>  static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
> @@ -1113,16 +1171,3 @@ void vhost_disable_notify(struct vhost_v
>  		vq_err(vq, "Failed to enable notification at %p: %d\n",
>  		       &vq->used->flags, r);
>  }
> -
> -int vhost_init(void)
> -{
> -	vhost_workqueue = create_singlethread_workqueue("vhost");
> -	if (!vhost_workqueue)
> -		return -ENOMEM;
> -	return 0;
> -}
> -
> -void vhost_cleanup(void)
> -{
> -	destroy_workqueue(vhost_workqueue);

I note that destroy_workqueue does a flush, kthread_stop
doesn't. Right? Sure we don't need to check nothing is in one of
the lists? Maybe add a BUG_ON?

> -}
> Index: work/drivers/vhost/vhost.h
> ===================================================================
> --- work.orig/drivers/vhost/vhost.h
> +++ work/drivers/vhost/vhost.h
> @@ -5,7 +5,6 @@
>  #include <linux/vhost.h>
>  #include <linux/mm.h>
>  #include <linux/mutex.h>
> -#include <linux/workqueue.h>
>  #include <linux/poll.h>
>  #include <linux/file.h>
>  #include <linux/skbuff.h>
> @@ -20,19 +19,26 @@ enum {
>  	VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2,
>  };
> 
> +struct vhost_poll;
> +typedef void (*vhost_poll_fn_t)(struct vhost_poll *poll);
> +
>  /* Poll a file (eventfd or socket) */
>  /* Note: there's nothing vhost specific about this structure. */
>  struct vhost_poll {
> +	vhost_poll_fn_t		  fn;
>  	poll_table                table;
>  	wait_queue_head_t        *wqh;
>  	wait_queue_t              wait;
> -	/* struct which will handle all actual work. */
> -	struct work_struct        work;
> +	struct list_head	  node;
> +	wait_queue_head_t	  done;
>  	unsigned long		  mask;
> +	struct vhost_dev	 *dev;
> +	int			  queue_seq;
> +	int			  done_seq;
>  };
> 
> -void vhost_poll_init(struct vhost_poll *poll, work_func_t func,
> -		     unsigned long mask);
> +void vhost_poll_init(struct vhost_poll *poll, vhost_poll_fn_t fn,
> +		     unsigned long mask, struct vhost_dev *dev);
>  void vhost_poll_start(struct vhost_poll *poll, struct file *file);
>  void vhost_poll_stop(struct vhost_poll *poll);
>  void vhost_poll_flush(struct vhost_poll *poll);
> @@ -63,7 +69,7 @@ struct vhost_virtqueue {
>  	struct vhost_poll poll;
> 
>  	/* The routine to call when the Guest pings us, or timeout. */
> -	work_func_t handle_kick;
> +	vhost_poll_fn_t handle_kick;
> 
>  	/* Last available index we saw. */
>  	u16 last_avail_idx;
> @@ -86,11 +92,11 @@ struct vhost_virtqueue {
>  	struct iovec hdr[VHOST_NET_MAX_SG];
>  	size_t hdr_size;
>  	/* We use a kind of RCU to access private pointer.
> -	 * All readers access it from workqueue, which makes it possible to
> -	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
> +	 * All readers access it from poller, which makes it possible to
> +	 * flush the vhost_poll instead of synchronize_rcu. Therefore readers do
>  	 * not need to call rcu_read_lock/rcu_read_unlock: the beginning of
> -	 * work item execution acts instead of rcu_read_lock() and the end of
> -	 * work item execution acts instead of rcu_read_lock().
> +	 * vhost_poll execution acts instead of rcu_read_lock() and the end of
> +	 * vhost_poll execution acts instead of rcu_read_lock().
>  	 * Writers use virtqueue mutex. */
>  	void *private_data;
>  	/* Log write descriptors */
> @@ -110,6 +116,9 @@ struct vhost_dev {
>  	int nvqs;
>  	struct file *log_file;
>  	struct eventfd_ctx *log_ctx;
> +	spinlock_t poller_lock;
> +	struct list_head poll_list;
> +	struct task_struct *poller;
>  };
> 
>  long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
> @@ -136,9 +145,6 @@ bool vhost_enable_notify(struct vhost_vi
>  int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log,
>  		    unsigned int log_num, u64 len);
> 
> -int vhost_init(void);
> -void vhost_cleanup(void);
> -
>  #define vq_err(vq, fmt, ...) do {                                  \
>  		pr_debug(pr_fmt(fmt), ##__VA_ARGS__);       \
>  		if ((vq)->error_ctx)                               \

^ permalink raw reply

* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
From: Oleg Nesterov @ 2010-05-31 15:31 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Michael S. Tsirkin, Sridhar Samudrala, netdev, lkml,
	kvm@vger.kernel.org, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen
In-Reply-To: <4C03D0BE.1040601@kernel.org>

On 05/31, Tejun Heo wrote:
>
> On 05/31/2010 04:39 PM, Oleg Nesterov wrote:
>
> > What I can't understand is why we do have ->queue_seq and ->done_seq.
> >
> > Isn't the single "bool poll->active" enough? vhost_poll_queue() sets
> > ->active == T, vhost_poller() clears it before wake_up_all(poll->done).
>
> I might have slightly over engineered this part not knowing the
> expected workload.  ->queue_seq/->done_seq pair is to guarantee that
> flushers never get starved.

Ah, indeed.

Well, afaics we do not need 2 counters anyway, both vhost_poll_queue()
and vhost_poller() could increment the single counter and the flusher
can take bit 0 into account. But I agree 2 counters are much more clean.

> >> +static int vhost_poller(void *data)
> >> +{
> >> +	struct vhost_dev *dev = data;
> >> +	struct vhost_poll *poll;
> >> +
> >> +repeat:
> >> +	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
> >
> > I don't understand the comment... why do we need this barrier?
>
> So that either kthread_stop()'s should_stop = 1 in kthread_stop() is
> visible to kthread_should_stop() or task state is set to RUNNING.

Of course, you are right. I am really surprized I asked this question ;)

Thanks,

Oleg.


^ permalink raw reply

* Re: [PATCH] ixp4xx: Support the all multicast flag on the NPE devices.
From: Richard Cochran @ 2010-05-31 15:34 UTC (permalink / raw)
  To: Krzysztof Halasa; +Cc: netdev, linux-arm-kernel
In-Reply-To: <m3pr0c17ue.fsf@intrepid.localdomain>

On Mon, May 31, 2010 at 04:09:13PM +0200, Krzysztof Halasa wrote:
> Looks good, though I'd prefer a bit of "const" and "static" in the
> allmulti[] declaration. Would you please repost? TIA.

Okay, here it is.

Thanks,
Richard

This patch adds support for the IFF_ALLMULTI flag. Previously only the
IFF_PROMISC flag was supported.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
---
 drivers/net/arm/ixp4xx_eth.c |   11 +++++++++++
 1 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/drivers/net/arm/ixp4xx_eth.c b/drivers/net/arm/ixp4xx_eth.c
index 6be8b09..42b4361 100644
--- a/drivers/net/arm/ixp4xx_eth.c
+++ b/drivers/net/arm/ixp4xx_eth.c
@@ -739,6 +739,17 @@ static void eth_set_mcast_list(struct net_device *dev)
 	struct dev_mc_list *mclist;
 	u8 diffs[ETH_ALEN], *addr;
 	int i;
+	static const u8 allmulti[] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 };
+
+	if (dev->flags & IFF_ALLMULTI) {
+		for (i = 0; i < ETH_ALEN; i++) {
+			__raw_writel(allmulti[i], &port->regs->mcast_addr[i]);
+			__raw_writel(allmulti[i], &port->regs->mcast_mask[i]);
+		}
+		__raw_writel(DEFAULT_RX_CNTRL0 | RX_CNTRL0_ADDR_FLTR_EN,
+			&port->regs->rx_control[0]);
+		return;
+	}
 
 	if ((dev->flags & IFF_PROMISC) || netdev_mc_empty(dev)) {
 		__raw_writel(DEFAULT_RX_CNTRL0 & ~RX_CNTRL0_ADDR_FLTR_EN,
-- 
1.6.3.3


^ permalink raw reply related

* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
From: Tejun Heo @ 2010-05-31 15:38 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Michael S. Tsirkin, Sridhar Samudrala, netdev, lkml,
	kvm@vger.kernel.org, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen
In-Reply-To: <20100531153104.GA9452@redhat.com>

Hello,

On 05/31/2010 05:31 PM, Oleg Nesterov wrote:
>> I might have slightly over engineered this part not knowing the
>> expected workload.  ->queue_seq/->done_seq pair is to guarantee that
>> flushers never get starved.
> 
> Ah, indeed.
> 
> Well, afaics we do not need 2 counters anyway, both vhost_poll_queue()
> and vhost_poller() could increment the single counter and the flusher
> can take bit 0 into account. But I agree 2 counters are much more clean.

Right, we can do that too.  Cool. :-)

>>>> +static int vhost_poller(void *data)
>>>> +{
>>>> +	struct vhost_dev *dev = data;
>>>> +	struct vhost_poll *poll;
>>>> +
>>>> +repeat:
>>>> +	set_current_state(TASK_INTERRUPTIBLE);	/* mb paired w/ kthread_stop */
>>>
>>> I don't understand the comment... why do we need this barrier?
>>
>> So that either kthread_stop()'s should_stop = 1 in kthread_stop() is
>> visible to kthread_should_stop() or task state is set to RUNNING.
> 
> Of course, you are right. I am really surprized I asked this question ;)

This part is always a bit tricky tho.  Maybe it would be a good idea
to make kthread_stop() do periodic wakeups.  It's already injecting
one rather unexpected wake up into the mix anyway so there isn't much
point in avoiding multiple and it would make designing kthread loops
easier.  Or maybe what we need is something like kthread_idle() which
encapsulates the check and sleep part.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
From: Tejun Heo @ 2010-05-31 15:45 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml,
	kvm@vger.kernel.org, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen
In-Reply-To: <20100531152221.GB2987@redhat.com>

Hello,

On 05/31/2010 05:22 PM, Michael S. Tsirkin wrote:
> On Sun, May 30, 2010 at 10:24:01PM +0200, Tejun Heo wrote:
>> Replace vhost_workqueue with per-vhost kthread.  Other than callback
>> argument change from struct work_struct * to struct vhost_poll *,
>> there's no visible change to vhost_poll_*() interface.
> 
> I would prefer a substructure vhost_work, even just to make
> the code easier to review and compare to workqueue.c.

Yeap, sure.

>> The problem is that I have no idea how to test this.
> 
> It's a 3 step process:
...
> You should now be able to ping guest to host and back.
> Use something like netperf to stress the connection.
> Close qemu with kill -9 and unload module to test flushing code.

Thanks for the instruction.  I'll see if there's a way to do it
without building qemu myself on opensuse.  But please feel free to go
ahead and test it.  It might just work!  :-)

>> +	if (poll) {
>> +		__set_current_state(TASK_RUNNING);
>> +		poll->fn(poll);
>> +		smp_wmb();	/* paired with rmb in vhost_poll_flush() */
>> +		poll->done_seq = poll->queue_seq;
>> +		wake_up_all(&poll->done);
> 

> This seems to add wakeups on data path, which uses spinlocks etc.
> OTOH workqueue.c adds a special barrier entry which only does a
> wakeup when needed.  Right?

Yeah, well, if it's a really hot path sure we can avoid wake_up_all()
in most cases.  Do you think that would be necessary?

>> -void vhost_cleanup(void)
>> -{
>> -	destroy_workqueue(vhost_workqueue);
> 
> I note that destroy_workqueue does a flush, kthread_stop
> doesn't. Right? Sure we don't need to check nothing is in one of
> the lists? Maybe add a BUG_ON?

There were a bunch of flushes before kthread_stop() and they seemed to
stop and flush everything.  Aren't they enough?  We can definitely add
BUG_ON() after kthread_should_stop() check succeeds either way tho.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [RFC] netfilter: WIP: Xtables idletimer target implementation
From: Patrick McHardy @ 2010-05-31 15:59 UTC (permalink / raw)
  To: Luciano Coelho
  Cc: ext Jan Engelhardt, netfilter-devel@vger.kernel.org,
	netdev@vger.kernel.org, Timo Teras
In-Reply-To: <1275040724.24490.121.camel@chilepepper>

Luciano Coelho wrote:
> On Fri, 2010-05-28 at 10:05 +0200, ext Jan Engelhardt wrote:
>> On Friday 2010-05-28 07:25, Luciano Coelho wrote:
>>> Do you have any other suggestion on how I can associate the rules to
>>> specific interfaces?
>> -A INPUT -i foo -j do
>> -A do -j idletimer
>>
>> A little funny, but actually this would allow me to keep a timer
>> for a group of interfaces rather than just per-if.
> 
> Yes, this is what our userspace apps are doing.  I've formulated my
> question in an unclear way.  If you check the rest of the code, I create
> sysfs files under the interface's directory and use it as an attribute
> to notify the userspace when the timer has expired.
> 
> In short, I need to figure out a way to associate each rule with an
> interface in sysfs, so I can notify the userspace when the timer has
> expired.  I couldn't figure out another way to do it.  Any suggestions?

How about just using an arbitrary user-supplied name? People can
name them after interfaces, or anything else.

>>>>> +static int xt_idletimer_checkentry(const struct xt_tgchk_param *par)
>>>>> +{
>>>>> +	const struct xt_idletimer_info *info = par->targinfo;
>>>>> +	const struct ipt_entry *entryinfo = par->entryinfo;
>>>>> +	const struct ipt_ip *ip = &entryinfo->ip;
>>>> I'm not sure spying on ipt_ip is a long-term viable solution.
>>> Do you have any other suggestions on how I could get an interface
>>> associated with the rule? I thought about having the userspace pass the
>>> interface as an option to the rule (like I already do for the timeout
>>> value), but that looked ugly to me, since the interface can already be
>>> defined as part of the ruleset.
>> I have patches ready since a while that decouple ipt_ip
>> from a rule, so there is no guarantee that such will exist.
> 
> Okay, if that's the case, then I don't know how to associate the rule
> with a specific net object in the kobject tree.  Maybe I have to figure
> out a different way to notify the userspace, unless I add the target
> option I mentioned above. :/
> 
> 


^ permalink raw reply

* Re: [PATCH 1/3] vhost: replace vhost_workqueue with per-vhost kthread
From: Michael S. Tsirkin @ 2010-05-31 16:00 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Oleg Nesterov, Sridhar Samudrala, netdev, lkml,
	kvm@vger.kernel.org, Andrew Morton, Dmitri Vorobiev, Jiri Kosina,
	Thomas Gleixner, Ingo Molnar, Andi Kleen
In-Reply-To: <4C03D983.9010905@kernel.org>

On Mon, May 31, 2010 at 05:45:07PM +0200, Tejun Heo wrote:
> Hello,
> 
> On 05/31/2010 05:22 PM, Michael S. Tsirkin wrote:
> > On Sun, May 30, 2010 at 10:24:01PM +0200, Tejun Heo wrote:
> >> Replace vhost_workqueue with per-vhost kthread.  Other than callback
> >> argument change from struct work_struct * to struct vhost_poll *,
> >> there's no visible change to vhost_poll_*() interface.
> > 
> > I would prefer a substructure vhost_work, even just to make
> > the code easier to review and compare to workqueue.c.
> 
> Yeap, sure.
> 
> >> The problem is that I have no idea how to test this.
> > 
> > It's a 3 step process:
> ...
> > You should now be able to ping guest to host and back.
> > Use something like netperf to stress the connection.
> > Close qemu with kill -9 and unload module to test flushing code.
> 
> Thanks for the instruction.  I'll see if there's a way to do it
> without building qemu myself on opensuse.

My guess is no, there was no stable qemu release with vhost net support
yet.  Building it is mostly configure/make/make install,
as far as I remember you only need devel versions of X libraries,
SDL and curses installed.

>  But please feel free to go
> ahead and test it.  It might just work!  :-)
> 
> >> +	if (poll) {
> >> +		__set_current_state(TASK_RUNNING);
> >> +		poll->fn(poll);
> >> +		smp_wmb();	/* paired with rmb in vhost_poll_flush() */
> >> +		poll->done_seq = poll->queue_seq;
> >> +		wake_up_all(&poll->done);
> > 
> 
> > This seems to add wakeups on data path, which uses spinlocks etc.
> > OTOH workqueue.c adds a special barrier entry which only does a
> > wakeup when needed.  Right?
> 
> Yeah, well, if it's a really hot path sure we can avoid wake_up_all()
> in most cases.  Do you think that would be necessary?

My guess is yes. This is really very hot path in code, and we are
close to 100% CPU in some benchmarks.

> >> -void vhost_cleanup(void)
> >> -{
> >> -	destroy_workqueue(vhost_workqueue);
> > 
> > I note that destroy_workqueue does a flush, kthread_stop
> > doesn't. Right? Sure we don't need to check nothing is in one of
> > the lists? Maybe add a BUG_ON?
> 
> There were a bunch of flushes before kthread_stop() and they seemed to
> stop and flush everything.  Aren't they enough?

I was just asking, I'll need to review the code in depth.

> We can definitely add
> BUG_ON() after kthread_should_stop() check succeeds either way tho.
> 
> Thanks.
> 
> -- 
> tejun

^ permalink raw reply

* [PATCH] net: sock_queue_err_skb() dont mess with sk_forward_alloc
From: Eric Dumazet @ 2010-05-31 16:02 UTC (permalink / raw)
  To: David Miller; +Cc: anton, netdev
In-Reply-To: <20100529.002124.149815573.davem@davemloft.net>

Le samedi 29 mai 2010 à 00:21 -0700, David Miller a écrit :
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Wed, 26 May 2010 12:12:56 +0200
> 
> > [PATCH] net: fix sk_forward_alloc corruptions
> > 
> > As David found out, sock_queue_err_skb() should be called with socket
> > lock hold, or we risk sk_forward_alloc corruption, since we use non
> > atomic operations to update this field.
> > 
> > This patch adds bh_lock_sock()/bh_unlock_sock() pair to three spots.
> > (BH already disabled)
> > 
> > 1) skb_tstamp_tx() 
> > 2) Before calling ip_icmp_error(), in __udp4_lib_err() 
> > 3) Before calling ipv6_icmp_error(), in __udp6_lib_err()
> > 
> > Reported-by: Anton Blanchard <anton@samba.org>
> > Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> 
> This wasn't the direct cause of Anton's problems but is
> a serious legitimate bug.
> 
> So, applied, thanks!

Thats embarrassing...

I believe there is still a problem with sock_queue_err_skb(), sorry
Dave :(

There is also a problem in ip_recv_error(), not called with socket
locked, skb freed -> potential corruption.

If current socket is 'owned' by a user thread, then we can still corrupt
sk_forward_alloc, even if we use bh_lock_sock()

I dont think we need to have another backlog for such case, maybe we
could account for skb->truesize in sk_rmem_alloc (this is atomic), and
not account for sk_mem_charge ?

Another possibility would be to store in skb the backlog function
pointer, so that backlog is generic (normal packets and error packets
handled in same backlog queue), instead of using a protocol provided
pointer. But thats more complex and error prone.

Thanks

[PATCH] net: sock_queue_err_skb() dont mess with sk_forward_alloc

Correct sk_forward_alloc handling for error_queue would need to use a
backlog of frames that softirq handler could not deliver because socket
is owned by user thread. Or extend backlog processing to be able to
process normal and error packets.

Another possibility is to not use mem charge for error queue, this is
what I implemented in this patch.

Note: this reverts commit 29030374
(net: fix sk_forward_alloc corruptions), since we dont need to lock
socket anymore.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/sock.h |   15 +--------------
 net/core/skbuff.c  |   30 ++++++++++++++++++++++++++++--
 net/ipv4/udp.c     |    6 ++----
 net/ipv6/udp.c     |    6 ++----
 4 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index ca241ea..731150d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1524,20 +1524,7 @@ extern void sk_stop_timer(struct sock *sk, struct timer_list* timer);
 
 extern int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
 
-static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
-{
-	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
-	   number of warnings when compiling with -W --ANK
-	 */
-	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
-	    (unsigned)sk->sk_rcvbuf)
-		return -ENOMEM;
-	skb_set_owner_r(skb, sk);
-	skb_queue_tail(&sk->sk_error_queue, skb);
-	if (!sock_flag(sk, SOCK_DEAD))
-		sk->sk_data_ready(sk, skb->len);
-	return 0;
-}
+extern int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
 
 /*
  *	Recover an error report and clear atomically
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 4e7ac09..9f07e74 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2965,6 +2965,34 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 }
 EXPORT_SYMBOL_GPL(skb_cow_data);
 
+static void sock_rmem_free(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+
+	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+}
+
+/*
+ * Note: We dont mem charge error packets (no sk_forward_alloc changes)
+ */
+int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+	    (unsigned)sk->sk_rcvbuf)
+		return -ENOMEM;
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = sock_rmem_free;
+	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+
+	skb_queue_tail(&sk->sk_error_queue, skb);
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_data_ready(sk, skb->len);
+	return 0;
+}
+EXPORT_SYMBOL(sock_queue_err_skb);
+
 void skb_tstamp_tx(struct sk_buff *orig_skb,
 		struct skb_shared_hwtstamps *hwtstamps)
 {
@@ -2997,9 +3025,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
 	serr->ee.ee_errno = ENOMSG;
 	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
 
-	bh_lock_sock(sk);
 	err = sock_queue_err_skb(sk, skb);
-	bh_unlock_sock(sk);
 
 	if (err)
 		kfree_skb(skb);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 50678f9..eec4ff4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -633,11 +633,9 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
 	if (!inet->recverr) {
 		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
 			goto out;
-	} else {
-		bh_lock_sock(sk);
+	} else
 		ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
-		bh_unlock_sock(sk);
-	}
+
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
 out:
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3048f90..87be586 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -466,11 +466,9 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 	if (sk->sk_state != TCP_ESTABLISHED && !np->recverr)
 		goto out;
 
-	if (np->recverr) {
-		bh_lock_sock(sk);
+	if (np->recverr)
 		ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1));
-		bh_unlock_sock(sk);
-	}
+
 	sk->sk_err = err;
 	sk->sk_error_report(sk);
 out:



^ permalink raw reply related

* Re: [PATCH] ixp4xx: Support the all multicast flag on the NPE devices.
From: Mikael Pettersson @ 2010-05-31 16:09 UTC (permalink / raw)
  To: Richard Cochran; +Cc: Krzysztof Halasa, netdev, linux-arm-kernel
In-Reply-To: <20100531153419.GA17317@riccoc20.at.omicron.at>

Richard Cochran writes:
 > On Mon, May 31, 2010 at 04:09:13PM +0200, Krzysztof Halasa wrote:
 > > Looks good, though I'd prefer a bit of "const" and "static" in the
 > > allmulti[] declaration. Would you please repost? TIA.
 > 
 > Okay, here it is.
 > 
 > Thanks,
 > Richard
 > 
 > This patch adds support for the IFF_ALLMULTI flag. Previously only the
 > IFF_PROMISC flag was supported.
 > 
 > Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
 > ---
 >  drivers/net/arm/ixp4xx_eth.c |   11 +++++++++++
 >  1 files changed, 11 insertions(+), 0 deletions(-)
 > 
 > diff --git a/drivers/net/arm/ixp4xx_eth.c b/drivers/net/arm/ixp4xx_eth.c
 > index 6be8b09..42b4361 100644
 > --- a/drivers/net/arm/ixp4xx_eth.c
 > +++ b/drivers/net/arm/ixp4xx_eth.c
 > @@ -739,6 +739,17 @@ static void eth_set_mcast_list(struct net_device *dev)
 >  	struct dev_mc_list *mclist;
 >  	u8 diffs[ETH_ALEN], *addr;
 >  	int i;
 > +	static const u8 allmulti[] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 };
 > +
 > +	if (dev->flags & IFF_ALLMULTI) {
 > +		for (i = 0; i < ETH_ALEN; i++) {
 > +			__raw_writel(allmulti[i], &port->regs->mcast_addr[i]);
 > +			__raw_writel(allmulti[i], &port->regs->mcast_mask[i]);

Seems a bit excessive to define a lookup table for a computation
that amounts to nothing more than "i ? 0 : 1".

Something like the following would IMO be cleaner:

	if (...) {
		for (...) {
			u8 multi = i ? 0x00 : 0x01;
			__raw_writel(multi, ...);
			...
		}
	}

^ permalink raw reply

* [PATCH 0/2] netfilter: netfilter fixes
From: kaber @ 2010-05-31 16:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev

the following two patches fix two problems with the recent introduction of
a seperately allocated jumpstack for xtables rulesets:

- a memory leak when registering new tables, from Xiaotian Feng

- performance degradation caused by not using per-CPU allocations
  for the jumpstack, from Eric

Please apply or pull from:

git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-2.6.git master

Thanks!

^ permalink raw reply

* [PATCH 2/2] netfilter: xtables: stackptr should be percpu
From: kaber @ 2010-05-31 16:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1275322487-19026-1-git-send-email-kaber@trash.net>

From: Eric Dumazet <eric.dumazet@gmail.com>

commit f3c5c1bfd4 (netfilter: xtables: make ip_tables reentrant)
introduced a performance regression, because stackptr array is shared by
all cpus, adding cache line ping pongs. (16 cpus share a 64 bytes cache
line)

Fix this using alloc_percpu()

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-By: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/x_tables.h |    2 +-
 net/ipv4/netfilter/ip_tables.c     |    2 +-
 net/ipv6/netfilter/ip6_tables.c    |    2 +-
 net/netfilter/x_tables.c           |   13 +++----------
 4 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index c00cc0c..24e5d01 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -397,7 +397,7 @@ struct xt_table_info {
 	 * @stacksize jumps (number of user chains) can possibly be made.
 	 */
 	unsigned int stacksize;
-	unsigned int *stackptr;
+	unsigned int __percpu *stackptr;
 	void ***jumpstack;
 	/* ipt_entry tables: one per CPU */
 	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 63958f3..4b6c5ca 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -336,7 +336,7 @@ ipt_do_table(struct sk_buff *skb,
 	cpu        = smp_processor_id();
 	table_base = private->entries[cpu];
 	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
-	stackptr   = &private->stackptr[cpu];
+	stackptr   = per_cpu_ptr(private->stackptr, cpu);
 	origptr    = *stackptr;
 
 	e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 6f517bd..9d2d68f 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -363,7 +363,7 @@ ip6t_do_table(struct sk_buff *skb,
 	cpu        = smp_processor_id();
 	table_base = private->entries[cpu];
 	jumpstack  = (struct ip6t_entry **)private->jumpstack[cpu];
-	stackptr   = &private->stackptr[cpu];
+	stackptr   = per_cpu_ptr(private->stackptr, cpu);
 	origptr    = *stackptr;
 
 	e = get_entry(table_base, private->hook_entry[hook]);
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 47b1e79..e34622f 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -699,10 +699,8 @@ void xt_free_table_info(struct xt_table_info *info)
 		vfree(info->jumpstack);
 	else
 		kfree(info->jumpstack);
-	if (sizeof(unsigned int) * nr_cpu_ids > PAGE_SIZE)
-		vfree(info->stackptr);
-	else
-		kfree(info->stackptr);
+
+	free_percpu(info->stackptr);
 
 	kfree(info);
 }
@@ -753,14 +751,9 @@ static int xt_jumpstack_alloc(struct xt_table_info *i)
 	unsigned int size;
 	int cpu;
 
-	size = sizeof(unsigned int) * nr_cpu_ids;
-	if (size > PAGE_SIZE)
-		i->stackptr = vmalloc(size);
-	else
-		i->stackptr = kmalloc(size, GFP_KERNEL);
+	i->stackptr = alloc_percpu(unsigned int);
 	if (i->stackptr == NULL)
 		return -ENOMEM;
-	memset(i->stackptr, 0, size);
 
 	size = sizeof(void **) * nr_cpu_ids;
 	if (size > PAGE_SIZE)
-- 
1.7.0.4


^ permalink raw reply related

* [PATCH 1/2] netfilter: don't xt_jumpstack_alloc twice in xt_register_table
From: kaber @ 2010-05-31 16:14 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1275322487-19026-1-git-send-email-kaber@trash.net>

From: Xiaotian Feng <dfeng@redhat.com>

In xt_register_table, xt_jumpstack_alloc is called first, later
xt_replace_table is used. But in xt_replace_table, xt_jumpstack_alloc
will be used again. Then the memory allocated by previous xt_jumpstack_alloc
will be leaked. We can simply remove the previous xt_jumpstack_alloc because
there aren't any users of newinfo between xt_jumpstack_alloc and
xt_replace_table.

Signed-off-by: Xiaotian Feng <dfeng@redhat.com>
Cc: Patrick McHardy <kaber@trash.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jan Engelhardt <jengelh@medozas.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Acked-By: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/x_tables.c |    4 ----
 1 files changed, 0 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 445de70..47b1e79 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -844,10 +844,6 @@ struct xt_table *xt_register_table(struct net *net,
 	struct xt_table_info *private;
 	struct xt_table *t, *table;
 
-	ret = xt_jumpstack_alloc(newinfo);
-	if (ret < 0)
-		return ERR_PTR(ret);
-
 	/* Don't add one object to multiple lists. */
 	table = kmemdup(input_table, sizeof(struct xt_table), GFP_KERNEL);
 	if (!table) {
-- 
1.7.0.4


^ permalink raw reply related

* Re: [Patch] fix packet loss and massive ping spikes with PPP multi-link
From: Ben McKeegan @ 2010-05-31 16:20 UTC (permalink / raw)
  To: Paul Mackerras
  Cc: netdev, linux-ppp, Alan Cox, Alexander E. Patrakov,
	Richard Hartmann, linux-kernel
In-Reply-To: <20100529021624.GA2538@brick.ozlabs.ibm.com>

Paul Mackerras wrote:

>> I needed to do something similar a while back and I took a very
>> different approach, which I think is more flexible.   Rather than
>> implement a new round-robin scheduler I simply introduced a target
>> minimum fragment size into the fragment size calculation, as a per
>> bundle parameter that can be configured via a new ioctl.  This
>> modifies the algorithm so that it tries to limit the number of
>> fragments such that each fragment is at least the minimum size.  If
>> the minimum size is greater than the packet size it will not be
>> fragmented all but will instead just get sent down the next
>> available channel.
>>
>> A pppd plugin generates the ioctl call allowing this to be tweaked
>> per connection.  It is more flexible in that you can still have the
>> larger packets fragmented if you wish.
> 
> I like this a lot better than the other proposed patch.  It adds less
> code because it uses the fact that ppp_mp_explode() already has a
> round-robin capability using the ppp->nxchan field, plus it provides a
> way to control it per bundle via pppd.
> 
> If you fix up the indentation issues (2-space indent in some of the
> added code -- if you're using emacs, set c-basic-offset to 8), I'll
> ack it and hopefully DaveM will pick it up.

Okay, I'll fix it up when I'm back at work Tuesday and submit (today is 
a UK public holiday) and also dig out and fix up the userspace code to 
go with it.

Ben.


^ permalink raw reply

* Re: [PATCH] ixp4xx: Support the all multicast flag on the NPE devices.
From: Krzysztof Halasa @ 2010-05-31 16:23 UTC (permalink / raw)
  To: Mikael Pettersson; +Cc: Richard Cochran, netdev, linux-arm-kernel
In-Reply-To: <19459.57149.428000.694558@pilspetsen.it.uu.se>

Mikael Pettersson <mikpe@it.uu.se> writes:

>  > +	static const u8 allmulti[] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 };
>  > +
>  > +	if (dev->flags & IFF_ALLMULTI) {
>  > +		for (i = 0; i < ETH_ALEN; i++) {
>  > +			__raw_writel(allmulti[i], &port->regs->mcast_addr[i]);
>  > +			__raw_writel(allmulti[i], &port->regs->mcast_mask[i]);
>
> Seems a bit excessive to define a lookup table for a computation
> that amounts to nothing more than "i ? 0 : 1".
>
> Something like the following would IMO be cleaner:
>
> 	if (...) {
> 		for (...) {
> 			u8 multi = i ? 0x00 : 0x01;
> 			__raw_writel(multi, ...);
> 			...
> 		}
> 	}

Well... cleaner = easier to understand?
I don't think so, the array is clearly a MAC address (mask) and the
"i ? 0x00 : 0x01" (which one could simply write as "!i") requires some
additional attention.

It's a slow "admin" path so an unmeasurable speedup doesn't mean
anything.
-- 
Krzysztof Halasa

^ permalink raw reply

* Re: [PATCH] ixp4xx: Support the all multicast flag on the NPE devices.
From: Krzysztof Halasa @ 2010-05-31 16:24 UTC (permalink / raw)
  To: Richard Cochran; +Cc: netdev, linux-arm-kernel
In-Reply-To: <20100531153419.GA17317@riccoc20.at.omicron.at>

Richard Cochran <richardcochran@gmail.com> writes:

> This patch adds support for the IFF_ALLMULTI flag. Previously only the
> IFF_PROMISC flag was supported.
>
> Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
> ---
>  drivers/net/arm/ixp4xx_eth.c |   11 +++++++++++
>  1 files changed, 11 insertions(+), 0 deletions(-)
>
> diff --git a/drivers/net/arm/ixp4xx_eth.c b/drivers/net/arm/ixp4xx_eth.c
> index 6be8b09..42b4361 100644
> --- a/drivers/net/arm/ixp4xx_eth.c
> +++ b/drivers/net/arm/ixp4xx_eth.c
> @@ -739,6 +739,17 @@ static void eth_set_mcast_list(struct net_device *dev)
>  	struct dev_mc_list *mclist;
>  	u8 diffs[ETH_ALEN], *addr;
>  	int i;
> +	static const u8 allmulti[] = { 0x01, 0x00, 0x00, 0x00, 0x00, 0x00 };
> +
> +	if (dev->flags & IFF_ALLMULTI) {
> +		for (i = 0; i < ETH_ALEN; i++) {
> +			__raw_writel(allmulti[i], &port->regs->mcast_addr[i]);
> +			__raw_writel(allmulti[i], &port->regs->mcast_mask[i]);
> +		}
> +		__raw_writel(DEFAULT_RX_CNTRL0 | RX_CNTRL0_ADDR_FLTR_EN,
> +			&port->regs->rx_control[0]);
> +		return;
> +	}
>  
>  	if ((dev->flags & IFF_PROMISC) || netdev_mc_empty(dev)) {
>  		__raw_writel(DEFAULT_RX_CNTRL0 & ~RX_CNTRL0_ADDR_FLTR_EN,

Acked-By: Krzysztof Halasa <khc@pm.waw.pl>
-- 
Krzysztof Halasa

^ permalink raw reply

* [PATCH] can: mpc5xxx_can.c: Fix build failure
From: Anatolij Gustschin @ 2010-05-31 16:31 UTC (permalink / raw)
  To: netdev, socketcan-core
  Cc: David S. Miller, Anatolij Gustschin, Wolfgang Grandegger,
	Grant Likely

Fixes build error caused by the OF device_node pointer
being moved into struct device.

Signed-off-by: Anatolij Gustschin <agust@denx.de>
Cc: Wolfgang Grandegger <wg@grandegger.com>
Cc: Grant Likely <grant.likely@secretlab.ca>
---
 drivers/net/can/mscan/mpc5xxx_can.c |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/can/mscan/mpc5xxx_can.c b/drivers/net/can/mscan/mpc5xxx_can.c
index 8af8442..f48deaf 100644
--- a/drivers/net/can/mscan/mpc5xxx_can.c
+++ b/drivers/net/can/mscan/mpc5xxx_can.c
@@ -152,7 +152,7 @@ static u32 __devinit mpc512x_can_get_clock(struct of_device *ofdev,
 	}
 
 	/* Determine the MSCAN device index from the physical address */
-	pval = of_get_property(ofdev->node, "reg", &plen);
+	pval = of_get_property(ofdev->dev.of_node, "reg", &plen);
 	BUG_ON(!pval || plen < sizeof(*pval));
 	clockidx = (*pval & 0x80) ? 1 : 0;
 	if (*pval & 0x2000)
@@ -168,11 +168,11 @@ static u32 __devinit mpc512x_can_get_clock(struct of_device *ofdev,
 	 */
 	if (clock_name && !strcmp(clock_name, "ip")) {
 		*mscan_clksrc = MSCAN_CLKSRC_IPS;
-		freq = mpc5xxx_get_bus_frequency(ofdev->node);
+		freq = mpc5xxx_get_bus_frequency(ofdev->dev.of_node);
 	} else {
 		*mscan_clksrc = MSCAN_CLKSRC_BUS;
 
-		pval = of_get_property(ofdev->node,
+		pval = of_get_property(ofdev->dev.of_node,
 				       "fsl,mscan-clock-divider", &plen);
 		if (pval && plen == sizeof(*pval))
 			clockdiv = *pval;
@@ -251,7 +251,7 @@ static int __devinit mpc5xxx_can_probe(struct of_device *ofdev,
 				       const struct of_device_id *id)
 {
 	struct mpc5xxx_can_data *data = (struct mpc5xxx_can_data *)id->data;
-	struct device_node *np = ofdev->node;
+	struct device_node *np = ofdev->dev.of_node;
 	struct net_device *dev;
 	struct mscan_priv *priv;
 	void __iomem *base;
-- 
1.7.0.4


^ permalink raw reply related

* Re: HFSC classes going out of bounds, regression in recent kernels?
From: Michal Soltys @ 2010-05-31 17:53 UTC (permalink / raw)
  To: Denys Fedorysychenko; +Cc: Patrick McHardy, netdev, Jeff Garzik, Eric Dumazet
In-Reply-To: <201005200434.28433.nuclearcat@nuclearcat.com>

On 10-05-20 03:34, Denys Fedorysychenko wrote:
> I am trying to track down HFSC bug.
> It seems, most probably it is related to PSCHED_SHIFT at the end, i am doing
> testing again. I will try to do complete clean build, maybe last time some .o
> was left or i forgot to do make clean.
> 
> SM_SHIFT in HFSC is calculated as 30 - PSCHED_SHIFT, and it is shifted too
> much (or not enough) with new changes (ISM_SHIFT seems wrong too). So it is
> most probably overflow or not enough resolution.
> I will try to change PSCHED_SHIFT back to confirm that, and at least i found
> way to reproduce bug.
> 
> Additionally in sch_hfsc.c i notice mentioned that PSCHED_SHIFT 10 is tick per
> 1024us, but i try to calculate their table (in source comments), it doesn't
> fit with my calculations based on 1024us/tick, but fits well with 1024
> nanosecond.
> 
> Is it was 1024ns per tick and now 64ns per tick? Or it is microseconds(us) ?
> --

In the old table (when PSCHED_SHIFT was 10) in the hfsc file, the requirements 
to keep 4 decimal digits were 20 and 18 for SM_ and ISM_ respectively. 
For the reference:

 *  bits/sec      100Kbps     1Mbps     10Mbps     100Mbps    1Gbps
 *  ------------+-------------------------------------------------------
 *  bytes/1.024us 12.8e-3    128e-3     1280e-3    12800e-3   128000e-3
 *  1.024us/byte  78.125     7.8125     0.78125    0.078125   0.0078125
 *
 * So, for PSCHED_SHIFT 10 we need: SM_SHIFT 20, ISM_SHIFT 18.


Considering the table - and if I didn't miss anything - they were swapped.

psticks/byte in its "corner" case (0.0078125) requires 1e6, which corrsponds to 
ISM_SHIFT == 20.

Similary, corner case for bytes/pstick (0.0128) requires 1e5, where 
SM_SHIFT == 17 would be sufficient.

The above assuming pstick is 1.024us

Currently, with PSCHED_SHIFT == 6 (so pstick is 64ns), the define macros 
actually hit the spot, giving following values:

bytes/pstick: SM_SHIFT  24 (0.0008 @ 100kbit)
psticks/byte: ISM_SHIFT 14 (0.125  @ 1gbit)

But in generic case - the macros will not yield appropriate results - e.g. 
for PSCHED_SHIFT == 10, SM_SHIFT would be 20 and ISM_SHIFT would be 18 (not 
enough for 4 decimal digits).

On a related subject - it would probably be desirable to consider smaller 
speeds than 100kbit, as adsl links with mere 128kbit upstream total are 
nothing out of ordinary. If we considered 10kbit as the reference value, we 
would need SM_SHIFT 27 to cover 0.00008 @ 10kbit. Not sure if 10gbit is 
desirable too - ISM_SHIFT would need 17 then. All assuming that 4 decimal 
digits is what we really need.


^ permalink raw reply

* [PATCH] net/ipv4/tcp_input.c: fix compilation breakage when FASTRETRANS_DEBUG > 1
From: Joe Perches @ 2010-05-31 18:20 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Eric Dumazet

Commit: c720c7e8383aff1cb219bddf474ed89d850336e3 missed these.

Signed-off-by: Joe Perches <joe@perches.com>
---
 net/ipv4/tcp_input.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e6dafc..548d575 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2639,7 +2639,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
 	if (sk->sk_family == AF_INET) {
 		printk(KERN_DEBUG "Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
 		       msg,
-		       &inet->daddr, ntohs(inet->dport),
+		       &inet->inet_daddr, ntohs(inet->inet_dport),
 		       tp->snd_cwnd, tcp_left_out(tp),
 		       tp->snd_ssthresh, tp->prior_ssthresh,
 		       tp->packets_out);
@@ -2649,7 +2649,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
 		struct ipv6_pinfo *np = inet6_sk(sk);
 		printk(KERN_DEBUG "Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
 		       msg,
-		       &np->daddr, ntohs(inet->dport),
+		       &np->daddr, ntohs(inet->inet_dport),
 		       tp->snd_cwnd, tcp_left_out(tp),
 		       tp->snd_ssthresh, tp->prior_ssthresh,
 		       tp->packets_out);



^ permalink raw reply related

* Re: [PATCH] can: mpc5xxx_can.c: Fix build failure
From: Wolfgang Grandegger @ 2010-05-31 18:27 UTC (permalink / raw)
  To: Anatolij Gustschin; +Cc: netdev, socketcan-core, David S. Miller, Grant Likely
In-Reply-To: <1275323499-15543-1-git-send-email-agust@denx.de>

Hi Anatolij,

On 05/31/2010 06:31 PM, Anatolij Gustschin wrote:
> Fixes build error caused by the OF device_node pointer
> being moved into struct device.
> 
> Signed-off-by: Anatolij Gustschin <agust@denx.de>
> Cc: Wolfgang Grandegger <wg@grandegger.com>
> Cc: Grant Likely <grant.likely@secretlab.ca>
> ---
>  drivers/net/can/mscan/mpc5xxx_can.c |    8 ++++----
>  1 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/can/mscan/mpc5xxx_can.c b/drivers/net/can/mscan/mpc5xxx_can.c
> index 8af8442..f48deaf 100644
> --- a/drivers/net/can/mscan/mpc5xxx_can.c
> +++ b/drivers/net/can/mscan/mpc5xxx_can.c
> @@ -152,7 +152,7 @@ static u32 __devinit mpc512x_can_get_clock(struct of_device *ofdev,
>  	}
>  
>  	/* Determine the MSCAN device index from the physical address */
> -	pval = of_get_property(ofdev->node, "reg", &plen);
> +	pval = of_get_property(ofdev->dev.of_node, "reg", &plen);
>  	BUG_ON(!pval || plen < sizeof(*pval));
>  	clockidx = (*pval & 0x80) ? 1 : 0;
>  	if (*pval & 0x2000)
> @@ -168,11 +168,11 @@ static u32 __devinit mpc512x_can_get_clock(struct of_device *ofdev,
>  	 */
>  	if (clock_name && !strcmp(clock_name, "ip")) {
>  		*mscan_clksrc = MSCAN_CLKSRC_IPS;
> -		freq = mpc5xxx_get_bus_frequency(ofdev->node);
> +		freq = mpc5xxx_get_bus_frequency(ofdev->dev.of_node);
>  	} else {
>  		*mscan_clksrc = MSCAN_CLKSRC_BUS;
>  
> -		pval = of_get_property(ofdev->node,
> +		pval = of_get_property(ofdev->dev.of_node,
>  				       "fsl,mscan-clock-divider", &plen);
>  		if (pval && plen == sizeof(*pval))
>  			clockdiv = *pval;
> @@ -251,7 +251,7 @@ static int __devinit mpc5xxx_can_probe(struct of_device *ofdev,
>  				       const struct of_device_id *id)
>  {
>  	struct mpc5xxx_can_data *data = (struct mpc5xxx_can_data *)id->data;
> -	struct device_node *np = ofdev->node;
> +	struct device_node *np = ofdev->dev.of_node;
>  	struct net_device *dev;
>  	struct mscan_priv *priv;
>  	void __iomem *base;

The patch seems incomplete. I get at least one more warning (because
I'm compiling for 52xx, I guess):

  CC      drivers/net/can/mscan/mpc5xxx_can.o
drivers/net/can/mscan/mpc5xxx_can.c: In function 'mpc52xx_can_get_clock':
drivers/net/can/mscan/mpc5xxx_can.c:76: error: 'struct of_device' has no member named 'node'

Should I send v2? Thanks for fixing that. I also checked the
sja1000_of_platform driver,  which was converted correctly.

Wolfgang.

^ permalink raw reply

* RE: [PATCH 1/2] Export firmware assigned labels of network devices to sysfs
From: Narendra_K @ 2010-05-31 18:42 UTC (permalink / raw)
  To: michael
  Cc: netdev, linux-hotplug, linux-pci, Matt_Domsch, Jordan_Hargrave,
	Charles_Rose, Vijay_Nijhawan
In-Reply-To: <1275314876.21246.29.camel@concordia>

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of Michael Ellerman
> Sent: Monday, May 31, 2010 7:38 PM
> To: K, Narendra
> Cc: netdev@vger.kernel.org; linux-hotplug@vger.kernel.org; linux-
> pci@vger.kernel.org; Domsch, Matt; Hargrave, Jordan; Rose, Charles;
> Nijhawan, Vijay
> Subject: Re: [PATCH 1/2] Export firmware assigned labels of network
> devices to sysfs
> 
> On Fri, 2010-05-28 at 06:55 -0500, K, Narendra wrote:
> > Hello,
> >
> > This patch is in continuation of an earlier discussion -
> >
> > http://marc.info/?l=linux-netdev&m=126712978908314&w=3
> >
> > The patch has the following review suggestions from the community
> > incorporated -
> >
> > 1. The name of the attribute has been changed from "smbiosname" to
> > "label" to hide the implementation details.
> > 2. The implementation has been moved to a new file
> > drivers/pci/pci-label.c
> 
> You've changed the name, which is good, but the implementation is still
> 100% dependant on ACPI or DMI AFAICS.
> 
> So it seems to me until it's supported on another platform it may as
> well go in pci-acpi.c, 

You mean the ACPI _DSM ? If yes, it is expected to become a standard very soon. I assume you meant non-Dell platforms by another platform.

> or at least only be compiled if (ACPI || DMI).
> Otherwise it's just dead code.
> 

Is DMI not implemented widely today ? Please correct me if I am missing something here.

With regards,
Narendra K




^ permalink raw reply

* Re: [PATCH] can: mpc5xxx_can.c: Fix build failure
From: Anatolij Gustschin @ 2010-05-31 18:54 UTC (permalink / raw)
  To: Wolfgang Grandegger; +Cc: netdev, socketcan-core, David S. Miller, Grant Likely
In-Reply-To: <4C03FF75.3010208@grandegger.com>

Hi Wolfgang,

On Mon, 31 May 2010 20:27:01 +0200
Wolfgang Grandegger <wg@grandegger.com> wrote:
...
> > @@ -251,7 +251,7 @@ static int __devinit mpc5xxx_can_probe(struct of_device *ofdev,
> >  				       const struct of_device_id *id)
> >  {
> >  	struct mpc5xxx_can_data *data = (struct mpc5xxx_can_data *)id->data;
> > -	struct device_node *np = ofdev->node;
> > +	struct device_node *np = ofdev->dev.of_node;
> >  	struct net_device *dev;
> >  	struct mscan_priv *priv;
> >  	void __iomem *base;
> 
> The patch seems incomplete. I get at least one more warning (because
> I'm compiling for 52xx, I guess):

Yes, this is true. Unfortunately I didn't compile for 5200 and the issue
didn't show up.

>   CC      drivers/net/can/mscan/mpc5xxx_can.o
> drivers/net/can/mscan/mpc5xxx_can.c: In function 'mpc52xx_can_get_clock':
> drivers/net/can/mscan/mpc5xxx_can.c:76: error: 'struct of_device' has no member named 'node'
> 
> Should I send v2? Thanks for fixing that. I also checked the
> sja1000_of_platform driver,  which was converted correctly.

I will send v2 patch shortly. Thanks for reporting!

Anatolij

^ permalink raw reply

* [PATCH v2] can: mpc5xxx_can.c: Fix build failure
From: Anatolij Gustschin @ 2010-05-31 18:56 UTC (permalink / raw)
  To: netdev, socketcan-core
  Cc: David S. Miller, Anatolij Gustschin, Wolfgang Grandegger,
	Grant Likely
In-Reply-To: <1275323499-15543-1-git-send-email-agust@denx.de>

Fixes build error caused by the OF device_node pointer
being moved into struct device.

Signed-off-by: Anatolij Gustschin <agust@denx.de>
Cc: Wolfgang Grandegger <wg@grandegger.com>
Cc: Grant Likely <grant.likely@secretlab.ca>
---
v1 -> v2:
 - also fix building for mpc5200 which also didn't work
   as reported by Wolfgang.

 drivers/net/can/mscan/mpc5xxx_can.c |   10 +++++-----
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/net/can/mscan/mpc5xxx_can.c b/drivers/net/can/mscan/mpc5xxx_can.c
index 8af8442..af75393 100644
--- a/drivers/net/can/mscan/mpc5xxx_can.c
+++ b/drivers/net/can/mscan/mpc5xxx_can.c
@@ -73,7 +73,7 @@ static u32 __devinit mpc52xx_can_get_clock(struct of_device *ofdev,
 	else
 		*mscan_clksrc = MSCAN_CLKSRC_XTAL;
 
-	freq = mpc5xxx_get_bus_frequency(ofdev->node);
+	freq = mpc5xxx_get_bus_frequency(ofdev->dev.of_node);
 	if (!freq)
 		return 0;
 
@@ -152,7 +152,7 @@ static u32 __devinit mpc512x_can_get_clock(struct of_device *ofdev,
 	}
 
 	/* Determine the MSCAN device index from the physical address */
-	pval = of_get_property(ofdev->node, "reg", &plen);
+	pval = of_get_property(ofdev->dev.of_node, "reg", &plen);
 	BUG_ON(!pval || plen < sizeof(*pval));
 	clockidx = (*pval & 0x80) ? 1 : 0;
 	if (*pval & 0x2000)
@@ -168,11 +168,11 @@ static u32 __devinit mpc512x_can_get_clock(struct of_device *ofdev,
 	 */
 	if (clock_name && !strcmp(clock_name, "ip")) {
 		*mscan_clksrc = MSCAN_CLKSRC_IPS;
-		freq = mpc5xxx_get_bus_frequency(ofdev->node);
+		freq = mpc5xxx_get_bus_frequency(ofdev->dev.of_node);
 	} else {
 		*mscan_clksrc = MSCAN_CLKSRC_BUS;
 
-		pval = of_get_property(ofdev->node,
+		pval = of_get_property(ofdev->dev.of_node,
 				       "fsl,mscan-clock-divider", &plen);
 		if (pval && plen == sizeof(*pval))
 			clockdiv = *pval;
@@ -251,7 +251,7 @@ static int __devinit mpc5xxx_can_probe(struct of_device *ofdev,
 				       const struct of_device_id *id)
 {
 	struct mpc5xxx_can_data *data = (struct mpc5xxx_can_data *)id->data;
-	struct device_node *np = ofdev->node;
+	struct device_node *np = ofdev->dev.of_node;
 	struct net_device *dev;
 	struct mscan_priv *priv;
 	void __iomem *base;
-- 
1.7.0.4


^ permalink raw reply related

* Re: [v5 Patch 1/3] netpoll: add generic support for bridge and bonding devices
From: Flavio Leitner @ 2010-05-31 19:08 UTC (permalink / raw)
  To: Cong Wang
  Cc: linux-kernel, Matt Mackall, netdev, bridge, Andy Gospodarek,
	Neil Horman, Jeff Moyer, Stephen Hemminger, bonding-devel,
	Jay Vosburgh, David Miller
In-Reply-To: <4C034FA4.5000401@redhat.com>

On Mon, May 31, 2010 at 01:56:52PM +0800, Cong Wang wrote:
> Hi, Flavio,
> 
> Please use the attached patch instead, try to see if it solves
> all your problems.

I tried and it hangs. No backtraces this time.
The bond_change_active_slave() prints before NETDEV_BONDING_FAILOVER
notification, so I think it won't work. 

Please, correct if I'm wrong, but when a failover happens with your 
patch applied, the netconsole would be disabled forever even with
another healthy slave, right?

fbl


> 
> Thanks a lot!
> 

> diff --git a/drivers/net/netconsole.c b/drivers/net/netconsole.c
> index ca142c4..2d1d594 100644
> --- a/drivers/net/netconsole.c
> +++ b/drivers/net/netconsole.c
> @@ -666,7 +666,8 @@ static int netconsole_netdev_event(struct notifier_block *this,
>  	struct net_device *dev = ptr;
>  
>  	if (!(event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER ||
> -	      event == NETDEV_BONDING_DESLAVE || event == NETDEV_GOING_DOWN))
> +	      event == NETDEV_BONDING_DESLAVE || event == NETDEV_GOING_DOWN ||
> +	      event == NETDEV_BONDING_FAILOVER))
>  		goto done;
>  
>  	spin_lock_irqsave(&target_list_lock, flags);
> @@ -682,6 +683,7 @@ static int netconsole_netdev_event(struct notifier_block *this,
>  				/* Fall through */
>  			case NETDEV_GOING_DOWN:
>  			case NETDEV_BONDING_DESLAVE:
> +			case NETDEV_BONDING_FAILOVER:
>  				nt->enabled = 0;
>  				break;
>  			}


-- 
Flavio

^ permalink raw reply

* [PATCH 1/2] r6040: implement phylib
From: Florian Fainelli @ 2010-05-31 19:18 UTC (permalink / raw)
  To: netdev, David Miller

This patch adds support for using phylib and adds the required mdiobus driver
stubs. This allows for less code to be present in the driver and removes
the PHY status specific timer which is now handled by phylib directly.

Signed-off-by: Florian Fainelli <florian@openwrt.org>
---
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 2decc59..fe113d0 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -1659,6 +1659,7 @@ config R6040
 	depends on NET_PCI && PCI
 	select CRC32
 	select MII
+	select PHYLIB
 	help
 	  This is a driver for the R6040 Fast Ethernet MACs found in the
 	  the RDC R-321x System-on-chips.
diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c
index 9a251ac..6878511 100644
--- a/drivers/net/r6040.c
+++ b/drivers/net/r6040.c
@@ -44,6 +44,7 @@
 #include <linux/io.h>
 #include <linux/irq.h>
 #include <linux/uaccess.h>
+#include <linux/phy.h>
 
 #include <asm/processor.h>
 
@@ -179,7 +180,6 @@ struct r6040_descriptor {
 
 struct r6040_private {
 	spinlock_t lock;		/* driver lock */
-	struct timer_list timer;
 	struct pci_dev *pdev;
 	struct r6040_descriptor *rx_insert_ptr;
 	struct r6040_descriptor *rx_remove_ptr;
@@ -189,13 +189,15 @@ struct r6040_private {
 	struct r6040_descriptor *tx_ring;
 	dma_addr_t rx_ring_dma;
 	dma_addr_t tx_ring_dma;
-	u16	tx_free_desc, phy_addr, phy_mode;
+	u16	tx_free_desc, phy_addr;
 	u16	mcr0, mcr1;
-	u16	switch_sig;
 	struct net_device *dev;
-	struct mii_if_info mii_if;
+	struct mii_bus *mii_bus;
 	struct napi_struct napi;
 	void __iomem *base;
+	struct phy_device *phydev;
+	int old_link;
+	int old_duplex;
 };
 
 static char version[] __devinitdata = KERN_INFO DRV_NAME
@@ -238,20 +240,30 @@ static void r6040_phy_write(void __iomem *ioaddr, int phy_addr, int reg, u16 val
 	}
 }
 
-static int r6040_mdio_read(struct net_device *dev, int mii_id, int reg)
+static int r6040_mdiobus_read(struct mii_bus *bus, int phy_addr, int reg)
 {
+	struct net_device *dev = bus->priv;
 	struct r6040_private *lp = netdev_priv(dev);
 	void __iomem *ioaddr = lp->base;
 
-	return (r6040_phy_read(ioaddr, lp->phy_addr, reg));
+	return r6040_phy_read(ioaddr, phy_addr, reg);
 }
 
-static void r6040_mdio_write(struct net_device *dev, int mii_id, int reg, int val)
+static int r6040_mdiobus_write(struct mii_bus *bus, int phy_addr,
+						int reg, u16 value)
 {
+	struct net_device *dev = bus->priv;
 	struct r6040_private *lp = netdev_priv(dev);
 	void __iomem *ioaddr = lp->base;
 
-	r6040_phy_write(ioaddr, lp->phy_addr, reg, val);
+	r6040_phy_write(ioaddr, phy_addr, reg, value);
+
+	return 0;
+}
+
+static int r6040_mdiobus_reset(struct mii_bus *bus)
+{
+	return 0;
 }
 
 static void r6040_free_txbufs(struct net_device *dev)
@@ -408,10 +420,9 @@ static void r6040_tx_timeout(struct net_device *dev)
 	void __iomem *ioaddr = priv->base;
 
 	netdev_warn(dev, "transmit timed out, int enable %4.4x "
-		"status %4.4x, PHY status %4.4x\n",
+		"status %4.4x\n",
 		ioread16(ioaddr + MIER),
-		ioread16(ioaddr + MISR),
-		r6040_mdio_read(dev, priv->mii_if.phy_id, MII_BMSR));
+		ioread16(ioaddr + MISR));
 
 	dev->stats.tx_errors++;
 
@@ -463,9 +474,6 @@ static int r6040_close(struct net_device *dev)
 	struct r6040_private *lp = netdev_priv(dev);
 	struct pci_dev *pdev = lp->pdev;
 
-	/* deleted timer */
-	del_timer_sync(&lp->timer);
-
 	spin_lock_irq(&lp->lock);
 	napi_disable(&lp->napi);
 	netif_stop_queue(dev);
@@ -495,64 +503,14 @@ static int r6040_close(struct net_device *dev)
 	return 0;
 }
 
-/* Status of PHY CHIP */
-static int r6040_phy_mode_chk(struct net_device *dev)
-{
-	struct r6040_private *lp = netdev_priv(dev);
-	void __iomem *ioaddr = lp->base;
-	int phy_dat;
-
-	/* PHY Link Status Check */
-	phy_dat = r6040_phy_read(ioaddr, lp->phy_addr, 1);
-	if (!(phy_dat & 0x4))
-		phy_dat = 0x8000;	/* Link Failed, full duplex */
-
-	/* PHY Chip Auto-Negotiation Status */
-	phy_dat = r6040_phy_read(ioaddr, lp->phy_addr, 1);
-	if (phy_dat & 0x0020) {
-		/* Auto Negotiation Mode */
-		phy_dat = r6040_phy_read(ioaddr, lp->phy_addr, 5);
-		phy_dat &= r6040_phy_read(ioaddr, lp->phy_addr, 4);
-		if (phy_dat & 0x140)
-			/* Force full duplex */
-			phy_dat = 0x8000;
-		else
-			phy_dat = 0;
-	} else {
-		/* Force Mode */
-		phy_dat = r6040_phy_read(ioaddr, lp->phy_addr, 0);
-		if (phy_dat & 0x100)
-			phy_dat = 0x8000;
-		else
-			phy_dat = 0x0000;
-	}
-
-	return phy_dat;
-};
-
-static void r6040_set_carrier(struct mii_if_info *mii)
-{
-	if (r6040_phy_mode_chk(mii->dev)) {
-		/* autoneg is off: Link is always assumed to be up */
-		if (!netif_carrier_ok(mii->dev))
-			netif_carrier_on(mii->dev);
-	} else
-		r6040_phy_mode_chk(mii->dev);
-}
-
 static int r6040_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 {
 	struct r6040_private *lp = netdev_priv(dev);
-	struct mii_ioctl_data *data = if_mii(rq);
-	int rc;
 
-	if (!netif_running(dev))
+	if (!lp->phydev)
 		return -EINVAL;
-	spin_lock_irq(&lp->lock);
-	rc = generic_mii_ioctl(&lp->mii_if, data, cmd, NULL);
-	spin_unlock_irq(&lp->lock);
-	r6040_set_carrier(&lp->mii_if);
-	return rc;
+
+	return phy_mii_ioctl(lp->phydev, if_mii(rq), cmd);
 }
 
 static int r6040_rx(struct net_device *dev, int limit)
@@ -751,26 +709,6 @@ static int r6040_up(struct net_device *dev)
 	if (ret)
 		return ret;
 
-	/* Read the PHY ID */
-	lp->switch_sig = r6040_phy_read(ioaddr, 0, 2);
-
-	if (lp->switch_sig  == ICPLUS_PHY_ID) {
-		r6040_phy_write(ioaddr, 29, 31, 0x175C); /* Enable registers */
-		lp->phy_mode = 0x8000;
-	} else {
-		/* PHY Mode Check */
-		r6040_phy_write(ioaddr, lp->phy_addr, 4, PHY_CAP);
-		r6040_phy_write(ioaddr, lp->phy_addr, 0, PHY_MODE);
-
-		if (PHY_MODE == 0x3100)
-			lp->phy_mode = r6040_phy_mode_chk(dev);
-		else
-			lp->phy_mode = (PHY_MODE & 0x0100) ? 0x8000:0x0;
-	}
-
-	/* Set duplex mode */
-	lp->mcr0 |= lp->phy_mode;
-
 	/* improve performance (by RDC guys) */
 	r6040_phy_write(ioaddr, 30, 17, (r6040_phy_read(ioaddr, 30, 17) | 0x4000));
 	r6040_phy_write(ioaddr, 30, 17, ~((~r6040_phy_read(ioaddr, 30, 17)) | 0x2000));
@@ -783,35 +721,6 @@ static int r6040_up(struct net_device *dev)
 	return 0;
 }
 
-/*
-  A periodic timer routine
-	Polling PHY Chip Link Status
-*/
-static void r6040_timer(unsigned long data)
-{
-	struct net_device *dev = (struct net_device *)data;
-	struct r6040_private *lp = netdev_priv(dev);
-	void __iomem *ioaddr = lp->base;
-	u16 phy_mode;
-
-	/* Polling PHY Chip Status */
-	if (PHY_MODE == 0x3100)
-		phy_mode = r6040_phy_mode_chk(dev);
-	else
-		phy_mode = (PHY_MODE & 0x0100) ? 0x8000:0x0;
-
-	if (phy_mode != lp->phy_mode) {
-		lp->phy_mode = phy_mode;
-		lp->mcr0 = (lp->mcr0 & 0x7fff) | phy_mode;
-		iowrite16(lp->mcr0, ioaddr);
-	}
-
-	/* Timer active again */
-	mod_timer(&lp->timer, round_jiffies(jiffies + HZ));
-
-	/* Check media */
-	mii_check_media(&lp->mii_if, 1, 1);
-}
 
 /* Read/set MAC address routines */
 static void r6040_mac_address(struct net_device *dev)
@@ -873,10 +782,6 @@ static int r6040_open(struct net_device *dev)
 	napi_enable(&lp->napi);
 	netif_start_queue(dev);
 
-	/* set and active a timer process */
-	setup_timer(&lp->timer, r6040_timer, (unsigned long) dev);
-	if (lp->switch_sig != ICPLUS_PHY_ID)
-		mod_timer(&lp->timer, jiffies + HZ);
 	return 0;
 }
 
@@ -1015,40 +920,22 @@ static void netdev_get_drvinfo(struct net_device *dev,
 static int netdev_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
 	struct r6040_private *rp = netdev_priv(dev);
-	int rc;
-
-	spin_lock_irq(&rp->lock);
-	rc = mii_ethtool_gset(&rp->mii_if, cmd);
-	spin_unlock_irq(&rp->lock);
 
-	return rc;
+	return  phy_ethtool_gset(rp->phydev, cmd);
 }
 
 static int netdev_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
 {
 	struct r6040_private *rp = netdev_priv(dev);
-	int rc;
-
-	spin_lock_irq(&rp->lock);
-	rc = mii_ethtool_sset(&rp->mii_if, cmd);
-	spin_unlock_irq(&rp->lock);
-	r6040_set_carrier(&rp->mii_if);
-
-	return rc;
-}
-
-static u32 netdev_get_link(struct net_device *dev)
-{
-	struct r6040_private *rp = netdev_priv(dev);
 
-	return mii_link_ok(&rp->mii_if);
+	return phy_ethtool_sset(rp->phydev, cmd);
 }
 
 static const struct ethtool_ops netdev_ethtool_ops = {
 	.get_drvinfo		= netdev_get_drvinfo,
 	.get_settings		= netdev_get_settings,
 	.set_settings		= netdev_set_settings,
-	.get_link		= netdev_get_link,
+	.get_link		= ethtool_op_get_link,
 };
 
 static const struct net_device_ops r6040_netdev_ops = {
@@ -1067,6 +954,79 @@ static const struct net_device_ops r6040_netdev_ops = {
 #endif
 };
 
+static void r6040_adjust_link(struct net_device *dev)
+{
+	struct r6040_private *lp = netdev_priv(dev);
+	struct phy_device *phydev = lp->phydev;
+	int status_changed = 0;
+	void __iomem *ioaddr = lp->base;
+
+	BUG_ON(!phydev);
+
+	if (lp->old_link != phydev->link) {
+		status_changed = 1;
+		lp->old_link = phydev->link;
+	}
+
+	/* reflect duplex change */
+	if (phydev->link && (lp->old_duplex != phydev->duplex)) {
+		lp->mcr0 |= (phydev->duplex == DUPLEX_FULL ? 0x8000 : 0);
+		iowrite16(lp->mcr0, ioaddr);
+
+		status_changed = 1;
+		lp->old_duplex = phydev->duplex;
+	}
+
+	if (status_changed) {
+		pr_info("%s: link %s", dev->name, phydev->link ?
+			"UP" : "DOWN");
+		if (phydev->link)
+			pr_cont(" - %d/%s", phydev->speed,
+			DUPLEX_FULL == phydev->duplex ? "full" : "half");
+		pr_cont("\n");
+	}
+}
+
+static int r6040_mii_probe(struct net_device *dev)
+{
+	struct r6040_private *lp = netdev_priv(dev);
+	struct phy_device *phydev = NULL;
+
+	phydev = phy_find_first(lp->mii_bus);
+	if (!phydev) {
+		dev_err(&lp->pdev->dev, "no PHY found\n");
+		return -ENODEV;
+	}
+
+	phydev = phy_connect(dev, dev_name(&phydev->dev), &r6040_adjust_link,
+				0, PHY_INTERFACE_MODE_MII);
+
+	if (IS_ERR(phydev)) {
+		dev_err(&lp->pdev->dev, "could not attach to PHY\n");
+		return PTR_ERR(phydev);
+	}
+
+	/* mask with MAC supported features */
+	phydev->supported &= (SUPPORTED_10baseT_Half
+				| SUPPORTED_10baseT_Full
+				| SUPPORTED_100baseT_Half
+				| SUPPORTED_100baseT_Full
+				| SUPPORTED_Autoneg
+				| SUPPORTED_MII
+				| SUPPORTED_TP);
+
+	phydev->advertising = phydev->supported;
+	lp->phydev = phydev;
+	lp->old_link = 0;
+	lp->old_duplex = -1;
+
+	dev_info(&lp->pdev->dev, "attached PHY driver [%s] "
+		"(mii_bus:phy_addr=%s)\n",
+		phydev->drv->name, dev_name(&phydev->dev));
+
+	return 0;
+}
+
 static int __devinit r6040_init_one(struct pci_dev *pdev,
 					 const struct pci_device_id *ent)
 {
@@ -1077,6 +1037,7 @@ static int __devinit r6040_init_one(struct pci_dev *pdev,
 	static int card_idx = -1;
 	int bar = 0;
 	u16 *adrp;
+	int i;
 
 	printk("%s\n", version);
 
@@ -1163,7 +1124,6 @@ static int __devinit r6040_init_one(struct pci_dev *pdev,
 	/* Init RDC private data */
 	lp->mcr0 = 0x1002;
 	lp->phy_addr = phy_table[card_idx];
-	lp->switch_sig = 0;
 
 	/* The RDC-specific entries in the device structure. */
 	dev->netdev_ops = &r6040_netdev_ops;
@@ -1171,28 +1131,54 @@ static int __devinit r6040_init_one(struct pci_dev *pdev,
 	dev->watchdog_timeo = TX_TIMEOUT;
 
 	netif_napi_add(dev, &lp->napi, r6040_poll, 64);
-	lp->mii_if.dev = dev;
-	lp->mii_if.mdio_read = r6040_mdio_read;
-	lp->mii_if.mdio_write = r6040_mdio_write;
-	lp->mii_if.phy_id = lp->phy_addr;
-	lp->mii_if.phy_id_mask = 0x1f;
-	lp->mii_if.reg_num_mask = 0x1f;
-
-	/* Check the vendor ID on the PHY, if 0xffff assume none attached */
-	if (r6040_phy_read(ioaddr, lp->phy_addr, 2) == 0xffff) {
-		dev_err(&pdev->dev, "Failed to detect an attached PHY\n");
-		err = -ENODEV;
+
+	lp->mii_bus = mdiobus_alloc();
+	if (!lp->mii_bus) {
+		dev_err(&pdev->dev, "mdiobus_alloc() failed\n");
 		goto err_out_unmap;
 	}
 
+	lp->mii_bus->priv = dev;
+	lp->mii_bus->read = r6040_mdiobus_read;
+	lp->mii_bus->write = r6040_mdiobus_write;
+	lp->mii_bus->reset = r6040_mdiobus_reset;
+	lp->mii_bus->name = "r6040_eth_mii";
+	snprintf(lp->mii_bus->id, MII_BUS_ID_SIZE, "%x", card_idx);
+	lp->mii_bus->irq = kmalloc(sizeof(int)*PHY_MAX_ADDR, GFP_KERNEL);
+	if (!lp->mii_bus->irq) {
+		dev_err(&pdev->dev, "mii_bus irq allocation failed\n");
+		goto err_out_mdio;
+	}
+
+	for (i = 0; i < PHY_MAX_ADDR; i++)
+		lp->mii_bus->irq[i] = PHY_POLL;
+
+	err = mdiobus_register(lp->mii_bus);
+	if (err) {
+		dev_err(&pdev->dev, "failed to register MII bus\n");
+		goto err_out_mdio_irq;
+	}
+
+	err = r6040_mii_probe(dev);
+	if (err) {
+		dev_err(&pdev->dev, "failed to probe MII bus\n");
+		goto err_out_mdio_unregister;
+	}
+
 	/* Register net device. After this dev->name assign */
 	err = register_netdev(dev);
 	if (err) {
 		dev_err(&pdev->dev, "Failed to register net device\n");
-		goto err_out_unmap;
+		goto err_out_mdio_unregister;
 	}
 	return 0;
 
+err_out_mdio_unregister:
+	mdiobus_unregister(lp->mii_bus);
+err_out_mdio_irq:
+	kfree(lp->mii_bus->irq);
+err_out_mdio:
+	mdiobus_free(lp->mii_bus);
 err_out_unmap:
 	pci_iounmap(pdev, ioaddr);
 err_out_free_res:
@@ -1206,8 +1192,12 @@ err_out:
 static void __devexit r6040_remove_one(struct pci_dev *pdev)
 {
 	struct net_device *dev = pci_get_drvdata(pdev);
+	struct r6040_private *lp = netdev_priv(dev);
 
 	unregister_netdev(dev);
+	mdiobus_unregister(lp->mii_bus);
+	kfree(lp->mii_bus->irq);
+	mdiobus_free(lp->mii_bus);
 	pci_release_regions(pdev);
 	free_netdev(dev);
 	pci_disable_device(pdev);
-- 
1.7.1



^ permalink raw reply related

* [PATCH 2/2] r6040: bump version to 0.26 and date to 30 May 2010
From: Florian Fainelli @ 2010-05-31 19:19 UTC (permalink / raw)
  To: netdev, David Miller

Signed-off-by: Florian Fainelli <florian@openwrt.org>
---
diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c
index 6878511..7d482a2 100644
--- a/drivers/net/r6040.c
+++ b/drivers/net/r6040.c
@@ -49,8 +49,8 @@
 #include <asm/processor.h>
 
 #define DRV_NAME	"r6040"
-#define DRV_VERSION	"0.25"
-#define DRV_RELDATE	"20Aug2009"
+#define DRV_VERSION	"0.26"
+#define DRV_RELDATE	"30May2010"
 
 /* PHY CHIP Address */
 #define PHY1_ADDR	1	/* For MAC1 */
-- 
1.7.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox