* Re: [PATCH v3] net: batch skb dequeueing from softnet input_pkt_queue
From: Eric Dumazet @ 2010-04-14 15:20 UTC (permalink / raw)
To: Changli Gao; +Cc: David S. Miller, netdev
In-Reply-To: <1271238738-8386-1-git-send-email-xiaosuo@gmail.com>
Le mercredi 14 avril 2010 à 17:52 +0800, Changli Gao a écrit :
> batch skb dequeueing from softnet input_pkt_queue
>
> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
> contention and irq disabling/enabling.
>
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Adding stop_machine() with no explanation ?
No ack from my previous comments, suggestions, and still same logic ?
Are we supposed to read patch, test it, make some benches, correct bugs,
say Amen ?
This is becoming silly, if you ask me.
This is a NACK of this patch, obviously.
^ permalink raw reply
* Re: [PATCH net-next-2.6] fasync: RCU locking
From: Eric Dumazet @ 2010-04-14 14:57 UTC (permalink / raw)
To: Lai Jiangshan; +Cc: David Miller, Paul E. McKenney, netdev, linux-kernel
In-Reply-To: <4BC57E7D.9060706@cn.fujitsu.com>
Le mercredi 14 avril 2010 à 16:36 +0800, Lai Jiangshan a écrit :
> Since rcu_read_lock() protects fasync_struct *fa for us, we can access
> @fa safely even if fasync_remove_entry() has just been called.
>
> But this patch does not ensure 'fa->fa_file is not freed' nor
> 'fa->fa_fd is not released', so kill_fasync_rcu() may do the wrong thing
> if no other code ensures it.
You are 100% right, I forgot that my old attempt to RCUify struct files
failed...
Maybe it's time to finally move f_owner out of struct file, and use RCU
to free it.
In the mean time, adding a lock in fasync_struct is more than enough.
Thanks !
[PATCH net-next-2.6 v2] fasync: fine grained locking
kill_fasync() uses a central rwlock, candidate for RCU conversion, to
avoid cache line ping pongs on SMP.
fasync_remove_entry() and fasync_add_entry() can disable IRQs on a short
section instead of during the whole list scan.
Use a spinlock per fasync_struct to synchronize fasync_{remove|
add}_entry() and kill_fasync_rcu()
We can remove __kill_fasync() direct use in net, and rename it to
kill_fasync_rcu().
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
v2: As Lai Jiangshan noticed, we need a mutual exclusion between
fasync_{remove|add}_entry() and kill_fasync_rcu().
fs/fcntl.c | 66 +++++++++++++++++++++++++++----------------
include/linux/fs.h | 12 +++----
net/socket.c | 4 +-
3 files changed, 50 insertions(+), 32 deletions(-)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f..0a14074 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -614,9 +614,15 @@ int send_sigurg(struct fown_struct *fown)
return ret;
}
-static DEFINE_RWLOCK(fasync_lock);
+static DEFINE_SPINLOCK(fasync_lock);
static struct kmem_cache *fasync_cache __read_mostly;
+static void fasync_free_rcu(struct rcu_head *head)
+{
+ kmem_cache_free(fasync_cache,
+ container_of(head, struct fasync_struct, fa_rcu));
+}
+
/*
* Remove a fasync entry. If successfully removed, return
* positive and clear the FASYNC flag. If no entry exists,
@@ -625,8 +631,6 @@ static struct kmem_cache *fasync_cache __read_mostly;
* NOTE! It is very important that the FASYNC flag always
* match the state "is the filp on a fasync list".
*
- * We always take the 'filp->f_lock', in since fasync_lock
- * needs to be irq-safe.
*/
static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
{
@@ -634,17 +638,22 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
int result = 0;
spin_lock(&filp->f_lock);
- write_lock_irq(&fasync_lock);
+ spin_lock(&fasync_lock);
for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
if (fa->fa_file != filp)
continue;
+
+ spin_lock_irq(&fa->fa_lock);
+ fa->fa_file = NULL;
+ spin_unlock_irq(&fa->fa_lock);
+
*fp = fa->fa_next;
- kmem_cache_free(fasync_cache, fa);
+ call_rcu(&fa->fa_rcu, fasync_free_rcu);
filp->f_flags &= ~FASYNC;
result = 1;
break;
}
- write_unlock_irq(&fasync_lock);
+ spin_unlock(&fasync_lock);
spin_unlock(&filp->f_lock);
return result;
}
@@ -666,25 +675,30 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
return -ENOMEM;
spin_lock(&filp->f_lock);
- write_lock_irq(&fasync_lock);
+ spin_lock(&fasync_lock);
for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
if (fa->fa_file != filp)
continue;
+
+ spin_lock_irq(&fa->fa_lock);
fa->fa_fd = fd;
+ spin_unlock_irq(&fa->fa_lock);
+
kmem_cache_free(fasync_cache, new);
goto out;
}
+ spin_lock_init(&new->fa_lock);
new->magic = FASYNC_MAGIC;
new->fa_file = filp;
new->fa_fd = fd;
new->fa_next = *fapp;
- *fapp = new;
+ rcu_assign_pointer(*fapp, new);
result = 1;
filp->f_flags |= FASYNC;
out:
- write_unlock_irq(&fasync_lock);
+ spin_unlock(&fasync_lock);
spin_unlock(&filp->f_lock);
return result;
}
@@ -704,37 +718,41 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
EXPORT_SYMBOL(fasync_helper);
-void __kill_fasync(struct fasync_struct *fa, int sig, int band)
+/*
+ * rcu_read_lock() is held
+ */
+static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
while (fa) {
- struct fown_struct * fown;
+ struct fown_struct *fown;
if (fa->magic != FASYNC_MAGIC) {
printk(KERN_ERR "kill_fasync: bad magic number in "
"fasync_struct!\n");
return;
}
- fown = &fa->fa_file->f_owner;
- /* Don't send SIGURG to processes which have not set a
- queued signum: SIGURG has its own default signalling
- mechanism. */
- if (!(sig == SIGURG && fown->signum == 0))
- send_sigio(fown, fa->fa_fd, band);
- fa = fa->fa_next;
+ spin_lock(&fa->fa_lock);
+ if (fa->fa_file) {
+ fown = &fa->fa_file->f_owner;
+ /* Don't send SIGURG to processes which have not set a
+ queued signum: SIGURG has its own default signalling
+ mechanism. */
+ if (!(sig == SIGURG && fown->signum == 0))
+ send_sigio(fown, fa->fa_fd, band);
+ }
+ spin_unlock(&fa->fa_lock);
+ fa = rcu_dereference(fa->fa_next);
}
}
-EXPORT_SYMBOL(__kill_fasync);
-
void kill_fasync(struct fasync_struct **fp, int sig, int band)
{
/* First a quick test without locking: usually
* the list is empty.
*/
if (*fp) {
- read_lock(&fasync_lock);
- /* reread *fp after obtaining the lock */
- __kill_fasync(*fp, sig, band);
- read_unlock(&fasync_lock);
+ rcu_read_lock();
+ kill_fasync_rcu(rcu_dereference(*fp), sig, band);
+ rcu_read_unlock();
}
}
EXPORT_SYMBOL(kill_fasync);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 39d57bc..018d382 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1280,10 +1280,12 @@ static inline int lock_may_write(struct inode *inode, loff_t start,
struct fasync_struct {
- int magic;
- int fa_fd;
- struct fasync_struct *fa_next; /* singly linked list */
- struct file *fa_file;
+ spinlock_t fa_lock;
+ int magic;
+ int fa_fd;
+ struct fasync_struct *fa_next; /* singly linked list */
+ struct file *fa_file;
+ struct rcu_head fa_rcu;
};
#define FASYNC_MAGIC 0x4601
@@ -1292,8 +1294,6 @@ struct fasync_struct {
extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);
-/* only for net: no internal synchronization */
-extern void __kill_fasync(struct fasync_struct *, int, int);
extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, unsigned long arg, int force);
diff --git a/net/socket.c b/net/socket.c
index 35bc198..846739c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1159,10 +1159,10 @@ int sock_wake_async(struct socket *sock, int how, int band)
/* fall through */
case SOCK_WAKE_IO:
call_kill:
- __kill_fasync(sock->fasync_list, SIGIO, band);
+ kill_fasync(sock->fasync_list, SIGIO, band);
break;
case SOCK_WAKE_URG:
- __kill_fasync(sock->fasync_list, SIGURG, band);
+ kill_fasync(sock->fasync_list, SIGURG, band);
}
return 0;
}
^ permalink raw reply related
* Re: [RFC][PATCH v3 1/3] A device for zero-copy based on KVM virtio-net.
From: Arnd Bergmann @ 2010-04-14 14:55 UTC (permalink / raw)
To: xiaohui.xin; +Cc: netdev, kvm, linux-kernel, mst, mingo, davem, jdike
In-Reply-To: <1270805865-16901-2-git-send-email-xiaohui.xin@intel.com>
On Friday 09 April 2010, xiaohui.xin@intel.com wrote:
> From: Xin Xiaohui <xiaohui.xin@intel.com>
>
> Add a device to utilize the vhost-net backend driver for
> copy-less data transfer between guest FE and host NIC.
> It pins the guest user space to the host memory and
> provides proto_ops as sendmsg/recvmsg to vhost-net.
Sorry for taking so long before finding the time to look
at your code in more detail.
It seems that you are duplicating a lot of functionality that
is already in macvtap. I've asked about this before but then
didn't look at your newer versions. Can you explain the value
of introducing another interface to user land?
I'm still planning to add zero-copy support to macvtap,
hopefully reusing parts of your code, but do you think there
is value in having both?
> diff --git a/drivers/vhost/mpassthru.c b/drivers/vhost/mpassthru.c
> new file mode 100644
> index 0000000..86d2525
> --- /dev/null
> +++ b/drivers/vhost/mpassthru.c
> @@ -0,0 +1,1264 @@
> +
> +#ifdef MPASSTHRU_DEBUG
> +static int debug;
> +
> +#define DBG if (mp->debug) printk
> +#define DBG1 if (debug == 2) printk
> +#else
> +#define DBG(a...)
> +#define DBG1(a...)
> +#endif
This should probably just use the existing dev_dbg/pr_debug infrastructure.
> [... skipping buffer management code for now]
> +static int mp_sendmsg(struct kiocb *iocb, struct socket *sock,
> + struct msghdr *m, size_t total_len)
> +{
> [...]
This function looks like we should be able to easily include it into
macvtap and get zero-copy transmits without introducing the new
user-level interface.
> +static int mp_recvmsg(struct kiocb *iocb, struct socket *sock,
> + struct msghdr *m, size_t total_len,
> + int flags)
> +{
> + struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
> + struct page_ctor *ctor;
> + struct vhost_virtqueue *vq = (struct vhost_virtqueue *)(iocb->private);
It smells like a layering violation to look at the iocb->private field
from a lower-level driver. I would have hoped that it's possible to implement
this without having this driver know about the higher-level vhost driver
internals. Can you explain why this is needed?
> + spin_lock_irqsave(&ctor->read_lock, flag);
> + list_add_tail(&info->list, &ctor->readq);
> + spin_unlock_irqrestore(&ctor->read_lock, flag);
> +
> + if (!vq->receiver) {
> + vq->receiver = mp_recvmsg_notify;
> + set_memlock_rlimit(ctor, RLIMIT_MEMLOCK,
> + vq->num * 4096,
> + vq->num * 4096);
> + }
> +
> + return 0;
> +}
Not sure what I'm missing, but who calls the vq->receiver? This seems
to be neither in the upstream version of vhost nor introduced by your
patch.
> +static void __mp_detach(struct mp_struct *mp)
> +{
> + mp->mfile = NULL;
> +
> + mp_dev_change_flags(mp->dev, mp->dev->flags & ~IFF_UP);
> + page_ctor_detach(mp);
> + mp_dev_change_flags(mp->dev, mp->dev->flags | IFF_UP);
> +
> + /* Drop the extra count on the net device */
> + dev_put(mp->dev);
> +}
> +
> +static DEFINE_MUTEX(mp_mutex);
> +
> +static void mp_detach(struct mp_struct *mp)
> +{
> + mutex_lock(&mp_mutex);
> + __mp_detach(mp);
> + mutex_unlock(&mp_mutex);
> +}
> +
> +static void mp_put(struct mp_file *mfile)
> +{
> + if (atomic_dec_and_test(&mfile->count))
> + mp_detach(mfile->mp);
> +}
> +
> +static int mp_release(struct socket *sock)
> +{
> + struct mp_struct *mp = container_of(sock->sk, struct mp_sock, sk)->mp;
> + struct mp_file *mfile = mp->mfile;
> +
> + mp_put(mfile);
> + sock_put(mp->socket.sk);
> + put_net(mfile->net);
> +
> + return 0;
> +}
Doesn't this prevent the underlying interface from going away while the chardev
is open? You also have logic to handle that case, so why do you keep the extra
reference on the netdev?
> +/* Ops structure to mimic raw sockets with mp device */
> +static const struct proto_ops mp_socket_ops = {
> + .sendmsg = mp_sendmsg,
> + .recvmsg = mp_recvmsg,
> + .release = mp_release,
> +};
> +static int mp_chr_open(struct inode *inode, struct file * file)
> +{
> + struct mp_file *mfile;
> + cycle_kernel_lock();
I don't think you really want to use the BKL here, just kill that line.
> +static long mp_chr_ioctl(struct file *file, unsigned int cmd,
> + unsigned long arg)
> +{
> + struct mp_file *mfile = file->private_data;
> + struct mp_struct *mp;
> + struct net_device *dev;
> + void __user* argp = (void __user *)arg;
> + struct ifreq ifr;
> + struct sock *sk;
> + int ret;
> +
> + ret = -EINVAL;
> +
> + switch (cmd) {
> + case MPASSTHRU_BINDDEV:
> + ret = -EFAULT;
> + if (copy_from_user(&ifr, argp, sizeof ifr))
> + break;
This is broken for 32 bit compat mode ioctls, because struct ifreq
is different between 32 and 64 bit systems. Since you are only
using the device name anyway, a fixed length string or just the
interface index would be simpler and work better.
> + ifr.ifr_name[IFNAMSIZ-1] = '\0';
> +
> + ret = -EBUSY;
> +
> + if (ifr.ifr_flags & IFF_MPASSTHRU_EXCL)
> + break;
Your current use of the IFF_MPASSTHRU* flags does not seem to make
any sense whatsoever. You check that this flag is never set, but set
it later yourself and then ignore all flags.
> + ret = -ENODEV;
> + dev = dev_get_by_name(mfile->net, ifr.ifr_name);
> + if (!dev)
> + break;
There is no permission checking on who can access what device, which
seems a bit simplistic. Any user that has access to the mpassthru device
seems to be able to bind to any network interface in the namespace.
This is one point where the macvtap model seems more appropriate, it
separates the permissions for creating logical interfaces and using them.
> +static ssize_t mp_chr_aio_write(struct kiocb *iocb, const struct iovec *iov,
> + unsigned long count, loff_t pos)
> +{
> + struct file *file = iocb->ki_filp;
> + struct mp_struct *mp = mp_get(file->private_data);
> + struct sock *sk = mp->socket.sk;
> + struct sk_buff *skb;
> + int len, err;
> + ssize_t result;
Can you explain what this function is even there for? AFAICT, vhost-net
doesn't call it, the interface is incompatible with the existing
tap interface, and you don't provide a read function.
> diff --git a/include/linux/mpassthru.h b/include/linux/mpassthru.h
> new file mode 100644
> index 0000000..2be21c5
> --- /dev/null
> +++ b/include/linux/mpassthru.h
> @@ -0,0 +1,29 @@
> +#ifndef __MPASSTHRU_H
> +#define __MPASSTHRU_H
> +
> +#include <linux/types.h>
> +#include <linux/if_ether.h>
> +
> +/* ioctl defines */
> +#define MPASSTHRU_BINDDEV _IOW('M', 213, int)
> +#define MPASSTHRU_UNBINDDEV _IOW('M', 214, int)
These definitions are slightly wrong, because you pass more than just an 'int'.
> +/* MPASSTHRU ifc flags */
> +#define IFF_MPASSTHRU 0x0001
> +#define IFF_MPASSTHRU_EXCL 0x0002
As mentioned above, these flags don't make any sense with your current code.
Arnd
^ permalink raw reply
* Re: forcedeth driver hangs under heavy load
From: stephen mulcahy @ 2010-04-14 14:30 UTC (permalink / raw)
To: Ayaz Abdulla
Cc: Eric Dumazet, David Miller, bhutchings@solarflare.com,
netdev@vger.kernel.org, ben@decadent.org.uk,
572201@bugs.debian.org
In-Reply-To: <4BC5539B.6050908@nvidia.com>
Ayaz Abdulla wrote:
> Attached fix has been submitted to netdev.
I've run my reproducer with this patch applied to the Debian 2.6.32
kernel and so far the problem with nodes becoming unresponsive hasn't
occurred.
NIC settings were left the default so this looks positive
root@node23:~# ethtool -k eth0
Offload parameters for eth0:
rx-checksumming: on
tx-checksumming: on
scatter-gather: on
tcp-segmentation-offload: on
udp-fragmentation-offload: off
generic-segmentation-offload: on
generic-receive-offload: off
large-receive-offload: off
Thanks!
-stephen
^ permalink raw reply
* [GIT PULL] vhost-net fix for 2.6.34-rc4
From: Michael S. Tsirkin @ 2010-04-14 14:17 UTC (permalink / raw)
To: David Miller, netdev, Christoph Hellwig
David,
The following tree includes a patch fixing an issue with vhost-net in
2.6.34-rc4. Please pull for 2.6.34.
Thanks!
The following changes since commit 2ba3abd8186f24c7fb418927025b4e2120e3a362:
Merge branch 'pm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael/suspend-2.6 (2010-04-13 17:49:48 -0700)
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost
Christoph Hellwig (1):
vhost: fix sparse warnings
drivers/vhost/net.c | 4 ++--
drivers/vhost/vhost.c | 11 ++++++-----
2 files changed, 8 insertions(+), 7 deletions(-)
^ permalink raw reply
* Re: [PATCH v3] net: batch skb dequeueing from softnet input_pkt_queue
From: jamal @ 2010-04-14 12:28 UTC (permalink / raw)
To: Changli Gao; +Cc: David S. Miller, Eric Dumazet, netdev
In-Reply-To: <o2k412e6f7f1004140513h8de62790tb775bb357e2db6b1@mail.gmail.com>
On Wed, 2010-04-14 at 20:13 +0800, Changli Gao wrote:
> On Wed, Apr 14, 2010 at 7:58 PM, jamal <hadi@cyberus.ca> wrote:
> No extra IPI is needed.
>
> + qlen = queue->input_pkt_queue.qlen + queue->processing_queue.qlen;
> + if (qlen <= netdev_max_backlog) {
> + if (qlen) {
>
> the packets in processing_queue are counted too.
Ok - Looks reasonable.
> > IPIs add to latency (refer to my other email). Did you test this
> > to reach some conclusion that it improves things or was it just by
> > inspection?
> >
>
> :( only inspection.
I am probably being pushy, but one simple test for latency of single
flow is:
from machine 1, send ping -f
on rps machine:
Base test: no rps on ( a fresh boot with no sysctls should do fine)
Test 1: irq affinity on cpuX, rps to cpuY
Test 2: repeat test1 with your change.
It should show no difference between test1 and 2. If it shows
improvement better - but showing worse latency is bad.
cheers,
jamal
^ permalink raw reply
* Re: [PATCH v3] net: batch skb dequeueing from softnet input_pkt_queue
From: Changli Gao @ 2010-04-14 12:13 UTC (permalink / raw)
To: hadi; +Cc: David S. Miller, Eric Dumazet, netdev
In-Reply-To: <1271246304.3943.60.camel@bigi>
On Wed, Apr 14, 2010 at 7:58 PM, jamal <hadi@cyberus.ca> wrote:
>
> It seems we are now going to generate a lot more IPIs with such a
> change. At least this is what i am imagining.
> CPU0: packet comes in,queue empty, generate an IPI to CPU1
> CPU0: second packet comes in, enqueue
> CPU1: grab two packets to process and run with them
> CPU0: packet comes in,queue empty, generate an IPI to CPU1
No extra IPI is needed.
+ qlen = queue->input_pkt_queue.qlen + queue->processing_queue.qlen;
+ if (qlen <= netdev_max_backlog) {
+ if (qlen) {
the packets in processing_queue are counted too.
>
> IPIs add to latency (refer to my other email). Did you test this
> to reach some conclusion that it improves things or was it just by
> inspection?
>
:( only inspection.
--
Regards,
Changli Gao(xiaosuo@gmail.com)
^ permalink raw reply
* Re: [RFC] random SYN drops causing connect() delays
From: Lennart Schulte @ 2010-04-14 11:37 UTC (permalink / raw)
To: tgraf; +Cc: netdev
In-Reply-To: <20100412080633.GA27418@bombadil.infradead.org>
Hi,
this is very similar to what i have noticed, but up to now I couldn't figure out where it came from.
Thanks very much for clearing it up!
> I have been tracking down an issue commonly referred to as the 3-sec
> connect() delay. It exists since recent 2.6.x kernels and has never
> been fixed even though it disappeared in recent releases unless
> sched_child_runs_first is set to 1 again.
>
> What happens is that if a client attempts to open many connections to
> a socket with only minimal delay in between attempts, some SYNs are
> randomly dropped on the server side causing the client to resend after
> the 3 sec TCP timeout and thus causing connect()s to be randomly delayed.
>
> Facts:
> - Issue can be reproduced over loopback or real networks.
> - Enabling SO_LINGER on the client side will make the issue disappear!!
> - While the issue is appearing, the acceptq seems to be overflowing. Both
> LISTENOVERFLOWS and LISTENDROPS are increasing although not by the exact
> number of delay occurrences. inetdiag reports sk_max_ack_backlog to be 0
> therefore one possibility that comes to mind is that sk_ack_backlog
> underflows due to a race.
> - The issue disappeared in recent kernels, I bisected it down to the following
> commit:
> commit 2bba22c50b06abe9fd0d23933b1e64d35b419262
> Author: Mike Galbraith <efault@gmx.de>
> Date: Wed Sep 9 15:41:37 2009 +0200
>
> sched: Turn off child_runs_first
>
> Set child_runs_first default to off.
>
> Setting kernel.sched_child_runs_first=1 makes the issue reappear in recent
> kernels. This hardens the theory of a race condition.
> - It looks like the issue can only be reproduced if the server
> socket sends out data immediately after the connection has been established
> but I cannot prove this theory.
^ permalink raw reply
* Re: [PATCH v3] net: batch skb dequeueing from softnet input_pkt_queue
From: jamal @ 2010-04-14 11:58 UTC (permalink / raw)
To: Changli Gao; +Cc: David S. Miller, Eric Dumazet, netdev
In-Reply-To: <1271238738-8386-1-git-send-email-xiaosuo@gmail.com>
On Wed, 2010-04-14 at 17:52 +0800, Changli Gao wrote:
> batch skb dequeueing from softnet input_pkt_queue
>
> batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
> contention and irq disabling/enabling.
>
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
It seems we are now going to generate a lot more IPIs with such a
change. At least this is what i am imagining.
CPU0: packet comes in,queue empty, generate an IPI to CPU1
CPU0: second packet comes in, enqueue
CPU1: grab two packets to process and run with them
CPU0: packet comes in,queue empty, generate an IPI to CPU1
..
...
.....
IPIs add to latency (refer to my other email). Did you test this
to reach some conclusion that it improves things or was it just by
inspection?
cheers,
jamal
^ permalink raw reply
* Re: [PATCH] tun: orphan an skb on tx
From: David Miller @ 2010-04-14 11:55 UTC (permalink / raw)
To: herbert
Cc: eric.dumazet, mst, jan.kiszka, paul.moore, David.Woodhouse,
netdev, linux-kernel, qemu-devel
In-Reply-To: <20100414005822.GD18044@gondor.apana.org.au>
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 14 Apr 2010 08:58:22 +0800
> On Tue, Apr 13, 2010 at 08:31:03PM +0200, Eric Dumazet wrote:
>>
>> Herbert Acked your patch, so I guess its OK, but I think it can be
>> dangerous.
>
> The tun socket accounting was never designed to stop it from
> flooding another tun interface. It's there to stop it from
> transmitting above a destination interface TX bandwidth and
> cause unnecessary packet drops. It also limits the total amount
> of kernel memory that can be pinned down by a single tun interface.
>
> In this case, all we're doing is shifting the accounting from the
> "hardware" queue to the qdisc queue.
>
> So your ability to flood a tun interface is essentially unchanged.
>
> BTW we do the same thing in a number of hardware drivers, as well
> as virtio-net.
Right. Although this reminds me about the whole SKB
orphaning on xmit issue that keeps coming back to haunt
us.
If there weren't odd references to the SKB's socket in
the packet scheduler et al. we could just orphan these
things right upon entry to the qdisc and not have to
add hacks like this to every driver.
In fact... maybe we can just do it in dev_hard_queue_xmit()
since we are out of the qdisc at that point.... but I guess
there might be weird drivers that want the SKB socket in
their ->xmit routine... Ho hum.
In any event that's net-next-2.6 exploratory material, and I've
applied this patch to net-2.6, thanks!
^ permalink raw reply
* rps performance WAS(Re: rps: question
From: jamal @ 2010-04-14 11:53 UTC (permalink / raw)
To: Tom Herbert
Cc: Eric Dumazet, netdev, robert, David Miller, Changli Gao,
Andi Kleen
In-Reply-To: <1265641748.3688.56.camel@bigi>
Following up like promised:
On Mon, 2010-02-08 at 10:09 -0500, jamal wrote:
> On Sun, 2010-02-07 at 21:58 -0800, Tom Herbert wrote:
>
> > I don't have specific numbers, although we are using this on
> > application doing forwarding and numbers seem in line with what we see
> > for an end host.
> >
>
> When i get the chance i will give it a run. I have access to an i7
> somewhere. It seems like i need some specific nics?
I did step #0 last night on an i7 (single Nehalem). I think more than
anything i was impressed by the Nehalem's excellent caching system.
Robert, I am almost tempted to say skb recycling performance will be
excellent on this machine given the cost of a cache miss is much lower
than previous generation hardware.
My test was simple: irq affinity on cpu0(core0) and rps redirection to
cpu1(core 1); tried also to redirect to different SMT threads (aka CPUs)
on different cores with similar results. I base tested against no rps
being used and a kernel which didnt have any RPS config on.
[BTW, I had to hand-edit the .config since i couldnt do it from
menuconfig (Is there any reason for it to be so?)]
Traffic was sent from another machine into the i7 via an el-cheapo sky2
(dont know how shitty this NIC is, but it seems to know how to do MSI so
probably capable of multiqueueing); the test was several sets of
a ping first and then a ping -f (I will get more sophisticated in my
next test likely this weekend).
Results:
CPU utilization was about 20-30% higher in the case of rps. On cpu0, the
cpu was being chewed highly by sky2_poll and on the redirected-to-core
it was always smp_call_function_single.
Latency was (consistently) on average 5 microseconds higher with RPS.
So if i sent 1M ping -f packets, without RPS it took on average
176 seconds and with RPS it took 181 seconds to do a round-trip.
Throughput didnt change but this could be attributed to the low amounts
of data i was sending.
I observed that we were generating, on average, an IPI per packet even
with ping -f. (added an extra stat to record when we sent an IPI and
counted against the number of packets sent).
In my opinion it is these IPIs that contribute the most to the latency
and i think it happens that the Nehalem is just highly improved in this
area. I wish i had a more commonly used machine to test rps on.
I expect that rps will perform worse on currently cheaper/older hardware
for the traffic characteristic i tested.
On IPIs:
Is anyone familiar with what is going on with Nehalem? Why is it this
good? I expect things will get a lot nastier with other hardware like
xeon based or even Nehalem with rps going across QPI.
Here's why i think IPIs are bad, please correct me if i am wrong:
- they are synchronous. i.e an IPI issuer has to wait for an ACK (which
is in the form of an IPI).
- data cache has to be synced to main memory
- the instruction pipeline is flushed
- what else did i miss? Andi?
So my question to Tom, Eric and Changli or anyone else who has been
running RPS:
What hardware did you use? Is there anyone using older hardware than
say AMD Opteron or Intel Nehalem?
My impressions of rps so far:
I think i may end up being impressed when i generate a lot more traffic
since the cost of IPI will be amortized.
At this point multiqueue seems a much more impressive alternative and it
seems to me multiqueue hardware is a lot more commodity (price-point)
than a Nehalem.
Plan:
I plan to still attack the app space (and write a basic udp app that
binds to one or more rps cpus and try blasting a lot of UDP traffic to
see what happens) my step after that is to move to forwarding tests..
cheers,
jamal
^ permalink raw reply
* Re: HTB - What's the minimal value for 'rate' parameter?
From: Antonio Almeida @ 2010-04-14 10:22 UTC (permalink / raw)
To: Jarek Poplawski; +Cc: netdev, kaber, davem, devik
In-Reply-To: <20100409212657.GA3560@del.dom.local>
What do you mean with "1:2 has grandchildren with overflown rate tables"?
I couldn't understand your idea. Is there any mistake in the
configuration I sent?
How would you set rates for this particular example?
Regards
Antonio Almeida
On Fri, Apr 9, 2010 at 10:26 PM, Jarek Poplawski wrote:
> On Fri, Apr 09, 2010 at 04:40:44PM +0100, Antonio Almeida wrote:
>> So, what about the rate limit miss?
>> As you can see the ceil of class 1:2 is set to 4096Kbit but its
>> sending rate is actually 8071Kbit!
>> It looks like classes 1:10 and 1:11 are ignoring hierarchical rate
>> restrictions of class 1:2
>> Here:
>> class htb 1:2 parent 1:1 rate 4096Kbit ceil 4096Kbit burst 3655b cburst 3655b
>> Sent 84285894 bytes 55671 pkt (dropped 0, overlimits 0 requeues 0)
>> rate 8071Kbit 666pps backlog 0b 0p requeues 0
>> lended: 0 borrowed: 0 giants: 0
>> tokens: -937499999 ctokens: -937499999
>
> Yes, since 1:2 has grandchildren with overflown rate tables, they
> could behave as if they had set rates higher than their parents or
> grandparent (and HTB doesn't restrict it hierarchically).
>
> Jarek P.
>
^ permalink raw reply
* Re: [PATCH] forcedeth: fix tx limit2 flag check
From: stephen mulcahy @ 2010-04-14 10:14 UTC (permalink / raw)
To: Ayaz Abdulla
Cc: David Miller, eric.dumazet@gmail.com, bhutchings@solarflare.com,
netdev@vger.kernel.org, ben@decadent.org.uk,
572201@bugs.debian.org
In-Reply-To: <4BC5532E.7000302@nvidia.com>
Ayaz Abdulla wrote:
> This patch fixes the TX_LIMIT feature flag. The previous logic check for
> TX_LIMIT2 also took into account a device that only had TX_LIMIT set.
>
> Signed-off-by: Ayaz Abdulla <aabdulla@nvidia.com>
>
> This is a fix for bug 572201 @ bugs.debian.org
Hi,
Thanks! I'll rebuild my Debian kernel with this and run a test today.
-stephen
^ permalink raw reply
* [PATCH v3] net: batch skb dequeueing from softnet input_pkt_queue
From: Changli Gao @ 2010-04-14 9:52 UTC (permalink / raw)
To: David S. Miller; +Cc: Eric Dumazet, netdev, Changli Gao
batch skb dequeueing from softnet input_pkt_queue
batch skb dequeueing from softnet input_pkt_queue to reduce potential lock
contention and irq disabling/enabling.
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
include/linux/netdevice.h | 1
net/core/dev.c | 56 ++++++++++++++++++++++++++++++++--------------
2 files changed, 40 insertions(+), 17 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d1a21b5..898bc62 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1335,6 +1335,7 @@ struct softnet_data {
struct call_single_data csd ____cacheline_aligned_in_smp;
#endif
struct sk_buff_head input_pkt_queue;
+ struct sk_buff_head processing_queue;
struct napi_struct backlog;
};
diff --git a/net/core/dev.c b/net/core/dev.c
index a10a216..c635a71 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -131,6 +131,7 @@
#include <linux/random.h>
#include <trace/events/napi.h>
#include <linux/pci.h>
+#include <linux/stop_machine.h>
#include "net-sysfs.h"
@@ -2332,6 +2333,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
{
struct softnet_data *queue;
unsigned long flags;
+ u32 qlen;
queue = &per_cpu(softnet_data, cpu);
@@ -2339,8 +2341,9 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
__get_cpu_var(netdev_rx_stat).total++;
rps_lock(queue);
- if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
- if (queue->input_pkt_queue.qlen) {
+ qlen = queue->input_pkt_queue.qlen + queue->processing_queue.qlen;
+ if (qlen <= netdev_max_backlog) {
+ if (qlen) {
enqueue:
__skb_queue_tail(&queue->input_pkt_queue, skb);
rps_unlock(queue);
@@ -2791,19 +2794,31 @@ int netif_receive_skb(struct sk_buff *skb)
EXPORT_SYMBOL(netif_receive_skb);
/* Network device is going away, flush any packets still pending */
-static void flush_backlog(void *arg)
+static void __flush_backlog(struct sk_buff_head *head, struct net_device *dev)
{
- struct net_device *dev = arg;
- struct softnet_data *queue = &__get_cpu_var(softnet_data);
struct sk_buff *skb, *tmp;
- rps_lock(queue);
- skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
+ skb_queue_walk_safe(head, skb, tmp) {
if (skb->dev == dev) {
- __skb_unlink(skb, &queue->input_pkt_queue);
+ __skb_unlink(skb, head);
kfree_skb(skb);
}
- rps_unlock(queue);
+ }
+}
+
+static int flush_backlog(void *arg)
+{
+ struct net_device *dev = arg;
+ struct softnet_data *queue;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ queue = &per_cpu(softnet_data, cpu);
+ __flush_backlog(&queue->input_pkt_queue, dev);
+ __flush_backlog(&queue->processing_queue, dev);
+ }
+
+ return 0;
}
static int napi_gro_complete(struct sk_buff *skb)
@@ -3118,20 +3133,23 @@ static int process_backlog(struct napi_struct *napi, int quota)
local_irq_disable();
rps_lock(queue);
- skb = __skb_dequeue(&queue->input_pkt_queue);
- if (!skb) {
+ skb_queue_splice_tail_init(&queue->input_pkt_queue,
+ &queue->processing_queue);
+ if (skb_queue_empty(&queue->processing_queue)) {
__napi_complete(napi);
rps_unlock(queue);
local_irq_enable();
- break;
+ return work;
}
rps_unlock(queue);
local_irq_enable();
- __netif_receive_skb(skb);
- } while (++work < quota && jiffies == start_time);
-
- return work;
+ while ((skb = __skb_dequeue(&queue->processing_queue))) {
+ __netif_receive_skb(skb);
+ if (++work >= quota || jiffies != start_time)
+ return work;
+ }
+ } while (1);
}
/**
@@ -5027,7 +5045,7 @@ void netdev_run_todo(void)
dev->reg_state = NETREG_UNREGISTERED;
- on_each_cpu(flush_backlog, dev, 1);
+ stop_machine(flush_backlog, dev, NULL);
netdev_wait_allrefs(dev);
@@ -5487,6 +5505,9 @@ static int dev_cpu_callback(struct notifier_block *nfb,
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
+ while ((skb = __skb_dequeue(&oldsd->processing_queue)))
+ netif_rx(skb);
+
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
netif_rx(skb);
@@ -5709,6 +5730,7 @@ static int __init net_dev_init(void)
queue = &per_cpu(softnet_data, i);
skb_queue_head_init(&queue->input_pkt_queue);
+ skb_queue_head_init(&queue->processing_queue);
queue->completion_queue = NULL;
INIT_LIST_HEAD(&queue->poll_list);
^ permalink raw reply related
* Re: [PATCH net-next-2.6] fasync: RCU locking
From: Lai Jiangshan @ 2010-04-14 8:36 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, Paul E. McKenney, netdev, linux-kernel
In-Reply-To: <1271230961.16881.630.camel@edumazet-laptop>
Eric Dumazet wrote:
> -void __kill_fasync(struct fasync_struct *fa, int sig, int band)
> +/*
> + * rcu_read_lock() is held
> + */
> +static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
> {
> while (fa) {
> struct fown_struct * fown;
> @@ -719,22 +728,19 @@ void __kill_fasync(struct fasync_struct *fa, int sig, int band)
> mechanism. */
> if (!(sig == SIGURG && fown->signum == 0))
> send_sigio(fown, fa->fa_fd, band);
> - fa = fa->fa_next;
> + fa = rcu_dereference(fa->fa_next);
> }
> }
>
Since rcu_read_lock() protects fasync_struct *fa for us, we can access
@fa safely even if fasync_remove_entry() has just been called.
But this patch ensures neither that 'fa->fa_file is not freed' nor that
'fa->fa_fd is not released', so kill_fasync_rcu() may do the wrong thing
if no other code ensures it.
^ permalink raw reply
* Re: usb-sound circular locking again?
From: Richard Zidlicky @ 2010-04-14 8:26 UTC (permalink / raw)
To: Takashi Iwai; +Cc: Andrew Morton, linux-kernel, netdev
In-Reply-To: <s5h4oje5zl9.wl%tiwai@suse.de>
Hi,
> > is this the same old issue?
>
> I think so. It appears relatively new since a sysfs lockdep check was
> introduced.
You are right; it was definitely my impression that this particular instance is
new (the last version I tested previously was 2.6.32.8).
After a few more tests it appears to be 100% repeatable in pm-hibernate. Simply
doing "sync" right now does nothing.
Richard
^ permalink raw reply
* Intel Pro1000 on CN5020, hangs Uboot
From: Jack Daniel @ 2010-04-14 8:18 UTC (permalink / raw)
To: netdev; +Cc: davem
Hi,
I have an OCTEON CN5020, on which I tried plugging in an Intel Pro 1000
PCI NIC. But Uboot hangs with a trap exception. Uboot has no problems
with a Realtek RTL8139 NIC PCI card that supports 100 Mbps. Could
someone tell me the reason for such a behaviour?
Regards,
Jack
Uboot Version (as reported by Uboot) : U-Boot 1.1.1 (Development
build, svnversion: u-boot:47725, exec:47725)
Uboot start up message:
PAL rev: 1.01, MCU rev: 1.07, CPU voltage: 1.20
DRAM: 512 MB
Clearing DRAM....... done
Flash: 8 MB
BIST check passed.
Starting PCI
PCI Status: PCI 32-bit
Reg: 0x0 0x0
Reg: 0x1 0x62
Reg: 0x2 0x0
Reg: 0x3 0x0
Reg: 0x4 0x0
Reg: 0x5 0xE
Reg: 0x6 0x1
Reg: 0x7 0xFFFFFFFFC00D5C70
Reg: 0x8 0x800119040000000E
Reg: 0x9 0x61784
Reg: 0xA 0xFFFFFFFFC0000A4C
Reg: 0xB 0xFFFFFFFFC0062208
Reg: 0xC 0xFFFFFFFFC0062550
Reg: 0xD 0xFFFFFFFFC005E4A0
Reg: 0xE 0x20
Reg: 0xF 0x0
Reg: 0x10 0xFFFFFFFFC00D5CA0
Reg: 0x11 0xFFFFFFFFC008F490
Reg: 0x12 0x0
Reg: 0x13 0x0
Reg: 0x14 0x0
Reg: 0x15 0x0
Reg: 0x16 0xFF00
Reg: 0x17 0xFFFFFFFFC00D5CA6
Reg: 0x18 0xFFFFFFFFC0062550
Reg: 0x19 0xFFFFFFFFC0038E24
Reg: 0x1A 0xFFFFFFFFFFFF8000
Reg: 0x1B 0x30
Reg: 0x1C 0xFFFFFFFFBFC621D0
Reg: 0x1D 0xFFFFFFFFFFFF97F8
Reg: 0x1E 0xFFFFFFFFC00D5CA4
Reg: 0x1F 0xFFFFFFFFBFC00AE0
status: 0x504000E7
cause: 0x4000801C
epc: 0xFFFFFFFFC0038EC8
badvaddr: 0x0
^ permalink raw reply
* Re: [Bonding-devel] [v3 Patch 2/3] bridge: make bridge support netpoll
From: Cong Wang @ 2010-04-14 8:16 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Jay Vosburgh, Eric Dumazet, Neil Horman, netdev, Andy Gospodarek,
bridge, linux-kernel, bonding-devel, Jeff Moyer, Matt Mackall,
David Miller
In-Reply-To: <20100413103320.11a2a4f7@nehalam>
Stephen Hemminger wrote:
> On Tue, 13 Apr 2010 09:52:47 -0700
> Jay Vosburgh <fubar@us.ibm.com> wrote:
>
>> Cong Wang <amwang@redhat.com> wrote:
>>
>>> Stephen Hemminger wrote:
>>>> On Mon, 12 Apr 2010 12:38:57 +0200
>>>> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>
>>>>> Le lundi 12 avril 2010 à 18:37 +0800, Cong Wang a écrit :
>>>>>> Stephen Hemminger wrote:
>>>>>>> There is no protection on dev->priv_flags for SMP access.
>>>>>>> It would better bit value in dev->state if you are using it as control flag.
>>>>>>>
>>>>>>> Then you could use
>>>>>>> if (unlikely(test_and_clear_bit(__IN_NETPOLL, &skb->dev->state)))
>>>>>>> netpoll_send_skb(...)
>>>>>>>
>>>>>>>
>>>>>> Hmm, I think we can't use ->state here, it is not for this kind of purpose,
>>>>>> according to its comments.
>>>>>>
>>>>>> Also, I find other usages of IFF_XXX flags of ->priv_flags are also using
>>>>>> &, | to set or clear the flags. So there must be some other things preventing
>>>>>> the race...
>>>>> Yes, its RTNL that protects priv_flags changes, hopefully...
>>>>>
>>>> The patch was not protecting priv_flags with RTNL.
>>>> For example..
>>>>
>>>>
>>>> @@ -308,7 +312,9 @@ static void netpoll_send_skb(struct netp
>>>> tries > 0; --tries) {
>>>> if (__netif_tx_trylock(txq)) {
>>>> if (!netif_tx_queue_stopped(txq)) {
>>>> + dev->priv_flags |= IFF_IN_NETPOLL;
>>>> status = ops->ndo_start_xmit(skb, dev);
>>>> + dev->priv_flags &= ~IFF_IN_NETPOLL;
>>>> if (status == NETDEV_TX_OK)
>>>> txq_trans_update(txq);
>>> Hmm, but I checked the bonding case (IFF_BONDING), it doesn't
>>> hold rtnl_lock. Strange.
>> I looked, and there are a couple of cases in bonding that don't
>> have RTNL for adjusting priv_flags (in bond_ab_arp_probe when no slaves
>> are up, and a couple of cases in 802.3ad). I think the solution there
>> is to move bonding away from priv_flags for some of this (e.g., convert
>> bonding to use a frame hook like bridge and macvlan, and greatly
>> simplify skb_bond_should_drop), but that's a separate topic.
>>
>> The majority of the cases, however, do hold RTNL. Bonding
>> generally doesn't have to acquire RTNL itself, since whatever called
>> into bonding is holding it already. For example, the slave add and
>> remove paths (bond_enslave, bond_release) are called either via sysfs or
>> ioctl, both of which acquire RTNL. All of the set and clear operations
>> for IFF_BONDING fall into this category; look at bonding_store_slaves
>> for an example.
>>
>> Bonding does acquire RTNL itself when performing failovers,
>> e.g., bond_mii_monitor holds RTNL prior to calling bond_miimon_commit,
>> which will change priv_flags.
>>
>
> All this was related to netpoll. And netpoll processing often needs to occur
> in hard IRQ context. Therefor netpoll stuff and RTNL (which is a mutex),
> really don't mix well. Keep RTNL for what it was meant for network
> reconfiguration. Don't turn it into a network special BKL.
>
Hmm, I think for my patch, holding the RTNL lock is not necessary,
because there are no other call paths to change the IFF_IN_NETPOLL bit,
which is unlike the bonding or bridge cases where sysfs/ioctl is provided
to change it.
The only chance to change IFF_IN_NETPOLL is in netpoll_send_skb()
which can't be called simultaneously because there are other locks
protecting it.
Or am I still missing something?
Thanks.
^ permalink raw reply
* Re: [Bonding-devel] [v3 Patch 2/3] bridge: make bridge support netpoll
From: Cong Wang @ 2010-04-14 8:11 UTC (permalink / raw)
To: Jay Vosburgh
Cc: Stephen Hemminger, Eric Dumazet, Neil Horman, netdev,
Andy Gospodarek, bridge, linux-kernel, bonding-devel, Jeff Moyer,
Matt Mackall, David Miller
In-Reply-To: <8304.1271177567@death.nxdomain.ibm.com>
Jay Vosburgh wrote:
> Cong Wang <amwang@redhat.com> wrote:
>
>> Stephen Hemminger wrote:
>>> On Mon, 12 Apr 2010 12:38:57 +0200
>>> Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>
>>>> Le lundi 12 avril 2010 à 18:37 +0800, Cong Wang a écrit :
>>>>> Stephen Hemminger wrote:
>>>>>> There is no protection on dev->priv_flags for SMP access.
>>>>>> It would better bit value in dev->state if you are using it as control flag.
>>>>>>
>>>>>> Then you could use
>>>>>> if (unlikely(test_and_clear_bit(__IN_NETPOLL, &skb->dev->state)))
>>>>>> netpoll_send_skb(...)
>>>>>>
>>>>>>
>>>>> Hmm, I think we can't use ->state here, it is not for this kind of purpose,
>>>>> according to its comments.
>>>>>
>>>>> Also, I find other usages of IFF_XXX flags of ->priv_flags are also using
>>>>> &, | to set or clear the flags. So there must be some other things preventing
>>>>> the race...
>>>> Yes, its RTNL that protects priv_flags changes, hopefully...
>>>>
>>> The patch was not protecting priv_flags with RTNL.
>>> For example..
>>>
>>>
>>> @@ -308,7 +312,9 @@ static void netpoll_send_skb(struct netp
>>> tries > 0; --tries) {
>>> if (__netif_tx_trylock(txq)) {
>>> if (!netif_tx_queue_stopped(txq)) {
>>> + dev->priv_flags |= IFF_IN_NETPOLL;
>>> status = ops->ndo_start_xmit(skb, dev);
>>> + dev->priv_flags &= ~IFF_IN_NETPOLL;
>>> if (status == NETDEV_TX_OK)
>>> txq_trans_update(txq);
>> Hmm, but I checked the bonding case (IFF_BONDING), it doesn't
>> hold rtnl_lock. Strange.
>
> I looked, and there are a couple of cases in bonding that don't
> have RTNL for adjusting priv_flags (in bond_ab_arp_probe when no slaves
> are up, and a couple of cases in 802.3ad). I think the solution there
> is to move bonding away from priv_flags for some of this (e.g., convert
> bonding to use a frame hook like bridge and macvlan, and greatly
> simplify skb_bond_should_drop), but that's a separate topic.
>
> The majority of the cases, however, do hold RTNL. Bonding
> generally doesn't have to acquire RTNL itself, since whatever called
> into bonding is holding it already. For example, the slave add and
> remove paths (bond_enslave, bond_release) are called either via sysfs or
> ioctl, both of which acquire RTNL. All of the set and clear operations
> for IFF_BONDING fall into this category; look at bonding_store_slaves
> for an example.
>
> Bonding does acquire RTNL itself when performing failovers,
> e.g., bond_mii_monitor holds RTNL prior to calling bond_miimon_commit,
> which will change priv_flags.
>
Thanks a lot for your reply!
You are right, I missed something.
Hmm, for bonding, the RTNL lock is necessary because there are sysfs
and ioctl interfaces to change its configuration.
^ permalink raw reply
* [PATCH v2] RPS: export internal software RX queues via sysfs
From: Changli Gao @ 2010-04-14 7:57 UTC (permalink / raw)
To: David S. Miller; +Cc: Tom Herbert, Eric Dumazet, netdev, Changli Gao
Export internal software RX queues via sysfs.
The RPS software RX queues are exported as
/sys/class/net/$nic/queues/rx-$/sw-rx-$, and you can specify which CPU handles
a particular queue by writing the CPU id to the corresponding file sw-rx-$.
The number of software RX queues can be specified by writing to
/sys/class/net/$nic/queues/rx-$/nr-sw-rx. nr-sw-rx is 0 by default.
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
net/core/net-sysfs.c | 234 ++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 230 insertions(+), 4 deletions(-)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 96ed690..4a547b7 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -512,6 +512,167 @@ static struct sysfs_ops rx_queue_sysfs_ops = {
.store = rx_queue_attr_store,
};
+static DEFINE_MUTEX(rps_map_lock);
+
+static ssize_t show_sw_rx(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attribute, char *buf)
+{
+ unsigned long id;
+ struct rps_map *map;
+ u16 cpu;
+
+ strict_strtoul(attribute->attr.name + strlen("sw-rx-"), 10, &id);
+ rcu_read_lock();
+ map = rcu_dereference(queue->rps_map);
+ if (map && id < map->len)
+ cpu = map->cpus[id];
+ else
+ cpu = 0;
+ rcu_read_unlock();
+ return sprintf(buf, "%hu\n", cpu);
+}
+
+static ssize_t store_sw_rx(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ unsigned long id, cpu;
+ struct rps_map *map;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (strict_strtoul(buf, 0, &cpu) || cpu >= nr_cpumask_bits)
+ return -EINVAL;
+ strict_strtoul(attribute->attr.name + strlen("sw-rx-"), 10, &id);
+
+ mutex_lock(&rps_map_lock);
+ map = queue->rps_map;
+ if (map && id < map->len)
+ map->cpus[id] = cpu;
+ mutex_unlock(&rps_map_lock);
+
+ return len;
+}
+
+struct sw_rx_attribute {
+ struct rx_queue_attribute qattr;
+ atomic_t ref;
+};
+
+static inline void sw_rx_attribute_free(struct sw_rx_attribute *attr)
+{
+ kfree(attr->qattr.attr.name);
+ kfree(attr);
+}
+
+static struct sw_rx_attribute **sw_rx_attr;
+static int sw_rx_attr_size;
+
+#define SW_RX_MAX 65535
+
+static void shrink_sw_rx_attr(void)
+{
+ struct sw_rx_attribute **attrs;
+
+ if (sw_rx_attr_size == 0) {
+ kfree(sw_rx_attr);
+ sw_rx_attr = NULL;
+ return;
+ }
+
+ attrs = kmalloc(sw_rx_attr_size * sizeof(void *), GFP_KERNEL);
+ if (attrs == NULL)
+ return;
+ memcpy(attrs, sw_rx_attr, sw_rx_attr_size * sizeof(void *));
+ swap(attrs, sw_rx_attr);
+ kfree(attrs);
+}
+
+/* must be called with rps_map_lock locked */
+static int update_sw_rx_files(struct kobject *kobj,
+ struct rps_map *old_map, struct rps_map *map)
+{
+ int i;
+ int old_map_len = old_map ? old_map->len : 0;
+ int map_len = map ? map->len : 0;
+
+ if (old_map_len >= map_len) {
+ bool shrink = false;
+
+ for (i = old_map_len - 1; i >= map_len; i--) {
+ sysfs_remove_file(kobj, &sw_rx_attr[i]->qattr.attr);
+ if (atomic_dec_and_test(&sw_rx_attr[i]->ref)) {
+ sw_rx_attribute_free(sw_rx_attr[i]);
+ sw_rx_attr_size--;
+ shrink = true;
+ }
+
+ }
+
+ if (shrink)
+ shrink_sw_rx_attr();
+
+ return 0;
+ }
+
+ if (map_len > sw_rx_attr_size) {
+ struct sw_rx_attribute **attrs;
+ char name[sizeof("sw-rx-" __stringify(SW_RX_MAX))];
+ char *pname;
+
+ attrs = krealloc(sw_rx_attr, map_len * sizeof(void *),
+ GFP_KERNEL);
+ if (attrs == NULL)
+ return -ENOMEM;
+ sw_rx_attr = attrs;
+ for (i = sw_rx_attr_size; i < map_len; i++) {
+ sw_rx_attr[i] = kmalloc(sizeof(**attrs), GFP_KERNEL);
+ if (sw_rx_attr[i] == NULL)
+ break;
+ sprintf(name, "sw-rx-%d", i);
+ pname = kstrdup(name, GFP_KERNEL);
+ if (pname == NULL) {
+ kfree(sw_rx_attr[i]);
+ break;
+ }
+ sw_rx_attr[i]->qattr.attr.name = pname;
+ sw_rx_attr[i]->qattr.attr.mode = S_IRUGO | S_IWUSR;
+ sw_rx_attr[i]->qattr.show = show_sw_rx;
+ sw_rx_attr[i]->qattr.store = store_sw_rx;
+ atomic_set(&sw_rx_attr[i]->ref, 0);
+ }
+ if (i != map_len) {
+ while (--i >= sw_rx_attr_size)
+ sw_rx_attribute_free(sw_rx_attr[i]);
+ shrink_sw_rx_attr();
+ return -ENOMEM;
+ }
+ }
+
+ for (i = old_map_len; i < map_len; i++) {
+ atomic_inc(&sw_rx_attr[i]->ref);
+ if (sysfs_create_file(kobj, &sw_rx_attr[i]->qattr.attr) == 0)
+ continue;
+ atomic_dec(&sw_rx_attr[i]->ref);
+ while (--i >= old_map_len) {
+ sysfs_remove_file(kobj, &sw_rx_attr[i]->qattr.attr);
+ atomic_dec(&sw_rx_attr[i]->ref);
+ }
+ if (sw_rx_attr_size < map_len) {
+ for (i = sw_rx_attr_size; i < map_len; i++)
+ sw_rx_attribute_free(sw_rx_attr[i]);
+ shrink_sw_rx_attr();
+ }
+ return -ENOMEM;
+ }
+
+ if (sw_rx_attr_size < map_len)
+ sw_rx_attr_size = map_len;
+
+ return 0;
+}
+
static ssize_t show_rps_map(struct netdev_rx_queue *queue,
struct rx_queue_attribute *attribute, char *buf)
{
@@ -556,7 +717,6 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
struct rps_map *old_map, *map;
cpumask_var_t mask;
int err, cpu, i;
- static DEFINE_SPINLOCK(rps_map_lock);
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -589,10 +749,15 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
map = NULL;
}
- spin_lock(&rps_map_lock);
+ mutex_lock(&rps_map_lock);
old_map = queue->rps_map;
- rcu_assign_pointer(queue->rps_map, map);
- spin_unlock(&rps_map_lock);
+ err = update_sw_rx_files(&queue->kobj, old_map, map);
+ if (!err)
+ rcu_assign_pointer(queue->rps_map, map);
+ mutex_unlock(&rps_map_lock);
+
+ if (err)
+ return err;
if (old_map)
call_rcu(&old_map->rcu, rps_map_release);
@@ -604,8 +769,69 @@ ssize_t store_rps_map(struct netdev_rx_queue *queue,
static struct rx_queue_attribute rps_cpus_attribute =
__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
+static ssize_t show_nr_sw_rx(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attribute, char *buf)
+{
+ struct rps_map *map;
+ unsigned int len;
+
+ rcu_read_lock();
+ map = rcu_dereference(queue->rps_map);
+ len = map ? map->len : 0;
+ rcu_read_unlock();
+ return sprintf(buf, "%u\n", len);
+}
+
+static ssize_t store_nr_sw_rx(struct netdev_rx_queue *queue,
+ struct rx_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct rps_map *old_map, *map;
+ unsigned long nr;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (strict_strtoul(buf, 0, &nr) || nr > SW_RX_MAX + 1)
+ return -EINVAL;
+ if (nr != 0) {
+ map = kzalloc(max_t(unsigned, RPS_MAP_SIZE(nr), L1_CACHE_BYTES),
+ GFP_KERNEL);
+ if (map == NULL)
+ return -ENOMEM;
+ map->len = nr;
+ } else {
+ map = NULL;
+ }
+
+ mutex_lock(&rps_map_lock);
+ old_map = queue->rps_map;
+ err = update_sw_rx_files(&queue->kobj, old_map, map);
+ if (!err) {
+ if (old_map && map)
+ memcpy(map->cpus, old_map->cpus,
+ sizeof(map->cpus[0]) *
+ min_t(unsigned int, nr, old_map->len));
+ rcu_assign_pointer(queue->rps_map, map);
+ }
+ mutex_unlock(&rps_map_lock);
+
+ if (err)
+ return err;
+
+ if (old_map)
+ call_rcu(&old_map->rcu, rps_map_release);
+
+ return len;
+}
+
+static struct rx_queue_attribute nr_sw_rx_attribute =
+ __ATTR(nr-sw-rx, S_IRUGO | S_IWUSR, show_nr_sw_rx, store_nr_sw_rx);
+
static struct attribute *rx_queue_default_attrs[] = {
&rps_cpus_attribute.attr,
+ &nr_sw_rx_attribute.attr,
NULL
};
^ permalink raw reply related
* Re: [PATCH] fix potential wild pointer when NIC is dying
From: Eric Dumazet @ 2010-04-14 7:49 UTC (permalink / raw)
To: Changli Gao; +Cc: David S. Miller, Tom Herbert, Herbert Xu, netdev
In-Reply-To: <u2r412e6f7f1004140025i51e533c9t7402bc751dd925c2@mail.gmail.com>
Le mercredi 14 avril 2010 à 15:25 +0800, Changli Gao a écrit :
>
> Thanks, I got it.
>
No problem, it's better to double check anyway :)
^ permalink raw reply
* [PATCH net-next-2.6] fasync: RCU locking
From: Eric Dumazet @ 2010-04-14 7:42 UTC (permalink / raw)
To: David Miller, Paul E. McKenney; +Cc: netdev, linux-kernel
Paul, could you please check this patch, I am not sure
of the IRQ safety thing...
Is call_rcu() the right method to use in this case ?
Thanks
[PATCH net-next-2.6] fasync: RCU locking
kill_fasync() uses a central rwlock, candidate for RCU conversion.
We can remove __kill_fasync() direct use in net, and rename it to
kill_fasync_rcu()
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
fs/fcntl.c | 36 +++++++++++++++++++++---------------
include/linux/fs.h | 11 +++++------
net/socket.c | 4 ++--
3 files changed, 28 insertions(+), 23 deletions(-)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 452d02f..33cb3ee 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -614,9 +614,15 @@ int send_sigurg(struct fown_struct *fown)
return ret;
}
-static DEFINE_RWLOCK(fasync_lock);
+static DEFINE_SPINLOCK(fasync_lock);
static struct kmem_cache *fasync_cache __read_mostly;
+static void fasync_free_rcu(struct rcu_head *head)
+{
+ kmem_cache_free(fasync_cache,
+ container_of(head, struct fasync_struct, fa_rcu));
+}
+
/*
* Remove a fasync entry. If successfully removed, return
* positive and clear the FASYNC flag. If no entry exists,
@@ -634,17 +640,17 @@ static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
int result = 0;
spin_lock(&filp->f_lock);
- write_lock_irq(&fasync_lock);
+ spin_lock_irq(&fasync_lock);
for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
if (fa->fa_file != filp)
continue;
*fp = fa->fa_next;
- kmem_cache_free(fasync_cache, fa);
+ call_rcu(&fa->fa_rcu, fasync_free_rcu);
filp->f_flags &= ~FASYNC;
result = 1;
break;
}
- write_unlock_irq(&fasync_lock);
+ spin_unlock_irq(&fasync_lock);
spin_unlock(&filp->f_lock);
return result;
}
@@ -666,7 +672,7 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
return -ENOMEM;
spin_lock(&filp->f_lock);
- write_lock_irq(&fasync_lock);
+ spin_lock_irq(&fasync_lock);
for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
if (fa->fa_file != filp)
continue;
@@ -679,12 +685,12 @@ static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fa
new->fa_file = filp;
new->fa_fd = fd;
new->fa_next = *fapp;
- *fapp = new;
+ rcu_assign_pointer(*fapp, new);
result = 1;
filp->f_flags |= FASYNC;
out:
- write_unlock_irq(&fasync_lock);
+ spin_unlock_irq(&fasync_lock);
spin_unlock(&filp->f_lock);
return result;
}
@@ -704,7 +710,10 @@ int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fap
EXPORT_SYMBOL(fasync_helper);
-void __kill_fasync(struct fasync_struct *fa, int sig, int band)
+/*
+ * rcu_read_lock() is held
+ */
+static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
while (fa) {
struct fown_struct * fown;
@@ -719,22 +728,19 @@ void __kill_fasync(struct fasync_struct *fa, int sig, int band)
mechanism. */
if (!(sig == SIGURG && fown->signum == 0))
send_sigio(fown, fa->fa_fd, band);
- fa = fa->fa_next;
+ fa = rcu_dereference(fa->fa_next);
}
}
-EXPORT_SYMBOL(__kill_fasync);
-
void kill_fasync(struct fasync_struct **fp, int sig, int band)
{
/* First a quick test without locking: usually
* the list is empty.
*/
if (*fp) {
- read_lock(&fasync_lock);
- /* reread *fp after obtaining the lock */
- __kill_fasync(*fp, sig, band);
- read_unlock(&fasync_lock);
+ rcu_read_lock();
+ kill_fasync_rcu(rcu_dereference(*fp), sig, band);
+ rcu_read_unlock();
}
}
EXPORT_SYMBOL(kill_fasync);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 39d57bc..158b2cc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1280,10 +1280,11 @@ static inline int lock_may_write(struct inode *inode, loff_t start,
struct fasync_struct {
- int magic;
- int fa_fd;
- struct fasync_struct *fa_next; /* singly linked list */
- struct file *fa_file;
+ int magic;
+ int fa_fd;
+ struct fasync_struct *fa_next; /* singly linked list */
+ struct file *fa_file;
+ struct rcu_head fa_rcu;
};
#define FASYNC_MAGIC 0x4601
@@ -1292,8 +1293,6 @@ struct fasync_struct {
extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);
-/* only for net: no internal synchronization */
-extern void __kill_fasync(struct fasync_struct *, int, int);
extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, unsigned long arg, int force);
diff --git a/net/socket.c b/net/socket.c
index 35bc198..846739c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1159,10 +1159,10 @@ int sock_wake_async(struct socket *sock, int how, int band)
/* fall through */
case SOCK_WAKE_IO:
call_kill:
- __kill_fasync(sock->fasync_list, SIGIO, band);
+ kill_fasync(sock->fasync_list, SIGIO, band);
break;
case SOCK_WAKE_URG:
- __kill_fasync(sock->fasync_list, SIGURG, band);
+ kill_fasync(sock->fasync_list, SIGURG, band);
}
return 0;
}
^ permalink raw reply related
* Re: [PATCH] fix potential wild pointer when NIC is dying
From: Changli Gao @ 2010-04-14 7:25 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, Tom Herbert, Herbert Xu, netdev
In-Reply-To: <1271223212.16881.598.camel@edumazet-laptop>
On Wed, Apr 14, 2010 at 1:33 PM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le mercredi 14 avril 2010 à 20:18 +0800, Changli Gao a écrit :
>
> I dont see how the problem can happens, and how RPS is involved.
>
> Did you got a single panic, could you provide us a stack trace ?
>
> Maybe are you referring to NAPI ?
>
> NAPI process packets delivered by NIC, and through RPS deliver it to a
> (possibly) remote CPU queue.
>
> But at device dismantle time, we should stop NAPI on this device and
> packet delivery machinery. RPS being on or not, NAPI wont deliver new
> packets. The fact that NAPI can be throtled doesnt change the napi
> instance being disabled at this point. No more packet will be delivered
> (RPS or not)
>
> Only after this point we call flush_backlog() to make sure we dont have
> any queued packet in each cpu input_pkt_queue pointing to the device we
> dismantle.
>
> RPS doesnt change this at all.
>
> Hmm ???
>
Thanks, I got it.
--
Regards,
Changli Gao(xiaosuo@gmail.com)
^ permalink raw reply
* [net-next 7/7] stmmac: updated the drv module version
From: Giuseppe CAVALLARO @ 2010-04-14 6:21 UTC (permalink / raw)
To: netdev; +Cc: Giuseppe Cavallaro
In-Reply-To: <1271226077-25882-6-git-send-email-peppe.cavallaro@st.com>
Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
---
drivers/net/stmmac/stmmac.h | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/net/stmmac/stmmac.h b/drivers/net/stmmac/stmmac.h
index 1a6eb7b..ebebc64 100644
--- a/drivers/net/stmmac/stmmac.h
+++ b/drivers/net/stmmac/stmmac.h
@@ -20,7 +20,7 @@
Author: Giuseppe Cavallaro <peppe.cavallaro@st.com>
*******************************************************************************/
-#define DRV_MODULE_VERSION "Jan_2010"
+#define DRV_MODULE_VERSION "Apr_2010"
#include <linux/stmmac.h>
#include "common.h"
--
1.6.0.4
^ permalink raw reply related
* [net-next 6/7] stmmac: fix vlan support setup
From: Giuseppe CAVALLARO @ 2010-04-14 6:21 UTC (permalink / raw)
To: netdev; +Cc: Giuseppe Cavallaro
In-Reply-To: <1271226077-25882-5-git-send-email-peppe.cavallaro@st.com>
Moved STMMAC_VLAN_TAG_USED from stmmac.h to common.h header
because it is used within the device and descriptor cores.
Signed-off-by: Giuseppe Cavallaro <peppe.cavallaro@st.com>
---
drivers/net/stmmac/common.h | 5 +++++
drivers/net/stmmac/stmmac.h | 5 -----
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/net/stmmac/common.h b/drivers/net/stmmac/common.h
index 27a05b4..144f76f 100644
--- a/drivers/net/stmmac/common.h
+++ b/drivers/net/stmmac/common.h
@@ -23,6 +23,11 @@
*******************************************************************************/
#include <linux/netdevice.h>
+#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
+#define STMMAC_VLAN_TAG_USED
+#include <linux/if_vlan.h>
+#endif
+
#include "descs.h"
#undef CHIP_DEBUG_PRINT
diff --git a/drivers/net/stmmac/stmmac.h b/drivers/net/stmmac/stmmac.h
index 0d776bc..1a6eb7b 100644
--- a/drivers/net/stmmac/stmmac.h
+++ b/drivers/net/stmmac/stmmac.h
@@ -23,11 +23,6 @@
#define DRV_MODULE_VERSION "Jan_2010"
#include <linux/stmmac.h>
-#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)
-#define STMMAC_VLAN_TAG_USED
-#include <linux/if_vlan.h>
-#endif
-
#include "common.h"
#ifdef CONFIG_STMMAC_TIMER
#include "stmmac_timer.h"
--
1.6.0.4
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox