* fired a bug report on bugzilla.redhat.com
From: Qianfeng Zhang @ 2010-06-11 3:08 UTC (permalink / raw)
To: Herbert Xu, Michael S. Tsirkin, Stephen Hemminger
Cc: David S. Miller, netdev, WANG Cong, Matt Mackall,
Paul E. McKenney
In-Reply-To: <E1OMtjm-0006PK-D6@gondolin.me.apana.org.au>
Hi
Thanks everybody. I justly logged a bug report, here it is
https://bugzilla.redhat.com/show_bug.cgi?id=602927
Qianfeng Zhang (张前锋)
Senior Solution Architect
Linux Platform & Infrastructure
RedHat Software, Beijing
Phone: 010-65339374
Mobile: 13911823595
Email: frzhang@redhat.com
^ permalink raw reply
* [PATCH 8/8] bridge: Fix netpoll support
From: Herbert Xu @ 2010-06-11 2:12 UTC (permalink / raw)
To: Michael S. Tsirkin, Qianfeng Zhang, David S. Miller, netdev,
WANG Cong, Stephen
In-Reply-To: <20100611021142.GA24490@gondor.apana.org.au>
bridge: Fix netpoll support
There are multiple problems with the newly added netpoll support:
1) Use-after-free on each netpoll packet.
2) Invoking unsafe code on netpoll/IRQ path.
3) Breaks when netpoll is enabled on the underlying device.
This patch fixes all of these problems. In particular, we now
allocate proper netpoll structures for each underlying device.
We only allow netpoll to be enabled on the bridge when all the
devices underneath it support netpoll. Once it is enabled, we
do not allow non-netpoll devices to join the bridge (until netpoll
is disabled again).
This allows us to do away with the npinfo juggling that caused
problem number 1.
Incidentally this patch fixes number 2 by bypassing unsafe code
such as multicast snooping and netfilter.
Reported-by: Qianfeng Zhang <frzhang@redhat.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
net/bridge/br_device.c | 108 +++++++++++++++++++++++++++---------------------
net/bridge/br_forward.c | 34 +++++----------
net/bridge/br_if.c | 12 +++--
net/bridge/br_private.h | 46 ++++++++++++++++----
4 files changed, 118 insertions(+), 82 deletions(-)
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index dce0611..cd4f07a 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -47,6 +47,10 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
skb_pull(skb, ETH_HLEN);
if (is_multicast_ether_addr(dest)) {
+ if (unlikely(netpoll_tx_running(dev))) {
+ br_flood_deliver(br, skb);
+ goto out;
+ }
if (br_multicast_rcv(br, NULL, skb))
goto out;
@@ -199,72 +203,81 @@ static int br_set_tx_csum(struct net_device *dev, u32 data)
}
#ifdef CONFIG_NET_POLL_CONTROLLER
-static bool br_devices_support_netpoll(struct net_bridge *br)
+static void br_poll_controller(struct net_device *br_dev)
{
- struct net_bridge_port *p;
- bool ret = true;
- int count = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&br->lock, flags);
- list_for_each_entry(p, &br->port_list, list) {
- count++;
- if ((p->dev->priv_flags & IFF_DISABLE_NETPOLL) ||
- !p->dev->netdev_ops->ndo_poll_controller)
- ret = false;
- }
- spin_unlock_irqrestore(&br->lock, flags);
- return count != 0 && ret;
}
-static void br_poll_controller(struct net_device *br_dev)
+static void br_netpoll_cleanup(struct net_device *dev)
{
- struct netpoll *np = br_dev->npinfo->netpoll;
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_port *p, *n;
- if (np->real_dev != br_dev)
- netpoll_poll_dev(np->real_dev);
+ list_for_each_entry_safe(p, n, &br->port_list, list) {
+ br_netpoll_disable(p);
+ }
}
-void br_netpoll_cleanup(struct net_device *dev)
+static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni)
{
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_port *p, *n;
- const struct net_device_ops *ops;
+ int err = 0;
list_for_each_entry_safe(p, n, &br->port_list, list) {
- if (p->dev) {
- ops = p->dev->netdev_ops;
- if (ops->ndo_netpoll_cleanup)
- ops->ndo_netpoll_cleanup(p->dev);
- else
- p->dev->npinfo = NULL;
- }
+ if (!p->dev)
+ continue;
+
+ err = br_netpoll_enable(p);
+ if (err)
+ goto fail;
}
+
+out:
+ return err;
+
+fail:
+ br_netpoll_cleanup(dev);
+ goto out;
}
-void br_netpoll_disable(struct net_bridge *br,
- struct net_device *dev)
+int br_netpoll_enable(struct net_bridge_port *p)
{
- if (br_devices_support_netpoll(br))
- br->dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
- if (dev->netdev_ops->ndo_netpoll_cleanup)
- dev->netdev_ops->ndo_netpoll_cleanup(dev);
- else
- dev->npinfo = NULL;
+ struct netpoll *np;
+ int err = 0;
+
+ np = kzalloc(sizeof(*p->np), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!np)
+ goto out;
+
+ np->dev = p->dev;
+
+ err = __netpoll_setup(np);
+ if (err) {
+ kfree(np);
+ goto out;
+ }
+
+ p->np = np;
+
+out:
+ return err;
}
-void br_netpoll_enable(struct net_bridge *br,
- struct net_device *dev)
+void br_netpoll_disable(struct net_bridge_port *p)
{
- if (br_devices_support_netpoll(br)) {
- br->dev->priv_flags &= ~IFF_DISABLE_NETPOLL;
- if (br->dev->npinfo)
- dev->npinfo = br->dev->npinfo;
- } else if (!(br->dev->priv_flags & IFF_DISABLE_NETPOLL)) {
- br->dev->priv_flags |= IFF_DISABLE_NETPOLL;
- br_info(br,"new device %s does not support netpoll (disabling)",
- dev->name);
- }
+ struct netpoll *np = p->np;
+
+ if (!np)
+ return;
+
+ p->np = NULL;
+
+ /* Wait for transmitting packets to finish before freeing. */
+ synchronize_rcu_bh();
+
+ __netpoll_cleanup(np);
+ kfree(np);
}
#endif
@@ -293,6 +306,7 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_change_mtu = br_change_mtu,
.ndo_do_ioctl = br_dev_ioctl,
#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_netpoll_setup = br_netpoll_setup,
.ndo_netpoll_cleanup = br_netpoll_cleanup,
.ndo_poll_controller = br_poll_controller,
#endif
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index a98ef13..6e97711 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -50,14 +50,7 @@ int br_dev_queue_push_xmit(struct sk_buff *skb)
kfree_skb(skb);
else {
skb_push(skb, ETH_HLEN);
-
-#ifdef CONFIG_NET_POLL_CONTROLLER
- if (unlikely(skb->dev->priv_flags & IFF_IN_NETPOLL)) {
- netpoll_send_skb(skb->dev->npinfo->netpoll, skb);
- skb->dev->priv_flags &= ~IFF_IN_NETPOLL;
- } else
-#endif
- dev_queue_xmit(skb);
+ dev_queue_xmit(skb);
}
}
@@ -73,23 +66,20 @@ int br_forward_finish(struct sk_buff *skb)
static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
{
-#ifdef CONFIG_NET_POLL_CONTROLLER
- struct net_bridge *br = to->br;
- if (unlikely(br->dev->priv_flags & IFF_IN_NETPOLL)) {
- struct netpoll *np;
- to->dev->npinfo = skb->dev->npinfo;
- np = skb->dev->npinfo->netpoll;
- np->real_dev = np->dev = to->dev;
- to->dev->priv_flags |= IFF_IN_NETPOLL;
- }
-#endif
skb->dev = to->dev;
+
+ if (unlikely(netpoll_tx_running(to->dev))) {
+ if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))
+ kfree_skb(skb);
+ else {
+ skb_push(skb, ETH_HLEN);
+ br_netpoll_send_skb(to, skb);
+ }
+ return;
+ }
+
NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev,
br_forward_finish);
-#ifdef CONFIG_NET_POLL_CONTROLLER
- if (skb->dev->npinfo)
- skb->dev->npinfo->netpoll->dev = br->dev;
-#endif
}
static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 18b245e..13102e0 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -154,7 +154,8 @@ static void del_nbp(struct net_bridge_port *p)
kobject_uevent(&p->kobj, KOBJ_REMOVE);
kobject_del(&p->kobj);
- br_netpoll_disable(br, dev);
+ br_netpoll_disable(p);
+
call_rcu(&p->rcu, destroy_nbp_rcu);
}
@@ -167,8 +168,6 @@ static void del_br(struct net_bridge *br, struct list_head *head)
del_nbp(p);
}
- br_netpoll_cleanup(br->dev);
-
del_timer_sync(&br->gc_timer);
br_sysfs_delbr(br->dev);
@@ -428,6 +427,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
if (err)
goto err2;
+ if (br_netpoll_info(br) && ((err = br_netpoll_enable(p))))
+ goto err3;
+
rcu_assign_pointer(dev->br_port, p);
dev_disable_lro(dev);
@@ -448,9 +450,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
kobject_uevent(&p->kobj, KOBJ_ADD);
- br_netpoll_enable(br, dev);
-
return 0;
+err3:
+ sysfs_remove_link(br->ifobj, p->dev->name);
err2:
br_fdb_delete_by_port(br, p, 1);
err1:
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 0f4a74b..29f6d66 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -15,6 +15,7 @@
#include <linux/netdevice.h>
#include <linux/if_bridge.h>
+#include <linux/netpoll.h>
#include <net/route.h>
#define BR_HASH_BITS 8
@@ -143,6 +144,10 @@ struct net_bridge_port
#ifdef CONFIG_SYSFS
char sysfs_name[IFNAMSIZ];
#endif
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ struct netpoll *np;
+#endif
};
struct br_cpu_netstats {
@@ -273,16 +278,41 @@ extern void br_dev_setup(struct net_device *dev);
extern netdev_tx_t br_dev_xmit(struct sk_buff *skb,
struct net_device *dev);
#ifdef CONFIG_NET_POLL_CONTROLLER
-extern void br_netpoll_cleanup(struct net_device *dev);
-extern void br_netpoll_enable(struct net_bridge *br,
- struct net_device *dev);
-extern void br_netpoll_disable(struct net_bridge *br,
- struct net_device *dev);
+static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br)
+{
+ return br->dev->npinfo;
+}
+
+static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+ struct netpoll *np = p->np;
+
+ if (np)
+ netpoll_send_skb(np, skb);
+}
+
+extern int br_netpoll_enable(struct net_bridge_port *p);
+extern void br_netpoll_disable(struct net_bridge_port *p);
#else
-#define br_netpoll_cleanup(br)
-#define br_netpoll_enable(br, dev)
-#define br_netpoll_disable(br, dev)
+static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br)
+{
+ return NULL;
+}
+
+static inline void br_netpoll_send_skb(struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+}
+static inline int br_netpoll_enable(struct net_bridge_port *p)
+{
+ return 0;
+}
+
+static inline void br_netpoll_disable(struct net_bridge_port *p)
+{
+}
#endif
/* br_fdb.c */
^ permalink raw reply related
* [PATCH 7/8] netpoll: Add netpoll_tx_running
From: Herbert Xu @ 2010-06-11 2:12 UTC (permalink / raw)
To: Michael S. Tsirkin, Qianfeng Zhang, David S. Miller, netdev,
WANG Cong, Stephen
In-Reply-To: <20100611021142.GA24490@gondor.apana.org.au>
netpoll: Add netpoll_tx_running
This patch adds the helper netpoll_tx_running for use within
ndo_start_xmit. It returns non-zero if ndo_start_xmit is being
invoked by netpoll, and zero otherwise.
This is currently implemented by simply looking at the hardirq
count. This is because for all non-netpoll uses of ndo_start_xmit,
IRQs must be enabled while netpoll always disables IRQs before
calling ndo_start_xmit.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
include/linux/netpoll.h | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index f3ad74a..4c77fe7 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -116,6 +116,11 @@ static inline void netpoll_poll_unlock(void *have)
}
}
+static inline int netpoll_tx_running(struct net_device *dev)
+{
+ return irqs_disabled();
+}
+
#else
static inline int netpoll_rx(struct sk_buff *skb)
{
@@ -139,6 +144,10 @@ static inline void netpoll_poll_unlock(void *have)
static inline void netpoll_netdev_init(struct net_device *dev)
{
}
+static inline int netpoll_tx_running(struct net_device *dev)
+{
+ return 0;
+}
#endif
#endif
^ permalink raw reply related
* [PATCH 6/8] netpoll: Allow netpoll_setup/cleanup recursion
From: Herbert Xu @ 2010-06-11 2:12 UTC (permalink / raw)
To: Michael S. Tsirkin, Qianfeng Zhang, David S. Miller, netdev,
WANG Cong, Stephen
In-Reply-To: <20100611021142.GA24490@gondor.apana.org.au>
netpoll: Allow netpoll_setup/cleanup recursion
This patch adds the functions __netpoll_setup/__netpoll_cleanup
which is designed to be called recursively through ndo_netpoll_seutp.
They must be called with RTNL held, and the caller must initialise
np->dev and ensure that it has a valid reference count.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
include/linux/netpoll.h | 2
net/core/netpoll.c | 176 ++++++++++++++++++++++++++----------------------
2 files changed, 99 insertions(+), 79 deletions(-)
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index 95c9f7e..f3ad74a 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -46,9 +46,11 @@ void netpoll_poll(struct netpoll *np);
void netpoll_send_udp(struct netpoll *np, const char *msg, int len);
void netpoll_print_options(struct netpoll *np);
int netpoll_parse_options(struct netpoll *np, char *opt);
+int __netpoll_setup(struct netpoll *np);
int netpoll_setup(struct netpoll *np);
int netpoll_trap(void);
void netpoll_set_trap(int trap);
+void __netpoll_cleanup(struct netpoll *np);
void netpoll_cleanup(struct netpoll *np);
int __netpoll_rx(struct sk_buff *skb);
void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index c445896..f728208 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -724,15 +724,78 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
return -1;
}
-int netpoll_setup(struct netpoll *np)
+int __netpoll_setup(struct netpoll *np)
{
- struct net_device *ndev = NULL;
- struct in_device *in_dev;
+ struct net_device *ndev = np->dev;
struct netpoll_info *npinfo;
const struct net_device_ops *ops;
unsigned long flags;
int err;
+ if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
+ !ndev->netdev_ops->ndo_poll_controller) {
+ printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
+ np->name, np->dev_name);
+ err = -ENOTSUPP;
+ goto out;
+ }
+
+ if (!ndev->npinfo) {
+ npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+ if (!npinfo) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ npinfo->rx_flags = 0;
+ INIT_LIST_HEAD(&npinfo->rx_np);
+
+ spin_lock_init(&npinfo->rx_lock);
+ skb_queue_head_init(&npinfo->arp_tx);
+ skb_queue_head_init(&npinfo->txq);
+ INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
+
+ atomic_set(&npinfo->refcnt, 1);
+
+ ops = np->dev->netdev_ops;
+ if (ops->ndo_netpoll_setup) {
+ err = ops->ndo_netpoll_setup(ndev, npinfo);
+ if (err)
+ goto free_npinfo;
+ }
+ } else {
+ npinfo = ndev->npinfo;
+ atomic_inc(&npinfo->refcnt);
+ }
+
+ npinfo->netpoll = np;
+
+ if (np->rx_hook) {
+ spin_lock_irqsave(&npinfo->rx_lock, flags);
+ npinfo->rx_flags |= NETPOLL_RX_ENABLED;
+ list_add_tail(&np->rx, &npinfo->rx_np);
+ spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+ }
+
+ /* last thing to do is link it to the net device structure */
+ rcu_assign_pointer(ndev->npinfo, npinfo);
+ rtnl_unlock();
+
+ return 0;
+
+free_npinfo:
+ kfree(npinfo);
+out:
+ return err;
+}
+EXPORT_SYMBOL_GPL(__netpoll_setup);
+
+int netpoll_setup(struct netpoll *np)
+{
+ struct net_device *ndev = NULL;
+ struct in_device *in_dev;
+ int err;
+
if (np->dev_name)
ndev = dev_get_by_name(&init_net, np->dev_name);
if (!ndev) {
@@ -805,61 +868,14 @@ int netpoll_setup(struct netpoll *np)
refill_skbs();
rtnl_lock();
- if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
- !ndev->netdev_ops->ndo_poll_controller) {
- printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
- np->name, np->dev_name);
- err = -ENOTSUPP;
- goto unlock;
- }
-
- if (!ndev->npinfo) {
- npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
- if (!npinfo) {
- err = -ENOMEM;
- goto unlock;
- }
-
- npinfo->rx_flags = 0;
- INIT_LIST_HEAD(&npinfo->rx_np);
-
- spin_lock_init(&npinfo->rx_lock);
- skb_queue_head_init(&npinfo->arp_tx);
- skb_queue_head_init(&npinfo->txq);
- INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
-
- atomic_set(&npinfo->refcnt, 1);
-
- ops = np->dev->netdev_ops;
- if (ops->ndo_netpoll_setup) {
- err = ops->ndo_netpoll_setup(ndev, npinfo);
- if (err)
- goto free_npinfo;
- }
- } else {
- npinfo = ndev->npinfo;
- atomic_inc(&npinfo->refcnt);
- }
-
- npinfo->netpoll = np;
-
- if (np->rx_hook) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- npinfo->rx_flags |= NETPOLL_RX_ENABLED;
- list_add_tail(&np->rx, &npinfo->rx_np);
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- }
-
- /* last thing to do is link it to the net device structure */
- rcu_assign_pointer(ndev->npinfo, npinfo);
+ err = __netpoll_setup(np);
rtnl_unlock();
+ if (err)
+ goto put;
+
return 0;
-free_npinfo:
- kfree(npinfo);
-unlock:
- rtnl_unlock();
put:
dev_put(ndev);
return err;
@@ -872,40 +888,32 @@ static int __init netpoll_init(void)
}
core_initcall(netpoll_init);
-void netpoll_cleanup(struct netpoll *np)
+void __netpoll_cleanup(struct netpoll *np)
{
struct netpoll_info *npinfo;
unsigned long flags;
- int free = 0;
- if (!np->dev)
+ npinfo = np->dev->npinfo;
+ if (!npinfo)
return;
- rtnl_lock();
- npinfo = np->dev->npinfo;
- if (npinfo) {
- if (!list_empty(&npinfo->rx_np)) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_del(&np->rx);
- if (list_empty(&npinfo->rx_np))
- npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- }
+ if (!list_empty(&npinfo->rx_np)) {
+ spin_lock_irqsave(&npinfo->rx_lock, flags);
+ list_del(&np->rx);
+ if (list_empty(&npinfo->rx_np))
+ npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
+ spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+ }
- free = atomic_dec_and_test(&npinfo->refcnt);
- if (free) {
- const struct net_device_ops *ops;
+ if (atomic_dec_and_test(&npinfo->refcnt)) {
+ const struct net_device_ops *ops;
- ops = np->dev->netdev_ops;
- if (ops->ndo_netpoll_cleanup)
- ops->ndo_netpoll_cleanup(np->dev);
+ ops = np->dev->netdev_ops;
+ if (ops->ndo_netpoll_cleanup)
+ ops->ndo_netpoll_cleanup(np->dev);
- rcu_assign_pointer(np->dev->npinfo, NULL);
- }
- }
- rtnl_unlock();
+ rcu_assign_pointer(np->dev->npinfo, NULL);
- if (free) {
/* avoid racing with NAPI reading npinfo */
synchronize_rcu_bh();
@@ -917,9 +925,19 @@ void netpoll_cleanup(struct netpoll *np)
__skb_queue_purge(&npinfo->txq);
kfree(npinfo);
}
+}
+EXPORT_SYMBOL_GPL(__netpoll_cleanup);
- dev_put(np->dev);
+void netpoll_cleanup(struct netpoll *np)
+{
+ if (!np->dev)
+ return;
+ rtnl_lock();
+ __netpoll_cleanup(np);
+ rtnl_unlock();
+
+ dev_put(np->dev);
np->dev = NULL;
}
^ permalink raw reply related
* [PATCH 5/8] netpoll: Add ndo_netpoll_setup
From: Herbert Xu @ 2010-06-11 2:12 UTC (permalink / raw)
To: Michael S. Tsirkin, Qianfeng Zhang, David S. Miller, netdev,
WANG Cong, Stephen
In-Reply-To: <20100611021142.GA24490@gondor.apana.org.au>
netpoll: Add ndo_netpoll_setup
This patch adds ndo_netpoll_setup as the initialisation primitive
to complement ndo_netpoll_cleanup.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
include/linux/netdevice.h | 2 ++
net/core/netpoll.c | 10 ++++++++++
2 files changed, 12 insertions(+)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 40291f3..619d3f1 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -728,6 +728,8 @@ struct net_device_ops {
unsigned short vid);
#ifdef CONFIG_NET_POLL_CONTROLLER
void (*ndo_poll_controller)(struct net_device *dev);
+ int (*ndo_netpoll_setup)(struct net_device *dev,
+ struct netpoll_info *info);
void (*ndo_netpoll_cleanup)(struct net_device *dev);
#endif
int (*ndo_set_vf_mac)(struct net_device *dev,
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 9dcd767..c445896 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -729,6 +729,7 @@ int netpoll_setup(struct netpoll *np)
struct net_device *ndev = NULL;
struct in_device *in_dev;
struct netpoll_info *npinfo;
+ const struct net_device_ops *ops;
unsigned long flags;
int err;
@@ -828,6 +829,13 @@ int netpoll_setup(struct netpoll *np)
INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
atomic_set(&npinfo->refcnt, 1);
+
+ ops = np->dev->netdev_ops;
+ if (ops->ndo_netpoll_setup) {
+ err = ops->ndo_netpoll_setup(ndev, npinfo);
+ if (err)
+ goto free_npinfo;
+ }
} else {
npinfo = ndev->npinfo;
atomic_inc(&npinfo->refcnt);
@@ -848,6 +856,8 @@ int netpoll_setup(struct netpoll *np)
return 0;
+free_npinfo:
+ kfree(npinfo);
unlock:
rtnl_unlock();
put:
^ permalink raw reply related
* [PATCH 4/8] netpoll: Add locking for netpoll_setup/cleanup
From: Herbert Xu @ 2010-06-11 2:12 UTC (permalink / raw)
To: Michael S. Tsirkin, Qianfeng Zhang, David S. Miller, netdev,
WANG Cong, Stephen
In-Reply-To: <20100611021142.GA24490@gondor.apana.org.au>
netpoll: Add locking for netpoll_setup/cleanup
As it stands, netpoll_setup and netpoll_cleanup have no locking
protection whatsoever. So chaos ensures if two entities try to
perform them on the same device.
This patch adds RTNL to the equation. The code has be rearranged
so that bits that do not need RTNL protection are now moved to
the top of netpoll_setup.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
net/core/netpoll.c | 151 ++++++++++++++++++++++++++---------------------------
1 file changed, 76 insertions(+), 75 deletions(-)
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 4b7623d..9dcd767 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -729,7 +729,6 @@ int netpoll_setup(struct netpoll *np)
struct net_device *ndev = NULL;
struct in_device *in_dev;
struct netpoll_info *npinfo;
- struct netpoll *npe, *tmp;
unsigned long flags;
int err;
@@ -741,38 +740,6 @@ int netpoll_setup(struct netpoll *np)
return -ENODEV;
}
- np->dev = ndev;
- if (!ndev->npinfo) {
- npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
- if (!npinfo) {
- err = -ENOMEM;
- goto put;
- }
-
- npinfo->rx_flags = 0;
- INIT_LIST_HEAD(&npinfo->rx_np);
-
- spin_lock_init(&npinfo->rx_lock);
- skb_queue_head_init(&npinfo->arp_tx);
- skb_queue_head_init(&npinfo->txq);
- INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
-
- atomic_set(&npinfo->refcnt, 1);
- } else {
- npinfo = ndev->npinfo;
- atomic_inc(&npinfo->refcnt);
- }
-
- npinfo->netpoll = np;
-
- if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
- !ndev->netdev_ops->ndo_poll_controller) {
- printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
- np->name, np->dev_name);
- err = -ENOTSUPP;
- goto release;
- }
-
if (!netif_running(ndev)) {
unsigned long atmost, atleast;
@@ -786,7 +753,7 @@ int netpoll_setup(struct netpoll *np)
if (err) {
printk(KERN_ERR "%s: failed to open %s\n",
np->name, ndev->name);
- goto release;
+ goto put;
}
atleast = jiffies + HZ/10;
@@ -823,7 +790,7 @@ int netpoll_setup(struct netpoll *np)
printk(KERN_ERR "%s: no IP address for %s, aborting\n",
np->name, np->dev_name);
err = -EDESTADDRREQ;
- goto release;
+ goto put;
}
np->local_ip = in_dev->ifa_list->ifa_local;
@@ -831,6 +798,43 @@ int netpoll_setup(struct netpoll *np)
printk(KERN_INFO "%s: local IP %pI4\n", np->name, &np->local_ip);
}
+ np->dev = ndev;
+
+ /* fill up the skb queue */
+ refill_skbs();
+
+ rtnl_lock();
+ if ((ndev->priv_flags & IFF_DISABLE_NETPOLL) ||
+ !ndev->netdev_ops->ndo_poll_controller) {
+ printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n",
+ np->name, np->dev_name);
+ err = -ENOTSUPP;
+ goto unlock;
+ }
+
+ if (!ndev->npinfo) {
+ npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+ if (!npinfo) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ npinfo->rx_flags = 0;
+ INIT_LIST_HEAD(&npinfo->rx_np);
+
+ spin_lock_init(&npinfo->rx_lock);
+ skb_queue_head_init(&npinfo->arp_tx);
+ skb_queue_head_init(&npinfo->txq);
+ INIT_DELAYED_WORK(&npinfo->tx_work, queue_process);
+
+ atomic_set(&npinfo->refcnt, 1);
+ } else {
+ npinfo = ndev->npinfo;
+ atomic_inc(&npinfo->refcnt);
+ }
+
+ npinfo->netpoll = np;
+
if (np->rx_hook) {
spin_lock_irqsave(&npinfo->rx_lock, flags);
npinfo->rx_flags |= NETPOLL_RX_ENABLED;
@@ -838,24 +842,14 @@ int netpoll_setup(struct netpoll *np)
spin_unlock_irqrestore(&npinfo->rx_lock, flags);
}
- /* fill up the skb queue */
- refill_skbs();
-
/* last thing to do is link it to the net device structure */
rcu_assign_pointer(ndev->npinfo, npinfo);
+ rtnl_unlock();
return 0;
- release:
- if (!ndev->npinfo) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_for_each_entry_safe(npe, tmp, &npinfo->rx_np, rx) {
- npe->dev = NULL;
- }
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
-
- kfree(npinfo);
- }
+unlock:
+ rtnl_unlock();
put:
dev_put(ndev);
return err;
@@ -872,43 +866,50 @@ void netpoll_cleanup(struct netpoll *np)
{
struct netpoll_info *npinfo;
unsigned long flags;
+ int free = 0;
- if (np->dev) {
- npinfo = np->dev->npinfo;
- if (npinfo) {
- if (!list_empty(&npinfo->rx_np)) {
- spin_lock_irqsave(&npinfo->rx_lock, flags);
- list_del(&np->rx);
- if (list_empty(&npinfo->rx_np))
- npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
- spin_unlock_irqrestore(&npinfo->rx_lock, flags);
- }
+ if (!np->dev)
+ return;
- if (atomic_dec_and_test(&npinfo->refcnt)) {
- const struct net_device_ops *ops;
+ rtnl_lock();
+ npinfo = np->dev->npinfo;
+ if (npinfo) {
+ if (!list_empty(&npinfo->rx_np)) {
+ spin_lock_irqsave(&npinfo->rx_lock, flags);
+ list_del(&np->rx);
+ if (list_empty(&npinfo->rx_np))
+ npinfo->rx_flags &= ~NETPOLL_RX_ENABLED;
+ spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+ }
- ops = np->dev->netdev_ops;
- if (ops->ndo_netpoll_cleanup)
- ops->ndo_netpoll_cleanup(np->dev);
+ free = atomic_dec_and_test(&npinfo->refcnt);
+ if (free) {
+ const struct net_device_ops *ops;
- rcu_assign_pointer(np->dev->npinfo, NULL);
+ ops = np->dev->netdev_ops;
+ if (ops->ndo_netpoll_cleanup)
+ ops->ndo_netpoll_cleanup(np->dev);
- /* avoid racing with NAPI reading npinfo */
- synchronize_rcu_bh();
+ rcu_assign_pointer(np->dev->npinfo, NULL);
+ }
+ }
+ rtnl_unlock();
- skb_queue_purge(&npinfo->arp_tx);
- skb_queue_purge(&npinfo->txq);
- cancel_rearming_delayed_work(&npinfo->tx_work);
+ if (free) {
+ /* avoid racing with NAPI reading npinfo */
+ synchronize_rcu_bh();
- /* clean after last, unfinished work */
- __skb_queue_purge(&npinfo->txq);
- kfree(npinfo);
- }
- }
+ skb_queue_purge(&npinfo->arp_tx);
+ skb_queue_purge(&npinfo->txq);
+ cancel_rearming_delayed_work(&npinfo->tx_work);
- dev_put(np->dev);
+ /* clean after last, unfinished work */
+ __skb_queue_purge(&npinfo->txq);
+ kfree(npinfo);
}
+ dev_put(np->dev);
+
np->dev = NULL;
}
^ permalink raw reply related
* [PATCH 3/8] netpoll: Fix RCU usage
From: Herbert Xu @ 2010-06-11 2:12 UTC (permalink / raw)
To: Michael S. Tsirkin, Qianfeng Zhang, David S. Miller, netdev,
WANG Cong, Stephen
In-Reply-To: <20100611021142.GA24490@gondor.apana.org.au>
netpoll: Fix RCU usage
The use of RCU in netpoll is incorrect in a number of places:
1) The initial setting is lacking a write barrier.
2) The synchronize_rcu is in the wrong place.
3) Read barriers are missing.
4) Some places are even missing rcu_read_lock.
5) npinfo is zeroed after freeing.
This patch fixes those issues. As most users are in BH context,
this also converts the RCU usage to the BH variant.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
include/linux/netpoll.h | 13 ++++++++-----
net/core/netpoll.c | 20 ++++++++++++--------
2 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h
index e9e2312..95c9f7e 100644
--- a/include/linux/netpoll.h
+++ b/include/linux/netpoll.h
@@ -57,12 +57,15 @@ void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb);
#ifdef CONFIG_NETPOLL
static inline bool netpoll_rx(struct sk_buff *skb)
{
- struct netpoll_info *npinfo = skb->dev->npinfo;
+ struct netpoll_info *npinfo;
unsigned long flags;
bool ret = false;
+ rcu_read_lock_bh();
+ npinfo = rcu_dereference(skb->dev->npinfo);
+
if (!npinfo || (list_empty(&npinfo->rx_np) && !npinfo->rx_flags))
- return false;
+ goto out;
spin_lock_irqsave(&npinfo->rx_lock, flags);
/* check rx_flags again with the lock held */
@@ -70,12 +73,14 @@ static inline bool netpoll_rx(struct sk_buff *skb)
ret = true;
spin_unlock_irqrestore(&npinfo->rx_lock, flags);
+out:
+ rcu_read_unlock_bh();
return ret;
}
static inline int netpoll_rx_on(struct sk_buff *skb)
{
- struct netpoll_info *npinfo = skb->dev->npinfo;
+ struct netpoll_info *npinfo = rcu_dereference(skb->dev->npinfo);
return npinfo && (!list_empty(&npinfo->rx_np) || npinfo->rx_flags);
}
@@ -91,7 +96,6 @@ static inline void *netpoll_poll_lock(struct napi_struct *napi)
{
struct net_device *dev = napi->dev;
- rcu_read_lock(); /* deal with race on ->npinfo */
if (dev && dev->npinfo) {
spin_lock(&napi->poll_lock);
napi->poll_owner = smp_processor_id();
@@ -108,7 +112,6 @@ static inline void netpoll_poll_unlock(void *have)
napi->poll_owner = -1;
spin_unlock(&napi->poll_lock);
}
- rcu_read_unlock();
}
#else
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 748c930..4b7623d 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -292,6 +292,7 @@ void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
unsigned long tries;
struct net_device *dev = np->dev;
const struct net_device_ops *ops = dev->netdev_ops;
+ /* It is up to the caller to keep npinfo alive. */
struct netpoll_info *npinfo = np->dev->npinfo;
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
@@ -841,10 +842,7 @@ int netpoll_setup(struct netpoll *np)
refill_skbs();
/* last thing to do is link it to the net device structure */
- ndev->npinfo = npinfo;
-
- /* avoid racing with NAPI reading npinfo */
- synchronize_rcu();
+ rcu_assign_pointer(ndev->npinfo, npinfo);
return 0;
@@ -888,6 +886,16 @@ void netpoll_cleanup(struct netpoll *np)
if (atomic_dec_and_test(&npinfo->refcnt)) {
const struct net_device_ops *ops;
+
+ ops = np->dev->netdev_ops;
+ if (ops->ndo_netpoll_cleanup)
+ ops->ndo_netpoll_cleanup(np->dev);
+
+ rcu_assign_pointer(np->dev->npinfo, NULL);
+
+ /* avoid racing with NAPI reading npinfo */
+ synchronize_rcu_bh();
+
skb_queue_purge(&npinfo->arp_tx);
skb_queue_purge(&npinfo->txq);
cancel_rearming_delayed_work(&npinfo->tx_work);
@@ -895,10 +903,6 @@ void netpoll_cleanup(struct netpoll *np)
/* clean after last, unfinished work */
__skb_queue_purge(&npinfo->txq);
kfree(npinfo);
- ops = np->dev->netdev_ops;
- if (ops->ndo_netpoll_cleanup)
- ops->ndo_netpoll_cleanup(np->dev);
- np->dev->npinfo = NULL;
}
}
^ permalink raw reply related
* [PATCH 1/8] netpoll: Set npinfo to NULL even with ndo_netpoll_cleanup
From: Herbert Xu @ 2010-06-11 2:12 UTC (permalink / raw)
To: Michael S. Tsirkin, Qianfeng Zhang, David S. Miller, netdev,
WANG Cong, Stephen
In-Reply-To: <20100611021142.GA24490@gondor.apana.org.au>
netpoll: Set npinfo to NULL even with ndo_netpoll_cleanup
Sinec we have to null npinfo regardless of whether there is a
ndo_netpoll_cleanup, it makes sense to do this unconditionally
in netpoll_cleanup rather than having every driver do it by
themselves.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
net/core/netpoll.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 94825b1..748c930 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -898,8 +898,7 @@ void netpoll_cleanup(struct netpoll *np)
ops = np->dev->netdev_ops;
if (ops->ndo_netpoll_cleanup)
ops->ndo_netpoll_cleanup(np->dev);
- else
- np->dev->npinfo = NULL;
+ np->dev->npinfo = NULL;
}
}
^ permalink raw reply related
* [PATCH 2/8] bridge: Remove redundant npinfo NULL setting
From: Herbert Xu @ 2010-06-11 2:12 UTC (permalink / raw)
To: Michael S. Tsirkin, Qianfeng Zhang, David S. Miller, netdev,
WANG Cong, Stephen
In-Reply-To: <20100611021142.GA24490@gondor.apana.org.au>
bridge: Remove redundant npinfo NULL setting
Now that netpoll always zaps npinfo we no longer needs to do it
in bridge.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
net/bridge/br_device.c | 1 -
1 file changed, 1 deletion(-)
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index eedf2c9..dce0611 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -231,7 +231,6 @@ void br_netpoll_cleanup(struct net_device *dev)
struct net_bridge_port *p, *n;
const struct net_device_ops *ops;
- br->dev->npinfo = NULL;
list_for_each_entry_safe(p, n, &br->port_list, list) {
if (p->dev) {
ops = p->dev->netdev_ops;
^ permalink raw reply related
* Re: [0/8] netpoll/bridge fixes
From: Herbert Xu @ 2010-06-11 2:11 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Michael S. Tsirkin, Qianfeng Zhang, David S. Miller, netdev,
WANG Cong, Matt Mackall
In-Reply-To: <20100610224839.GA22469@gondor.apana.org.au>
On Fri, Jun 11, 2010 at 08:48:39AM +1000, Herbert Xu wrote:
> On Thu, Jun 10, 2010 at 02:59:15PM -0700, Stephen Hemminger wrote:
> >
> > Okay, then add a comment where in_irq is used?
>
> Actually let me put it into a wrapper. I'll respin the patches.
OK here is a repost. And this time it really is 8 patches :)
I've tested it lightly.
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply
* [PATCH] virtio_net: implements ethtool_ops.get_drvinfo
From: Taku Izumi @ 2010-06-11 1:29 UTC (permalink / raw)
To: David S. Miller, netdev@vger.kernel.org, rusty
This patch implements ethtool_ops.get_drvinfo interface of virtio_net driver.
Signed-off-by: Taku Izumi <izumi.taku@jp.fujitsu.com>
---
drivers/net/virtio_net.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
Index: net-next.35/drivers/net/virtio_net.c
===================================================================
--- net-next.35.orig/drivers/net/virtio_net.c
+++ net-next.35/drivers/net/virtio_net.c
@@ -701,6 +701,18 @@ static int virtnet_close(struct net_devi
return 0;
}
+static void virtnet_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *drvinfo)
+{
+ struct virtnet_info *vi = netdev_priv(dev);
+ struct virtio_device *vdev = vi->vdev;
+
+ strncpy(drvinfo->driver, KBUILD_MODNAME, 32);
+ strncpy(drvinfo->version, "N/A", 32);
+ strncpy(drvinfo->fw_version, "N/A", 32);
+ strncpy(drvinfo->bus_info, dev_name(&vdev->dev), 32);
+}
+
static int virtnet_set_tx_csum(struct net_device *dev, u32 data)
{
struct virtnet_info *vi = netdev_priv(dev);
@@ -813,6 +825,7 @@ static void virtnet_vlan_rx_kill_vid(str
}
static const struct ethtool_ops virtnet_ethtool_ops = {
+ .get_drvinfo = virtnet_get_drvinfo,
.set_tx_csum = virtnet_set_tx_csum,
.set_sg = ethtool_op_set_sg,
.set_tso = ethtool_op_set_tso,
^ permalink raw reply
* puzzled of the congestion control window of Scalable TCP
From: jellyaaa @ 2010-06-11 1:15 UTC (permalink / raw)
To: netdev
I am now doing something related to the STCP (Scalable TCP). By pringking the
dynamic snd_cwnd in linux (kernel 2.6.18) ,the result puzzled me . Theoryly,
if any congestion occurs, the snd_cwnd = snd_cwnd * 0.875,but I haven't got
the result I expected. The snd_cwnd changes like this :
snd_cwnd = 1 , snd_ssthresh = 2147483647
snd_cwnd = 2 , snd_ssthresh = 2147483647
snd_cwnd = 3, snd_ssthresh = 2147483647
......
snd_cwnd = 19022, snd_ssthresh = 2147483647 // keeping this value for
several ms
snd_cwnd = 1769, snd_ssthresh = 16645 // congestion occurs
snd_cwnd = 1770, snd_ssthresh = 16645
......
>From the above data. when congestion occurs , snd_ssthresh = 16645 ~=~
19022*0.875=16626 corresponds to the theory calculated result. no problems.
but what is the mean of snd_cwnd = 1770? how this value is calculated .who
(where )calculated it ?
--
View this message in context: http://old.nabble.com/puzzled-of-the-congestion-control-window-of-Scalable-TCP-tp28850421p28850421.html
Sent from the netdev mailing list archive at Nabble.com.
^ permalink raw reply
* Re: [0/8] netpoll/bridge fixes
From: Herbert Xu @ 2010-06-10 22:48 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Michael S. Tsirkin, Qianfeng Zhang, David S. Miller, netdev,
WANG Cong, Matt Mackall
In-Reply-To: <20100610145915.721a86b7@nehalam>
On Thu, Jun 10, 2010 at 02:59:15PM -0700, Stephen Hemminger wrote:
>
> Okay, then add a comment where in_irq is used?
Actually let me put it into a wrapper. I'll respin the patches.
Thanks,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply
* Re: [0/8] netpoll/bridge fixes
From: Stephen Hemminger @ 2010-06-10 21:59 UTC (permalink / raw)
To: Herbert Xu
Cc: Michael S. Tsirkin, Qianfeng Zhang, David S. Miller, netdev,
WANG Cong, Matt Mackall
In-Reply-To: <20100610215643.GA22048@gondor.apana.org.au>
On Fri, 11 Jun 2010 07:56:43 +1000
Herbert Xu <herbert@gondor.apana.org.au> wrote:
> On Thu, Jun 10, 2010 at 07:49:12AM -0700, Stephen Hemminger wrote:
> >
> > Thanks for looking at this, and yes the original code does
> > look buggy. Not sure if I like the use of in_irq() to handle the netpoll
> > packets as special case. Is in_irq() really reliable for this?
>
> Yes because netpoll always calls ndo_start_xmit with IRQs disabled,
> while on the normal TX path IRQs must be enabled (as otherwise
> enabling BH would warn).
Okay, then add a comment where in_irq is used?
^ permalink raw reply
* Re: [0/8] netpoll/bridge fixes
From: Herbert Xu @ 2010-06-10 21:56 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Michael S. Tsirkin, Qianfeng Zhang, David S. Miller, netdev,
WANG Cong, Matt Mackall
In-Reply-To: <20100610074912.1303e3da@nehalam>
On Thu, Jun 10, 2010 at 07:49:12AM -0700, Stephen Hemminger wrote:
>
> Thanks for looking at this, and yes the original code does
> look buggy. Not sure if I like the use of in_irq() to handle the netpoll
> packets as special case. Is in_irq() really reliable for this?
Yes because netpoll always calls ndo_start_xmit with IRQs disabled,
while on the normal TX path IRQs must be enabled (as otherwise
enabling BH would warn).
Cheers,
--
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply
* Re: QoS weirdness : HTB accuracy
From: Andrew Beverley @ 2010-06-10 21:22 UTC (permalink / raw)
To: Julien Vehent; +Cc: Philip A. Prindeville, Netdev, netfilter
In-Reply-To: <e6bd0cff8dafc8b3af0760facb5fdc8c@localhost>
> > Sorry for the late response: could this be an "aliasing" issue caused
> > by sampling intervals (granularity)?
> >
>
> I was, in fact, an error in my ruleset. I had put the 'linklayer atm' at
> both the branch and leaf levels, so the overhead was computed twice,
> creating those holes in the bandwidth.
I am seeing similar behaviour with my setup. Am I making the same
mistake? A subset of my rules is as follows:
tc qdisc add dev ppp0 root handle 1: htb r2q 1
tc class add dev ppp0 parent 1: classid 1:1 htb \
rate ${DOWNLINK}kbit ceil ${DOWNLINK}kbit \
overhead $overhead linklayer atm <------- Here
tc class add dev ppp0 parent 1:1 classid 1:10 htb \
rate 612kbit ceil 612kbit prio 0 \
overhead $overhead linklayer atm <------- And here
tc qdisc add dev ppp0 parent 1:10 handle 4210: \
sfq perturb 10 limit 50
tc filter add dev ppp0 parent 1:0 protocol ip \
prio 10 handle 10 fw flowid 1:10
Thanks,
Andy
^ permalink raw reply
* Re: [PATCH net-2.6] ixgbe: fix automatic LRO/RSC settings for low latency
From: Jeff Kirsher @ 2010-06-10 20:17 UTC (permalink / raw)
To: Andy Gospodarek; +Cc: netdev, Jesse Brandeburg, Stanislaw Gruszka, stable
In-Reply-To: <20100610181204.GO7497@gospo.rdu.redhat.com>
On Thu, Jun 10, 2010 at 11:12, Andy Gospodarek <andy@greyhouse.net> wrote:
>
> This patch added to 2.6.34:
>
> commit f8d1dcaf88bddc7f282722ec1fdddbcb06a72f18
> Author: Jesse Brandeburg <jesse.brandeburg@intel.com>
> Date: Tue Apr 27 01:37:20 2010 +0000
>
> ixgbe: enable extremely low latency
>
> introduced a feature where LRO (called RSC on the hardware) was disabled
> automatically when setting rx-usecs to 0 via ethtool. Some might not
> like the fact that LRO was disabled automatically, but I'm fine with
> that. What I don't like is that LRO/RSC is automatically enabled when
> rx-usecs is set >0 via ethtool.
>
> This would certainly be a problem if the device was used for forwarding
> and it was determined that the low latency wasn't needed after the
> device was already forwarding. I played around with saving the state of
> LRO in the driver, but it just didn't seem worthwhile and would require
> a small change to dev_disable_lro() that I did not like.
>
> This patch simply leaves LRO disabled when setting rx-usecs >0 and
> requires that the user enable it again. An extra informational message
> will also now appear in the log so users can understand why LRO isn't
> being enabled as they expect.
>
> Inconsistency of LRO setting first noticed by Stanislaw Gruszka.
>
> Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
> CC: Stanislaw Gruszka <sgruszka@redhat.com>
> CC: stable@kernel.org
>
> ---
> drivers/net/ixgbe/ixgbe_ethtool.c | 37 ++++++++-----------------------------
> 1 files changed, 8 insertions(+), 29 deletions(-)
>
Thanks Andy, I have added the patch to my queue.
^ permalink raw reply
* Re: [PATCH] ethoc: Write bus addresses to registers
From: Jonas Bonn @ 2010-06-10 19:56 UTC (permalink / raw)
To: netdev; +Cc: davem
In-Reply-To: <1276181958-31172-1-git-send-email-jonas@southpole.se>
[-- Attachment #1: Type: text/plain, Size: 947 bytes --]
Hmm... found a little problem here... this bit is dependent on another
patch that should be in there before it... I'd appreciate comment on the
approach of the whole thing and then I'll submit the missing bits in the
correct order. Thanks and sorry for the confusion...
/Jonas
See my comment below...
> @@ -978,6 +989,12 @@ static int ethoc_probe(struct platform_device *pdev)
> priv->dma_alloc = buffer_size;
> }
---> Missing bits are here: calculate number of transmission buffers...
>
> + priv->vma = devm_kzalloc(&pdev->dev, priv->num_tx*sizeof(void*), GFP_KERNEL);
---> And here: should be
priv->vma = devm_kzalloc(&pdev->dev, (priv->num_tx
+priv->num_rx)*sizeof(void*), GFP_KERNEL);
> + if (!priv->vma) {
> + ret = -ENOMEM;
> + goto error;
> + }
> +
> /* Allow the platform setup code to pass in a MAC address. */
> if (pdev->dev.platform_data) {
> struct ethoc_platform_data *pdata =
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 198 bytes --]
^ permalink raw reply
* Re: [PATCH for-2.6.35] virtio_net: fix oom handling on tx
From: Michael S. Tsirkin @ 2010-06-10 19:03 UTC (permalink / raw)
To: Stephen Hemminger
Cc: Sridhar Samudrala, virtualization, Rusty Russell, Jiri Pirko,
Shirley Ma, netdev, linux-kernel
In-Reply-To: <20100610104653.1aed2ecc@nehalam>
On Thu, Jun 10, 2010 at 10:46:53AM -0700, Stephen Hemminger wrote:
> On Thu, 10 Jun 2010 10:17:07 -0700
> Sridhar Samudrala <sri@us.ibm.com> wrote:
>
> > On Thu, 2010-06-10 at 18:20 +0300, Michael S. Tsirkin wrote:
> > > virtio net will never try to overflow the TX ring, so the only reason
> > > add_buf may fail is out of memory. Thus, we can not stop the
> > > device until some request completes - there's no guarantee anything
> > > at all is outstanding.
> > >
> > > Make the error message clearer as well: error here does not
> > > indicate queue full.
> > >
> > > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > > ---
> > > drivers/net/virtio_net.c | 15 ++++++++-------
> > > 1 files changed, 8 insertions(+), 7 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 85615a3..e48a06f 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -563,7 +563,6 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > > struct virtnet_info *vi = netdev_priv(dev);
> > > int capacity;
> > >
> > > -again:
> > > /* Free up any pending old buffers before queueing new ones. */
> > > free_old_xmit_skbs(vi);
> > >
> > > @@ -572,12 +571,14 @@ again:
> > >
> > > /* This can happen with OOM and indirect buffers. */
> > > if (unlikely(capacity < 0)) {
> > > - netif_stop_queue(dev);
> > > - dev_warn(&dev->dev, "Unexpected full queue\n");
> > > - if (unlikely(!virtqueue_enable_cb(vi->svq))) {
> > > - virtqueue_disable_cb(vi->svq);
> > > - netif_start_queue(dev);
> > > - goto again;
> > > + if (net_ratelimit()) {
> > > + if (likely(capacity == -ENOMEM))
> > > + dev_warn(&dev->dev,
> > > + "TX queue failure: out of memory\n");
> > > + else
> > > + dev_warn(&dev->dev,
> > > + "Unexpected TX queue failure: %d\n",
> > > + capacity);
> > > }
> > > return NETDEV_TX_BUSY;
> > > }
> >
> > It is not clear to me how xmit_skb() can return -ENOMEM.
> > xmit_skb() calls virtqueue_add_buf_gfp() which can return -ENOSPC.
> > Even vring_add_indirect() doesn't return -ENOMEM on kmalloc failure.
>
> It makes more sense to have the device increment tx_droppped,
> and return NETDEV_TX_OK. Skip the message (or make it a pr_debug()).
> Network devices do not guarantee packet delivery, and if out of
> resources then holding more data in the
> queue is going to hurt not help the situation.
>
> --
Well, I only keep the existing behaviour around. The changes you propose
would be 2.6.36 material.
I have it on my todo list to look for a way to test performance under
GFP_ATOMIC failure scenario. Any suggestions?
--
MST
^ permalink raw reply
* Re: [PATCH for-2.6.35] virtio_net: fix oom handling on tx
From: Michael S. Tsirkin @ 2010-06-10 18:57 UTC (permalink / raw)
To: Sridhar Samudrala
Cc: virtualization, Rusty Russell, Jiri Pirko, Shirley Ma, netdev,
linux-kernel
In-Reply-To: <1276190227.22064.19.camel@w-sridhar.beaverton.ibm.com>
On Thu, Jun 10, 2010 at 10:17:07AM -0700, Sridhar Samudrala wrote:
> On Thu, 2010-06-10 at 18:20 +0300, Michael S. Tsirkin wrote:
> > virtio net will never try to overflow the TX ring, so the only reason
> > add_buf may fail is out of memory. Thus, we can not stop the
> > device until some request completes - there's no guarantee anything
> > at all is outstanding.
> >
> > Make the error message clearer as well: error here does not
> > indicate queue full.
> >
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> > drivers/net/virtio_net.c | 15 ++++++++-------
> > 1 files changed, 8 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 85615a3..e48a06f 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -563,7 +563,6 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > struct virtnet_info *vi = netdev_priv(dev);
> > int capacity;
> >
> > -again:
> > /* Free up any pending old buffers before queueing new ones. */
> > free_old_xmit_skbs(vi);
> >
> > @@ -572,12 +571,14 @@ again:
> >
> > /* This can happen with OOM and indirect buffers. */
> > if (unlikely(capacity < 0)) {
> > - netif_stop_queue(dev);
> > - dev_warn(&dev->dev, "Unexpected full queue\n");
> > - if (unlikely(!virtqueue_enable_cb(vi->svq))) {
> > - virtqueue_disable_cb(vi->svq);
> > - netif_start_queue(dev);
> > - goto again;
> > + if (net_ratelimit()) {
> > + if (likely(capacity == -ENOMEM))
> > + dev_warn(&dev->dev,
> > + "TX queue failure: out of memory\n");
> > + else
> > + dev_warn(&dev->dev,
> > + "Unexpected TX queue failure: %d\n",
> > + capacity);
> > }
> > return NETDEV_TX_BUSY;
> > }
>
> It is not clear to me how xmit_skb() can return -ENOMEM.
> xmit_skb() calls virtqueue_add_buf_gfp() which can return -ENOSPC.
> Even vring_add_indirect() doesn't return -ENOMEM on kmalloc failure.
>
> Thanks
> Sridhar
A separate patch fixes vring_add_indirect to return -ENOMEM.
-ENOSPC really means ring is full so nothing to do
and no need to retry.
--
MST
^ permalink raw reply
* [PATCH net-2.6] ixgbe: fix automatic LRO/RSC settings for low latency
From: Andy Gospodarek @ 2010-06-10 18:12 UTC (permalink / raw)
To: netdev, Jeff Kirsher, Jesse Brandeburg; +Cc: Stanislaw Gruszka, stable
This patch added to 2.6.34:
commit f8d1dcaf88bddc7f282722ec1fdddbcb06a72f18
Author: Jesse Brandeburg <jesse.brandeburg@intel.com>
Date: Tue Apr 27 01:37:20 2010 +0000
ixgbe: enable extremely low latency
introduced a feature where LRO (called RSC on the hardware) was disabled
automatically when setting rx-usecs to 0 via ethtool. Some might not
like the fact that LRO was disabled automatically, but I'm fine with
that. What I don't like is that LRO/RSC is automatically enabled when
rx-usecs is set >0 via ethtool.
This would certainly be a problem if the device was used for forwarding
and it was determined that the low latency wasn't needed after the
device was already forwarding. I played around with saving the state of
LRO in the driver, but it just didn't seem worthwhile and would require
a small change to dev_disable_lro() that I did not like.
This patch simply leaves LRO disabled when setting rx-usecs >0 and
requires that the user enable it again. An extra informational message
will also now appear in the log so users can understand why LRO isn't
being enabled as they expect.
Inconsistency of LRO setting first noticed by Stanislaw Gruszka.
Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
CC: Stanislaw Gruszka <sgruszka@redhat.com>
CC: stable@kernel.org
---
drivers/net/ixgbe/ixgbe_ethtool.c | 37 ++++++++-----------------------------
1 files changed, 8 insertions(+), 29 deletions(-)
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index c50a754..9e672ee 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -2077,25 +2077,6 @@ static int ixgbe_get_coalesce(struct net_device *netdev,
return 0;
}
-/*
- * this function must be called before setting the new value of
- * rx_itr_setting
- */
-static bool ixgbe_reenable_rsc(struct ixgbe_adapter *adapter,
- struct ethtool_coalesce *ec)
-{
- /* check the old value and enable RSC if necessary */
- if ((adapter->rx_itr_setting == 0) &&
- (adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)) {
- adapter->flags2 |= IXGBE_FLAG2_RSC_ENABLED;
- adapter->netdev->features |= NETIF_F_LRO;
- DPRINTK(PROBE, INFO, "rx-usecs set to %d, re-enabling RSC\n",
- ec->rx_coalesce_usecs);
- return true;
- }
- return false;
-}
-
static int ixgbe_set_coalesce(struct net_device *netdev,
struct ethtool_coalesce *ec)
{
@@ -2124,9 +2105,6 @@ static int ixgbe_set_coalesce(struct net_device *netdev,
(1000000/ec->rx_coalesce_usecs < IXGBE_MIN_INT_RATE))
return -EINVAL;
- /* check the old value and enable RSC if necessary */
- need_reset = ixgbe_reenable_rsc(adapter, ec);
-
/* store the value in ints/second */
adapter->rx_eitr_param = 1000000/ec->rx_coalesce_usecs;
@@ -2135,9 +2113,6 @@ static int ixgbe_set_coalesce(struct net_device *netdev,
/* clear the lower bit as its used for dynamic state */
adapter->rx_itr_setting &= ~1;
} else if (ec->rx_coalesce_usecs == 1) {
- /* check the old value and enable RSC if necessary */
- need_reset = ixgbe_reenable_rsc(adapter, ec);
-
/* 1 means dynamic mode */
adapter->rx_eitr_param = 20000;
adapter->rx_itr_setting = 1;
@@ -2157,10 +2132,11 @@ static int ixgbe_set_coalesce(struct net_device *netdev,
*/
if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) {
adapter->flags2 &= ~IXGBE_FLAG2_RSC_ENABLED;
- netdev->features &= ~NETIF_F_LRO;
- DPRINTK(PROBE, INFO,
- "rx-usecs set to 0, disabling RSC\n");
-
+ if (netdev->features & NETIF_F_LRO) {
+ netdev->features &= ~NETIF_F_LRO;
+ DPRINTK(PROBE, INFO, "rx-usecs set to 0, "
+ "disabling LRO/RSC\n");
+ }
need_reset = true;
}
}
@@ -2255,6 +2231,9 @@ static int ixgbe_set_flags(struct net_device *netdev, u32 data)
}
} else if (!adapter->rx_itr_setting) {
netdev->features &= ~ETH_FLAG_LRO;
+ if (data & ETH_FLAG_LRO)
+ DPRINTK(PROBE, INFO, "rx-usecs set to 0, "
+ "LRO/RSC cannot be enabled.\n");
}
}
^ permalink raw reply related
* Re: [PATCH for-2.6.35] virtio_net: fix oom handling on tx
From: Stephen Hemminger @ 2010-06-10 17:46 UTC (permalink / raw)
To: Sridhar Samudrala
Cc: Michael S. Tsirkin, virtualization, Rusty Russell, Jiri Pirko,
Shirley Ma, netdev, linux-kernel
In-Reply-To: <1276190227.22064.19.camel@w-sridhar.beaverton.ibm.com>
On Thu, 10 Jun 2010 10:17:07 -0700
Sridhar Samudrala <sri@us.ibm.com> wrote:
> On Thu, 2010-06-10 at 18:20 +0300, Michael S. Tsirkin wrote:
> > virtio net will never try to overflow the TX ring, so the only reason
> > add_buf may fail is out of memory. Thus, we can not stop the
> > device until some request completes - there's no guarantee anything
> > at all is outstanding.
> >
> > Make the error message clearer as well: error here does not
> > indicate queue full.
> >
> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> > ---
> > drivers/net/virtio_net.c | 15 ++++++++-------
> > 1 files changed, 8 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 85615a3..e48a06f 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -563,7 +563,6 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> > struct virtnet_info *vi = netdev_priv(dev);
> > int capacity;
> >
> > -again:
> > /* Free up any pending old buffers before queueing new ones. */
> > free_old_xmit_skbs(vi);
> >
> > @@ -572,12 +571,14 @@ again:
> >
> > /* This can happen with OOM and indirect buffers. */
> > if (unlikely(capacity < 0)) {
> > - netif_stop_queue(dev);
> > - dev_warn(&dev->dev, "Unexpected full queue\n");
> > - if (unlikely(!virtqueue_enable_cb(vi->svq))) {
> > - virtqueue_disable_cb(vi->svq);
> > - netif_start_queue(dev);
> > - goto again;
> > + if (net_ratelimit()) {
> > + if (likely(capacity == -ENOMEM))
> > + dev_warn(&dev->dev,
> > + "TX queue failure: out of memory\n");
> > + else
> > + dev_warn(&dev->dev,
> > + "Unexpected TX queue failure: %d\n",
> > + capacity);
> > }
> > return NETDEV_TX_BUSY;
> > }
>
> It is not clear to me how xmit_skb() can return -ENOMEM.
> xmit_skb() calls virtqueue_add_buf_gfp() which can return -ENOSPC.
> Even vring_add_indirect() doesn't return -ENOMEM on kmalloc failure.
It makes more sense to have the device increment tx_droppped,
and return NETDEV_TX_OK. Skip the message (or make it a pr_debug()).
Network devices do not guarantee packet delivery, and if out of
resources then holding more data in the
queue is going to hurt not help the situation.
--
^ permalink raw reply
* Re: [PATCH 1/2] pktgen: increasing transmission granularity
From: Robert Olsson @ 2010-06-10 17:21 UTC (permalink / raw)
To: Daniel Turull; +Cc: David Miller, netdev, robert, jens.laas
In-Reply-To: <4C10A735.20405@gmail.com>
Thanks,
So with this improved rate control we could do RFC2544-like testing as well.
Cheers
--ro
Signed-off-by: Robert Olsson <robert.olsson@its.uu.se>
Daniel Turull writes:
> This patch increases the granularity of the rate generated by pktgen.
> The previous version of pktgen uses micro seconds (udelay) resolution when it
> was delayed causing gaps in the rates. It is changed to nanosecond (ndelay).
> Now any rate is possible.
>
> Also it allows to set, the desired rate in Mb/s or packets per second.
>
> The documentation has been updated.
>
> Signed-off-by: Daniel Turull <daniel.turull@gmail.com>
>
> ---
>
> diff --git a/Documentation/networking/pktgen.txt b/Documentation/networking/pktgen.txt
> index 61bb645..75e4fd7 100644
> --- a/Documentation/networking/pktgen.txt
> +++ b/Documentation/networking/pktgen.txt
> @@ -151,6 +151,8 @@ Examples:
>
> pgset stop aborts injection. Also, ^C aborts generator.
>
> + pgset "rate 300M" set rate to 300 Mb/s
> + pgset "ratep 1000000" set rate to 1Mpps
>
> Example scripts
> ===============
> @@ -241,6 +243,9 @@ src6
> flows
> flowlen
>
> +rate
> +ratep
> +
> References:
> ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/
> ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/
> diff --git a/net/core/pktgen.c b/net/core/pktgen.c
> index 1dacd7b..6428653 100644
> --- a/net/core/pktgen.c
> +++ b/net/core/pktgen.c
> @@ -169,7 +169,7 @@
> #include <asm/dma.h>
> #include <asm/div64.h> /* do_div */
>
> -#define VERSION "2.73"
> +#define VERSION "2.74"
> #define IP_NAME_SZ 32
> #define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
> #define MPLS_STACK_BOTTOM htonl(0x00000100)
> @@ -980,6 +980,40 @@ static ssize_t pktgen_if_write(struct file *file,
> (unsigned long long) pkt_dev->delay);
> return count;
> }
> + if (!strcmp(name, "rate")) {
> + len = num_arg(&user_buffer[i], 10, &value);
> + if (len < 0)
> + return len;
> +
> + i += len;
> + if (!value)
> + return len;
> + pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value;
> + if (debug)
> + printk(KERN_INFO
> + "pktgen: Delay set at: %llu ns\n",
> + pkt_dev->delay);
> +
> + sprintf(pg_result, "OK: rate=%lu", value);
> + return count;
> + }
> + if (!strcmp(name, "ratep")) {
> + len = num_arg(&user_buffer[i], 10, &value);
> + if (len < 0)
> + return len;
> +
> + i += len;
> + if (!value)
> + return len;
> + pkt_dev->delay = NSEC_PER_SEC/value;
> + if (debug)
> + printk(KERN_INFO
> + "pktgen: Delay set at: %llu ns\n",
> + pkt_dev->delay);
> +
> + sprintf(pg_result, "OK: rate=%lu", value);
> + return count;
> + }
> if (!strcmp(name, "udp_src_min")) {
> len = num_arg(&user_buffer[i], 10, &value);
> if (len < 0)
> @@ -2142,15 +2176,15 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
> hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
> hrtimer_set_expires(&t.timer, spin_until);
>
> - remaining = ktime_to_us(hrtimer_expires_remaining(&t.timer));
> + remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
> if (remaining <= 0) {
> pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
> return;
> }
>
> start_time = ktime_now();
> - if (remaining < 100)
> - udelay(remaining); /* really small just spin */
> + if (remaining < 100000)
> + ndelay(remaining); /* really small just spin */
> else {
> /* see do_nanosleep */
> hrtimer_init_sleeper(&t, current);
^ permalink raw reply
* Re: [PATCH for-2.6.35] virtio_net: fix oom handling on tx
From: Sridhar Samudrala @ 2010-06-10 17:17 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: virtualization, Rusty Russell, Jiri Pirko, Shirley Ma, netdev,
linux-kernel
In-Reply-To: <20100610152041.GA3480@redhat.com>
On Thu, 2010-06-10 at 18:20 +0300, Michael S. Tsirkin wrote:
> virtio net will never try to overflow the TX ring, so the only reason
> add_buf may fail is out of memory. Thus, we can not stop the
> device until some request completes - there's no guarantee anything
> at all is outstanding.
>
> Make the error message clearer as well: error here does not
> indicate queue full.
>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/net/virtio_net.c | 15 ++++++++-------
> 1 files changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 85615a3..e48a06f 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -563,7 +563,6 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> struct virtnet_info *vi = netdev_priv(dev);
> int capacity;
>
> -again:
> /* Free up any pending old buffers before queueing new ones. */
> free_old_xmit_skbs(vi);
>
> @@ -572,12 +571,14 @@ again:
>
> /* This can happen with OOM and indirect buffers. */
> if (unlikely(capacity < 0)) {
> - netif_stop_queue(dev);
> - dev_warn(&dev->dev, "Unexpected full queue\n");
> - if (unlikely(!virtqueue_enable_cb(vi->svq))) {
> - virtqueue_disable_cb(vi->svq);
> - netif_start_queue(dev);
> - goto again;
> + if (net_ratelimit()) {
> + if (likely(capacity == -ENOMEM))
> + dev_warn(&dev->dev,
> + "TX queue failure: out of memory\n");
> + else
> + dev_warn(&dev->dev,
> + "Unexpected TX queue failure: %d\n",
> + capacity);
> }
> return NETDEV_TX_BUSY;
> }
It is not clear to me how xmit_skb() can return -ENOMEM.
xmit_skb() calls virtqueue_add_buf_gfp() which can return -ENOSPC.
Even vring_add_indirect() doesn't return -ENOMEM on kmalloc failure.
Thanks
Sridhar
^ permalink raw reply
* Re: tunnel creation: using fb_tnl_dev inside struct ip6_tnl_net
From: kernel coder @ 2010-06-10 17:12 UTC (permalink / raw)
To: netdev
In-Reply-To: <AANLkTil_o71Qz2y9UJGw4uKVNQAOlYBVQLRjYtghyrni@mail.gmail.com>
Hi,
I need to port some code from linux kernel 2.6.18 to kernel 2.6.29.
One of the file where changes are involved is <net/ipv6/ip6_tunnel.c>,
where a new structure struct ip6_tnl_net was added into kernel2.6.29
that didn't exist in 2.6.18 as follows:
struct ip6_tnl_net {
/* the IPv6 tunnel fallback device */
struct net_device *fb_tnl_dev;
/* lists for storing tunnels in use */
struct ip6_tnl *tnls_r_l[HASH_SIZE];
struct ip6_tnl *tnls_wc[1];
struct ip6_tnl **tnls[2];
};
All those fields in struct ip6_tnl_net were in global namespace in
2.6.18, but wrapped in this struct for some reason (what benefit does
this gain?)
2.6.18 code in function ip6_tunnel_init() works fine as follows. After
being ported to kernel 2.6.29 as shown below, after kernel/module
build, install, reboot hangs. I inserted some printk message as shown,
only 111111 to 44444 were displayed, which indicates that
"register_netdev(ip6n->fb_tnl_dev)" failed.
Any inputs? Thanks.
====================Kernel 2.6.18 code, works fine====================
static int __init ip6_tunnel_init(void)
{
int err;
struct ip6_tnl *tun0;
if (xfrm6_tunnel_register(&ip6ip6_handler)) {
printk(KERN_ERR "ip6ip6 init: can't register tunnel\n");
return -EAGAIN;
}
ip6ip6_fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
ip6ip6_tnl_dev_setup);
if (!ip6ip6_fb_tnl_dev) {
err = -ENOMEM;
goto fail;
}
ip6ip6_fb_tnl_dev->init = ip6ip6_fb_tnl_dev_init;
if ((err = register_netdev(ip6ip6_fb_tnl_dev))) {
free_netdev(ip6ip6_fb_tnl_dev);
goto fail;
}
tun0 = ip6ip6_fb_tnl_dev->priv;
tun0->parms.flags |= (IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV);
return 0;
fail:
xfrm6_tunnel_deregister(&ip6ip6_handler);
return err;
}
=================modified Kernel 2.6.29 code, buggy==============
static int __init ip6_tunnel_init(void)
{
int err;
struct ip6_tnl_net *ip6n; /*added due to kernel struct change*/
struct ip6_tnl *tun0; /*added 06082010*/
if (xfrm6_tunnel_register(&ip4ip6_handler, AF_INET)) {
printk(KERN_ERR "ip6_tunnel init: can't register ip4ip6\n");
err = -EAGAIN;
goto out;
}
if (xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6)) {
printk(KERN_ERR "ip6_tunnel init: can't register ip6ip6\n");
err = -EAGAIN;
goto unreg_ip4ip6;
}
err = register_pernet_gen_device(&ip6_tnl_net_id, &ip6_tnl_net_ops);
if (err < 0)
goto err_pernet;
printk(KERN_ERR "=========1111111111111111111========\n");
//alloc_netdev() and init the device, added
err = -ENOMEM;
ip6n = kzalloc(sizeof(struct ip6_tnl_net), GFP_KERNEL);
if (ip6n == NULL)
goto out;
ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
ip6_tnl_dev_setup);
printk(KERN_ERR "=======222222222222222222=========\n");
if (!ip6n->fb_tnl_dev) {
err = -ENOMEM;
goto out;
}
printk(KERN_ERR "=========3333333333333333333========\n");
ip6n->fb_tnl_dev->init = ip6_fb_tnl_dev_init2;
//ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
printk(KERN_ERR "=========4444444444444444444========\n");
err = register_netdev(ip6n->fb_tnl_dev);
if (err<0) {
free_netdev(ip6n->fb_tnl_dev);
goto err_pernet;
}
printk(KERN_ERR "=========555555555555555555========\n");
// tun0 = ip6ip6_fb_tnl_dev->priv; /*added per patch*/
tun0 = netdev_priv(ip6n->fb_tnl_dev); /*modified */
printk(KERN_ERR "=========666666666666666666========\n");
tun0->parms.flags |= (IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV);
printk(KERN_ERR "=========777777777777777777========\n");
return 0;
err_pernet:
xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6);
printk(KERN_ERR "========err pernet=========\n");
unreg_ip4ip6:
xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
printk(KERN_ERR "========err unreg_ip4ip6=========\n");
out:
printk(KERN_ERR "========err out==================\n");
return err;
}
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox