* [patch net-next 0/3] team: bug fixes
From: Jiri Pirko @ 2011-11-16 21:09 UTC (permalink / raw)
To: netdev
Cc: davem, eric.dumazet, bhutchings, shemminger, andy, fbl, jzupka,
ivecera
Jiri Pirko (3):
team: Do not hold rcu_read_lock when running netlink cmds
team: convert overall spinlock to mutex
team: replicate options on register
drivers/net/team/team.c | 120 +++++++++++++++++++++--------
drivers/net/team/team_mode_activebackup.c | 5 +-
include/linux/if_team.h | 10 +-
3 files changed, 93 insertions(+), 42 deletions(-)
--
1.7.6
^ permalink raw reply
* [patch net-next 1/3] team: Do not hold rcu_read_lock when running netlink cmds
From: Jiri Pirko @ 2011-11-16 21:09 UTC (permalink / raw)
To: netdev
Cc: davem, eric.dumazet, bhutchings, shemminger, andy, fbl, jzupka,
ivecera
In-Reply-To: <1321477749-1877-1-git-send-email-jpirko@redhat.com>
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
drivers/net/team/team.c | 12 ++++--------
1 files changed, 4 insertions(+), 8 deletions(-)
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 60672bb..e0cf11c 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1043,8 +1043,7 @@ err_msg_put:
/*
* Netlink cmd functions should be locked by following two functions.
- * To ensure team_uninit would not be called in between, hold rcu_read_lock
- * all the time.
+ * Since dev gets held here, that ensures dev won't disappear in between.
*/
static struct team *team_nl_team_get(struct genl_info *info)
{
@@ -1057,12 +1056,9 @@ static struct team *team_nl_team_get(struct genl_info *info)
return NULL;
ifindex = nla_get_u32(info->attrs[TEAM_ATTR_TEAM_IFINDEX]);
- rcu_read_lock();
- dev = dev_get_by_index_rcu(net, ifindex);
- if (!dev || dev->netdev_ops != &team_netdev_ops) {
- rcu_read_unlock();
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev || dev->netdev_ops != &team_netdev_ops)
return NULL;
- }
team = netdev_priv(dev);
spin_lock(&team->lock);
@@ -1072,7 +1068,7 @@ static struct team *team_nl_team_get(struct genl_info *info)
static void team_nl_team_put(struct team *team)
{
spin_unlock(&team->lock);
- rcu_read_unlock();
+ dev_put(team->dev);
}
static int team_nl_send_generic(struct genl_info *info, struct team *team,
--
1.7.6
^ permalink raw reply related
* [patch net-next 2/3] team: convert overall spinlock to mutex
From: Jiri Pirko @ 2011-11-16 21:09 UTC (permalink / raw)
To: netdev
Cc: davem, eric.dumazet, bhutchings, shemminger, andy, fbl, jzupka,
ivecera
In-Reply-To: <1321477749-1877-1-git-send-email-jpirko@redhat.com>
No need to have spinlock for this purpose. So convert this to mutex and
avoid current schedule while atomic problems in netlink code.
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
drivers/net/team/team.c | 32 ++++++++++++++++----------------
include/linux/if_team.h | 2 +-
2 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index e0cf11c..04f9302 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -443,9 +443,9 @@ static void __team_compute_features(struct team *team)
static void team_compute_features(struct team *team)
{
- spin_lock(&team->lock);
+ mutex_lock(&team->lock);
__team_compute_features(team);
- spin_unlock(&team->lock);
+ mutex_unlock(&team->lock);
}
static int team_port_enter(struct team *team, struct team_port *port)
@@ -647,7 +647,7 @@ static int team_init(struct net_device *dev)
int i;
team->dev = dev;
- spin_lock_init(&team->lock);
+ mutex_init(&team->lock);
team->pcpu_stats = alloc_percpu(struct team_pcpu_stats);
if (!team->pcpu_stats)
@@ -672,13 +672,13 @@ static void team_uninit(struct net_device *dev)
struct team_port *port;
struct team_port *tmp;
- spin_lock(&team->lock);
+ mutex_lock(&team->lock);
list_for_each_entry_safe(port, tmp, &team->port_list, list)
team_port_del(team, port->dev);
__team_change_mode(team, NULL); /* cleanup */
__team_options_unregister(team, team_options, ARRAY_SIZE(team_options));
- spin_unlock(&team->lock);
+ mutex_unlock(&team->lock);
}
static void team_destructor(struct net_device *dev)
@@ -784,7 +784,7 @@ static int team_change_mtu(struct net_device *dev, int new_mtu)
* Alhough this is reader, it's guarded by team lock. It's not possible
* to traverse list in reverse under rcu_read_lock
*/
- spin_lock(&team->lock);
+ mutex_lock(&team->lock);
list_for_each_entry(port, &team->port_list, list) {
err = dev_set_mtu(port->dev, new_mtu);
if (err) {
@@ -793,7 +793,7 @@ static int team_change_mtu(struct net_device *dev, int new_mtu)
goto unwind;
}
}
- spin_unlock(&team->lock);
+ mutex_unlock(&team->lock);
dev->mtu = new_mtu;
@@ -802,7 +802,7 @@ static int team_change_mtu(struct net_device *dev, int new_mtu)
unwind:
list_for_each_entry_continue_reverse(port, &team->port_list, list)
dev_set_mtu(port->dev, dev->mtu);
- spin_unlock(&team->lock);
+ mutex_unlock(&team->lock);
return err;
}
@@ -880,9 +880,9 @@ static int team_add_slave(struct net_device *dev, struct net_device *port_dev)
struct team *team = netdev_priv(dev);
int err;
- spin_lock(&team->lock);
+ mutex_lock(&team->lock);
err = team_port_add(team, port_dev);
- spin_unlock(&team->lock);
+ mutex_unlock(&team->lock);
return err;
}
@@ -891,9 +891,9 @@ static int team_del_slave(struct net_device *dev, struct net_device *port_dev)
struct team *team = netdev_priv(dev);
int err;
- spin_lock(&team->lock);
+ mutex_lock(&team->lock);
err = team_port_del(team, port_dev);
- spin_unlock(&team->lock);
+ mutex_unlock(&team->lock);
return err;
}
@@ -1061,13 +1061,13 @@ static struct team *team_nl_team_get(struct genl_info *info)
return NULL;
team = netdev_priv(dev);
- spin_lock(&team->lock);
+ mutex_lock(&team->lock);
return team;
}
static void team_nl_team_put(struct team *team)
{
- spin_unlock(&team->lock);
+ mutex_unlock(&team->lock);
dev_put(team->dev);
}
@@ -1483,9 +1483,9 @@ static void team_port_change_check(struct team_port *port, bool linkup)
{
struct team *team = port->team;
- spin_lock(&team->lock);
+ mutex_lock(&team->lock);
__team_port_change_check(port, linkup);
- spin_unlock(&team->lock);
+ mutex_unlock(&team->lock);
}
/************************************
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index 14f6388..a6eac12 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -92,7 +92,7 @@ struct team {
struct net_device *dev; /* associated netdevice */
struct team_pcpu_stats __percpu *pcpu_stats;
- spinlock_t lock; /* used for overall locking, e.g. port lists write */
+ struct mutex lock; /* used for overall locking, e.g. port lists write */
/*
* port lists with port count
--
1.7.6
^ permalink raw reply related
* [patch net-next 3/3] team: replicate options on register
From: Jiri Pirko @ 2011-11-16 21:09 UTC (permalink / raw)
To: netdev
Cc: davem, eric.dumazet, bhutchings, shemminger, andy, fbl, jzupka,
ivecera
In-Reply-To: <1321477749-1877-1-git-send-email-jpirko@redhat.com>
Since multiple team instances are putting defined options into their
option list, during register each option must be cloned before added
into list. This resolves uncool memory corruptions when using multiple
teams.
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
drivers/net/team/team.c | 76 +++++++++++++++++++++++++----
drivers/net/team/team_mode_activebackup.c | 5 +-
include/linux/if_team.h | 8 ++--
3 files changed, 72 insertions(+), 17 deletions(-)
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 04f9302..b47f43e 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -80,30 +80,78 @@ EXPORT_SYMBOL(team_port_set_team_mac);
* Options handling
*******************/
-void team_options_register(struct team *team, struct team_option *option,
- size_t option_count)
+struct team_option *__team_find_option(struct team *team, const char *opt_name)
+{
+ struct team_option *option;
+
+ list_for_each_entry(option, &team->option_list, list) {
+ if (strcmp(option->name, opt_name) == 0)
+ return option;
+ }
+ return NULL;
+}
+
+int team_options_register(struct team *team,
+ const struct team_option *option,
+ size_t option_count)
{
int i;
+ struct team_option *dst_opts[option_count];
+ int err;
+
+ memset(dst_opts, 0, sizeof(dst_opts));
+ for (i = 0; i < option_count; i++, option++) {
+ struct team_option *dst_opt;
+
+ if (__team_find_option(team, option->name)) {
+ err = -EEXIST;
+ goto rollback;
+ }
+ dst_opt = kmalloc(sizeof(*option), GFP_KERNEL);
+ if (!dst_opt) {
+ err = -ENOMEM;
+ goto rollback;
+ }
+ memcpy(dst_opt, option, sizeof(*option));
+ dst_opts[i] = dst_opt;
+ }
+
+ for (i = 0; i < option_count; i++)
+ list_add_tail(&dst_opts[i]->list, &team->option_list);
- for (i = 0; i < option_count; i++, option++)
- list_add_tail(&option->list, &team->option_list);
+ return 0;
+
+rollback:
+ for (i = 0; i < option_count; i++)
+ kfree(dst_opts[i]);
+
+ return err;
}
+
EXPORT_SYMBOL(team_options_register);
static void __team_options_change_check(struct team *team,
struct team_option *changed_option);
static void __team_options_unregister(struct team *team,
- struct team_option *option,
+ const struct team_option *option,
size_t option_count)
{
int i;
- for (i = 0; i < option_count; i++, option++)
- list_del(&option->list);
+ for (i = 0; i < option_count; i++, option++) {
+ struct team_option *del_opt;
+
+ del_opt = __team_find_option(team, option->name);
+ if (del_opt) {
+ list_del(&del_opt->list);
+ kfree(del_opt);
+ }
+ }
}
-void team_options_unregister(struct team *team, struct team_option *option,
+void team_options_unregister(struct team *team,
+ const struct team_option *option,
size_t option_count)
{
__team_options_unregister(team, option, option_count);
@@ -632,7 +680,7 @@ static int team_mode_option_set(struct team *team, void *arg)
return team_change_mode(team, *str);
}
-static struct team_option team_options[] = {
+static const struct team_option team_options[] = {
{
.name = "mode",
.type = TEAM_OPTION_TYPE_STRING,
@@ -645,6 +693,7 @@ static int team_init(struct net_device *dev)
{
struct team *team = netdev_priv(dev);
int i;
+ int err;
team->dev = dev;
mutex_init(&team->lock);
@@ -660,10 +709,17 @@ static int team_init(struct net_device *dev)
team_adjust_ops(team);
INIT_LIST_HEAD(&team->option_list);
- team_options_register(team, team_options, ARRAY_SIZE(team_options));
+ err = team_options_register(team, team_options, ARRAY_SIZE(team_options));
+ if (err)
+ goto err_options_register;
netif_carrier_off(dev);
return 0;
+
+err_options_register:
+ free_percpu(team->pcpu_stats);
+
+ return err;
}
static void team_uninit(struct net_device *dev)
diff --git a/drivers/net/team/team_mode_activebackup.c b/drivers/net/team/team_mode_activebackup.c
index 6fe920c..b344275 100644
--- a/drivers/net/team/team_mode_activebackup.c
+++ b/drivers/net/team/team_mode_activebackup.c
@@ -83,7 +83,7 @@ static int ab_active_port_set(struct team *team, void *arg)
return -ENOENT;
}
-static struct team_option ab_options[] = {
+static const struct team_option ab_options[] = {
{
.name = "activeport",
.type = TEAM_OPTION_TYPE_U32,
@@ -94,8 +94,7 @@ static struct team_option ab_options[] = {
int ab_init(struct team *team)
{
- team_options_register(team, ab_options, ARRAY_SIZE(ab_options));
- return 0;
+ return team_options_register(team, ab_options, ARRAY_SIZE(ab_options));
}
void ab_exit(struct team *team)
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index a6eac12..828181f 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -140,11 +140,11 @@ static inline struct team_port *team_get_port_by_index_rcu(struct team *team,
}
extern int team_port_set_team_mac(struct team_port *port);
-extern void team_options_register(struct team *team,
- struct team_option *option,
- size_t option_count);
+extern int team_options_register(struct team *team,
+ const struct team_option *option,
+ size_t option_count);
extern void team_options_unregister(struct team *team,
- struct team_option *option,
+ const struct team_option *option,
size_t option_count);
extern int team_mode_register(struct team_mode *mode);
extern int team_mode_unregister(struct team_mode *mode);
--
1.7.6
^ permalink raw reply related
* sky2 driver breaks reboot/shutdown
From: Sven Joachim @ 2011-11-16 21:14 UTC (permalink / raw)
To: Stephen Hemminger, netdev; +Cc: David S. Miller, linux-kernel
[ Resending since my first attempt apparently did not make it to the
mailing lists. Apologies to anyone who receives duplicates. ]
Hi,
after upgrading from Linux 3.1 to 3.2-rc2 I noticed that the kernel
would no longer reboot or shutdown properly, hanging instead. Removing
the sky2 driver works around this. According to "lspci -v", I have the
following network card:
,----
| 03:00.0 Ethernet controller: Marvell Technology Group Ltd. 88E8056 PCI-E Gigabit Ethernet Controller (rev 13)
| Subsystem: ABIT Computer Corp. Device 108f
| Flags: bus master, fast devsel, latency 0, IRQ 17
| Memory at fdefc000 (64-bit, non-prefetchable) [size=16K]
| I/O ports at ce00 [size=256]
| [virtual] Expansion ROM at fdd00000 [disabled] [size=128K]
| Capabilities: <access denied>
| Kernel driver in use: sky2
`----
Bisecting shows 0bdb0bd01 as the first bad commit:
commit 0bdb0bd0139f3b6afa252de1487e3ce82a494db9
Author: stephen hemminger <shemminger@vyatta.com>
Date: Fri Sep 23 11:13:40 2011 +0000
sky2: manage irq better on single port card
Most sky2 hardware only has a single port, although some variations of the
chip support two interfaces. For the single port case, use the standard
Ethernet driver convention of allocating IRQ when device is brought up
rather than at probe time.
Also, change the error handling of dual port cards so that if second
port can not be brought up, then just fail. No point in continuing, since
the failure is most certainly because of out of memory.
The dual port sky2 device has a single irq and a single status ring,
therefore it has a single NAPI object shared by both ports.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/ethernet/marvell/sky2.c b/drivers/net/ethernet/marvell/sky2.c
index ef2dc02..338b10c 100644
--- a/drivers/net/ethernet/marvell/sky2.c
+++ b/drivers/net/ethernet/marvell/sky2.c
@@ -148,6 +148,7 @@ static const unsigned rxqaddr[] = { Q_R1, Q_R2 };
static const u32 portirq_msk[] = { Y2_IS_PORT_1, Y2_IS_PORT_2 };
static void sky2_set_multicast(struct net_device *dev);
+static irqreturn_t sky2_intr(int irq, void *dev_id);
/* Access to PHY via serial interconnect */
static int gm_phy_write(struct sky2_hw *hw, unsigned port, u16 reg, u16 val)
@@ -1715,6 +1716,27 @@ static void sky2_hw_up(struct sky2_port *sky2)
sky2_rx_start(sky2);
}
+/* Setup device IRQ and enable napi to process */
+static int sky2_setup_irq(struct sky2_hw *hw, const char *name)
+{
+ struct pci_dev *pdev = hw->pdev;
+ int err;
+
+ err = request_irq(pdev->irq, sky2_intr,
+ (hw->flags & SKY2_HW_USE_MSI) ? 0 : IRQF_SHARED,
+ name, hw);
+ if (err)
+ dev_err(&pdev->dev, "cannot assign irq %d\n", pdev->irq);
+ else {
+ napi_enable(&hw->napi);
+ sky2_write32(hw, B0_IMSK, Y2_IS_BASE);
+ sky2_read32(hw, B0_IMSK);
+ }
+
+ return err;
+}
+
+
/* Bring up network interface. */
static int sky2_up(struct net_device *dev)
{
@@ -1730,6 +1752,10 @@ static int sky2_up(struct net_device *dev)
if (err)
goto err_out;
+ /* With single port, IRQ is setup when device is brought up */
+ if (hw->ports == 1 && (err = sky2_setup_irq(hw, dev->name)))
+ goto err_out;
+
sky2_hw_up(sky2);
/* Enable interrupts from phy/mac for port */
@@ -2091,8 +2117,13 @@ static int sky2_down(struct net_device *dev)
sky2_read32(hw, B0_IMSK) & ~portirq_msk[sky2->port]);
sky2_read32(hw, B0_IMSK);
- synchronize_irq(hw->pdev->irq);
- napi_synchronize(&hw->napi);
+ if (hw->ports == 1) {
+ napi_disable(&hw->napi);
+ free_irq(hw->pdev->irq, hw);
+ } else {
+ synchronize_irq(hw->pdev->irq);
+ napi_synchronize(&hw->napi);
+ }
sky2_hw_down(sky2);
@@ -4798,7 +4829,7 @@ static const char *sky2_name(u8 chipid, char *buf, int sz)
static int __devinit sky2_probe(struct pci_dev *pdev,
const struct pci_device_id *ent)
{
- struct net_device *dev;
+ struct net_device *dev, *dev1;
struct sky2_hw *hw;
int err, using_dac = 0, wol_default;
u32 reg;
@@ -4924,33 +4955,26 @@ static int __devinit sky2_probe(struct pci_dev *pdev,
netif_napi_add(dev, &hw->napi, sky2_poll, NAPI_WEIGHT);
- err = request_irq(pdev->irq, sky2_intr,
- (hw->flags & SKY2_HW_USE_MSI) ? 0 : IRQF_SHARED,
- hw->irq_name, hw);
- if (err) {
- dev_err(&pdev->dev, "cannot assign irq %d\n", pdev->irq);
- goto err_out_unregister;
- }
- sky2_write32(hw, B0_IMSK, Y2_IS_BASE);
- napi_enable(&hw->napi);
-
sky2_show_addr(dev);
if (hw->ports > 1) {
- struct net_device *dev1;
-
- err = -ENOMEM;
dev1 = sky2_init_netdev(hw, 1, using_dac, wol_default);
- if (dev1 && (err = register_netdev(dev1)) == 0)
- sky2_show_addr(dev1);
- else {
- dev_warn(&pdev->dev,
- "register of second port failed (%d)\n", err);
- hw->dev[1] = NULL;
- hw->ports = 1;
- if (dev1)
- free_netdev(dev1);
+ if (!dev1) {
+ err = -ENOMEM;
+ goto err_out_unregister;
}
+
+ err = register_netdev(dev1);
+ if (err) {
+ dev_err(&pdev->dev, "cannot register second net device\n");
+ goto err_out_free_dev1;
+ }
+
+ err = sky2_setup_irq(hw, hw->irq_name);
+ if (err)
+ goto err_out_unregister_dev1;
+
+ sky2_show_addr(dev1);
}
setup_timer(&hw->watchdog_timer, sky2_watchdog, (unsigned long) hw);
@@ -4961,6 +4985,10 @@ static int __devinit sky2_probe(struct pci_dev *pdev,
return 0;
+err_out_unregister_dev1:
+ unregister_netdev(dev1);
+err_out_free_dev1:
+ free_netdev(dev1);
err_out_unregister:
if (hw->flags & SKY2_HW_USE_MSI)
pci_disable_msi(pdev);
@@ -5000,13 +5028,18 @@ static void __devexit sky2_remove(struct pci_dev *pdev)
unregister_netdev(hw->dev[i]);
sky2_write32(hw, B0_IMSK, 0);
+ sky2_read32(hw, B0_IMSK);
sky2_power_aux(hw);
sky2_write8(hw, B0_CTST, CS_RST_SET);
sky2_read8(hw, B0_CTST);
- free_irq(pdev->irq, hw);
+ if (hw->ports > 1) {
+ napi_disable(&hw->napi);
+ free_irq(pdev->irq, hw);
+ }
+
if (hw->flags & SKY2_HW_USE_MSI)
pci_disable_msi(pdev);
pci_free_consistent(pdev, hw->st_size * sizeof(struct sky2_status_le),
Kind Regards,
Sven
^ permalink raw reply related
* Re: [patch net-next 1/3] team: Do not hold rcu_read_lock when running netlink cmds
From: Eric Dumazet @ 2011-11-16 21:17 UTC (permalink / raw)
To: Jiri Pirko
Cc: netdev, davem, bhutchings, shemminger, andy, fbl, jzupka, ivecera
In-Reply-To: <1321477749-1877-2-git-send-email-jpirko@redhat.com>
Le mercredi 16 novembre 2011 à 22:09 +0100, Jiri Pirko a écrit :
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
> drivers/net/team/team.c | 12 ++++--------
> 1 files changed, 4 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
> index 60672bb..e0cf11c 100644
> --- a/drivers/net/team/team.c
> +++ b/drivers/net/team/team.c
> @@ -1043,8 +1043,7 @@ err_msg_put:
>
> /*
> * Netlink cmd functions should be locked by following two functions.
> - * To ensure team_uninit would not be called in between, hold rcu_read_lock
> - * all the time.
> + * Since dev gets held here, that ensures dev won't disappear in between.
> */
> static struct team *team_nl_team_get(struct genl_info *info)
> {
> @@ -1057,12 +1056,9 @@ static struct team *team_nl_team_get(struct genl_info *info)
> return NULL;
>
> ifindex = nla_get_u32(info->attrs[TEAM_ATTR_TEAM_IFINDEX]);
> - rcu_read_lock();
> - dev = dev_get_by_index_rcu(net, ifindex);
> - if (!dev || dev->netdev_ops != &team_netdev_ops) {
> - rcu_read_unlock();
> + dev = dev_get_by_index(net, ifindex);
> + if (!dev || dev->netdev_ops != &team_netdev_ops)
Hmmm, you return NULL but dev refcnt was increased...
> return NULL;
> - }
>
> team = netdev_priv(dev);
> spin_lock(&team->lock);
> @@ -1072,7 +1068,7 @@ static struct team *team_nl_team_get(struct genl_info *info)
> static void team_nl_team_put(struct team *team)
> {
> spin_unlock(&team->lock);
> - rcu_read_unlock();
> + dev_put(team->dev);
> }
>
> static int team_nl_send_generic(struct genl_info *info, struct team *team,
^ permalink raw reply
* [patch net-next 1/3 v2] team: Do not hold rcu_read_lock when running netlink cmds
From: Jiri Pirko @ 2011-11-16 21:55 UTC (permalink / raw)
To: Eric Dumazet
Cc: netdev, davem, bhutchings, shemminger, andy, fbl, jzupka, ivecera
In-Reply-To: <1321478244.3274.5.camel@edumazet-laptop>
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
drivers/net/team/team.c | 11 +++++------
1 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index 60672bb..e5390c7 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1043,8 +1043,7 @@ err_msg_put:
/*
* Netlink cmd functions should be locked by following two functions.
- * To ensure team_uninit would not be called in between, hold rcu_read_lock
- * all the time.
+ * Since dev gets held here, that ensures dev won't disappear in between.
*/
static struct team *team_nl_team_get(struct genl_info *info)
{
@@ -1057,10 +1056,10 @@ static struct team *team_nl_team_get(struct genl_info *info)
return NULL;
ifindex = nla_get_u32(info->attrs[TEAM_ATTR_TEAM_IFINDEX]);
- rcu_read_lock();
- dev = dev_get_by_index_rcu(net, ifindex);
+ dev = dev_get_by_index(net, ifindex);
if (!dev || dev->netdev_ops != &team_netdev_ops) {
- rcu_read_unlock();
+ if (dev)
+ dev_put(dev);
return NULL;
}
@@ -1072,7 +1071,7 @@ static struct team *team_nl_team_get(struct genl_info *info)
static void team_nl_team_put(struct team *team)
{
spin_unlock(&team->lock);
- rcu_read_unlock();
+ dev_put(team->dev);
}
static int team_nl_send_generic(struct genl_info *info, struct team *team,
--
1.7.6
^ permalink raw reply related
* Re: [3.1] Divide by zero in __tcp_select_window()
From: David Miller @ 2011-11-16 21:58 UTC (permalink / raw)
To: sim
Cc: eric.dumazet, tglx, netdev, a.p.zijlstra, linux-kernel, davej,
schwidefsky, mingo
In-Reply-To: <20111116195419.GE24411@hostway.ca>
From: Simon Kirby <sim@hostway.ca>
Date: Wed, 16 Nov 2011 11:54:19 -0800
> Looks good, thanks! Working on ~25 boxes without issue for >36 hours.
>
> Simon-
>
> Tested-by: Simon Kirby <sim@hostway.ca>
>
>> [PATCH] tcp: clear xmit timers in tcp_v4_syn_recv_sock()
Applied and queued up for -stable, thanks everyone.
^ permalink raw reply
* Re: [PATCH] route: add more relaxed option for secure_redirects
From: David Miller @ 2011-11-16 22:02 UTC (permalink / raw)
To: fbl; +Cc: netdev
In-Reply-To: <20111116184612.25615c02@asterix.rh>
From: Flavio Leitner <fbl@redhat.com>
Date: Wed, 16 Nov 2011 18:46:12 -0200
> Thus, the only option at the sender side would be using iptables
> to change the ICMP redirect source address to be the float address,
> but that is not working as well. (It isn't passing through -t nat)
If it's going to mangle the packet in one direct, the only option
for sane operation is to make the exact reverse transformation in
the other direction for ICMP messages.
I'm sorry to be so difficult about this, but this is the only way to
handle this problem. If packet mangling is performed to change the
world, that mangling entity has taken on the responsibility to make
everything look correct to all entities for the mangled packets
and any packets generated in response to such mangled packets.
^ permalink raw reply
* Re: [PATCH] bonding: Don't allow mode change via sysfs with slaves present
From: Andy Gospodarek @ 2011-11-16 22:02 UTC (permalink / raw)
To: Nicolas de Pesloüan
Cc: Andy Gospodarek, Veaceslav Falico, netdev, Jay Vosburgh,
debian-kernel
In-Reply-To: <4EC3A64D.40405@gmail.com>
On Wed, Nov 16, 2011 at 01:02:21PM +0100, Nicolas de Pesloüan wrote:
> Le 15/11/2011 21:47, Andy Gospodarek a écrit :
>> Nicolas,
>>
>> I took a look at the ifenslave package for debian more closely and it
>> actually looks like devices are enslaved last, after mode is set. Can
>> you please take a look at this package and confirm what I'm seeing in
>> the 'pre-up' script.
>>
>> It appears to me that setup_master sets the mode and enslave_slaves is
>> called after and enslaves the devices:
>>
>> # Option slaves deprecated, replaced by bond-slaves, but still supported
>> # for backward compatibility.
>> IF_BOND_SLAVES=${IF_BOND_SLAVES:-$IF_SLAVES}
>>
>> if [ "$IF_BOND_MASTER" ] ; then
>> BOND_MASTER="$IF_BOND_MASTER"
>> BOND_SLAVES="$IFACE"
>> else
>> if [ "$IF_BOND_SLAVES" ] ; then
>> BOND_MASTER="$IFACE"
>> BOND_SLAVES="$IF_BOND_SLAVES"
>> fi
>> fi
>>
>> # Exit if nothing to do...
>> [ -z "$BOND_MASTER$BOND_SLAVES" ]&& exit
>>
>> add_master
>> early_setup_master
>> setup_master
>> enslave_slaves
>> exit 0
>
> Andy,
>
> I'm really surprise by this extract. In the most up to date version of
> the ifenslave-2.6 package (1.1.0-19), the order is:
>
> add_master
> early_setup_master
> enslave_slaves
> setup_master
>
> early_setup_master was created to be able to do things that absolutely
> need to be done before enslavement. (See the comment just before this
> function). The idea was to do every possible setup in setup_master, after
> enslavement, except those that need to be done in early_setup_master. So
> having enslave_slaves after setup_master instead of before is definitely
> a mistake. Some of the operations in setup_master must be done after
> enslavement, in particular selecting the primary slave.
>
> In version 1.1.0-18 (change log below), I checked all the possible order
> constraints of the sysfs interface and totally reworked most of the setup
> code, putting everything in the right order to achieve consistent
> results.
>
> ifenslave-2.6 (1.1.0-18) experimental; urgency=low
>
> * Apply patch from Nicolas de Pesloüan:
>
> - Major change: Check and fix the order in which the configuration is
> written into /sys files, to ensure reliable results, according to the
> tests done in the kernel (in drivers/net/bonding/bond_sysfs.c).
> - Ensure that master is properly brought down when changing a parameter
> that require it to be down.
> - Ensure the master is brought up at the end of the setup, in the case
> where ifup won't bring it up because it is currently configuring a slave.
> - Add support for some previously unsupported /sys files: ad_select,
> num_grat_arp, num_unsol_na, primary_reselect and queue_id.
> - Enhance the documentation (README.Debian), to describe all the currently
> supported bond-* options.
> - Many other changes in the documentation.
> - Reverse the order of the arguments to most sysfs_* internal functions, for
> better readability.
>
> It was a hard work, because there really exist many such constraints. I
> fail to find enough time to insert the result of this work into
> Documentation/networking/bonding.txt, but still plan to do so, even if
> the result is documented in the script you looked at.
>
> Of course, it is possible to change the scripts in ifenslave-2.6 package
> to arrange for one more constraint. For as far as I understand, this
> would cause the Debian kernel that introduce the change we discuss about
> and all the future Debian kernels to be flagged with 'Breaks:
> ifenslave-2.6 (<< 1.1.0-20)'. I'm not really comfortable with this and
> the Debian kernel team need to be involved. I copied them.
>
> All that being said, I really advocate for less constraints on the sysfs
> setup. This is not strictly related to sysfs setup. If we eventually add
> a NETLINK interface for all the things we can setup using sysfs, we will
> face the exact same problem.
I was looking at ifenslave 1.1.0-20. If you look at Debian bug #641250
you will see a very similar report to what prompted Veaceslav to come up
with this patch and post it here.
ifenslave-2.6 (1.1.0-20) unstable; urgency=low
* Use dashes consistently for bonding options in README.Debian.
Closes: #639244
* Enslave slaves only after fully setting up the master. Closes: #641250
* Add build-arch and build-indep targets to debian/rules.
-- Guus Sliepen <guus@debian.org> Mon, 14 Nov 2011 11:36:21 +0100
ifenslave-2.6 (1.1.0-19) unstable; urgency=low
* Don't bother trying to move configuration files anymore. This is not an
issue anymore in for the next stable release, and it was broken anyway.
Closes: #626959
* Bump Standards-Version.
-- Guus Sliepen <guus@debian.org> Wed, 25 May 2011 18:42:32 +0200
ifenslave-2.6 (1.1.0-18) experimental; urgency=low
* Apply patch from Nicolas de Pesloüan:
- Major change: Check and fix the order in which the configuration is
written into /sys files, to ensure reliable results, according to the
tests done in the kernel (in drivers/net/bonding/bond_sysfs.c).
- Ensure that master is properly brought down when changing a parameter
that require it to be down.
- Ensure the master is brought up at the end of the setup, in the case
where ifup won't bring it up because it is currently configuring a slave.
- Add support for some previously unsupported /sys files: ad_select,
num_grat_arp, num_unsol_na, primary_reselect and queue_id.
- Enhance the documentation (README.Debian), to describe all the currently
supported bond-* options.
- Many other changes in the documentation.
- Reverse the order of the arguments to most sysfs_* internal functions, for
better readability.
* Upload to experimental due to the freeze.
-- Guus Sliepen <guus@debian.org> Tue, 21 Dec 2010 12:46:04 +0100
> I perfectly understand, as Veaceslav noted in another email that there
> are a lot of mode-specific initialization stuff that's present only in
> bond_enslave(), but I think this is what needs to be fixed... Those
> initialization stuff should be moved out of bond_enslave() and called at
> appropriate sime, from bond_enslave() and from other locations, in
> particular when changing mode.
>
I think Veaceslav is working on this, but there is significant
re-organization that is needed to make it work properly and make sure it
is tested. I could be wrong about how long it will take him, but to
test it properly it will take some time.
Since this problem seems like a pretty major problem and now Debian,
Fedora, RHEL, and Ubuntu all seem to have proper initialization scripts
to handle it, I stand behind my original ACK.
^ permalink raw reply
* [PATCH net-next v6 0/9] net-sysfs+forcedeth: stats & debug enhancements
From: David Decotigny @ 2011-11-16 22:15 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
Richard Jones, Ayaz Abdulla, David Decotigny
These changes provide a new generic network tx_timeout sysfs queue
attribute and implement the ndo_get_stats64 API for forcedeth. They
also add a few more stats and debugging features for forcedeth. They
ensure that stats updates are correct in SMP systems, 32 or 64-bits.
Thanks to all the reviewers of the previous versions of this series!
Note: patch 1 is the cherry-pick of 898bdf2cb43e ("forcedeth: fix
stats on hardware without extended stats support")
Changes since v5:
- get_stats64: back to using u64_stats_sync.h instead of atomic
integers, but this time with 2 seqcounts: one for TX, one for RX
Changes since v4:
- tx_timeout counter now a generic sysfs attribute. Credits to
Stephen Hemminger for initial implementation
- revert get_stats64 to using atomic variables: see
http://patchwork.ozlabs.org/patch/125861/ for motivations
- dropped patch "expose module parameters in /sys/module" for now: I
will work on this later, following Stephen's recommendations
(http://patchwork.ozlabs.org/patch/125862/).
Changes since v3:
- updated get_stats64 + rx_dropped patches to use u64_stats_sync.h
- dropped indentation "whitespace/indentation fixes" (included in
get_stats64 api patch)
Changes since v2:
- patch 1/9 is the cherry-pick of 898bdf2cb43e ("forcedeth: fix
stats on hardware without extended stats support")
- removed patch 5/10 "stats for rx_packets based on hardware
registers" because packets&bytes stats are updated in software
only (898bdf2cb43e)
Changes since v1:
- patch 1/10 is the same as
http://patchwork.ozlabs.org/patch/125017/ (targetting net)
- other patches updated to take patch 1/10 into account
- various commit message updates
Tested:
~150Mbps incoming TCP, ethtool -S in a loop, x86_64 16-way:
tx_bytes: 5441989419
rx_packets: 5439224
tx_timeout: 0
tx_packets: 5456705
rx_bytes: 5566763850
Tested:
pktgen + loopback report same RX/TX packets and bytes stats
Tested:
tests above with Kconfig DEBUG_PAGEALLOC DEBUG_MUTEXES
DEBUG_SPINLOCK LOCKUP_DETECTOR DEBUG_RT_MUTEXES DEBUG_LOCK_ALLOC
PROVE_LOCKING DEBUG_ATOMIC_SLEEP DEBUG_STACK_USAGE DEBUG_KOBJECT
DEBUG_VM DEBUG_LIST DEBUG_SG DEBUG_NOTIFIERS TEST_KSTRTOX
STRICT_DEVMEM DEBUG_STACKOVERFLOW
############################################
# Patch Set Summary:
David Decotigny (6):
net-sysfs: fixed minor sparse warning
kbuild: document RPS/XPS network Kconfig options
net: new counter for tx_timeout errors in sysfs
forcedeth: implement ndo_get_stats64() API
forcedeth: account for dropped RX frames
forcedeth: stats updated with a deferrable timer
Mike Ditto (1):
forcedeth: Add messages to indicate using MSI or MSI-X
Sameer Nanda (1):
forcedeth: allow to silence "TX timeout" debug messages
david decotigny (1):
forcedeth: fix stats on hardware without extended stats support
drivers/net/ethernet/nvidia/forcedeth.c | 321 ++++++++++++++++++++++---------
include/linux/netdevice.h | 12 +-
net/Kconfig | 22 ++-
net/core/net-sysfs.c | 49 ++++--
net/sched/sch_generic.c | 1 +
5 files changed, 296 insertions(+), 109 deletions(-)
--
1.7.3.1
^ permalink raw reply
* [PATCH net-next v6 1/9] forcedeth: fix stats on hardware without extended stats support
From: David Decotigny @ 2011-11-16 22:15 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
Richard Jones, Ayaz Abdulla, david decotigny
In-Reply-To: <cover.1321481064.git.david.decotigny@google.com>
From: david decotigny <david.decotigny@google.com>
This change makes sure that tx_packets/rx_bytes ifconfig counters are
updated even on NICs that don't provide hardware support for these
stats: they are now updated in software. For the sake of consistency,
we also now have tx_bytes updated in software (hardware counters
include ethernet CRC, and software doesn't account for it).
This reverts parts of:
- "forcedeth: statistics optimization" (21828163b2)
- "forcedeth: Improve stats counters" (0bdfea8ba8)
- "forcedeth: remove unneeded stats updates" (4687f3f364)
Tested:
pktgen + loopback (http://patchwork.ozlabs.org/patch/124698/)
reports identical tx_packets/rx_packets and tx_bytes/rx_bytes.
Signed-off-by: David Decotigny <david.decotigny@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 898bdf2cb43eb0a962c397eb4dd1aec2c7211be2)
---
drivers/net/ethernet/nvidia/forcedeth.c | 36 +++++++++++++++++++++++-------
1 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index e8a5ae3..374625b 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -609,7 +609,7 @@ struct nv_ethtool_str {
};
static const struct nv_ethtool_str nv_estats_str[] = {
- { "tx_bytes" },
+ { "tx_bytes" }, /* includes Ethernet FCS CRC */
{ "tx_zero_rexmt" },
{ "tx_one_rexmt" },
{ "tx_many_rexmt" },
@@ -637,7 +637,7 @@ static const struct nv_ethtool_str nv_estats_str[] = {
/* version 2 stats */
{ "tx_deferral" },
{ "tx_packets" },
- { "rx_bytes" },
+ { "rx_bytes" }, /* includes Ethernet FCS CRC */
{ "tx_pause" },
{ "rx_pause" },
{ "rx_drop_frame" },
@@ -649,7 +649,7 @@ static const struct nv_ethtool_str nv_estats_str[] = {
};
struct nv_ethtool_stats {
- u64 tx_bytes;
+ u64 tx_bytes; /* should be ifconfig->tx_bytes + 4*tx_packets */
u64 tx_zero_rexmt;
u64 tx_one_rexmt;
u64 tx_many_rexmt;
@@ -670,14 +670,14 @@ struct nv_ethtool_stats {
u64 rx_unicast;
u64 rx_multicast;
u64 rx_broadcast;
- u64 rx_packets;
+ u64 rx_packets; /* should be ifconfig->rx_packets */
u64 rx_errors_total;
u64 tx_errors_total;
/* version 2 stats */
u64 tx_deferral;
- u64 tx_packets;
- u64 rx_bytes;
+ u64 tx_packets; /* should be ifconfig->tx_packets */
+ u64 rx_bytes; /* should be ifconfig->rx_bytes + 4*rx_packets */
u64 tx_pause;
u64 rx_pause;
u64 rx_drop_frame;
@@ -1706,10 +1706,17 @@ static struct net_device_stats *nv_get_stats(struct net_device *dev)
if (np->driver_data & (DEV_HAS_STATISTICS_V1|DEV_HAS_STATISTICS_V2|DEV_HAS_STATISTICS_V3)) {
nv_get_hw_stats(dev);
+ /*
+ * Note: because HW stats are not always available and
+ * for consistency reasons, the following ifconfig
+ * stats are managed by software: rx_bytes, tx_bytes,
+ * rx_packets and tx_packets. The related hardware
+ * stats reported by ethtool should be equivalent to
+ * these ifconfig stats, with 4 additional bytes per
+ * packet (Ethernet FCS CRC).
+ */
+
/* copy to net_device stats */
- dev->stats.tx_packets = np->estats.tx_packets;
- dev->stats.rx_bytes = np->estats.rx_bytes;
- dev->stats.tx_bytes = np->estats.tx_bytes;
dev->stats.tx_fifo_errors = np->estats.tx_fifo_errors;
dev->stats.tx_carrier_errors = np->estats.tx_carrier_errors;
dev->stats.rx_crc_errors = np->estats.rx_crc_errors;
@@ -2380,6 +2387,9 @@ static int nv_tx_done(struct net_device *dev, int limit)
if (flags & NV_TX_ERROR) {
if ((flags & NV_TX_RETRYERROR) && !(flags & NV_TX_RETRYCOUNT_MASK))
nv_legacybackoff_reseed(dev);
+ } else {
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
}
dev_kfree_skb_any(np->get_tx_ctx->skb);
np->get_tx_ctx->skb = NULL;
@@ -2390,6 +2400,9 @@ static int nv_tx_done(struct net_device *dev, int limit)
if (flags & NV_TX2_ERROR) {
if ((flags & NV_TX2_RETRYERROR) && !(flags & NV_TX2_RETRYCOUNT_MASK))
nv_legacybackoff_reseed(dev);
+ } else {
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
}
dev_kfree_skb_any(np->get_tx_ctx->skb);
np->get_tx_ctx->skb = NULL;
@@ -2429,6 +2442,9 @@ static int nv_tx_done_optimized(struct net_device *dev, int limit)
else
nv_legacybackoff_reseed(dev);
}
+ } else {
+ dev->stats.tx_packets++;
+ dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
}
dev_kfree_skb_any(np->get_tx_ctx->skb);
@@ -2678,6 +2694,7 @@ static int nv_rx_process(struct net_device *dev, int limit)
skb->protocol = eth_type_trans(skb, dev);
napi_gro_receive(&np->napi, skb);
dev->stats.rx_packets++;
+ dev->stats.rx_bytes += len;
next_pkt:
if (unlikely(np->get_rx.orig++ == np->last_rx.orig))
np->get_rx.orig = np->first_rx.orig;
@@ -2761,6 +2778,7 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
}
napi_gro_receive(&np->napi, skb);
dev->stats.rx_packets++;
+ dev->stats.rx_bytes += len;
} else {
dev_kfree_skb(skb);
}
--
1.7.3.1
^ permalink raw reply related
* [PATCH net-next v6 2/9] net-sysfs: fixed minor sparse warning
From: David Decotigny @ 2011-11-16 22:15 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321481064.git.david.decotigny@google.com>
This commit fixes following warning:
net/core/net-sysfs.c:921:6: warning: symbol 'numa_node' shadows an earlier one
include/linux/topology.h:222:1: originally declared here
Signed-off-by: David Decotigny <david.decotigny@google.com>
---
net/core/net-sysfs.c | 12 ++++++------
1 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c71c434..a64382f 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -901,7 +901,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
struct xps_map *map, *new_map;
struct xps_dev_maps *dev_maps, *new_dev_maps;
int nonempty = 0;
- int numa_node = -2;
+ int numa_node_id = -2;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -944,10 +944,10 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu);
#ifdef CONFIG_NUMA
if (need_set) {
- if (numa_node == -2)
- numa_node = cpu_to_node(cpu);
- else if (numa_node != cpu_to_node(cpu))
- numa_node = -1;
+ if (numa_node_id == -2)
+ numa_node_id = cpu_to_node(cpu);
+ else if (numa_node_id != cpu_to_node(cpu))
+ numa_node_id = -1;
}
#endif
if (need_set && pos >= map_len) {
@@ -997,7 +997,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
if (dev_maps)
kfree_rcu(dev_maps, rcu);
- netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node :
+ netdev_queue_numa_node_write(queue, (numa_node_id >= 0) ? numa_node_id :
NUMA_NO_NODE);
mutex_unlock(&xps_map_mutex);
--
1.7.3.1
^ permalink raw reply related
* [PATCH net-next v6 4/9] net: new counter for tx_timeout errors in sysfs
From: David Decotigny @ 2011-11-16 22:15 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321481064.git.david.decotigny@google.com>
This adds the /sys/class/net/DEV/queues/Q/tx_timeout attribute
containing the total number of timeout events on the given queue. It
is always available with CONFIG_SYSFS, independently of
CONFIG_RPS/XPS.
Credits to Stephen Hemminger for a preliminary version of this patch.
Tested:
without CONFIG_SYSFS (compilation only)
with sysfs and without CONFIG_RPS & CONFIG_XPS
with sysfs and without CONFIG_RPS
with sysfs and without CONFIG_XPS
with defaults
Signed-off-by: David Decotigny <david.decotigny@google.com>
---
include/linux/netdevice.h | 12 ++++++++++--
net/core/net-sysfs.c | 37 +++++++++++++++++++++++++++++++------
net/sched/sch_generic.c | 1 +
3 files changed, 42 insertions(+), 8 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cbeb586..9e6cf09 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -530,7 +530,7 @@ struct netdev_queue {
struct Qdisc *qdisc;
unsigned long state;
struct Qdisc *qdisc_sleeping;
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
struct kobject kobj;
#endif
#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
@@ -545,6 +545,12 @@ struct netdev_queue {
* please use this field instead of dev->trans_start
*/
unsigned long trans_start;
+
+ /*
+ * Number of TX timeouts for this queue
+ * (/sys/class/net/DEV/Q/trans_timeout)
+ */
+ unsigned long trans_timeout;
} ____cacheline_aligned_in_smp;
static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
@@ -1184,9 +1190,11 @@ struct net_device {
unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
struct kset *queues_kset;
+#endif
+#ifdef CONFIG_RPS
struct netdev_rx_queue *_rx;
/* Number of RX queues allocated at register_netdev() time */
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index a64382f..602b141 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -780,7 +780,7 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
#endif
}
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SYSFS
/*
* netdev_queue sysfs structures and functions.
*/
@@ -826,6 +826,23 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
.store = netdev_queue_attr_store,
};
+static ssize_t show_trans_timeout(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
+{
+ unsigned long trans_timeout;
+
+ spin_lock_irq(&queue->_xmit_lock);
+ trans_timeout = queue->trans_timeout;
+ spin_unlock_irq(&queue->_xmit_lock);
+
+ return sprintf(buf, "%lu", trans_timeout);
+}
+
+static struct netdev_queue_attribute queue_trans_timeout =
+ __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
+
+#ifdef CONFIG_XPS
static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
{
struct net_device *dev = queue->dev;
@@ -1020,12 +1037,17 @@ error:
static struct netdev_queue_attribute xps_cpus_attribute =
__ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+#endif /* CONFIG_XPS */
static struct attribute *netdev_queue_default_attrs[] = {
+ &queue_trans_timeout.attr,
+#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
+#endif
NULL
};
+#ifdef CONFIG_XPS
static void netdev_queue_release(struct kobject *kobj)
{
struct netdev_queue *queue = to_netdev_queue(kobj);
@@ -1076,10 +1098,13 @@ static void netdev_queue_release(struct kobject *kobj)
memset(kobj, 0, sizeof(*kobj));
dev_put(queue->dev);
}
+#endif /* CONFIG_XPS */
static struct kobj_type netdev_queue_ktype = {
.sysfs_ops = &netdev_queue_sysfs_ops,
+#ifdef CONFIG_XPS
.release = netdev_queue_release,
+#endif
.default_attrs = netdev_queue_default_attrs,
};
@@ -1102,12 +1127,12 @@ static int netdev_queue_add_kobject(struct net_device *net, int index)
return error;
}
-#endif /* CONFIG_XPS */
+#endif /* CONFIG_SYSFS */
int
netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
{
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SYSFS
int i;
int error = 0;
@@ -1125,14 +1150,14 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
return error;
#else
return 0;
-#endif
+#endif /* CONFIG_SYSFS */
}
static int register_queue_kobjects(struct net_device *net)
{
int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
net->queues_kset = kset_create_and_add("queues",
NULL, &net->dev.kobj);
if (!net->queues_kset)
@@ -1173,7 +1198,7 @@ static void remove_queue_kobjects(struct net_device *net)
net_rx_queue_update_kobjects(net, real_rx, 0);
netdev_queue_update_kobjects(net, real_tx, 0);
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
kset_unregister(net->queues_kset);
#endif
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 69fca27..79ac145 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -246,6 +246,7 @@ static void dev_watchdog(unsigned long arg)
time_after(jiffies, (trans_start +
dev->watchdog_timeo))) {
some_queue_timedout = 1;
+ txq->trans_timeout++;
break;
}
}
--
1.7.3.1
^ permalink raw reply related
* [PATCH net-next v6 5/9] forcedeth: Add messages to indicate using MSI or MSI-X
From: David Decotigny @ 2011-11-16 22:15 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
Richard Jones, Ayaz Abdulla, Mike Ditto, David Decotigny
In-Reply-To: <cover.1321481064.git.david.decotigny@google.com>
From: Mike Ditto <mditto@google.com>
This adds a few kernel messages to indicate whether PCIe interrupts
are signaled with MSI or MSI-X.
Signed-off-by: David Decotigny <david.decotigny@google.com>
---
drivers/net/ethernet/nvidia/forcedeth.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 374625b..fe17e42 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -3810,6 +3810,7 @@ static int nv_request_irq(struct net_device *dev, int intr_test)
writel(0, base + NvRegMSIXMap0);
writel(0, base + NvRegMSIXMap1);
}
+ netdev_info(dev, "MSI-X enabled\n");
}
}
if (ret != 0 && np->msi_flags & NV_MSI_CAPABLE) {
@@ -3831,6 +3832,7 @@ static int nv_request_irq(struct net_device *dev, int intr_test)
writel(0, base + NvRegMSIMap1);
/* enable msi vector 0 */
writel(NVREG_MSI_VECTOR_0_ENABLED, base + NvRegMSIIrqMask);
+ netdev_info(dev, "MSI enabled\n");
}
}
if (ret != 0) {
--
1.7.3.1
^ permalink raw reply related
* [PATCH net-next v6 6/9] forcedeth: allow to silence "TX timeout" debug messages
From: David Decotigny @ 2011-11-16 22:15 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
Richard Jones, Ayaz Abdulla, Sameer Nanda, David Decotigny
In-Reply-To: <cover.1321481064.git.david.decotigny@google.com>
From: Sameer Nanda <snanda@google.com>
This adds a new module parameter "debug_tx_timeout" to silence most
debug messages in case of TX timeout. These messages don't provide a
signal/noise ratio high enough for production systems and, with ~30kB
logged each time, they tend to add to a cascade effect if the system
is already under stress (memory pressure, disk, etc.).
By default, the parameter is clear, meaning that only a single warning
will be reported.
Signed-off-by: David Decotigny <david.decotigny@google.com>
---
drivers/net/ethernet/nvidia/forcedeth.c | 98 ++++++++++++++++++-------------
1 files changed, 57 insertions(+), 41 deletions(-)
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index fe17e42..9b917ff 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -892,6 +892,11 @@ enum {
static int dma_64bit = NV_DMA_64BIT_ENABLED;
/*
+ * Debug output control for tx_timeout
+ */
+static bool debug_tx_timeout = false;
+
+/*
* Crossover Detection
* Realtek 8201 phy + some OEM boards do not work properly.
*/
@@ -2477,56 +2482,64 @@ static void nv_tx_timeout(struct net_device *dev)
u32 status;
union ring_type put_tx;
int saved_tx_limit;
- int i;
if (np->msi_flags & NV_MSI_X_ENABLED)
status = readl(base + NvRegMSIXIrqStatus) & NVREG_IRQSTAT_MASK;
else
status = readl(base + NvRegIrqStatus) & NVREG_IRQSTAT_MASK;
- netdev_info(dev, "Got tx_timeout. irq: %08x\n", status);
+ netdev_warn(dev, "Got tx_timeout. irq status: %08x\n", status);
- netdev_info(dev, "Ring at %lx\n", (unsigned long)np->ring_addr);
- netdev_info(dev, "Dumping tx registers\n");
- for (i = 0; i <= np->register_size; i += 32) {
- netdev_info(dev,
- "%3x: %08x %08x %08x %08x %08x %08x %08x %08x\n",
- i,
- readl(base + i + 0), readl(base + i + 4),
- readl(base + i + 8), readl(base + i + 12),
- readl(base + i + 16), readl(base + i + 20),
- readl(base + i + 24), readl(base + i + 28));
- }
- netdev_info(dev, "Dumping tx ring\n");
- for (i = 0; i < np->tx_ring_size; i += 4) {
- if (!nv_optimized(np)) {
- netdev_info(dev,
- "%03x: %08x %08x // %08x %08x // %08x %08x // %08x %08x\n",
- i,
- le32_to_cpu(np->tx_ring.orig[i].buf),
- le32_to_cpu(np->tx_ring.orig[i].flaglen),
- le32_to_cpu(np->tx_ring.orig[i+1].buf),
- le32_to_cpu(np->tx_ring.orig[i+1].flaglen),
- le32_to_cpu(np->tx_ring.orig[i+2].buf),
- le32_to_cpu(np->tx_ring.orig[i+2].flaglen),
- le32_to_cpu(np->tx_ring.orig[i+3].buf),
- le32_to_cpu(np->tx_ring.orig[i+3].flaglen));
- } else {
+ if (unlikely(debug_tx_timeout)) {
+ int i;
+
+ netdev_info(dev, "Ring at %lx\n", (unsigned long)np->ring_addr);
+ netdev_info(dev, "Dumping tx registers\n");
+ for (i = 0; i <= np->register_size; i += 32) {
netdev_info(dev,
- "%03x: %08x %08x %08x // %08x %08x %08x // %08x %08x %08x // %08x %08x %08x\n",
+ "%3x: %08x %08x %08x %08x "
+ "%08x %08x %08x %08x\n",
i,
- le32_to_cpu(np->tx_ring.ex[i].bufhigh),
- le32_to_cpu(np->tx_ring.ex[i].buflow),
- le32_to_cpu(np->tx_ring.ex[i].flaglen),
- le32_to_cpu(np->tx_ring.ex[i+1].bufhigh),
- le32_to_cpu(np->tx_ring.ex[i+1].buflow),
- le32_to_cpu(np->tx_ring.ex[i+1].flaglen),
- le32_to_cpu(np->tx_ring.ex[i+2].bufhigh),
- le32_to_cpu(np->tx_ring.ex[i+2].buflow),
- le32_to_cpu(np->tx_ring.ex[i+2].flaglen),
- le32_to_cpu(np->tx_ring.ex[i+3].bufhigh),
- le32_to_cpu(np->tx_ring.ex[i+3].buflow),
- le32_to_cpu(np->tx_ring.ex[i+3].flaglen));
+ readl(base + i + 0), readl(base + i + 4),
+ readl(base + i + 8), readl(base + i + 12),
+ readl(base + i + 16), readl(base + i + 20),
+ readl(base + i + 24), readl(base + i + 28));
+ }
+ netdev_info(dev, "Dumping tx ring\n");
+ for (i = 0; i < np->tx_ring_size; i += 4) {
+ if (!nv_optimized(np)) {
+ netdev_info(dev,
+ "%03x: %08x %08x // %08x %08x "
+ "// %08x %08x // %08x %08x\n",
+ i,
+ le32_to_cpu(np->tx_ring.orig[i].buf),
+ le32_to_cpu(np->tx_ring.orig[i].flaglen),
+ le32_to_cpu(np->tx_ring.orig[i+1].buf),
+ le32_to_cpu(np->tx_ring.orig[i+1].flaglen),
+ le32_to_cpu(np->tx_ring.orig[i+2].buf),
+ le32_to_cpu(np->tx_ring.orig[i+2].flaglen),
+ le32_to_cpu(np->tx_ring.orig[i+3].buf),
+ le32_to_cpu(np->tx_ring.orig[i+3].flaglen));
+ } else {
+ netdev_info(dev,
+ "%03x: %08x %08x %08x "
+ "// %08x %08x %08x "
+ "// %08x %08x %08x "
+ "// %08x %08x %08x\n",
+ i,
+ le32_to_cpu(np->tx_ring.ex[i].bufhigh),
+ le32_to_cpu(np->tx_ring.ex[i].buflow),
+ le32_to_cpu(np->tx_ring.ex[i].flaglen),
+ le32_to_cpu(np->tx_ring.ex[i+1].bufhigh),
+ le32_to_cpu(np->tx_ring.ex[i+1].buflow),
+ le32_to_cpu(np->tx_ring.ex[i+1].flaglen),
+ le32_to_cpu(np->tx_ring.ex[i+2].bufhigh),
+ le32_to_cpu(np->tx_ring.ex[i+2].buflow),
+ le32_to_cpu(np->tx_ring.ex[i+2].flaglen),
+ le32_to_cpu(np->tx_ring.ex[i+3].bufhigh),
+ le32_to_cpu(np->tx_ring.ex[i+3].buflow),
+ le32_to_cpu(np->tx_ring.ex[i+3].flaglen));
+ }
}
}
@@ -6156,6 +6169,9 @@ module_param(phy_cross, int, 0);
MODULE_PARM_DESC(phy_cross, "Phy crossover detection for Realtek 8201 phy is enabled by setting to 1 and disabled by setting to 0.");
module_param(phy_power_down, int, 0);
MODULE_PARM_DESC(phy_power_down, "Power down phy and disable link when interface is down (1), or leave phy powered up (0).");
+module_param(debug_tx_timeout, bool, 0);
+MODULE_PARM_DESC(debug_tx_timeout,
+ "Dump tx related registers and ring when tx_timeout happens");
MODULE_AUTHOR("Manfred Spraul <manfred@colorfullife.com>");
MODULE_DESCRIPTION("Reverse Engineered nForce ethernet driver");
--
1.7.3.1
^ permalink raw reply related
* [PATCH net-next v6 9/9] forcedeth: stats updated with a deferrable timer
From: David Decotigny @ 2011-11-16 22:15 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321481064.git.david.decotigny@google.com>
Mark stats timer as deferrable: punctuality in waking the stats timer
callback doesn't matter much, as it is responsible only to avoid
integer wraparound.
We need at least 1 other timer to fire within 17s (fully loaded 1Gbps)
to avoid wrap-arounds. Desired period is still 10s.
Signed-off-by: David Decotigny <david.decotigny@google.com>
---
drivers/net/ethernet/nvidia/forcedeth.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 9bf993d..630174b 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -5531,7 +5531,7 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
init_timer(&np->nic_poll);
np->nic_poll.data = (unsigned long) dev;
np->nic_poll.function = nv_do_nic_poll; /* timer handler */
- init_timer(&np->stats_poll);
+ init_timer_deferrable(&np->stats_poll);
np->stats_poll.data = (unsigned long) dev;
np->stats_poll.function = nv_do_stats_poll; /* timer handler */
--
1.7.3.1
^ permalink raw reply related
* [PATCH net-next v6 3/9] kbuild: document RPS/XPS network Kconfig options
From: David Decotigny @ 2011-11-16 22:15 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321481064.git.david.decotigny@google.com>
This adds a description of RPS/XPS options and allow them to be
changed at make menuconfig time.
It also fixes following checkpatch syntax warnings:
ERROR: trailing whitespace
+^I $
ERROR: trailing whitespace
+^I$
Signed-off-by: David Decotigny <david.decotigny@google.com>
---
net/Kconfig | 22 +++++++++++++++++-----
1 files changed, 17 insertions(+), 5 deletions(-)
diff --git a/net/Kconfig b/net/Kconfig
index a073148..8e2104e 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -10,7 +10,7 @@ menuconfig NET
The reason is that some programs need kernel networking support even
when running on a stand-alone machine that isn't connected to any
other computer.
-
+
If you are upgrading from an older kernel, you
should consider updating your networking tools too because changes
in the kernel and the tools often go hand in hand. The tools are
@@ -217,20 +217,33 @@ source "net/dns_resolver/Kconfig"
source "net/batman-adv/Kconfig"
config RPS
- boolean
+ boolean "Enable Receive Packet Steering"
depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
default y
+ help
+ RPS distributes the load of received packet processing
+ across multiple CPUs. If unsure, say Y.
config RFS_ACCEL
- boolean
+ boolean "Enable Hardware Acceleration of RFS"
depends on RPS && GENERIC_HARDIRQS
select CPU_RMAP
default y
+ help
+ This is the hardware version of RPS. On multi-queue network
+ devices, this configures the hardware to distribute the
+ received packets across multiple CPUs. If unsure, say Y.
config XPS
- boolean
+ boolean "Enable Transmit Packet Steering"
depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
default y
+ help
+ For multiqueue devices, XPS selects a transmit queue during
+ packet transmission based on configuration. This is done by
+ mapping the CPU transmitting the packet to a queue. XPS can
+ reduce transmit network latency on SMP systems. If unsure,
+ say Y.
config HAVE_BPF_JIT
bool
@@ -274,7 +287,6 @@ config NET_TCPPROBE
Documentation on how to use TCP connection probing can be found
at:
-
http://www.linuxfoundation.org/collaborate/workgroups/networking/tcpprobe
To compile this code as a module, choose M here: the
--
1.7.3.1
^ permalink raw reply related
* [PATCH net-next v6 8/9] forcedeth: account for dropped RX frames
From: David Decotigny @ 2011-11-16 22:15 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321481064.git.david.decotigny@google.com>
This adds code to update the stats counter for dropped RX frames.
Signed-off-by: David Decotigny <david.decotigny@google.com>
---
drivers/net/ethernet/nvidia/forcedeth.c | 14 ++++++++++++--
1 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 427c9c1..9bf993d 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -815,6 +815,7 @@ struct fe_priv {
u64 stat_rx_packets;
u64 stat_rx_bytes; /* not always available in HW */
u64 stat_rx_missed_errors;
+ u64 stat_rx_dropped;
/* media detection workaround.
* Locking: Within irq hander or disable_irq+spin_lock(&np->lock);
@@ -1758,6 +1759,7 @@ nv_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
syncp_start = u64_stats_fetch_begin(&np->swstats_rx_syncp);
storage->rx_packets = np->stat_rx_packets;
storage->rx_bytes = np->stat_rx_bytes;
+ storage->rx_dropped = np->stat_rx_dropped;
storage->rx_missed_errors = np->stat_rx_missed_errors;
} while (u64_stats_fetch_retry(&np->swstats_rx_syncp, syncp_start));
@@ -1828,8 +1830,12 @@ static int nv_alloc_rx(struct net_device *dev)
np->put_rx.orig = np->first_rx.orig;
if (unlikely(np->put_rx_ctx++ == np->last_rx_ctx))
np->put_rx_ctx = np->first_rx_ctx;
- } else
+ } else {
+ u64_stats_update_begin(&np->swstats_rx_syncp);
+ np->stat_rx_dropped++;
+ u64_stats_update_end(&np->swstats_rx_syncp);
return 1;
+ }
}
return 0;
}
@@ -1860,8 +1866,12 @@ static int nv_alloc_rx_optimized(struct net_device *dev)
np->put_rx.ex = np->first_rx.ex;
if (unlikely(np->put_rx_ctx++ == np->last_rx_ctx))
np->put_rx_ctx = np->first_rx_ctx;
- } else
+ } else {
+ u64_stats_update_begin(&np->swstats_rx_syncp);
+ np->stat_rx_dropped++;
+ u64_stats_update_end(&np->swstats_rx_syncp);
return 1;
+ }
}
return 0;
}
--
1.7.3.1
^ permalink raw reply related
* [PATCH net-next v6 7/9] forcedeth: implement ndo_get_stats64() API
From: David Decotigny @ 2011-11-16 22:15 UTC (permalink / raw)
To: netdev, linux-kernel
Cc: David S. Miller, Ian Campbell, Eric Dumazet, Jeff Kirsher,
Ben Hutchings, Jiri Pirko, Joe Perches, Szymon Janc,
Richard Jones, Ayaz Abdulla, David Decotigny
In-Reply-To: <cover.1321481064.git.david.decotigny@google.com>
This commit implements the ndo_get_stats64() API for forcedeth. Since
hardware stats are being updated from different contexts (process and
timer), this commit adds synchronization. For software stats, it
relies on the u64_stats_sync.h API.
Tested:
- 16-way SMP x86_64 ->
RX bytes:7244556582 (7.2 GB) TX bytes:181904254 (181.9 MB)
- pktgen + loopback: identical rx_bytes/tx_bytes and rx_packets/tx_packets
Signed-off-by: David Decotigny <david.decotigny@google.com>
---
drivers/net/ethernet/nvidia/forcedeth.c | 197 +++++++++++++++++++++++--------
1 files changed, 146 insertions(+), 51 deletions(-)
diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 9b917ff..427c9c1 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -65,7 +65,8 @@
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/prefetch.h>
-#include <linux/io.h>
+#include <linux/u64_stats_sync.h>
+#include <linux/io.h>
#include <asm/irq.h>
#include <asm/system.h>
@@ -736,6 +737,16 @@ struct nv_skb_map {
* - tx setup is lockless: it relies on netif_tx_lock. Actual submission
* needs netdev_priv(dev)->lock :-(
* - set_multicast_list: preparation lockless, relies on netif_tx_lock.
+ *
+ * Hardware stats updates are protected by hwstats_lock:
+ * - updated by nv_do_stats_poll (timer). This is meant to avoid
+ * integer wraparound in the NIC stats registers, at low frequency
+ * (0.1 Hz)
+ * - updated by nv_get_ethtool_stats + nv_get_stats64
+ *
+ * Software stats are accessed only through 64b synchronization points
+ * and are not subject to other synchronization techniques (single
+ * update thread on the TX or RX paths).
*/
/* in dev: base, irq */
@@ -745,9 +756,10 @@ struct fe_priv {
struct net_device *dev;
struct napi_struct napi;
- /* General data:
- * Locking: spin_lock(&np->lock); */
+ /* hardware stats are updated in syscall and timer */
+ spinlock_t hwstats_lock;
struct nv_ethtool_stats estats;
+
int in_shutdown;
u32 linkspeed;
int duplex;
@@ -798,6 +810,12 @@ struct fe_priv {
u32 nic_poll_irq;
int rx_ring_size;
+ /* RX software stats */
+ struct u64_stats_sync swstats_rx_syncp;
+ u64 stat_rx_packets;
+ u64 stat_rx_bytes; /* not always available in HW */
+ u64 stat_rx_missed_errors;
+
/* media detection workaround.
* Locking: Within irq hander or disable_irq+spin_lock(&np->lock);
*/
@@ -820,6 +838,12 @@ struct fe_priv {
struct nv_skb_map *tx_end_flip;
int tx_stop;
+ /* TX software stats */
+ struct u64_stats_sync swstats_tx_syncp;
+ u64 stat_tx_packets; /* not always available in HW */
+ u64 stat_tx_bytes;
+ u64 stat_tx_dropped;
+
/* msi/msi-x fields */
u32 msi_flags;
struct msix_entry msi_x_entry[NV_MSI_X_MAX_VECTORS];
@@ -1635,11 +1659,19 @@ static void nv_mac_reset(struct net_device *dev)
pci_push(base);
}
-static void nv_get_hw_stats(struct net_device *dev)
+/* Caller must appropriately lock netdev_priv(dev)->hwstats_lock */
+static void nv_update_stats(struct net_device *dev)
{
struct fe_priv *np = netdev_priv(dev);
u8 __iomem *base = get_hwbase(dev);
+ /* If it happens that this is run in top-half context, then
+ * replace the spin_lock of hwstats_lock with
+ * spin_lock_irqsave() in calling functions. */
+ WARN_ONCE(in_irq(), "forcedeth: estats spin_lock(_bh) from top-half");
+ assert_spin_locked(&np->hwstats_lock);
+
+ /* query hardware */
np->estats.tx_bytes += readl(base + NvRegTxCnt);
np->estats.tx_zero_rexmt += readl(base + NvRegTxZeroReXmt);
np->estats.tx_one_rexmt += readl(base + NvRegTxOneReXmt);
@@ -1698,40 +1730,72 @@ static void nv_get_hw_stats(struct net_device *dev)
}
/*
- * nv_get_stats: dev->get_stats function
+ * nv_get_stats64: dev->ndo_get_stats64 function
* Get latest stats value from the nic.
* Called with read_lock(&dev_base_lock) held for read -
* only synchronized against unregister_netdevice.
*/
-static struct net_device_stats *nv_get_stats(struct net_device *dev)
+static struct rtnl_link_stats64*
+nv_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *storage)
+ __acquires(&netdev_priv(dev)->hwstats_lock)
+ __releases(&netdev_priv(dev)->hwstats_lock)
{
struct fe_priv *np = netdev_priv(dev);
+ unsigned int syncp_start;
+
+ /*
+ * Note: because HW stats are not always available and for
+ * consistency reasons, the following ifconfig stats are
+ * managed by software: rx_bytes, tx_bytes, rx_packets and
+ * tx_packets. The related hardware stats reported by ethtool
+ * should be equivalent to these ifconfig stats, with 4
+ * additional bytes per packet (Ethernet FCS CRC), except for
+ * tx_packets when TSO kicks in.
+ */
+
+ /* software stats */
+ do {
+ syncp_start = u64_stats_fetch_begin(&np->swstats_rx_syncp);
+ storage->rx_packets = np->stat_rx_packets;
+ storage->rx_bytes = np->stat_rx_bytes;
+ storage->rx_missed_errors = np->stat_rx_missed_errors;
+ } while (u64_stats_fetch_retry(&np->swstats_rx_syncp, syncp_start));
+
+ do {
+ syncp_start = u64_stats_fetch_begin(&np->swstats_tx_syncp);
+ storage->tx_packets = np->stat_tx_packets;
+ storage->tx_bytes = np->stat_tx_bytes;
+ storage->tx_dropped = np->stat_tx_dropped;
+ } while (u64_stats_fetch_retry(&np->swstats_tx_syncp, syncp_start));
/* If the nic supports hw counters then retrieve latest values */
- if (np->driver_data & (DEV_HAS_STATISTICS_V1|DEV_HAS_STATISTICS_V2|DEV_HAS_STATISTICS_V3)) {
- nv_get_hw_stats(dev);
+ if (np->driver_data & DEV_HAS_STATISTICS_V123) {
+ spin_lock_bh(&np->hwstats_lock);
- /*
- * Note: because HW stats are not always available and
- * for consistency reasons, the following ifconfig
- * stats are managed by software: rx_bytes, tx_bytes,
- * rx_packets and tx_packets. The related hardware
- * stats reported by ethtool should be equivalent to
- * these ifconfig stats, with 4 additional bytes per
- * packet (Ethernet FCS CRC).
- */
+ nv_update_stats(dev);
+
+ /* generic stats */
+ storage->rx_errors = np->estats.rx_errors_total;
+ storage->tx_errors = np->estats.tx_errors_total;
+
+ /* meaningful only when NIC supports stats v3 */
+ storage->multicast = np->estats.rx_multicast;
+
+ /* detailed rx_errors */
+ storage->rx_length_errors = np->estats.rx_length_error;
+ storage->rx_over_errors = np->estats.rx_over_errors;
+ storage->rx_crc_errors = np->estats.rx_crc_errors;
+ storage->rx_frame_errors = np->estats.rx_frame_align_error;
+ storage->rx_fifo_errors = np->estats.rx_drop_frame;
- /* copy to net_device stats */
- dev->stats.tx_fifo_errors = np->estats.tx_fifo_errors;
- dev->stats.tx_carrier_errors = np->estats.tx_carrier_errors;
- dev->stats.rx_crc_errors = np->estats.rx_crc_errors;
- dev->stats.rx_over_errors = np->estats.rx_over_errors;
- dev->stats.rx_fifo_errors = np->estats.rx_drop_frame;
- dev->stats.rx_errors = np->estats.rx_errors_total;
- dev->stats.tx_errors = np->estats.tx_errors_total;
+ /* detailed tx_errors */
+ storage->tx_carrier_errors = np->estats.tx_carrier_errors;
+ storage->tx_fifo_errors = np->estats.tx_fifo_errors;
+
+ spin_unlock_bh(&np->hwstats_lock);
}
- return &dev->stats;
+ return storage;
}
/*
@@ -1932,8 +1996,11 @@ static void nv_drain_tx(struct net_device *dev)
np->tx_ring.ex[i].bufhigh = 0;
np->tx_ring.ex[i].buflow = 0;
}
- if (nv_release_txskb(np, &np->tx_skb[i]))
- dev->stats.tx_dropped++;
+ if (nv_release_txskb(np, &np->tx_skb[i])) {
+ u64_stats_update_begin(&np->swstats_tx_syncp);
+ np->stat_tx_dropped++;
+ u64_stats_update_end(&np->swstats_tx_syncp);
+ }
np->tx_skb[i].dma = 0;
np->tx_skb[i].dma_len = 0;
np->tx_skb[i].dma_single = 0;
@@ -2390,11 +2457,14 @@ static int nv_tx_done(struct net_device *dev, int limit)
if (np->desc_ver == DESC_VER_1) {
if (flags & NV_TX_LASTPACKET) {
if (flags & NV_TX_ERROR) {
- if ((flags & NV_TX_RETRYERROR) && !(flags & NV_TX_RETRYCOUNT_MASK))
+ if ((flags & NV_TX_RETRYERROR)
+ && !(flags & NV_TX_RETRYCOUNT_MASK))
nv_legacybackoff_reseed(dev);
} else {
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
+ u64_stats_update_begin(&np->swstats_tx_syncp);
+ np->stat_tx_packets++;
+ np->stat_tx_bytes += np->get_tx_ctx->skb->len;
+ u64_stats_update_end(&np->swstats_tx_syncp);
}
dev_kfree_skb_any(np->get_tx_ctx->skb);
np->get_tx_ctx->skb = NULL;
@@ -2403,11 +2473,14 @@ static int nv_tx_done(struct net_device *dev, int limit)
} else {
if (flags & NV_TX2_LASTPACKET) {
if (flags & NV_TX2_ERROR) {
- if ((flags & NV_TX2_RETRYERROR) && !(flags & NV_TX2_RETRYCOUNT_MASK))
+ if ((flags & NV_TX2_RETRYERROR)
+ && !(flags & NV_TX2_RETRYCOUNT_MASK))
nv_legacybackoff_reseed(dev);
} else {
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
+ u64_stats_update_begin(&np->swstats_tx_syncp);
+ np->stat_tx_packets++;
+ np->stat_tx_bytes += np->get_tx_ctx->skb->len;
+ u64_stats_update_end(&np->swstats_tx_syncp);
}
dev_kfree_skb_any(np->get_tx_ctx->skb);
np->get_tx_ctx->skb = NULL;
@@ -2441,15 +2514,18 @@ static int nv_tx_done_optimized(struct net_device *dev, int limit)
if (flags & NV_TX2_LASTPACKET) {
if (flags & NV_TX2_ERROR) {
- if ((flags & NV_TX2_RETRYERROR) && !(flags & NV_TX2_RETRYCOUNT_MASK)) {
+ if ((flags & NV_TX2_RETRYERROR)
+ && !(flags & NV_TX2_RETRYCOUNT_MASK)) {
if (np->driver_data & DEV_HAS_GEAR_MODE)
nv_gear_backoff_reseed(dev);
else
nv_legacybackoff_reseed(dev);
}
} else {
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += np->get_tx_ctx->skb->len;
+ u64_stats_update_begin(&np->swstats_tx_syncp);
+ np->stat_tx_packets++;
+ np->stat_tx_bytes += np->get_tx_ctx->skb->len;
+ u64_stats_update_end(&np->swstats_tx_syncp);
}
dev_kfree_skb_any(np->get_tx_ctx->skb);
@@ -2662,8 +2738,11 @@ static int nv_rx_process(struct net_device *dev, int limit)
}
/* the rest are hard errors */
else {
- if (flags & NV_RX_MISSEDFRAME)
- dev->stats.rx_missed_errors++;
+ if (flags & NV_RX_MISSEDFRAME) {
+ u64_stats_update_begin(&np->swstats_rx_syncp);
+ np->stat_rx_missed_errors++;
+ u64_stats_update_end(&np->swstats_rx_syncp);
+ }
dev_kfree_skb(skb);
goto next_pkt;
}
@@ -2706,8 +2785,10 @@ static int nv_rx_process(struct net_device *dev, int limit)
skb_put(skb, len);
skb->protocol = eth_type_trans(skb, dev);
napi_gro_receive(&np->napi, skb);
- dev->stats.rx_packets++;
- dev->stats.rx_bytes += len;
+ u64_stats_update_begin(&np->swstats_rx_syncp);
+ np->stat_rx_packets++;
+ np->stat_rx_bytes += len;
+ u64_stats_update_end(&np->swstats_rx_syncp);
next_pkt:
if (unlikely(np->get_rx.orig++ == np->last_rx.orig))
np->get_rx.orig = np->first_rx.orig;
@@ -2790,8 +2871,10 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
__vlan_hwaccel_put_tag(skb, vid);
}
napi_gro_receive(&np->napi, skb);
- dev->stats.rx_packets++;
- dev->stats.rx_bytes += len;
+ u64_stats_update_begin(&np->swstats_rx_syncp);
+ np->stat_rx_packets++;
+ np->stat_rx_bytes += len;
+ u64_stats_update_end(&np->swstats_rx_syncp);
} else {
dev_kfree_skb(skb);
}
@@ -4000,11 +4083,18 @@ static void nv_poll_controller(struct net_device *dev)
#endif
static void nv_do_stats_poll(unsigned long data)
+ __acquires(&netdev_priv(dev)->hwstats_lock)
+ __releases(&netdev_priv(dev)->hwstats_lock)
{
struct net_device *dev = (struct net_device *) data;
struct fe_priv *np = netdev_priv(dev);
- nv_get_hw_stats(dev);
+ /* If lock is currently taken, the stats are being refreshed
+ * and hence fresh enough */
+ if (spin_trylock(&np->hwstats_lock)) {
+ nv_update_stats(dev);
+ spin_unlock(&np->hwstats_lock);
+ }
if (!np->in_shutdown)
mod_timer(&np->stats_poll,
@@ -4711,14 +4801,18 @@ static int nv_get_sset_count(struct net_device *dev, int sset)
}
}
-static void nv_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *estats, u64 *buffer)
+static void nv_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *estats, u64 *buffer)
+ __acquires(&netdev_priv(dev)->hwstats_lock)
+ __releases(&netdev_priv(dev)->hwstats_lock)
{
struct fe_priv *np = netdev_priv(dev);
- /* update stats */
- nv_get_hw_stats(dev);
-
- memcpy(buffer, &np->estats, nv_get_sset_count(dev, ETH_SS_STATS)*sizeof(u64));
+ spin_lock_bh(&np->hwstats_lock);
+ nv_update_stats(dev);
+ memcpy(buffer, &np->estats,
+ nv_get_sset_count(dev, ETH_SS_STATS)*sizeof(u64));
+ spin_unlock_bh(&np->hwstats_lock);
}
static int nv_link_test(struct net_device *dev)
@@ -5362,7 +5456,7 @@ static int nv_close(struct net_device *dev)
static const struct net_device_ops nv_netdev_ops = {
.ndo_open = nv_open,
.ndo_stop = nv_close,
- .ndo_get_stats = nv_get_stats,
+ .ndo_get_stats64 = nv_get_stats64,
.ndo_start_xmit = nv_start_xmit,
.ndo_tx_timeout = nv_tx_timeout,
.ndo_change_mtu = nv_change_mtu,
@@ -5379,7 +5473,7 @@ static const struct net_device_ops nv_netdev_ops = {
static const struct net_device_ops nv_netdev_ops_optimized = {
.ndo_open = nv_open,
.ndo_stop = nv_close,
- .ndo_get_stats = nv_get_stats,
+ .ndo_get_stats64 = nv_get_stats64,
.ndo_start_xmit = nv_start_xmit_optimized,
.ndo_tx_timeout = nv_tx_timeout,
.ndo_change_mtu = nv_change_mtu,
@@ -5418,6 +5512,7 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
np->dev = dev;
np->pci_dev = pci_dev;
spin_lock_init(&np->lock);
+ spin_lock_init(&np->hwstats_lock);
SET_NETDEV_DEV(dev, &pci_dev->dev);
init_timer(&np->oom_kick);
--
1.7.3.1
^ permalink raw reply related
* Re: [RFC PATCH net-next v2] enable virtio_net to return bus_info in ethtool -i consistent with emulated NICs
From: David Miller @ 2011-11-16 22:27 UTC (permalink / raw)
To: raj; +Cc: netdev, virtualization, mst
In-Reply-To: <20111115001708.72ED0290054F@tardy>
From: raj@tardy.cup.hp.com (Rick Jones)
Date: Mon, 14 Nov 2011 16:17:08 -0800 (PST)
> From: Rick Jones <rick.jones2@hp.com>
>
> Add a new .bus_name to virtio_config_ops then modify virtio_net to
> call through to it in an ethtool .get_drvinfo routine to report
> bus_info in ethtool -i output which is consistent with other
> emulated NICs and the output of lspci.
>
> Signed-off-by: Rick Jones <rick.jones2@hp.com>
>
> ---
>
> The changes to drivers/lguest/lguest_device.c, drivers/s390/kvm/kvm_virtio.c,
> and drivers/virtio/virtio_mmio.c code inspected only, not compiled.
Applied, thanks Rick.
^ permalink raw reply
* Re: [PATCH v5 4/9] net: introduce and use netdev_features_t for device features sets
From: Ben Hutchings @ 2011-11-16 22:30 UTC (permalink / raw)
To: Michał Mirosław; +Cc: netdev, David S. Miller
In-Reply-To: <5d60e779e1f7c316783ea59c194af3e5f57ed6d2.1321404954.git.mirq-linux@rere.qmqm.pl>
On Wed, 2011-11-16 at 02:29 +0100, Michał Mirosław wrote:
[...]
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
[...]
> @@ -3538,7 +3538,7 @@ static int __devinit init_one(struct pci_dev *pdev,
> {
> int func, i, err;
> struct port_info *pi;
> - unsigned int highdma = 0;
> + bool highdma = false;
> struct adapter *adapter = NULL;
>
> printk_once(KERN_INFO "%s - version %s\n", DRV_DESC, DRV_VERSION);
> @@ -3564,7 +3564,7 @@ static int __devinit init_one(struct pci_dev *pdev,
> }
>
> if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
> - highdma = NETIF_F_HIGHDMA;
> + highdma = true;
> err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
> if (err) {
> dev_err(&pdev->dev, "unable to obtain 64-bit DMA for "
> @@ -3638,7 +3638,9 @@ static int __devinit init_one(struct pci_dev *pdev,
> NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
> NETIF_F_RXCSUM | NETIF_F_RXHASH |
> NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
> - netdev->features |= netdev->hw_features | highdma;
> + if (highdma)
> + netdev->hw_features |= NETIF_F_HIGHDMA;
This wasn't previously included in hw_features. Since hidma was being
used as a bitmask in this function, not a boolean, why don't you just
change its type to netdev_features_t for now?
> + netdev->features |= netdev->hw_features;
> netdev->vlan_features = netdev->features & VLAN_FEAT;
>
> netdev->priv_flags |= IFF_UNICAST_FLT;
[....]
> --- a/drivers/net/ethernet/jme.c
> +++ b/drivers/net/ethernet/jme.c
> @@ -1917,7 +1917,7 @@ jme_map_tx_skb(struct jme_adapter *jme, struct sk_buff *skb, int idx)
> struct jme_ring *txring = &(jme->txring[0]);
> struct txdesc *txdesc = txring->desc, *ctxdesc;
> struct jme_buffer_info *txbi = txring->bufinf, *ctxbi;
> - u8 hidma = jme->dev->features & NETIF_F_HIGHDMA;
> + u8 hidma = !!(jme->dev->features & NETIF_F_HIGHDMA);
The original here is nasty! But you can change the type to bool and
then there is no need for the '!!'.
[...]
> --- a/drivers/net/ethernet/marvell/mv643xx_eth.c
> +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
> @@ -1579,10 +1579,10 @@ mv643xx_eth_set_ringparam(struct net_device *dev, struct ethtool_ringparam *er)
>
>
> static int
> -mv643xx_eth_set_features(struct net_device *dev, u32 features)
> +mv643xx_eth_set_features(struct net_device *dev, netdev_features_t features)
> {
> struct mv643xx_eth_private *mp = netdev_priv(dev);
> - u32 rx_csum = features & NETIF_F_RXCSUM;
> + int rx_csum = !!(features & NETIF_F_RXCSUM);
Again, change the type to bool.
[...]
> --- a/drivers/net/ethernet/marvell/sky2.c
> +++ b/drivers/net/ethernet/marvell/sky2.c
[...]
> -static int sky2_set_features(struct net_device *dev, u32 features)
> +static int sky2_set_features(struct net_device *dev, netdev_features_t features)
> {
> struct sky2_port *sky2 = netdev_priv(dev);
> - u32 changed = dev->features ^ features;
> + netdev_features_t changed = dev->features ^ features;
>
> if (changed & NETIF_F_RXCSUM) {
> - u32 on = features & NETIF_F_RXCSUM;
> + int on = !!(features & NETIF_F_RXCSUM);
Same here.
[...]
> --- a/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
> +++ b/drivers/net/ethernet/myricom/myri10ge/myri10ge.c
> @@ -1491,7 +1491,7 @@ myri10ge_clean_rx_done(struct myri10ge_slice_state *ss, int budget)
> * access to avoid theoretical race condition with functions that
> * change NETIF_F_LRO flag at runtime.
> */
> - bool lro_enabled = ACCESS_ONCE(mgp->dev->features) & NETIF_F_LRO;
> + bool lro_enabled = !!(ACCESS_ONCE(mgp->dev->features) & NETIF_F_LRO);
No change required here.
[...]
> --- a/drivers/net/xen-netfront.c
> +++ b/drivers/net/xen-netfront.c
> @@ -203,7 +203,7 @@ static void xennet_sysfs_delif(struct net_device *netdev);
>
> static int xennet_can_sg(struct net_device *dev)
> {
> - return dev->features & NETIF_F_SG;
> + return !!(dev->features & NETIF_F_SG);
> }
[...]
You could change the return type to bool.
Ben.
--
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply
* Re: Unable to flush ICMP redirect routes in kernel 3.0+
From: Ivan Zahariev @ 2011-11-16 22:32 UTC (permalink / raw)
To: netdev
In-Reply-To: <1321391355.2602.0.camel@edumazet-laptop>
On 11/15/2011 11:09 PM, Eric Dumazet wrote:
> Le mardi 15 novembre 2011 à 22:23 +0200, Ivan Zahariev a écrit :
>> Hello,
>>
>> We have changed nothing in our network infrastructure but only upgraded
>> from Linux kernel 2.6.36.2 to 3.0.3. Here is the problem we are
>> experiencing:
>>
>> ICMP redirected routes are cached forever, and they can be cleared only
>> by a reboot.
>>
>> Here is an example:
>>
>> root@machine5:~# ip route get 1.1.1.1
>> 1.1.1.1 via 9.0.0.1 dev eth0 src 5.5.5.5
>> cache<redirected> ipid 0xfb5d rtt 1475ms rttvar 450ms cwnd 10
>>
>> root@machine5:~# ip route list cache match 1.1.1.1
>> 1.1.1.1 tos lowdelay via 9.0.0.1 dev eth0 src 5.5.5.5
>> cache<redirected> ipid 0xfb5d rtt 1475ms rttvar 450ms cwnd 10
>> 1.1.1.1 via 9.0.0.1 dev eth0 src 5.5.5.5
>> cache<redirected> ipid 0xfb5d rtt 1475ms rttvar 450ms cwnd 10
>> ...(two more entries, all go via 9.0.0.1)...
>>
>> 1.1.1.1 is the test destination address
>> 5.5.5.5 is the source IP address of "machine5" via dev eth0, the only
>> interface besides "lo"
>> 9.0.0.1 is the incorrect gateway which we were redirected to; we want to
>> change the route to 9.0.0.8
>>
>> I found no way to clear this route. What I tried:
>>
>> root@machine5:~# ip route flush cache ### CACHE FLUSH ###
>> root@machine5:~# ip route list cache match 1.1.1.1 # empty
>>
>> root@machine5:~# ip route flush cache ### CACHE FLUSH ###
>> root@machine5:~# echo 1> /proc/sys/net/ipv4/route/flush
>> root@machine5:~# ip route list cache match 1.1.1.1 # empty
>>
>> root@machine5:~# ip route get 1.1.1.1 # magically re-inserts the
>> <redirected> route, tcpdump sees NO ICMP traffic
>> 1.1.1.1 via 9.0.0.1 dev eth0 src 5.5.5.5
>> cache<redirected> ipid 0xfb5d rtt 1475ms rttvar 450ms cwnd 10
>>
>> I also tried to force a scheduled route flush:
>>
>> root@machine5:~# echo 1> /proc/sys/net/ipv4/route/gc_timeout
>> root@machine5:~# echo 1> /proc/sys/net/ipv4/route/gc_interval
>>
>> A reboot fixed it all.
>>
>> This may be related to the "Several major changes to our routing
>> infrastructure" (https://lkml.org/lkml/2011/3/16/384).
>> Other users are reporting the same problem:
>> * https://plus.google.com/u/0/117161704068825702652/posts/1UK1Rp4KA4J
>> * http://lists.debian.org/debian-kernel/2011/10/msg00633.html
>> Other similar issues:
>> * http://www.spinics.net/lists/netdev/msg176966.html
>> * http://forums.gentoo.org/viewtopic-t-901024-start-0.html
>>
>> This has been occurring on a few KVM guest machines and also on a
>> regular Linux machine, so it's not KVM related.
>>
>> Is this a bug, or it's me who's missing something?
>>
> It is a bug, and as such could you provide needed information for us to
> reproduce it ?
>
> What is your network setup ?
Network setup is nothing fancy. We have the following machines on a
single /24 ethernet segment:
* 192.168.0.244 (machine5) -- the machine on which we reproduce the
kernel routing bug; kernel: 3.0.3-grsec
* 192.168.0.8 (router8) -- the default gw for the whole
192.168.0.0/24 network; does SNAT; kernel: 2.6.32-5-686
* 192.168.0.120 -- another host with disabled ip_forwarding; must be up
and reachable
There are two bugs actually:
1. Basically, *any* ICMP redirect is cached forever.
2. The output of "ip route" is not consistent with the kernel's routing
behavior.
Quick fix: Disabling "net.ipv4.conf.*.accept_redirects" on all
interfaces works OK and prevents ICMP redirects from affecting the
internal route cache.
Here is a sample test-case scenario:
### right after a clean machine reboot
root@machine5:~# ip route list cache match 8.8.4.4
root@machine5:~# ip route get 8.8.4.4
8.8.4.4 via 192.168.0.8 dev eth0 src 192.168.0.244
cache
### make a TCP request; the TCP packets go to the default gw
192.168.0.8; we see this with a tcpdump at 192.168.0.8
root@machine5:~# telnet 8.8.4.4
### route is still OK and as expected
root@machine5:~# ip route list cache match 8.8.4.4
8.8.4.4 from 192.168.0.244 tos lowdelay via 192.168.0.8 dev eth0
cache ipid 0x303a
8.8.4.4 tos lowdelay via 192.168.0.8 dev eth0 src 192.168.0.244
cache ipid 0x303a
8.8.4.4 via 192.168.0.8 dev eth0 src 192.168.0.244
cache
root@machine5:~# ip route get 8.8.4.4
8.8.4.4 via 192.168.0.8 dev eth0 src 192.168.0.244
cache
### change route to a fake host on the same subnet, so that an ICMP
redirect will follow later
### we also disable NAT for 192.168.0.244, so that an ICMP redirect is
sent accordingly
root@router8:~# route add -host 8.8.4.4 gw 192.168.0.120
### first TCP packet goes to the default gw 192.168.0.8; we see this
with a tcpdump at 192.168.0.8
root@machine5:~# telnet 8.8.4.4
### at machine5: we got the ICMP redirect from the default gw, as expected
# tcpdump: IP 192.168.0.8 > 192.168.0.244: ICMP redirect 8.8.4.4 to host
192.168.0.120, length 68
### the TCP packets now start to use the <redirected> route
192.168.0.120; we see this with a tcpdump at 192.168.0.120
root@machine5:~# telnet 8.8.4.4
### (bug #2) what "ip route" returns is inconsistent, because we are
using the <redirected> route 192.168.0.120 in reality
### note that the count of the route lines increased with one
root@machine5:~# ip route list cache match 8.8.4.4
8.8.4.4 from 192.168.0.244 tos lowdelay via 192.168.0.8 dev eth0
cache ipid 0x303a
8.8.4.4 tos lowdelay via 192.168.0.8 dev eth0 src 192.168.0.244
cache ipid 0x303a
8.8.4.4 via 192.168.0.8 dev eth0 src 192.168.0.244
cache
8.8.4.4 from 192.168.0.244 tos lowdelay via 192.168.0.8 dev eth0
cache ipid 0x303a
root@machine5:~# ip route get 8.8.4.4
8.8.4.4 via 192.168.0.8 dev eth0 src 192.168.0.244
cache
### restore the route on the default gw 192.168.0.8, so that it accepts
8.8.4.4 as destination again
### restore NAT for 192.168.0.244
root@router8:~# route del -host 8.8.4.4 gw 192.168.0.120
### (bug #1) even though we flushed the route cache, the <redirected>
route resurrects from somewhere; even without making any TCP requests
### this time what "ip" returns is consistent with the real (incorrect)
routing behavior of machine5
root@machine5:~# ip route flush cache
root@machine5:~# ip route list cache match 8.8.4.4
root@machine5:~# ip route get 8.8.4.4
8.8.4.4 via 192.168.0.120 dev eth0 src 192.168.0.244
cache <redirected> ipid 0x303a
### the TCP packets STILL use the <redirected> route 192.168.0.120; we
see this with a tcpdump at 192.168.0.120
root@machine5:~# telnet 8.8.4.4
### only a reboot clears the cached <redirected> routes
Cheers.
--Ivan
^ permalink raw reply
* Re: [PATCH] Add ethtool to mii advertisment conversion helpers
From: David Miller @ 2011-11-16 22:36 UTC (permalink / raw)
To: mcarlson; +Cc: netdev, mchan
In-Reply-To: <1321394453-21076-1-git-send-email-mcarlson@broadcom.com>
From: "Matt Carlson" <mcarlson@broadcom.com>
Date: Tue, 15 Nov 2011 14:00:53 -0800
> Translating between ethtool advertisement settings and MII
> advertisements are common operations for ethernet drivers. This patch
> adds a set of helper functions that implements the conversion. The
> patch then modifies a couple of the drivers to use the new functions.
>
> Signed-off-by: Matt Carlson <mcarlson@broadcom.com>
> Signed-off-by: Michael Chan <mchan@broadcom.com>
Doesn't build:
In file included from drivers/net/ethernet/3com/3c59x.c:82:0:
include/linux/mii.h: In function ‘ethtool_adv_to_mii_100bt’:
include/linux/mii.h:254:15: error: ‘ADVERTISED_10baseT_Half’ undeclared (first use in this function)
include/linux/mii.h:254:15: note: each undeclared identifier is reported only once for each function it appears in
include/linux/mii.h:256:15: error: ‘ADVERTISED_10baseT_Full’ undeclared (first use in this function)
include/linux/mii.h:258:15: error: ‘ADVERTISED_100baseT_Half’ undeclared (first use in this function)
include/linux/mii.h:260:15: error: ‘ADVERTISED_100baseT_Full’ undeclared (first use in this function)
include/linux/mii.h:262:15: error: ‘ADVERTISED_Pause’ undeclared (first use in this function)
include/linux/mii.h:264:15: error: ‘ADVERTISED_Asym_Pause’ undeclared (first use in this function)
^ permalink raw reply
* Re: [PATCH v5 5/9] net: Define enum for net device features.
From: Ben Hutchings @ 2011-11-16 22:39 UTC (permalink / raw)
To: Michał Mirosław; +Cc: netdev, David S. Miller
In-Reply-To: <d47c3468cf74bd42f28989d078d9a70ca8cfd7e0.1321404954.git.mirq-linux@rere.qmqm.pl>
On Wed, 2011-11-16 at 02:29 +0100, Michał Mirosław wrote:
> Define feature values by bit position instead of direct 2**i values
> and force the values to be of type netdev_features_t.
>
> Cleaned and extended from patch by Mahesh Bandewar <maheshb@google.com>:
> + added netdev_features_t casts
> + included bits under NETIF_F_GSO_MASK
> + moved feature #defines out of struct net_device definition
>
> Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> ---
> include/linux/netdev_features.h | 129 +++++++++++++++++++++++++++-----------
> 1 files changed, 91 insertions(+), 38 deletions(-)
>
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index af52381..04ac8f8 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
[...]
> + /**/NETIF_F_GSO_SHIFT, /* keep the order of SKB_GSO_* bits */
> + NETIF_F_TSO_BIT /* ... TCPv4 segmentation */
> + = NETIF_F_GSO_SHIFT,
> + NETIF_F_UFO_BIT, /* ... UDPv4 fragmentation */
> + NETIF_F_GSO_ROBUST_BIT, /* ... ->SKB_GSO_DODGY */
> + NETIF_F_TSO_ECN_BIT, /* ... TCP ECN support */
> + NETIF_F_TSO6_BIT, /* ... TCPv6 segmentation */
> + NETIF_F_FSO_BIT, /* ... FCoE segmentation */
> + NETIF_F_GSO_RESERVED1, /* ... free (fill GSO_MASK to 8 bits) */
> + /**/NETIF_F_GSO_LAST, /* [can't be last bit, see GSO_MASK] */
> + NETIF_F_GSO_RESERVED2 /* ... free (fill GSO_MASK to 8 bits) */
> + = NETIF_F_GSO_LAST,
>
> -/* Segmentation offload features */
> -#define NETIF_F_GSO_SHIFT 16
> -#define NETIF_F_GSO_MASK 0x00ff0000
> -#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
> -#define NETIF_F_UFO (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)
> -#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
> -#define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)
> -#define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
> -#define NETIF_F_FSO (SKB_GSO_FCOE << NETIF_F_GSO_SHIFT)
[...]
You should either add BUILD_BUG_ON()s somewhere to ensure that the
netdev feature and skb GSO flags remain in sync, or redefine the skb GSO
flags using the netdev feature flags (which I thought was the reason for
moving features to their own header).
Ben.
--
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox