Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH] [ath5k][leds] Ability to disable leds support. If leds support enabled do not force mac802.11 leds layer selection.
From: Dmytro Milinevskyy @ 2010-04-07 18:58 UTC (permalink / raw)
  To: ath5k-devel-xDcbHBWguxEUs3QNXV6qNA
  Cc: Kalle Valo, linux-wireless-u79uwXL29TY76Z2rM5mHXA, GeunSik Lim,
	Jiri Slaby, Greg Kroah-Hartman, John W. Linville, Keng-Yu Lin,
	netdev-u79uwXL29TY76Z2rM5mHXA, Jiri Kosina, Johannes Berg,
	Shahar Or, Dmytro Milinevskyy,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Luca Verdesca

Hello!

Here is a patch that allows ath5k LED support to be disabled at build time.
In addition, when LED support is enabled, selection of the mac80211 LED layer is no longer forced.

---
 drivers/net/wireless/ath/ath5k/Kconfig  |   10 +++++++---
 drivers/net/wireless/ath/ath5k/Makefile |    2 +-
 drivers/net/wireless/ath/ath5k/ath5k.h  |   11 ++++++-----
 drivers/net/wireless/ath/ath5k/attach.c |    2 ++
 drivers/net/wireless/ath/ath5k/base.c   |   18 ++++++++++++++++++
 drivers/net/wireless/ath/ath5k/base.h   |   12 ++++++++++++
 drivers/net/wireless/ath/ath5k/gpio.c   |    2 ++
 drivers/net/wireless/ath/ath5k/led.c    |    9 +++++++++
 8 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/drivers/net/wireless/ath/ath5k/Kconfig b/drivers/net/wireless/ath/ath5k/Kconfig
index eb83b7b..6b5677e 100644
--- a/drivers/net/wireless/ath/ath5k/Kconfig
+++ b/drivers/net/wireless/ath/ath5k/Kconfig
@@ -1,9 +1,6 @@
 config ATH5K
 	tristate "Atheros 5xxx wireless cards support"
 	depends on PCI && MAC80211
-	select MAC80211_LEDS
-	select LEDS_CLASS
-	select NEW_LEDS
 	---help---
 	  This module adds support for wireless adapters based on
 	  Atheros 5xxx chipset.
@@ -18,6 +15,13 @@ config ATH5K
 	  If you choose to build a module, it'll be called ath5k. Say M if
 	  unsure.
 
+
+config ATH5K_LEDS
+	tristate "Atheros 5xxx wireless cards LEDs support"
+	depends on ATH5K
+	---help---
+	  Atheros 5xxx LED support.
+
 config ATH5K_DEBUG
 	bool "Atheros 5xxx debugging"
 	depends on ATH5K
diff --git a/drivers/net/wireless/ath/ath5k/Makefile b/drivers/net/wireless/ath/ath5k/Makefile
index 090dc6d..fd36145 100644
--- a/drivers/net/wireless/ath/ath5k/Makefile
+++ b/drivers/net/wireless/ath/ath5k/Makefile
@@ -10,7 +10,7 @@ ath5k-y				+= phy.o
 ath5k-y				+= reset.o
 ath5k-y				+= attach.o
 ath5k-y				+= base.o
-ath5k-y				+= led.o
 ath5k-y				+= rfkill.o
 ath5k-$(CONFIG_ATH5K_DEBUG)	+= debug.o
+ath5k-$(CONFIG_ATH5K_LEDS) += led.o
 obj-$(CONFIG_ATH5K)		+= ath5k.o
diff --git a/drivers/net/wireless/ath/ath5k/ath5k.h b/drivers/net/wireless/ath/ath5k/ath5k.h
index ac67f02..8285c07 100644
--- a/drivers/net/wireless/ath/ath5k/ath5k.h
+++ b/drivers/net/wireless/ath/ath5k/ath5k.h
@@ -929,6 +929,7 @@ enum ath5k_power_mode {
 	AR5K_PM_NETWORK_SLEEP,
 };
 
+#ifdef CONFIG_ATH5K_LEDS
 /*
  * These match net80211 definitions (not used in
  * mac80211).
@@ -939,11 +940,7 @@ enum ath5k_power_mode {
 #define AR5K_LED_AUTH	2 /*IEEE80211_S_AUTH*/
 #define AR5K_LED_ASSOC	3 /*IEEE80211_S_ASSOC*/
 #define AR5K_LED_RUN	4 /*IEEE80211_S_RUN*/
-
-/* GPIO-controlled software LED */
-#define AR5K_SOFTLED_PIN	0
-#define AR5K_SOFTLED_ON		0
-#define AR5K_SOFTLED_OFF	1
+#endif
 
 /*
  * Chipset capabilities -see ath5k_hw_get_capability-
@@ -1161,11 +1158,13 @@ struct ath5k_hw {
 extern int ath5k_hw_attach(struct ath5k_softc *sc);
 extern void ath5k_hw_detach(struct ath5k_hw *ah);
 
+#ifdef CONFIG_ATH5K_LEDS
 /* LED functions */
 extern int ath5k_init_leds(struct ath5k_softc *sc);
 extern void ath5k_led_enable(struct ath5k_softc *sc);
 extern void ath5k_led_off(struct ath5k_softc *sc);
 extern void ath5k_unregister_leds(struct ath5k_softc *sc);
+#endif
 
 /* Reset Functions */
 extern int ath5k_hw_nic_wakeup(struct ath5k_hw *ah, int flags, bool initial);
@@ -1259,7 +1258,9 @@ extern int ath5k_hw_set_slot_time(struct ath5k_hw *ah, unsigned int slot_time);
 extern int ath5k_hw_init_desc_functions(struct ath5k_hw *ah);
 
 /* GPIO Functions */
+#ifdef CONFIG_ATH5K_LEDS
 extern void ath5k_hw_set_ledstate(struct ath5k_hw *ah, unsigned int state);
+#endif
 extern int ath5k_hw_set_gpio_input(struct ath5k_hw *ah, u32 gpio);
 extern int ath5k_hw_set_gpio_output(struct ath5k_hw *ah, u32 gpio);
 extern u32 ath5k_hw_get_gpio(struct ath5k_hw *ah, u32 gpio);
diff --git a/drivers/net/wireless/ath/ath5k/attach.c b/drivers/net/wireless/ath/ath5k/attach.c
index dc0786c..c27f741 100644
--- a/drivers/net/wireless/ath/ath5k/attach.c
+++ b/drivers/net/wireless/ath/ath5k/attach.c
@@ -334,8 +334,10 @@ int ath5k_hw_attach(struct ath5k_softc *sc)
 
 	ath5k_hw_init_nfcal_hist(ah);
 
+#ifdef CONFIG_ATH5K_LEDS
 	/* turn on HW LEDs */
 	ath5k_hw_set_ledstate(ah, AR5K_LED_INIT);
+#endif
 
 	return 0;
 err_free:
diff --git a/drivers/net/wireless/ath/ath5k/base.c b/drivers/net/wireless/ath/ath5k/base.c
index 3abbe75..db8ca05 100644
--- a/drivers/net/wireless/ath/ath5k/base.c
+++ b/drivers/net/wireless/ath/ath5k/base.c
@@ -709,18 +709,22 @@ ath5k_pci_remove(struct pci_dev *pdev)
 #ifdef CONFIG_PM
 static int ath5k_pci_suspend(struct device *dev)
 {
+#ifdef CONFIG_ATH5K_LEDS
 	struct ieee80211_hw *hw = pci_get_drvdata(to_pci_dev(dev));
 	struct ath5k_softc *sc = hw->priv;
 
 	ath5k_led_off(sc);
+#endif
 	return 0;
 }
 
 static int ath5k_pci_resume(struct device *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
+#ifdef CONFIG_ATH5K_LEDS
 	struct ieee80211_hw *hw = pci_get_drvdata(pdev);
 	struct ath5k_softc *sc = hw->priv;
+#endif
 
 	/*
 	 * Suspend/Resume resets the PCI configuration space, so we have to
@@ -729,7 +733,9 @@ static int ath5k_pci_resume(struct device *dev)
 	 */
 	pci_write_config_byte(pdev, 0x41, 0);
 
+#ifdef CONFIG_ATH5K_LEDS
 	ath5k_led_enable(sc);
+#endif
 	return 0;
 }
 #endif /* CONFIG_PM */
@@ -859,7 +865,9 @@ ath5k_attach(struct pci_dev *pdev, struct ieee80211_hw *hw)
 	if (!ath_is_world_regd(regulatory))
 		regulatory_hint(hw->wiphy, regulatory->alpha2);
 
+#ifdef CONFIG_ATH5K_LEDS
 	ath5k_init_leds(sc);
+#endif
 
 	return 0;
 err_queues:
@@ -894,7 +902,9 @@ ath5k_detach(struct pci_dev *pdev, struct ieee80211_hw *hw)
 	ath5k_desc_free(sc, pdev);
 	ath5k_txq_release(sc);
 	ath5k_hw_release_tx_queue(sc->ah, sc->bhalq);
+#ifdef CONFIG_ATH5K_LEDS
 	ath5k_unregister_leds(sc);
+#endif
 
 	/*
 	 * NB: can't reclaim these until after ieee80211_ifdetach
@@ -2470,7 +2480,9 @@ ath5k_stop_locked(struct ath5k_softc *sc)
 	ieee80211_stop_queues(sc->hw);
 
 	if (!test_bit(ATH_STAT_INVALID, sc->status)) {
+#ifdef CONFIG_ATH5K_LEDS
 		ath5k_led_off(sc);
+#endif
 		ath5k_hw_set_imr(ah, 0);
 		synchronize_irq(sc->pdev->irq);
 	}
@@ -3246,8 +3258,10 @@ static void ath5k_bss_info_changed(struct ieee80211_hw *hw,
 		sc->assoc = bss_conf->assoc;
 		if (sc->opmode == NL80211_IFTYPE_STATION)
 			set_beacon_filter(hw, sc->assoc);
+#ifdef CONFIG_ATH5K_LEDS
 		ath5k_hw_set_ledstate(sc->ah, sc->assoc ?
 			AR5K_LED_ASSOC : AR5K_LED_INIT);
+#endif
 		if (bss_conf->assoc) {
 			ATH5K_DBG(sc, ATH5K_DEBUG_ANY,
 				  "Bss Info ASSOC %d, bssid: %pM\n",
@@ -3277,16 +3291,20 @@ static void ath5k_bss_info_changed(struct ieee80211_hw *hw,
 
 static void ath5k_sw_scan_start(struct ieee80211_hw *hw)
 {
+#ifdef CONFIG_ATH5K_LEDS
 	struct ath5k_softc *sc = hw->priv;
 	if (!sc->assoc)
 		ath5k_hw_set_ledstate(sc->ah, AR5K_LED_SCAN);
+#endif
 }
 
 static void ath5k_sw_scan_complete(struct ieee80211_hw *hw)
 {
+#ifdef CONFIG_ATH5K_LEDS
 	struct ath5k_softc *sc = hw->priv;
 	ath5k_hw_set_ledstate(sc->ah, sc->assoc ?
 		AR5K_LED_ASSOC : AR5K_LED_INIT);
+#endif
 }
 
 /**
diff --git a/drivers/net/wireless/ath/ath5k/base.h b/drivers/net/wireless/ath/ath5k/base.h
index 7e1a88a..19208a7 100644
--- a/drivers/net/wireless/ath/ath5k/base.h
+++ b/drivers/net/wireless/ath/ath5k/base.h
@@ -85,15 +85,21 @@ struct ath5k_txq {
 
 #define ATH5K_LED_MAX_NAME_LEN 31
 
+#ifdef CONFIG_ATH5K_LEDS
 /*
  * State for LED triggers
  */
 struct ath5k_led
 {
+#ifdef CONFIG_LEDS_CLASS
 	char name[ATH5K_LED_MAX_NAME_LEN + 1];	/* name of the LED in sysfs */
+#endif
 	struct ath5k_softc *sc;			/* driver state */
+#ifdef CONFIG_LEDS_CLASS
 	struct led_classdev led_dev;		/* led classdev */
+#endif
 };
+#endif
 
 /* Rfkill */
 struct ath5k_rfkill {
@@ -154,8 +160,10 @@ struct ath5k_softc {
 
 	u8			bssidmask[ETH_ALEN];
 
+#ifdef CONFIG_ATH5K_LEDS
 	unsigned int		led_pin,	/* GPIO pin for driving LED */
 				led_on;		/* pin setting for LED on */
+#endif
 
 	struct tasklet_struct	restq;		/* reset tasklet */
 
@@ -164,7 +172,9 @@ struct ath5k_softc {
 	spinlock_t		rxbuflock;
 	u32			*rxlink;	/* link ptr in last RX desc */
 	struct tasklet_struct	rxtq;		/* rx intr tasklet */
+#ifdef CONFIG_ATH5K_LEDS
 	struct ath5k_led	rx_led;		/* rx led */
+#endif
 
 	struct list_head	txbuf;		/* transmit buffer */
 	spinlock_t		txbuflock;
@@ -172,7 +182,9 @@ struct ath5k_softc {
 	struct ath5k_txq	txqs[AR5K_NUM_TX_QUEUES];	/* tx queues */
 	struct ath5k_txq	*txq;		/* main tx queue */
 	struct tasklet_struct	txtq;		/* tx intr tasklet */
+#ifdef CONFIG_ATH5K_LEDS
 	struct ath5k_led	tx_led;		/* tx led */
+#endif
 
 	struct ath5k_rfkill	rf_kill;
 
diff --git a/drivers/net/wireless/ath/ath5k/gpio.c b/drivers/net/wireless/ath/ath5k/gpio.c
index 64a27e7..9e757b3 100644
--- a/drivers/net/wireless/ath/ath5k/gpio.c
+++ b/drivers/net/wireless/ath/ath5k/gpio.c
@@ -25,6 +25,7 @@
 #include "debug.h"
 #include "base.h"
 
+#ifdef CONFIG_ATH5K_LEDS
 /*
  * Set led state
  */
@@ -76,6 +77,7 @@ void ath5k_hw_set_ledstate(struct ath5k_hw *ah, unsigned int state)
 	else
 		AR5K_REG_ENABLE_BITS(ah, AR5K_PCICFG, led_5210);
 }
+#endif
 
 /*
  * Set GPIO inputs
diff --git a/drivers/net/wireless/ath/ath5k/led.c b/drivers/net/wireless/ath/ath5k/led.c
index 67aa52e..df8e8d2 100644
--- a/drivers/net/wireless/ath/ath5k/led.c
+++ b/drivers/net/wireless/ath/ath5k/led.c
@@ -121,6 +121,7 @@ ath5k_led_brightness_set(struct led_classdev *led_dev,
 		ath5k_led_on(led->sc);
 }
 
+#ifdef CONFIG_LEDS_CLASS
 static int
 ath5k_register_led(struct ath5k_softc *sc, struct ath5k_led *led,
 		   const char *name, char *trigger)
@@ -140,13 +141,16 @@ ath5k_register_led(struct ath5k_softc *sc, struct ath5k_led *led,
 	}
 	return err;
 }
+#endif
 
 static void
 ath5k_unregister_led(struct ath5k_led *led)
 {
 	if (!led->sc)
 		return;
+#ifdef CONFIG_LEDS_CLASS
 	led_classdev_unregister(&led->led_dev);
+#endif
 	ath5k_led_off(led->sc);
 	led->sc = NULL;
 }
@@ -177,6 +181,7 @@ int ath5k_init_leds(struct ath5k_softc *sc)
 
 	ath5k_led_enable(sc);
 
+#ifdef CONFIG_LEDS_CLASS
 	snprintf(name, sizeof(name), "ath5k-%s::rx", wiphy_name(hw->wiphy));
 	ret = ath5k_register_led(sc, &sc->rx_led, name,
 		ieee80211_get_rx_led_name(hw));
@@ -186,6 +191,10 @@ int ath5k_init_leds(struct ath5k_softc *sc)
 	snprintf(name, sizeof(name), "ath5k-%s::tx", wiphy_name(hw->wiphy));
 	ret = ath5k_register_led(sc, &sc->tx_led, name,
 		ieee80211_get_tx_led_name(hw));
+#else
+    sc->rx_led.sc = sc;
+    sc->tx_led.sc = sc;
+#endif
 out:
 	return ret;
 }
-- 
1.7.1

^ permalink raw reply related

* [PATCH] [ath5k][leds] Ability to disable leds support. If leds support enabled do not force mac802.11 leds layer selection.
From: Dmytro Milinevskyy @ 2010-04-07 18:58 UTC (permalink / raw)
  Cc: Jiri Slaby, Nick Kossifidis, Luis R. Rodriguez, Bob Copeland,
	John W. Linville, GeunSik Lim, Greg Kroah-Hartman, Lukas Turek,
	Mark Hindley, Johannes Berg, Jiri Kosina, Kalle Valo, Keng-Yu Lin,
	Luca Verdesca, Shahar Or, linux-wireless, ath5k-devel, netdev,
	linux-kernel, Dmytro Milinevskyy

Hello!

Here is a patch that allows ath5k LED support to be disabled at build time.
In addition, when LED support is enabled, selection of the mac80211 LED layer is no longer forced.

---
 drivers/net/wireless/ath/ath5k/Kconfig  |   10 +++++++---
 drivers/net/wireless/ath/ath5k/Makefile |    2 +-
 drivers/net/wireless/ath/ath5k/ath5k.h  |   11 ++++++-----
 drivers/net/wireless/ath/ath5k/attach.c |    2 ++
 drivers/net/wireless/ath/ath5k/base.c   |   18 ++++++++++++++++++
 drivers/net/wireless/ath/ath5k/base.h   |   12 ++++++++++++
 drivers/net/wireless/ath/ath5k/gpio.c   |    2 ++
 drivers/net/wireless/ath/ath5k/led.c    |    9 +++++++++
 8 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/drivers/net/wireless/ath/ath5k/Kconfig b/drivers/net/wireless/ath/ath5k/Kconfig
index eb83b7b..6b5677e 100644
--- a/drivers/net/wireless/ath/ath5k/Kconfig
+++ b/drivers/net/wireless/ath/ath5k/Kconfig
@@ -1,9 +1,6 @@
 config ATH5K
 	tristate "Atheros 5xxx wireless cards support"
 	depends on PCI && MAC80211
-	select MAC80211_LEDS
-	select LEDS_CLASS
-	select NEW_LEDS
 	---help---
 	  This module adds support for wireless adapters based on
 	  Atheros 5xxx chipset.
@@ -18,6 +15,13 @@ config ATH5K
 	  If you choose to build a module, it'll be called ath5k. Say M if
 	  unsure.
 
+
+config ATH5K_LEDS
+	tristate "Atheros 5xxx wireless cards LEDs support"
+	depends on ATH5K
+	---help---
+	  Atheros 5xxx LED support.
+
 config ATH5K_DEBUG
 	bool "Atheros 5xxx debugging"
 	depends on ATH5K
diff --git a/drivers/net/wireless/ath/ath5k/Makefile b/drivers/net/wireless/ath/ath5k/Makefile
index 090dc6d..fd36145 100644
--- a/drivers/net/wireless/ath/ath5k/Makefile
+++ b/drivers/net/wireless/ath/ath5k/Makefile
@@ -10,7 +10,7 @@ ath5k-y				+= phy.o
 ath5k-y				+= reset.o
 ath5k-y				+= attach.o
 ath5k-y				+= base.o
-ath5k-y				+= led.o
 ath5k-y				+= rfkill.o
 ath5k-$(CONFIG_ATH5K_DEBUG)	+= debug.o
+ath5k-$(CONFIG_ATH5K_LEDS) += led.o
 obj-$(CONFIG_ATH5K)		+= ath5k.o
diff --git a/drivers/net/wireless/ath/ath5k/ath5k.h b/drivers/net/wireless/ath/ath5k/ath5k.h
index ac67f02..8285c07 100644
--- a/drivers/net/wireless/ath/ath5k/ath5k.h
+++ b/drivers/net/wireless/ath/ath5k/ath5k.h
@@ -929,6 +929,7 @@ enum ath5k_power_mode {
 	AR5K_PM_NETWORK_SLEEP,
 };
 
+#ifdef CONFIG_ATH5K_LEDS
 /*
  * These match net80211 definitions (not used in
  * mac80211).
@@ -939,11 +940,7 @@ enum ath5k_power_mode {
 #define AR5K_LED_AUTH	2 /*IEEE80211_S_AUTH*/
 #define AR5K_LED_ASSOC	3 /*IEEE80211_S_ASSOC*/
 #define AR5K_LED_RUN	4 /*IEEE80211_S_RUN*/
-
-/* GPIO-controlled software LED */
-#define AR5K_SOFTLED_PIN	0
-#define AR5K_SOFTLED_ON		0
-#define AR5K_SOFTLED_OFF	1
+#endif
 
 /*
  * Chipset capabilities -see ath5k_hw_get_capability-
@@ -1161,11 +1158,13 @@ struct ath5k_hw {
 extern int ath5k_hw_attach(struct ath5k_softc *sc);
 extern void ath5k_hw_detach(struct ath5k_hw *ah);
 
+#ifdef CONFIG_ATH5K_LEDS
 /* LED functions */
 extern int ath5k_init_leds(struct ath5k_softc *sc);
 extern void ath5k_led_enable(struct ath5k_softc *sc);
 extern void ath5k_led_off(struct ath5k_softc *sc);
 extern void ath5k_unregister_leds(struct ath5k_softc *sc);
+#endif
 
 /* Reset Functions */
 extern int ath5k_hw_nic_wakeup(struct ath5k_hw *ah, int flags, bool initial);
@@ -1259,7 +1258,9 @@ extern int ath5k_hw_set_slot_time(struct ath5k_hw *ah, unsigned int slot_time);
 extern int ath5k_hw_init_desc_functions(struct ath5k_hw *ah);
 
 /* GPIO Functions */
+#ifdef CONFIG_ATH5K_LEDS
 extern void ath5k_hw_set_ledstate(struct ath5k_hw *ah, unsigned int state);
+#endif
 extern int ath5k_hw_set_gpio_input(struct ath5k_hw *ah, u32 gpio);
 extern int ath5k_hw_set_gpio_output(struct ath5k_hw *ah, u32 gpio);
 extern u32 ath5k_hw_get_gpio(struct ath5k_hw *ah, u32 gpio);
diff --git a/drivers/net/wireless/ath/ath5k/attach.c b/drivers/net/wireless/ath/ath5k/attach.c
index dc0786c..c27f741 100644
--- a/drivers/net/wireless/ath/ath5k/attach.c
+++ b/drivers/net/wireless/ath/ath5k/attach.c
@@ -334,8 +334,10 @@ int ath5k_hw_attach(struct ath5k_softc *sc)
 
 	ath5k_hw_init_nfcal_hist(ah);
 
+#ifdef CONFIG_ATH5K_LEDS
 	/* turn on HW LEDs */
 	ath5k_hw_set_ledstate(ah, AR5K_LED_INIT);
+#endif
 
 	return 0;
 err_free:
diff --git a/drivers/net/wireless/ath/ath5k/base.c b/drivers/net/wireless/ath/ath5k/base.c
index 3abbe75..db8ca05 100644
--- a/drivers/net/wireless/ath/ath5k/base.c
+++ b/drivers/net/wireless/ath/ath5k/base.c
@@ -709,18 +709,22 @@ ath5k_pci_remove(struct pci_dev *pdev)
 #ifdef CONFIG_PM
 static int ath5k_pci_suspend(struct device *dev)
 {
+#ifdef CONFIG_ATH5K_LEDS
 	struct ieee80211_hw *hw = pci_get_drvdata(to_pci_dev(dev));
 	struct ath5k_softc *sc = hw->priv;
 
 	ath5k_led_off(sc);
+#endif
 	return 0;
 }
 
 static int ath5k_pci_resume(struct device *dev)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
+#ifdef CONFIG_ATH5K_LEDS
 	struct ieee80211_hw *hw = pci_get_drvdata(pdev);
 	struct ath5k_softc *sc = hw->priv;
+#endif
 
 	/*
 	 * Suspend/Resume resets the PCI configuration space, so we have to
@@ -729,7 +733,9 @@ static int ath5k_pci_resume(struct device *dev)
 	 */
 	pci_write_config_byte(pdev, 0x41, 0);
 
+#ifdef CONFIG_ATH5K_LEDS
 	ath5k_led_enable(sc);
+#endif
 	return 0;
 }
 #endif /* CONFIG_PM */
@@ -859,7 +865,9 @@ ath5k_attach(struct pci_dev *pdev, struct ieee80211_hw *hw)
 	if (!ath_is_world_regd(regulatory))
 		regulatory_hint(hw->wiphy, regulatory->alpha2);
 
+#ifdef CONFIG_ATH5K_LEDS
 	ath5k_init_leds(sc);
+#endif
 
 	return 0;
 err_queues:
@@ -894,7 +902,9 @@ ath5k_detach(struct pci_dev *pdev, struct ieee80211_hw *hw)
 	ath5k_desc_free(sc, pdev);
 	ath5k_txq_release(sc);
 	ath5k_hw_release_tx_queue(sc->ah, sc->bhalq);
+#ifdef CONFIG_ATH5K_LEDS
 	ath5k_unregister_leds(sc);
+#endif
 
 	/*
 	 * NB: can't reclaim these until after ieee80211_ifdetach
@@ -2470,7 +2480,9 @@ ath5k_stop_locked(struct ath5k_softc *sc)
 	ieee80211_stop_queues(sc->hw);
 
 	if (!test_bit(ATH_STAT_INVALID, sc->status)) {
+#ifdef CONFIG_ATH5K_LEDS
 		ath5k_led_off(sc);
+#endif
 		ath5k_hw_set_imr(ah, 0);
 		synchronize_irq(sc->pdev->irq);
 	}
@@ -3246,8 +3258,10 @@ static void ath5k_bss_info_changed(struct ieee80211_hw *hw,
 		sc->assoc = bss_conf->assoc;
 		if (sc->opmode == NL80211_IFTYPE_STATION)
 			set_beacon_filter(hw, sc->assoc);
+#ifdef CONFIG_ATH5K_LEDS
 		ath5k_hw_set_ledstate(sc->ah, sc->assoc ?
 			AR5K_LED_ASSOC : AR5K_LED_INIT);
+#endif
 		if (bss_conf->assoc) {
 			ATH5K_DBG(sc, ATH5K_DEBUG_ANY,
 				  "Bss Info ASSOC %d, bssid: %pM\n",
@@ -3277,16 +3291,20 @@ static void ath5k_bss_info_changed(struct ieee80211_hw *hw,
 
 static void ath5k_sw_scan_start(struct ieee80211_hw *hw)
 {
+#ifdef CONFIG_ATH5K_LEDS
 	struct ath5k_softc *sc = hw->priv;
 	if (!sc->assoc)
 		ath5k_hw_set_ledstate(sc->ah, AR5K_LED_SCAN);
+#endif
 }
 
 static void ath5k_sw_scan_complete(struct ieee80211_hw *hw)
 {
+#ifdef CONFIG_ATH5K_LEDS
 	struct ath5k_softc *sc = hw->priv;
 	ath5k_hw_set_ledstate(sc->ah, sc->assoc ?
 		AR5K_LED_ASSOC : AR5K_LED_INIT);
+#endif
 }
 
 /**
diff --git a/drivers/net/wireless/ath/ath5k/base.h b/drivers/net/wireless/ath/ath5k/base.h
index 7e1a88a..19208a7 100644
--- a/drivers/net/wireless/ath/ath5k/base.h
+++ b/drivers/net/wireless/ath/ath5k/base.h
@@ -85,15 +85,21 @@ struct ath5k_txq {
 
 #define ATH5K_LED_MAX_NAME_LEN 31
 
+#ifdef CONFIG_ATH5K_LEDS
 /*
  * State for LED triggers
  */
 struct ath5k_led
 {
+#ifdef CONFIG_LEDS_CLASS
 	char name[ATH5K_LED_MAX_NAME_LEN + 1];	/* name of the LED in sysfs */
+#endif
 	struct ath5k_softc *sc;			/* driver state */
+#ifdef CONFIG_LEDS_CLASS
 	struct led_classdev led_dev;		/* led classdev */
+#endif
 };
+#endif
 
 /* Rfkill */
 struct ath5k_rfkill {
@@ -154,8 +160,10 @@ struct ath5k_softc {
 
 	u8			bssidmask[ETH_ALEN];
 
+#ifdef CONFIG_ATH5K_LEDS
 	unsigned int		led_pin,	/* GPIO pin for driving LED */
 				led_on;		/* pin setting for LED on */
+#endif
 
 	struct tasklet_struct	restq;		/* reset tasklet */
 
@@ -164,7 +172,9 @@ struct ath5k_softc {
 	spinlock_t		rxbuflock;
 	u32			*rxlink;	/* link ptr in last RX desc */
 	struct tasklet_struct	rxtq;		/* rx intr tasklet */
+#ifdef CONFIG_ATH5K_LEDS
 	struct ath5k_led	rx_led;		/* rx led */
+#endif
 
 	struct list_head	txbuf;		/* transmit buffer */
 	spinlock_t		txbuflock;
@@ -172,7 +182,9 @@ struct ath5k_softc {
 	struct ath5k_txq	txqs[AR5K_NUM_TX_QUEUES];	/* tx queues */
 	struct ath5k_txq	*txq;		/* main tx queue */
 	struct tasklet_struct	txtq;		/* tx intr tasklet */
+#ifdef CONFIG_ATH5K_LEDS
 	struct ath5k_led	tx_led;		/* tx led */
+#endif
 
 	struct ath5k_rfkill	rf_kill;
 
diff --git a/drivers/net/wireless/ath/ath5k/gpio.c b/drivers/net/wireless/ath/ath5k/gpio.c
index 64a27e7..9e757b3 100644
--- a/drivers/net/wireless/ath/ath5k/gpio.c
+++ b/drivers/net/wireless/ath/ath5k/gpio.c
@@ -25,6 +25,7 @@
 #include "debug.h"
 #include "base.h"
 
+#ifdef CONFIG_ATH5K_LEDS
 /*
  * Set led state
  */
@@ -76,6 +77,7 @@ void ath5k_hw_set_ledstate(struct ath5k_hw *ah, unsigned int state)
 	else
 		AR5K_REG_ENABLE_BITS(ah, AR5K_PCICFG, led_5210);
 }
+#endif
 
 /*
  * Set GPIO inputs
diff --git a/drivers/net/wireless/ath/ath5k/led.c b/drivers/net/wireless/ath/ath5k/led.c
index 67aa52e..df8e8d2 100644
--- a/drivers/net/wireless/ath/ath5k/led.c
+++ b/drivers/net/wireless/ath/ath5k/led.c
@@ -121,6 +121,7 @@ ath5k_led_brightness_set(struct led_classdev *led_dev,
 		ath5k_led_on(led->sc);
 }
 
+#ifdef CONFIG_LEDS_CLASS
 static int
 ath5k_register_led(struct ath5k_softc *sc, struct ath5k_led *led,
 		   const char *name, char *trigger)
@@ -140,13 +141,16 @@ ath5k_register_led(struct ath5k_softc *sc, struct ath5k_led *led,
 	}
 	return err;
 }
+#endif
 
 static void
 ath5k_unregister_led(struct ath5k_led *led)
 {
 	if (!led->sc)
 		return;
+#ifdef CONFIG_LEDS_CLASS
 	led_classdev_unregister(&led->led_dev);
+#endif
 	ath5k_led_off(led->sc);
 	led->sc = NULL;
 }
@@ -177,6 +181,7 @@ int ath5k_init_leds(struct ath5k_softc *sc)
 
 	ath5k_led_enable(sc);
 
+#ifdef CONFIG_LEDS_CLASS
 	snprintf(name, sizeof(name), "ath5k-%s::rx", wiphy_name(hw->wiphy));
 	ret = ath5k_register_led(sc, &sc->rx_led, name,
 		ieee80211_get_rx_led_name(hw));
@@ -186,6 +191,10 @@ int ath5k_init_leds(struct ath5k_softc *sc)
 	snprintf(name, sizeof(name), "ath5k-%s::tx", wiphy_name(hw->wiphy));
 	ret = ath5k_register_led(sc, &sc->tx_led, name,
 		ieee80211_get_tx_led_name(hw));
+#else
+    sc->rx_led.sc = sc;
+    sc->tx_led.sc = sc;
+#endif
 out:
 	return ret;
 }
-- 
1.7.1

^ permalink raw reply related

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Eric Dumazet @ 2010-04-07 18:38 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Pekka Enberg, Zhang, Yanmin, netdev, Tejun Heo, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <alpine.DEB.2.00.1004071319320.16159@router.home>

Le mercredi 07 avril 2010 à 13:20 -0500, Christoph Lameter a écrit :
> On Wed, 7 Apr 2010, Pekka Enberg wrote:
> 
> > Oh, sorry, I think it's actually '____cacheline_aligned_in_smp' (with four
> > underscores) for per-cpu data. Confusing...
> 
> This does not particularly help to clarify the situation since we are
> dealing with data that can either be allocated via the percpu allocator or
> be statically present (kmalloc bootstrap situation).
> 
> --

Do we have a user program to check the actual L1 cache size of a machine?

I remember my HP blades have many BIOS options, I would like to make
sure they are properly set.




^ permalink raw reply

* pull request: wireless-2.6 2010-04-07
From: John W. Linville @ 2010-04-07 18:28 UTC (permalink / raw)
  To: davem-fT/PcQaiUtIeIZ0/mPfg9Q
  Cc: linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

Dave,

Here is a batch of fixes intended for 2.6.34.  Included is a variable
initialization required for some ath9k hardware to function reliably,
some RCU annotation to avoid some warnings in mac80211, and a fix for
a regression relating to 802.11s mesh networking.  Also included are
several iwlwifi fixes, including the elimination of an order-4 allocation
during resume, avoidance of a BUG_ON in the rate scaling routines,
and the elimination of a DMA API warning during module removal.
The iwlwifi patches are a little larger than I would like, but the
fixes seem legitimate and worthwhile to me.

Please let me know if there are problems!

Thanks,

John

---

The following changes since commit fb9e2d887243499b8d28efcf80821c4f6a092395:
  Ken Kawasaki (1):
        smc91c92_cs: fix the problem of "Unable to find hardware address"

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-2.6.git master

Felix Fietkau (1):
      ath9k: fix double calls to ath_radio_enable

Javier Cardona (1):
      mac80211: Handle mesh action frames in ieee80211_rx_h_action

Johannes Berg (1):
      mac80211: annotate station rcu dereferences

Shanyu Zhao (1):
      iwlwifi: use consistent table for tx data collect

Zhu Yi (2):
      iwlwifi: fix DMA allocation warnings
      iwlwifi: avoid Tx queue memory allocation in interface down

 drivers/net/wireless/ath/ath9k/main.c     |    3 +-
 drivers/net/wireless/iwlwifi/iwl-agn-rs.c |   55 +++++++--------
 drivers/net/wireless/iwlwifi/iwl-core.c   |   11 ++-
 drivers/net/wireless/iwlwifi/iwl-core.h   |    5 +-
 drivers/net/wireless/iwlwifi/iwl-tx.c     |  107 +++++++++++++++++++++++++----
 net/mac80211/main.c                       |    4 +-
 net/mac80211/mesh.c                       |    3 -
 net/mac80211/rx.c                         |    5 ++
 net/mac80211/sta_info.c                   |   20 ++++-
 9 files changed, 153 insertions(+), 60 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c
index 67ca4e5..115e1ae 100644
--- a/drivers/net/wireless/ath/ath9k/main.c
+++ b/drivers/net/wireless/ath/ath9k/main.c
@@ -1532,8 +1532,7 @@ static int ath9k_config(struct ieee80211_hw *hw, u32 changed)
 		all_wiphys_idle =  ath9k_all_wiphys_idle(sc);
 		ath9k_set_wiphy_idle(aphy, idle);
 
-		if (!idle && all_wiphys_idle)
-			enable_radio = true;
+		enable_radio = (!idle && all_wiphys_idle);
 
 		/*
 		 * After we unlock here its possible another wiphy
diff --git a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c
index 8bf7c20..be00cb3 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn-rs.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn-rs.c
@@ -345,6 +345,17 @@ static inline int get_num_of_ant_from_rate(u32 rate_n_flags)
 	       !!(rate_n_flags & RATE_MCS_ANT_C_MSK);
 }
 
+/*
+ * Static function to get the expected throughput from an iwl_scale_tbl_info
+ * that wraps a NULL pointer check
+ */
+static s32 get_expected_tpt(struct iwl_scale_tbl_info *tbl, int rs_index)
+{
+	if (tbl->expected_tpt)
+		return tbl->expected_tpt[rs_index];
+	return 0;
+}
+
 /**
  * rs_collect_tx_data - Update the success/failure sliding window
  *
@@ -352,19 +363,21 @@ static inline int get_num_of_ant_from_rate(u32 rate_n_flags)
  * at this rate.  window->data contains the bitmask of successful
  * packets.
  */
-static int rs_collect_tx_data(struct iwl_rate_scale_data *windows,
-			      int scale_index, s32 tpt, int attempts,
-			      int successes)
+static int rs_collect_tx_data(struct iwl_scale_tbl_info *tbl,
+			      int scale_index, int attempts, int successes)
 {
 	struct iwl_rate_scale_data *window = NULL;
 	static const u64 mask = (((u64)1) << (IWL_RATE_MAX_WINDOW - 1));
-	s32 fail_count;
+	s32 fail_count, tpt;
 
 	if (scale_index < 0 || scale_index >= IWL_RATE_COUNT)
 		return -EINVAL;
 
 	/* Select window for current tx bit rate */
-	window = &(windows[scale_index]);
+	window = &(tbl->win[scale_index]);
+
+	/* Get expected throughput */
+	tpt = get_expected_tpt(tbl, scale_index);
 
 	/*
 	 * Keep track of only the latest 62 tx frame attempts in this rate's
@@ -738,16 +751,6 @@ static bool table_type_matches(struct iwl_scale_tbl_info *a,
 	return (a->lq_type == b->lq_type) && (a->ant_type == b->ant_type) &&
 		(a->is_SGI == b->is_SGI);
 }
-/*
- * Static function to get the expected throughput from an iwl_scale_tbl_info
- * that wraps a NULL pointer check
- */
-static s32 get_expected_tpt(struct iwl_scale_tbl_info *tbl, int rs_index)
-{
-	if (tbl->expected_tpt)
-		return tbl->expected_tpt[rs_index];
-	return 0;
-}
 
 /*
  * mac80211 sends us Tx status
@@ -764,12 +767,10 @@ static void rs_tx_status(void *priv_r, struct ieee80211_supported_band *sband,
 	struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
 	struct iwl_priv *priv = (struct iwl_priv *)priv_r;
 	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
-	struct iwl_rate_scale_data *window = NULL;
 	enum mac80211_rate_control_flags mac_flags;
 	u32 tx_rate;
 	struct iwl_scale_tbl_info tbl_type;
-	struct iwl_scale_tbl_info *curr_tbl, *other_tbl;
-	s32 tpt = 0;
+	struct iwl_scale_tbl_info *curr_tbl, *other_tbl, *tmp_tbl;
 
 	IWL_DEBUG_RATE_LIMIT(priv, "get frame ack response, update rate scale window\n");
 
@@ -852,7 +853,6 @@ static void rs_tx_status(void *priv_r, struct ieee80211_supported_band *sband,
 		IWL_DEBUG_RATE(priv, "Neither active nor search matches tx rate\n");
 		return;
 	}
-	window = (struct iwl_rate_scale_data *)&(curr_tbl->win[0]);
 
 	/*
 	 * Updating the frame history depends on whether packets were
@@ -865,8 +865,7 @@ static void rs_tx_status(void *priv_r, struct ieee80211_supported_band *sband,
 		tx_rate = le32_to_cpu(table->rs_table[0].rate_n_flags);
 		rs_get_tbl_info_from_mcs(tx_rate, priv->band, &tbl_type,
 				&rs_index);
-		tpt = get_expected_tpt(curr_tbl, rs_index);
-		rs_collect_tx_data(window, rs_index, tpt,
+		rs_collect_tx_data(curr_tbl, rs_index,
 				   info->status.ampdu_ack_len,
 				   info->status.ampdu_ack_map);
 
@@ -896,19 +895,13 @@ static void rs_tx_status(void *priv_r, struct ieee80211_supported_band *sband,
 			 * table as active/search.
 			 */
 			if (table_type_matches(&tbl_type, curr_tbl))
-				tpt = get_expected_tpt(curr_tbl, rs_index);
+				tmp_tbl = curr_tbl;
 			else if (table_type_matches(&tbl_type, other_tbl))
-				tpt = get_expected_tpt(other_tbl, rs_index);
+				tmp_tbl = other_tbl;
 			else
 				continue;
-
-			/* Constants mean 1 transmission, 0 successes */
-			if (i < retries)
-				rs_collect_tx_data(window, rs_index, tpt, 1,
-						0);
-			else
-				rs_collect_tx_data(window, rs_index, tpt, 1,
-						legacy_success);
+			rs_collect_tx_data(tmp_tbl, rs_index, 1,
+					   i < retries ? 0 : legacy_success);
 		}
 
 		/* Update success/fail counts if not searching for new mode */
diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c
index 112149e..894bcb8 100644
--- a/drivers/net/wireless/iwlwifi/iwl-core.c
+++ b/drivers/net/wireless/iwlwifi/iwl-core.c
@@ -307,10 +307,13 @@ int iwl_hw_nic_init(struct iwl_priv *priv)
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	/* Allocate and init all Tx and Command queues */
-	ret = iwl_txq_ctx_reset(priv);
-	if (ret)
-		return ret;
+	/* Allocate or reset and init all Tx and Command queues */
+	if (!priv->txq) {
+		ret = iwl_txq_ctx_alloc(priv);
+		if (ret)
+			return ret;
+	} else
+		iwl_txq_ctx_reset(priv);
 
 	set_bit(STATUS_INIT, &priv->status);
 
diff --git a/drivers/net/wireless/iwlwifi/iwl-core.h b/drivers/net/wireless/iwlwifi/iwl-core.h
index 4ef7739..732590f 100644
--- a/drivers/net/wireless/iwlwifi/iwl-core.h
+++ b/drivers/net/wireless/iwlwifi/iwl-core.h
@@ -442,7 +442,8 @@ void iwl_rx_csa(struct iwl_priv *priv, struct iwl_rx_mem_buffer *rxb);
 /*****************************************************
 * TX
 ******************************************************/
-int iwl_txq_ctx_reset(struct iwl_priv *priv);
+int iwl_txq_ctx_alloc(struct iwl_priv *priv);
+void iwl_txq_ctx_reset(struct iwl_priv *priv);
 void iwl_hw_txq_free_tfd(struct iwl_priv *priv, struct iwl_tx_queue *txq);
 int iwl_hw_txq_attach_buf_to_tfd(struct iwl_priv *priv,
 				 struct iwl_tx_queue *txq,
@@ -456,6 +457,8 @@ void iwl_free_tfds_in_queue(struct iwl_priv *priv,
 void iwl_txq_update_write_ptr(struct iwl_priv *priv, struct iwl_tx_queue *txq);
 int iwl_tx_queue_init(struct iwl_priv *priv, struct iwl_tx_queue *txq,
 		      int slots_num, u32 txq_id);
+void iwl_tx_queue_reset(struct iwl_priv *priv, struct iwl_tx_queue *txq,
+			int slots_num, u32 txq_id);
 void iwl_tx_queue_free(struct iwl_priv *priv, int txq_id);
 int iwl_tx_agg_start(struct iwl_priv *priv, const u8 *ra, u16 tid, u16 *ssn);
 int iwl_tx_agg_stop(struct iwl_priv *priv , const u8 *ra, u16 tid);
diff --git a/drivers/net/wireless/iwlwifi/iwl-tx.c b/drivers/net/wireless/iwlwifi/iwl-tx.c
index 8c12311..343d81a 100644
--- a/drivers/net/wireless/iwlwifi/iwl-tx.c
+++ b/drivers/net/wireless/iwlwifi/iwl-tx.c
@@ -193,10 +193,34 @@ void iwl_cmd_queue_free(struct iwl_priv *priv)
 	struct iwl_queue *q = &txq->q;
 	struct device *dev = &priv->pci_dev->dev;
 	int i;
+	bool huge = false;
 
 	if (q->n_bd == 0)
 		return;
 
+	for (; q->read_ptr != q->write_ptr;
+	     q->read_ptr = iwl_queue_inc_wrap(q->read_ptr, q->n_bd)) {
+		/* we have no way to tell if it is a huge cmd ATM */
+		i = get_cmd_index(q, q->read_ptr, 0);
+
+		if (txq->meta[i].flags & CMD_SIZE_HUGE) {
+			huge = true;
+			continue;
+		}
+
+		pci_unmap_single(priv->pci_dev,
+				 pci_unmap_addr(&txq->meta[i], mapping),
+				 pci_unmap_len(&txq->meta[i], len),
+				 PCI_DMA_BIDIRECTIONAL);
+	}
+	if (huge) {
+		i = q->n_window;
+		pci_unmap_single(priv->pci_dev,
+				 pci_unmap_addr(&txq->meta[i], mapping),
+				 pci_unmap_len(&txq->meta[i], len),
+				 PCI_DMA_BIDIRECTIONAL);
+	}
+
 	/* De-alloc array of command/tx buffers */
 	for (i = 0; i <= TFD_CMD_SLOTS; i++)
 		kfree(txq->cmd[i]);
@@ -409,6 +433,26 @@ out_free_arrays:
 }
 EXPORT_SYMBOL(iwl_tx_queue_init);
 
+void iwl_tx_queue_reset(struct iwl_priv *priv, struct iwl_tx_queue *txq,
+			int slots_num, u32 txq_id)
+{
+	int actual_slots = slots_num;
+
+	if (txq_id == IWL_CMD_QUEUE_NUM)
+		actual_slots++;
+
+	memset(txq->meta, 0, sizeof(struct iwl_cmd_meta) * actual_slots);
+
+	txq->need_update = 0;
+
+	/* Initialize queue's high/low-water marks, and head/tail indexes */
+	iwl_queue_init(priv, &txq->q, TFD_QUEUE_SIZE_MAX, slots_num, txq_id);
+
+	/* Tell device where to find queue */
+	priv->cfg->ops->lib->txq_init(priv, txq);
+}
+EXPORT_SYMBOL(iwl_tx_queue_reset);
+
 /**
  * iwl_hw_txq_ctx_free - Free TXQ Context
  *
@@ -420,8 +464,7 @@ void iwl_hw_txq_ctx_free(struct iwl_priv *priv)
 
 	/* Tx queues */
 	if (priv->txq) {
-		for (txq_id = 0; txq_id < priv->hw_params.max_txq_num;
-		     txq_id++)
+		for (txq_id = 0; txq_id < priv->hw_params.max_txq_num; txq_id++)
 			if (txq_id == IWL_CMD_QUEUE_NUM)
 				iwl_cmd_queue_free(priv);
 			else
@@ -437,15 +480,15 @@ void iwl_hw_txq_ctx_free(struct iwl_priv *priv)
 EXPORT_SYMBOL(iwl_hw_txq_ctx_free);
 
 /**
- * iwl_txq_ctx_reset - Reset TX queue context
- * Destroys all DMA structures and initialize them again
+ * iwl_txq_ctx_alloc - allocate TX queue context
+ * Allocate all Tx DMA structures and initialize them
  *
  * @param priv
  * @return error code
  */
-int iwl_txq_ctx_reset(struct iwl_priv *priv)
+int iwl_txq_ctx_alloc(struct iwl_priv *priv)
 {
-	int ret = 0;
+	int ret;
 	int txq_id, slots_num;
 	unsigned long flags;
 
@@ -503,8 +546,31 @@ int iwl_txq_ctx_reset(struct iwl_priv *priv)
 	return ret;
 }
 
+void iwl_txq_ctx_reset(struct iwl_priv *priv)
+{
+	int txq_id, slots_num;
+	unsigned long flags;
+
+	spin_lock_irqsave(&priv->lock, flags);
+
+	/* Turn off all Tx DMA fifos */
+	priv->cfg->ops->lib->txq_set_sched(priv, 0);
+
+	/* Tell NIC where to find the "keep warm" buffer */
+	iwl_write_direct32(priv, FH_KW_MEM_ADDR_REG, priv->kw.dma >> 4);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	/* Alloc and init all Tx queues, including the command queue (#4) */
+	for (txq_id = 0; txq_id < priv->hw_params.max_txq_num; txq_id++) {
+		slots_num = txq_id == IWL_CMD_QUEUE_NUM ?
+			    TFD_CMD_SLOTS : TFD_TX_CMD_SLOTS;
+		iwl_tx_queue_reset(priv, &priv->txq[txq_id], slots_num, txq_id);
+	}
+}
+
 /**
- * iwl_txq_ctx_stop - Stop all Tx DMA channels, free Tx queue memory
+ * iwl_txq_ctx_stop - Stop all Tx DMA channels
  */
 void iwl_txq_ctx_stop(struct iwl_priv *priv)
 {
@@ -524,9 +590,6 @@ void iwl_txq_ctx_stop(struct iwl_priv *priv)
 				    1000);
 	}
 	spin_unlock_irqrestore(&priv->lock, flags);
-
-	/* Deallocate memory for all Tx queues */
-	iwl_hw_txq_ctx_free(priv);
 }
 EXPORT_SYMBOL(iwl_txq_ctx_stop);
 
@@ -1049,6 +1112,14 @@ int iwl_enqueue_hcmd(struct iwl_priv *priv, struct iwl_host_cmd *cmd)
 
 	spin_lock_irqsave(&priv->hcmd_lock, flags);
 
+	/* If this is a huge cmd, mark the huge flag also on the meta.flags
+	 * of the _original_ cmd. This is used for DMA mapping clean up.
+	 */
+	if (cmd->flags & CMD_SIZE_HUGE) {
+		idx = get_cmd_index(q, q->write_ptr, 0);
+		txq->meta[idx].flags = CMD_SIZE_HUGE;
+	}
+
 	idx = get_cmd_index(q, q->write_ptr, cmd->flags & CMD_SIZE_HUGE);
 	out_cmd = txq->cmd[idx];
 	out_meta = &txq->meta[idx];
@@ -1226,6 +1297,7 @@ void iwl_tx_cmd_complete(struct iwl_priv *priv, struct iwl_rx_mem_buffer *rxb)
 	bool huge = !!(pkt->hdr.sequence & SEQ_HUGE_FRAME);
 	struct iwl_device_cmd *cmd;
 	struct iwl_cmd_meta *meta;
+	struct iwl_tx_queue *txq = &priv->txq[IWL_CMD_QUEUE_NUM];
 
 	/* If a Tx command is being handled and it isn't in the actual
 	 * command queue then there a command routing bug has been introduced
@@ -1239,9 +1311,17 @@ void iwl_tx_cmd_complete(struct iwl_priv *priv, struct iwl_rx_mem_buffer *rxb)
 		return;
 	}
 
-	cmd_index = get_cmd_index(&priv->txq[IWL_CMD_QUEUE_NUM].q, index, huge);
-	cmd = priv->txq[IWL_CMD_QUEUE_NUM].cmd[cmd_index];
-	meta = &priv->txq[IWL_CMD_QUEUE_NUM].meta[cmd_index];
+	/* If this is a huge cmd, clear the huge flag on the meta.flags
+	 * of the _original_ cmd. So that iwl_cmd_queue_free won't unmap
+	 * the DMA buffer for the scan (huge) command.
+	 */
+	if (huge) {
+		cmd_index = get_cmd_index(&txq->q, index, 0);
+		txq->meta[cmd_index].flags = 0;
+	}
+	cmd_index = get_cmd_index(&txq->q, index, huge);
+	cmd = txq->cmd[cmd_index];
+	meta = &txq->meta[cmd_index];
 
 	pci_unmap_single(priv->pci_dev,
 			 pci_unmap_addr(meta, mapping),
@@ -1263,6 +1343,7 @@ void iwl_tx_cmd_complete(struct iwl_priv *priv, struct iwl_rx_mem_buffer *rxb)
 			       get_cmd_string(cmd->hdr.cmd));
 		wake_up_interruptible(&priv->wait_command_queue);
 	}
+	meta->flags = 0;
 }
 EXPORT_SYMBOL(iwl_tx_cmd_complete);
 
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 06c33b6..b887e48 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -225,11 +225,11 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
 			switch (sdata->vif.type) {
 			case NL80211_IFTYPE_AP:
 				sdata->vif.bss_conf.enable_beacon =
-					!!rcu_dereference(sdata->u.ap.beacon);
+					!!sdata->u.ap.beacon;
 				break;
 			case NL80211_IFTYPE_ADHOC:
 				sdata->vif.bss_conf.enable_beacon =
-					!!rcu_dereference(sdata->u.ibss.presp);
+					!!sdata->u.ibss.presp;
 				break;
 			case NL80211_IFTYPE_MESH_POINT:
 				sdata->vif.bss_conf.enable_beacon = true;
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 61080c5..7a6bebc 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -749,9 +749,6 @@ ieee80211_mesh_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb)
 
 	switch (fc & IEEE80211_FCTL_STYPE) {
 	case IEEE80211_STYPE_ACTION:
-		if (skb->len < IEEE80211_MIN_ACTION_SIZE)
-			return RX_DROP_MONITOR;
-		/* fall through */
 	case IEEE80211_STYPE_PROBE_RESP:
 	case IEEE80211_STYPE_BEACON:
 		skb_queue_tail(&ifmsh->skb_queue, skb);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index b5c48de..13fcd2d 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -1973,6 +1973,11 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
 			goto handled;
 		}
 		break;
+	case MESH_PLINK_CATEGORY:
+	case MESH_PATH_SEL_CATEGORY:
+		if (ieee80211_vif_is_mesh(&sdata->vif))
+			return ieee80211_mesh_rx_mgmt(sdata, rx->skb);
+		break;
 	}
 
 	/*
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 56422d8..fb12cec 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -93,12 +93,18 @@ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta;
 
-	sta = rcu_dereference(local->sta_hash[STA_HASH(addr)]);
+	sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)],
+				    rcu_read_lock_held() ||
+				    lockdep_is_held(&local->sta_lock) ||
+				    lockdep_is_held(&local->sta_mtx));
 	while (sta) {
 		if (sta->sdata == sdata &&
 		    memcmp(sta->sta.addr, addr, ETH_ALEN) == 0)
 			break;
-		sta = rcu_dereference(sta->hnext);
+		sta = rcu_dereference_check(sta->hnext,
+					    rcu_read_lock_held() ||
+					    lockdep_is_held(&local->sta_lock) ||
+					    lockdep_is_held(&local->sta_mtx));
 	}
 	return sta;
 }
@@ -113,13 +119,19 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata,
 	struct ieee80211_local *local = sdata->local;
 	struct sta_info *sta;
 
-	sta = rcu_dereference(local->sta_hash[STA_HASH(addr)]);
+	sta = rcu_dereference_check(local->sta_hash[STA_HASH(addr)],
+				    rcu_read_lock_held() ||
+				    lockdep_is_held(&local->sta_lock) ||
+				    lockdep_is_held(&local->sta_mtx));
 	while (sta) {
 		if ((sta->sdata == sdata ||
 		     sta->sdata->bss == sdata->bss) &&
 		    memcmp(sta->sta.addr, addr, ETH_ALEN) == 0)
 			break;
-		sta = rcu_dereference(sta->hnext);
+		sta = rcu_dereference_check(sta->hnext,
+					    rcu_read_lock_held() ||
+					    lockdep_is_held(&local->sta_lock) ||
+					    lockdep_is_held(&local->sta_mtx));
 	}
 	return sta;
 }
-- 
John W. Linville		Someday the world will need a hero, and you
linville-2XuSBdqkA4R54TAoqtyWWQ@public.gmane.org			might be all we have.  Be ready.
--
To unsubscribe from this list: send the line "unsubscribe linux-wireless" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Pekka Enberg @ 2010-04-07 18:25 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Zhang, Yanmin, Eric Dumazet, netdev, Tejun Heo, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <alpine.DEB.2.00.1004071319320.16159@router.home>

Christoph Lameter wrote:
> On Wed, 7 Apr 2010, Pekka Enberg wrote:
> 
>> Oh, sorry, I think it's actually '____cacheline_aligned_in_smp' (with four
>> underscores) for per-cpu data. Confusing...
> 
> This does not particularly help to clarify the situation since we are
> dealing with data that can either be allocated via the percpu allocator or
> be statically present (kmalloc bootstrap situation).

Yes, I am an idiot. :-)

^ permalink raw reply

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Christoph Lameter @ 2010-04-07 18:20 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Zhang, Yanmin, Eric Dumazet, netdev, Tejun Heo, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <4BBCB868.2000705@cs.helsinki.fi>

On Wed, 7 Apr 2010, Pekka Enberg wrote:

> Oh, sorry, I think it's actually '____cacheline_aligned_in_smp' (with four
> underscores) for per-cpu data. Confusing...

This does not particularly help to clarify the situation since we are
dealing with data that can either be allocated via the percpu allocator or
be statically present (kmalloc bootstrap situation).

^ permalink raw reply

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Christoph Lameter @ 2010-04-07 18:18 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Zhang, Yanmin, Eric Dumazet, netdev, Tejun Heo, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <4BBCB7B7.4040901@cs.helsinki.fi>

On Wed, 7 Apr 2010, Pekka Enberg wrote:

> Christoph Lameter wrote:
> > I wonder if this is not related to the kmem_cache_cpu structure straddling
> > cache line boundaries under some conditions. On 2.6.33 the kmem_cache_cpu
> > structure was larger and therefore tight packing resulted in different
> > alignment.
> >
> > Could you see how the following patch affects the results. It attempts to
> > increase the size of kmem_cache_cpu to a power of 2 bytes. There is also
> > the potential that other per cpu fetches to neighboring objects affect the
> > situation. We could cacheline align the whole thing.
> >
> > ---
> >  include/linux/slub_def.h |    5 +++++
> >  1 file changed, 5 insertions(+)
> >
> > Index: linux-2.6/include/linux/slub_def.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/slub_def.h	2010-04-07 11:33:50.000000000
> > -0500
> > +++ linux-2.6/include/linux/slub_def.h	2010-04-07 11:35:18.000000000
> > -0500
> > @@ -38,6 +38,11 @@ struct kmem_cache_cpu {
> >  	void **freelist;	/* Pointer to first free per cpu object */
> >  	struct page *page;	/* The slab from which we are allocating */
> >  	int node;		/* The node of the page (or -1 for debug) */
> > +#ifndef CONFIG_64BIT
> > +	int dummy1;
> > +#endif
> > +	unsigned long dummy2;
> > +
> >  #ifdef CONFIG_SLUB_STATS
> >  	unsigned stat[NR_SLUB_STAT_ITEMS];
> >  #endif
>
> Would __cacheline_aligned_in_smp do the trick here?

This is allocated via the percpu allocator. We could specify cacheline
alignment there but that would reduce the density. You basically need 4
words for a kmem_cache_cpu structure. A number of those fit into one 64
byte cacheline.

^ permalink raw reply

* Re: [PATCH v3] Add Mergeable receive buffer support to vhost_net
From: Michael S. Tsirkin @ 2010-04-07 18:09 UTC (permalink / raw)
  To: David Stevens; +Cc: kvm, kvm-owner, netdev, rusty, virtualization
In-Reply-To: <OFC2D0B87C.35124B1B-ON882576FE.005ADDB2-882576FE.0060CB7B@us.ibm.com>

On Wed, Apr 07, 2010 at 10:37:17AM -0700, David Stevens wrote:
> > 
> > Thanks!
> > There's some whitespace damage, are you sending with your new
> > sendmail setup? It seems to have worked for qemu patches ...
> 
>         Yes, I saw some line wraps in what I received, but I checked
> the original draft to be sure and they weren't there. Possibly from
> the relay; Sigh.
> 
> 
> > > @@ -167,8 +166,15 @@ static void handle_tx(struct vhost_net *
> > >        /* TODO: Check specific error and bomb out unless ENOBUFS? */
> > >        err = sock->ops->sendmsg(NULL, sock, &msg, len);
> > >        if (unlikely(err < 0)) {
> > > -         vhost_discard_vq_desc(vq);
> > > -         tx_poll_start(net, sock);
> > > +         if (err == -EAGAIN) {
> > > +            vhost_discard_desc(vq, 1);
> > > +            tx_poll_start(net, sock);
> > > +         } else {
> > > +            vq_err(vq, "sendmsg: errno %d\n", -err);
> > > +            /* drop packet; do not discard/resend */
> > > +            vhost_add_used_and_signal(&net->dev, vq, head,
> > > +                       0);
> > 
> > vhost does not currently have a consistent error handling strategy:
> > if we drop packets, need to think which other errors should cause
> > packet drops.  I prefer to just call vq_err for now,
> > and have us look at handling segfaults etc in a consistent way
> > separately.
> 
> I had to add this to avoid an infinite loop when I wrote a bad
> packet on the socket. I agree error handling needs a better look,
> but retrying a bad packet continuously while dumping in the log
> is what it was doing when I hit an error before this code. Isn't
> this better, at least until a second look?
> 

Hmm, what do you mean 'continuously'? Don't we only try again
on next kick?

> > > +}
> > > +
> > 
> > I wonder whether it makes sense to check
> > skb_queue_empty(&sk->sk_receive_queue)
> > outside the lock, to reduce the cost of this call
> > on an empty queue (we know that it happens at least once
> > each time we exit the loop on rx)?
> 
>         I was looking at alternatives to adding the lock in the
> first place, but I found I couldn't measure a difference in the
> cost with and without the lock.
> 
> 
> > > +   int retries = 0;
> > >     size_t len, total_len = 0;
> > > -   int err;
> > > +   int err, headcount, datalen;
> > >     size_t hdr_size;
> > >     struct socket *sock = rcu_dereference(vq->private_data);
> > >     if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> > > @@ -222,31 +242,25 @@ static void handle_rx(struct vhost_net *
> > >     vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> > >        vq->log : NULL;
> > > 
> > > -   for (;;) {
> > > -      head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > > -                ARRAY_SIZE(vq->iov),
> > > -                &out, &in,
> > > -                vq_log, &log);
> > > +   while ((datalen = vhost_head_len(sock->sk))) {
> > > +      headcount = vhost_get_desc_n(vq, vq->heads, datalen, &in,
> > > +                    vq_log, &log);
> > 
> > This looks like a bug, I think we need to pass
> > datalen + header size to vhost_get_desc_n.
> > Not sure how we know the header size that backend will use though.
> > Maybe just look at our features.
> 
>         Yes; we have hdr_size, so I can add it here. It'll be 0 for
> the cases where the backend and guest both have vnet header (either
> the regular or larger mergeable buffers one), but should be added
> in for future raw socket support.

So hdr_size is the wrong thing to add then.
We need to add non-zero value for tap now.

> > 
> > >        /* OK, now we need to know about added descriptors. */
> > > -      if (head == vq->num) {
> > > -         if (unlikely(vhost_enable_notify(vq))) {
> > > +      if (!headcount) {
> > > +         if (retries == 0 && unlikely(vhost_enable_notify(vq))) {
> > >              /* They have slipped one in as we were
> > >               * doing that: check again. */
> > >              vhost_disable_notify(vq);
> > > +            retries++;
> > >              continue;
> > >           }
> > 
> > Hmm. The reason we have the code at all, as the comment says, is because
> > guest could have added more buffers between the time we read last index
> > and the time we enabled notification. So if we just break like this
> > the race still exists. We could remember the
> > last head value we observed, and have vhost_enable_notify check
> > against this value?
> 
>         This is to prevent a spin loop in the case where we have some
> buffers available, but not enough for the current packet (ie, this
> is the replacement code for the "rxmaxheadcount" business). If they
> actually added something new, retrying once should see it, but what
> vhost_enable_notify() returns non-zero on is not "new buffers" but
> rather "not empty". In the case mentioned, we aren't empty, so
> vhost_enable_notify() returns nonzero every time, but the guest hasn't
> given us enough buffers to proceed, so we continuously retry; this
> code breaks the spin loop until we've really got new buffers from
> the guest.

What about the race I point out above? It seems to potentially
cause a deadlock.

> > 
> > Need to think about it.
> > 
> > Another concern here is that on retries vhost_get_desc_n
> > is doing extra work, rescanning the same descriptor
> > again and again. Not sure how common this is, might be
> > worthwhile to add a TODO to consider this at least.
> 
>         I had a printk in there to test the code and with the
> retries counter, it happens when we fill the ring (once,
> because of the retries checks), and then proceeds as
> desired when the guest gives us more buffers. Without the
> check, it spews until we hear from the guest.

I don't understand whether what you write above means 'yes this happens
and is worth optimizing for' or 'this happens very rarely
and is not worth optimizing for'.

> > > @@ -261,14 +275,33 @@ static void handle_rx(struct vhost_net *
> > >                  len, MSG_DONTWAIT | MSG_TRUNC);
> > >        /* TODO: Check specific error and bomb out unless EAGAIN? */
> > >        if (err < 0) {
> > 
> > I think we need to compare err and datalen and drop packet on mismatch 
> as well.
> > The check err > len won't be needed then.
> 
>         OK, but this is the original code, unchanged by my patch. :-)

Right. original code does not know the datalen.

> > > -         vhost_discard_vq_desc(vq);
> > > +         vhost_discard_desc(vq, headcount);
> > >           break;
> > >        }
> > >        /* TODO: Should check and handle checksum. */
> > > +      if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
> > > +         struct virtio_net_hdr_mrg_rxbuf *vhdr =
> > > +            (struct virtio_net_hdr_mrg_rxbuf *)
> > > +            vq->iov[0].iov_base;
> > > +         /* add num_buffers */
> > > +         if (vhost_has_feature(&net->dev,
> > > +                     VHOST_NET_F_VIRTIO_NET_HDR))
> > > +            hdr.num_buffers = headcount;
> > 
> > Why is the above necessary?
> 
>         This adds mergeable buffer support for the raw case.

So my suggestion is to have a copy of the header
and then same code will fill in the field for
both raw and tap.

> > 
> > > +         else if (vq->iov[0].iov_len < sizeof(*vhdr)) {
> > 
> > I think length check is already done when we copy the header. No?
> 
>         This sets num_buffers for the tap case (where hdr_size is 0).

Ah, right. So let's just check the total length is > sizeof(*vhdr).

> We don't copy any headers at all for this case, but we need to
> add num_buffers at offset 10 in the user buffer already read by
> recvmsg().
> 
> > 
> > > +            vq_err(vq, "tiny buffers < %d unsupported",
> > > +               vq->iov[0].iov_len);
> > > +            vhost_discard_desc(vq, headcount);
> > > +            break;
> > 
> > Problem here is that recvmsg might modify iov.
> > This is why I think we need to use vq->hdr here.
> 
>         recvmsg() can never modify the iovec;


Look at af_packet code for an example where it does.
The pointer is non-const, it's OK to modify.
tap used to modify iovec as well, the fact it doesn't
now is a side-effect of reusing same code for
recvmsg and aio read.

> it's the standard socket call.

It's an internal API that is used to implement the call.
iovec it gets is a kernel side copy of what user passed in.

> To use vq->hdr, we'd have to copy
> it in from user space, modify it, and then copy it back
> in to user space, on every packet. In the normal tap case,
> hdr_size is 0 and we read it directly from the socket to
> user space. It is already correct for mergeable buffers,
> except for the num_buffers value added here.


I don't understand what you write above, sorry.
We have iov, all we need is store the first address+length
in the hdr field.
This should be close to free and does not involve any copies
from/to userspace. All the branching and special-casing
is possibly more expensive.


> > 
> > > +         } else if (put_user(headcount, &vhdr->num_buffers)) {
> > 
> > The above put_user writes out a 32 bit value, right?
> > This seems wrong.
> 
>         I'll recheck this -- I'll make the type of "headcount" be
> the type of "num_buffers" in the MRXB vnet header, if it isn't
> already.

No, I was confused. Sorry about the noise.

> > 
> > How about using
> >    memcpy_toiovecend(vq->hdr, &headcount,
> >            offsetof(struct virtio_net_hdr_mrg_rxbuf, num_buffers),
> >            sizeof headcount);
> > 
> > this way we also do not make any assumptions about layout.
> 
>         Because this overwrites the valid vnet header we got from
> the tap socket with a local copy that isn't correct.

It does? I think that this only writes out 2 bytes at an offset.

> For this to
> work, we first need to copy_from_user() the vnet header from the
> socket into vq->hdr (on every packet).

copy_from_user from vnet header to vq->hdr does not
seem to make any sense and is not what I suggested.

>         It doesn't assume anything here-- it requires (and checks)
> that the user didn't give us a <12 byte buffer, which I think is
> reasonable.
> That's about the size of the descriptor for the buffer,
> and I'd call that a broken guest.

I think it's better not to assume this. virtio spec does
mention layout assumptions as legacy limitations.
Guest can post descriptor consisting of 2 buffers:
1. 10 bytes. 2. 2 bytes. and I don't see a reason
not to support this unless this makes code simpler.
In this case code is more complex.


> The cost of supporting those
> tiny buffers in the general case would be a copy_from_user() and
> copy_to_user() of the vnet_hdr on every packet, which I think is
> not worth it (and especially since I don't expect any guest is
> ever going to give us a <12 byte buffer in the first place).

You keep saying this, but I do not see any need for extra
copy_to_user. Just use memcpy_toiovecend instead of put_user.


> > > @@ -560,9 +593,14 @@ done:
> > > 
> > >  static int vhost_net_set_features(struct vhost_net *n, u64 features)
> > >  {
> > > -   size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
> > > -      sizeof(struct virtio_net_hdr) : 0;
> > > +   size_t hdr_size = 0;
> > >     int i;
> > > +
> > > +   if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
> > > +      hdr_size = sizeof(struct virtio_net_hdr);
> > > +      if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> > > +         hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> > 
> > My personal style for this would be:
> >      if (!(features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)))
> >       hdr_size = 0
> >    else if (!(features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
> >       hdr_size = sizeof(virtio_net_hdr);
> >    else
> >       hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> > 
> > which results in more symmetry and less nesting.
> 
> OK.
> 
> > 
> > >     mutex_lock(&n->dev.mutex);
> > >     if ((features & (1 << VHOST_F_LOG_ALL)) &&
> > >         !vhost_log_access_ok(&n->dev)) {
> > > diff -ruNp net-next-p0/drivers/vhost/vhost.c
> > > net-next-v3/drivers/vhost/vhost.c
> > > --- net-next-p0/drivers/vhost/vhost.c   2010-03-22 12:04:38.000000000
> > > -0700
> > > +++ net-next-v3/drivers/vhost/vhost.c   2010-04-06 12:57:51.000000000
> > > -0700
> > > @@ -856,6 +856,47 @@ static unsigned get_indirect(struct vhos
> > >     return 0;
> > >  }
> > > 
> > > +/* This is a multi-buffer version of vhost_get_vq_desc
> > > + * @vq      - the relevant virtqueue
> > > + * datalen   - data length we'll be reading
> > > + * @iovcount   - returned count of io vectors we fill
> > > + * @log      - vhost log
> > > + * @log_num   - log offset
> > > + *   returns number of buffer heads allocated, 0 on error
> > 
> > This is unusual. Let's return a negative error code on error.
> 
>         This is like malloc -- "0" is never a valid return value, and
> we don't have actual error values to return and handle; they generate
> vq_err() messages within the code itself. In all cases,
> it is the count of buffers we allocated (which is 0 when we fail to
> allocate). So, I don't agree with this, but can do it if you insist.

malloc returns NULL, not zero.
standard error handling approaches are:
- NULL on error
- ERR_PTR on error
- >= 0 for success, -errno for error
- true for success false for error

And of course
- some custom strategy so you need to read documentation to figure out
  how to use a function
which is where this one gets classified.
So I am not saying 0 won't work, just that standard stuff is better.

> > 
> > > + */
> > > +int vhost_get_desc_n(struct vhost_virtqueue *vq, struct 
> vring_used_elem
> > > *heads,
> > > +           int datalen, int *iovcount, struct vhost_log *log,
> > > +           unsigned int *log_num)
> > > +{
> > > +   int out, in;
> > > +   int seg = 0;      /* iov index */
> > > +   int hc = 0;      /* head count */
> > > +
> > > +   while (datalen > 0) {
> > > +      if (hc >= VHOST_NET_MAX_SG)
> > > +         goto err;
> > > +      heads[hc].id = vhost_get_desc(vq->dev, vq, vq->iov+seg,
> > > +                     ARRAY_SIZE(vq->iov)-seg, &out,
> > > +                     &in, log, log_num);
> > > +      if (heads[hc].id == vq->num)
> > > +         goto err;
> > > +      if (out || in <= 0) {
> > > +         vq_err(vq, "unexpected descriptor format for RX: "
> > > +            "out %d, in %d\n", out, in);
> > > +         goto err;
> > > +      }
> > > +      heads[hc].len = iov_length(vq->iov+seg, in);
> > > +      datalen -= heads[hc].len;
> > 
> > > This signed/unsigned mix makes me nervous.
> > Let's make datalen unsigned, add unsigned total_len, and
> > while (datalen < total_len).
> > 
> > > +      hc++;
> > > +      seg += in;
> > > +   }
> > > +   *iovcount = seg;
> > > +   return hc;
> > > +err:
> > > +   vhost_discard_desc(vq, hc);
> > > +   return 0;
> > > +}
> > > +
> > >  /* This looks in the virtqueue and for the first available buffer, 
> and
> > > converts
> > >   * it to an iovec for convenient access.  Since descriptors consist 
> of
> > > some
> > >   * number of output then some number of input descriptors, it's
> > > actually two
> > > @@ -863,7 +904,7 @@ static unsigned get_indirect(struct vhos
> > >   *
> > >   * This function returns the descriptor number found, or vq->num 
> (which
> > >   * is never a valid descriptor number) if none was found. */
> > > -unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct
> > > vhost_virtqueue *vq,
> > > +unsigned vhost_get_desc(struct vhost_dev *dev, struct vhost_virtqueue
> > > *vq,
> > >              struct iovec iov[], unsigned int iov_size,
> > >              unsigned int *out_num, unsigned int *in_num,
> > >              struct vhost_log *log, unsigned int *log_num)
> > > @@ -981,31 +1022,42 @@ unsigned vhost_get_vq_desc(struct vhost_
> > >  }
> > > 
> > >  /* Reverse the effect of vhost_get_vq_desc. Useful for error 
> handling.
> > > */
> > > -void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
> > > +void vhost_discard_desc(struct vhost_virtqueue *vq, int n)
> > >  {
> > > -   vq->last_avail_idx--;
> > > +   vq->last_avail_idx -= n;
> > >  }
> > > 
> > >  /* After we've used one of their buffers, we tell them about it. 
> We'll
> > > then
> > >   * want to notify the guest, using eventfd. */
> > > -int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int
> > > len)
> > > +int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem
> > > *heads,
> > > +         int count)
> > 
> > I think we are better off with vhost_add_used and vhost_add_used_n:
> > the version with _n has a lot of extra complexity, and tx always
> > adds them 1 by one.
> 
>         The external code only calls vhost_add_used_and_signal(), so I
> split that. I can add a single variant of vhost_add_used() too, but
> I was trying to avoid adding duplicate code and external interface.
> I don't actually agree with splitting these; isn't it better to have
> them going through the same code path whether it's single or multiple?
> A bug in one would show up as an intermittent failure, and a change
> to one requires changing both.

It's a small function so I am not really worried about maintenance.

> I don't think the multiple should just
> do the single repeatedly, either, since the multiple variant now can
> do the work in one or two copy_to_user()'s, without a loop;

Yes.

> so splitting
> these just makes a special case for single to carry in code maintenance
> with no benefit, I think.

I donnu, I have a feeling all these loops and memory accesses
on data path won't be free, and for tx this just adds overhead.

> > 
> > >  {
> > >     struct vring_used_elem *used;
> > > +   int start, n;
> > > +
> > > +   if (count <= 0)
> > > +      return -EINVAL;
> > > 
> > > -   /* The virtqueue contains a ring of used buffers.  Get a pointer 
> to
> > > the
> > > -    * next entry in that used ring. */
> > > -   used = &vq->used->ring[vq->last_used_idx % vq->num];
> > > -   if (put_user(head, &used->id)) {
> > > -      vq_err(vq, "Failed to write used id");
> > > +   start = vq->last_used_idx % vq->num;
> > > +   if (vq->num - start < count)
> > > +      n = vq->num - start;
> > > +   else
> > > +      n = count;
> > 
> > use min?
> 
>         Sure.
> 
> > 
> > > +   used = vq->used->ring + start;
> > > +   if (copy_to_user(used, heads, sizeof(heads[0])*n)) {
> > > +      vq_err(vq, "Failed to write used");
> > >        return -EFAULT;
> > >     }
> > > -   if (put_user(len, &used->len)) {
> > > -      vq_err(vq, "Failed to write used len");
> > > -      return -EFAULT;
> > > +   if (n < count) {   /* wrapped the ring */
> > > +      used = vq->used->ring;
> > > +      if (copy_to_user(used, heads+n, sizeof(heads[0])*(count-n))) {
> > > +         vq_err(vq, "Failed to write used");
> > > +         return -EFAULT;
> > > +      }
> > >     }
> > >     /* Make sure buffer is written before we update index. */
> > >     smp_wmb();
> > > -   if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
> > > +   if (put_user(vq->last_used_idx+count, &vq->used->idx)) {
> > 
> > I am a bit confused ... will this write a 32 or 16 bit value?
> > count is 32 bit ... Maybe we are better off with
> >   u16 idx = vq->last_used_idx + count
> >   put_user(idx, &vq->used->idx)
> >   vq->last_used_idx = idx
> 
>         How about a cast?:
> 
>         if (put_user((u16)(vq->last_used_idx+count), &vq->used->idx)) {
> 
> 
>                                         +-DLS

No, I was confused. Code is ok (just whitespace damaged).


^ permalink raw reply

* Re: mmotm 2010-04-05-16-09 uploaded
From: Valdis.Kletnieks @ 2010-04-07 18:01 UTC (permalink / raw)
  To: Andrew Morton, Peter Zijlstra, Ingo Molnar, Patrick McHardy,
	David S. Miller
  Cc: linux-kernel, netfilter-devel, netdev
In-Reply-To: <201004052336.o35NaeSE015814@imap1.linux-foundation.org>

[-- Attachment #1: Type: text/plain, Size: 1946 bytes --]

On Mon, 05 Apr 2010 16:09:45 PDT, akpm@linux-foundation.org said:
> The mm-of-the-moment snapshot 2010-04-05-16-09 has been uploaded to
> 
>    http://userweb.kernel.org/~akpm/mmotm/

Seen in dmesg, 2.6.34-rc2-mmotm0323 didn't do this. Tossing it at all the
likely suspects, hopefully somebody will recognize it and save me the
bisecting. ;)

[   11.488535] ctnetlink v0.93: registering with nfnetlink.
[   11.488579] 
[   11.488579] ===================================================
[   11.489529] [ INFO: suspicious rcu_dereference_check() usage. ]
[   11.489988] ---------------------------------------------------
[   11.490494] net/netfilter/nf_conntrack_ecache.c:88 invoked rcu_dereference_check() without protection!
[   11.491024] 
[   11.491024] other info that might help us debug this:
[   11.491025] 
[   11.492834] 
[   11.492835] rcu_scheduler_active = 1, debug_locks = 0
[   11.494124] 1 lock held by swapper/1:
[   11.494776]  #0:  (nf_ct_ecache_mutex){+.+...}, at: [<ffffffff8148c606>] nf_conntrack_register_notifier+0x1a/0x76
[   11.495505] 
[   11.495506] stack backtrace:
[   11.496927] Pid: 1, comm: swapper Not tainted 2.6.34-rc3-mmotm0405 #1
[   11.497668] Call Trace:
[   11.498419]  [<ffffffff810638c5>] lockdep_rcu_dereference+0xaa/0xb2
[   11.499185]  [<ffffffff8148c629>] nf_conntrack_register_notifier+0x3d/0x76
[   11.499967]  [<ffffffff81b5dd7b>] ctnetlink_init+0x71/0xd5
[   11.500775]  [<ffffffff81b5dd0a>] ? ctnetlink_init+0x0/0xd5
[   11.501579]  [<ffffffff810001ef>] do_one_initcall+0x59/0x14e
[   11.502396]  [<ffffffff81b3268a>] kernel_init+0x144/0x1ce
[   11.503218]  [<ffffffff81003514>] kernel_thread_helper+0x4/0x10
[   11.504047]  [<ffffffff8159cc00>] ? restore_args+0x0/0x30
[   11.504882]  [<ffffffff81b32546>] ? kernel_init+0x0/0x1ce
[   11.505715]  [<ffffffff81003510>] ? kernel_thread_helper+0x0/0x10
[   11.506636] ip_tables: (C) 2000-2006 Netfilter Core Team
[   11.507372] TCP bic registered


[-- Attachment #2: Type: application/pgp-signature, Size: 227 bytes --]

^ permalink raw reply

* Re: linux-next: Tree for April 7 (net/core/dev_addr_lists)
From: Randy Dunlap @ 2010-04-07 17:50 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, Stephen Rothwell, netdev, linux-next, LKML
In-Reply-To: <1270661889.8141.45.camel@edumazet-laptop>

On Wed, 07 Apr 2010 19:38:09 +0200 Eric Dumazet wrote:

> [PATCH net-next-2.6] net: include linux/proc_fs.h in dev_addr_lists.c
> 
> As pointed out by Randy Dunlap, we must include linux/proc_fs.h in
> net/core/dev_addr_lists.c, regardless of CONFIG_PROC_FS
> 
> Reported-by: Randy Dunlap <randy.dunlap@oracle.com>, 
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>

Acked-by: Randy Dunlap <randy.dunlap@oracle.com>

Thanks, Eric.


> ---
> diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
> index 37d5975..508f9c1 100644
> --- a/net/core/dev_addr_lists.c
> +++ b/net/core/dev_addr_lists.c
> @@ -14,6 +14,7 @@
>  #include <linux/netdevice.h>
>  #include <linux/rtnetlink.h>
>  #include <linux/list.h>
> +#include <linux/proc_fs.h>
>  
>  /*
>   * General list handling functions
> @@ -667,7 +668,6 @@ void dev_mc_init(struct net_device *dev)
>  EXPORT_SYMBOL(dev_mc_init);
>  
>  #ifdef CONFIG_PROC_FS
> -#include <linux/proc_fs.h>
>  #include <linux/seq_file.h>
>  
>  static int dev_mc_seq_show(struct seq_file *seq, void *v)
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


---
~Randy

^ permalink raw reply

* [GIT PULL] vhost-net fix for 2.6.34-rc3
From: Michael S. Tsirkin @ 2010-04-07 17:35 UTC (permalink / raw)
  To: David Miller; +Cc: kvm, virtualization, netdev, linux-kernel, Jeff Dike
In-Reply-To: <20100318095355.GA24194@redhat.com>

David,
The following tree includes a patch fixing an issue with vhost-net in
2.6.34-rc3.  Please pull for 2.6.34.

Thanks!

The following changes since commit 2eaa9cfdf33b8d7fb7aff27792192e0019ae8fc6:

  Linux 2.6.34-rc3 (2010-03-30 09:24:39 -0700)

are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git vhost

Jeff Dike (1):
      vhost-net: fix vq_memory_access_ok error checking

 drivers/vhost/vhost.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

^ permalink raw reply

* Re: linux-next: Tree for April 7 (net/core/dev_addr_lists)
From: Eric Dumazet @ 2010-04-07 17:38 UTC (permalink / raw)
  To: Randy Dunlap, David Miller; +Cc: Stephen Rothwell, netdev, linux-next, LKML
In-Reply-To: <20100407095845.f29ebf48.randy.dunlap@oracle.com>

Le mercredi 07 avril 2010 à 09:58 -0700, Randy Dunlap a écrit :
> On Wed, 7 Apr 2010 17:41:23 +1000 Stephen Rothwell wrote:
> 
> > Hi all,
> > 
> > Changes since 20100401:
> 
> 
> when CONFIG_PROC_FS is disabled:
> 
> linux-next-20100407/net/core/dev_addr_lists.c: In function 'dev_mc_net_init':
> linux-next-20100407/net/core/dev_addr_lists.c:722: error: implicit declaration of function 'proc_net_fops_create'
> linux-next-20100407/net/core/dev_addr_lists.c:722: error: 'dev_mc_seq_fops' undeclared (first use in this function)
> linux-next-20100407/net/core/dev_addr_lists.c:722: error: (Each undeclared identifier is reported only once
> linux-next-20100407/net/core/dev_addr_lists.c:722: error: for each function it appears in.)
> linux-next-20100407/net/core/dev_addr_lists.c: In function 'dev_mc_net_exit':
> linux-next-20100407/net/core/dev_addr_lists.c:729: error: implicit declaration of function 'proc_net_remove'
> make[3]: *** [net/core/dev_addr_lists.o] Error 1
> 
> ---
> ~Randy

Thanks Randy, this should correct this

[PATCH net-next-2.6] net: include linux/proc_fs.h in dev_addr_lists.c

As pointed out by Randy Dunlap, we must include linux/proc_fs.h in
net/core/dev_addr_lists.c, regardless of CONFIG_PROC_FS

Reported-by: Randy Dunlap <randy.dunlap@oracle.com>, 
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 37d5975..508f9c1 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -14,6 +14,7 @@
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/list.h>
+#include <linux/proc_fs.h>
 
 /*
  * General list handling functions
@@ -667,7 +668,6 @@ void dev_mc_init(struct net_device *dev)
 EXPORT_SYMBOL(dev_mc_init);
 
 #ifdef CONFIG_PROC_FS
-#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
 static int dev_mc_seq_show(struct seq_file *seq, void *v)

^ permalink raw reply related

* Re: [PATCH v3] Add Mergeable receive buffer support to vhost_net
From: David Stevens @ 2010-04-07 17:37 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm, kvm-owner, netdev, rusty, virtualization
In-Reply-To: <20100407105910.GD9550@redhat.com>

> 
> Thanks!
> There's some whitespace damage, are you sending with your new
> sendmail setup? It seems to have worked for qemu patches ...

        Yes, I saw some line wraps in what I received, but I checked
the original draft to be sure and they weren't there. Possibly from
the relay; Sigh.


> > @@ -167,8 +166,15 @@ static void handle_tx(struct vhost_net *
> >        /* TODO: Check specific error and bomb out unless ENOBUFS? */
> >        err = sock->ops->sendmsg(NULL, sock, &msg, len);
> >        if (unlikely(err < 0)) {
> > -         vhost_discard_vq_desc(vq);
> > -         tx_poll_start(net, sock);
> > +         if (err == -EAGAIN) {
> > +            vhost_discard_desc(vq, 1);
> > +            tx_poll_start(net, sock);
> > +         } else {
> > +            vq_err(vq, "sendmsg: errno %d\n", -err);
> > +            /* drop packet; do not discard/resend */
> > +            vhost_add_used_and_signal(&net->dev, vq, head,
> > +                       0);
> 
> vhost does not currently have a consistent error handling strategy:
> if we drop packets, need to think which other errors should cause
> packet drops.  I prefer to just call vq_err for now,
> and have us look at handling segfaults etc in a consistent way
> separately.

I had to add this to avoid an infinite loop when I wrote a bad
packet on the socket. I agree error handling needs a better look,
but retrying a bad packet continuously while dumping in the log
is what it was doing when I hit an error before this code. Isn't
this better, at least until a second look?


> > +}
> > +
> 
> I wonder whether it makes sense to check
> skb_queue_empty(&sk->sk_receive_queue)
> outside the lock, to reduce the cost of this call
> on an empty queue (we know that it happens at least once
> each time we exit the loop on rx)?

        I was looking at alternatives to adding the lock in the
first place, but I found I couldn't measure a difference in the
cost with and without the lock.


> > +   int retries = 0;
> >     size_t len, total_len = 0;
> > -   int err;
> > +   int err, headcount, datalen;
> >     size_t hdr_size;
> >     struct socket *sock = rcu_dereference(vq->private_data);
> >     if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> > @@ -222,31 +242,25 @@ static void handle_rx(struct vhost_net *
> >     vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
> >        vq->log : NULL;
> > 
> > -   for (;;) {
> > -      head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
> > -                ARRAY_SIZE(vq->iov),
> > -                &out, &in,
> > -                vq_log, &log);
> > +   while ((datalen = vhost_head_len(sock->sk))) {
> > +      headcount = vhost_get_desc_n(vq, vq->heads, datalen, &in,
> > +                    vq_log, &log);
> 
> This looks like a bug, I think we need to pass
> datalen + header size to vhost_get_desc_n.
> Not sure how we know the header size that backend will use though.
> Maybe just look at our features.

        Yes; we have hdr_size, so I can add it here. It'll be 0 for
the cases where the backend and guest both have vnet header (either
the regular or larger mergeable buffers one), but should be added
in for future raw socket support.

> 
> >        /* OK, now we need to know about added descriptors. */
> > -      if (head == vq->num) {
> > -         if (unlikely(vhost_enable_notify(vq))) {
> > +      if (!headcount) {
> > +         if (retries == 0 && unlikely(vhost_enable_notify(vq))) {
> >              /* They have slipped one in as we were
> >               * doing that: check again. */
> >              vhost_disable_notify(vq);
> > +            retries++;
> >              continue;
> >           }
> 
> Hmm. The reason we have the code at all, as the comment says, is because
> guest could have added more buffers between the time we read last index
> and the time we enabled notification. So if we just break like this
> the race still exists. We could remember the
> last head value we observed, and have vhost_enable_notify check
> against this value?

        This is to prevent a spin loop in the case where we have some
buffers available, but not enough for the current packet (ie, this
is the replacement code for the "rxmaxheadcount" business). If they
actually added something new, retrying once should see it, but what
vhost_enable_notify() returns non-zero on is not "new buffers" but
rather "not empty". In the case mentioned, we aren't empty, so
vhost_enable_notify() returns nonzero every time, but the guest hasn't
given us enough buffers to proceed, so we continuously retry; this
code breaks the spin loop until we've really got new buffers from
the guest.

> 
> Need to think about it.
> 
> Another concern here is that on retries vhost_get_desc_n
> is doing extra work, rescanning the same descriptor
> again and again. Not sure how common this is, might be
> worthwhile to add a TODO to consider this at least.

        I had a printk in there to test the code and with the
retries counter, it happens when we fill the ring (once,
because of the retries checks), and then proceeds as
desired when the guest gives us more buffers. Without the
check, it spews until we hear from the guest.

> > @@ -261,14 +275,33 @@ static void handle_rx(struct vhost_net *
> >                  len, MSG_DONTWAIT | MSG_TRUNC);
> >        /* TODO: Check specific error and bomb out unless EAGAIN? */
> >        if (err < 0) {
> 
> I think we need to compare err and datalen and drop packet on mismatch
> as well.
> The check err > len won't be needed then.

        OK, but this is the original code, unchanged by my patch. :-)
> 
> > -         vhost_discard_vq_desc(vq);
> > +         vhost_discard_desc(vq, headcount);
> >           break;
> >        }
> >        /* TODO: Should check and handle checksum. */
> > +      if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF)) {
> > +         struct virtio_net_hdr_mrg_rxbuf *vhdr =
> > +            (struct virtio_net_hdr_mrg_rxbuf *)
> > +            vq->iov[0].iov_base;
> > +         /* add num_buffers */
> > +         if (vhost_has_feature(&net->dev,
> > +                     VHOST_NET_F_VIRTIO_NET_HDR))
> > +            hdr.num_buffers = headcount;
> 
> Why is the above necessary?

        This adds mergeable buffer support for the raw case.
> 
> > +         else if (vq->iov[0].iov_len < sizeof(*vhdr)) {
> 
> I think length check is already done when we copy the header. No?

        This sets num_buffers for the tap case (where hdr_size is 0).
We don't copy any headers at all for this case, but we need to
add num_buffers at offset 10 in the user buffer already read by
recvmsg().

> 
> > +            vq_err(vq, "tiny buffers < %d unsupported",
> > +               vq->iov[0].iov_len);
> > +            vhost_discard_desc(vq, headcount);
> > +            break;
> 
> Problem here is that recvmsg might modify iov.
> This is why I think we need to use vq->hdr here.

        recvmsg() can never modify the iovec; it's the
standard socket call. To use vq->hdr, we'd have to copy
it in from user space, modify it, and then copy it back
in to user space, on every packet. In the normal tap case,
hdr_size is 0 and we read it directly from the socket to
user space. It is already correct for mergeable buffers,
except for the num_buffers value added here.

> 
> > +         } else if (put_user(headcount, &vhdr->num_buffers)) {
> 
> The above put_user writes out a 32 bit value, right?
> This seems wrong.

        I'll recheck this -- I'll make the type of "headcount" be
the type of "num_buffers" in the MRXB vnet header, if it isn't
already.

> 
> How about using
>    memcpy_toiovecend(vq->hdr, &headcount,
>            offsetof(struct virtio_net_hdr_mrg_rxbuf, num_buffers),
>            sizeof headcount);
> 
> this way we also do not make any assumptions about layout.

        Because this overwrites the valid vnet header we got from
the tap socket with a local copy that isn't correct. For this to
work, we first need to copy_from_user() the vnet header from the
socket into vq->hdr (on every packet).
        It doesn't assume anything here-- it requires (and checks)
that the user didn't give us a <12 byte buffer, which I think is
reasonable. That's about the size of the descriptor for the buffer,
and I'd call that a broken guest. The cost of supporting those
tiny buffers in the general case would be a copy_from_user() and
copy_to_user() of the vnet_hdr on every packet, which I think is
not worth it (and especially since I don't expect any guest is
ever going to give us a <12 byte buffer in the first place).


> > @@ -560,9 +593,14 @@ done:
> > 
> >  static int vhost_net_set_features(struct vhost_net *n, u64 features)
> >  {
> > -   size_t hdr_size = features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ?
> > -      sizeof(struct virtio_net_hdr) : 0;
> > +   size_t hdr_size = 0;
> >     int i;
> > +
> > +   if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)) {
> > +      hdr_size = sizeof(struct virtio_net_hdr);
> > +      if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
> > +         hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> 
> My personal style for this would be:
>      if (!(features & (1 << VHOST_NET_F_VIRTIO_NET_HDR)))
>       hdr_size = 0
>    else if (!(features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
>       hdr_size = sizeof(virtio_net_hdr);
>    else
>       hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> 
> which results in more symmetry and less nesting.

OK.

> 
> >     mutex_lock(&n->dev.mutex);
> >     if ((features & (1 << VHOST_F_LOG_ALL)) &&
> >         !vhost_log_access_ok(&n->dev)) {
> > diff -ruNp net-next-p0/drivers/vhost/vhost.c
> > net-next-v3/drivers/vhost/vhost.c
> > --- net-next-p0/drivers/vhost/vhost.c   2010-03-22 12:04:38.000000000
> > -0700
> > +++ net-next-v3/drivers/vhost/vhost.c   2010-04-06 12:57:51.000000000
> > -0700
> > @@ -856,6 +856,47 @@ static unsigned get_indirect(struct vhos
> >     return 0;
> >  }
> > 
> > +/* This is a multi-buffer version of vhost_get_vq_desc
> > + * @vq      - the relevant virtqueue
> > + * datalen   - data length we'll be reading
> > + * @iovcount   - returned count of io vectors we fill
> > + * @log      - vhost log
> > + * @log_num   - log offset
> > + *   returns number of buffer heads allocated, 0 on error
> 
> This is unusual. Let's return a negative error code on error.

        This is like malloc -- "0" is never a valid return value, and
we don't have actual error values to return and handle; they generate
vq_err() messages within the code itself. In all cases,
it is the count of buffers we allocated (which is 0 when we fail to
allocate). So, I don't agree with this, but can do it if you insist.

> 
> > + */
> > +int vhost_get_desc_n(struct vhost_virtqueue *vq, struct 
vring_used_elem
> > *heads,
> > +           int datalen, int *iovcount, struct vhost_log *log,
> > +           unsigned int *log_num)
> > +{
> > +   int out, in;
> > +   int seg = 0;      /* iov index */
> > +   int hc = 0;      /* head count */
> > +
> > +   while (datalen > 0) {
> > +      if (hc >= VHOST_NET_MAX_SG)
> > +         goto err;
> > +      heads[hc].id = vhost_get_desc(vq->dev, vq, vq->iov+seg,
> > +                     ARRAY_SIZE(vq->iov)-seg, &out,
> > +                     &in, log, log_num);
> > +      if (heads[hc].id == vq->num)
> > +         goto err;
> > +      if (out || in <= 0) {
> > +         vq_err(vq, "unexpected descriptor format for RX: "
> > +            "out %d, in %d\n", out, in);
> > +         goto err;
> > +      }
> > +      heads[hc].len = iov_length(vq->iov+seg, in);
> > +      datalen -= heads[hc].len;
> 
> This signed/unsigned mix makes me nervous.
> Let's make datalen unsigned, add unsigned total_len, and
> while (datalen < total_len).
> 
> > +      hc++;
> > +      seg += in;
> > +   }
> > +   *iovcount = seg;
> > +   return hc;
> > +err:
> > +   vhost_discard_desc(vq, hc);
> > +   return 0;
> > +}
> > +
> >  /* This looks in the virtqueue and for the first available buffer, 
and
> > converts
> >   * it to an iovec for convenient access.  Since descriptors consist 
of
> > some
> >   * number of output then some number of input descriptors, it's
> > actually two
> > @@ -863,7 +904,7 @@ static unsigned get_indirect(struct vhos
> >   *
> >   * This function returns the descriptor number found, or vq->num 
(which
> >   * is never a valid descriptor number) if none was found. */
> > -unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct
> > vhost_virtqueue *vq,
> > +unsigned vhost_get_desc(struct vhost_dev *dev, struct vhost_virtqueue
> > *vq,
> >              struct iovec iov[], unsigned int iov_size,
> >              unsigned int *out_num, unsigned int *in_num,
> >              struct vhost_log *log, unsigned int *log_num)
> > @@ -981,31 +1022,42 @@ unsigned vhost_get_vq_desc(struct vhost_
> >  }
> > 
> >  /* Reverse the effect of vhost_get_vq_desc. Useful for error 
handling.
> > */
> > -void vhost_discard_vq_desc(struct vhost_virtqueue *vq)
> > +void vhost_discard_desc(struct vhost_virtqueue *vq, int n)
> >  {
> > -   vq->last_avail_idx--;
> > +   vq->last_avail_idx -= n;
> >  }
> > 
> >  /* After we've used one of their buffers, we tell them about it. 
We'll
> > then
> >   * want to notify the guest, using eventfd. */
> > -int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int
> > len)
> > +int vhost_add_used(struct vhost_virtqueue *vq, struct vring_used_elem
> > *heads,
> > +         int count)
> 
> I think we are better off with vhost_add_used and vhost_add_used_n:
> the version with _n has a lot of extra complexity, and tx always
> adds them 1 by one.

        The external code only calls vhost_add_used_and_signal(), so I
split that. I can add a single variant of vhost_add_used() too, but
I was trying to avoid adding duplicate code and external interface.
I don't actually agree with splitting these; isn't it better to have
them going through the same code path whether it's single or multiple?
A bug in one would show up as an intermittent failure, and a change
to one requires changing both. I don't think the multiple should just
do the single repeatedly, either, since the multiple variant now can
do the work in one or two copy_to_user()'s, without a loop; so splitting
these just makes a special case for single to carry in code maintenance
with no benefit, I think.

> 
> >  {
> >     struct vring_used_elem *used;
> > +   int start, n;
> > +
> > +   if (count <= 0)
> > +      return -EINVAL;
> > 
> > -   /* The virtqueue contains a ring of used buffers.  Get a pointer 
to
> > the
> > -    * next entry in that used ring. */
> > -   used = &vq->used->ring[vq->last_used_idx % vq->num];
> > -   if (put_user(head, &used->id)) {
> > -      vq_err(vq, "Failed to write used id");
> > +   start = vq->last_used_idx % vq->num;
> > +   if (vq->num - start < count)
> > +      n = vq->num - start;
> > +   else
> > +      n = count;
> 
> use min?

        Sure.

> 
> > +   used = vq->used->ring + start;
> > +   if (copy_to_user(used, heads, sizeof(heads[0])*n)) {
> > +      vq_err(vq, "Failed to write used");
> >        return -EFAULT;
> >     }
> > -   if (put_user(len, &used->len)) {
> > -      vq_err(vq, "Failed to write used len");
> > -      return -EFAULT;
> > +   if (n < count) {   /* wrapped the ring */
> > +      used = vq->used->ring;
> > +      if (copy_to_user(used, heads+n, sizeof(heads[0])*(count-n))) {
> > +         vq_err(vq, "Failed to write used");
> > +         return -EFAULT;
> > +      }
> >     }
> >     /* Make sure buffer is written before we update index. */
> >     smp_wmb();
> > -   if (put_user(vq->last_used_idx + 1, &vq->used->idx)) {
> > +   if (put_user(vq->last_used_idx+count, &vq->used->idx)) {
> 
> I am a bit confused ... will this write a 32 or 16 bit value?
> count is 32 bit ... Maybe we are better off with
>   u16 idx = vq->last_used_idx + count
>   put_user(idx, &vq->used->idx)
>   vq->last_used_idx = idx

        How about a cast?:

        if (put_user((u16)(vq->last_used_idx+count), &vq->used->idx)) {


                                        +-DLS


^ permalink raw reply

* Re: linux-next: Tree for April 7 (net/core/dev_addr_lists)
From: Randy Dunlap @ 2010-04-07 16:58 UTC (permalink / raw)
  To: Stephen Rothwell, netdev; +Cc: linux-next, LKML
In-Reply-To: <20100407174123.81b32310.sfr@canb.auug.org.au>

On Wed, 7 Apr 2010 17:41:23 +1000 Stephen Rothwell wrote:

> Hi all,
> 
> Changes since 20100401:


when CONFIG_PROC_FS is disabled:

linux-next-20100407/net/core/dev_addr_lists.c: In function 'dev_mc_net_init':
linux-next-20100407/net/core/dev_addr_lists.c:722: error: implicit declaration of function 'proc_net_fops_create'
linux-next-20100407/net/core/dev_addr_lists.c:722: error: 'dev_mc_seq_fops' undeclared (first use in this function)
linux-next-20100407/net/core/dev_addr_lists.c:722: error: (Each undeclared identifier is reported only once
linux-next-20100407/net/core/dev_addr_lists.c:722: error: for each function it appears in.)
linux-next-20100407/net/core/dev_addr_lists.c: In function 'dev_mc_net_exit':
linux-next-20100407/net/core/dev_addr_lists.c:729: error: implicit declaration of function 'proc_net_remove'
make[3]: *** [net/core/dev_addr_lists.o] Error 1

---
~Randy

^ permalink raw reply

* Re: [RFC] change dst_entry padding from using an array to using __attribute__
From: Eric Dumazet @ 2010-04-07 16:56 UTC (permalink / raw)
  To: Laurent Chavey; +Cc: netdev
In-Reply-To: <1270659329.8141.40.camel@edumazet-laptop>

Le mercredi 07 avril 2010 à 18:55 +0200, Eric Dumazet a écrit :

> We don't want a huge hole (say.. 120 bytes) being not noticed.

sorry, 60 bytes of course.



^ permalink raw reply

* Re: [RFC] change dst_entry padding from using an array to using __attribute__
From: Eric Dumazet @ 2010-04-07 16:55 UTC (permalink / raw)
  To: Laurent Chavey; +Cc: netdev
In-Reply-To: <t2o97949e3e1004070946mfacef7c0la6bddc38d29720ae@mail.gmail.com>

Le mercredi 07 avril 2010 à 09:46 -0700, Laurent Chavey a écrit :
> what are the benefit(s)  of using an array to force a struct
> element to be aligned on 64 bytes / 32 bytes boundaries
> versus using gcc __attribute__.
> 
> 
> ------------------------------------
> 
> struct dst_entry {
> 
>        /*
>         * Align __refcnt to a 64 bytes alignment
>         * (L1_CACHE_SIZE would be too much)
>         */
> - #ifdef CONFIG_64BIT
> -       long                    __pad_to_align_refcnt[1];
> - #else
> -       long                    __pad_to_align_refcnt[0];
> - #endif
> +       atomic_t                __refcnt __attribute__
> ((aligned(64))); /* client references    */
> #undef __padding__
>        /*
>         * __refcnt wants to be on a different cache line from
>         * input/output/ops or performance tanks badly
>         */
>        atomic_t                __refcnt;       /* client references    */
> };

We don't want a huge hole (say.. 120 bytes) being not noticed.

Some machines around have 4 millions dst entries.

This deserves us not being lazy :)

Thanks



^ permalink raw reply

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Pekka Enberg @ 2010-04-07 16:52 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Zhang, Yanmin, Eric Dumazet, netdev, Tejun Heo, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <4BBCB7B7.4040901@cs.helsinki.fi>

Pekka Enberg wrote:
> Christoph Lameter wrote:
>> I wonder if this is not related to the kmem_cache_cpu structure 
>> straddling
>> cache line boundaries under some conditions. On 2.6.33 the kmem_cache_cpu
>> structure was larger and therefore tight packing resulted in different
>> alignment.
>>
>> Could you see how the following patch affects the results. It attempts to
>> increase the size of kmem_cache_cpu to a power of 2 bytes. There is also
>> the potential that other per cpu fetches to neighboring objects affect 
>> the
>> situation. We could cacheline align the whole thing.
>>
>> ---
>>  include/linux/slub_def.h |    5 +++++
>>  1 file changed, 5 insertions(+)
>>
>> Index: linux-2.6/include/linux/slub_def.h
>> ===================================================================
>> --- linux-2.6.orig/include/linux/slub_def.h    2010-04-07 
>> 11:33:50.000000000 -0500
>> +++ linux-2.6/include/linux/slub_def.h    2010-04-07 
>> 11:35:18.000000000 -0500
>> @@ -38,6 +38,11 @@ struct kmem_cache_cpu {
>>      void **freelist;    /* Pointer to first free per cpu object */
>>      struct page *page;    /* The slab from which we are allocating */
>>      int node;        /* The node of the page (or -1 for debug) */
>> +#ifndef CONFIG_64BIT
>> +    int dummy1;
>> +#endif
>> +    unsigned long dummy2;
>> +
>>  #ifdef CONFIG_SLUB_STATS
>>      unsigned stat[NR_SLUB_STAT_ITEMS];
>>  #endif
> 
> Would __cacheline_aligned_in_smp do the trick here?

Oh, sorry, I think it's actually '____cacheline_aligned_in_smp' (with 
four underscores) for per-cpu data. Confusing...

^ permalink raw reply

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Pekka Enberg @ 2010-04-07 16:49 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Zhang, Yanmin, Eric Dumazet, netdev, Tejun Heo, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <alpine.DEB.2.00.1004071130260.13261@router.home>

Christoph Lameter wrote:
> I wonder if this is not related to the kmem_cache_cpu structure straddling
> cache line boundaries under some conditions. On 2.6.33 the kmem_cache_cpu
> structure was larger and therefore tight packing resulted in different
> alignment.
> 
> Could you see how the following patch affects the results. It attempts to
> increase the size of kmem_cache_cpu to a power of 2 bytes. There is also
> the potential that other per cpu fetches to neighboring objects affect the
> situation. We could cacheline align the whole thing.
> 
> ---
>  include/linux/slub_def.h |    5 +++++
>  1 file changed, 5 insertions(+)
> 
> Index: linux-2.6/include/linux/slub_def.h
> ===================================================================
> --- linux-2.6.orig/include/linux/slub_def.h	2010-04-07 11:33:50.000000000 -0500
> +++ linux-2.6/include/linux/slub_def.h	2010-04-07 11:35:18.000000000 -0500
> @@ -38,6 +38,11 @@ struct kmem_cache_cpu {
>  	void **freelist;	/* Pointer to first free per cpu object */
>  	struct page *page;	/* The slab from which we are allocating */
>  	int node;		/* The node of the page (or -1 for debug) */
> +#ifndef CONFIG_64BIT
> +	int dummy1;
> +#endif
> +	unsigned long dummy2;
> +
>  #ifdef CONFIG_SLUB_STATS
>  	unsigned stat[NR_SLUB_STAT_ITEMS];
>  #endif

Would __cacheline_aligned_in_smp do the trick here?

^ permalink raw reply

* [RFC] change dst_entry padding from using an array to using __attribute__
From: Laurent Chavey @ 2010-04-07 16:46 UTC (permalink / raw)
  To: netdev; +Cc: laurent chavey

what are the benefit(s)  of using an array to force a struct
element to be aligned on 64 bytes / 32 bytes boundaries
versus using gcc __attribute__.


------------------------------------

struct dst_entry {

       /*
        * Align __refcnt to a 64 bytes alignment
        * (L1_CACHE_SIZE would be too much)
        */
- #ifdef CONFIG_64BIT
-       long                    __pad_to_align_refcnt[1];
- #else
-       long                    __pad_to_align_refcnt[0];
- #endif
+       atomic_t                __refcnt __attribute__
((aligned(64))); /* client references    */
#undef __padding__
       /*
        * __refcnt wants to be on a different cache line from
        * input/output/ops or performance tanks badly
        */
       atomic_t                __refcnt;       /* client references    */
};

^ permalink raw reply

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Christoph Lameter @ 2010-04-07 16:43 UTC (permalink / raw)
  To: Zhang, Yanmin
  Cc: Eric Dumazet, netdev, Tejun Heo, Pekka Enberg, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <1270607668.2078.259.camel@ymzhang.sh.intel.com>

On Wed, 7 Apr 2010, Zhang, Yanmin wrote:

> I collected retired instruction, dtlb miss and LLC miss.
> Below is data of LLC miss.
>
> Kernel 2.6.33:
>     20.94%        hackbench  [kernel.kallsyms]                                       [k] copy_user_generic_string
>     14.56%        hackbench  [kernel.kallsyms]                                       [k] unix_stream_recvmsg
>     12.88%        hackbench  [kernel.kallsyms]                                       [k] kfree
>      7.37%        hackbench  [kernel.kallsyms]                                       [k] kmem_cache_free
>      7.18%        hackbench  [kernel.kallsyms]                                       [k] kmem_cache_alloc_node
>      6.78%        hackbench  [kernel.kallsyms]                                       [k] kfree_skb
>      6.27%        hackbench  [kernel.kallsyms]                                       [k] __kmalloc_node_track_caller
>      2.73%        hackbench  [kernel.kallsyms]                                       [k] __slab_free
>      2.21%        hackbench  [kernel.kallsyms]                                       [k] get_partial_node
>      2.01%        hackbench  [kernel.kallsyms]                                       [k] _raw_spin_lock
>      1.59%        hackbench  [kernel.kallsyms]                                       [k] schedule
>      1.27%        hackbench  hackbench                                               [.] receiver
>      0.99%        hackbench  libpthread-2.9.so                                       [.] __read
>      0.87%        hackbench  [kernel.kallsyms]                                       [k] unix_stream_sendmsg
>
> Kernel 2.6.34-rc3:
>     18.55%        hackbench  [kernel.kallsyms]                                                     [k] copy_user_generic_str
> ing
>     13.19%        hackbench  [kernel.kallsyms]                                                     [k] unix_stream_recvmsg
>     11.62%        hackbench  [kernel.kallsyms]                                                     [k] kfree
>      8.54%        hackbench  [kernel.kallsyms]                                                     [k] kmem_cache_free
>      7.88%        hackbench  [kernel.kallsyms]                                                     [k] __kmalloc_node_track_
> caller

Seems that the overhead of __kmalloc_node_track_caller was increased. The
function inlines slab_alloc().

>      6.54%        hackbench  [kernel.kallsyms]                                                     [k] kmem_cache_alloc_node
>      5.94%        hackbench  [kernel.kallsyms]                                                     [k] kfree_skb
>      3.48%        hackbench  [kernel.kallsyms]                                                     [k] __slab_free
>      2.15%        hackbench  [kernel.kallsyms]                                                     [k] _raw_spin_lock
>      1.83%        hackbench  [kernel.kallsyms]                                                     [k] schedule
>      1.82%        hackbench  [kernel.kallsyms]                                                     [k] get_partial_node
>      1.59%        hackbench  hackbench                                                             [.] receiver
>      1.37%        hackbench  libpthread-2.9.so                                                     [.] __read

I wonder if this is not related to the kmem_cache_cpu structure straddling
cache line boundaries under some conditions. On 2.6.33 the kmem_cache_cpu
structure was larger and therefore tight packing resulted in different
alignment.

Could you see how the following patch affects the results? It attempts to
increase the size of kmem_cache_cpu to a power of 2 bytes. There is also
the potential that other per cpu fetches to neighboring objects affect the
situation. We could cacheline align the whole thing.

---
 include/linux/slub_def.h |    5 +++++
 1 file changed, 5 insertions(+)

Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h	2010-04-07 11:33:50.000000000 -0500
+++ linux-2.6/include/linux/slub_def.h	2010-04-07 11:35:18.000000000 -0500
@@ -38,6 +38,11 @@ struct kmem_cache_cpu {
 	void **freelist;	/* Pointer to first free per cpu object */
 	struct page *page;	/* The slab from which we are allocating */
 	int node;		/* The node of the page (or -1 for debug) */
+#ifndef CONFIG_64BIT
+	int dummy1;
+#endif
+	unsigned long dummy2;
+
 #ifdef CONFIG_SLUB_STATS
 	unsigned stat[NR_SLUB_STAT_ITEMS];
 #endif

^ permalink raw reply

* Re: hackbench regression due to commit 9dfc6e68bfe6e
From: Christoph Lameter @ 2010-04-07 16:30 UTC (permalink / raw)
  To: Zhang, Yanmin
  Cc: Eric Dumazet, netdev, Tejun Heo, Pekka Enberg, alex.shi,
	linux-kernel@vger.kernel.org, Ma, Ling, Chen, Tim C,
	Andrew Morton
In-Reply-To: <1270607668.2078.259.camel@ymzhang.sh.intel.com>

On Wed, 7 Apr 2010, Zhang, Yanmin wrote:

> > booting with slub_min_order=3 do change hackbench results for example ;)
> By default, slub_min_order=3 on my Nehalem machines. I also tried different
> larger slub_min_order and didn't find help.

Let's stop fiddling with kernel command line parameters for these tests.
Leave as default. That is how I tested.

^ permalink raw reply

* Re: [Bug 15703] New: Getting "MD5 Hash failed for" for fragmented IP packets.
From: Stephen Hemminger @ 2010-04-07 16:28 UTC (permalink / raw)
  To: bugzilla-daemon, vishal.swarnkar; +Cc: netdev
In-Reply-To: <bug-15703-100@https.bugzilla.kernel.org/>

On Tue, 6 Apr 2010 09:52:46 GMT
bugzilla-daemon@bugzilla.kernel.org wrote:

> https://bugzilla.kernel.org/show_bug.cgi?id=15703
> 
>            Summary: Getting "MD5 Hash failed for" for fragmented IP
>                     packets.
>            Product: Networking
>            Version: 2.5
>     Kernel Version: 2.6.31-14-generic
>           Platform: All
>         OS/Version: Linux
>               Tree: Mainline
>             Status: NEW
>           Severity: normal
>           Priority: P1
>          Component: IPV4
>         AssignedTo: shemminger@linux-foundation.org
>         ReportedBy: vishal.swarnkar@gmail.com
>                 CC: vishal.swarnkar@gmail.com
>         Regression: No
> 
> 
> I am using Ubuntu 9.10 with a kernel 2.6.31-14-generic.
> 
> At my linux box I am receiving IP packets with TCP as payload with TCP-MD5
> option as its a BGP update.
> 
> If I am getting fragmented IP packets on my Linux box then I am receiving only
> one fragment of the IP packet and I don't receive the second fragments because
> of some settings at my firewall ( I discard small packets, and the second
> fragment has only 20 bytes of data, so I consider it as small packet and always
> discard it).
> 
> I have ensured that I am not receiving the second fragment of the packet all
> the time ( in all further retransmission ) using sniffers.
> 
> Now for a IP fragmented packet I keep on getting messages 
> "MD5 hash failed for IP(src)--> IP(Dst)". I receive this message for all
> fragmented packets. The message is in tcp_ipv4.c - >tcp_v4_inbound_md5_hash
> method.
> 
> I hope I should not be getting these message because the fragmented packets
> should not be pushed to the upper layer( TCP) for further sanity checks(
> MD5-check sum verification), until all fragments are assembled together.
> 
> I tried to look at the ip_rcv code and found that ip_local_deliver(struct
> sk_buff* skb) is calling ip_defrag() function to check if the fragmentation
> task is still in progress or not.
> 
> I tried to dig down more into ip_defrag function and the return values from
> ip_defrag function and I hope that the check in the ip_local_deliver function
> is not correct.
> 
> int ip_local_deliver(struct sk_buff *skb)
> {
>     /*
>      *    Reassemble IP fragments.
>      */
> 
>     if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
>         if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
>             return 0;
>     }
> 
>     return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
>                ip_local_deliver_finish);
> }
> 
> 
> I think the check for ip_defrag should be like this 
> ------>>>>if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER) < 0) 
> 
> we should check the negative return to avoid ip_local_deliver_finish, instead
> of checking the null return.

Maybe path MTU discovery can help with this.

^ permalink raw reply

* [PATCH] xtables: make XT_ALIGN() usable in exported headers by exporting __ALIGN_KERNEL()
From: Alexey Dobriyan @ 2010-04-07 16:22 UTC (permalink / raw)
  To: kaber; +Cc: linux-kernel, netdev, shemminger, bhutchings, andreas, hadi,
	hideaki

XT_ALIGN() was rewritten through ALIGN() by commit 42107f5009da223daa800d6da6904d77297ae829
"netfilter: xtables: symmetric COMPAT_XT_ALIGN definition".
ALIGN() is not exported in userspace headers, which created a compile problem for tc(8)
and will create a problem for iptables(8).

We can't export the generic-looking name ALIGN(), but we can export the less generic
__ALIGN_KERNEL() (suggested by Ben Hutchings).
Google knows nothing about __ALIGN_KERNEL().

COMPAT_XT_ALIGN() changed for symmetry.

Reported-by: Andreas Henriksson <andreas@fatal.se>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---

 include/linux/kernel.h             |    5 +++--
 include/linux/netfilter/x_tables.h |    6 +++---
 2 files changed, 6 insertions(+), 5 deletions(-)

--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -4,6 +4,8 @@
 /*
  * 'kernel.h' contains some often-used function prototypes etc
  */
+#define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
+#define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))
 
 #ifdef __KERNEL__
 
@@ -37,8 +39,7 @@ extern const char linux_proc_banner[];
 
 #define STACK_MAGIC	0xdeadbeef
 
-#define ALIGN(x,a)		__ALIGN_MASK(x,(typeof(x))(a)-1)
-#define __ALIGN_MASK(x,mask)	(((x)+(mask))&~(mask))
+#define ALIGN(x, a)		__ALIGN_KERNEL((x), (a))
 #define PTR_ALIGN(p, a)		((typeof(p))ALIGN((unsigned long)(p), (a)))
 #define IS_ALIGNED(x, a)		(((x) & ((typeof(x))(a) - 1)) == 0)
 
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -1,6 +1,6 @@
 #ifndef _X_TABLES_H
 #define _X_TABLES_H
-
+#include <linux/kernel.h>
 #include <linux/types.h>
 
 #define XT_FUNCTION_MAXNAMELEN 30
@@ -93,7 +93,7 @@ struct _xt_align {
 	__u64 u64;
 };
 
-#define XT_ALIGN(s) ALIGN((s), __alignof__(struct _xt_align))
+#define XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _xt_align))
 
 /* Standard return verdict, or do jump. */
 #define XT_STANDARD_TARGET ""
@@ -598,7 +598,7 @@ struct _compat_xt_align {
 	compat_u64 u64;
 };
 
-#define COMPAT_XT_ALIGN(s) ALIGN((s), __alignof__(struct _compat_xt_align))
+#define COMPAT_XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _compat_xt_align))
 
 extern void xt_compat_lock(u_int8_t af);
 extern void xt_compat_unlock(u_int8_t af);

^ permalink raw reply

* Re: [RFC PATCH net-next 2/4] gianfar: Added timer feature for eTSEC
From: Scott Wood @ 2010-04-07 16:21 UTC (permalink / raw)
  To: Manfred Rudigier
  Cc: 'sandeep.kumar@freescale.com',
	'netdev@vger.kernel.org',
	'linuxppc-dev@lists.ozlabs.org'
In-Reply-To: <95DC1AA8EC908B48939B72CF375AA5E311A9F4E0@alice.at.omicron.at>

On Wed, Apr 07, 2010 at 11:46:24AM +0200, Manfred Rudigier wrote:
>  	switch (config.tx_type) {
>  	case HWTSTAMP_TX_OFF:
> +		priv->hwts_tx_en = 0;
>  		break;
>  	case HWTSTAMP_TX_ON:
> +		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
> +			return -ERANGE;
> +		priv->hwts_tx_en = 1;
>  		return -ERANGE;
>  	default:
>  		return -ERANGE;
> @@ -796,8 +802,12 @@ static int gfar_hwtstamp_ioctl(struct net_device *netdev,
>  
>  	switch (config.rx_filter) {
>  	case HWTSTAMP_FILTER_NONE:
> +		priv->hwts_rx_en = 0;
>  		break;
>  	default:
> +		if (!(priv->device_flags & FSL_GIANFAR_DEV_HAS_TIMER))
> +			return -ERANGE;
> +		priv->hwts_rx_en = 1;
>  		return -ERANGE;
>  	}

You need to acquire bflock (or avoid using bitfields for this), or you could
race with the setting of wol_en or rx_csum_enable.

-Scott

^ permalink raw reply

* Re: [PATCH] IPVS: replace sprintf to snprintf to avoid stack buffer overflow
From: Patrick McHardy @ 2010-04-07 16:09 UTC (permalink / raw)
  To: Simon Horman
  Cc: wzt.wzt, linux-kernel, Wensong Zhang, Julian Anastasov, netdev,
	lvs-devel
In-Reply-To: <20100406032248.GF30359@verge.net.au>

[-- Attachment #1: Type: text/plain, Size: 968 bytes --]

Simon Horman wrote:
> On Tue, Apr 06, 2010 at 10:50:20AM +0800, wzt.wzt@gmail.com wrote:
>> IPVS not check the length of pp->name, use sprintf will cause stack buffer overflow.
>> struct ip_vs_protocol{} declare name as char *, if register a protocol as:
>> struct ip_vs_protocol ip_vs_test = {
>>         .name =			"aaaaaaaa....128...aaa",
>> 	.debug_packet =         ip_vs_tcpudp_debug_packet,
>> };
>>
>> when called ip_vs_tcpudp_debug_packet(), sprintf(buf, "%s TRUNCATED", pp->name); 
>> will cause stack buffer overflow.
>>
>> Signed-off-by: Zhitong Wang <zhitong.wangzt@alibaba-inc.com>
> 
> I think that the simple answer is, don't do that.

Indeed.

> But your patch seems entirely reasonable to me.
> 
> Acked-by: Simon Horman <horms@verge.net.au>
> 
> Patrick, please consider merging this.

I think this fix is a bit silly, we can simply print the name in
the pr_debug() statement and avoid both the potential overflow
and truncation.

How does this look?

[-- Attachment #2: x --]
[-- Type: text/plain, Size: 3385 bytes --]

diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 0e58455..27add97 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -166,26 +166,24 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp,
 
 	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
 	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
+		sprintf(buf, "TRUNCATED");
 	else if (ih->frag_off & htons(IP_OFFSET))
-		sprintf(buf, "%s %pI4->%pI4 frag",
-			pp->name, &ih->saddr, &ih->daddr);
+		sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr);
 	else {
 		__be16 _ports[2], *pptr
 ;
 		pptr = skb_header_pointer(skb, offset + ih->ihl*4,
 					  sizeof(_ports), _ports);
 		if (pptr == NULL)
-			sprintf(buf, "%s TRUNCATED %pI4->%pI4",
-				pp->name, &ih->saddr, &ih->daddr);
+			sprintf(buf, "TRUNCATED %pI4->%pI4",
+				&ih->saddr, &ih->daddr);
 		else
-			sprintf(buf, "%s %pI4:%u->%pI4:%u",
-				pp->name,
+			sprintf(buf, "%pI4:%u->%pI4:%u",
 				&ih->saddr, ntohs(pptr[0]),
 				&ih->daddr, ntohs(pptr[1]));
 	}
 
-	pr_debug("%s: %s\n", msg, buf);
+	pr_debug("%s: %s %s\n", msg, pp->name, buf);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -200,26 +198,24 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp,
 
 	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
 	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
+		sprintf(buf, "TRUNCATED");
 	else if (ih->nexthdr == IPPROTO_FRAGMENT)
-		sprintf(buf, "%s %pI6->%pI6 frag",
-			pp->name, &ih->saddr, &ih->daddr);
+		sprintf(buf, "%pI6->%pI6 frag",	&ih->saddr, &ih->daddr);
 	else {
 		__be16 _ports[2], *pptr;
 
 		pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr),
 					  sizeof(_ports), _ports);
 		if (pptr == NULL)
-			sprintf(buf, "%s TRUNCATED %pI6->%pI6",
-				pp->name, &ih->saddr, &ih->daddr);
+			sprintf(buf, "TRUNCATED %pI6->%pI6",
+				&ih->saddr, &ih->daddr);
 		else
-			sprintf(buf, "%s %pI6:%u->%pI6:%u",
-				pp->name,
+			sprintf(buf, "%pI6:%u->%pI6:%u",
 				&ih->saddr, ntohs(pptr[0]),
 				&ih->daddr, ntohs(pptr[1]));
 	}
 
-	pr_debug("%s: %s\n", msg, buf);
+	pr_debug("%s: %s %s\n", msg, pp->name, buf);
 }
 #endif
 
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index c30b43c..1892dfc 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -136,12 +136,11 @@ ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb,
 
 	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
 	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
+		sprintf(buf, "TRUNCATED");
 	else
-		sprintf(buf, "%s %pI4->%pI4",
-			pp->name, &ih->saddr, &ih->daddr);
+		sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr);
 
-	pr_debug("%s: %s\n", msg, buf);
+	pr_debug("%s: %s %s\n", msg, pp->name, buf);
 }
 
 #ifdef CONFIG_IP_VS_IPV6
@@ -154,12 +153,11 @@ ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb,
 
 	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
 	if (ih == NULL)
-		sprintf(buf, "%s TRUNCATED", pp->name);
+		sprintf(buf, "TRUNCATED");
 	else
-		sprintf(buf, "%s %pI6->%pI6",
-			pp->name, &ih->saddr, &ih->daddr);
+		sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr);
 
-	pr_debug("%s: %s\n", msg, buf);
+	pr_debug("%s: %s %s\n", msg, pp->name, buf);
 }
 #endif
 

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox