Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFT PATCH] net: remove legacy ethtool ops
From: Michał Mirosław @ 2011-05-07 11:48 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Patrick McHardy, Ben Hutchings, Jeff Kirsher,
	e1000-devel

Now that all drivers have been converted, remove the handling of the
discrete offload-setting ethtool callbacks.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---

Note: This needs to wait for Intel guys to finish conversion of their
LAN drivers.

 include/linux/ethtool.h   |   52 ------
 include/linux/netdevice.h |   16 --
 net/8021q/vlan_dev.c      |    2 +-
 net/core/dev.c            |   13 +-
 net/core/ethtool.c        |  399 +++------------------------------------------
 5 files changed, 28 insertions(+), 454 deletions(-)

diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index 4194a20..2ef53fa 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -691,9 +691,6 @@ enum ethtool_sfeatures_retval_bits {
 
 #include <linux/rculist.h>
 
-/* needed by dev_disable_lro() */
-extern int __ethtool_set_flags(struct net_device *dev, u32 flags);
-
 struct ethtool_rx_ntuple_flow_spec_container {
 	struct ethtool_rx_ntuple_flow_spec fs;
 	struct list_head list;
@@ -726,18 +723,6 @@ struct net_device;
 
 /* Some generic methods drivers may use in their ethtool_ops */
 u32 ethtool_op_get_link(struct net_device *dev);
-u32 ethtool_op_get_tx_csum(struct net_device *dev);
-int ethtool_op_set_tx_csum(struct net_device *dev, u32 data);
-int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data);
-int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data);
-u32 ethtool_op_get_sg(struct net_device *dev);
-int ethtool_op_set_sg(struct net_device *dev, u32 data);
-u32 ethtool_op_get_tso(struct net_device *dev);
-int ethtool_op_set_tso(struct net_device *dev, u32 data);
-u32 ethtool_op_get_ufo(struct net_device *dev);
-int ethtool_op_set_ufo(struct net_device *dev, u32 data);
-u32 ethtool_op_get_flags(struct net_device *dev);
-int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported);
 void ethtool_ntuple_flush(struct net_device *dev);
 bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported);
 
@@ -784,22 +769,6 @@ bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported);
  * @get_pauseparam: Report pause parameters
  * @set_pauseparam: Set pause parameters.  Returns a negative error code
  *	or zero.
- * @get_rx_csum: Deprecated in favour of the netdev feature %NETIF_F_RXCSUM.
- *	Report whether receive checksums are turned on or off.
- * @set_rx_csum: Deprecated in favour of generic netdev features.  Turn
- *	receive checksum on or off.  Returns a negative error code or zero.
- * @get_tx_csum: Deprecated as redundant. Report whether transmit checksums
- *	are turned on or off.
- * @set_tx_csum: Deprecated in favour of generic netdev features.  Turn
- *	transmit checksums on or off.  Returns a egative error code or zero.
- * @get_sg: Deprecated as redundant.  Report whether scatter-gather is
- *	enabled.  
- * @set_sg: Deprecated in favour of generic netdev features.  Turn
- *	scatter-gather on or off. Returns a negative error code or zero.
- * @get_tso: Deprecated as redundant.  Report whether TCP segmentation
- *	offload is enabled.
- * @set_tso: Deprecated in favour of generic netdev features.  Turn TCP
- *	segmentation offload on or off.  Returns a negative error code or zero.
  * @self_test: Run specified self-tests
  * @get_strings: Return a set of strings that describe the requested objects
  * @set_phys_id: Identify the physical devices, e.g. by flashing an LED
@@ -827,15 +796,6 @@ bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported);
  *	negative error code or zero.
  * @complete: Function to be called after any other operation except
  *	@begin.  Will be called even if the other operation failed.
- * @get_ufo: Deprecated as redundant.  Report whether UDP fragmentation
- *	offload is enabled.
- * @set_ufo: Deprecated in favour of generic netdev features.  Turn UDP
- *	fragmentation offload on or off.  Returns a negative error code or zero.
- * @get_flags: Deprecated as redundant.  Report features included in
- *	&enum ethtool_flags that are enabled.  
- * @set_flags: Deprecated in favour of generic netdev features.  Turn
- *	features included in &enum ethtool_flags on or off.  Returns a
- *	negative error code or zero.
  * @get_priv_flags: Report driver-specific feature flags.
  * @set_priv_flags: Set driver-specific feature flags.  Returns a negative
  *	error code or zero.
@@ -897,14 +857,6 @@ struct ethtool_ops {
 				  struct ethtool_pauseparam*);
 	int	(*set_pauseparam)(struct net_device *,
 				  struct ethtool_pauseparam*);
-	u32	(*get_rx_csum)(struct net_device *);
-	int	(*set_rx_csum)(struct net_device *, u32);
-	u32	(*get_tx_csum)(struct net_device *);
-	int	(*set_tx_csum)(struct net_device *, u32);
-	u32	(*get_sg)(struct net_device *);
-	int	(*set_sg)(struct net_device *, u32);
-	u32	(*get_tso)(struct net_device *);
-	int	(*set_tso)(struct net_device *, u32);
 	void	(*self_test)(struct net_device *, struct ethtool_test *, u64 *);
 	void	(*get_strings)(struct net_device *, u32 stringset, u8 *);
 	int	(*set_phys_id)(struct net_device *, enum ethtool_phys_id_state);
@@ -913,10 +865,6 @@ struct ethtool_ops {
 				     struct ethtool_stats *, u64 *);
 	int	(*begin)(struct net_device *);
 	void	(*complete)(struct net_device *);
-	u32	(*get_ufo)(struct net_device *);
-	int	(*set_ufo)(struct net_device *, u32);
-	u32	(*get_flags)(struct net_device *);
-	int	(*set_flags)(struct net_device *, u32);
 	u32	(*get_priv_flags)(struct net_device *);
 	int	(*set_priv_flags)(struct net_device *, u32);
 	int	(*get_sset_count)(struct net_device *, int);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d5de66a..7be3ca2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2600,22 +2600,6 @@ extern struct pernet_operations __net_initdata loopback_net_ops;
 int dev_ethtool_get_settings(struct net_device *dev,
 			     struct ethtool_cmd *cmd);
 
-static inline u32 dev_ethtool_get_rx_csum(struct net_device *dev)
-{
-	if (dev->features & NETIF_F_RXCSUM)
-		return 1;
-	if (!dev->ethtool_ops || !dev->ethtool_ops->get_rx_csum)
-		return 0;
-	return dev->ethtool_ops->get_rx_csum(dev);
-}
-
-static inline u32 dev_ethtool_get_flags(struct net_device *dev)
-{
-	if (!dev->ethtool_ops || !dev->ethtool_ops->get_flags)
-		return 0;
-	return dev->ethtool_ops->get_flags(dev);
-}
-
 /* Logging, debugging and troubleshooting/diagnostic helpers. */
 
 /* netdev_printk helpers, similar to dev_printk */
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 526159a..df66715 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -592,7 +592,7 @@ static u32 vlan_dev_fix_features(struct net_device *dev, u32 features)
 
 	features &= real_dev->features;
 	features &= real_dev->vlan_features;
-	if (dev_ethtool_get_rx_csum(real_dev))
+	if (real_dev->features & NETIF_F_RXCSUM)
 		features |= NETIF_F_RXCSUM;
 	features |= NETIF_F_LLTX;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 44ef8f8..7193499 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1304,19 +1304,12 @@ EXPORT_SYMBOL(dev_close);
  */
 void dev_disable_lro(struct net_device *dev)
 {
-	u32 flags;
+	dev->wanted_features &= ~NETIF_F_LRO;
+	netdev_update_features(dev);
 
-	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
-		flags = dev->ethtool_ops->get_flags(dev);
-	else
-		flags = ethtool_op_get_flags(dev);
-
-	if (!(flags & ETH_FLAG_LRO))
-		return;
-
-	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
 	if (unlikely(dev->features & NETIF_F_LRO))
 		netdev_WARN(dev, "failed to disable LRO!\n");
+
 }
 EXPORT_SYMBOL(dev_disable_lro);
 
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index d8b1a8d..34f32b0 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -36,139 +36,6 @@ u32 ethtool_op_get_link(struct net_device *dev)
 }
 EXPORT_SYMBOL(ethtool_op_get_link);
 
-u32 ethtool_op_get_tx_csum(struct net_device *dev)
-{
-	return (dev->features & NETIF_F_ALL_CSUM) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_tx_csum);
-
-int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
-{
-	if (data)
-		dev->features |= NETIF_F_IP_CSUM;
-	else
-		dev->features &= ~NETIF_F_IP_CSUM;
-
-	return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_tx_csum);
-
-int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
-{
-	if (data)
-		dev->features |= NETIF_F_HW_CSUM;
-	else
-		dev->features &= ~NETIF_F_HW_CSUM;
-
-	return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
-
-int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data)
-{
-	if (data)
-		dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
-	else
-		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-
-	return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum);
-
-u32 ethtool_op_get_sg(struct net_device *dev)
-{
-	return (dev->features & NETIF_F_SG) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_sg);
-
-int ethtool_op_set_sg(struct net_device *dev, u32 data)
-{
-	if (data)
-		dev->features |= NETIF_F_SG;
-	else
-		dev->features &= ~NETIF_F_SG;
-
-	return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_sg);
-
-u32 ethtool_op_get_tso(struct net_device *dev)
-{
-	return (dev->features & NETIF_F_TSO) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_tso);
-
-int ethtool_op_set_tso(struct net_device *dev, u32 data)
-{
-	if (data)
-		dev->features |= NETIF_F_TSO;
-	else
-		dev->features &= ~NETIF_F_TSO;
-
-	return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_tso);
-
-u32 ethtool_op_get_ufo(struct net_device *dev)
-{
-	return (dev->features & NETIF_F_UFO) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_ufo);
-
-int ethtool_op_set_ufo(struct net_device *dev, u32 data)
-{
-	if (data)
-		dev->features |= NETIF_F_UFO;
-	else
-		dev->features &= ~NETIF_F_UFO;
-	return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_ufo);
-
-/* the following list of flags are the same as their associated
- * NETIF_F_xxx values in include/linux/netdevice.h
- */
-static const u32 flags_dup_features =
-	(ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |
-	 ETH_FLAG_RXHASH);
-
-u32 ethtool_op_get_flags(struct net_device *dev)
-{
-	/* in the future, this function will probably contain additional
-	 * handling for flags which are not so easily handled
-	 * by a simple masking operation
-	 */
-
-	return dev->features & flags_dup_features;
-}
-EXPORT_SYMBOL(ethtool_op_get_flags);
-
-/* Check if device can enable (or disable) particular feature coded in "data"
- * argument. Flags "supported" describe features that can be toggled by device.
- * If feature can not be toggled, it state (enabled or disabled) must match
- * hardcoded device features state, otherwise flags are marked as invalid.
- */
-bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported)
-{
-	u32 features = dev->features & flags_dup_features;
-	/* "data" can contain only flags_dup_features bits,
-	 * see __ethtool_set_flags */
-
-	return (features & ~supported) != (data & ~supported);
-}
-EXPORT_SYMBOL(ethtool_invalid_flags);
-
-int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported)
-{
-	if (ethtool_invalid_flags(dev, data, supported))
-		return -EINVAL;
-
-	dev->features = ((dev->features & ~flags_dup_features) |
-			 (data & flags_dup_features));
-	return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_flags);
-
 void ethtool_ntuple_flush(struct net_device *dev)
 {
 	struct ethtool_rx_ntuple_flow_spec_container *fsc, *f;
@@ -185,76 +52,6 @@ EXPORT_SYMBOL(ethtool_ntuple_flush);
 
 #define ETHTOOL_DEV_FEATURE_WORDS	1
 
-static void ethtool_get_features_compat(struct net_device *dev,
-	struct ethtool_get_features_block *features)
-{
-	if (!dev->ethtool_ops)
-		return;
-
-	/* getting RX checksum */
-	if (dev->ethtool_ops->get_rx_csum)
-		if (dev->ethtool_ops->get_rx_csum(dev))
-			features[0].active |= NETIF_F_RXCSUM;
-
-	/* mark legacy-changeable features */
-	if (dev->ethtool_ops->set_sg)
-		features[0].available |= NETIF_F_SG;
-	if (dev->ethtool_ops->set_tx_csum)
-		features[0].available |= NETIF_F_ALL_CSUM;
-	if (dev->ethtool_ops->set_tso)
-		features[0].available |= NETIF_F_ALL_TSO;
-	if (dev->ethtool_ops->set_rx_csum)
-		features[0].available |= NETIF_F_RXCSUM;
-	if (dev->ethtool_ops->set_flags)
-		features[0].available |= flags_dup_features;
-}
-
-static int ethtool_set_feature_compat(struct net_device *dev,
-	int (*legacy_set)(struct net_device *, u32),
-	struct ethtool_set_features_block *features, u32 mask)
-{
-	u32 do_set;
-
-	if (!legacy_set)
-		return 0;
-
-	if (!(features[0].valid & mask))
-		return 0;
-
-	features[0].valid &= ~mask;
-
-	do_set = !!(features[0].requested & mask);
-
-	if (legacy_set(dev, do_set) < 0)
-		netdev_info(dev,
-			"Legacy feature change (%s) failed for 0x%08x\n",
-			do_set ? "set" : "clear", mask);
-
-	return 1;
-}
-
-static int ethtool_set_features_compat(struct net_device *dev,
-	struct ethtool_set_features_block *features)
-{
-	int compat;
-
-	if (!dev->ethtool_ops)
-		return 0;
-
-	compat  = ethtool_set_feature_compat(dev, dev->ethtool_ops->set_sg,
-		features, NETIF_F_SG);
-	compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tx_csum,
-		features, NETIF_F_ALL_CSUM);
-	compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tso,
-		features, NETIF_F_ALL_TSO);
-	compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_rx_csum,
-		features, NETIF_F_RXCSUM);
-	compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_flags,
-		features, flags_dup_features);
-
-	return compat;
-}
-
 static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
 {
 	struct ethtool_gfeatures cmd = {
@@ -272,8 +69,6 @@ static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
 	u32 __user *sizeaddr;
 	u32 copy_size;
 
-	ethtool_get_features_compat(dev, features);
-
 	sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
 	if (get_user(copy_size, sizeaddr))
 		return -EFAULT;
@@ -309,9 +104,6 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
 	if (features[0].valid & ~NETIF_F_ETHTOOL_BITS)
 		return -EINVAL;
 
-	if (ethtool_set_features_compat(dev, features))
-		ret |= ETHTOOL_F_COMPAT;
-
 	if (features[0].valid & ~dev->hw_features) {
 		features[0].valid &= dev->hw_features;
 		ret |= ETHTOOL_F_UNSUPPORTED;
@@ -422,34 +214,6 @@ static u32 ethtool_get_feature_mask(u32 eth_cmd)
 	}
 }
 
-static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd)
-{
-	const struct ethtool_ops *ops = dev->ethtool_ops;
-
-	if (!ops)
-		return NULL;
-
-	switch (ethcmd) {
-	case ETHTOOL_GTXCSUM:
-		return ops->get_tx_csum;
-	case ETHTOOL_GRXCSUM:
-		return ops->get_rx_csum;
-	case ETHTOOL_SSG:
-		return ops->get_sg;
-	case ETHTOOL_STSO:
-		return ops->get_tso;
-	case ETHTOOL_SUFO:
-		return ops->get_ufo;
-	default:
-		return NULL;
-	}
-}
-
-static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev)
-{
-	return !!(dev->features & NETIF_F_ALL_CSUM);
-}
-
 static int ethtool_get_one_feature(struct net_device *dev,
 	char __user *useraddr, u32 ethcmd)
 {
@@ -459,31 +223,11 @@ static int ethtool_get_one_feature(struct net_device *dev,
 		.data = !!(dev->features & mask),
 	};
 
-	/* compatibility with discrete get_ ops */
-	if (!(dev->hw_features & mask)) {
-		u32 (*actor)(struct net_device *);
-
-		actor = __ethtool_get_one_feature_actor(dev, ethcmd);
-
-		/* bug compatibility with old get_rx_csum */
-		if (ethcmd == ETHTOOL_GRXCSUM && !actor)
-			actor = __ethtool_get_rx_csum_oldbug;
-
-		if (actor)
-			edata.data = actor(dev);
-	}
-
 	if (copy_to_user(useraddr, &edata, sizeof(edata)))
 		return -EFAULT;
 	return 0;
 }
 
-static int __ethtool_set_tx_csum(struct net_device *dev, u32 data);
-static int __ethtool_set_rx_csum(struct net_device *dev, u32 data);
-static int __ethtool_set_sg(struct net_device *dev, u32 data);
-static int __ethtool_set_tso(struct net_device *dev, u32 data);
-static int __ethtool_set_ufo(struct net_device *dev, u32 data);
-
 static int ethtool_set_one_feature(struct net_device *dev,
 	void __user *useraddr, u32 ethcmd)
 {
@@ -495,56 +239,38 @@ static int ethtool_set_one_feature(struct net_device *dev,
 
 	mask = ethtool_get_feature_mask(ethcmd);
 	mask &= dev->hw_features;
-	if (mask) {
-		if (edata.data)
-			dev->wanted_features |= mask;
-		else
-			dev->wanted_features &= ~mask;
-
-		__netdev_update_features(dev);
-		return 0;
-	}
-
-	/* Driver is not converted to ndo_fix_features or does not
-	 * support changing this offload. In the latter case it won't
-	 * have corresponding ethtool_ops field set.
-	 *
-	 * Following part is to be removed after all drivers advertise
-	 * their changeable features in netdev->hw_features and stop
-	 * using discrete offload setting ops.
-	 */
-
-	switch (ethcmd) {
-	case ETHTOOL_STXCSUM:
-		return __ethtool_set_tx_csum(dev, edata.data);
-	case ETHTOOL_SRXCSUM:
-		return __ethtool_set_rx_csum(dev, edata.data);
-	case ETHTOOL_SSG:
-		return __ethtool_set_sg(dev, edata.data);
-	case ETHTOOL_STSO:
-		return __ethtool_set_tso(dev, edata.data);
-	case ETHTOOL_SUFO:
-		return __ethtool_set_ufo(dev, edata.data);
-	default:
+	if (!mask)
 		return -EOPNOTSUPP;
-	}
+
+	if (edata.data)
+		dev->wanted_features |= mask;
+	else
+		dev->wanted_features &= ~mask;
+
+	__netdev_update_features(dev);
+
+	return 0;
+}
+
+/* the following list of flags are the same as their associated
+ * NETIF_F_xxx values in include/linux/netdevice.h
+ */
+static const u32 flags_dup_features =
+	(ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |
+	 ETH_FLAG_RXHASH);
+
+static u32 __ethtool_get_flags(struct net_device *dev)
+{
+	return dev->features & flags_dup_features;
 }
 
-int __ethtool_set_flags(struct net_device *dev, u32 data)
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
 {
 	u32 changed;
 
 	if (data & ~flags_dup_features)
 		return -EINVAL;
 
-	/* legacy set_flags() op */
-	if (dev->ethtool_ops->set_flags) {
-		if (unlikely(dev->hw_features & flags_dup_features))
-			netdev_warn(dev,
-				"driver BUG: mixed hw_features and set_flags()\n");
-		return dev->ethtool_ops->set_flags(dev, data);
-	}
-
 	/* allow changing only bits set in hw_features */
 	changed = (data ^ dev->features) & flags_dup_features;
 	if (changed & ~dev->hw_features)
@@ -1502,81 +1228,6 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
 	return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
 }
 
-static int __ethtool_set_sg(struct net_device *dev, u32 data)
-{
-	int err;
-
-	if (!dev->ethtool_ops->set_sg)
-		return -EOPNOTSUPP;
-
-	if (data && !(dev->features & NETIF_F_ALL_CSUM))
-		return -EINVAL;
-
-	if (!data && dev->ethtool_ops->set_tso) {
-		err = dev->ethtool_ops->set_tso(dev, 0);
-		if (err)
-			return err;
-	}
-
-	if (!data && dev->ethtool_ops->set_ufo) {
-		err = dev->ethtool_ops->set_ufo(dev, 0);
-		if (err)
-			return err;
-	}
-	return dev->ethtool_ops->set_sg(dev, data);
-}
-
-static int __ethtool_set_tx_csum(struct net_device *dev, u32 data)
-{
-	int err;
-
-	if (!dev->ethtool_ops->set_tx_csum)
-		return -EOPNOTSUPP;
-
-	if (!data && dev->ethtool_ops->set_sg) {
-		err = __ethtool_set_sg(dev, 0);
-		if (err)
-			return err;
-	}
-
-	return dev->ethtool_ops->set_tx_csum(dev, data);
-}
-
-static int __ethtool_set_rx_csum(struct net_device *dev, u32 data)
-{
-	if (!dev->ethtool_ops->set_rx_csum)
-		return -EOPNOTSUPP;
-
-	if (!data)
-		dev->features &= ~NETIF_F_GRO;
-
-	return dev->ethtool_ops->set_rx_csum(dev, data);
-}
-
-static int __ethtool_set_tso(struct net_device *dev, u32 data)
-{
-	if (!dev->ethtool_ops->set_tso)
-		return -EOPNOTSUPP;
-
-	if (data && !(dev->features & NETIF_F_SG))
-		return -EINVAL;
-
-	return dev->ethtool_ops->set_tso(dev, data);
-}
-
-static int __ethtool_set_ufo(struct net_device *dev, u32 data)
-{
-	if (!dev->ethtool_ops->set_ufo)
-		return -EOPNOTSUPP;
-	if (data && !(dev->features & NETIF_F_SG))
-		return -EINVAL;
-	if (data && !((dev->features & NETIF_F_GEN_CSUM) ||
-		(dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
-			== (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)))
-		return -EINVAL;
-	return dev->ethtool_ops->set_ufo(dev, data);
-}
-
 static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
 {
 	struct ethtool_test test;
@@ -1965,9 +1616,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 		break;
 	case ETHTOOL_GFLAGS:
 		rc = ethtool_get_value(dev, useraddr, ethcmd,
-				       (dev->ethtool_ops->get_flags ?
-					dev->ethtool_ops->get_flags :
-					ethtool_op_get_flags));
+					__ethtool_get_flags);
 		break;
 	case ETHTOOL_SFLAGS:
 		rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
-- 
1.7.2.5


^ permalink raw reply related

* [RFC PATCH] net: fold dev_disable_lro() into netdev_fix_features()
From: Michał Mirosław @ 2011-05-07 11:48 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Stephen Hemminger, Alexey Kuznetsov,
	Pekka Savola (ipv6), James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, Eric Dumazet, Tom Herbert, Ben Hutchings, bridge

This moves the checks for whether a device is forwarding out of the bridge,
IPv4 and IPv6 code and into netdev_fix_features(). As a side effect, a device
gets LRO back once it is no longer forwarding. It also means that the user is
not allowed to enable LRO while the device is in forwarding mode.

This patch depends on removal of discrete offload setting ethtool ops.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 include/linux/netdevice.h |    1 -
 net/bridge/br_if.c        |    6 +++---
 net/core/dev.c            |   41 +++++++++++++++++++++--------------------
 net/ipv4/devinet.c        |   20 +++++++++-----------
 net/ipv6/addrconf.c       |    7 +++----
 5 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 7be3ca2..3a8c21d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1627,7 +1627,6 @@ extern struct net_device	*__dev_get_by_name(struct net *net, const char *name);
 extern int		dev_alloc_name(struct net_device *dev, const char *name);
 extern int		dev_open(struct net_device *dev);
 extern int		dev_close(struct net_device *dev);
-extern void		dev_disable_lro(struct net_device *dev);
 extern int		dev_queue_xmit(struct sk_buff *skb);
 extern int		register_netdevice(struct net_device *dev);
 extern void		unregister_netdevice_queue(struct net_device *dev,
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 5dbdfdf..62aab1e 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -158,6 +158,8 @@ static void del_nbp(struct net_bridge_port *p)
 	br_netpoll_disable(p);
 
 	call_rcu(&p->rcu, destroy_nbp_rcu);
+
+	netdev_update_features(dev);
 }
 
 /* called with RTNL */
@@ -368,11 +370,9 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 
 	dev->priv_flags |= IFF_BRIDGE_PORT;
 
-	dev_disable_lro(dev);
-
 	list_add_rcu(&p->list, &br->port_list);
 
-	netdev_update_features(br->dev);
+	netdev_update_features(dev);
 
 	spin_lock_bh(&br->lock);
 	changed_addr = br_stp_recalculate_bridge_id(br);
diff --git a/net/core/dev.c b/net/core/dev.c
index 7193499..3d646c9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -132,6 +132,7 @@
 #include <trace/events/skb.h>
 #include <linux/pci.h>
 #include <linux/inetdevice.h>
+#include <net/addrconf.h>
 #include <linux/cpu_rmap.h>
 
 #include "net-sysfs.h"
@@ -1294,26 +1295,6 @@ int dev_close(struct net_device *dev)
 EXPORT_SYMBOL(dev_close);
 
 
-/**
- *	dev_disable_lro - disable Large Receive Offload on a device
- *	@dev: device
- *
- *	Disable Large Receive Offload (LRO) on a net device.  Must be
- *	called under RTNL.  This is needed if received packets may be
- *	forwarded to another interface.
- */
-void dev_disable_lro(struct net_device *dev)
-{
-	dev->wanted_features &= ~NETIF_F_LRO;
-	netdev_update_features(dev);
-
-	if (unlikely(dev->features & NETIF_F_LRO))
-		netdev_WARN(dev, "failed to disable LRO!\n");
-
-}
-EXPORT_SYMBOL(dev_disable_lro);
-
-
 static int dev_boot_phase = 1;
 
 /**
@@ -5239,6 +5220,26 @@ u32 netdev_fix_features(struct net_device *dev, u32 features)
 		}
 	}
 
+	if (features & NETIF_F_LRO) {
+		struct in_device *in4_dev;
+		struct inet6_dev *in6_dev;
+
+		/* packets may be forwarded: disable LRO for bridge ports */
+		if (dev->priv_flags & IFF_BRIDGE_PORT) {
+			netdev_info(dev, "Disabling LRO for bridge port.\n");
+			features &= ~NETIF_F_LRO;
+		} else /* ... or when forwarding IPv4 */
+		if (((in4_dev = __in_dev_get_rtnl(dev))) &&
+		    IN_DEV_CONF_GET(in4_dev, FORWARDING)) {
+			netdev_info(dev, "Disabling LRO for IPv4 router port.\n");
+			features &= ~NETIF_F_LRO;
+		} else /* ... or when forwarding IPv6 */
+		if (((in6_dev = __in6_dev_get(dev))) && in6_dev->cnf.forwarding) {
+			netdev_info(dev, "Disabling LRO for IPv6 router port.\n");
+			features &= ~NETIF_F_LRO;
+		}
+	}
+
 	return features;
 }
 EXPORT_SYMBOL(netdev_fix_features);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index cd9ca08..e9c0557 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -245,8 +245,6 @@ static struct in_device *inetdev_init(struct net_device *dev)
 	in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
 	if (!in_dev->arp_parms)
 		goto out_kfree;
-	if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
-		dev_disable_lro(dev);
 	/* Reference in_dev->dev */
 	dev_hold(dev);
 	/* Account for reference dev->ip_ptr (below) */
@@ -259,6 +257,8 @@ static struct in_device *inetdev_init(struct net_device *dev)
 
 	/* we can receive as soon as ip_ptr is set -- do this last */
 	rcu_assign_pointer(dev->ip_ptr, in_dev);
+
+	netdev_update_features(dev);
 out:
 	return in_dev;
 out_kfree:
@@ -1475,14 +1475,12 @@ static void inet_forward_change(struct net *net)
 	IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
 
 	for_each_netdev(net, dev) {
-		struct in_device *in_dev;
-		if (on)
-			dev_disable_lro(dev);
-		rcu_read_lock();
-		in_dev = __in_dev_get_rcu(dev);
-		if (in_dev)
+		struct in_device *in_dev = __in_dev_get_rtnl(dev);
+
+		if (in_dev) {
 			IN_DEV_CONF_SET(in_dev, FORWARDING, on);
-		rcu_read_unlock();
+			netdev_update_features(in_dev->dev);
+		}
 	}
 }
 
@@ -1527,11 +1525,11 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write,
 			}
 			if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
 				inet_forward_change(net);
-			} else if (*valp) {
+			} else {
 				struct ipv4_devconf *cnf = ctl->extra1;
 				struct in_device *idev =
 					container_of(cnf, struct in_device, cnf);
-				dev_disable_lro(idev->dev);
+				netdev_update_features(idev->dev);
 			}
 			rtnl_unlock();
 			rt_cache_flush(net, 0);
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f2f9b2e..d1344ac 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -370,8 +370,6 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
 		kfree(ndev);
 		return NULL;
 	}
-	if (ndev->cnf.forwarding)
-		dev_disable_lro(dev);
 	/* We refer to the device */
 	dev_hold(dev);
 
@@ -435,6 +433,7 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev)
 	addrconf_sysctl_register(ndev);
 	/* protected by rtnl_lock */
 	rcu_assign_pointer(dev->ip6_ptr, ndev);
+	netdev_update_features(dev);
 
 	/* Join all-node multicast group */
 	ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
@@ -469,8 +468,6 @@ static void dev_forward_change(struct inet6_dev *idev)
 	if (!idev)
 		return;
 	dev = idev->dev;
-	if (idev->cnf.forwarding)
-		dev_disable_lro(dev);
 	if (dev && (dev->flags & IFF_MULTICAST)) {
 		if (idev->cnf.forwarding)
 			ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
@@ -486,6 +483,8 @@ static void dev_forward_change(struct inet6_dev *idev)
 		else
 			addrconf_leave_anycast(ifa);
 	}
+
+	netdev_update_features(dev);
 }
 
 
-- 
1.7.2.5


^ permalink raw reply related

* [PATCH] net: bonding: factor out rlock(bond->lock) in xmit path
From: Michał Mirosław @ 2011-05-07 11:48 UTC (permalink / raw)
  To: netdev; +Cc: Jay Vosburgh, Andy Gospodarek

Pull read_lock(&bond->lock) and BOND_IS_OK() to bond_start_xmit() from
mode-dependent xmit functions.

netif_running() is always true in hard_start_xmit.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/bonding/bond_3ad.c  |   10 +-----
 drivers/net/bonding/bond_alb.c  |   11 +-----
 drivers/net/bonding/bond_main.c |   74 +++++++++++++++++----------------------
 3 files changed, 35 insertions(+), 60 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index d4160f8..c7537abc 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2403,14 +2403,6 @@ int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev)
 	struct ad_info ad_info;
 	int res = 1;
 
-	/* make sure that the slaves list will
-	 * not change during tx
-	 */
-	read_lock(&bond->lock);
-
-	if (!BOND_IS_OK(bond))
-		goto out;
-
 	if (bond_3ad_get_active_agg_info(bond, &ad_info)) {
 		pr_debug("%s: Error: bond_3ad_get_active_agg_info failed\n",
 			 dev->name);
@@ -2464,7 +2456,7 @@ out:
 		/* no suitable interface, frame not sent */
 		dev_kfree_skb(skb);
 	}
-	read_unlock(&bond->lock);
+
 	return NETDEV_TX_OK;
 }
 
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 3b7b040..8f2d2e7 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1225,16 +1225,10 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 	skb_reset_mac_header(skb);
 	eth_data = eth_hdr(skb);
 
-	/* make sure that the curr_active_slave and the slaves list do
-	 * not change during tx
+	/* make sure that the curr_active_slave does not change during tx
 	 */
-	read_lock(&bond->lock);
 	read_lock(&bond->curr_slave_lock);
 
-	if (!BOND_IS_OK(bond)) {
-		goto out;
-	}
-
 	switch (ntohs(skb->protocol)) {
 	case ETH_P_IP: {
 		const struct iphdr *iph = ip_hdr(skb);
@@ -1334,13 +1328,12 @@ int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 		}
 	}
 
-out:
 	if (res) {
 		/* no suitable interface, frame not sent */
 		dev_kfree_skb(skb);
 	}
 	read_unlock(&bond->curr_slave_lock);
-	read_unlock(&bond->lock);
+
 	return NETDEV_TX_OK;
 }
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 04a2205..1f8902e 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3975,10 +3975,6 @@ static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev
 	int i, slave_no, res = 1;
 	struct iphdr *iph = ip_hdr(skb);
 
-	read_lock(&bond->lock);
-
-	if (!BOND_IS_OK(bond))
-		goto out;
 	/*
 	 * Start with the curr_active_slave that joined the bond as the
 	 * default for sending IGMP traffic.  For failover purposes one
@@ -4025,7 +4021,7 @@ out:
 		/* no suitable interface, frame not sent */
 		dev_kfree_skb(skb);
 	}
-	read_unlock(&bond->lock);
+
 	return NETDEV_TX_OK;
 }
 
@@ -4039,24 +4035,18 @@ static int bond_xmit_activebackup(struct sk_buff *skb, struct net_device *bond_d
 	struct bonding *bond = netdev_priv(bond_dev);
 	int res = 1;
 
-	read_lock(&bond->lock);
 	read_lock(&bond->curr_slave_lock);
 
-	if (!BOND_IS_OK(bond))
-		goto out;
+	if (bond->curr_active_slave)
+		res = bond_dev_queue_xmit(bond, skb,
+			bond->curr_active_slave->dev);
 
-	if (!bond->curr_active_slave)
-		goto out;
-
-	res = bond_dev_queue_xmit(bond, skb, bond->curr_active_slave->dev);
-
-out:
 	if (res)
 		/* no suitable interface, frame not sent */
 		dev_kfree_skb(skb);
 
 	read_unlock(&bond->curr_slave_lock);
-	read_unlock(&bond->lock);
+
 	return NETDEV_TX_OK;
 }
 
@@ -4073,11 +4063,6 @@ static int bond_xmit_xor(struct sk_buff *skb, struct net_device *bond_dev)
 	int i;
 	int res = 1;
 
-	read_lock(&bond->lock);
-
-	if (!BOND_IS_OK(bond))
-		goto out;
-
 	slave_no = bond->xmit_hash_policy(skb, bond->slave_cnt);
 
 	bond_for_each_slave(bond, slave, i) {
@@ -4097,12 +4082,11 @@ static int bond_xmit_xor(struct sk_buff *skb, struct net_device *bond_dev)
 		}
 	}
 
-out:
 	if (res) {
 		/* no suitable interface, frame not sent */
 		dev_kfree_skb(skb);
 	}
-	read_unlock(&bond->lock);
+
 	return NETDEV_TX_OK;
 }
 
@@ -4117,11 +4101,6 @@ static int bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev)
 	int i;
 	int res = 1;
 
-	read_lock(&bond->lock);
-
-	if (!BOND_IS_OK(bond))
-		goto out;
-
 	read_lock(&bond->curr_slave_lock);
 	start_at = bond->curr_active_slave;
 	read_unlock(&bond->curr_slave_lock);
@@ -4160,7 +4139,6 @@ out:
 		dev_kfree_skb(skb);
 
 	/* frame sent to all suitable interfaces */
-	read_unlock(&bond->lock);
 	return NETDEV_TX_OK;
 }
 
@@ -4192,10 +4170,8 @@ static inline int bond_slave_override(struct bonding *bond,
 	struct slave *slave = NULL;
 	struct slave *check_slave;
 
-	read_lock(&bond->lock);
-
-	if (!BOND_IS_OK(bond) || !skb->queue_mapping)
-		goto out;
+	if (!skb->queue_mapping)
+		return 1;
 
 	/* Find out if any slaves have the same mapping as this skb. */
 	bond_for_each_slave(bond, check_slave, i) {
@@ -4211,8 +4187,6 @@ static inline int bond_slave_override(struct bonding *bond,
 		res = bond_dev_queue_xmit(bond, skb, slave->dev);
 	}
 
-out:
-	read_unlock(&bond->lock);
 	return res;
 }
 
@@ -4234,17 +4208,10 @@ static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb)
 	return txq;
 }
 
-static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct bonding *bond = netdev_priv(dev);
 
-	/*
-	 * If we risk deadlock from transmitting this in the
-	 * netpoll path, tell netpoll to queue the frame for later tx
-	 */
-	if (is_netpoll_tx_blocked(dev))
-		return NETDEV_TX_BUSY;
-
 	if (TX_QUEUE_OVERRIDE(bond->params.mode)) {
 		if (!bond_slave_override(bond, skb))
 			return NETDEV_TX_OK;
@@ -4274,6 +4241,29 @@ static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 }
 
+static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct bonding *bond = netdev_priv(dev);
+	netdev_tx_t ret = NETDEV_TX_OK;
+
+	/*
+	 * If we risk deadlock from transmitting this in the
+	 * netpoll path, tell netpoll to queue the frame for later tx
+	 */
+	if (is_netpoll_tx_blocked(dev))
+		return NETDEV_TX_BUSY;
+
+	read_lock(&bond->lock);
+
+	if (bond->slave_cnt)
+		ret = __bond_start_xmit(skb, dev);
+	else
+		dev_kfree_skb(skb);
+
+	read_unlock(&bond->lock);
+
+	return ret;
+}
 
 /*
  * set bond mode specific net device operations
-- 
1.7.2.5


^ permalink raw reply related

* Scalability of interface creation and deletion
From: Alex Bligh @ 2011-05-07 11:08 UTC (permalink / raw)
  To: netdev; +Cc: Alex Bligh

I am trying to track down why interface creation slows down badly with
large numbers of interfaces (~1,000 interfaces) and why deletion is so
slow. Use case: restarting routers needs to be fast; some failover methods
require interface up/down; some routers need lots of interfaces.

I have written a small shell script to create and delete a number of
interfaces supplied on the command line (script appended below). It
is important to run this with udev, udev-bridge etc. disabled. In
my environment (Ubuntu 2.6.32-28-generic, Lucid), I did this by:
 * service upstart-udev-bridge stop
 * service udev stop
 * unshare -n bash
If you don't do this, you are simply timing your distro's interface
scripts.

Note the "-n" parameter creates the supplied number of veth pair
interfaces. As these are pairs, there are twice as many interfaces actually
created.

So, the results which are pretty repeatable are as follows:

                            100 pairs      500 pairs
Interface creation               14ms          110ms
Interface deletion              160ms          148ms

Now I don't think interface deletion has in fact got faster: simply
the overhead of loading the script is spread over more processes.
But there are two obvious conclusions:

1. Interface creation slows down hugely with more interfaces
2. Interface deletion is normally much slower than interface creation

strace -T -ttt on the "ip" command used to do this does not show the delay
where I thought it would be - cataloguing the existing interfaces. Instead,
it's the final send() to the netlink socket which does the relevant action
which appears to be slow, for both addition and deletion. Adding the last
interface takes 200ms in that syscall, the first is quick (symptomatic of a
slowdown); for deletion the last send syscall is quick.

Poking about in net/core/dev.c, I see that interface names are hashed using
a hash with a maximum of 256 entries. However, these seem to be hash
buckets supporting multiple entries so I can't imagine a chain of 4 entries
is problematic.

I am having difficulty seeing what might be the issue in interface
creation. Any ideas?

In interface deletion, my attention is drawn to netdev_wait_allrefs,
which does this:
        refcnt = netdev_refcnt_read(dev);

        while (refcnt != 0) {
                ...
                msleep(250);

                refcnt = netdev_refcnt_read(dev);
		....
        }

I am guessing that this is going to do the msleep 50% of the time,
explaining 125ms of the observed time. How would people react to
exponential backoff instead (untested):

	int backoff = 10;
        refcnt = netdev_refcnt_read(dev);

        while (refcnt != 0) {
                ...
                msleep(backoff);
                if ((backoff *= 2) > 250)
                  backoff = 250;

                refcnt = netdev_refcnt_read(dev);
		....
        }

-- 
Alex Bligh

#!/bin/bash

# Usage:
#   ifaceseq [options]
#
# Options:
#   -n NUM : use NUM interfaces
#   -t TYPE : use TYPE of interfaces (supported: veth, vlan)

# Defaults; both may be overridden from the command line (-n NUM, -t TYPE).
numifs=10
itype=veth

# Walk the option list; flags other than -n and -t leave the defaults alone.
while getopts n:t: flag; do
    if [ "${flag}" = n ]; then
	numifs=${OPTARG}
    elif [ "${flag}" = t ]; then
	itype=${OPTARG}
    fi
done

# Discard the options that getopts consumed.
shift $((OPTIND-1))

# Create $numifs interfaces, printing a timestamp before and after.
# For itype "vlan" these are VLANs vlanN (id N) on top of eth0; for any
# other type they are veth pairs testaN <-> testbN.
createifs ()
{
    echo $(date) creating $numifs interfaces
    if [ "${itype}" = vlan ] ; then
	for i in $(seq 1 $numifs) ; do
	    ip link add link eth0 name vlan${i} type vlan id ${i}
	done
    else
	for i in $(seq 1 $numifs) ; do
	    ip link add testa${i} type veth peer name testb${i}
	done
    fi
    echo $(date) done
}

# Delete the $numifs interfaces made by createifs, printing a timestamp
# before and after. For itype "vlan" each vlanN is removed; otherwise
# only testaN is deleted (the script never deletes testbN explicitly).
deleteifs ()
{
    echo $(date) deleting $numifs interfaces
    if [ "${itype}" = vlan ] ; then
	for i in $(seq 1 $numifs) ; do
	    ip link delete dev vlan${i}
	done
    else
	for i in $(seq 1 $numifs) ; do
	    ip link delete testa${i}
	done
    fi
    echo $(date) done
}

# Time the bulk creation, then the bulk deletion, of the interfaces.
time createifs;
time deleteifs;

^ permalink raw reply

* [net-next-2.6 5/5] ixgbe: add ethtool counters for OS2BMC
From: Jeff Kirsher @ 2011-05-07 10:25 UTC (permalink / raw)
  To: davem; +Cc: Emil Tantilov, netdev, gospo, bphilips, Jeff Kirsher
In-Reply-To: <1304763923-6839-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Emil Tantilov <emil.s.tantilov@intel.com>

OS2BMC registers are available for X540.
This patch adds ethtool counters based on those registers.

Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Tested-by: Evan Swanson <evan.swanson@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ixgbe/ixgbe_ethtool.c |    4 ++++
 drivers/net/ixgbe/ixgbe_main.c    |    7 ++++++-
 drivers/net/ixgbe/ixgbe_type.h    |    8 ++++++++
 3 files changed, 18 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index 545b231..1fdd075 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -102,6 +102,10 @@ static struct ixgbe_stats ixgbe_gstrings_stats[] = {
 	{"alloc_rx_page_failed", IXGBE_STAT(alloc_rx_page_failed)},
 	{"alloc_rx_buff_failed", IXGBE_STAT(alloc_rx_buff_failed)},
 	{"rx_no_dma_resources", IXGBE_STAT(hw_rx_no_dma_resources)},
+	{"os2bmc_rx_by_bmc", IXGBE_STAT(stats.o2bgptc)},
+	{"os2bmc_tx_by_bmc", IXGBE_STAT(stats.b2ospc)},
+	{"os2bmc_tx_by_host", IXGBE_STAT(stats.o2bspc)},
+	{"os2bmc_rx_by_host", IXGBE_STAT(stats.b2ogprc)},
 #ifdef IXGBE_FCOE
 	{"fcoe_bad_fccrc", IXGBE_STAT(stats.fccrc)},
 	{"rx_fcoe_dropped", IXGBE_STAT(stats.fcoerpdc)},
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 56cc9a1..a3e384b 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -5912,8 +5912,13 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter)
 		hwstats->gotc += IXGBE_READ_REG(hw, IXGBE_GOTCH);
 		hwstats->tor += IXGBE_READ_REG(hw, IXGBE_TORH);
 		break;
-	case ixgbe_mac_82599EB:
 	case ixgbe_mac_X540:
+		/* OS2BMC stats are X540 only*/
+		hwstats->o2bgptc += IXGBE_READ_REG(hw, IXGBE_O2BGPTC);
+		hwstats->o2bspc += IXGBE_READ_REG(hw, IXGBE_O2BSPC);
+		hwstats->b2ospc += IXGBE_READ_REG(hw, IXGBE_B2OSPC);
+		hwstats->b2ogprc += IXGBE_READ_REG(hw, IXGBE_B2OGPRC);
+	case ixgbe_mac_82599EB:
 		hwstats->gorc += IXGBE_READ_REG(hw, IXGBE_GORCL);
 		IXGBE_READ_REG(hw, IXGBE_GORCH); /* to clear */
 		hwstats->gotc += IXGBE_READ_REG(hw, IXGBE_GOTCL);
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index b1d523c..70e6870 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -672,6 +672,10 @@
 #define IXGBE_FCOEDWRC  0x0242C /* Number of FCoE DWords Received */
 #define IXGBE_FCOEPTC   0x08784 /* Number of FCoE Packets Transmitted */
 #define IXGBE_FCOEDWTC  0x08788 /* Number of FCoE DWords Transmitted */
+#define IXGBE_O2BGPTC   0x041C4
+#define IXGBE_O2BSPC    0x087B0
+#define IXGBE_B2OSPC    0x041C0
+#define IXGBE_B2OGPRC   0x02F90
 #define IXGBE_PCRC8ECL  0x0E810
 #define IXGBE_PCRC8ECH  0x0E811
 #define IXGBE_PCRC8ECH_MASK     0x1F
@@ -2554,6 +2558,10 @@ struct ixgbe_hw_stats {
 	u64 fcoeptc;
 	u64 fcoedwrc;
 	u64 fcoedwtc;
+	u64 b2ospc;
+	u64 b2ogprc;
+	u64 o2bgptc;
+	u64 o2bspc;
 };
 
 /* forward declaration */
-- 
1.7.4.4


^ permalink raw reply related

* [net-next-2.6 4/5] ixgbe: add rxhash support
From: Jeff Kirsher @ 2011-05-07 10:25 UTC (permalink / raw)
  To: davem; +Cc: Emil Tantilov, netdev, gospo, bphilips, Jeff Kirsher
In-Reply-To: <1304763923-6839-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Emil Tantilov <emil.s.tantilov@intel.com>

feed RSS hash into skb->rxhash

Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
Tested-by: Evan Swanson <evan.swanson@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ixgbe/ixgbe_ethtool.c |    7 ++++++-
 drivers/net/ixgbe/ixgbe_main.c    |   14 +++++++++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index f2efa32..545b231 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -2253,8 +2253,13 @@ static int ixgbe_set_flags(struct net_device *netdev, u32 data)
 	need_reset = (data & ETH_FLAG_RXVLAN) !=
 		     (netdev->features & NETIF_F_HW_VLAN_RX);
 
+	if ((data & ETH_FLAG_RXHASH) &&
+	    !(adapter->flags & IXGBE_FLAG_RSS_ENABLED))
+		return -EOPNOTSUPP;
+
 	rc = ethtool_op_set_flags(netdev, data, ETH_FLAG_LRO | ETH_FLAG_NTUPLE |
-					ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN);
+				  ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN |
+				  ETH_FLAG_RXHASH);
 	if (rc)
 		return rc;
 
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index eebb192..56cc9a1 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -1063,8 +1063,14 @@ static int __ixgbe_notify_dca(struct device *dev, void *data)
 
 	return 0;
 }
-
 #endif /* CONFIG_IXGBE_DCA */
+
+static inline void ixgbe_rx_hash(union ixgbe_adv_rx_desc *rx_desc,
+				 struct sk_buff *skb)
+{
+	skb->rxhash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
+}
+
 /**
  * ixgbe_receive_skb - Send a completed packet up the stack
  * @adapter: board private structure
@@ -1456,6 +1462,8 @@ static void ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
 		}
 
 		ixgbe_rx_checksum(adapter, rx_desc, skb);
+		if (adapter->netdev->features & NETIF_F_RXHASH)
+			ixgbe_rx_hash(rx_desc, skb);
 
 		/* probably a little skewed due to removing CRC */
 		total_rx_bytes += skb->len;
@@ -7361,6 +7369,7 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	netdev->features |= NETIF_F_TSO;
 	netdev->features |= NETIF_F_TSO6;
 	netdev->features |= NETIF_F_GRO;
+	netdev->features |= NETIF_F_RXHASH;
 
 	switch (adapter->hw.mac.type) {
 	case ixgbe_mac_82599EB:
@@ -7441,6 +7450,9 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	if (err)
 		goto err_sw_init;
 
+	if (!(adapter->flags & IXGBE_FLAG_RSS_ENABLED))
+		netdev->features &= ~NETIF_F_RXHASH;
+
 	switch (pdev->device) {
 	case IXGBE_DEV_ID_82599_SFP:
 		/* Only this subdevice supports WOL */
-- 
1.7.4.4


^ permalink raw reply related

* [net-next-2.6 3/5] igb: convert to ethtool set_phys_id
From: Jeff Kirsher @ 2011-05-07 10:25 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, bphilips, Stephen Hemminger
In-Reply-To: <1304763923-6839-1-git-send-email-jeffrey.t.kirsher@intel.com>

Based on patch from Stephen Hemminger.
Convert igb driver to use new set_phys_id ethtool interface.

CC: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/igb/igb_ethtool.c |   37 +++++++++++++++++++------------------
 1 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/drivers/net/igb/igb_ethtool.c b/drivers/net/igb/igb_ethtool.c
index 6e29634..fdc895e 100644
--- a/drivers/net/igb/igb_ethtool.c
+++ b/drivers/net/igb/igb_ethtool.c
@@ -1964,27 +1964,28 @@ static int igb_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
 /* bit defines for adapter->led_status */
 #define IGB_LED_ON		0
 
-static int igb_phys_id(struct net_device *netdev, u32 data)
+static int igb_set_phys_id(struct net_device *netdev,
+			   enum ethtool_phys_id_state state)
 {
 	struct igb_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
-	unsigned long timeout;
 
-	timeout = data * 1000;
-
-	/*
-	 *  msleep_interruptable only accepts unsigned int so we are limited
-	 * in how long a duration we can wait
-	 */
-	if (!timeout || timeout > UINT_MAX)
-		timeout = UINT_MAX;
-
-	igb_blink_led(hw);
-	msleep_interruptible(timeout);
-
-	igb_led_off(hw);
-	clear_bit(IGB_LED_ON, &adapter->led_status);
-	igb_cleanup_led(hw);
+	switch (state) {
+	case ETHTOOL_ID_ACTIVE:
+		igb_blink_led(hw);
+		return 2;
+	case ETHTOOL_ID_ON:
+		igb_blink_led(hw);
+		break;
+	case ETHTOOL_ID_OFF:
+		igb_led_off(hw);
+		break;
+	case ETHTOOL_ID_INACTIVE:
+		igb_led_off(hw);
+		clear_bit(IGB_LED_ON, &adapter->led_status);
+		igb_cleanup_led(hw);
+		break;
+	}
 
 	return 0;
 }
@@ -2216,7 +2217,7 @@ static const struct ethtool_ops igb_ethtool_ops = {
 	.set_tso                = igb_set_tso,
 	.self_test              = igb_diag_test,
 	.get_strings            = igb_get_strings,
-	.phys_id                = igb_phys_id,
+	.set_phys_id            = igb_set_phys_id,
 	.get_sset_count         = igb_get_sset_count,
 	.get_ethtool_stats      = igb_get_ethtool_stats,
 	.get_coalesce           = igb_get_coalesce,
-- 
1.7.4.4


^ permalink raw reply related

* [net-next-2.6 2/5] e1000: convert to set_phys_id
From: Jeff Kirsher @ 2011-05-07 10:25 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, bphilips, Stephen Hemminger
In-Reply-To: <1304763923-6839-1-git-send-email-jeffrey.t.kirsher@intel.com>

Based on the original patch from Stephen Hemminger.
Convert to new LED control infrastructure and remove no longer
necessary bits.

CC: Stephen Hemminger <shemminger@vyatta.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/e1000/e1000.h         |    3 --
 drivers/net/e1000/e1000_ethtool.c |   50 ++++++++++++-------------------------
 2 files changed, 16 insertions(+), 37 deletions(-)

diff --git a/drivers/net/e1000/e1000.h b/drivers/net/e1000/e1000.h
index b1b23dd..8676899 100644
--- a/drivers/net/e1000/e1000.h
+++ b/drivers/net/e1000/e1000.h
@@ -238,9 +238,6 @@ struct e1000_adapter {
 	struct work_struct reset_task;
 	u8 fc_autoneg;
 
-	struct timer_list blink_timer;
-	unsigned long led_status;
-
 	/* TX */
 	struct e1000_tx_ring *tx_ring;      /* One per active queue */
 	unsigned int restart_queue;
diff --git a/drivers/net/e1000/e1000_ethtool.c b/drivers/net/e1000/e1000_ethtool.c
index 4fa727c..ec0fa42 100644
--- a/drivers/net/e1000/e1000_ethtool.c
+++ b/drivers/net/e1000/e1000_ethtool.c
@@ -1755,46 +1755,28 @@ static int e1000_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
 	return 0;
 }
 
-/* toggle LED 4 times per second = 2 "blinks" per second */
-#define E1000_ID_INTERVAL	(HZ/4)
-
-/* bit defines for adapter->led_status */
-#define E1000_LED_ON		0
-
-static void e1000_led_blink_callback(unsigned long data)
+static int e1000_set_phys_id(struct net_device *netdev,
+			     enum ethtool_phys_id_state state)
 {
-	struct e1000_adapter *adapter = (struct e1000_adapter *) data;
+	struct e1000_adapter *adapter = netdev_priv(netdev);
 	struct e1000_hw *hw = &adapter->hw;
 
-	if (test_and_change_bit(E1000_LED_ON, &adapter->led_status))
-		e1000_led_off(hw);
-	else
-		e1000_led_on(hw);
-
-	mod_timer(&adapter->blink_timer, jiffies + E1000_ID_INTERVAL);
-}
+	switch (state) {
+	case ETHTOOL_ID_ACTIVE:
+		e1000_setup_led(hw);
+		return 2;
 
-static int e1000_phys_id(struct net_device *netdev, u32 data)
-{
-	struct e1000_adapter *adapter = netdev_priv(netdev);
-	struct e1000_hw *hw = &adapter->hw;
+	case ETHTOOL_ID_ON:
+		e1000_led_on(hw);
+		break;
 
-	if (!data)
-		data = INT_MAX;
+	case ETHTOOL_ID_OFF:
+		e1000_led_off(hw);
+		break;
 
-	if (!adapter->blink_timer.function) {
-		init_timer(&adapter->blink_timer);
-		adapter->blink_timer.function = e1000_led_blink_callback;
-		adapter->blink_timer.data = (unsigned long)adapter;
+	case ETHTOOL_ID_INACTIVE:
+		e1000_cleanup_led(hw);
 	}
-	e1000_setup_led(hw);
-	mod_timer(&adapter->blink_timer, jiffies);
-	msleep_interruptible(data * 1000);
-	del_timer_sync(&adapter->blink_timer);
-
-	e1000_led_off(hw);
-	clear_bit(E1000_LED_ON, &adapter->led_status);
-	e1000_cleanup_led(hw);
 
 	return 0;
 }
@@ -1931,7 +1913,7 @@ static const struct ethtool_ops e1000_ethtool_ops = {
 	.set_tso                = e1000_set_tso,
 	.self_test              = e1000_diag_test,
 	.get_strings            = e1000_get_strings,
-	.phys_id                = e1000_phys_id,
+	.set_phys_id            = e1000_set_phys_id,
 	.get_ethtool_stats      = e1000_get_ethtool_stats,
 	.get_sset_count         = e1000_get_sset_count,
 	.get_coalesce           = e1000_get_coalesce,
-- 
1.7.4.4


^ permalink raw reply related

* [net-next-2.6 1/5] e100: implemenet set_phys_id
From: Jeff Kirsher @ 2011-05-07 10:25 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, bphilips, Stephen Hemminger
In-Reply-To: <1304763923-6839-1-git-send-email-jeffrey.t.kirsher@intel.com>

Based on the original patch from Stephen Hemminger.
Implement set_phys_id to control LED.

CC: Stephen Hemminger <shemminger@vyatta.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/e100.c |   66 ++++++++++++++++++++++------------------------------
 1 files changed, 28 insertions(+), 38 deletions(-)

diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 29f812d..e336c79 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -593,7 +593,6 @@ struct nic {
 	enum phy phy;
 	struct params params;
 	struct timer_list watchdog;
-	struct timer_list blink_timer;
 	struct mii_if_info mii;
 	struct work_struct tx_timeout_task;
 	enum loopback loopback;
@@ -618,7 +617,6 @@ struct nic {
 	u32 rx_tco_frames;
 	u32 rx_over_length_errors;
 
-	u16 leds;
 	u16 eeprom_wc;
 	__le16 eeprom[256];
 	spinlock_t mdio_lock;
@@ -2353,30 +2351,6 @@ err_clean_rx:
 #define E100_82552_LED_OVERRIDE 0x19
 #define E100_82552_LED_ON       0x000F /* LEDTX and LED_RX both on */
 #define E100_82552_LED_OFF      0x000A /* LEDTX and LED_RX both off */
-static void e100_blink_led(unsigned long data)
-{
-	struct nic *nic = (struct nic *)data;
-	enum led_state {
-		led_on     = 0x01,
-		led_off    = 0x04,
-		led_on_559 = 0x05,
-		led_on_557 = 0x07,
-	};
-	u16 led_reg = MII_LED_CONTROL;
-
-	if (nic->phy == phy_82552_v) {
-		led_reg = E100_82552_LED_OVERRIDE;
-
-		nic->leds = (nic->leds == E100_82552_LED_ON) ?
-		            E100_82552_LED_OFF : E100_82552_LED_ON;
-	} else {
-		nic->leds = (nic->leds & led_on) ? led_off :
-		            (nic->mac < mac_82559_D101M) ? led_on_557 :
-		            led_on_559;
-	}
-	mdio_write(nic->netdev, nic->mii.phy_id, led_reg, nic->leds);
-	mod_timer(&nic->blink_timer, jiffies + HZ / 4);
-}
 
 static int e100_get_settings(struct net_device *netdev, struct ethtool_cmd *cmd)
 {
@@ -2600,19 +2574,38 @@ static void e100_diag_test(struct net_device *netdev,
 	msleep_interruptible(4 * 1000);
 }
 
-static int e100_phys_id(struct net_device *netdev, u32 data)
+static int e100_set_phys_id(struct net_device *netdev,
+			    enum ethtool_phys_id_state state)
 {
 	struct nic *nic = netdev_priv(netdev);
+	enum led_state {
+		led_on     = 0x01,
+		led_off    = 0x04,
+		led_on_559 = 0x05,
+		led_on_557 = 0x07,
+	};
 	u16 led_reg = (nic->phy == phy_82552_v) ? E100_82552_LED_OVERRIDE :
-	              MII_LED_CONTROL;
+		MII_LED_CONTROL;
+	u16 leds = 0;
+
+	switch (state) {
+	case ETHTOOL_ID_ACTIVE:
+		return 2;
 
-	if (!data || data > (u32)(MAX_SCHEDULE_TIMEOUT / HZ))
-		data = (u32)(MAX_SCHEDULE_TIMEOUT / HZ);
-	mod_timer(&nic->blink_timer, jiffies);
-	msleep_interruptible(data * 1000);
-	del_timer_sync(&nic->blink_timer);
-	mdio_write(netdev, nic->mii.phy_id, led_reg, 0);
+	case ETHTOOL_ID_ON:
+		leds = (nic->phy == phy_82552_v) ? E100_82552_LED_ON :
+		       (nic->mac < mac_82559_D101M) ? led_on_557 : led_on_559;
+		break;
+
+	case ETHTOOL_ID_OFF:
+		leds = (nic->phy == phy_82552_v) ? E100_82552_LED_OFF : led_off;
+		break;
+
+	case ETHTOOL_ID_INACTIVE:
+		break;
+	}
 
+	mdio_write(netdev, nic->mii.phy_id, led_reg, leds);
 	return 0;
 }
 
@@ -2693,7 +2686,7 @@ static const struct ethtool_ops e100_ethtool_ops = {
 	.set_ringparam		= e100_set_ringparam,
 	.self_test		= e100_diag_test,
 	.get_strings		= e100_get_strings,
-	.phys_id		= e100_phys_id,
+	.set_phys_id		= e100_set_phys_id,
 	.get_ethtool_stats	= e100_get_ethtool_stats,
 	.get_sset_count		= e100_get_sset_count,
 };
@@ -2834,9 +2827,6 @@ static int __devinit e100_probe(struct pci_dev *pdev,
 	init_timer(&nic->watchdog);
 	nic->watchdog.function = e100_watchdog;
 	nic->watchdog.data = (unsigned long)nic;
-	init_timer(&nic->blink_timer);
-	nic->blink_timer.function = e100_blink_led;
-	nic->blink_timer.data = (unsigned long)nic;
 
 	INIT_WORK(&nic->tx_timeout_task, e100_tx_timeout_task);
 
-- 
1.7.4.4


^ permalink raw reply related

* [net-next-2.6 0/5][pull request] Intel Wired LAN Driver Update
From: Jeff Kirsher @ 2011-05-07 10:25 UTC (permalink / raw)
  To: davem; +Cc: Jeff Kirsher, netdev, gospo, bphilips

The following series contains updates to e100, e1000, igb and ixgbe.

Sorry for the delay on the e100/e1000/igb convert to set_phys_id patches,
it was due to me falling ill and not completing the patches in a timely
manner.

The following are changes since commit 706527280ec38fcdcd0466f10b607105fd23801b:
  ipv4: Initialize cork->opt using NULL not 0
and are available in the git repository at:
  master.kernel.org:/pub/scm/linux/kernel/git/jkirsher/net-next-2.6 master

Emil Tantilov (2):
  ixgbe: add rxhash support
  ixgbe: add ethtool counters for OS2BMC

Jeff Kirsher (3):
  e100: implemenet set_phys_id
  e1000: convert to set_phys_id
  igb: convert to ethtool set_phys_id

 drivers/net/e100.c                |   66 +++++++++++++++---------------------
 drivers/net/e1000/e1000.h         |    3 --
 drivers/net/e1000/e1000_ethtool.c |   50 +++++++++-------------------
 drivers/net/igb/igb_ethtool.c     |   37 ++++++++++----------
 drivers/net/ixgbe/ixgbe_ethtool.c |   11 +++++-
 drivers/net/ixgbe/ixgbe_main.c    |   21 ++++++++++-
 drivers/net/ixgbe/ixgbe_type.h    |    8 ++++
 7 files changed, 100 insertions(+), 96 deletions(-)

-- 
1.7.4.4


^ permalink raw reply

* Re: [Bugme-new] [Bug 34322] New: No ECN marking in IPv6
From: Steinar H. Gunderson @ 2011-05-07  9:59 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, Andrew Morton, netdev, bugzilla-daemon,
	bugme-daemon, YOSHIFUJI Hideaki
In-Reply-To: <1304761486.2821.945.camel@edumazet-laptop>

On Sat, May 07, 2011 at 11:44:46AM +0200, Eric Dumazet wrote:
> I cooked for you the official patch and made sure it worked with a RED
> ECN setup, and one ipv6 tcp xmit.

Great, thanks :-) This looks good to me.

/* Steinar */
-- 
Homepage: http://www.sesse.net/

^ permalink raw reply

* Re: [Bugme-new] [Bug 34322] New: No ECN marking in IPv6
From: Eric Dumazet @ 2011-05-07  9:44 UTC (permalink / raw)
  To: Steinar H. Gunderson, David Miller
  Cc: Andrew Morton, netdev, bugzilla-daemon, bugme-daemon,
	YOSHIFUJI Hideaki
In-Reply-To: <20110506171249.GA29942@uio.no>

From: Steinar H. Gunderson <sgunderson@bigfoot.com>

Le vendredi 06 mai 2011 à 19:12 +0200, Steinar H. Gunderson a écrit :

> Sure, but is really checking against NULL the right way of checking for IPv6
> sockets? I'd imagined I should have checked address family or something
> instead...
> 

It should be fine.

I cooked for you the official patch and made sure it worked with a RED
ECN setup, and one ipv6 tcp xmit.

# tc -s -d qdisc show dev eth1
...
qdisc red 11: parent 1:11 limit 120Kb min 8Kb max 80Kb ecn ewma 2 Plog 21 Scell_log 11
 Sent 114694826 bytes 76446 pkt (dropped 15, overlimits 485 requeues 0) 
 rate 12126Kbit 1011pps backlog 0b 0p requeues 0 
  marked 470 early 15 pdrop 0 other 0


Thanks again !

[PATCH] ipv6: restore correct ECN handling on TCP xmit

Since commit e9df2e8fd8fbc9 (Use appropriate sock tclass setting for
routing lookup) we lost ability to properly add ECN codemarks to ipv6
TCP frames.

It seems like TCP_ECN_send() calls INET_ECN_xmit(), which only sets the
ECN bit in the IPv4 ToS field (inet_sk(sk)->tos), but after the patch,
what's checked is inet6_sk(sk)->tclass, which is a completely different
field.

Close bug https://bugzilla.kernel.org/show_bug.cgi?id=34322

[Eric Dumazet] : added the INET_ECN_dontxmit() fix and replace macros
by inline functions for clarity.

Signed-off-by: Steinar H. Gunderson <sgunderson@bigfoot.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/net/inet_ecn.h |   16 +++++++++++++---
 1 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h
index 88bdd01..2fa8d13 100644
--- a/include/net/inet_ecn.h
+++ b/include/net/inet_ecn.h
@@ -38,9 +38,19 @@ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner)
 	return outer;
 }
 
-#define	INET_ECN_xmit(sk) do { inet_sk(sk)->tos |= INET_ECN_ECT_0; } while (0)
-#define	INET_ECN_dontxmit(sk) \
-	do { inet_sk(sk)->tos &= ~INET_ECN_MASK; } while (0)
+static inline void INET_ECN_xmit(struct sock *sk)
+{
+	inet_sk(sk)->tos |= INET_ECN_ECT_0;
+	if (inet6_sk(sk) != NULL)
+		inet6_sk(sk)->tclass |= INET_ECN_ECT_0;
+}
+
+static inline void INET_ECN_dontxmit(struct sock *sk)
+{
+	inet_sk(sk)->tos &= ~INET_ECN_MASK;
+	if (inet6_sk(sk) != NULL)
+		inet6_sk(sk)->tclass &= ~INET_ECN_MASK;
+}
 
 #define IP6_ECN_flow_init(label) do {		\
       (label) &= ~htonl(INET_ECN_MASK << 20);	\



^ permalink raw reply related

* Re: [PATCH 7/7] ns: Wire up the setns system call
From: Geert Uytterhoeven @ 2011-05-07  8:27 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-arch, linux-kernel, netdev, linux-fsdevel, jamal,
	Daniel Lezcano, Linux Containers, Renato Westphal
In-Reply-To: <1304735101-1824-7-git-send-email-ebiederm@xmission.com>

On Sat, May 7, 2011 at 04:25, Eric W. Biederman <ebiederm@xmission.com> wrote:
>  arch/m68k/include/asm/unistd.h         |    3 ++-
>  arch/m68k/kernel/syscalltable.S        |    1 +

As the unified syscalltable for m68k/m68knommu is not yet in mainline
(planned for 2.6.40), you should also add it to arch/m68k/kernel/entry_mm.S.

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH] hamachi: Delete TX checksumming code commented out since 1999
From: David Miller @ 2011-05-07  8:02 UTC (permalink / raw)
  To: mirqus; +Cc: netdev
In-Reply-To: <BANLkTi=ZeY6X1XOdK2UXsODNBqzbbG+4KQ@mail.gmail.com>

From: Michał Mirosław <mirqus@gmail.com>
Date: Sat, 7 May 2011 09:59:28 +0200

> This also disables RX checksumming. You wanted to leave '#define
> RX_CHECKSUM' in place?

Oops, I'll fix that, thanks.

^ permalink raw reply

* Re: [PATCH 2/7] ns: Introduce the setns syscall
From: Rémi Denis-Courmont @ 2011-05-07  8:01 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: linux-arch-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, Linux Containers,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1304735101-1824-2-git-send-email-ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>

Le samedi 7 mai 2011 05:24:56 Eric W. Biederman, vous avez écrit :
> Pieces of this puzzle can also be solved by instead of
> coming up with a general purpose system call coming up
> with targed system calls perhaps socketat that solve
> a subset of the larger problem.  Overall that appears
> to be more work for less reward.

socketat() is still required for multithreaded namespace-aware userspace, I 
believe.

-- 
Rémi Denis-Courmont
http://www.remlab.info/
http://fi.linkedin.com/in/remidenis

^ permalink raw reply

* Re: [PATCH] hamachi: Delete TX checksumming code commented out since 1999
From: Michał Mirosław @ 2011-05-07  7:59 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110506.115945.104066424.davem@davemloft.net>

2011/5/6 David Miller <davem@davemloft.net>:
>
> TX checksumming support has been ifdef commented out of this driver
> for more than 10 years, and it makes references to aspects of the IPv4
> stack from back then as well.
>
> If someone has one of these rare cards and wants to properly resurrect
> TX checksumming support, they can still get at this code in the
> version control history.
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>
> I stumbled over this cruft while auditing ip_queue_xmit() users.
>
>  drivers/net/hamachi.c |   79 -------------------------------------------------
>  1 files changed, 0 insertions(+), 79 deletions(-)
>
> diff --git a/drivers/net/hamachi.c b/drivers/net/hamachi.c
> index 80d25ed..f5fba73 100644
> --- a/drivers/net/hamachi.c
> +++ b/drivers/net/hamachi.c
> @@ -132,14 +132,8 @@ static int tx_params[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1};
>  /*
>  * RX_CHECKSUM turns on card-generated receive checksum generation for
>  *   TCP and UDP packets.  Otherwise the upper layers do the calculation.
> - * TX_CHECKSUM won't do anything too useful, even if it works.  There's no
> - *   easy mechanism by which to tell the TCP/UDP stack that it need not
> - *   generate checksums for this device.  But if somebody can find a way
> - *   to get that to work, most of the card work is in here already.
>  * 3/10/1999 Pete Wyckoff <wyckoff@ca.sandia.gov>
>  */
> -#undef  TX_CHECKSUM
> -#define RX_CHECKSUM

This also disables RX checksumming. You wanted to leave '#define
RX_CHECKSUM' in place?

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: [PATCHv5 2/2] tg3: Allow ethtool to enable/disable loopback.
From: Michał Mirosław @ 2011-05-07  7:43 UTC (permalink / raw)
  To: Mahesh Bandewar
  Cc: Matt Carlson, David Miller, netdev, Michael Chan, Tom Herbert
In-Reply-To: <1304749117-1989-1-git-send-email-maheshb@google.com>

On Fri, May 06, 2011 at 11:18:37PM -0700, Mahesh Bandewar wrote:
> This patch adds tg3_set_features() to handle loopback mode. Currently the
> capability is added for the devices which support internal MAC loopback mode.
> So when enabled, it enables internal-MAC loopback.
[...]
> diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
> index 7c7c9a8..b7270c2 100644
> --- a/drivers/net/tg3.c
> +++ b/drivers/net/tg3.c
[...]
> @@ -6319,6 +6356,24 @@ static u32 tg3_fix_features(struct net_device *dev, u32 features)
>  	return features;
>  }
>  
> +static int tg3_set_features(struct net_device *dev, u32 features)
> +{
> +	struct tg3 *tp = netdev_priv(dev);
> +	u32 changed = dev->features ^ features;
> +
> +	if (changed & NETIF_F_LOOPBACK) {
> +		if (tg3_flag(tp, LOOPBACK_ENABLED))
> +			tg3_flag_clear(tp, LOOPBACK_ENABLED);
> +		else
> +			tg3_flag_set(tp, LOOPBACK_ENABLED);
> +
> +		if (netif_running(dev))
> +			tg3_set_loopback(dev);
> +	}
> +
> +	return 0;
> +}
> +
>  static inline void tg3_set_mtu(struct net_device *dev, struct tg3 *tp,
>  			       int new_mtu)
>  {
> @@ -9485,6 +9540,13 @@ static int tg3_open(struct net_device *dev)
>  
>  	netif_tx_start_all_queues(dev);
>  
> +	/*
> +	 * Reset loopback feature if it was turned on while the device was down
> +	 * make sure that it's installed properly now.
> +	 */
> +	if (tg3_flag(tp, LOOPBACK_ENABLED))
> +		tg3_set_loopback(dev);
> +
>  	return 0;
>  
>  err_out3:
[...]

So, you've just implemented what I said about enabling loopback at the end
of tg3_open(), but you also added (redundant) flag that mirrors
dev->features & NETIF_F_LOOPBACK. Why?

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: [PATCH] bonding: convert to ndo_fix_features
From: Michał Mirosław @ 2011-05-07  7:37 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: netdev, Andy Gospodarek
In-Reply-To: <5525.1304705884@death>

On Fri, May 06, 2011 at 11:18:04AM -0700, Jay Vosburgh wrote:
> Michał Mirosław <mirq-linux@rere.qmqm.pl> wrote:
> >This should also fix updating of vlan_features and propagating changes to
> >VLAN devices on the bond.
> >
> >Side effect: it allows user to force-disable some offloads on the bond
> >interface.
> >
> >Note: NETIF_F_VLAN_CHALLENGED is managed by bond_fix_features() now.
> >
> >BTW, What are the problems in creating VLAN devices on an empty bond
> >(as stated in one of bond_setup() comments)?
> 	If there are no slaves, then the bond does not have a MAC
> address assigned (because it gets its initial MAC from the first slave).
> It's therefore impossible to pass a MAC address up to the VLAN
> interface.
> 
> 	So the limitation is that the bond must have at least one slave
> before a VLAN may be configured above it.

Hmm. That might be worked around by generating a random MAC then. This would
allow the user to first set a new MAC, create VLANs and then add slaves
when they show up.

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: [PATCH 0/4] [RFC] virtio-net: Improve small packet performance
From: Krishna Kumar2 @ 2011-05-07  7:15 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: davem, eric.dumazet, kvm, netdev, rusty
In-Reply-To: <20110505153413.GC1915@redhat.com>

"Michael S. Tsirkin" <mst@redhat.com> wrote on 05/05/2011 09:04:13 PM:

> > I haven't tuned the threshhold, it is left it at 3/4. I ran
> > the new qemu/vhost/guest, and the results for 1K, 2K and 16K
> > are below. Note this is a different kernel version from my
> > earlier test results. So, f.e., BW1 represents 2.6.39-rc2,
> > the original kernel; while BW2 represents 2.6.37-rc5 (MST's
> > kernel).
>
> Weird. My kernel is actually 2.6.39-rc2. So which is which?

I cloned git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git

# git branch -a
  vhost
* vhost-net-next-event-idx-v1
  remotes/origin/HEAD -> origin/vhost
  remotes/origin/for-linus
  remotes/origin/master
  remotes/origin/net-2.6
  remotes/origin/vhost
  remotes/origin/vhost-broken
  remotes/origin/vhost-devel
  remotes/origin/vhost-mrg-rxbuf
  remotes/origin/vhost-net
  remotes/origin/vhost-net-next
  remotes/origin/vhost-net-next-event-idx-v1
  remotes/origin/vhost-net-next-rebased
  remotes/origin/virtio-layout-aligned
  remotes/origin/virtio-layout-minimal
  remotes/origin/virtio-layout-original
  remotes/origin/virtio-layout-padded
  remotes/origin/virtio-publish-used

# git checkout vhost-net-next-event-idx-v1
Already on 'vhost-net-next-event-idx-v1'

# head -4 Makefile
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 37
EXTRAVERSION = -rc5

I am not sure what I am missing.

thanks,

- KK


^ permalink raw reply

* Re: [PATCH 0/7] Network namespace manipulation with file descriptors
From: Alex Bligh @ 2011-05-07  6:58 UTC (permalink / raw)
  To: Eric W. Biederman, linux-arch-u79uwXL29TY76Z2rM5mHXA
  Cc: Linux Containers, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <m1tyd7p7tq.fsf-+imSwln9KH6u2/kzUuoCbdi2O/JbrIOy@public.gmane.org>

--On 6 May 2011 19:23:29 -0700 "Eric W. Biederman" <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org> 
wrote:

> This patchset addresses the user interface limitations by introducing
> proc files you can open to get file descriptors that keep alive and
> refer to your a tasks namespaces.  Those file descriptors can be passed
> to the new setns system call or the NET_NS_FD argument in netlink
> messages.

This is conceptually very interesting. I am one of those people you
describe with a routing daemon (or more accurately a wrapper around
existing daemons) that does the unshare() and keeps the network
alive. It also has a control socket etc.

You say:
> This addresses three specific problems that can make namespaces hard to
> work with.
> - Namespaces require a dedicated process to pin them in memory.
> - It is not possible to use a namespace unless you are the child
>   of the original creator.
> - Namespaces don't have names that userspace can use to talk about
>   them.

At least for me, the best way to solve the second blob would be to
be able to unshare to an existing namespace. That way I would be able
to run a daemon (without modification) in a pre-existing namespace.
The user interface here would just be an option to 'unshare'. I
don't think your patch allows this, does it? Right now I'm effectively
doing that by causing the pid concerned to fork() and do my bidding,
but that is far from perfect.

As a secondary issue, even without your patch, it would be really
useful to be able to read from userspace the current network namespace.
(i.e. the pid concerned, or 1 if not unshared). I would like to
simply modify a routing daemon's init script so it doesn't start
if in the host, e.g. at the top:
  [ `cat /proc/.../networknamespace` -eq 1 ] && exit 0

-- 
Alex Bligh

^ permalink raw reply

* [PATCHv5 2/2] tg3: Allow ethtool to enable/disable loopback.
From: Mahesh Bandewar @ 2011-05-07  6:18 UTC (permalink / raw)
  To: Matt Carlson, David Miller
  Cc: netdev, Michael Chan, Tom Herbert, Michał Mirosław,
	Mahesh Bandewar
In-Reply-To: <1304559247-16111-1-git-send-email-maheshb@google.com>

This patch adds tg3_set_features() to handle loopback mode. Currently the
capability is added for the devices which support internal MAC loopback mode.
So when enabled, it enables internal-MAC loopback.

Signed-off-by: Mahesh Bandewar <maheshb@google.com>
---
Changes since v4:
 (a) Added TG3_FLAG_LOOPBACK_ENABLED flag to keep loopback state in driver's 
     private data-structure.
 (b) Corrected the loopback implementation by using tp->mac_mode.
 (c) Forced Link up when in loopback mode. 

Changes since v3:
 (a) Corrected the condition (|| => &&) where loopback capability is added.
 (b) set_features() always returns 0.
 (c) Clear the loopback bit in ndo_open callback to avoid discrepancy.

Changes since v2:
 Implemented Joe Perches's style change.

Changes since v1:
 Implemented Matt Carlson's suggestions.
  (a) Enable this capability on the devices which are capable of MAC-loopback
      mode.
  (b) check if the device is running before making changes.
  (c) check bits before making changes.


 drivers/net/tg3.c |   78 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/net/tg3.h |    1 +
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index 7c7c9a8..b7270c2 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -3373,8 +3373,8 @@ relink:
 		tg3_phy_copper_begin(tp);
 
 		tg3_readphy(tp, MII_BMSR, &bmsr);
-		if (!tg3_readphy(tp, MII_BMSR, &bmsr) &&
-		    (bmsr & BMSR_LSTATUS))
+		if ((!tg3_readphy(tp, MII_BMSR, &bmsr) && (bmsr & BMSR_LSTATUS)) ||
+		    (tp->mac_mode & MAC_MODE_PORT_INT_LPBACK))
 			current_link_up = 1;
 	}
 
@@ -6309,6 +6309,43 @@ dma_error:
 	return NETDEV_TX_OK;
 }
 
+static void tg3_set_loopback(struct net_device *dev)
+{
+	struct tg3 *tp = netdev_priv(dev);
+
+	if (tg3_flag(tp, LOOPBACK_ENABLED)) {
+		if (tp->mac_mode & MAC_MODE_PORT_INT_LPBACK)
+			return;
+
+		/*
+		 * Clear MAC_MODE_HALF_DUPLEX or you won't get packets back in
+		 * loopback mode if Half-Duplex mode was negotiated earlier.
+		 */
+		tp->mac_mode &= ~MAC_MODE_HALF_DUPLEX;
+
+		/* Enable internal MAC loopback mode */
+		tp->mac_mode |= MAC_MODE_PORT_INT_LPBACK;
+		spin_lock_bh(&tp->lock);
+		tw32(MAC_MODE, tp->mac_mode);
+		netif_carrier_on(tp->dev);
+		spin_unlock_bh(&tp->lock);
+		netdev_info(dev, "Internal MAC loopback mode enabled.\n");
+	} else {
+		if (!(tp->mac_mode & MAC_MODE_PORT_INT_LPBACK))
+			return;
+
+		/* Disable internal MAC loopback mode */
+		tp->mac_mode &= ~MAC_MODE_PORT_INT_LPBACK;
+		spin_lock_bh(&tp->lock);
+		tw32(MAC_MODE, tp->mac_mode);
+		/* Force link status check */
+		tg3_setup_phy(tp, 1);
+		spin_unlock_bh(&tp->lock);
+		netdev_info(dev, "Internal MAC loopback mode disabled.\n");
+	}
+
+}
+
 static u32 tg3_fix_features(struct net_device *dev, u32 features)
 {
 	struct tg3 *tp = netdev_priv(dev);
@@ -6319,6 +6356,24 @@ static u32 tg3_fix_features(struct net_device *dev, u32 features)
 	return features;
 }
 
+static int tg3_set_features(struct net_device *dev, u32 features)
+{
+	struct tg3 *tp = netdev_priv(dev);
+	u32 changed = dev->features ^ features;
+
+	if (changed & NETIF_F_LOOPBACK) {
+		if (tg3_flag(tp, LOOPBACK_ENABLED))
+			tg3_flag_clear(tp, LOOPBACK_ENABLED);
+		else
+			tg3_flag_set(tp, LOOPBACK_ENABLED);
+
+		if (netif_running(dev))
+			tg3_set_loopback(dev);
+	}
+
+	return 0;
+}
+
 static inline void tg3_set_mtu(struct net_device *dev, struct tg3 *tp,
 			       int new_mtu)
 {
@@ -9485,6 +9540,13 @@ static int tg3_open(struct net_device *dev)
 
 	netif_tx_start_all_queues(dev);
 
+	/*
+	 * Reset loopback feature if it was turned on while the device was down
+	 * make sure that it's installed properly now.
+	 */
+	if (tg3_flag(tp, LOOPBACK_ENABLED))
+		tg3_set_loopback(dev);
+
 	return 0;
 
 err_out3:
@@ -15029,6 +15091,7 @@ static const struct net_device_ops tg3_netdev_ops = {
 	.ndo_tx_timeout		= tg3_tx_timeout,
 	.ndo_change_mtu		= tg3_change_mtu,
 	.ndo_fix_features	= tg3_fix_features,
+	.ndo_set_features	= tg3_set_features,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= tg3_poll_controller,
 #endif
@@ -15045,6 +15108,7 @@ static const struct net_device_ops tg3_netdev_ops_dma_bug = {
 	.ndo_do_ioctl		= tg3_ioctl,
 	.ndo_tx_timeout		= tg3_tx_timeout,
 	.ndo_change_mtu		= tg3_change_mtu,
+	.ndo_set_features	= tg3_set_features,
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	.ndo_poll_controller	= tg3_poll_controller,
 #endif
@@ -15242,6 +15306,16 @@ static int __devinit tg3_init_one(struct pci_dev *pdev,
 	dev->features |= hw_features;
 	dev->vlan_features |= hw_features;
 
+	/*
+	 * Add loopback capability only for a subset of devices that support
+	 * MAC-LOOPBACK. Eventually this need to be enhanced to allow INT-PHY
+	 * loopback for the remaining devices.
+	 */
+	if (GET_ASIC_REV(tp->pci_chip_rev_id) != ASIC_REV_5780 &&
+	    !tg3_flag(tp, CPMU_PRESENT))
+		/* Add the loopback capability */
+		dev->hw_features |= NETIF_F_LOOPBACK;
+
 	if (tp->pci_chip_rev_id == CHIPREV_ID_5705_A1 &&
 	    !tg3_flag(tp, TSO_CAPABLE) &&
 	    !(tr32(TG3PCI_PCISTATE) & PCISTATE_BUS_SPEED_HIGH)) {
diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h
index ce010cd3..d087ef0 100644
--- a/drivers/net/tg3.h
+++ b/drivers/net/tg3.h
@@ -2891,6 +2891,7 @@ enum TG3_FLAGS {
 	TG3_FLAG_57765_PLUS,
 	TG3_FLAG_APE_HAS_NCSI,
 	TG3_FLAG_5717_PLUS,
+	TG3_FLAG_LOOPBACK_ENABLED,
 
 	/* Add new flags before this comment and TG3_FLAG_NUMBER_OF_FLAGS */
 	TG3_FLAG_NUMBER_OF_FLAGS,	/* Last entry in enum TG3_FLAGS */
-- 
1.7.3.1


^ permalink raw reply related

* RE: [RFC v2] virtio: add virtio-over-PCI driver
From: Kushwaha Prabhakar-B32579 @ 2011-05-07  5:59 UTC (permalink / raw)
  To: Ira W. Snyder
  Cc: Zang Roy-R61911, Gala Kumar-B11780, Gupta Maneesh-B18878,
	Aggrwal Poonam-B10812, Kalra Ashish-B00888,
	linux-kernel@vger.kernel.org, linuxppc-dev@lists.ozlabs.org,
	netdev@vger.kernel.org, linuxppc-dev@ozlabs.org
In-Reply-To: <20110506160627.GB14069@ovro.caltech.edu>

Thanks Ira for your kind reply.
I will look for the mentioned pointers :) 


Prabhakar 

> -----Original Message-----
> From: Ira W. Snyder [mailto:iws@ovro.caltech.edu]
> Sent: Friday, May 06, 2011 9:36 PM
> To: Kushwaha Prabhakar-B32579
> Cc: Zang Roy-R61911; Gala Kumar-B11780; Gupta Maneesh-B18878; Aggrwal
> Poonam-B10812; Kalra Ashish-B00888; linux-kernel@vger.kernel.org;
> linuxppc-dev@ozlabs.org; netdev@vger.kernel.org
> Subject: Re: [RFC v2] virtio: add virtio-over-PCI driver
> 
> On Fri, May 06, 2011 at 12:00:34PM +0000, Kushwaha Prabhakar-B32579
> wrote:
> > Hi,
> >
> > I want to use this patch as base patch for "FSL 85xx platform" to
> support PCIe Agent.
> > The work looks to be little old now. So wanted to understand if any
> development has happened further on it.
> >
> > In case no, I would take this work forward for PCIe Agent.
> >
> > Any help/suggestions are most appreciated in this regard.
> >
> 
> Hi Prabhakar,
> 
> I use PCI agent mode on an mpc8349emds board. All of the important setup
> is done very early in the boot process, by U-Boot. Search the U-Boot
> source for CONFIG_PCISLAVE. I hunch that the setup needed for 85xx boards
> are similar.
> 
> This virtio-over-PCI work is now very old. It was intended to provide a
> communication mechanism between a PCI Master and many PCI Agents
> (slaves).
> Dave Miller (networking maintainer) suggested to use virtio for this so
> that many different devices could be used. Such as:
> - network interface
> - serial port (for serial console)
> 
> I am aware of other ongoing work in this area. Specifically, some ARM
> developers are working on a virtio API using their message registers.
> This work is much newer, and will be a much better starting place for
> you.
> 
> Search the virtualization mailing list for:
> "[PATCH 00/02] virtio: Virtio platform driver"
> 
> Here is a link to some of their code:
> http://www.spinics.net/lists/linux-sh/msg07188.html
> 
> I am currently using a custom driver to provide a network device on my
> PCI agents. Searching the mailing list archives for "PCINet", you will
> find early versions of the driver. I am happy to provide you a current
> copy. It does not use virtio at all, and is unlikely to be accepted into
> mainline Linux.
> 
> I am happy to provide any of my code if you think it would help you get
> started. Specifically, the current version of "PCINet" show how to use
> the DMA controller in order to get good network performance. I am also
> happy to help port code to 83xx, as well as test on 83xx. Please ask any
> questions you may have.
> 
> I have people ask about this code about once every two months. There is
> plenty of interest in a mainline Linux solution to this problem. :) I
> will be moving to 85xx someday, and I hope there is an accepted mainline
> solution by then.
> 
> I hope it helps,
> Ira
> 
> > -----Original Message-----
> > From: linux-kernel-owner@vger.kernel.org
> > [mailto:linux-kernel-owner@vger.kernel.org] On Behalf Of Ira Snyder
> > Sent: Friday, 27 February, 2009 3:19 AM
> > To: Arnd Bergmann
> > Cc: linux-kernel@vger.kernel.org; Rusty Russell; Jan-Bernd Themann;
> > linuxppc-dev@ozlabs.org; netdev@vger.kernel.org
> > Subject: Re: [RFC v2] virtio: add virtio-over-PCI driver
> >
> > On Thu, Feb 26, 2009 at 09:37:14PM +0100, Arnd Bergmann wrote:
> > > On Thursday 26 February 2009, Ira Snyder wrote:
> > > > On Thu, Feb 26, 2009 at 05:15:27PM +0100, Arnd Bergmann wrote:
> > > >
> > > > I think so too. I was just getting something working, and thought
> > > > it would be better to have it "out there" rather than be working
> > > > on it forever. I'll try to break things up as I have time.
> > >
> > > Ok, perfect!
> > >
> > > > For the "libraries", would you suggest breaking things into
> > > > seperate code files, and using EXPORT_SYMBOL_GPL()? I'm not very
> > > > familiar with doing that, I've mostly been writing code within the
> > > > existing device driver frameworks. Or do I need export symbol at
> all? I'm not sure...
> > >
> > > You have both options. When you list each file as a separate module
> > > in the Makefile, you use EXPORT_SYMBOL_GPL to mark functions that
> > > get called by dependent modules, but this will work only in one way.
> > >
> > > You can also link multiple files together into one module, although
> > > it is less common to link a single source file into multiple modules.
> > >
> >
> > Ok. I'm more familiar with the EXPORT_SYMBOL_GPL interface, so I'll do
> that. If we decide it sucks later, we'll change it.
> >
> > > > I always thought you were supposed to use packed for data
> > > > structures that are external to the system. I purposely designed
> > > > the structures so they wouldn't need padding.
> > >
> > > That would only make sense for structures that are explicitly
> > > unaligned, like a register layout using
> > >
> > > struct my_registers {
> > > 	__le16 first;
> > > 	__le32 second __attribute__((packed));
> > > 	__le16 third;
> > > };
> > >
> > > Even here, I'd recommend listing the individual members as packed
> > > rather than the entire struct. Obviously if you layout the members
> > > in a sane way, you don't need either.
> > >
> >
> > Ok. I'll drop the __attribute__((packed)) and make sure there aren't
> problems. I don't suspect any, though.
> >
> > > > I mostly don't need it. In fact, the only place I'm using
> > > > registers not specific to the messaging unit is in the probe
> > > > routine, where I setup the 1GB window into host memory and setting
> > > > up access to the guest memory on the PCI bus.
> > >
> > > You could add the registers you need for this to the "reg" property
> > > of your device, to be mapped with of_iomap.
> > >
> > > If the registers for setting up this window don't logically fit into
> > > the same device as the one you already use, the cleanest solution
> > > would be to have another device just for this and then make a
> > > function call into that driver to set up the window.
> > >
> >
> > The registers are part of the board control registers. They don't fit
> at all in the message unit. Doing this in the bootloader seems like a
> logical place, but that would require any testers to flash a new U-Boot
> image into their mpc8349emds boards.
> >
> > The first set of access is used to set up a 1GB region in the memory
> map that accesses the host's memory. Any reads/writes to addresses
> 0x80000000-0xc0000000 actually hit the host's memory.
> >
> > The last access sets up PCI BAR1 to hit the memory from
> dma_alloc_coherent(). The bootloader already sets up the window as 16K,
> it just doesn't point it anywhere. Maybe this /should/ go into the
> bootloader. Like above, it would require testers to flash a new U-Boot
> image into their mpc8349emds boards.
> >
> > > > Now, I wouldn't need to access these registers at all if the
> > > > bootloader could handle it. I just don't know if it is possible to
> > > > have Linux not use some memory that the bootloader allocated,
> > > > other than with the mem=XXX trick, which I'm sure wouldn't be
> acceptable.
> > > > I've just used regular RAM so this is portable to my custom board
> > > > (mpc8349emds based) and a regular mpc8349emds. I didn't want to
> > > > change anything board specific.
> > > >
> > > > I would love to have the bootloader allocate (or reserve somewhere
> > > > in the memory map) 16K of RAM, and not be required to allocate it
> > > > with dma_alloc_coherent(). It would save me plenty of headaches.
> > >
> > > I believe you can do that through the "memory" devices in the device
> > > tree, by leaving out a small part of the description of main memory,
> > > at putting it into the "reg" property of your own device.
> > >
> >
> > I'll explore this option. I didn't even know you could do this.  Is a
> driver that requires the trick acceptable for mainline inclusion? Just
> like setting up the 16K PCI window, this is very platform specific.
> >
> > This limits the guest driver to systems which are able to change
> Linux's view of their memory somehow. Maybe this isn't a problem.
> >
> > > > Code complexity only. Also, it was easier to write 80-char lines
> > > > with something like:
> > > >
> > > > vop_get_desc(vq, idx, &desc);
> > > > if (desc.flags & VOP_DESC_F_NEXT) {
> > > > 	/* do something */
> > > > }
> > > >
> > > > Instead of:
> > > > if (le16_to_cpu(vq->desc[idx].flags) & VOP_DESC_F_NEXT) {
> > > > 	/* do something */
> > > > }
> > > >
> > > > Plus, I didn't have to remember how many bits were in each field.
> > > > I just thought it made everything simpler to understand.
> Suggestions?
> > >
> > > hmm, in this particular case, you could change the definition of
> > > VOP_DESC_F_NEXT to
> > >
> > > #define VOP_DESC_F_NEXT cpu_to_le16(1)
> > >
> > > and then do the code as the even simpler (source and object code
> > > wise)
> > >
> > > if (vq->desc[idx].flags) & VOP_DESC_F_NEXT)
> > >
> > > I'm not sure if you can do something along these lines for the other
> > > cases as well though.
> > >
> >
> > That's a good idea. It wouldn't fix the addresses, lengths, and next
> fields, though. I'll make the change and see how bad it is, then report
> back. It may not be so bad after all.
> >
> > > > I used 3 so they would would align to 1024 byte boundaries within
> > > > a 4K page. Then the layout was 16K on the bus, each 4K page is a
> > > > single virtio-device, and each 1K block is a single virtqueue. The
> > > > first 1K is for virtio-device status and feature bits, etc.
> > > >
> > > > Packing them differently isn't a problem. It was just easier to
> > > > code because setting up a window with the correct size is so
> > > > platform specific.
> > >
> > > Ok. I guess the important question is what part of the code makes
> > > this decision. Ideally, the virtio-net glue would instantiate the
> > > device with the right number of queues.
> > >
> >
> > Yeah, virtio doesn't work that way.
> >
> > The virtio drivers just call find_vq() with a different index for each
> queue they want to use. You have no way of knowing how many queues each
> virtio driver will want, unless you go read their source code.
> >
> > virtio-net currently uses 3 queues, but we only support the first two.
> > The third is optional (for now...), and non-symmetric.
> >
> > Thanks again,
> > Ira
> > --
> > To unsubscribe from this list: send the line "unsubscribe
> > linux-kernel" in the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> >
> >
> >
> >



^ permalink raw reply

* Re: [PATCH 2/2] ipv4: Initialize on-stack cork more efficiently.
From: David Miller @ 2011-05-07  5:21 UTC (permalink / raw)
  To: eric.dumazet; +Cc: netdev, herbert
In-Reply-To: <1304745104.2821.590.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sat, 07 May 2011 07:11:44 +0200

> By the way, when I spotted this "struct inet_cork cork = {};" to be
> optimized, my idea was to add yet another case of fastpath to UDP send :
> 
> For small datagrams (most UDP uses : RTP, DNS...),
> perform the user->kernel copy before route lookup, so that we can
> perform an RCU route lookup. This would tremendously speedup UDP, since
> the refcount handling is our last hot spot (not counting qdisc if
> present)

Interesting idea.

This reminds me, remember about the input noref route lookup stuff
going away with the routing cache removal?  It turns out that when we
do my "routes embedded in fib nexthop" for input, the noref stuff can
be used. :)

^ permalink raw reply

* Re: [PATCH 2/2] ipv4: Initialize on-stack cork more efficiently.
From: Eric Dumazet @ 2011-05-07  5:11 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, herbert
In-Reply-To: <1304721317.2821.156.camel@edumazet-laptop>

Le samedi 07 mai 2011 à 00:35 +0200, Eric Dumazet a écrit :
> Le vendredi 06 mai 2011 à 15:26 -0700, David Miller a écrit :
> > ip_setup_cork() explicitly initializes every member of
> > inet_cork except flags, addr, and opt.  So we can simply
> > set those three members to zero instead of using a
> > memset() via an empty struct assignment.
> > 
> > Signed-off-by: David S. Miller <davem@davemloft.net>
> > ---
> >  net/ipv4/ip_output.c |    5 ++++-
> >  1 files changed, 4 insertions(+), 1 deletions(-)
> > 
> > diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
> > index eb0647a..5f5fe4f 100644
> > --- a/net/ipv4/ip_output.c
> > +++ b/net/ipv4/ip_output.c
> > @@ -1408,7 +1408,7 @@ struct sk_buff *ip_make_skb(struct sock *sk,
> >  			    struct ipcm_cookie *ipc, struct rtable **rtp,
> >  			    unsigned int flags)
> >  {
> > -	struct inet_cork cork = {};
> > +	struct inet_cork cork;
> >  	struct sk_buff_head queue;
> >  	int err;
> >  
> > @@ -1417,6 +1417,9 @@ struct sk_buff *ip_make_skb(struct sock *sk,
> >  
> >  	__skb_queue_head_init(&queue);
> >  
> > +	cork.flags = 0;
> > +	cork.addr = 0;
> > +	cork.opt = 0;
> >  	err = ip_setup_cork(sk, &cork, ipc, rtp);
> >  	if (err)
> >  		return ERR_PTR(err);
> 
> Very nice, thanks for finishing this stuff :)
> 
> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
> 

By the way, when I spotted this "struct inet_cork cork = {};" to be
optimized, my idea was to add yet another case of fastpath to UDP send :

For small datagrams (most UDP uses : RTP, DNS...),
perform the user->kernel copy before route lookup, so that we can
perform an RCU route lookup. This would tremendously speedup UDP, since
the refcount handling is our last hot spot (not counting qdisc if
present)

   PerfTop:   16142 irqs/sec  kernel:97.5%  exact:  0.0% [1000Hz cycles],  (all, 16 CPUs)
-----------------------------------------------------------------------------------------------------------

             samples  pcnt function                 DSO
             _______ _____ ________________________ ______________________

            16735.00 24.2% __ip_route_output_key    vmlinux               
             9706.00 14.1% dst_release              vmlinux               
             6754.00  9.8% __ip_make_skb            vmlinux               
             5737.00  8.3% udp_send_skb             vmlinux               
             5384.00  7.8% ip_finish_output         vmlinux               
             3578.00  5.2% udp_sendmsg              vmlinux               
             1435.00  2.1% copy_user_generic_string vmlinux               
             1358.00  2.0% ia32_sysenter_target     vmlinux               
             1095.00  1.6% __ip_append_data         vmlinux               
              832.00  1.2% kfree                    vmlinux               
              794.00  1.2% __memset                 vmlinux               
              677.00  1.0% fget_light               vmlinux               
              641.00  0.9% sock_wfree               vmlinux               
              637.00  0.9% dev_queue_xmit           vmlinux               



^ permalink raw reply

* [PATCH 3/7] ns proc: Add support for the network namespace.
From: Eric W. Biederman @ 2011-05-07  2:24 UTC (permalink / raw)
  To: linux-arch
  Cc: linux-kernel, netdev, linux-fsdevel, jamal, Daniel Lezcano,
	Linux Containers, Renato Westphal, Eric W. Biederman
In-Reply-To: <1304735101-1824-1-git-send-email-ebiederm@xmission.com>

Implementing file descriptors for the network namespace
is simple and straightforward.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/proc/namespaces.c     |    3 +++
 include/linux/proc_fs.h  |    1 +
 net/core/net_namespace.c |   31 +++++++++++++++++++++++++++++++
 3 files changed, 35 insertions(+), 0 deletions(-)

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 6ae9f07..dcbd483 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -16,6 +16,9 @@
 
 
 static const struct proc_ns_operations *ns_entries[] = {
+#ifdef CONFIG_NET_NS
+	&netns_operations,
+#endif
 };
 
 static const struct file_operations ns_file_operations = {
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index a6d2c6d..62126ec 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -265,6 +265,7 @@ struct proc_ns_operations {
 	void (*put)(void *ns);
 	int (*install)(struct nsproxy *nsproxy, void *ns);
 };
+extern const struct proc_ns_operations netns_operations;
 
 union proc_op {
 	int (*proc_get_link)(struct inode *, struct path *);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 3f86026..bf7707e 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -573,3 +573,34 @@ void unregister_pernet_device(struct pernet_operations *ops)
 	mutex_unlock(&net_mutex);
 }
 EXPORT_SYMBOL_GPL(unregister_pernet_device);
+
+#ifdef CONFIG_NET_NS
+static void *netns_get(struct task_struct *task)
+{
+	struct net *net;
+	rcu_read_lock();
+	net = get_net(task->nsproxy->net_ns);
+	rcu_read_unlock();
+	return net;
+}
+
+static void netns_put(void *ns)
+{
+	put_net(ns);
+}
+
+static int netns_install(struct nsproxy *nsproxy, void *ns)
+{
+	put_net(nsproxy->net_ns);
+	nsproxy->net_ns = get_net(ns);
+	return 0;
+}
+
+const struct proc_ns_operations netns_operations = {
+	.name		= "net",
+	.type		= CLONE_NEWNET,
+	.get		= netns_get,
+	.put		= netns_put,
+	.install	= netns_install,
+};
+#endif
-- 
1.6.5.2.143.g8cc62


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox