Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH] net: ibmveth: convert to hw_features
From: Michał Mirosław @ 2011-04-19 12:14 UTC (permalink / raw)
  To: netdev; +Cc: Santiago Leon

A minimal conversion.

ibmveth_set_csum_offload() can be folded into ibmveth_set_features()
and adapter->rx_csum removed - left for later cleanup.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/ibmveth.c |   96 ++++++++++++++-----------------------------------
 1 files changed, 27 insertions(+), 69 deletions(-)

diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index 5522d45..4855f1f 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -729,45 +729,24 @@ static void netdev_get_drvinfo(struct net_device *dev,
 		sizeof(info->version) - 1);
 }
 
-static void ibmveth_set_rx_csum_flags(struct net_device *dev, u32 data)
+static u32 ibmveth_fix_features(struct net_device *dev, u32 features)
 {
-	struct ibmveth_adapter *adapter = netdev_priv(dev);
+	/*
+	 * Since the ibmveth firmware interface does not have the
+	 * concept of separate tx/rx checksum offload enable, if rx
+	 * checksum is disabled we also have to disable tx checksum
+	 * offload. Once we disable rx checksum offload, we are no
+	 * longer allowed to send tx buffers that are not properly
+	 * checksummed.
+	 */
 
-	if (data) {
-		adapter->rx_csum = 1;
-	} else {
-		/*
-		 * Since the ibmveth firmware interface does not have the
-		 * concept of separate tx/rx checksum offload enable, if rx
-		 * checksum is disabled we also have to disable tx checksum
-		 * offload. Once we disable rx checksum offload, we are no
-		 * longer allowed to send tx buffers that are not properly
-		 * checksummed.
-		 */
-		adapter->rx_csum = 0;
-		dev->features &= ~NETIF_F_IP_CSUM;
-		dev->features &= ~NETIF_F_IPV6_CSUM;
-	}
-}
-
-static void ibmveth_set_tx_csum_flags(struct net_device *dev, u32 data)
-{
-	struct ibmveth_adapter *adapter = netdev_priv(dev);
+	if (!(features & NETIF_F_RXCSUM))
+		features &= ~NETIF_F_ALL_CSUM;
 
-	if (data) {
-		if (adapter->fw_ipv4_csum_support)
-			dev->features |= NETIF_F_IP_CSUM;
-		if (adapter->fw_ipv6_csum_support)
-			dev->features |= NETIF_F_IPV6_CSUM;
-		adapter->rx_csum = 1;
-	} else {
-		dev->features &= ~NETIF_F_IP_CSUM;
-		dev->features &= ~NETIF_F_IPV6_CSUM;
-	}
+	return features;
 }
 
-static int ibmveth_set_csum_offload(struct net_device *dev, u32 data,
-				    void (*done) (struct net_device *, u32))
+static int ibmveth_set_csum_offload(struct net_device *dev, u32 data)
 {
 	struct ibmveth_adapter *adapter = netdev_priv(dev);
 	unsigned long set_attr, clr_attr, ret_attr;
@@ -827,8 +806,8 @@ static int ibmveth_set_csum_offload(struct net_device *dev, u32 data,
 		} else
 			adapter->fw_ipv6_csum_support = data;
 
-		if (ret == H_SUCCESS || ret6 == H_SUCCESS)
-			done(dev, data);
+		if (ret != H_SUCCESS || ret6 != H_SUCCESS)
+			adapter->rx_csum = data;
 		else
 			rc1 = -EIO;
 	} else {
@@ -844,41 +823,22 @@ static int ibmveth_set_csum_offload(struct net_device *dev, u32 data,
 	return rc1 ? rc1 : rc2;
 }
 
-static int ibmveth_set_rx_csum(struct net_device *dev, u32 data)
+static int ibmveth_set_features(struct net_device *dev, u32 features)
 {
 	struct ibmveth_adapter *adapter = netdev_priv(dev);
+	int rx_csum = !!(features & NETIF_F_RXCSUM);
+	int rc;
 
-	if ((data && adapter->rx_csum) || (!data && !adapter->rx_csum))
+	if (rx_csum == adapter->rx_csum)
 		return 0;
 
-	return ibmveth_set_csum_offload(dev, data, ibmveth_set_rx_csum_flags);
-}
-
-static int ibmveth_set_tx_csum(struct net_device *dev, u32 data)
-{
-	struct ibmveth_adapter *adapter = netdev_priv(dev);
-	int rc = 0;
-
-	if (data && (dev->features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)))
-		return 0;
-	if (!data && !(dev->features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)))
-		return 0;
-
-	if (data && !adapter->rx_csum)
-		rc = ibmveth_set_csum_offload(dev, data,
-					      ibmveth_set_tx_csum_flags);
-	else
-		ibmveth_set_tx_csum_flags(dev, data);
+	rc = ibmveth_set_csum_offload(dev, rx_csum);
+	if (rc && !adapter->rx_csum)
+		dev->features = features & ~(NETIF_F_ALL_CSUM | NETIF_F_RXCSUM);
 
 	return rc;
 }
 
-static u32 ibmveth_get_rx_csum(struct net_device *dev)
-{
-	struct ibmveth_adapter *adapter = netdev_priv(dev);
-	return adapter->rx_csum;
-}
-
 static void ibmveth_get_strings(struct net_device *dev, u32 stringset, u8 *data)
 {
 	int i;
@@ -914,13 +874,9 @@ static const struct ethtool_ops netdev_ethtool_ops = {
 	.get_drvinfo		= netdev_get_drvinfo,
 	.get_settings		= netdev_get_settings,
 	.get_link		= ethtool_op_get_link,
-	.set_tx_csum		= ibmveth_set_tx_csum,
-	.get_rx_csum		= ibmveth_get_rx_csum,
-	.set_rx_csum		= ibmveth_set_rx_csum,
 	.get_strings		= ibmveth_get_strings,
 	.get_sset_count		= ibmveth_get_sset_count,
 	.get_ethtool_stats	= ibmveth_get_ethtool_stats,
-	.set_sg			= ethtool_op_set_sg,
 };
 
 static int ibmveth_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
@@ -1345,6 +1301,8 @@ static const struct net_device_ops ibmveth_netdev_ops = {
 	.ndo_set_multicast_list	= ibmveth_set_multicast_list,
 	.ndo_do_ioctl		= ibmveth_ioctl,
 	.ndo_change_mtu		= ibmveth_change_mtu,
+	.ndo_fix_features	= ibmveth_fix_features,
+	.ndo_set_features	= ibmveth_set_features,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_set_mac_address	= eth_mac_addr,
 #ifdef CONFIG_NET_POLL_CONTROLLER
@@ -1412,7 +1370,9 @@ static int __devinit ibmveth_probe(struct vio_dev *dev,
 	netdev->netdev_ops = &ibmveth_netdev_ops;
 	netdev->ethtool_ops = &netdev_ethtool_ops;
 	SET_NETDEV_DEV(netdev, &dev->dev);
-	netdev->features |= NETIF_F_SG;
+	netdev->hw_features = NETIF_F_SG | NETIF_F_RXCSUM |
+		NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+	netdev->features |= netdev->hw_features;
 
 	memcpy(netdev->dev_addr, &adapter->mac_addr, netdev->addr_len);
 
@@ -1437,8 +1397,6 @@ static int __devinit ibmveth_probe(struct vio_dev *dev,
 
 	netdev_dbg(netdev, "registering netdev...\n");
 
-	ibmveth_set_csum_offload(netdev, 1, ibmveth_set_tx_csum_flags);
-
 	rc = register_netdev(netdev);
 
 	if (rc) {
-- 
1.7.2.5


^ permalink raw reply related

* Re: Add NAPI support to ll_temac driver
From: Ben Hutchings @ 2011-04-19 12:25 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: monstr, netdev
In-Reply-To: <1303209787.3480.9.camel@edumazet-laptop>

On Tue, 2011-04-19 at 12:43 +0200, Eric Dumazet wrote:
[...]
> One possible way to get better performance is to change driver to
> allocate skbs only right before calling netif_rx(), so that you dont
> have to access cold sk_buff data twice (once when allocating skb and put
> it in ring buffer, a second time when receiving frame)
>
> drivers/net/niu.c is a good example for this (NAPI + netdev_alloc_skb()
> just in time + pull in skbhead only first cache line of packet)
[...]

If the hardware can do RX checksumming (it's not clear) then the driver
should pass the paged buffers into GRO and that will take care of skb
allocation as necessary.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: Add NAPI support to ll_temac driver
From: Michal Simek @ 2011-04-19 12:26 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1303209787.3480.9.camel@edumazet-laptop>

Eric Dumazet wrote:
> Le mardi 19 avril 2011 à 11:35 +0200, Michal Simek a écrit :
>> Hi,
>>
>> I would like to try to add NAPI support for ll_temac and look if help us to 
>> improve performance on Microblaze system. I would expect that bandwidth should 
>> be increased.
>> We have the second non mainline driver which use tasklets and it provides better 
>>   performance than mainline driver but not so big that's why I think that NAPI 
>> can increase performance.
>>
>> Can you please point me to any driver which I could use as a template?
>> Or any developer guide to do so.
>>
>> Do you know any other option how to improve driver performance on low speed cpu?
>>
>> I have found that driver spends a lot of time on skb allocation and preallocated 
>> SKBs help a little bit. I have done a test where I increased number of 
>> preallocated BDs(SKBs) for rx to 35000 and disable new BD(SKB) allocation in 
>> rx_irq. 35000 BDs is setup because I need them to successfully finish netperf 
>> test. I have got 25% bandwidth increasing.
>>
>> It will be also nice to be able to allocate several BDs(SKBs) which could be 
>> faster than allocate them in sequence.
> 
> Depends if your cpu has some cache. The best performance is to try to
> get high cache hit ratios.

Yes it has icache and dcache (write-back or write-through).


> 
> One possible way to get better performance is to change driver to
> allocate skbs only right before calling netif_rx(), so that you dont
> have to access cold sk_buff data twice (once when allocating skb and put
> it in ring buffer, a second time when receiving frame)

ok. But I need to allocate BD for dma with pointer to skb where dma should copy 
data to. I could do it in irq but I would have to wait till dma copy data from 
ethernet controller to memory. I haven't measure how slow/fast is that copying.

> 
> drivers/net/niu.c is a good example for this (NAPI + netdev_alloc_skb()
> just in time + pull in skbhead only first cache line of packet)
> 
> drivers/net/ftmac100.c is also a recent driver (and probably a better
> start with less complex hardware than NIU) using these tricks
> 
> { skb = netdev_alloc_skb_ip_align(netdev, 128);
>  __pskb_pull_tail(skb, min(length, 64)); 
> }

I have change rx for napi but need to debug it a little bit. It works for some 
packets but I am not able to run any test right now.

Thanks,
Michal


-- 
Michal Simek, Ing. (M.Eng)
w: www.monstr.eu p: +42-0-721842854
Maintainer of Linux kernel 2.6 Microblaze Linux - http://www.monstr.eu/fdt/
Microblaze U-BOOT custodian

^ permalink raw reply

* [PATCH] net: qlcnic: convert to hw_features
From: Michał Mirosław @ 2011-04-19 12:42 UTC (permalink / raw)
  To: netdev; +Cc: Amit Kumar Salecha, Anirban Chakraborty, linux-driver

Bit more than minimal conversion. There might be some issues because
of qlcnic_set_netdev_features() if it's called after netdev init.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/qlcnic/qlcnic.h         |    3 +-
 drivers/net/qlcnic/qlcnic_ethtool.c |  120 -----------------------------------
 drivers/net/qlcnic/qlcnic_hw.c      |   35 ++++++++++
 drivers/net/qlcnic/qlcnic_init.c    |    4 +-
 drivers/net/qlcnic/qlcnic_main.c    |   35 +++++------
 5 files changed, 55 insertions(+), 142 deletions(-)

diff --git a/drivers/net/qlcnic/qlcnic.h b/drivers/net/qlcnic/qlcnic.h
index e5d3053..fa5b15c 100644
--- a/drivers/net/qlcnic/qlcnic.h
+++ b/drivers/net/qlcnic/qlcnic.h
@@ -926,7 +926,6 @@ struct qlcnic_adapter {
 	u8 max_rds_rings;
 	u8 max_sds_rings;
 	u8 msix_supported;
-	u8 rx_csum;
 	u8 portnum;
 	u8 physical_port;
 	u8 reset_context;
@@ -1247,6 +1246,8 @@ void qlcnic_advert_link_change(struct qlcnic_adapter *adapter, int linkup);
 
 int qlcnic_fw_cmd_set_mtu(struct qlcnic_adapter *adapter, int mtu);
 int qlcnic_change_mtu(struct net_device *netdev, int new_mtu);
+u32 qlcnic_fix_features(struct net_device *netdev, u32 features);
+int qlcnic_set_features(struct net_device *netdev, u32 features);
 int qlcnic_config_hw_lro(struct qlcnic_adapter *adapter, int enable);
 int qlcnic_config_bridged_mode(struct qlcnic_adapter *adapter, u32 enable);
 int qlcnic_send_lro_cleanup(struct qlcnic_adapter *adapter);
diff --git a/drivers/net/qlcnic/qlcnic_ethtool.c b/drivers/net/qlcnic/qlcnic_ethtool.c
index 3cd8a16..615a5ab 100644
--- a/drivers/net/qlcnic/qlcnic_ethtool.c
+++ b/drivers/net/qlcnic/qlcnic_ethtool.c
@@ -764,73 +764,6 @@ qlcnic_get_ethtool_stats(struct net_device *dev,
 	qlcnic_fill_device_stats(&index, data, &port_stats.tx);
 }
 
-static int qlcnic_set_tx_csum(struct net_device *dev, u32 data)
-{
-	struct qlcnic_adapter *adapter = netdev_priv(dev);
-
-	if ((adapter->flags & QLCNIC_ESWITCH_ENABLED))
-		return -EOPNOTSUPP;
-	if (data)
-		dev->features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-	else
-		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-
-	return 0;
-
-}
-static u32 qlcnic_get_tx_csum(struct net_device *dev)
-{
-	return dev->features & NETIF_F_IP_CSUM;
-}
-
-static u32 qlcnic_get_rx_csum(struct net_device *dev)
-{
-	struct qlcnic_adapter *adapter = netdev_priv(dev);
-	return adapter->rx_csum;
-}
-
-static int qlcnic_set_rx_csum(struct net_device *dev, u32 data)
-{
-	struct qlcnic_adapter *adapter = netdev_priv(dev);
-
-	if ((adapter->flags & QLCNIC_ESWITCH_ENABLED))
-		return -EOPNOTSUPP;
-	if (!!data) {
-		adapter->rx_csum = !!data;
-		return 0;
-	}
-
-	if (dev->features & NETIF_F_LRO) {
-		if (qlcnic_config_hw_lro(adapter, QLCNIC_LRO_DISABLED))
-			return -EIO;
-
-		dev->features &= ~NETIF_F_LRO;
-		qlcnic_send_lro_cleanup(adapter);
-		dev_info(&adapter->pdev->dev,
-					"disabling LRO as rx_csum is off\n");
-	}
-	adapter->rx_csum = !!data;
-	return 0;
-}
-
-static u32 qlcnic_get_tso(struct net_device *dev)
-{
-	return (dev->features & (NETIF_F_TSO | NETIF_F_TSO6)) != 0;
-}
-
-static int qlcnic_set_tso(struct net_device *dev, u32 data)
-{
-	struct qlcnic_adapter *adapter = netdev_priv(dev);
-	if (!(adapter->capabilities & QLCNIC_FW_CAPABILITY_TSO))
-		return -EOPNOTSUPP;
-	if (data)
-		dev->features |= (NETIF_F_TSO | NETIF_F_TSO6);
-	else
-		dev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6);
-
-	return 0;
-}
-
 static int qlcnic_set_led(struct net_device *dev,
 			  enum ethtool_phys_id_state state)
 {
@@ -993,50 +926,6 @@ static int qlcnic_get_intr_coalesce(struct net_device *netdev,
 	return 0;
 }
 
-static int qlcnic_set_flags(struct net_device *netdev, u32 data)
-{
-	struct qlcnic_adapter *adapter = netdev_priv(netdev);
-	int hw_lro;
-
-	if (ethtool_invalid_flags(netdev, data, ETH_FLAG_LRO))
-		return -EINVAL;
-
-	if (data & ETH_FLAG_LRO) {
-
-		if (netdev->features & NETIF_F_LRO)
-			return 0;
-
-		if (!(adapter->capabilities & QLCNIC_FW_CAPABILITY_HW_LRO))
-			return -EINVAL;
-
-		if (!adapter->rx_csum) {
-			dev_info(&adapter->pdev->dev, "rx csum is off, "
-				"cannot toggle lro\n");
-			return -EINVAL;
-		}
-
-		hw_lro = QLCNIC_LRO_ENABLED;
-		netdev->features |= NETIF_F_LRO;
-
-	} else {
-
-		if (!(netdev->features & NETIF_F_LRO))
-			return 0;
-
-		hw_lro = 0;
-		netdev->features &= ~NETIF_F_LRO;
-	}
-
-	if (qlcnic_config_hw_lro(adapter, hw_lro))
-		return -EIO;
-
-	if ((hw_lro == 0) && qlcnic_send_lro_cleanup(adapter))
-		return -EIO;
-
-
-	return 0;
-}
-
 static u32 qlcnic_get_msglevel(struct net_device *netdev)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
@@ -1064,23 +953,14 @@ const struct ethtool_ops qlcnic_ethtool_ops = {
 	.set_ringparam = qlcnic_set_ringparam,
 	.get_pauseparam = qlcnic_get_pauseparam,
 	.set_pauseparam = qlcnic_set_pauseparam,
-	.get_tx_csum = qlcnic_get_tx_csum,
-	.set_tx_csum = qlcnic_set_tx_csum,
-	.set_sg = ethtool_op_set_sg,
-	.get_tso = qlcnic_get_tso,
-	.set_tso = qlcnic_set_tso,
 	.get_wol = qlcnic_get_wol,
 	.set_wol = qlcnic_set_wol,
 	.self_test = qlcnic_diag_test,
 	.get_strings = qlcnic_get_strings,
 	.get_ethtool_stats = qlcnic_get_ethtool_stats,
 	.get_sset_count = qlcnic_get_sset_count,
-	.get_rx_csum = qlcnic_get_rx_csum,
-	.set_rx_csum = qlcnic_set_rx_csum,
 	.get_coalesce = qlcnic_get_intr_coalesce,
 	.set_coalesce = qlcnic_set_intr_coalesce,
-	.get_flags = ethtool_op_get_flags,
-	.set_flags = qlcnic_set_flags,
 	.set_phys_id = qlcnic_set_led,
 	.set_msglevel = qlcnic_set_msglevel,
 	.get_msglevel = qlcnic_get_msglevel,
diff --git a/drivers/net/qlcnic/qlcnic_hw.c b/drivers/net/qlcnic/qlcnic_hw.c
index 498cca9..82613c8 100644
--- a/drivers/net/qlcnic/qlcnic_hw.c
+++ b/drivers/net/qlcnic/qlcnic_hw.c
@@ -758,6 +758,41 @@ int qlcnic_change_mtu(struct net_device *netdev, int mtu)
 	return rc;
 }
 
+
+u32 qlcnic_fix_features(struct net_device *netdev, u32 features)
+{
+	if ((adapter->flags & QLCNIC_ESWITCH_ENABLED)) {
+		u32 changed = features ^ dev->features;
+		features ^= changed & (NETIF_F_ALL_CSUM | NETIF_F_RXCSUM);
+	}
+
+	if (!(features & NETIF_F_RXCSUM))
+		features &= ~NETIF_F_LRO;
+
+	return features;
+}
+
+
+int qlcnic_set_features(struct net_device *netdev, u32 features)
+{
+	struct qlcnic_adapter *adapter = netdev_priv(netdev);
+	u32 changed = dev->features ^ features;
+	int hw_lro = (features & NETIF_F_LRO) ? QLCNIC_LRO_ENABLED : 0;
+
+	if (!(changed & NETIF_F_LRO))
+		return 0;
+
+	dev->features = features;
+
+	if (qlcnic_config_hw_lro(adapter, hw_lro))
+		return -EIO;
+
+	if ((hw_lro == 0) && qlcnic_send_lro_cleanup(adapter))
+		return -EIO;
+
+	return 1;
+}
+
 /*
  * Changes the CRB window to the specified window.
  */
diff --git a/drivers/net/qlcnic/qlcnic_init.c b/drivers/net/qlcnic/qlcnic_init.c
index 4ec0eeb..d0f338b 100644
--- a/drivers/net/qlcnic/qlcnic_init.c
+++ b/drivers/net/qlcnic/qlcnic_init.c
@@ -1390,8 +1390,8 @@ static struct sk_buff *qlcnic_process_rxbuf(struct qlcnic_adapter *adapter,
 
 	skb = buffer->skb;
 
-	if (likely(adapter->rx_csum && (cksum == STATUS_CKSUM_OK ||
-						cksum == STATUS_CKSUM_LOOP))) {
+	if (likely((adapter->netdev->features & NETIF_F_RXCSUM) &&
+	    (cksum == STATUS_CKSUM_OK || cksum == STATUS_CKSUM_LOOP))) {
 		adapter->stats.csummed++;
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	} else {
diff --git a/drivers/net/qlcnic/qlcnic_main.c b/drivers/net/qlcnic/qlcnic_main.c
index e9e9ba6..06bb233 100644
--- a/drivers/net/qlcnic/qlcnic_main.c
+++ b/drivers/net/qlcnic/qlcnic_main.c
@@ -328,6 +328,8 @@ static const struct net_device_ops qlcnic_netdev_ops = {
 	.ndo_set_multicast_list = qlcnic_set_multi,
 	.ndo_set_mac_address    = qlcnic_set_mac,
 	.ndo_change_mtu	   = qlcnic_change_mtu,
+	.ndo_fix_features  = qlcnic_fix_features,
+	.ndo_set_features  = qlcnic_set_features,
 	.ndo_tx_timeout	   = qlcnic_tx_timeout,
 	.ndo_vlan_rx_add_vid	= qlcnic_vlan_rx_add,
 	.ndo_vlan_rx_kill_vid	= qlcnic_vlan_rx_del,
@@ -764,7 +766,7 @@ qlcnic_set_netdev_features(struct qlcnic_adapter *adapter,
 	struct net_device *netdev = adapter->netdev;
 	unsigned long features, vlan_features;
 
-	features = (NETIF_F_SG | NETIF_F_IP_CSUM |
+	features = (NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM |
 			NETIF_F_IPV6_CSUM | NETIF_F_GRO);
 	vlan_features = (NETIF_F_SG | NETIF_F_IP_CSUM |
 			NETIF_F_IPV6_CSUM | NETIF_F_HW_VLAN_FILTER);
@@ -779,14 +781,12 @@ qlcnic_set_netdev_features(struct qlcnic_adapter *adapter,
 
 	if (esw_cfg->offload_flags & BIT_0) {
 		netdev->features |= features;
-		adapter->rx_csum = 1;
 		if (!(esw_cfg->offload_flags & BIT_1))
 			netdev->features &= ~NETIF_F_TSO;
 		if (!(esw_cfg->offload_flags & BIT_2))
 			netdev->features &= ~NETIF_F_TSO6;
 	} else {
 		netdev->features &= ~features;
-		adapter->rx_csum = 0;
 	}
 
 	netdev->vlan_features = (features & vlan_features);
@@ -1436,7 +1436,6 @@ qlcnic_setup_netdev(struct qlcnic_adapter *adapter,
 	int err;
 	struct pci_dev *pdev = adapter->pdev;
 
-	adapter->rx_csum = 1;
 	adapter->mc_enabled = 0;
 	adapter->max_mc_count = 38;
 
@@ -1447,26 +1446,24 @@ qlcnic_setup_netdev(struct qlcnic_adapter *adapter,
 
 	SET_ETHTOOL_OPS(netdev, &qlcnic_ethtool_ops);
 
-	netdev->features |= (NETIF_F_SG | NETIF_F_IP_CSUM |
-		NETIF_F_IPV6_CSUM | NETIF_F_GRO | NETIF_F_HW_VLAN_RX);
-	netdev->vlan_features |= (NETIF_F_SG | NETIF_F_IP_CSUM |
-		NETIF_F_IPV6_CSUM | NETIF_F_HW_VLAN_FILTER);
-
-	if (adapter->capabilities & QLCNIC_FW_CAPABILITY_TSO) {
-		netdev->features |= (NETIF_F_TSO | NETIF_F_TSO6);
-		netdev->vlan_features |= (NETIF_F_TSO | NETIF_F_TSO6);
-	}
+	netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM |
+		NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM;
 
+	if (adapter->capabilities & QLCNIC_FW_CAPABILITY_TSO)
+		netdev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6;
 	if (pci_using_dac) {
-		netdev->features |= NETIF_F_HIGHDMA;
-		netdev->vlan_features |= NETIF_F_HIGHDMA;
-	}
+		netdev->hw_features |= NETIF_F_HIGHDMA;
+
+	netdev->vlan_features = netdev->hw_features;
 
 	if (adapter->capabilities & QLCNIC_FW_CAPABILITY_FVLANTX)
-		netdev->features |= (NETIF_F_HW_VLAN_TX);
-
+		netdev->hw_features |= NETIF_F_HW_VLAN_TX;
 	if (adapter->capabilities & QLCNIC_FW_CAPABILITY_HW_LRO)
-		netdev->features |= NETIF_F_LRO;
+		netdev->hw_features |= NETIF_F_LRO;
+
+	netdev->features |= netdev->hw_features |
+		NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER;
+
 	netdev->irq = adapter->msix_entries[0].vector;
 
 	netif_carrier_off(netdev);
-- 
1.7.2.5


^ permalink raw reply related

* Re: Add NAPI support to ll_temac driver
From: Michal Simek @ 2011-04-19 12:48 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: Eric Dumazet, netdev
In-Reply-To: <1303215925.3464.54.camel@localhost>

Ben Hutchings wrote:
> On Tue, 2011-04-19 at 12:43 +0200, Eric Dumazet wrote:
> [...]
>> One possible way to get better performance is to change driver to
>> allocate skbs only right before calling netif_rx(), so that you dont
>> have to access cold sk_buff data twice (once when allocating skb and put
>> it in ring buffer, a second time when receiving frame)
>>
>> drivers/net/niu.c is a good example for this (NAPI + netdev_alloc_skb()
>> just in time + pull in skbhead only first cache line of packet)
> [...]
> 
> If the hardware can do RX checksumming (it's not clear) then the driver
> should pass the paged buffers into GRO and that will take care of skb
> allocation as necessary.

Hardware supports RX and TX partial checksumming. I can enable it. The driver 
has also this option and from my tests there is of course some performance 
improvemetn.

Just for sure - here are links on documentation.
http://www.xilinx.com/support/documentation/ip_documentation/xps_ll_temac.pdf
or
http://www.xilinx.com/support/documentation/ip_documentation/axi_ethernet/v2_01_a/ds759_axi_ethernet.pdf

About SKB allocation. I fixed our non mainline driver to allocate skb based on 
current mtu size. Mainline driver allocate max mtu (9k). This has also impact on 
performance because Microblaze works with smaller SKBs.

Can you please be more specific about passing the paged buffers into GRO?
Or point me to any documentation or code which can help me to understand what 
that means.

Thanks,
Michal

-- 
Michal Simek, Ing. (M.Eng)
w: www.monstr.eu p: +42-0-721842854
Maintainer of Linux kernel 2.6 Microblaze Linux - http://www.monstr.eu/fdt/
Microblaze U-BOOT custodian

^ permalink raw reply

* [PATCH v2] net: qlcnic: convert to hw_features
From: Michał Mirosław @ 2011-04-19 13:03 UTC (permalink / raw)
  To: netdev; +Cc: Amit Kumar Salecha, Anirban Chakraborty, linux-driver
In-Reply-To: <20110419124238.AECB513909@rere.qmqm.pl>

Bit more than minimal conversion. There might be some issues because
of qlcnic_set_netdev_features() if it's called after netdev init.

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
v2: fix compilation issues (I forgot to 'stg refresh' before sending v1)

 drivers/net/qlcnic/qlcnic.h         |    3 +-
 drivers/net/qlcnic/qlcnic_ethtool.c |  120 -----------------------------------
 drivers/net/qlcnic/qlcnic_hw.c      |   37 +++++++++++
 drivers/net/qlcnic/qlcnic_init.c    |    4 +-
 drivers/net/qlcnic/qlcnic_main.c    |   35 +++++------
 5 files changed, 57 insertions(+), 142 deletions(-)

diff --git a/drivers/net/qlcnic/qlcnic.h b/drivers/net/qlcnic/qlcnic.h
index e5d3053..fa5b15c 100644
--- a/drivers/net/qlcnic/qlcnic.h
+++ b/drivers/net/qlcnic/qlcnic.h
@@ -926,7 +926,6 @@ struct qlcnic_adapter {
 	u8 max_rds_rings;
 	u8 max_sds_rings;
 	u8 msix_supported;
-	u8 rx_csum;
 	u8 portnum;
 	u8 physical_port;
 	u8 reset_context;
@@ -1247,6 +1246,8 @@ void qlcnic_advert_link_change(struct qlcnic_adapter *adapter, int linkup);
 
 int qlcnic_fw_cmd_set_mtu(struct qlcnic_adapter *adapter, int mtu);
 int qlcnic_change_mtu(struct net_device *netdev, int new_mtu);
+u32 qlcnic_fix_features(struct net_device *netdev, u32 features);
+int qlcnic_set_features(struct net_device *netdev, u32 features);
 int qlcnic_config_hw_lro(struct qlcnic_adapter *adapter, int enable);
 int qlcnic_config_bridged_mode(struct qlcnic_adapter *adapter, u32 enable);
 int qlcnic_send_lro_cleanup(struct qlcnic_adapter *adapter);
diff --git a/drivers/net/qlcnic/qlcnic_ethtool.c b/drivers/net/qlcnic/qlcnic_ethtool.c
index 3cd8a16..615a5ab 100644
--- a/drivers/net/qlcnic/qlcnic_ethtool.c
+++ b/drivers/net/qlcnic/qlcnic_ethtool.c
@@ -764,73 +764,6 @@ qlcnic_get_ethtool_stats(struct net_device *dev,
 	qlcnic_fill_device_stats(&index, data, &port_stats.tx);
 }
 
-static int qlcnic_set_tx_csum(struct net_device *dev, u32 data)
-{
-	struct qlcnic_adapter *adapter = netdev_priv(dev);
-
-	if ((adapter->flags & QLCNIC_ESWITCH_ENABLED))
-		return -EOPNOTSUPP;
-	if (data)
-		dev->features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-	else
-		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-
-	return 0;
-
-}
-static u32 qlcnic_get_tx_csum(struct net_device *dev)
-{
-	return dev->features & NETIF_F_IP_CSUM;
-}
-
-static u32 qlcnic_get_rx_csum(struct net_device *dev)
-{
-	struct qlcnic_adapter *adapter = netdev_priv(dev);
-	return adapter->rx_csum;
-}
-
-static int qlcnic_set_rx_csum(struct net_device *dev, u32 data)
-{
-	struct qlcnic_adapter *adapter = netdev_priv(dev);
-
-	if ((adapter->flags & QLCNIC_ESWITCH_ENABLED))
-		return -EOPNOTSUPP;
-	if (!!data) {
-		adapter->rx_csum = !!data;
-		return 0;
-	}
-
-	if (dev->features & NETIF_F_LRO) {
-		if (qlcnic_config_hw_lro(adapter, QLCNIC_LRO_DISABLED))
-			return -EIO;
-
-		dev->features &= ~NETIF_F_LRO;
-		qlcnic_send_lro_cleanup(adapter);
-		dev_info(&adapter->pdev->dev,
-					"disabling LRO as rx_csum is off\n");
-	}
-	adapter->rx_csum = !!data;
-	return 0;
-}
-
-static u32 qlcnic_get_tso(struct net_device *dev)
-{
-	return (dev->features & (NETIF_F_TSO | NETIF_F_TSO6)) != 0;
-}
-
-static int qlcnic_set_tso(struct net_device *dev, u32 data)
-{
-	struct qlcnic_adapter *adapter = netdev_priv(dev);
-	if (!(adapter->capabilities & QLCNIC_FW_CAPABILITY_TSO))
-		return -EOPNOTSUPP;
-	if (data)
-		dev->features |= (NETIF_F_TSO | NETIF_F_TSO6);
-	else
-		dev->features &= ~(NETIF_F_TSO | NETIF_F_TSO6);
-
-	return 0;
-}
-
 static int qlcnic_set_led(struct net_device *dev,
 			  enum ethtool_phys_id_state state)
 {
@@ -993,50 +926,6 @@ static int qlcnic_get_intr_coalesce(struct net_device *netdev,
 	return 0;
 }
 
-static int qlcnic_set_flags(struct net_device *netdev, u32 data)
-{
-	struct qlcnic_adapter *adapter = netdev_priv(netdev);
-	int hw_lro;
-
-	if (ethtool_invalid_flags(netdev, data, ETH_FLAG_LRO))
-		return -EINVAL;
-
-	if (data & ETH_FLAG_LRO) {
-
-		if (netdev->features & NETIF_F_LRO)
-			return 0;
-
-		if (!(adapter->capabilities & QLCNIC_FW_CAPABILITY_HW_LRO))
-			return -EINVAL;
-
-		if (!adapter->rx_csum) {
-			dev_info(&adapter->pdev->dev, "rx csum is off, "
-				"cannot toggle lro\n");
-			return -EINVAL;
-		}
-
-		hw_lro = QLCNIC_LRO_ENABLED;
-		netdev->features |= NETIF_F_LRO;
-
-	} else {
-
-		if (!(netdev->features & NETIF_F_LRO))
-			return 0;
-
-		hw_lro = 0;
-		netdev->features &= ~NETIF_F_LRO;
-	}
-
-	if (qlcnic_config_hw_lro(adapter, hw_lro))
-		return -EIO;
-
-	if ((hw_lro == 0) && qlcnic_send_lro_cleanup(adapter))
-		return -EIO;
-
-
-	return 0;
-}
-
 static u32 qlcnic_get_msglevel(struct net_device *netdev)
 {
 	struct qlcnic_adapter *adapter = netdev_priv(netdev);
@@ -1064,23 +953,14 @@ const struct ethtool_ops qlcnic_ethtool_ops = {
 	.set_ringparam = qlcnic_set_ringparam,
 	.get_pauseparam = qlcnic_get_pauseparam,
 	.set_pauseparam = qlcnic_set_pauseparam,
-	.get_tx_csum = qlcnic_get_tx_csum,
-	.set_tx_csum = qlcnic_set_tx_csum,
-	.set_sg = ethtool_op_set_sg,
-	.get_tso = qlcnic_get_tso,
-	.set_tso = qlcnic_set_tso,
 	.get_wol = qlcnic_get_wol,
 	.set_wol = qlcnic_set_wol,
 	.self_test = qlcnic_diag_test,
 	.get_strings = qlcnic_get_strings,
 	.get_ethtool_stats = qlcnic_get_ethtool_stats,
 	.get_sset_count = qlcnic_get_sset_count,
-	.get_rx_csum = qlcnic_get_rx_csum,
-	.set_rx_csum = qlcnic_set_rx_csum,
 	.get_coalesce = qlcnic_get_intr_coalesce,
 	.set_coalesce = qlcnic_set_intr_coalesce,
-	.get_flags = ethtool_op_get_flags,
-	.set_flags = qlcnic_set_flags,
 	.set_phys_id = qlcnic_set_led,
 	.set_msglevel = qlcnic_set_msglevel,
 	.get_msglevel = qlcnic_get_msglevel,
diff --git a/drivers/net/qlcnic/qlcnic_hw.c b/drivers/net/qlcnic/qlcnic_hw.c
index 498cca9..cbb27f2 100644
--- a/drivers/net/qlcnic/qlcnic_hw.c
+++ b/drivers/net/qlcnic/qlcnic_hw.c
@@ -758,6 +758,43 @@ int qlcnic_change_mtu(struct net_device *netdev, int mtu)
 	return rc;
 }
 
+
+u32 qlcnic_fix_features(struct net_device *netdev, u32 features)
+{
+	struct qlcnic_adapter *adapter = netdev_priv(netdev);
+
+	if ((adapter->flags & QLCNIC_ESWITCH_ENABLED)) {
+		u32 changed = features ^ netdev->features;
+		features ^= changed & (NETIF_F_ALL_CSUM | NETIF_F_RXCSUM);
+	}
+
+	if (!(features & NETIF_F_RXCSUM))
+		features &= ~NETIF_F_LRO;
+
+	return features;
+}
+
+
+int qlcnic_set_features(struct net_device *netdev, u32 features)
+{
+	struct qlcnic_adapter *adapter = netdev_priv(netdev);
+	u32 changed = netdev->features ^ features;
+	int hw_lro = (features & NETIF_F_LRO) ? QLCNIC_LRO_ENABLED : 0;
+
+	if (!(changed & NETIF_F_LRO))
+		return 0;
+
+	netdev->features = features ^ NETIF_F_LRO;
+
+	if (qlcnic_config_hw_lro(adapter, hw_lro))
+		return -EIO;
+
+	if ((hw_lro == 0) && qlcnic_send_lro_cleanup(adapter))
+		return -EIO;
+
+	return 0;
+}
+
 /*
  * Changes the CRB window to the specified window.
  */
diff --git a/drivers/net/qlcnic/qlcnic_init.c b/drivers/net/qlcnic/qlcnic_init.c
index 4ec0eeb..d0f338b 100644
--- a/drivers/net/qlcnic/qlcnic_init.c
+++ b/drivers/net/qlcnic/qlcnic_init.c
@@ -1390,8 +1390,8 @@ static struct sk_buff *qlcnic_process_rxbuf(struct qlcnic_adapter *adapter,
 
 	skb = buffer->skb;
 
-	if (likely(adapter->rx_csum && (cksum == STATUS_CKSUM_OK ||
-						cksum == STATUS_CKSUM_LOOP))) {
+	if (likely((adapter->netdev->features & NETIF_F_RXCSUM) &&
+	    (cksum == STATUS_CKSUM_OK || cksum == STATUS_CKSUM_LOOP))) {
 		adapter->stats.csummed++;
 		skb->ip_summed = CHECKSUM_UNNECESSARY;
 	} else {
diff --git a/drivers/net/qlcnic/qlcnic_main.c b/drivers/net/qlcnic/qlcnic_main.c
index e9e9ba6..6e61951 100644
--- a/drivers/net/qlcnic/qlcnic_main.c
+++ b/drivers/net/qlcnic/qlcnic_main.c
@@ -328,6 +328,8 @@ static const struct net_device_ops qlcnic_netdev_ops = {
 	.ndo_set_multicast_list = qlcnic_set_multi,
 	.ndo_set_mac_address    = qlcnic_set_mac,
 	.ndo_change_mtu	   = qlcnic_change_mtu,
+	.ndo_fix_features  = qlcnic_fix_features,
+	.ndo_set_features  = qlcnic_set_features,
 	.ndo_tx_timeout	   = qlcnic_tx_timeout,
 	.ndo_vlan_rx_add_vid	= qlcnic_vlan_rx_add,
 	.ndo_vlan_rx_kill_vid	= qlcnic_vlan_rx_del,
@@ -764,7 +766,7 @@ qlcnic_set_netdev_features(struct qlcnic_adapter *adapter,
 	struct net_device *netdev = adapter->netdev;
 	unsigned long features, vlan_features;
 
-	features = (NETIF_F_SG | NETIF_F_IP_CSUM |
+	features = (NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM |
 			NETIF_F_IPV6_CSUM | NETIF_F_GRO);
 	vlan_features = (NETIF_F_SG | NETIF_F_IP_CSUM |
 			NETIF_F_IPV6_CSUM | NETIF_F_HW_VLAN_FILTER);
@@ -779,14 +781,12 @@ qlcnic_set_netdev_features(struct qlcnic_adapter *adapter,
 
 	if (esw_cfg->offload_flags & BIT_0) {
 		netdev->features |= features;
-		adapter->rx_csum = 1;
 		if (!(esw_cfg->offload_flags & BIT_1))
 			netdev->features &= ~NETIF_F_TSO;
 		if (!(esw_cfg->offload_flags & BIT_2))
 			netdev->features &= ~NETIF_F_TSO6;
 	} else {
 		netdev->features &= ~features;
-		adapter->rx_csum = 0;
 	}
 
 	netdev->vlan_features = (features & vlan_features);
@@ -1436,7 +1436,6 @@ qlcnic_setup_netdev(struct qlcnic_adapter *adapter,
 	int err;
 	struct pci_dev *pdev = adapter->pdev;
 
-	adapter->rx_csum = 1;
 	adapter->mc_enabled = 0;
 	adapter->max_mc_count = 38;
 
@@ -1447,26 +1446,24 @@ qlcnic_setup_netdev(struct qlcnic_adapter *adapter,
 
 	SET_ETHTOOL_OPS(netdev, &qlcnic_ethtool_ops);
 
-	netdev->features |= (NETIF_F_SG | NETIF_F_IP_CSUM |
-		NETIF_F_IPV6_CSUM | NETIF_F_GRO | NETIF_F_HW_VLAN_RX);
-	netdev->vlan_features |= (NETIF_F_SG | NETIF_F_IP_CSUM |
-		NETIF_F_IPV6_CSUM | NETIF_F_HW_VLAN_FILTER);
+	netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM |
+		NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM;
 
-	if (adapter->capabilities & QLCNIC_FW_CAPABILITY_TSO) {
-		netdev->features |= (NETIF_F_TSO | NETIF_F_TSO6);
-		netdev->vlan_features |= (NETIF_F_TSO | NETIF_F_TSO6);
-	}
+	if (adapter->capabilities & QLCNIC_FW_CAPABILITY_TSO)
+		netdev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6;
+	if (pci_using_dac)
+		netdev->hw_features |= NETIF_F_HIGHDMA;
 
-	if (pci_using_dac) {
-		netdev->features |= NETIF_F_HIGHDMA;
-		netdev->vlan_features |= NETIF_F_HIGHDMA;
-	}
+	netdev->vlan_features = netdev->hw_features;
 
 	if (adapter->capabilities & QLCNIC_FW_CAPABILITY_FVLANTX)
-		netdev->features |= (NETIF_F_HW_VLAN_TX);
-
+		netdev->hw_features |= NETIF_F_HW_VLAN_TX;
 	if (adapter->capabilities & QLCNIC_FW_CAPABILITY_HW_LRO)
-		netdev->features |= NETIF_F_LRO;
+		netdev->hw_features |= NETIF_F_LRO;
+
+	netdev->features |= netdev->hw_features |
+		NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER;
+
 	netdev->irq = adapter->msix_entries[0].vector;
 
 	netif_carrier_off(netdev);
-- 
1.7.2.5


^ permalink raw reply related

* Re: Add NAPI support to ll_temac driver
From: Eric Dumazet @ 2011-04-19 13:13 UTC (permalink / raw)
  To: monstr; +Cc: Ben Hutchings, netdev
In-Reply-To: <4DAD84B1.9020405@monstr.eu>

Le mardi 19 avril 2011 à 14:48 +0200, Michal Simek a écrit :

> Can you please be more specific about passing the paged buffers into GRO?
> Or point me to any documentation or code which can help me to understand what 
> that means.

Search for napi_get_frags() :

drivers/net/mlx4/en_rx.c:597:					struct sk_buff *gro_skb = napi_get_frags(&cq->napi);
drivers/net/cxgb3/sge.c:2091:		skb = napi_get_frags(&qs->napi);
drivers/net/cxgb4/sge.c:1517:	skb = napi_get_frags(&rxq->rspq.napi);
drivers/net/qlge/qlge_main.c:1484:	skb = napi_get_frags(napi);
drivers/net/sfc/rx.c:471:		skb = napi_get_frags(napi);
drivers/net/benet/be_main.c:1039:	skb = napi_get_frags(&eq_obj->napi);
drivers/net/cxgb4vf/sge.c:1479:	skb = napi_get_frags(&rxq->rspq.napi);




^ permalink raw reply

* Re: Add NAPI support to ll_temac driver
From: Ben Hutchings @ 2011-04-19 13:14 UTC (permalink / raw)
  To: monstr; +Cc: Eric Dumazet, netdev
In-Reply-To: <4DAD84B1.9020405@monstr.eu>

On Tue, 2011-04-19 at 14:48 +0200, Michal Simek wrote:
> Ben Hutchings wrote:
> > On Tue, 2011-04-19 at 12:43 +0200, Eric Dumazet wrote:
> > [...]
> >> One possible way to get better performance is to change driver to
> >> allocate skbs only right before calling netif_rx(), so that you dont
> >> have to access cold sk_buff data twice (once when allocating skb and put
> >> it in ring buffer, a second time when receiving frame)
> >>
> >> drivers/net/niu.c is a good example for this (NAPI + netdev_alloc_skb()
> >> just in time + pull in skbhead only first cache line of packet)
> > [...]
> > 
> > If the hardware can do RX checksumming (it's not clear) then the driver
> > should pass the paged buffers into GRO and that will take care of skb
> > allocation as necessary.
> 
> Hardware supports RX and TX partial checksumming. I can enable it. The driver 
> has also this option and from my tests there is of course some performance 
> improvemetn.
> 
> Just for sure - here are links on documentation.
> http://www.xilinx.com/support/documentation/ip_documentation/xps_ll_temac.pdf
> or
> http://www.xilinx.com/support/documentation/ip_documentation/axi_ethernet/v2_01_a/ds759_axi_ethernet.pdf

I'm not going to read those.  Just providing brief advice.

> About SKB allocation. I fixed our non mainline driver to allocate skb based on 
> current mtu size. Mainline driver allocate max mtu (9k). This has also impact on 
> performance because Microblaze works with smaller SKBs.
> 
> Can you please be more specific about passing the paged buffers into GRO?
> Or point me to any documentation or code which can help me to understand what 
> that means.

You would use napi_get_frags() to get a new or recycled skb, fill in
skb->frags, then call napi_gro_frags() to pass it into GRO.  The benet,
cxgb3 and sfc drivers do this.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH] net: xen-netback: convert to hw_features
From: Ian Campbell @ 2011-04-19 13:17 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: netdev@vger.kernel.org, xen-devel@lists.xensource.com
In-Reply-To: <20110419115612.C4ACA13909@rere.qmqm.pl>

On Tue, 2011-04-19 at 12:56 +0100, Michał Mirosław wrote:
> Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>

Thanks for beating me to this! However the prototype for
xenvif_fix_features is wrong (needs to take a net_device not a xenvif).

I fixed it with the following, I also moved the !can_sg MTU clamping
into a set_features hook (like we do with netfront). Am I right that
this pattern copes with changes to SG via ethtool etc better? I think
it's more future proof in any case.

NB: I'm having some issues with my test hardware at the moment so this
is reviewed by eye and compile tested only...

I'm also happy for this to be folded into the original with my
"Signed-off-/Acked-by Ian Campbell <ian.campbell@citrix.com>" if that is
preferable.

Cheers,
Ian.
8<-----------------

net: xen-netback: correct prototype of xenvif_fix_features.

Also check MTU vs NETIF_F_SG in ndo_set_features hook to allow for
dynamic modification.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>

diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index fe25308..61757bd 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -165,9 +165,9 @@ static int xenvif_change_mtu(struct net_device *dev, int mtu)
 	return 0;
 }
 
-static u32 xenvif_fix_features(struct xenvif *vif, u32 features)
+static u32 xenvif_fix_features(struct net_device *dev, u32 features)
 {
-	struct net_device *dev = vif->dev;
+	struct xenvif *vif = netdev_priv(dev);
 
 	if (!vif->can_sg)
 		features &= ~NETIF_F_SG;
@@ -179,6 +179,16 @@ static u32 xenvif_fix_features(struct xenvif *vif, u32 features)
 	return features;
 }
 
+static int xenvif_set_features(struct net_device *dev, u32 features)
+{
+	if (!(features & NETIF_F_SG) && dev->mtu > ETH_DATA_LEN) {
+		netdev_info(dev, "Reducing MTU because no SG offload");
+		dev->mtu = ETH_DATA_LEN;
+	}
+
+	return 0;
+}
+
 static const struct xenvif_stat {
 	char name[ETH_GSTRING_LEN];
 	u16 offset;
@@ -237,6 +247,7 @@ static struct net_device_ops xenvif_netdev_ops = {
 	.ndo_stop	= xenvif_close,
 	.ndo_change_mtu	= xenvif_change_mtu,
 	.ndo_fix_features = xenvif_fix_features,
+	.ndo_set_features = xenvif_set_features,
 };
 
 struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
@@ -329,8 +340,6 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 	rtnl_lock();
 	if (netif_running(vif->dev))
 		xenvif_up(vif);
-	if (!vif->can_sg && vif->dev->mtu > ETH_DATA_LEN)
-		dev_set_mtu(vif->dev, ETH_DATA_LEN);
 	netdev_update_features(vif->dev);
 	netif_carrier_on(vif->dev);
 	rtnl_unlock();



^ permalink raw reply related

* Re: Add NAPI support to ll_temac driver
From: Michal Simek @ 2011-04-19 13:18 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: Eric Dumazet, netdev
In-Reply-To: <1303218898.3464.59.camel@localhost>

Ben Hutchings wrote:
> On Tue, 2011-04-19 at 14:48 +0200, Michal Simek wrote:
>> Ben Hutchings wrote:
>>> On Tue, 2011-04-19 at 12:43 +0200, Eric Dumazet wrote:
>>> [...]
>>>> One possible way to get better performance is to change driver to
>>>> allocate skbs only right before calling netif_rx(), so that you dont
>>>> have to access cold sk_buff data twice (once when allocating skb and put
>>>> it in ring buffer, a second time when receiving frame)
>>>>
>>>> drivers/net/niu.c is a good example for this (NAPI + netdev_alloc_skb()
>>>> just in time + pull in skbhead only first cache line of packet)
>>> [...]
>>>
>>> If the hardware can do RX checksumming (it's not clear) then the driver
>>> should pass the paged buffers into GRO and that will take care of skb
>>> allocation as necessary.
>> Hardware supports RX and TX partial checksumming. I can enable it. The driver 
>> has also this option and from my tests there is of course some performance 
>> improvemetn.
>>
>> Just for sure - here are links on documentation.
>> http://www.xilinx.com/support/documentation/ip_documentation/xps_ll_temac.pdf
>> or
>> http://www.xilinx.com/support/documentation/ip_documentation/axi_ethernet/v2_01_a/ds759_axi_ethernet.pdf
> 
> I'm not going to read those.  Just providing brief advice.
> 
>> About SKB allocation. I fixed our non mainline driver to allocate skb based on 
>> current mtu size. Mainline driver allocate max mtu (9k). This has also impact on 
>> performance because Microblaze works with smaller SKBs.
>>
>> Can you please be more specific about passing the paged buffers into GRO?
>> Or point me to any documentation or code which can help me to understand what 
>> that means.
> 
> You would use napi_get_frags() to get a new or recycled skb, fill in
> skb->frags, then call napi_gro_frags() to pass it into GRO.  The benet,
> cxgb3 and sfc drivers do this.

ok. Will look.

Thanks,
Michal

-- 
Michal Simek, Ing. (M.Eng)
w: www.monstr.eu p: +42-0-721842854
Maintainer of Linux kernel 2.6 Microblaze Linux - http://www.monstr.eu/fdt/
Microblaze U-BOOT custodian

^ permalink raw reply

* Re: Add NAPI support to ll_temac driver
From: Eric Dumazet @ 2011-04-19 13:21 UTC (permalink / raw)
  To: monstr; +Cc: Ben Hutchings, netdev
In-Reply-To: <4DAD84B1.9020405@monstr.eu>

Le mardi 19 avril 2011 à 14:48 +0200, Michal Simek a écrit :

> About SKB allocation. I fixed our non mainline driver to allocate skb based on 
> current mtu size. Mainline driver allocate max mtu (9k). This has also impact on 
> performance because Microblaze works with smaller SKBs.
> 

Make sure it wont crash/freeze if some evil guy sends a big frame




^ permalink raw reply

* Re: [PATCH] net: xen-netback: convert to hw_features
From: Michał Mirosław @ 2011-04-19 13:30 UTC (permalink / raw)
  To: Ian Campbell; +Cc: netdev@vger.kernel.org, xen-devel@lists.xensource.com
In-Reply-To: <1303219073.5997.191.camel@zakaz.uk.xensource.com>

On Tue, Apr 19, 2011 at 02:17:53PM +0100, Ian Campbell wrote:
> On Tue, 2011-04-19 at 12:56 +0100, Michał Mirosław wrote:
> > Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> Thanks for beating me to this! However the prototype for
> xenvif_fix_features is wrong (needs to take a net_device not a xenvif).

I'll resend v2 with this fix.

> I fixed it with the following, I also moved the !can_sg MTU clamping
> into a set_features hook (like we do with netfront). Am I right that
> this pattern copes with changes to SG via ethtool etc better? I think
> it's more future proof in any case.

This looks wrong. Even if SG is turned on, you might get big skbs which
are linearized. There is a difference in SG capability and SG offload
status and as I see it the capability is what you need to test for MTU.

> NB: I'm having some issues with my test hardware at the moment so this
> is reviewed by eye and compile tested only...
> 
> I'm also happy for this to be folded into the original with my
> "Signed-off-/Acked-by Ian Campbell <ian.campbell@citrix.com>" if that is
> preferable.

Thanks.

Best Regards,
Michał Mirosław

^ permalink raw reply

* [PATCH v2] net: xen-netback: convert to hw_features
From: Michał Mirosław @ 2011-04-19 13:35 UTC (permalink / raw)
  To: netdev; +Cc: Ian Campbell, xen-devel
In-Reply-To: <20110419115612.C4ACA13909@rere.qmqm.pl>

Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
Acked-by: Ian Campbell <ian.campbell@citrix.com>

---
v2: fix xenvif_fix_features() arguments

 drivers/net/xen-netback/common.h    |    3 -
 drivers/net/xen-netback/interface.c |   84 ++++++----------------------------
 2 files changed, 15 insertions(+), 72 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 5d7bbf2..8753e6d 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -73,9 +73,6 @@ struct xenvif {
 	struct vm_struct *tx_comms_area;
 	struct vm_struct *rx_comms_area;
 
-	/* Flags that must not be set in dev->features */
-	u32 features_disabled;
-
 	/* Frontend feature information. */
 	u8 can_sg:1;
 	u8 gso:1;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index de569cc..0ca86f9 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -165,69 +165,18 @@ static int xenvif_change_mtu(struct net_device *dev, int mtu)
 	return 0;
 }
 
-static void xenvif_set_features(struct xenvif *vif)
-{
-	struct net_device *dev = vif->dev;
-	u32 features = dev->features;
-
-	if (vif->can_sg)
-		features |= NETIF_F_SG;
-	if (vif->gso || vif->gso_prefix)
-		features |= NETIF_F_TSO;
-	if (vif->csum)
-		features |= NETIF_F_IP_CSUM;
-
-	features &= ~(vif->features_disabled);
-
-	if (!(features & NETIF_F_SG) && dev->mtu > ETH_DATA_LEN)
-		dev->mtu = ETH_DATA_LEN;
-
-	dev->features = features;
-}
-
-static int xenvif_set_tx_csum(struct net_device *dev, u32 data)
+static u32 xenvif_fix_features(struct net_device *dev, u32 features)
 {
 	struct xenvif *vif = netdev_priv(dev);
-	if (data) {
-		if (!vif->csum)
-			return -EOPNOTSUPP;
-		vif->features_disabled &= ~NETIF_F_IP_CSUM;
-	} else {
-		vif->features_disabled |= NETIF_F_IP_CSUM;
-	}
-
-	xenvif_set_features(vif);
-	return 0;
-}
 
-static int xenvif_set_sg(struct net_device *dev, u32 data)
-{
-	struct xenvif *vif = netdev_priv(dev);
-	if (data) {
-		if (!vif->can_sg)
-			return -EOPNOTSUPP;
-		vif->features_disabled &= ~NETIF_F_SG;
-	} else {
-		vif->features_disabled |= NETIF_F_SG;
-	}
-
-	xenvif_set_features(vif);
-	return 0;
-}
-
-static int xenvif_set_tso(struct net_device *dev, u32 data)
-{
-	struct xenvif *vif = netdev_priv(dev);
-	if (data) {
-		if (!vif->gso && !vif->gso_prefix)
-			return -EOPNOTSUPP;
-		vif->features_disabled &= ~NETIF_F_TSO;
-	} else {
-		vif->features_disabled |= NETIF_F_TSO;
-	}
+	if (!vif->can_sg)
+		features &= ~NETIF_F_SG;
+	if (!vif->gso && !vif->gso_prefix)
+		features &= ~NETIF_F_TSO;
+	if (!vif->csum)
+		features &= ~NETIF_F_IP_CSUM;
 
-	xenvif_set_features(vif);
-	return 0;
+	return features;
 }
 
 static const struct xenvif_stat {
@@ -274,12 +223,6 @@ static void xenvif_get_strings(struct net_device *dev, u32 stringset, u8 * data)
 }
 
 static struct ethtool_ops xenvif_ethtool_ops = {
-	.get_tx_csum	= ethtool_op_get_tx_csum,
-	.set_tx_csum	= xenvif_set_tx_csum,
-	.get_sg		= ethtool_op_get_sg,
-	.set_sg		= xenvif_set_sg,
-	.get_tso	= ethtool_op_get_tso,
-	.set_tso	= xenvif_set_tso,
 	.get_link	= ethtool_op_get_link,
 
 	.get_sset_count = xenvif_get_sset_count,
@@ -293,6 +236,7 @@ static struct net_device_ops xenvif_netdev_ops = {
 	.ndo_open	= xenvif_open,
 	.ndo_stop	= xenvif_close,
 	.ndo_change_mtu	= xenvif_change_mtu,
+	.ndo_fix_features = xenvif_fix_features,
 };
 
 struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
@@ -331,7 +275,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	vif->credit_timeout.expires = jiffies;
 
 	dev->netdev_ops	= &xenvif_netdev_ops;
-	xenvif_set_features(vif);
+	dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
+	dev->features = dev->hw_features;
 	SET_ETHTOOL_OPS(dev, &xenvif_ethtool_ops);
 
 	dev->tx_queue_len = XENVIF_QUEUE_LENGTH;
@@ -367,8 +312,6 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 	if (vif->irq)
 		return 0;
 
-	xenvif_set_features(vif);
-
 	err = xen_netbk_map_frontend_rings(vif, tx_ring_ref, rx_ring_ref);
 	if (err < 0)
 		goto err;
@@ -384,9 +327,12 @@ int xenvif_connect(struct xenvif *vif, unsigned long tx_ring_ref,
 	xenvif_get(vif);
 
 	rtnl_lock();
-	netif_carrier_on(vif->dev);
 	if (netif_running(vif->dev))
 		xenvif_up(vif);
+	if (!vif->can_sg && vif->dev->mtu > ETH_DATA_LEN)
+		dev_set_mtu(vif->dev, ETH_DATA_LEN);
+	netdev_update_features(vif->dev);
+	netif_carrier_on(vif->dev);
 	rtnl_unlock();
 
 	return 0;
-- 
1.7.2.5


^ permalink raw reply related

* Re: [PATCH] net: xen-netback: convert to hw_features
From: Ian Campbell @ 2011-04-19 13:39 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: netdev@vger.kernel.org, xen-devel@lists.xensource.com
In-Reply-To: <20110419133019.GA3973@rere.qmqm.pl>

On Tue, 2011-04-19 at 14:30 +0100, Michał Mirosław wrote:
> On Tue, Apr 19, 2011 at 02:17:53PM +0100, Ian Campbell wrote:
> > On Tue, 2011-04-19 at 12:56 +0100, Michał Mirosław wrote:
> > > Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> > Thanks for beating me to this! However the prototype for
> > xenvif_fix_features is wrong (needs to take a net_device not a xenvif).
> 
> I'll resend v2 with this fix.
> 
> > I fixed it with the following, I also moved the !can_sg MTU clamping
> > into a set_features hook (like we do with netfront). Am I right that
> > this pattern copes with changes to SG via ethtool etc better? I think
> > it's more future proof in any case.
> 
> This looks wrong. Even if SG is turned on, you might get big skbs which
> are linearized. There is a difference in SG capability and SG offload
> status and as I see it the capability is what you need to test for MTU.

So the existing stuff in drivers/net/xen-netfront.c is wrong too?

> > NB: I'm having some issues with my test hardware at the moment so this
> > is reviewed by eye and compile tested only...
> > 
> > I'm also happy for this to be folded into the original with my
> > "Signed-off-/Acked-by Ian Campbell <ian.campbell@citrix.com>" if that is
> > preferable.
> 
> Thanks.
> 
> Best Regards,
> Michał Mirosław



^ permalink raw reply

* Re: [PATCH v2] net: xen-netback: convert to hw_features
From: Michał Mirosław @ 2011-04-19 13:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Ian Campbell, xen-devel
In-Reply-To: <20110419133506.D332413909@rere.qmqm.pl>

On Tue, Apr 19, 2011 at 03:35:06PM +0200, Michał Mirosław wrote:
> Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>

David, I was too quick with this, so please wait with this patch until Ian
acks it again.

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: [PATCH] net: xen-netback: convert to hw_features
From: Michał Mirosław @ 2011-04-19 13:43 UTC (permalink / raw)
  To: Ian Campbell; +Cc: netdev@vger.kernel.org, xen-devel@lists.xensource.com
In-Reply-To: <1303220340.5997.197.camel@zakaz.uk.xensource.com>

On Tue, Apr 19, 2011 at 02:39:00PM +0100, Ian Campbell wrote:
> On Tue, 2011-04-19 at 14:30 +0100, Michał Mirosław wrote:
> > On Tue, Apr 19, 2011 at 02:17:53PM +0100, Ian Campbell wrote:
> > > I fixed it with the following, I also moved the !can_sg MTU clamping
> > > into a set_features hook (like we do with netfront). Am I right that
> > > this pattern copes with changes to SG via ethtool etc better? I think
> > > it's more future proof in any case.
> > This looks wrong. Even if SG is turned on, you might get big skbs which
> > are linearized. There is a difference in SG capability and SG offload
> > status and as I see it the capability is what you need to test for MTU.
> So the existing stuff in drivers/net/xen-netfront.c is wrong too?

Looks like it. But I don't really know what are the real constraints for MTU.
What I know is that SG even if turned on needs not be used (and currently
it's not e.g. if checksum offload is disabled). So MTU setting should not
depend on SG offload state but on some capability.

Best Regards,
Michał Mirosław

^ permalink raw reply

* [patch net-next-2.6] bonding: move processing of recv handlers into handle_frame()
From: Jiri Pirko @ 2011-04-19 13:48 UTC (permalink / raw)
  To: netdev
  Cc: davem, shemminger, kaber, fubar, eric.dumazet, nicolas.2p.debian,
	andy

Since now when bonding uses rx_handler, all traffic going into bond
device goes thru bond_handle_frame. So there's no need to go back into
bonding code later via ptype handlers. This patch converts
original ptype handlers into "bonding receive probes". These functions
are called from bond_handle_frame and they are registered per-mode.

Note that vlan packets are also handled because they are always untagged
thanks to vlan_untag()

Note that this also allows arpmon for eth-bond-bridge-vlan topology.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/bonding/bond_3ad.c   |   29 ++--------
 drivers/net/bonding/bond_3ad.h   |    4 +-
 drivers/net/bonding/bond_alb.c   |   46 ++++-----------
 drivers/net/bonding/bond_alb.h   |    1 -
 drivers/net/bonding/bond_main.c  |  121 +++++++------------------------------
 drivers/net/bonding/bond_sysfs.c |    6 --
 drivers/net/bonding/bonding.h    |    4 +-
 net/core/dev.c                   |   21 -------
 8 files changed, 43 insertions(+), 189 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 123dd60..d0981c2 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2465,35 +2465,16 @@ out:
 	return NETDEV_TX_OK;
 }
 
-int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype, struct net_device *orig_dev)
+void bond_3ad_lacpdu_recv(struct sk_buff *skb, struct bonding *bond,
+			  struct slave *slave)
 {
-	struct bonding *bond = netdev_priv(dev);
-	struct slave *slave = NULL;
-	int ret = NET_RX_DROP;
-
-	if (!(dev->flags & IFF_MASTER))
-		goto out;
-
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (!skb)
-		goto out;
+	if (skb->protocol != PKT_TYPE_LACPDU)
+		return;
 
 	if (!pskb_may_pull(skb, sizeof(struct lacpdu)))
-		goto out;
+		return;
 
 	read_lock(&bond->lock);
-	slave = bond_get_slave_by_dev(netdev_priv(dev), orig_dev);
-	if (!slave)
-		goto out_unlock;
-
 	bond_3ad_rx_indication((struct lacpdu *) skb->data, slave, skb->len);
-
-	ret = NET_RX_SUCCESS;
-
-out_unlock:
 	read_unlock(&bond->lock);
-out:
-	dev_kfree_skb(skb);
-
-	return ret;
 }
diff --git a/drivers/net/bonding/bond_3ad.h b/drivers/net/bonding/bond_3ad.h
index b28baff..291dbd4 100644
--- a/drivers/net/bonding/bond_3ad.h
+++ b/drivers/net/bonding/bond_3ad.h
@@ -258,7 +258,6 @@ struct ad_bond_info {
 				 * requested
 				 */
 	struct timer_list ad_timer;
-	struct packet_type ad_pkt_type;
 };
 
 struct ad_slave_info {
@@ -280,7 +279,8 @@ void bond_3ad_adapter_duplex_changed(struct slave *slave);
 void bond_3ad_handle_link_change(struct slave *slave, char link);
 int  bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info);
 int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev);
-int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype, struct net_device *orig_dev);
+void bond_3ad_lacpdu_recv(struct sk_buff *skb, struct bonding *bond,
+			  struct slave *slave);
 int bond_3ad_set_carrier(struct bonding *bond);
 #endif //__BOND_3AD_H__
 
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index ba71582..3b7b040 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -308,49 +308,33 @@ static void rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp)
 	_unlock_rx_hashtbl(bond);
 }
 
-static int rlb_arp_recv(struct sk_buff *skb, struct net_device *bond_dev, struct packet_type *ptype, struct net_device *orig_dev)
+static void rlb_arp_recv(struct sk_buff *skb, struct bonding *bond,
+			 struct slave *slave)
 {
-	struct bonding *bond;
-	struct arp_pkt *arp = (struct arp_pkt *)skb->data;
-	int res = NET_RX_DROP;
+	struct arp_pkt *arp;
 
-	while (bond_dev->priv_flags & IFF_802_1Q_VLAN)
-		bond_dev = vlan_dev_real_dev(bond_dev);
-
-	if (!(bond_dev->priv_flags & IFF_BONDING) ||
-	    !(bond_dev->flags & IFF_MASTER))
-		goto out;
+	if (skb->protocol != cpu_to_be16(ETH_P_ARP))
+		return;
 
+	arp = (struct arp_pkt *) skb->data;
 	if (!arp) {
 		pr_debug("Packet has no ARP data\n");
-		goto out;
+		return;
 	}
 
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (!skb)
-		goto out;
-
-	if (!pskb_may_pull(skb, arp_hdr_len(bond_dev)))
-		goto out;
+	if (!pskb_may_pull(skb, arp_hdr_len(bond->dev)))
+		return;
 
 	if (skb->len < sizeof(struct arp_pkt)) {
 		pr_debug("Packet is too small to be an ARP\n");
-		goto out;
+		return;
 	}
 
 	if (arp->op_code == htons(ARPOP_REPLY)) {
 		/* update rx hash table for this ARP */
-		bond = netdev_priv(bond_dev);
 		rlb_update_entry_from_arp(bond, arp);
 		pr_debug("Server received an ARP Reply from client\n");
 	}
-
-	res = NET_RX_SUCCESS;
-
-out:
-	dev_kfree_skb(skb);
-
-	return res;
 }
 
 /* Caller must hold bond lock for read */
@@ -759,7 +743,6 @@ static void rlb_init_table_entry(struct rlb_client_info *entry)
 static int rlb_initialize(struct bonding *bond)
 {
 	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
-	struct packet_type *pk_type = &(BOND_ALB_INFO(bond).rlb_pkt_type);
 	struct rlb_client_info	*new_hashtbl;
 	int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info);
 	int i;
@@ -784,13 +767,8 @@ static int rlb_initialize(struct bonding *bond)
 
 	_unlock_rx_hashtbl(bond);
 
-	/*initialize packet type*/
-	pk_type->type = cpu_to_be16(ETH_P_ARP);
-	pk_type->dev = bond->dev;
-	pk_type->func = rlb_arp_recv;
-
 	/* register to receive ARPs */
-	dev_add_pack(pk_type);
+	bond->recv_probe = rlb_arp_recv;
 
 	return 0;
 }
@@ -799,8 +777,6 @@ static void rlb_deinitialize(struct bonding *bond)
 {
 	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
 
-	dev_remove_pack(&(bond_info->rlb_pkt_type));
-
 	_lock_rx_hashtbl(bond);
 
 	kfree(bond_info->rx_hashtbl);
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 8ca7158..90f140a 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -129,7 +129,6 @@ struct alb_bond_info {
 	int			lp_counter;
 	/* -------- rlb parameters -------- */
 	int rlb_enabled;
-	struct packet_type	rlb_pkt_type;
 	struct rlb_client_info	*rx_hashtbl;	/* Receive hash table */
 	spinlock_t		rx_hashtbl_lock;
 	u32			rx_hashtbl_head;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 4ce14bd..66d9dc6 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1439,27 +1439,17 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
 }
 
 /* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
+ * duplicates except for alb non-mcast/bcast.
  */
 static bool bond_should_deliver_exact_match(struct sk_buff *skb,
 					    struct slave *slave,
 					    struct bonding *bond)
 {
 	if (bond_is_slave_inactive(slave)) {
-		if (slave_do_arp_validate(bond, slave) &&
-		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-			return false;
-
 		if (bond->params.mode == BOND_MODE_ALB &&
 		    skb->pkt_type != PACKET_BROADCAST &&
 		    skb->pkt_type != PACKET_MULTICAST)
-				return false;
-
-		if (bond->params.mode == BOND_MODE_8023AD &&
-		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
 			return false;
-
 		return true;
 	}
 	return false;
@@ -1483,6 +1473,15 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
 	if (bond->params.arp_interval)
 		slave->dev->last_rx = jiffies;
 
+	if (bond->recv_probe) {
+		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+		if (likely(nskb)) {
+			bond->recv_probe(nskb, bond, slave);
+			dev_kfree_skb(nskb);
+		}
+	}
+
 	if (bond_should_deliver_exact_match(skb, slave, bond)) {
 		return RX_HANDLER_EXACT;
 	}
@@ -2743,48 +2742,26 @@ static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32
 	}
 }
 
-static int bond_arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+static void bond_arp_rcv(struct sk_buff *skb, struct bonding *bond,
+			 struct slave *slave)
 {
 	struct arphdr *arp;
-	struct slave *slave;
-	struct bonding *bond;
 	unsigned char *arp_ptr;
 	__be32 sip, tip;
 
-	if (dev->priv_flags & IFF_802_1Q_VLAN) {
-		/*
-		 * When using VLANS and bonding, dev and oriv_dev may be
-		 * incorrect if the physical interface supports VLAN
-		 * acceleration.  With this change ARP validation now
-		 * works for hosts only reachable on the VLAN interface.
-		 */
-		dev = vlan_dev_real_dev(dev);
-		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev),skb->skb_iif);
-	}
-
-	if (!(dev->priv_flags & IFF_BONDING) || !(dev->flags & IFF_MASTER))
-		goto out;
+	if (skb->protocol != __cpu_to_be16(ETH_P_ARP))
+		return;
 
-	bond = netdev_priv(dev);
 	read_lock(&bond->lock);
 
-	pr_debug("bond_arp_rcv: bond %s skb->dev %s orig_dev %s\n",
-		 bond->dev->name, skb->dev ? skb->dev->name : "NULL",
-		 orig_dev ? orig_dev->name : "NULL");
+	pr_debug("bond_arp_rcv: bond %s skb->dev %s\n",
+		 bond->dev->name, skb->dev->name);
 
-	slave = bond_get_slave_by_dev(bond, orig_dev);
-	if (!slave || !slave_do_arp_validate(bond, slave))
-		goto out_unlock;
-
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (!skb)
-		goto out_unlock;
-
-	if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+	if (!pskb_may_pull(skb, arp_hdr_len(bond->dev)))
 		goto out_unlock;
 
 	arp = arp_hdr(skb);
-	if (arp->ar_hln != dev->addr_len ||
+	if (arp->ar_hln != bond->dev->addr_len ||
 	    skb->pkt_type == PACKET_OTHERHOST ||
 	    skb->pkt_type == PACKET_LOOPBACK ||
 	    arp->ar_hrd != htons(ARPHRD_ETHER) ||
@@ -2793,9 +2770,9 @@ static int bond_arp_rcv(struct sk_buff *skb, struct net_device *dev, struct pack
 		goto out_unlock;
 
 	arp_ptr = (unsigned char *)(arp + 1);
-	arp_ptr += dev->addr_len;
+	arp_ptr += bond->dev->addr_len;
 	memcpy(&sip, arp_ptr, 4);
-	arp_ptr += 4 + dev->addr_len;
+	arp_ptr += 4 + bond->dev->addr_len;
 	memcpy(&tip, arp_ptr, 4);
 
 	pr_debug("bond_arp_rcv: %s %s/%d av %d sv %d sip %pI4 tip %pI4\n",
@@ -2818,9 +2795,6 @@ static int bond_arp_rcv(struct sk_buff *skb, struct net_device *dev, struct pack
 
 out_unlock:
 	read_unlock(&bond->lock);
-out:
-	dev_kfree_skb(skb);
-	return NET_RX_SUCCESS;
 }
 
 /*
@@ -3407,48 +3381,6 @@ static struct notifier_block bond_inetaddr_notifier = {
 	.notifier_call = bond_inetaddr_event,
 };
 
-/*-------------------------- Packet type handling ---------------------------*/
-
-/* register to receive lacpdus on a bond */
-static void bond_register_lacpdu(struct bonding *bond)
-{
-	struct packet_type *pk_type = &(BOND_AD_INFO(bond).ad_pkt_type);
-
-	/* initialize packet type */
-	pk_type->type = PKT_TYPE_LACPDU;
-	pk_type->dev = bond->dev;
-	pk_type->func = bond_3ad_lacpdu_recv;
-
-	dev_add_pack(pk_type);
-}
-
-/* unregister to receive lacpdus on a bond */
-static void bond_unregister_lacpdu(struct bonding *bond)
-{
-	dev_remove_pack(&(BOND_AD_INFO(bond).ad_pkt_type));
-}
-
-void bond_register_arp(struct bonding *bond)
-{
-	struct packet_type *pt = &bond->arp_mon_pt;
-
-	if (pt->type)
-		return;
-
-	pt->type = htons(ETH_P_ARP);
-	pt->dev = bond->dev;
-	pt->func = bond_arp_rcv;
-	dev_add_pack(pt);
-}
-
-void bond_unregister_arp(struct bonding *bond)
-{
-	struct packet_type *pt = &bond->arp_mon_pt;
-
-	dev_remove_pack(pt);
-	pt->type = 0;
-}
-
 /*---------------------------- Hashing Policies -----------------------------*/
 
 /*
@@ -3542,14 +3474,14 @@ static int bond_open(struct net_device *bond_dev)
 
 		queue_delayed_work(bond->wq, &bond->arp_work, 0);
 		if (bond->params.arp_validate)
-			bond_register_arp(bond);
+			bond->recv_probe = bond_arp_rcv;
 	}
 
 	if (bond->params.mode == BOND_MODE_8023AD) {
 		INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
 		queue_delayed_work(bond->wq, &bond->ad_work, 0);
 		/* register to receive LACPDUs */
-		bond_register_lacpdu(bond);
+		bond->recv_probe = bond_3ad_lacpdu_recv;
 		bond_3ad_initiate_agg_selection(bond, 1);
 	}
 
@@ -3560,14 +3492,6 @@ static int bond_close(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 
-	if (bond->params.mode == BOND_MODE_8023AD) {
-		/* Unregister the receive of LACPDUs */
-		bond_unregister_lacpdu(bond);
-	}
-
-	if (bond->params.arp_validate)
-		bond_unregister_arp(bond);
-
 	write_lock_bh(&bond->lock);
 
 	/* signal timers not to re-arm */
@@ -3604,6 +3528,7 @@ static int bond_close(struct net_device *bond_dev)
 		 */
 		bond_alb_deinitialize(bond);
 	}
+	bond->recv_probe = NULL;
 
 	return 0;
 }
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 259ff32..935406a 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -422,11 +422,6 @@ static ssize_t bonding_store_arp_validate(struct device *d,
 		bond->dev->name, arp_validate_tbl[new_value].modename,
 		new_value);
 
-	if (!bond->params.arp_validate && new_value)
-		bond_register_arp(bond);
-	else if (bond->params.arp_validate && !new_value)
-		bond_unregister_arp(bond);
-
 	bond->params.arp_validate = new_value;
 
 	return count;
@@ -923,7 +918,6 @@ static ssize_t bonding_store_miimon(struct device *d,
 				bond->dev->name);
 			bond->params.arp_interval = 0;
 			if (bond->params.arp_validate) {
-				bond_unregister_arp(bond);
 				bond->params.arp_validate =
 					BOND_ARP_VALIDATE_NONE;
 			}
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 6126c6a..85fb822 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -226,6 +226,8 @@ struct bonding {
 	struct   slave *primary_slave;
 	bool     force_primary;
 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
+	void     (*recv_probe)(struct sk_buff *, struct bonding *,
+			       struct slave *);
 	rwlock_t lock;
 	rwlock_t curr_slave_lock;
 	s8       kill_timers;
@@ -399,8 +401,6 @@ void bond_set_mode_ops(struct bonding *bond, int mode);
 int bond_parse_parm(const char *mode_arg, const struct bond_parm_tbl *tbl);
 void bond_select_active_slave(struct bonding *bond);
 void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
-void bond_register_arp(struct bonding *);
-void bond_unregister_arp(struct bonding *);
 void bond_create_debugfs(void);
 void bond_destroy_debugfs(void);
 void bond_debug_register(struct bonding *bond);
diff --git a/net/core/dev.c b/net/core/dev.c
index 3871bf6..f3aed72 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3076,25 +3076,6 @@ void netdev_rx_handler_unregister(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 
-static void vlan_on_bond_hook(struct sk_buff *skb)
-{
-	/*
-	 * Make sure ARP frames received on VLAN interfaces stacked on
-	 * bonding interfaces still make their way to any base bonding
-	 * device that may have registered for a specific ptype.
-	 */
-	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
-	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
-	    skb->protocol == htons(ETH_P_ARP)) {
-		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
-
-		if (!skb2)
-			return;
-		skb2->dev = vlan_dev_real_dev(skb->dev);
-		netif_rx(skb2);
-	}
-}
-
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
@@ -3190,8 +3171,6 @@ ncls:
 			goto out;
 	}
 
-	vlan_on_bond_hook(skb);
-
 	/* deliver only exact match when indicated */
 	null_or_dev = deliver_exact ? skb->dev : NULL;
 
-- 
1.7.4.2


^ permalink raw reply related

* Re: [patch net-next-2.6] bonding: move processing of recv handlers into handle_frame()
From: Eric Dumazet @ 2011-04-19 14:02 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, shemminger, kaber, fubar, nicolas.2p.debian, andy
In-Reply-To: <1303220896-9092-1-git-send-email-jpirko@redhat.com>

Le mardi 19 avril 2011 à 15:48 +0200, Jiri Pirko a écrit :
> Since now when bonding uses rx_handler, all traffic going into bond
> device goes thru bond_handle_frame. So there's no need to go back into
> bonding code later via ptype handlers. This patch converts
> original ptype handlers into "bonding receive probes". These functions
> are called from bond_handle_frame and they are registered per-mode.
> 
> Note that vlan packets are also handled because they are always untagged
> thanks to vlan_untag()
> 
> Note that this also allows arpmon for eth-bond-bridge-vlan topology.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>



>  			}
> diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
> index 6126c6a..85fb822 100644
> --- a/drivers/net/bonding/bonding.h
> +++ b/drivers/net/bonding/bonding.h
> @@ -226,6 +226,8 @@ struct bonding {
>  	struct   slave *primary_slave;
>  	bool     force_primary;
>  	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
> +	void     (*recv_probe)(struct sk_buff *, struct bonding *,
> +			       struct slave *);
>  	rwlock_t lock;
>  	rwlock_t curr_slave_lock;
>  	s8       kill_timers;


Any performance numbers ?

I am asking because recv_probe sits in a often dirtied cache line...

It would be nice to separate rx & tx path needs




^ permalink raw reply

* Re: [PATCH] net: s390: convert to hw_features
From: Frank Blaschka @ 2011-04-19 14:11 UTC (permalink / raw)
  To: Michał Mirosław; +Cc: netdev, linux-s390
In-Reply-To: <20110419104320.314AB13A66@rere.qmqm.pl>

On Tue, Apr 19, 2011 at 12:43:20PM +0200, Michał Mirosław wrote:
> options.large_send was easy to get rid of. options.checksum_type has deeper
> roots so is left for later cleanup.
> 
> Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> ---

Hi Micha,

thx for the patch. There are some small issues I will fix up.
I will add the patch to my queue. It might be a good time to
get rid of the large_send and checksum sysfs attribute as well.
I will work on that too, thx again ...

Frank

^ permalink raw reply

* RE: [PATCH net-next 4/5] be2net: pass domain id to be_cmd_link_status_query
From: Ajit.Khaparde @ 2011-04-19 14:23 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <20110418.225659.112612060.davem@davemloft.net>

> From: David Miller [davem@davemloft.net]
> Sent: Tuesday, April 19, 2011 12:56 AM
> To: Khaparde, Ajit
> Cc: netdev@vger.kernel.org
> Subject: Re: [PATCH net-next 4/5] be2net: pass domain id to be_cmd_link_status_query

> From: Ajit Khaparde <ajit.khaparde@emulex.com>
> Date: Mon, 18 Apr 2011 17:30:33 -0500

>
> Signed-off-by: Ajit Khaparde <ajit.khaparde@emulex.com>

> That's not all this patch does:
Oh.. sorry about that. I did not realize this. I had worked on these patches over a couple of days
and on one of those days, I seem to have mixed the patches. Will respin these again.


> It seems to be also converting the driver over to the new VLAN
> interfaces.

> Please seperate this part into a seperate patch and resubmit your
> patch series.

> Thanks.

^ permalink raw reply

* [PATCH 2/3] IPVS: combine consecutive #ifdef CONFIG_PROC_FS blocks
From: kaber @ 2011-04-19 14:31 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1303223492-13549-1-git-send-email-kaber@trash.net>

From: Simon Horman <horms@verge.net.au>

Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_ctl.c |    3 ---
 1 files changed, 0 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 33733c8..36f4495 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1984,9 +1984,6 @@ static const struct file_operations ip_vs_info_fops = {
 	.release = seq_release_private,
 };
 
-#endif
-
-#ifdef CONFIG_PROC_FS
 static int ip_vs_stats_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq_file_single_net(seq);
-- 
1.7.4.4


^ permalink raw reply related

* [PATCH 3/3] netfilter: ipset: SCTP, UDPLITE support added
From: kaber @ 2011-04-19 14:31 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1303223492-13549-1-git-send-email-kaber@trash.net>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

SCTP and UDPLITE port support added to the hash:*port* set types.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set_getport.h |    2 ++
 net/netfilter/ipset/ip_set_getport.c           |   16 +++++++++++++++-
 net/netfilter/ipset/ip_set_hash_ipport.c       |    2 +-
 net/netfilter/ipset/ip_set_hash_ipportip.c     |    2 +-
 net/netfilter/ipset/ip_set_hash_ipportnet.c    |    2 +-
 net/netfilter/ipset/ip_set_hash_netport.c      |    2 +-
 6 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set_getport.h b/include/linux/netfilter/ipset/ip_set_getport.h
index 5aebd17..90d0930 100644
--- a/include/linux/netfilter/ipset/ip_set_getport.h
+++ b/include/linux/netfilter/ipset/ip_set_getport.h
@@ -22,7 +22,9 @@ static inline bool ip_set_proto_with_ports(u8 proto)
 {
 	switch (proto) {
 	case IPPROTO_TCP:
+	case IPPROTO_SCTP:
 	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE:
 		return true;
 	}
 	return false;
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
index 8d52272..757143b 100644
--- a/net/netfilter/ipset/ip_set_getport.c
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -11,6 +11,7 @@
 #include <linux/skbuff.h>
 #include <linux/icmp.h>
 #include <linux/icmpv6.h>
+#include <linux/sctp.h>
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
@@ -35,7 +36,20 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
 		*port = src ? th->source : th->dest;
 		break;
 	}
-	case IPPROTO_UDP: {
+	case IPPROTO_SCTP: {
+		sctp_sctphdr_t _sh;
+		const sctp_sctphdr_t *sh;
+
+		sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh);
+		if (sh == NULL)
+			/* No choice either */
+			return false;
+
+		*port = src ? sh->source : sh->dest;
+		break;
+	}
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE: {
 		struct udphdr _udph;
 		const struct udphdr *uh;
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index b921414..14281b6 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -491,7 +491,7 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT,
 	.dimension	= IPSET_DIM_TWO,
 	.family		= AF_UNSPEC,
-	.revision	= 0,
+	.revision	= 1,
 	.create		= hash_ipport_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 4642872..401c8a2 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -509,7 +509,7 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
 	.dimension	= IPSET_DIM_THREE,
 	.family		= AF_UNSPEC,
-	.revision	= 0,
+	.revision	= 1,
 	.create		= hash_ipportip_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 2cb84a5..4743e54 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -574,7 +574,7 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
 	.dimension	= IPSET_DIM_THREE,
 	.family		= AF_UNSPEC,
-	.revision	= 0,
+	.revision	= 1,
 	.create		= hash_ipportnet_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 8598676..d2a4036 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -526,7 +526,7 @@ static struct ip_set_type hash_netport_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT,
 	.dimension	= IPSET_DIM_TWO,
 	.family		= AF_UNSPEC,
-	.revision	= 0,
+	.revision	= 1,
 	.create		= hash_netport_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
-- 
1.7.4.4


^ permalink raw reply related

* [PATCH 1/3] netfilter: get rid of atomic ops in fast path
From: kaber @ 2011-04-19 14:31 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1303223492-13549-1-git-send-email-kaber@trash.net>

From: Eric Dumazet <eric.dumazet@gmail.com>

We currently use a percpu spinlock to 'protect' rule bytes/packets
counters, after various attempts to use RCU instead.

Lately we added a seqlock so that get_counters() can run without
blocking BH or 'writers'. But we really only need the seqcount in it.

Spinlock itself is only locked by the current/owner cpu, so we can
remove it completely.

This cleanups api, using correct 'writer' vs 'reader' semantic.

At replace time, the get_counters() call makes sure all cpus are done
using the old table.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/x_tables.h |   96 ++++++++++++++++--------------------
 net/ipv4/netfilter/arp_tables.c    |   18 ++++---
 net/ipv4/netfilter/ip_tables.c     |   28 +++++------
 net/ipv6/netfilter/ip6_tables.c    |   19 +++++---
 net/netfilter/x_tables.c           |    9 +--
 5 files changed, 80 insertions(+), 90 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 3721952..32cddf7 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -456,72 +456,60 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
 
-/*
- * Per-CPU spinlock associated with per-cpu table entries, and
- * with a counter for the "reading" side that allows a recursive
- * reader to avoid taking the lock and deadlocking.
- *
- * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu.
- * It needs to ensure that the rules are not being changed while the packet
- * is being processed. In some cases, the read lock will be acquired
- * twice on the same CPU; this is okay because of the count.
- *
- * "writing" is used when reading counters.
- *  During replace any readers that are using the old tables have to complete
- *  before freeing the old table. This is handled by the write locking
- *  necessary for reading the counters.
+/**
+ * xt_recseq - recursive seqcount for netfilter use
+ * 
+ * Packet processing changes the seqcount only if no recursion happened
+ * get_counters() can use read_seqcount_begin()/read_seqcount_retry(),
+ * because we use the normal seqcount convention :
+ * Low order bit set to 1 if a writer is active.
  */
-struct xt_info_lock {
-	seqlock_t lock;
-	unsigned char readers;
-};
-DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
+DECLARE_PER_CPU(seqcount_t, xt_recseq);
 
-/*
- * Note: we need to ensure that preemption is disabled before acquiring
- * the per-cpu-variable, so we do it as a two step process rather than
- * using "spin_lock_bh()".
- *
- * We _also_ need to disable bottom half processing before updating our
- * nesting count, to make sure that the only kind of re-entrancy is this
- * code being called by itself: since the count+lock is not an atomic
- * operation, we can allow no races.
+/**
+ * xt_write_recseq_begin - start of a write section
  *
- * _Only_ that special combination of being per-cpu and never getting
- * re-entered asynchronously means that the count is safe.
+ * Begin packet processing : all readers must wait the end
+ * 1) Must be called with preemption disabled
+ * 2) softirqs must be disabled too (or we should use irqsafe_cpu_add())
+ * Returns :
+ *  1 if no recursion on this cpu
+ *  0 if recursion detected
  */
-static inline void xt_info_rdlock_bh(void)
+static inline unsigned int xt_write_recseq_begin(void)
 {
-	struct xt_info_lock *lock;
+	unsigned int addend;
 
-	local_bh_disable();
-	lock = &__get_cpu_var(xt_info_locks);
-	if (likely(!lock->readers++))
-		write_seqlock(&lock->lock);
-}
+	/*
+	 * Low order bit of sequence is set if we already
+	 * called xt_write_recseq_begin().
+	 */
+	addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1;
 
-static inline void xt_info_rdunlock_bh(void)
-{
-	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
+	/*
+	 * This is kind of a write_seqcount_begin(), but addend is 0 or 1
+	 * We dont check addend value to avoid a test and conditional jump,
+	 * since addend is most likely 1
+	 */
+	__this_cpu_add(xt_recseq.sequence, addend);
+	smp_wmb();
 
-	if (likely(!--lock->readers))
-		write_sequnlock(&lock->lock);
-	local_bh_enable();
+	return addend;
 }
 
-/*
- * The "writer" side needs to get exclusive access to the lock,
- * regardless of readers.  This must be called with bottom half
- * processing (and thus also preemption) disabled.
+/**
+ * xt_write_recseq_end - end of a write section
+ * @addend: return value from previous xt_write_recseq_begin()
+ *
+ * End packet processing : all readers can proceed
+ * 1) Must be called with preemption disabled
+ * 2) softirqs must be disabled too (or we should use irqsafe_cpu_add())
  */
-static inline void xt_info_wrlock(unsigned int cpu)
-{
-	write_seqlock(&per_cpu(xt_info_locks, cpu).lock);
-}
-
-static inline void xt_info_wrunlock(unsigned int cpu)
+static inline void xt_write_recseq_end(unsigned int addend)
 {
-	write_sequnlock(&per_cpu(xt_info_locks, cpu).lock);
+	/* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
+	smp_wmb();
+	__this_cpu_add(xt_recseq.sequence, addend);
 }
 
 /*
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 4b5d457..2ea7433 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -260,6 +260,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	void *table_base;
 	const struct xt_table_info *private;
 	struct xt_action_param acpar;
+	unsigned int addend;
 
 	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
 		return NF_DROP;
@@ -267,7 +268,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	xt_info_rdlock_bh();
+	local_bh_disable();
+	addend = xt_write_recseq_begin();
 	private = table->private;
 	table_base = private->entries[smp_processor_id()];
 
@@ -338,7 +340,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			/* Verdict */
 			break;
 	} while (!acpar.hotdrop);
-	xt_info_rdunlock_bh();
+	xt_write_recseq_end(addend);
+	local_bh_enable();
 
 	if (acpar.hotdrop)
 		return NF_DROP;
@@ -712,7 +715,7 @@ static void get_counters(const struct xt_table_info *t,
 	unsigned int i;
 
 	for_each_possible_cpu(cpu) {
-		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+		seqcount_t *s = &per_cpu(xt_recseq, cpu);
 
 		i = 0;
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
@@ -720,10 +723,10 @@ static void get_counters(const struct xt_table_info *t,
 			unsigned int start;
 
 			do {
-				start = read_seqbegin(lock);
+				start = read_seqcount_begin(s);
 				bcnt = iter->counters.bcnt;
 				pcnt = iter->counters.pcnt;
-			} while (read_seqretry(lock, start));
+			} while (read_seqcount_retry(s, start));
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
@@ -1115,6 +1118,7 @@ static int do_add_counters(struct net *net, const void __user *user,
 	int ret = 0;
 	void *loc_cpu_entry;
 	struct arpt_entry *iter;
+	unsigned int addend;
 #ifdef CONFIG_COMPAT
 	struct compat_xt_counters_info compat_tmp;
 
@@ -1171,12 +1175,12 @@ static int do_add_counters(struct net *net, const void __user *user,
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
 	loc_cpu_entry = private->entries[curcpu];
-	xt_info_wrlock(curcpu);
+	addend = xt_write_recseq_begin();
 	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
 		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
-	xt_info_wrunlock(curcpu);
+	xt_write_recseq_end(addend);
  unlock_up_free:
 	local_bh_enable();
 	xt_table_unlock(t);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index ffcea0d..2b6b700 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -68,15 +68,6 @@ void *ipt_alloc_initial_table(const struct xt_table *info)
 }
 EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
 
-/*
-   We keep a set of rules for each CPU, so we can avoid write-locking
-   them in the softirq when updating the counters and therefore
-   only need to read-lock in the softirq; doing a write_lock_bh() in user
-   context stops packets coming through and allows user context to read
-   the counters or update the rules.
-
-   Hence the start of any table is given by get_table() below.  */
-
 /* Returns whether matches rule or not. */
 /* Performance critical - called for every packet */
 static inline bool
@@ -311,6 +302,7 @@ ipt_do_table(struct sk_buff *skb,
 	unsigned int *stackptr, origptr, cpu;
 	const struct xt_table_info *private;
 	struct xt_action_param acpar;
+	unsigned int addend;
 
 	/* Initialization */
 	ip = ip_hdr(skb);
@@ -331,7 +323,8 @@ ipt_do_table(struct sk_buff *skb,
 	acpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	xt_info_rdlock_bh();
+	local_bh_disable();
+	addend = xt_write_recseq_begin();
 	private = table->private;
 	cpu        = smp_processor_id();
 	table_base = private->entries[cpu];
@@ -430,7 +423,9 @@ ipt_do_table(struct sk_buff *skb,
 	pr_debug("Exiting %s; resetting sp from %u to %u\n",
 		 __func__, *stackptr, origptr);
 	*stackptr = origptr;
-	xt_info_rdunlock_bh();
+ 	xt_write_recseq_end(addend);
+ 	local_bh_enable();
+
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
 #else
@@ -886,7 +881,7 @@ get_counters(const struct xt_table_info *t,
 	unsigned int i;
 
 	for_each_possible_cpu(cpu) {
-		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+		seqcount_t *s = &per_cpu(xt_recseq, cpu);
 
 		i = 0;
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
@@ -894,10 +889,10 @@ get_counters(const struct xt_table_info *t,
 			unsigned int start;
 
 			do {
-				start = read_seqbegin(lock);
+				start = read_seqcount_begin(s);
 				bcnt = iter->counters.bcnt;
 				pcnt = iter->counters.pcnt;
-			} while (read_seqretry(lock, start));
+			} while (read_seqcount_retry(s, start));
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i; /* macro does multi eval of i */
@@ -1312,6 +1307,7 @@ do_add_counters(struct net *net, const void __user *user,
 	int ret = 0;
 	void *loc_cpu_entry;
 	struct ipt_entry *iter;
+	unsigned int addend;
 #ifdef CONFIG_COMPAT
 	struct compat_xt_counters_info compat_tmp;
 
@@ -1368,12 +1364,12 @@ do_add_counters(struct net *net, const void __user *user,
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
 	loc_cpu_entry = private->entries[curcpu];
-	xt_info_wrlock(curcpu);
+	addend = xt_write_recseq_begin();
 	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
 		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
-	xt_info_wrunlock(curcpu);
+	xt_write_recseq_end(addend);
  unlock_up_free:
 	local_bh_enable();
 	xt_table_unlock(t);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 0b2af9b..ec7cf57 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -340,6 +340,7 @@ ip6t_do_table(struct sk_buff *skb,
 	unsigned int *stackptr, origptr, cpu;
 	const struct xt_table_info *private;
 	struct xt_action_param acpar;
+	unsigned int addend;
 
 	/* Initialization */
 	indev = in ? in->name : nulldevname;
@@ -358,7 +359,8 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	xt_info_rdlock_bh();
+	local_bh_disable();
+	addend = xt_write_recseq_begin();
 	private = table->private;
 	cpu        = smp_processor_id();
 	table_base = private->entries[cpu];
@@ -442,7 +444,9 @@ ip6t_do_table(struct sk_buff *skb,
 	} while (!acpar.hotdrop);
 
 	*stackptr = origptr;
-	xt_info_rdunlock_bh();
+
+ 	xt_write_recseq_end(addend);
+ 	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -899,7 +903,7 @@ get_counters(const struct xt_table_info *t,
 	unsigned int i;
 
 	for_each_possible_cpu(cpu) {
-		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+		seqcount_t *s = &per_cpu(xt_recseq, cpu);
 
 		i = 0;
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
@@ -907,10 +911,10 @@ get_counters(const struct xt_table_info *t,
 			unsigned int start;
 
 			do {
-				start = read_seqbegin(lock);
+				start = read_seqcount_begin(s);
 				bcnt = iter->counters.bcnt;
 				pcnt = iter->counters.pcnt;
-			} while (read_seqretry(lock, start));
+			} while (read_seqcount_retry(s, start));
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
@@ -1325,6 +1329,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	int ret = 0;
 	const void *loc_cpu_entry;
 	struct ip6t_entry *iter;
+	unsigned int addend;
 #ifdef CONFIG_COMPAT
 	struct compat_xt_counters_info compat_tmp;
 
@@ -1381,13 +1386,13 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	i = 0;
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
-	xt_info_wrlock(curcpu);
+	addend = xt_write_recseq_begin();
 	loc_cpu_entry = private->entries[curcpu];
 	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
 		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
-	xt_info_wrunlock(curcpu);
+	xt_write_recseq_end(addend);
 
  unlock_up_free:
 	local_bh_enable();
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index a9adf4c..52959ef 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -762,8 +762,8 @@ void xt_compat_unlock(u_int8_t af)
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
-DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
-EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
+DEFINE_PER_CPU(seqcount_t, xt_recseq);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
 
 static int xt_jumpstack_alloc(struct xt_table_info *i)
 {
@@ -1362,10 +1362,7 @@ static int __init xt_init(void)
 	int rv;
 
 	for_each_possible_cpu(i) {
-		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
-
-		seqlock_init(&lock->lock);
-		lock->readers = 0;
+		seqcount_init(&per_cpu(xt_recseq, i));
 	}
 
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
-- 
1.7.4.4


^ permalink raw reply related

* [PATCH 0/3] netfilter: netfilter update
From: kaber @ 2011-04-19 14:31 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev

Hi Dave,

following is a small netfilter update for net-next, containing:

- a patch from Eric to remove some atomic ops in the *tables fastpath

- removal of an unnecessary ifdef from IPVS by Simon

- SCTP and UDPLITE support for ipset, from Jozsef

Please pull from:

git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git master

Thanks!

^ permalink raw reply

* [PATCH 2/3] netfilter: ipset: set match and SET target fixes
From: kaber @ 2011-04-19 14:51 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1303224705-17400-1-git-send-email-kaber@trash.net>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

The SET target with --del-set did not work due to using wrongly
the internal dimension of --add-set instead of --del-set.
Also, the checkentries did not release the set references when
returned an error. Bugs reported by Lennert Buytenhek.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_set.c |   18 ++++++++++++++++--
 1 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 061d48c..b3babae 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -81,6 +81,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par)
 	if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) {
 		pr_warning("Protocol error: set match dimension "
 			   "is over the limit!\n");
+		ip_set_nfnl_put(info->match_set.index);
 		return -ERANGE;
 	}
 
@@ -135,6 +136,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
 		if (index == IPSET_INVALID_ID) {
 			pr_warning("Cannot find del_set index %u as target\n",
 				   info->del_set.index);
+			if (info->add_set.index != IPSET_INVALID_ID)
+				ip_set_nfnl_put(info->add_set.index);
 			return -ENOENT;
 		}
 	}
@@ -142,6 +145,10 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
 	    info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) {
 		pr_warning("Protocol error: SET target dimension "
 			   "is over the limit!\n");
+		if (info->add_set.index != IPSET_INVALID_ID)
+			ip_set_nfnl_put(info->add_set.index);
+		if (info->del_set.index != IPSET_INVALID_ID)
+			ip_set_nfnl_put(info->del_set.index);
 		return -ERANGE;
 	}
 
@@ -192,6 +199,7 @@ set_match_checkentry(const struct xt_mtchk_param *par)
 	if (info->match_set.dim > IPSET_DIM_MAX) {
 		pr_warning("Protocol error: set match dimension "
 			   "is over the limit!\n");
+		ip_set_nfnl_put(info->match_set.index);
 		return -ERANGE;
 	}
 
@@ -219,7 +227,7 @@ set_target(struct sk_buff *skb, const struct xt_action_param *par)
 	if (info->del_set.index != IPSET_INVALID_ID)
 		ip_set_del(info->del_set.index,
 			   skb, par->family,
-			   info->add_set.dim,
+			   info->del_set.dim,
 			   info->del_set.flags);
 
 	return XT_CONTINUE;
@@ -245,13 +253,19 @@ set_target_checkentry(const struct xt_tgchk_param *par)
 		if (index == IPSET_INVALID_ID) {
 			pr_warning("Cannot find del_set index %u as target\n",
 				   info->del_set.index);
+			if (info->add_set.index != IPSET_INVALID_ID)
+				ip_set_nfnl_put(info->add_set.index);
 			return -ENOENT;
 		}
 	}
 	if (info->add_set.dim > IPSET_DIM_MAX ||
-	    info->del_set.flags > IPSET_DIM_MAX) {
+	    info->del_set.dim > IPSET_DIM_MAX) {
 		pr_warning("Protocol error: SET target dimension "
 			   "is over the limit!\n");
+		if (info->add_set.index != IPSET_INVALID_ID)
+			ip_set_nfnl_put(info->add_set.index);
+		if (info->del_set.index != IPSET_INVALID_ID)
+			ip_set_nfnl_put(info->del_set.index);
 		return -ERANGE;
 	}
 
-- 
1.7.2.3


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox