Netdev List
 help / color / mirror / Atom feed
* [PATCH] net: tun: convert to hw_features
From: Michał Mirosław @ 2011-04-19 16:13 UTC (permalink / raw)
  To: netdev; +Cc: Rusty Russell

This changes offload setting behaviour to what I think is correct:
 - offloads set via ethtool mean what admin wants to use (by default
   he wants 'em all)
 - offloads set via ioctl() mean what userspace is expecting to get
   (this limits which admin wishes are granted)
 - TUN_NOCHECKSUM is ignored, as it might cause broken packets when
   forwarded (ip_summed == CHECKSUM_UNNECESSARY means that checksum
   was verified, not that it can be ignored)

If TUN_NOCHECKSUM is implemented, it should set skb->csum_* and
skb->ip_summed (= CHECKSUM_PARTIAL) for known protocols and let others
be verified by kernel when necessary.

TUN_NOCHECKSUM handling was introduced by commit
f43798c27684ab925adde7d8acc34c78c6e50df8:

    tun: Allow GSO using virtio_net_hdr
    
Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
---
 drivers/net/tun.c |   63 +++++++++++++++++++++--------------------------------
 1 files changed, 25 insertions(+), 38 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index f5e9ac0..ade3cf9 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -123,6 +123,9 @@ struct tun_struct {
 	gid_t			group;
 
 	struct net_device	*dev;
+	u32			set_features;
+#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
+			  NETIF_F_TSO6|NETIF_F_UFO)
 	struct fasync_struct	*fasync;
 
 	struct tap_filter       txflt;
@@ -451,12 +454,20 @@ tun_net_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
+static u32 tun_net_fix_features(struct net_device *dev, u32 features)
+{
+	struct tun_struct *tun = netdev_priv(dev);
+
+	return (features & tun->set_features) | (features & ~TUN_USER_FEATURES);
+}
+
 static const struct net_device_ops tun_netdev_ops = {
 	.ndo_uninit		= tun_net_uninit,
 	.ndo_open		= tun_net_open,
 	.ndo_stop		= tun_net_close,
 	.ndo_start_xmit		= tun_net_xmit,
 	.ndo_change_mtu		= tun_net_change_mtu,
+	.ndo_fix_features	= tun_net_fix_features,
 };
 
 static const struct net_device_ops tap_netdev_ops = {
@@ -465,6 +476,7 @@ static const struct net_device_ops tap_netdev_ops = {
 	.ndo_stop		= tun_net_close,
 	.ndo_start_xmit		= tun_net_xmit,
 	.ndo_change_mtu		= tun_net_change_mtu,
+	.ndo_fix_features	= tun_net_fix_features,
 	.ndo_set_multicast_list	= tun_net_mclist,
 	.ndo_set_mac_address	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
@@ -628,8 +640,7 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun,
 			kfree_skb(skb);
 			return -EINVAL;
 		}
-	} else if (tun->flags & TUN_NOCHECKSUM)
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
 
 	switch (tun->flags & TUN_TYPE_MASK) {
 	case TUN_TUN_DEV:
@@ -1094,6 +1105,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 				goto err_free_sk;
 		}
 
+		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
+			TUN_USER_FEATURES;
+		dev->features = dev->hw_features;
+
 		err = register_netdevice(tun->dev);
 		if (err < 0)
 			goto err_free_sk;
@@ -1158,18 +1173,12 @@ static int tun_get_iff(struct net *net, struct tun_struct *tun,
 
 /* This is like a cut-down ethtool ops, except done via tun fd so no
  * privs required. */
-static int set_offload(struct net_device *dev, unsigned long arg)
+static int set_offload(struct tun_struct *tun, unsigned long arg)
 {
-	u32 old_features, features;
-
-	old_features = dev->features;
-	/* Unset features, set them as we chew on the arg. */
-	features = (old_features & ~(NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST
-				    |NETIF_F_TSO_ECN|NETIF_F_TSO|NETIF_F_TSO6
-				    |NETIF_F_UFO));
+	u32 features = 0;
 
 	if (arg & TUN_F_CSUM) {
-		features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
+		features |= NETIF_F_HW_CSUM;
 		arg &= ~TUN_F_CSUM;
 
 		if (arg & (TUN_F_TSO4|TUN_F_TSO6)) {
@@ -1195,9 +1204,8 @@ static int set_offload(struct net_device *dev, unsigned long arg)
 	if (arg)
 		return -EINVAL;
 
-	dev->features = features;
-	if (old_features != dev->features)
-		netdev_features_change(dev);
+	tun->set_features = features;
+	netdev_update_features(tun->dev);
 
 	return 0;
 }
@@ -1262,12 +1270,9 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 
 	case TUNSETNOCSUM:
 		/* Disable/Enable checksum */
-		if (arg)
-			tun->flags |= TUN_NOCHECKSUM;
-		else
-			tun->flags &= ~TUN_NOCHECKSUM;
 
-		tun_debug(KERN_INFO, tun, "checksum %s\n",
+		/* [unimplemented] */
+		tun_debug(KERN_INFO, tun, "ignored: set checksum %s\n",
 			  arg ? "disabled" : "enabled");
 		break;
 
@@ -1316,7 +1321,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		break;
 #endif
 	case TUNSETOFFLOAD:
-		ret = set_offload(tun->dev, arg);
+		ret = set_offload(tun, arg);
 		break;
 
 	case TUNSETTXFILTER:
@@ -1595,30 +1600,12 @@ static void tun_set_msglevel(struct net_device *dev, u32 value)
 #endif
 }
 
-static u32 tun_get_rx_csum(struct net_device *dev)
-{
-	struct tun_struct *tun = netdev_priv(dev);
-	return (tun->flags & TUN_NOCHECKSUM) == 0;
-}
-
-static int tun_set_rx_csum(struct net_device *dev, u32 data)
-{
-	struct tun_struct *tun = netdev_priv(dev);
-	if (data)
-		tun->flags &= ~TUN_NOCHECKSUM;
-	else
-		tun->flags |= TUN_NOCHECKSUM;
-	return 0;
-}
-
 static const struct ethtool_ops tun_ethtool_ops = {
 	.get_settings	= tun_get_settings,
 	.get_drvinfo	= tun_get_drvinfo,
 	.get_msglevel	= tun_get_msglevel,
 	.set_msglevel	= tun_set_msglevel,
 	.get_link	= ethtool_op_get_link,
-	.get_rx_csum	= tun_get_rx_csum,
-	.set_rx_csum	= tun_set_rx_csum
 };
 
 
-- 
1.7.2.5


^ permalink raw reply related

* [BUG] bnx2x: bnx2x_set_pbd_csum() random accesses
From: Eric Dumazet @ 2011-04-19 16:11 UTC (permalink / raw)
  To: Dmitry Kravkov, Eilon Greenstein; +Cc: netdev

Hi guys

bnx2x_set_pbd_csum() / bnx2x_set_pbd_csum_e2() seem to read
tcp_hdrlen(skb) even for non TCP frames ?

Also, (skb_network_header(skb) - skb->data) is signed, so 
	(skb_network_header(skb) - skb->data) / 2 is a bit expensive...

Thanks



^ permalink raw reply

* Re: Hight speed data sending from custom IP out of kernel
From: juice @ 2011-04-19 16:02 UTC (permalink / raw)
  To: monstr, netdev
In-Reply-To: <4DAD76F1.40309@monstr.eu>


Hi!

I can see you are probably going to run into CPU performance problems, but
it depends a lot on the type of traffic you are going to send.

My system requires quite fast processor, but even more important is to
have a network interface card that really supports the full speed of
gigabit ethernet line. The reason for that is that my test traffic
includes streams of very small packets that cause a lot of overhead in
processing.

Most of my test traffic is UDP, but it does not really matter what the
higher layers of the traffic are, this scheme operates on the ethernet
layer and does not care about payload structure.

I tried several NIC:s before i settled using Intel 82576 cards with the
igb driver. If you have less capable interface card, your small packet
performance is going to be a lot poorer.

Using that card I can get to full speed GE line rate even with 64byte
packets, but if you want to send larger packets, say close to 1500byte
then almost any NIC will work OK for you.

You can download the module code and the userland seeding application from
my svn server at https://toosa.swagman.org/svn/streamgen
The streamseed userland application requires libpcap-dev to build
correctly but the streamgen module is self-sufficent.

There is not a lot of documentation, and the module is still "work in
progress" as I am going to fix it to work with more than one interface at
the same time when I get to do it. Currently it can only use one interface
on the sending host machine.

  - Juice -


> Hi Juice,
>
> juice wrote:
>> Hi Michal.
>>
>> How fast do you need to send the data?
>
> It sounds weird but as fast as possible. There is no specific limit
> because I
> want to create demo and test it on various hw configuration which I can
> easily
> create on FPGA. For now the bottleneck is Microblaze cpu. It can run from
> 50MHz
> till 170-180MHz. We also support both endians and have two hw IP
> cores(10/100/1000) which I can use.
>
>> I have an application where I send test stream out to GE line and can
>> fill
>> the total capacity of the ethernet regardless of the packet size.
>
> What cpu do you use?
>
>>
>> The test stream I am sending is stored in kernel memory, and therefore
>> is
>> limited by the amount of free memory. 200M is no problem.
>
> Is it UDP or TCP?
>
>>
>> The solution I am using is loosely based on the pktgen module, except
>> that
>> my module can load a wireshark capture from userland program and then
>> send
>> it from ethernet interface in wire speed.
>
> Sound good. Would it be possible to see it and test it?
>
> Thanks,
> Michal
>
>
>>
>>   - Juice -
>>
>>
>>> Hi,
>>> I would like to create demo for high speed data sending from custom IP
>> through
>>> the ethernet. I think the best description is that there are dmaable
>>> memory
>>> mapped registers or just memory which store data I want to send (for
>> example 200MB).
>>> Linux should handle all communication between target(probably server)
>> and
>>> host
>>> (client) but data in the packets should go from that custom IP and
>>> can't go
>>> through the kernel because of performance issue.
>>> Ethernet core have own DMA which I could use but the question is if
>> there
>>> is any
>>> option how to convince the kernel that data will go directly from
>>> memory
>> mapped
>>> registers and the kernel/driver/... just setup dma BD for headers and
>> second for
>>> data.
>>> Do you have any experience with any solution with passing data
>> completely
>>> out of
>>> kernel?
>>> Thanks,
>>> Michal
>>> --
>>> Michal Simek, Ing. (M.Eng)
>>> w: www.monstr.eu p: +42-0-721842854
>>> Maintainer of Linux kernel 2.6 Microblaze Linux -
>>> http://www.monstr.eu/fdt/
>>> Microblaze U-BOOT custodian
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe netdev" in
>>> the
>> body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>
>>
>>
>
>
> --
> Michal Simek, Ing. (M.Eng)
> w: www.monstr.eu p: +42-0-721842854
> Maintainer of Linux kernel 2.6 Microblaze Linux -
> http://www.monstr.eu/fdt/
> Microblaze U-BOOT custodian
>



^ permalink raw reply

* Re: [PATCH] net: xen-netback: convert to hw_features
From: Michał Mirosław @ 2011-04-19 15:25 UTC (permalink / raw)
  To: Ian Campbell; +Cc: netdev@vger.kernel.org, xen-devel@lists.xensource.com
In-Reply-To: <1303226148.5997.233.camel@zakaz.uk.xensource.com>

On Tue, Apr 19, 2011 at 04:15:48PM +0100, Ian Campbell wrote:
> On Tue, 2011-04-19 at 14:43 +0100, Michał Mirosław wrote:
> > On Tue, Apr 19, 2011 at 02:39:00PM +0100, Ian Campbell wrote:
> > > On Tue, 2011-04-19 at 14:30 +0100, Michał Mirosław wrote:
> > > > On Tue, Apr 19, 2011 at 02:17:53PM +0100, Ian Campbell wrote:
> > > > > I fixed it with the following, I also moved the !can_sg MTU clamping
> > > > > into a set_features hook (like we do with netfront). Am I right that
> > > > > this pattern copes with changes to SG via ethtool etc better? I think
> > > > > it's more future proof in any case.
> > > > This looks wrong. Even if SG is turned on, you might get big skbs which
> > > > are linearized. There is a difference in SG capability and SG offload
> > > > status and as I see it the capability is what you need to test for MTU.
> > > So the existing stuff in drivers/net/xen-netfront.c is wrong too?
> > Looks like it. But I don't really know what are the real constraints for MTU.
> > What I know is that SG even if turned on needs not be used (and currently
> > it's not e.g. if checksum offload is disabled).
> The interesting case is the opposite one, isn't it? IOW if NETIF_F_SG is
> disabled but the frontend/backend agree that they have the capability to
> handle >PAGE_SIZE skbs

Then the driver might get bigger skbs but they won't ever be fragmented.

> In my experience, the normal reason for disabling the NETIF_F_SG offload
> status is that the underlying capability is somehow buggy, otherwise is
> there any reason to turn it off?

Some features depend on others to function or on some hardware/software state.
Though in most cases the reason is the one you wrote (capability also includes
what driver has implemented).

Best Regards,
Michał Mirosław

^ permalink raw reply

* [PATCH 3/3] IPVS: init and cleanup restructuring.
From: Hans Schillstrom @ 2011-04-19 15:25 UTC (permalink / raw)
  To: horms, ja, ebiederm, lvs-devel, netdev, netfilter-devel
  Cc: hans.schillstrom, Hans Schillstrom
In-Reply-To: <1303226705-29178-1-git-send-email-hans@schillstrom.com>

This patch tries to restore the initial init and cleanup
sequences that was before name space patch.

The number of calls to register_pernet_device have been
reduced to one for the ip_vs.ko
Schedulers still have their own calls.

This patch adds a function __ip_vs_service_cleanup()
and a throttle or actually on/off switch for
the netfilter hooks.

The nf hooks will be enabled when the first service is loaded
and disabled when the last service is removed or when a
name space exit starts.

Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
---
 include/net/ip_vs.h              |   17 +++++++
 net/netfilter/ipvs/ip_vs_app.c   |   15 +-----
 net/netfilter/ipvs/ip_vs_conn.c  |   20 ++++----
 net/netfilter/ipvs/ip_vs_core.c  |   86 ++++++++++++++++++++++++++++++++------
 net/netfilter/ipvs/ip_vs_ctl.c   |   66 ++++++++++++++++++++++-------
 net/netfilter/ipvs/ip_vs_est.c   |   14 +-----
 net/netfilter/ipvs/ip_vs_proto.c |   11 +----
 net/netfilter/ipvs/ip_vs_sync.c  |   13 +----
 8 files changed, 161 insertions(+), 81 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index d516f00..558e490 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -791,6 +791,7 @@ struct ip_vs_app {
 /* IPVS in network namespace */
 struct netns_ipvs {
 	int			gen;		/* Generation */
+	int			throttle;	/* Instead of nf unreg */
 	/*
 	 *	Hash table: for real service lookups
 	 */
@@ -1089,6 +1090,22 @@ ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp)
 	atomic_inc(&ctl_cp->n_control);
 }

+/*
+ * IPVS netns init & cleanup functions
+ */
+extern int __ip_vs_estimator_init(struct net *net);
+extern int __ip_vs_control_init(struct net *net);
+extern int __ip_vs_protocol_init(struct net *net);
+extern int __ip_vs_app_init(struct net *net);
+extern int __ip_vs_conn_init(struct net *net);
+extern int __ip_vs_sync_init(struct net *net);
+extern void __ip_vs_conn_cleanup(struct net *net);
+extern void __ip_vs_app_cleanup(struct net *net);
+extern void __ip_vs_protocol_cleanup(struct net *net);
+extern void __ip_vs_control_cleanup(struct net *net);
+extern void __ip_vs_estimator_cleanup(struct net *net);
+extern void __ip_vs_sync_cleanup(struct net *net);
+extern void __ip_vs_service_cleanup(struct net *net);

 /*
  *      IPVS application functions
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 7e8e769..51f3af7 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -576,7 +576,7 @@ static const struct file_operations ip_vs_app_fops = {
 };
 #endif

-static int __net_init __ip_vs_app_init(struct net *net)
+int __net_init __ip_vs_app_init(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);

@@ -585,26 +585,17 @@ static int __net_init __ip_vs_app_init(struct net *net)
 	return 0;
 }

-static void __net_exit __ip_vs_app_cleanup(struct net *net)
+void __net_exit __ip_vs_app_cleanup(struct net *net)
 {
 	proc_net_remove(net, "ip_vs_app");
 }

-static struct pernet_operations ip_vs_app_ops = {
-	.init = __ip_vs_app_init,
-	.exit = __ip_vs_app_cleanup,
-};
-
 int __init ip_vs_app_init(void)
 {
-	int rv;
-
-	rv = register_pernet_device(&ip_vs_app_ops);
-	return rv;
+	return 0;
 }


 void ip_vs_app_cleanup(void)
 {
-	unregister_pernet_device(&ip_vs_app_ops);
 }
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 36cd5ea..f8d6702 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1251,30 +1251,30 @@ int __net_init __ip_vs_conn_init(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);

+	EnterFunction(2);
 	atomic_set(&ipvs->conn_count, 0);

 	proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
 	proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
+	LeaveFunction(2);
 	return 0;
 }

-static void __net_exit __ip_vs_conn_cleanup(struct net *net)
+void __net_exit __ip_vs_conn_cleanup(struct net *net)
 {
+	EnterFunction(2);
 	/* flush all the connection entries first */
 	ip_vs_conn_flush(net);
 	proc_net_remove(net, "ip_vs_conn");
 	proc_net_remove(net, "ip_vs_conn_sync");
+	LeaveFunction(2);
 }
-static struct pernet_operations ipvs_conn_ops = {
-	.init = __ip_vs_conn_init,
-	.exit = __ip_vs_conn_cleanup,
-};

 int __init ip_vs_conn_init(void)
 {
 	int idx;
-	int retc;

+	EnterFunction(2);
 	/* Compute size and mask */
 	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
 	ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;
@@ -1309,18 +1309,18 @@ int __init ip_vs_conn_init(void)
 		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
 	}

-	retc = register_pernet_device(&ipvs_conn_ops);
-
 	/* calculate the random value for connection hash */
 	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+	LeaveFunction(2);

-	return retc;
+	return 0;
 }

 void ip_vs_conn_cleanup(void)
 {
-	unregister_pernet_device(&ipvs_conn_ops);
+	EnterFunction(2);
 	/* Release the empty cache */
 	kmem_cache_destroy(ip_vs_conn_cachep);
 	vfree(ip_vs_conn_tab);
+	LeaveFunction(2);
 }
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index a7bb81d..dc27fdf 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1343,6 +1343,10 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
 		return NF_ACCEPT; /* The packet looks wrong, ignore */

 	net = skb_net(skb);
+	/* Name space in use ? */
+	if (net->ipvs->throttle)
+		return NF_ACCEPT;
+
 	pd = ip_vs_proto_data_get(net, cih->protocol);
 	if (!pd)
 		return NF_ACCEPT;
@@ -1563,6 +1567,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 		}

 	net = skb_net(skb);
+	if (net->ipvs->throttle)
+		return NF_ACCEPT;
 	/* Protocol supported? */
 	pd = ip_vs_proto_data_get(net, iph.protocol);
 	if (unlikely(!pd))
@@ -1588,7 +1594,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
 	}

 	IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
-	net = skb_net(skb);
 	ipvs = net_ipvs(net);
 	/* Check the server status */
 	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -1879,24 +1884,73 @@ static int __net_init __ip_vs_init(struct net *net)
 {
 	struct netns_ipvs *ipvs;

+	EnterFunction(2);
 	ipvs = net_generic(net, ip_vs_net_id);
 	if (ipvs == NULL) {
 		pr_err("%s(): no memory.\n", __func__);
 		return -ENOMEM;
 	}
+	/* Hold the beast until a service is registerd */
+	ipvs->throttle = -1;
 	ipvs->net = net;
 	/* Counters used for creating unique names */
 	ipvs->gen = atomic_read(&ipvs_netns_cnt);
 	atomic_inc(&ipvs_netns_cnt);
 	net->ipvs = ipvs;
+
+	if ( __ip_vs_estimator_init(net) < 0)
+		goto estimator_fail;
+
+	if (__ip_vs_control_init(net) < 0)
+		goto control_fail;
+
+	if (__ip_vs_protocol_init(net) < 0)
+		goto protocol_fail;
+
+	if (__ip_vs_app_init(net) < 0)
+		goto app_fail;
+
+	if (__ip_vs_conn_init(net) < 0)
+		goto conn_fail;
+
+	if (__ip_vs_sync_init(net) < 0)
+		goto sync_fail;
+
+	LeaveFunction(2);
 	printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n",
 			 sizeof(struct netns_ipvs), ipvs->gen);
 	return 0;
+/*
+ * Error handling
+ */
+
+sync_fail:
+	__ip_vs_conn_cleanup(net);
+conn_fail:
+	__ip_vs_app_cleanup(net);
+app_fail:
+	__ip_vs_protocol_cleanup(net);
+protocol_fail:
+	__ip_vs_control_cleanup(net);
+control_fail:
+	__ip_vs_estimator_cleanup(net);
+estimator_fail:
+	return -ENOMEM;
 }

 static void __net_exit __ip_vs_cleanup(struct net *net)
 {
-	IP_VS_DBG(10, "ipvs netns %d released\n", net_ipvs(net)->gen);
+	net->ipvs->throttle = -1;
+	EnterFunction(2);
+	__ip_vs_sync_cleanup(net);
+	__ip_vs_service_cleanup(net);	/* ip_vs_flush() with locks */
+	__ip_vs_conn_cleanup(net);
+	__ip_vs_app_cleanup(net);
+	__ip_vs_protocol_cleanup(net);
+	__ip_vs_control_cleanup(net);
+	__ip_vs_estimator_cleanup(net);
+	LeaveFunction(2);
+	IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
 }

 static struct pernet_operations ipvs_core_ops = {
@@ -1913,10 +1967,7 @@ static int __init ip_vs_init(void)
 {
 	int ret;

-	ret = register_pernet_device(&ipvs_core_ops);	/* Alloc ip_vs struct */
-	if (ret < 0)
-		return ret;
-
+	EnterFunction(2);
 	ip_vs_estimator_init();
 	ret = ip_vs_control_init();
 	if (ret < 0) {
@@ -1944,41 +1995,50 @@ static int __init ip_vs_init(void)
 		goto cleanup_conn;
 	}

+	ret = register_pernet_device(&ipvs_core_ops);	/* Alloc ip_vs struct */
+	if (ret < 0)
+		goto cleanup_sync;
+
 	ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
 	if (ret < 0) {
 		pr_err("can't register hooks.\n");
-		goto cleanup_sync;
+		goto cleanup_net;
 	}

 	pr_info("ipvs loaded.\n");
+	LeaveFunction(2);
+
 	return ret;

+cleanup_net:
+	unregister_pernet_device(&ipvs_core_ops);       /* free ip_vs struct */
 cleanup_sync:
 	ip_vs_sync_cleanup();
-  cleanup_conn:
+cleanup_conn:
 	ip_vs_conn_cleanup();
-  cleanup_app:
+cleanup_app:
 	ip_vs_app_cleanup();
-  cleanup_protocol:
+cleanup_protocol:
 	ip_vs_protocol_cleanup();
 	ip_vs_control_cleanup();
-  cleanup_estimator:
+cleanup_estimator:
 	ip_vs_estimator_cleanup();
-	unregister_pernet_device(&ipvs_core_ops);	/* free ip_vs struct */
 	return ret;
 }

 static void __exit ip_vs_cleanup(void)
 {
+	EnterFunction(2);
 	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
+	unregister_pernet_device(&ipvs_core_ops);	/* free ip_vs struct */
 	ip_vs_sync_cleanup();
 	ip_vs_conn_cleanup();
 	ip_vs_app_cleanup();
 	ip_vs_protocol_cleanup();
 	ip_vs_control_cleanup();
 	ip_vs_estimator_cleanup();
-	unregister_pernet_device(&ipvs_core_ops);	/* free ip_vs struct */
 	pr_info("ipvs unloaded.\n");
+	LeaveFunction(2);
 }

 module_init(ip_vs_init);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 08715d8..6534ca3 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -69,6 +69,11 @@ int ip_vs_get_debug_level(void)
 }
 #endif

+
+/*  Protos */
+static void __ip_vs_del_service(struct ip_vs_service *svc);
+
+
 #ifdef CONFIG_IP_VS_IPV6
 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
 static int __ip_vs_addr_is_local_v6(struct net *net,
@@ -345,6 +350,9 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)

 	svc->flags &= ~IP_VS_SVC_F_HASHED;
 	atomic_dec(&svc->refcnt);
+	/* No more services then no need for input */
+	if (atomic_read(&svc->refcnt) == 0)
+		svc->net->ipvs->throttle = -1;
 	return 1;
 }

@@ -480,7 +488,6 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)
 	}
 }

-
 /*
  *	Returns hash value for real service
  */
@@ -1214,6 +1221,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
 	write_unlock_bh(&__ip_vs_svc_lock);

 	*svc_p = svc;
+	/* Now whe have a service - full throttle */
+	ipvs->throttle = 0;
 	return 0;


@@ -1472,6 +1481,41 @@ static int ip_vs_flush(struct net *net)
 	return 0;
 }

+/*
+ *	Delete service by {netns} in the service table.
+ *	Called by __ip_vs_cleanup()
+ */
+void __ip_vs_service_cleanup(struct net *net)
+{
+	unsigned hash;
+	struct ip_vs_service *svc, *tmp;
+
+	EnterFunction(2);
+	/* Check for "full" addressed entries */
+	for (hash = 0; hash<IP_VS_SVC_TAB_SIZE; hash++) {
+		write_lock_bh(&__ip_vs_svc_lock);
+		list_for_each_entry_safe(svc, tmp, &ip_vs_svc_table[hash],
+					 s_list) {
+			if (net_eq(svc->net, net)) {
+				ip_vs_svc_unhash(svc);
+				/*  Wait until all the svc users go away. */
+				IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+				__ip_vs_del_service(svc);
+			}
+		}
+		list_for_each_entry_safe(svc, tmp, &ip_vs_svc_fwm_table[hash],
+					 f_list) {
+			if (net_eq(svc->net, net)) {
+				ip_vs_svc_unhash(svc);
+				/*  Wait until all the svc users go away. */
+				IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+				__ip_vs_del_service(svc);
+			}
+		}
+		write_unlock_bh(&__ip_vs_svc_lock);
+	}
+	LeaveFunction(2);
+}

 /*
  *	Zero counters in a service or all services
@@ -3593,6 +3637,7 @@ int __net_init __ip_vs_control_init(struct net *net)
 	int idx;
 	struct netns_ipvs *ipvs = net_ipvs(net);

+	EnterFunction(2);
 	ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);

 	/* Initialize rs_table */
@@ -3619,6 +3664,7 @@ int __net_init __ip_vs_control_init(struct net *net)
 	if (__ip_vs_control_init_sysctl(net))
 		goto err;

+	LeaveFunction(2);
 	return 0;

 err:
@@ -3626,10 +3672,11 @@ err:
 	return -ENOMEM;
 }

-static void __net_exit __ip_vs_control_cleanup(struct net *net)
+void __net_exit __ip_vs_control_cleanup(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);

+	EnterFunction(2);
 	ip_vs_trash_cleanup(net);
 	ip_vs_stop_estimator(net, &ipvs->tot_stats);
 	__ip_vs_control_cleanup_sysctl(net);
@@ -3637,13 +3684,9 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net)
 	proc_net_remove(net, "ip_vs_stats");
 	proc_net_remove(net, "ip_vs");
 	free_percpu(ipvs->tot_stats.cpustats);
+	LeaveFunction(2);
 }

-static struct pernet_operations ipvs_control_ops = {
-	.init = __ip_vs_control_init,
-	.exit = __ip_vs_control_cleanup,
-};
-
 int __init ip_vs_control_init(void)
 {
 	int idx;
@@ -3657,12 +3700,6 @@ int __init ip_vs_control_init(void)
 		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
 	}

-	ret = register_pernet_device(&ipvs_control_ops);
-	if (ret) {
-		pr_err("cannot register namespace.\n");
-		goto err;
-	}
-
 	smp_wmb();	/* Do we really need it now ? */

 	ret = nf_register_sockopt(&ip_vs_sockopts);
@@ -3682,8 +3719,6 @@ int __init ip_vs_control_init(void)
 	return 0;

 err_net:
-	unregister_pernet_device(&ipvs_control_ops);
-err:
 	return ret;
 }

@@ -3691,7 +3726,6 @@ err:
 void ip_vs_control_cleanup(void)
 {
 	EnterFunction(2);
-	unregister_pernet_device(&ipvs_control_ops);
 	ip_vs_genl_unregister();
 	nf_unregister_sockopt(&ip_vs_sockopts);
 	LeaveFunction(2);
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 759163e..508cce9 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -192,7 +192,7 @@ void ip_vs_read_estimator(struct ip_vs_stats_user *dst,
 	dst->outbps = (e->outbps + 0xF) >> 5;
 }

-static int __net_init __ip_vs_estimator_init(struct net *net)
+int __net_init __ip_vs_estimator_init(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);

@@ -203,24 +203,16 @@ static int __net_init __ip_vs_estimator_init(struct net *net)
 	return 0;
 }

-static void __net_exit __ip_vs_estimator_exit(struct net *net)
+void __net_exit __ip_vs_estimator_cleanup(struct net *net)
 {
 	del_timer_sync(&net_ipvs(net)->est_timer);
 }
-static struct pernet_operations ip_vs_app_ops = {
-	.init = __ip_vs_estimator_init,
-	.exit = __ip_vs_estimator_exit,
-};

 int __init ip_vs_estimator_init(void)
 {
-	int rv;
-
-	rv = register_pernet_device(&ip_vs_app_ops);
-	return rv;
+	return 0;
 }

 void ip_vs_estimator_cleanup(void)
 {
-	unregister_pernet_device(&ip_vs_app_ops);
 }
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index f7021fc..eb86028 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -316,7 +316,7 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
 /*
  * per network name-space init
  */
-static int __net_init __ip_vs_protocol_init(struct net *net)
+int __net_init __ip_vs_protocol_init(struct net *net)
 {
 #ifdef CONFIG_IP_VS_PROTO_TCP
 	register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp);
@@ -336,7 +336,7 @@ static int __net_init __ip_vs_protocol_init(struct net *net)
 	return 0;
 }

-static void __net_exit __ip_vs_protocol_cleanup(struct net *net)
+void __net_exit __ip_vs_protocol_cleanup(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 	struct ip_vs_proto_data *pd;
@@ -349,11 +349,6 @@ static void __net_exit __ip_vs_protocol_cleanup(struct net *net)
 	}
 }

-static struct pernet_operations ipvs_proto_ops = {
-	.init = __ip_vs_protocol_init,
-	.exit = __ip_vs_protocol_cleanup,
-};
-
 int __init ip_vs_protocol_init(void)
 {
 	char protocols[64];
@@ -382,7 +377,6 @@ int __init ip_vs_protocol_init(void)
 	REGISTER_PROTOCOL(&ip_vs_protocol_esp);
 #endif
 	pr_info("Registered protocols (%s)\n", &protocols[2]);
-	return register_pernet_device(&ipvs_proto_ops);

 	return 0;
 }
@@ -393,7 +387,6 @@ void ip_vs_protocol_cleanup(void)
 	struct ip_vs_protocol *pp;
 	int i;

-	unregister_pernet_device(&ipvs_proto_ops);
 	/* unregister all the ipvs protocols */
 	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
 		while ((pp = ip_vs_proto_table[i]) != NULL)
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 1aeca1d..e911f03 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1664,7 +1664,7 @@ int stop_sync_thread(struct net *net, int state)
 /*
  * Initialize data struct for each netns
  */
-static int __net_init __ip_vs_sync_init(struct net *net)
+int __net_init __ip_vs_sync_init(struct net *net)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);

@@ -1678,24 +1678,17 @@ static int __net_init __ip_vs_sync_init(struct net *net)
 	return 0;
 }

-static void __ip_vs_sync_cleanup(struct net *net)
+void __ip_vs_sync_cleanup(struct net *net)
 {
 	stop_sync_thread(net, IP_VS_STATE_MASTER);
 	stop_sync_thread(net, IP_VS_STATE_BACKUP);
 }

-static struct pernet_operations ipvs_sync_ops = {
-	.init = __ip_vs_sync_init,
-	.exit = __ip_vs_sync_cleanup,
-};
-
-
 int __init ip_vs_sync_init(void)
 {
-	return register_pernet_device(&ipvs_sync_ops);
+	return 0;
 }

 void ip_vs_sync_cleanup(void)
 {
-	unregister_pernet_device(&ipvs_sync_ops);
 }
--
1.7.2.3


^ permalink raw reply related

* [PATCH 2/3] IPVS: Change of register_pernet_subsys to register_pernet_device
From: Hans Schillstrom @ 2011-04-19 15:25 UTC (permalink / raw)
  To: horms, ja, ebiederm, lvs-devel, netdev, netfilter-devel
  Cc: hans.schillstrom, Hans Schillstrom
In-Reply-To: <1303226705-29178-1-git-send-email-hans@schillstrom.com>

This is part 1 of a makeover of the init and cleanup
functions in ip_vs using name space.

Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
---
 net/netfilter/ipvs/ip_vs_app.c   |    4 ++--
 net/netfilter/ipvs/ip_vs_conn.c  |    4 ++--
 net/netfilter/ipvs/ip_vs_core.c  |    6 +++---
 net/netfilter/ipvs/ip_vs_ctl.c   |    6 +++---
 net/netfilter/ipvs/ip_vs_est.c   |    4 ++--
 net/netfilter/ipvs/ip_vs_ftp.c   |    4 ++--
 net/netfilter/ipvs/ip_vs_lblc.c  |    6 +++---
 net/netfilter/ipvs/ip_vs_lblcr.c |    6 +++---
 net/netfilter/ipvs/ip_vs_proto.c |    4 ++--
 net/netfilter/ipvs/ip_vs_sync.c  |    4 ++--
 10 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 2dc6de1..7e8e769 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -599,12 +599,12 @@ int __init ip_vs_app_init(void)
 {
 	int rv;
 
-	rv = register_pernet_subsys(&ip_vs_app_ops);
+	rv = register_pernet_device(&ip_vs_app_ops);
 	return rv;
 }
 
 
 void ip_vs_app_cleanup(void)
 {
-	unregister_pernet_subsys(&ip_vs_app_ops);
+	unregister_pernet_device(&ip_vs_app_ops);
 }
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index c97bd45..36cd5ea 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1309,7 +1309,7 @@ int __init ip_vs_conn_init(void)
 		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
 	}
 
-	retc = register_pernet_subsys(&ipvs_conn_ops);
+	retc = register_pernet_device(&ipvs_conn_ops);
 
 	/* calculate the random value for connection hash */
 	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
@@ -1319,7 +1319,7 @@ int __init ip_vs_conn_init(void)
 
 void ip_vs_conn_cleanup(void)
 {
-	unregister_pernet_subsys(&ipvs_conn_ops);
+	unregister_pernet_device(&ipvs_conn_ops);
 	/* Release the empty cache */
 	kmem_cache_destroy(ip_vs_conn_cachep);
 	vfree(ip_vs_conn_tab);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 07accf6..a7bb81d 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1913,7 +1913,7 @@ static int __init ip_vs_init(void)
 {
 	int ret;
 
-	ret = register_pernet_subsys(&ipvs_core_ops);	/* Alloc ip_vs struct */
+	ret = register_pernet_device(&ipvs_core_ops);	/* Alloc ip_vs struct */
 	if (ret < 0)
 		return ret;
 
@@ -1964,7 +1964,7 @@ cleanup_sync:
 	ip_vs_control_cleanup();
   cleanup_estimator:
 	ip_vs_estimator_cleanup();
-	unregister_pernet_subsys(&ipvs_core_ops);	/* free ip_vs struct */
+	unregister_pernet_device(&ipvs_core_ops);	/* free ip_vs struct */
 	return ret;
 }
 
@@ -1977,7 +1977,7 @@ static void __exit ip_vs_cleanup(void)
 	ip_vs_protocol_cleanup();
 	ip_vs_control_cleanup();
 	ip_vs_estimator_cleanup();
-	unregister_pernet_subsys(&ipvs_core_ops);	/* free ip_vs struct */
+	unregister_pernet_device(&ipvs_core_ops);	/* free ip_vs struct */
 	pr_info("ipvs unloaded.\n");
 }
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index ae47090..08715d8 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3657,7 +3657,7 @@ int __init ip_vs_control_init(void)
 		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
 	}
 
-	ret = register_pernet_subsys(&ipvs_control_ops);
+	ret = register_pernet_device(&ipvs_control_ops);
 	if (ret) {
 		pr_err("cannot register namespace.\n");
 		goto err;
@@ -3682,7 +3682,7 @@ int __init ip_vs_control_init(void)
 	return 0;
 
 err_net:
-	unregister_pernet_subsys(&ipvs_control_ops);
+	unregister_pernet_device(&ipvs_control_ops);
 err:
 	return ret;
 }
@@ -3691,7 +3691,7 @@ err:
 void ip_vs_control_cleanup(void)
 {
 	EnterFunction(2);
-	unregister_pernet_subsys(&ipvs_control_ops);
+	unregister_pernet_device(&ipvs_control_ops);
 	ip_vs_genl_unregister();
 	nf_unregister_sockopt(&ip_vs_sockopts);
 	LeaveFunction(2);
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 8c8766c..759163e 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -216,11 +216,11 @@ int __init ip_vs_estimator_init(void)
 {
 	int rv;
 
-	rv = register_pernet_subsys(&ip_vs_app_ops);
+	rv = register_pernet_device(&ip_vs_app_ops);
 	return rv;
 }
 
 void ip_vs_estimator_cleanup(void)
 {
-	unregister_pernet_subsys(&ip_vs_app_ops);
+	unregister_pernet_device(&ip_vs_app_ops);
 }
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 6b5dd6d..dfa04d3 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -451,7 +451,7 @@ int __init ip_vs_ftp_init(void)
 {
 	int rv;
 
-	rv = register_pernet_subsys(&ip_vs_ftp_ops);
+	rv = register_pernet_device(&ip_vs_ftp_ops);
 	return rv;
 }
 
@@ -460,7 +460,7 @@ int __init ip_vs_ftp_init(void)
  */
 static void __exit ip_vs_ftp_exit(void)
 {
-	unregister_pernet_subsys(&ip_vs_ftp_ops);
+	unregister_pernet_device(&ip_vs_ftp_ops);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 87e40ea..96765d0 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -603,20 +603,20 @@ static int __init ip_vs_lblc_init(void)
 {
 	int ret;
 
-	ret = register_pernet_subsys(&ip_vs_lblc_ops);
+	ret = register_pernet_device(&ip_vs_lblc_ops);
 	if (ret)
 		return ret;
 
 	ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
 	if (ret)
-		unregister_pernet_subsys(&ip_vs_lblc_ops);
+		unregister_pernet_device(&ip_vs_lblc_ops);
 	return ret;
 }
 
 static void __exit ip_vs_lblc_cleanup(void)
 {
 	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
-	unregister_pernet_subsys(&ip_vs_lblc_ops);
+	unregister_pernet_device(&ip_vs_lblc_ops);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 90f618a..5de425f 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -799,20 +799,20 @@ static int __init ip_vs_lblcr_init(void)
 {
 	int ret;
 
-	ret = register_pernet_subsys(&ip_vs_lblcr_ops);
+	ret = register_pernet_device(&ip_vs_lblcr_ops);
 	if (ret)
 		return ret;
 
 	ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
 	if (ret)
-		unregister_pernet_subsys(&ip_vs_lblcr_ops);
+		unregister_pernet_device(&ip_vs_lblcr_ops);
 	return ret;
 }
 
 static void __exit ip_vs_lblcr_cleanup(void)
 {
 	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
-	unregister_pernet_subsys(&ip_vs_lblcr_ops);
+	unregister_pernet_device(&ip_vs_lblcr_ops);
 }
 
 
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index 17484a4..f7021fc 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -382,7 +382,7 @@ int __init ip_vs_protocol_init(void)
 	REGISTER_PROTOCOL(&ip_vs_protocol_esp);
 #endif
 	pr_info("Registered protocols (%s)\n", &protocols[2]);
-	return register_pernet_subsys(&ipvs_proto_ops);
+	return register_pernet_device(&ipvs_proto_ops);
 
 	return 0;
 }
@@ -393,7 +393,7 @@ void ip_vs_protocol_cleanup(void)
 	struct ip_vs_protocol *pp;
 	int i;
 
-	unregister_pernet_subsys(&ipvs_proto_ops);
+	unregister_pernet_device(&ipvs_proto_ops);
 	/* unregister all the ipvs protocols */
 	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
 		while ((pp = ip_vs_proto_table[i]) != NULL)
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3f87555..1aeca1d 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1692,10 +1692,10 @@ static struct pernet_operations ipvs_sync_ops = {
 
 int __init ip_vs_sync_init(void)
 {
-	return register_pernet_subsys(&ipvs_sync_ops);
+	return register_pernet_device(&ipvs_sync_ops);
 }
 
 void ip_vs_sync_cleanup(void)
 {
-	unregister_pernet_subsys(&ipvs_sync_ops);
+	unregister_pernet_device(&ipvs_sync_ops);
 }
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 1/3] IPVS: Change of socket usage to enable name space exit.
From: Hans Schillstrom @ 2011-04-19 15:25 UTC (permalink / raw)
  To: horms, ja, ebiederm, lvs-devel, netdev, netfilter-devel
  Cc: hans.schillstrom, Hans Schillstrom

This is the first patch in a series of three.
The cleanup doesn't work when not exit in a clean way by using ipvsadm.
Killing of a namespace causes a hanging ipvs, this series will cure that.

If the sync daemons run in a namespace while it crashes
or get killed, there is no way to stop them except for a reboot.

Kernel threads should not increment the use count of a socket.
By calling sk_change_net() after creating a socket this is avoided.
sock_release cant be used, instead sk_release_kernel() should be used.

Thanks to Eric W Biederman.

This patch is based on net-next-2.6  ver 2.6.39-rc2

Signed-off-by: Hans Schillstrom <hans@schillstrom.com>
---
 net/netfilter/ipvs/ip_vs_sync.c |   28 +++++++++++++++++++---------
 1 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 3e7961e..3f87555 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1309,7 +1309,12 @@ static struct socket *make_send_sock(struct net *net)
 		pr_err("Error during creation of socket; terminating\n");
 		return ERR_PTR(result);
 	}
-
+	/*
+	 * Kernel sockets that are a part of a namespace, should not
+	 * hold a reference to a namespace in order to allow to stop it.
+	 * After sk_change_net should be released using sk_release_kernel.
+	 */
+	sk_change_net(sock->sk, net);
 	result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
 	if (result < 0) {
 		pr_err("Error setting outbound mcast interface\n");
@@ -1334,8 +1339,8 @@ static struct socket *make_send_sock(struct net *net)

 	return sock;

-  error:
-	sock_release(sock);
+error:
+	sk_release_kernel(sock->sk);
 	return ERR_PTR(result);
 }

@@ -1355,7 +1360,12 @@ static struct socket *make_receive_sock(struct net *net)
 		pr_err("Error during creation of socket; terminating\n");
 		return ERR_PTR(result);
 	}
-
+	/*
+	 * Kernel sockets that are a part of a namespace, should not
+	 * hold a reference to a namespace in order to allow to stop it.
+	 * After sk_change_net should be released using sk_release_kernel.
+	 */
+	sk_change_net(sock->sk, net);
 	/* it is equivalent to the REUSEADDR option in user-space */
 	sock->sk->sk_reuse = 1;

@@ -1377,8 +1387,8 @@ static struct socket *make_receive_sock(struct net *net)

 	return sock;

-  error:
-	sock_release(sock);
+error:
+	sk_release_kernel(sock->sk);
 	return ERR_PTR(result);
 }

@@ -1473,7 +1483,7 @@ static int sync_thread_master(void *data)
 		ip_vs_sync_buff_release(sb);

 	/* release the sending multicast socket */
-	sock_release(tinfo->sock);
+	sk_release_kernel(tinfo->sock->sk);
 	kfree(tinfo);

 	return 0;
@@ -1513,7 +1523,7 @@ static int sync_thread_backup(void *data)
 	}

 	/* release the sending multicast socket */
-	sock_release(tinfo->sock);
+	sk_release_kernel(tinfo->sock->sk);
 	kfree(tinfo->buf);
 	kfree(tinfo);

@@ -1601,7 +1611,7 @@ outtinfo:
 outbuf:
 	kfree(buf);
 outsocket:
-	sock_release(sock);
+	sk_release_kernel(sock->sk);
 out:
 	return result;
 }
--
1.7.2.3


^ permalink raw reply related

* Re: [PATCH] net: xen-netback: convert to hw_features
From: Ian Campbell @ 2011-04-19 15:15 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: netdev@vger.kernel.org, xen-devel@lists.xensource.com
In-Reply-To: <20110419134352.GB4716@rere.qmqm.pl>

On Tue, 2011-04-19 at 14:43 +0100, Michał Mirosław wrote:
> On Tue, Apr 19, 2011 at 02:39:00PM +0100, Ian Campbell wrote:
> > On Tue, 2011-04-19 at 14:30 +0100, Michał Mirosław wrote:
> > > On Tue, Apr 19, 2011 at 02:17:53PM +0100, Ian Campbell wrote:
> > > > I fixed it with the following, I also moved the !can_sg MTU clamping
> > > > into a set_features hook (like we do with netfront). Am I right that
> > > > this pattern copes with changes to SG via ethtool etc better? I think
> > > > it's more future proof in any case.
> > > This looks wrong. Even if SG is turned on, you might get big skbs which
> > > are linearized. There is a difference in SG capability and SG offload
> > > status and as I see it the capability is what you need to test for MTU.
> > So the existing stuff in drivers/net/xen-netfront.c is wrong too?
> 
> Looks like it. But I don't really know what are the real constraints for MTU.
> What I know is that SG even if turned on needs not be used (and currently
> it's not e.g. if checksum offload is disabled).

The interesting case is the opposite one, isn't it? IOW if NETIF_F_SG is
disabled but the frontend/backend agree that they have the capability to
handle >PAGE_SIZE skbs

In my experience, the normal reason for disabling the NETIF_F_SG offload
status is that the underlying capability is somehow buggy, otherwise is
there any reason to turn it off?

> So MTU setting should not depend on SG offload state but on some capability.

Ian.



^ permalink raw reply

* Re: DSCP values in TCP handshake
From: Matt Mathis @ 2011-04-19 15:09 UTC (permalink / raw)
  To: Mikael Abrahamsson; +Cc: Stephen Hemminger, Joe Buehler, Eric Dumazet, netdev
In-Reply-To: <alpine.DEB.2.00.1104190620070.14027@uplift.swm.pp.se>

> I don't know why this didn't make it into RFC, I can inquiry if there is
> interest.

Please do.    This missing spec is one of the things that makes Less
than Best Effort (aka scavenger service) unusable.   Only the client
knows if they are fetching data in the background.   The server
doesn't care.

The other botch it is the spec that DSCP can be cleared under certain
conditions, which has the effect of promoting LBE to BE.   I have lost
track of the details.

Making LBE work would go a long way to solving the buffer bloat
problem and more....

Thanks,
--MM--
The best way to predict the future is to create it.  - Alan Kay




On Tue, Apr 19, 2011 at 12:28 AM, Mikael Abrahamsson <swmike@swm.pp.se> wrote:
> On Mon, 18 Apr 2011, Stephen Hemminger wrote:
>
>> Linux does not look at DSCP of incoming packets (there is no queue).
>
> Then I see no reason for the policy of not reflecting DSCP.
>
> If we receive the DSCP marked packet then it means the network is either not
> QoS enabled (it doesn't care) or it's actually allowed through the border
> router with DSCP unchanged. Either means it's safe to reflect the DSCP
> value, either it will have no effect or it's actually meant to be
> prioritized.
>
> With precedence, it originally was mandated that if the precedence value
> changed, the TCP session should be reset. Fortunately, this was changed but
> I would still say that it's thought that DSCP values should be reflected by
> the server.
>
> For instance:
>
> <http://tools.ietf.org/html/draft-ietf-ieprep-reflexive-dscp-02>
>
> "The requester could initiate this. Thus, if the DSCP
>   received on one TCP segment differs from the TCP used on a prior TCP
>   segment in a session, the new DSCP SHOULD be reflected unless local
>   policy prevents this."
>
> I don't know why this didn't make it into RFC, I can inquiry if there is
> interest.
>
> --
> Mikael Abrahamsson    email: swmike@swm.pp.se
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH net-next-2.6 3/3] bonding,ipv4,ipv6,vlan: Handle NETDEV_BONDING_FAILOVER like NETDEV_NOTIFY_PEERS
From: Ben Hutchings @ 2011-04-19 14:56 UTC (permalink / raw)
  To: Brian Haley
  Cc: Jay Vosburgh, David Miller, Andy Gospodarek, Patrick McHardy,
	netdev
In-Reply-To: <4DACE638.9060909@hp.com>

On Mon, 2011-04-18 at 21:32 -0400, Brian Haley wrote:
> On 04/18/2011 03:09 PM, Ben Hutchings wrote:
> > How about restoring the parameters like this:
> > 
> > ---
> > From: Ben Hutchings <bhutchings@solarflare.com>
> > Date: Mon, 18 Apr 2011 19:36:48 +0100
> > Subject: [PATCH net-next-2.6] ipv4,ipv6,bonding: Restore control over number of peer notifications
> > 
> > For backward compatibility, we should retain the module parameters and
> > sysfs attributes to control the number of peer notifications
> > (gratuitous ARPs and unsolicited NAs) sent after bonding failover.
> > Also, it is possible for failover to take place even though the new
> > active slave does not have link up, and in that case the peer
> > notification should be deferred until it does.
> > 
> > Change ipv4 and ipv6 so they do not automatically send peer
> > notifications on bonding failover.  Change the bonding driver to send
> > separate NETDEV_NOTIFY_PEERS notifications when the link is up, as
> > many times as requested.  Since it does not directly control which
> > protocols send notifications, make num_grat_arp and num_unsol_na
> > aliases for a single parameter.
> 
> Hi Ben,
> 
> I think this looks good, I'll try and get this tested here when I have
> a chance, but for now I can:
> 
> Acked-by: Brian Haley <brian.haley@hp.com>
> 
> Should we just go ahead and make a new parameter for peer notification?
> Compiled but untested patch below.
[...]

If everyone is in agreement to deprecate the old parameters in favour of
a single parameter (or none) I think there should be a target date for
removal, documented in Documentation/feature-removal-schedule.txt.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* [PATCH 0/3] netfilter: netfilter fixes for 2.6.39-rc4
From: kaber @ 2011-04-19 14:51 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev

Hi Dave,

following are three netfilter fixes for 2.6.39-rc4, containing:

- a fix for the bitmap:ip,mac set type to require the src/dst parameter
  to be set to src, from Jozsef

- a fix to make --del-set of the SET target work, from Jozsef

- a patch to fix the order in which sets are dumped, from Jozsef

Please pull from:

git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-2.6.git master

Thanks!


^ permalink raw reply

* [PATCH 1/3] netfilter: ipset: bitmap:ip,mac type requires "src" for MAC
From: kaber @ 2011-04-19 14:51 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1303224705-17400-1-git-send-email-kaber@trash.net>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

Enforce that the second "src/dst" parameter of the set match and SET target
must be "src", because we have access to the source MAC only in the packet.
The previous behaviour, that the type required the second parameter
but actually ignored the value was counter-intuitive and confusing.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/ipset/ip_set_bitmap_ipmac.c |    4 ++++
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 00a3324..a274300 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -343,6 +343,10 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
 	ipset_adtfn adtfn = set->variant->adt[adt];
 	struct ipmac data;
 
+	/* MAC can be src only */
+	if (!(flags & IPSET_DIM_TWO_SRC))
+		return 0;
+
 	data.id = ntohl(ip4addr(skb, flags & IPSET_DIM_ONE_SRC));
 	if (data.id < map->first_ip || data.id > map->last_ip)
 		return -IPSET_ERR_BITMAP_RANGE;
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 3/3] netfilter: ipset: Fix the order of listing of sets
From: kaber @ 2011-04-19 14:51 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1303224705-17400-1-git-send-email-kaber@trash.net>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

A restoreable saving of sets requires that list:set type of sets
come last and the code part which should have taken into account
the ordering was broken. The patch fixes the listing order.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/ipset/ip_set_core.c |   18 ++++++++++--------
 1 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index e88ac3c..d87e03b 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1022,8 +1022,9 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
 	if (cb->args[1] >= ip_set_max)
 		goto out;
 
-	pr_debug("args[0]: %ld args[1]: %ld\n", cb->args[0], cb->args[1]);
 	max = cb->args[0] == DUMP_ONE ? cb->args[1] + 1 : ip_set_max;
+dump_last:
+	pr_debug("args[0]: %ld args[1]: %ld\n", cb->args[0], cb->args[1]);
 	for (; cb->args[1] < max; cb->args[1]++) {
 		index = (ip_set_id_t) cb->args[1];
 		set = ip_set_list[index];
@@ -1038,8 +1039,8 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
 		 * so that lists (unions of sets) are dumped last.
 		 */
 		if (cb->args[0] != DUMP_ONE &&
-		    !((cb->args[0] == DUMP_ALL) ^
-		      (set->type->features & IPSET_DUMP_LAST)))
+		    ((cb->args[0] == DUMP_ALL) ==
+		     !!(set->type->features & IPSET_DUMP_LAST)))
 			continue;
 		pr_debug("List set: %s\n", set->name);
 		if (!cb->args[2]) {
@@ -1083,6 +1084,12 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
 			goto release_refcount;
 		}
 	}
+	/* If we dump all sets, continue with dumping last ones */
+	if (cb->args[0] == DUMP_ALL) {
+		cb->args[0] = DUMP_LAST;
+		cb->args[1] = 0;
+		goto dump_last;
+	}
 	goto out;
 
 nla_put_failure:
@@ -1093,11 +1100,6 @@ release_refcount:
 		pr_debug("release set %s\n", ip_set_list[index]->name);
 		ip_set_put_byindex(index);
 	}
-
-	/* If we dump all sets, continue with dumping last ones */
-	if (cb->args[0] == DUMP_ALL && cb->args[1] >= max && !cb->args[2])
-		cb->args[0] = DUMP_LAST;
-
 out:
 	if (nlh) {
 		nlmsg_end(skb, nlh);
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 2/3] netfilter: ipset: set match and SET target fixes
From: kaber @ 2011-04-19 14:51 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1303224705-17400-1-git-send-email-kaber@trash.net>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

The SET target with --del-set did not work due to using wrongly
the internal dimension of --add-set instead of --del-set.
Also, the checkentries did not release the set references when
returned an error. Bugs reported by Lennert Buytenhek.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_set.c |   18 ++++++++++++++++--
 1 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/xt_set.c b/net/netfilter/xt_set.c
index 061d48c..b3babae 100644
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -81,6 +81,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par)
 	if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) {
 		pr_warning("Protocol error: set match dimension "
 			   "is over the limit!\n");
+		ip_set_nfnl_put(info->match_set.index);
 		return -ERANGE;
 	}
 
@@ -135,6 +136,8 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
 		if (index == IPSET_INVALID_ID) {
 			pr_warning("Cannot find del_set index %u as target\n",
 				   info->del_set.index);
+			if (info->add_set.index != IPSET_INVALID_ID)
+				ip_set_nfnl_put(info->add_set.index);
 			return -ENOENT;
 		}
 	}
@@ -142,6 +145,10 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
 	    info->del_set.u.flags[IPSET_DIM_MAX-1] != 0) {
 		pr_warning("Protocol error: SET target dimension "
 			   "is over the limit!\n");
+		if (info->add_set.index != IPSET_INVALID_ID)
+			ip_set_nfnl_put(info->add_set.index);
+		if (info->del_set.index != IPSET_INVALID_ID)
+			ip_set_nfnl_put(info->del_set.index);
 		return -ERANGE;
 	}
 
@@ -192,6 +199,7 @@ set_match_checkentry(const struct xt_mtchk_param *par)
 	if (info->match_set.dim > IPSET_DIM_MAX) {
 		pr_warning("Protocol error: set match dimension "
 			   "is over the limit!\n");
+		ip_set_nfnl_put(info->match_set.index);
 		return -ERANGE;
 	}
 
@@ -219,7 +227,7 @@ set_target(struct sk_buff *skb, const struct xt_action_param *par)
 	if (info->del_set.index != IPSET_INVALID_ID)
 		ip_set_del(info->del_set.index,
 			   skb, par->family,
-			   info->add_set.dim,
+			   info->del_set.dim,
 			   info->del_set.flags);
 
 	return XT_CONTINUE;
@@ -245,13 +253,19 @@ set_target_checkentry(const struct xt_tgchk_param *par)
 		if (index == IPSET_INVALID_ID) {
 			pr_warning("Cannot find del_set index %u as target\n",
 				   info->del_set.index);
+			if (info->add_set.index != IPSET_INVALID_ID)
+				ip_set_nfnl_put(info->add_set.index);
 			return -ENOENT;
 		}
 	}
 	if (info->add_set.dim > IPSET_DIM_MAX ||
-	    info->del_set.flags > IPSET_DIM_MAX) {
+	    info->del_set.dim > IPSET_DIM_MAX) {
 		pr_warning("Protocol error: SET target dimension "
 			   "is over the limit!\n");
+		if (info->add_set.index != IPSET_INVALID_ID)
+			ip_set_nfnl_put(info->add_set.index);
+		if (info->del_set.index != IPSET_INVALID_ID)
+			ip_set_nfnl_put(info->del_set.index);
 		return -ERANGE;
 	}
 
-- 
1.7.2.3


^ permalink raw reply related

* [PATCH 0/3] netfilter: netfilter update
From: kaber @ 2011-04-19 14:31 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev

Hi Dave,

following is a small netfilter update for net-next, containing:

- a patch from Eric to remove some atomic ops in the *tables fastpath

- removal of an unnecessary ifdef from IPVS by Simon

- SCTP and UDPLITE support for ipset, from Jozsef

Please pull from:

git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git master

Thanks!

^ permalink raw reply

* [PATCH 1/3] netfilter: get rid of atomic ops in fast path
From: kaber @ 2011-04-19 14:31 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1303223492-13549-1-git-send-email-kaber@trash.net>

From: Eric Dumazet <eric.dumazet@gmail.com>

We currently use a percpu spinlock to 'protect' rule bytes/packets
counters, after various attempts to use RCU instead.

Lately we added a seqlock so that get_counters() can run without
blocking BH or 'writers'. But we really only need the seqcount in it.

Spinlock itself is only locked by the current/owner cpu, so we can
remove it completely.

This cleanups api, using correct 'writer' vs 'reader' semantic.

At replace time, the get_counters() call makes sure all cpus are done
using the old table.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/x_tables.h |   96 ++++++++++++++++--------------------
 net/ipv4/netfilter/arp_tables.c    |   18 ++++---
 net/ipv4/netfilter/ip_tables.c     |   28 +++++------
 net/ipv6/netfilter/ip6_tables.c    |   19 +++++---
 net/netfilter/x_tables.c           |    9 +--
 5 files changed, 80 insertions(+), 90 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 3721952..32cddf7 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -456,72 +456,60 @@ extern void xt_proto_fini(struct net *net, u_int8_t af);
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
 
-/*
- * Per-CPU spinlock associated with per-cpu table entries, and
- * with a counter for the "reading" side that allows a recursive
- * reader to avoid taking the lock and deadlocking.
- *
- * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu.
- * It needs to ensure that the rules are not being changed while the packet
- * is being processed. In some cases, the read lock will be acquired
- * twice on the same CPU; this is okay because of the count.
- *
- * "writing" is used when reading counters.
- *  During replace any readers that are using the old tables have to complete
- *  before freeing the old table. This is handled by the write locking
- *  necessary for reading the counters.
+/**
+ * xt_recseq - recursive seqcount for netfilter use
+ * 
+ * Packet processing changes the seqcount only if no recursion happened
+ * get_counters() can use read_seqcount_begin()/read_seqcount_retry(),
+ * because we use the normal seqcount convention :
+ * Low order bit set to 1 if a writer is active.
  */
-struct xt_info_lock {
-	seqlock_t lock;
-	unsigned char readers;
-};
-DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
+DECLARE_PER_CPU(seqcount_t, xt_recseq);
 
-/*
- * Note: we need to ensure that preemption is disabled before acquiring
- * the per-cpu-variable, so we do it as a two step process rather than
- * using "spin_lock_bh()".
- *
- * We _also_ need to disable bottom half processing before updating our
- * nesting count, to make sure that the only kind of re-entrancy is this
- * code being called by itself: since the count+lock is not an atomic
- * operation, we can allow no races.
+/**
+ * xt_write_recseq_begin - start of a write section
  *
- * _Only_ that special combination of being per-cpu and never getting
- * re-entered asynchronously means that the count is safe.
+ * Begin packet processing : all readers must wait the end
+ * 1) Must be called with preemption disabled
+ * 2) softirqs must be disabled too (or we should use irqsafe_cpu_add())
+ * Returns :
+ *  1 if no recursion on this cpu
+ *  0 if recursion detected
  */
-static inline void xt_info_rdlock_bh(void)
+static inline unsigned int xt_write_recseq_begin(void)
 {
-	struct xt_info_lock *lock;
+	unsigned int addend;
 
-	local_bh_disable();
-	lock = &__get_cpu_var(xt_info_locks);
-	if (likely(!lock->readers++))
-		write_seqlock(&lock->lock);
-}
+	/*
+	 * Low order bit of sequence is set if we already
+	 * called xt_write_recseq_begin().
+	 */
+	addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1;
 
-static inline void xt_info_rdunlock_bh(void)
-{
-	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
+	/*
+	 * This is kind of a write_seqcount_begin(), but addend is 0 or 1
+	 * We dont check addend value to avoid a test and conditional jump,
+	 * since addend is most likely 1
+	 */
+	__this_cpu_add(xt_recseq.sequence, addend);
+	smp_wmb();
 
-	if (likely(!--lock->readers))
-		write_sequnlock(&lock->lock);
-	local_bh_enable();
+	return addend;
 }
 
-/*
- * The "writer" side needs to get exclusive access to the lock,
- * regardless of readers.  This must be called with bottom half
- * processing (and thus also preemption) disabled.
+/**
+ * xt_write_recseq_end - end of a write section
+ * @addend: return value from previous xt_write_recseq_begin()
+ *
+ * End packet processing : all readers can proceed
+ * 1) Must be called with preemption disabled
+ * 2) softirqs must be disabled too (or we should use irqsafe_cpu_add())
  */
-static inline void xt_info_wrlock(unsigned int cpu)
-{
-	write_seqlock(&per_cpu(xt_info_locks, cpu).lock);
-}
-
-static inline void xt_info_wrunlock(unsigned int cpu)
+static inline void xt_write_recseq_end(unsigned int addend)
 {
-	write_sequnlock(&per_cpu(xt_info_locks, cpu).lock);
+	/* this is kind of a write_seqcount_end(), but addend is 0 or 1 */
+	smp_wmb();
+	__this_cpu_add(xt_recseq.sequence, addend);
 }
 
 /*
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 4b5d457..2ea7433 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -260,6 +260,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	void *table_base;
 	const struct xt_table_info *private;
 	struct xt_action_param acpar;
+	unsigned int addend;
 
 	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
 		return NF_DROP;
@@ -267,7 +268,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	xt_info_rdlock_bh();
+	local_bh_disable();
+	addend = xt_write_recseq_begin();
 	private = table->private;
 	table_base = private->entries[smp_processor_id()];
 
@@ -338,7 +340,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			/* Verdict */
 			break;
 	} while (!acpar.hotdrop);
-	xt_info_rdunlock_bh();
+	xt_write_recseq_end(addend);
+	local_bh_enable();
 
 	if (acpar.hotdrop)
 		return NF_DROP;
@@ -712,7 +715,7 @@ static void get_counters(const struct xt_table_info *t,
 	unsigned int i;
 
 	for_each_possible_cpu(cpu) {
-		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+		seqcount_t *s = &per_cpu(xt_recseq, cpu);
 
 		i = 0;
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
@@ -720,10 +723,10 @@ static void get_counters(const struct xt_table_info *t,
 			unsigned int start;
 
 			do {
-				start = read_seqbegin(lock);
+				start = read_seqcount_begin(s);
 				bcnt = iter->counters.bcnt;
 				pcnt = iter->counters.pcnt;
-			} while (read_seqretry(lock, start));
+			} while (read_seqcount_retry(s, start));
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
@@ -1115,6 +1118,7 @@ static int do_add_counters(struct net *net, const void __user *user,
 	int ret = 0;
 	void *loc_cpu_entry;
 	struct arpt_entry *iter;
+	unsigned int addend;
 #ifdef CONFIG_COMPAT
 	struct compat_xt_counters_info compat_tmp;
 
@@ -1171,12 +1175,12 @@ static int do_add_counters(struct net *net, const void __user *user,
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
 	loc_cpu_entry = private->entries[curcpu];
-	xt_info_wrlock(curcpu);
+	addend = xt_write_recseq_begin();
 	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
 		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
-	xt_info_wrunlock(curcpu);
+	xt_write_recseq_end(addend);
  unlock_up_free:
 	local_bh_enable();
 	xt_table_unlock(t);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index ffcea0d..2b6b700 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -68,15 +68,6 @@ void *ipt_alloc_initial_table(const struct xt_table *info)
 }
 EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
 
-/*
-   We keep a set of rules for each CPU, so we can avoid write-locking
-   them in the softirq when updating the counters and therefore
-   only need to read-lock in the softirq; doing a write_lock_bh() in user
-   context stops packets coming through and allows user context to read
-   the counters or update the rules.
-
-   Hence the start of any table is given by get_table() below.  */
-
 /* Returns whether matches rule or not. */
 /* Performance critical - called for every packet */
 static inline bool
@@ -311,6 +302,7 @@ ipt_do_table(struct sk_buff *skb,
 	unsigned int *stackptr, origptr, cpu;
 	const struct xt_table_info *private;
 	struct xt_action_param acpar;
+	unsigned int addend;
 
 	/* Initialization */
 	ip = ip_hdr(skb);
@@ -331,7 +323,8 @@ ipt_do_table(struct sk_buff *skb,
 	acpar.hooknum = hook;
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	xt_info_rdlock_bh();
+	local_bh_disable();
+	addend = xt_write_recseq_begin();
 	private = table->private;
 	cpu        = smp_processor_id();
 	table_base = private->entries[cpu];
@@ -430,7 +423,9 @@ ipt_do_table(struct sk_buff *skb,
 	pr_debug("Exiting %s; resetting sp from %u to %u\n",
 		 __func__, *stackptr, origptr);
 	*stackptr = origptr;
-	xt_info_rdunlock_bh();
+ 	xt_write_recseq_end(addend);
+ 	local_bh_enable();
+
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
 #else
@@ -886,7 +881,7 @@ get_counters(const struct xt_table_info *t,
 	unsigned int i;
 
 	for_each_possible_cpu(cpu) {
-		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+		seqcount_t *s = &per_cpu(xt_recseq, cpu);
 
 		i = 0;
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
@@ -894,10 +889,10 @@ get_counters(const struct xt_table_info *t,
 			unsigned int start;
 
 			do {
-				start = read_seqbegin(lock);
+				start = read_seqcount_begin(s);
 				bcnt = iter->counters.bcnt;
 				pcnt = iter->counters.pcnt;
-			} while (read_seqretry(lock, start));
+			} while (read_seqcount_retry(s, start));
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i; /* macro does multi eval of i */
@@ -1312,6 +1307,7 @@ do_add_counters(struct net *net, const void __user *user,
 	int ret = 0;
 	void *loc_cpu_entry;
 	struct ipt_entry *iter;
+	unsigned int addend;
 #ifdef CONFIG_COMPAT
 	struct compat_xt_counters_info compat_tmp;
 
@@ -1368,12 +1364,12 @@ do_add_counters(struct net *net, const void __user *user,
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
 	loc_cpu_entry = private->entries[curcpu];
-	xt_info_wrlock(curcpu);
+	addend = xt_write_recseq_begin();
 	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
 		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
-	xt_info_wrunlock(curcpu);
+	xt_write_recseq_end(addend);
  unlock_up_free:
 	local_bh_enable();
 	xt_table_unlock(t);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 0b2af9b..ec7cf57 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -340,6 +340,7 @@ ip6t_do_table(struct sk_buff *skb,
 	unsigned int *stackptr, origptr, cpu;
 	const struct xt_table_info *private;
 	struct xt_action_param acpar;
+	unsigned int addend;
 
 	/* Initialization */
 	indev = in ? in->name : nulldevname;
@@ -358,7 +359,8 @@ ip6t_do_table(struct sk_buff *skb,
 
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-	xt_info_rdlock_bh();
+	local_bh_disable();
+	addend = xt_write_recseq_begin();
 	private = table->private;
 	cpu        = smp_processor_id();
 	table_base = private->entries[cpu];
@@ -442,7 +444,9 @@ ip6t_do_table(struct sk_buff *skb,
 	} while (!acpar.hotdrop);
 
 	*stackptr = origptr;
-	xt_info_rdunlock_bh();
+
+ 	xt_write_recseq_end(addend);
+ 	local_bh_enable();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -899,7 +903,7 @@ get_counters(const struct xt_table_info *t,
 	unsigned int i;
 
 	for_each_possible_cpu(cpu) {
-		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+		seqcount_t *s = &per_cpu(xt_recseq, cpu);
 
 		i = 0;
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
@@ -907,10 +911,10 @@ get_counters(const struct xt_table_info *t,
 			unsigned int start;
 
 			do {
-				start = read_seqbegin(lock);
+				start = read_seqcount_begin(s);
 				bcnt = iter->counters.bcnt;
 				pcnt = iter->counters.pcnt;
-			} while (read_seqretry(lock, start));
+			} while (read_seqcount_retry(s, start));
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
@@ -1325,6 +1329,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	int ret = 0;
 	const void *loc_cpu_entry;
 	struct ip6t_entry *iter;
+	unsigned int addend;
 #ifdef CONFIG_COMPAT
 	struct compat_xt_counters_info compat_tmp;
 
@@ -1381,13 +1386,13 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	i = 0;
 	/* Choose the copy that is on our node */
 	curcpu = smp_processor_id();
-	xt_info_wrlock(curcpu);
+	addend = xt_write_recseq_begin();
 	loc_cpu_entry = private->entries[curcpu];
 	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
 		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
-	xt_info_wrunlock(curcpu);
+	xt_write_recseq_end(addend);
 
  unlock_up_free:
 	local_bh_enable();
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index a9adf4c..52959ef 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -762,8 +762,8 @@ void xt_compat_unlock(u_int8_t af)
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
-DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
-EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
+DEFINE_PER_CPU(seqcount_t, xt_recseq);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
 
 static int xt_jumpstack_alloc(struct xt_table_info *i)
 {
@@ -1362,10 +1362,7 @@ static int __init xt_init(void)
 	int rv;
 
 	for_each_possible_cpu(i) {
-		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
-
-		seqlock_init(&lock->lock);
-		lock->readers = 0;
+		seqcount_init(&per_cpu(xt_recseq, i));
 	}
 
 	xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
-- 
1.7.4.4


^ permalink raw reply related

* [PATCH 3/3] netfilter: ipset: SCTP, UDPLITE support added
From: kaber @ 2011-04-19 14:31 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1303223492-13549-1-git-send-email-kaber@trash.net>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

SCTP and UDPLITE port support added to the hash:*port* set types.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/ipset/ip_set_getport.h |    2 ++
 net/netfilter/ipset/ip_set_getport.c           |   16 +++++++++++++++-
 net/netfilter/ipset/ip_set_hash_ipport.c       |    2 +-
 net/netfilter/ipset/ip_set_hash_ipportip.c     |    2 +-
 net/netfilter/ipset/ip_set_hash_ipportnet.c    |    2 +-
 net/netfilter/ipset/ip_set_hash_netport.c      |    2 +-
 6 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set_getport.h b/include/linux/netfilter/ipset/ip_set_getport.h
index 5aebd17..90d0930 100644
--- a/include/linux/netfilter/ipset/ip_set_getport.h
+++ b/include/linux/netfilter/ipset/ip_set_getport.h
@@ -22,7 +22,9 @@ static inline bool ip_set_proto_with_ports(u8 proto)
 {
 	switch (proto) {
 	case IPPROTO_TCP:
+	case IPPROTO_SCTP:
 	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE:
 		return true;
 	}
 	return false;
diff --git a/net/netfilter/ipset/ip_set_getport.c b/net/netfilter/ipset/ip_set_getport.c
index 8d52272..757143b 100644
--- a/net/netfilter/ipset/ip_set_getport.c
+++ b/net/netfilter/ipset/ip_set_getport.c
@@ -11,6 +11,7 @@
 #include <linux/skbuff.h>
 #include <linux/icmp.h>
 #include <linux/icmpv6.h>
+#include <linux/sctp.h>
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
@@ -35,7 +36,20 @@ get_port(const struct sk_buff *skb, int protocol, unsigned int protooff,
 		*port = src ? th->source : th->dest;
 		break;
 	}
-	case IPPROTO_UDP: {
+	case IPPROTO_SCTP: {
+		sctp_sctphdr_t _sh;
+		const sctp_sctphdr_t *sh;
+
+		sh = skb_header_pointer(skb, protooff, sizeof(_sh), &_sh);
+		if (sh == NULL)
+			/* No choice either */
+			return false;
+
+		*port = src ? sh->source : sh->dest;
+		break;
+	}
+	case IPPROTO_UDP:
+	case IPPROTO_UDPLITE: {
 		struct udphdr _udph;
 		const struct udphdr *uh;
 
diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c
index b921414..14281b6 100644
--- a/net/netfilter/ipset/ip_set_hash_ipport.c
+++ b/net/netfilter/ipset/ip_set_hash_ipport.c
@@ -491,7 +491,7 @@ static struct ip_set_type hash_ipport_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT,
 	.dimension	= IPSET_DIM_TWO,
 	.family		= AF_UNSPEC,
-	.revision	= 0,
+	.revision	= 1,
 	.create		= hash_ipport_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c
index 4642872..401c8a2 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportip.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportip.c
@@ -509,7 +509,7 @@ static struct ip_set_type hash_ipportip_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
 	.dimension	= IPSET_DIM_THREE,
 	.family		= AF_UNSPEC,
-	.revision	= 0,
+	.revision	= 1,
 	.create		= hash_ipportip_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c
index 2cb84a5..4743e54 100644
--- a/net/netfilter/ipset/ip_set_hash_ipportnet.c
+++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c
@@ -574,7 +574,7 @@ static struct ip_set_type hash_ipportnet_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2,
 	.dimension	= IPSET_DIM_THREE,
 	.family		= AF_UNSPEC,
-	.revision	= 0,
+	.revision	= 1,
 	.create		= hash_ipportnet_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c
index 8598676..d2a4036 100644
--- a/net/netfilter/ipset/ip_set_hash_netport.c
+++ b/net/netfilter/ipset/ip_set_hash_netport.c
@@ -526,7 +526,7 @@ static struct ip_set_type hash_netport_type __read_mostly = {
 	.features	= IPSET_TYPE_IP | IPSET_TYPE_PORT,
 	.dimension	= IPSET_DIM_TWO,
 	.family		= AF_UNSPEC,
-	.revision	= 0,
+	.revision	= 1,
 	.create		= hash_netport_create,
 	.create_policy	= {
 		[IPSET_ATTR_HASHSIZE]	= { .type = NLA_U32 },
-- 
1.7.4.4


^ permalink raw reply related

* [PATCH 2/3] IPVS: combine consecutive #ifdef CONFIG_PROC_FS blocks
From: kaber @ 2011-04-19 14:31 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1303223492-13549-1-git-send-email-kaber@trash.net>

From: Simon Horman <horms@verge.net.au>

Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_ctl.c |    3 ---
 1 files changed, 0 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 33733c8..36f4495 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1984,9 +1984,6 @@ static const struct file_operations ip_vs_info_fops = {
 	.release = seq_release_private,
 };
 
-#endif
-
-#ifdef CONFIG_PROC_FS
 static int ip_vs_stats_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq_file_single_net(seq);
-- 
1.7.4.4


^ permalink raw reply related

* RE: [PATCH net-next 4/5] be2net: pass domain id to be_cmd_link_status_query
From: Ajit.Khaparde @ 2011-04-19 14:23 UTC (permalink / raw)
  To: davem; +Cc: netdev
In-Reply-To: <20110418.225659.112612060.davem@davemloft.net>

> From: David Miller [davem@davemloft.net]
> Sent: Tuesday, April 19, 2011 12:56 AM
> To: Khaparde, Ajit
> Cc: netdev@vger.kernel.org
> Subject: Re: [PATCH net-next 4/5] be2net: pass domain id to be_cmd_link_status_query

> From: Ajit Khaparde <ajit.khaparde@emulex.com>
> Date: Mon, 18 Apr 2011 17:30:33 -0500

>
> Signed-off-by: Ajit Khaparde <ajit.khaparde@emulex.com>

> That's not all this patch does:
Oh.. sorry about that. I did not realize this. I had worked on these patches over a couple of days
and on one of those days, I seem to have mixed the patches. Will respin these again.


> It seems to be also converting the driver over to the new VLAN
> interfaces.

> Please seperate this part into a seperate patch and resubmit your
> patch series.

> Thanks.

^ permalink raw reply

* Re: [PATCH] net: s390: convert to hw_features
From: Frank Blaschka @ 2011-04-19 14:11 UTC (permalink / raw)
  To: Michał Mirosław; +Cc: netdev, linux-s390
In-Reply-To: <20110419104320.314AB13A66@rere.qmqm.pl>

On Tue, Apr 19, 2011 at 12:43:20PM +0200, Michał Mirosław wrote:
> options.large_send was easy to get rid of. options.checksum_type has deeper
> roots so is left for later cleanup.
> 
> Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> ---

Hi Micha,

thx for the patch. There are some small issues I will fix up.
I will add the patch to my queue. It might be a good time to
get rid of the large_send and checksum sysfs attribute as well.
I will work on that too, thx again ...

Frank


^ permalink raw reply

* Re: [patch net-next-2.6] bonding: move processing of recv handlers into handle_frame()
From: Eric Dumazet @ 2011-04-19 14:02 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, shemminger, kaber, fubar, nicolas.2p.debian, andy
In-Reply-To: <1303220896-9092-1-git-send-email-jpirko@redhat.com>

Le mardi 19 avril 2011 à 15:48 +0200, Jiri Pirko a écrit :
> Since now when bonding uses rx_handler, all traffic going into bond
> device goes thru bond_handle_frame. So there's no need to go back into
> bonding code later via ptype handlers. This patch converts
> original ptype handlers into "bonding receive probes". These functions
> are called from bond_handle_frame and they are registered per-mode.
> 
> Note that vlan packets are also handled because they are always untagged
> thanks to vlan_untag()
> 
> Note that this also allows arpmon for eth-bond-bridge-vlan topology.
> 
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>



>  			}
> diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
> index 6126c6a..85fb822 100644
> --- a/drivers/net/bonding/bonding.h
> +++ b/drivers/net/bonding/bonding.h
> @@ -226,6 +226,8 @@ struct bonding {
>  	struct   slave *primary_slave;
>  	bool     force_primary;
>  	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
> +	void     (*recv_probe)(struct sk_buff *, struct bonding *,
> +			       struct slave *);
>  	rwlock_t lock;
>  	rwlock_t curr_slave_lock;
>  	s8       kill_timers;


Any performance numbers ?

I am asking because recv_probe sits in a often dirtied cache line...

It would be nice to separate rx & tx path needs




^ permalink raw reply

* [patch net-next-2.6] bonding: move processing of recv handlers into handle_frame()
From: Jiri Pirko @ 2011-04-19 13:48 UTC (permalink / raw)
  To: netdev
  Cc: davem, shemminger, kaber, fubar, eric.dumazet, nicolas.2p.debian,
	andy

Since now when bonding uses rx_handler, all traffic going into bond
device goes thru bond_handle_frame. So there's no need to go back into
bonding code later via ptype handlers. This patch converts
original ptype handlers into "bonding receive probes". These functions
are called from bond_handle_frame and they are registered per-mode.

Note that vlan packets are also handled because they are always untagged
thanks to vlan_untag()

Note that this also allows arpmon for eth-bond-bridge-vlan topology.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/bonding/bond_3ad.c   |   29 ++--------
 drivers/net/bonding/bond_3ad.h   |    4 +-
 drivers/net/bonding/bond_alb.c   |   46 ++++-----------
 drivers/net/bonding/bond_alb.h   |    1 -
 drivers/net/bonding/bond_main.c  |  121 +++++++------------------------------
 drivers/net/bonding/bond_sysfs.c |    6 --
 drivers/net/bonding/bonding.h    |    4 +-
 net/core/dev.c                   |   21 -------
 8 files changed, 43 insertions(+), 189 deletions(-)

diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c
index 123dd60..d0981c2 100644
--- a/drivers/net/bonding/bond_3ad.c
+++ b/drivers/net/bonding/bond_3ad.c
@@ -2465,35 +2465,16 @@ out:
 	return NETDEV_TX_OK;
 }
 
-int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype, struct net_device *orig_dev)
+void bond_3ad_lacpdu_recv(struct sk_buff *skb, struct bonding *bond,
+			  struct slave *slave)
 {
-	struct bonding *bond = netdev_priv(dev);
-	struct slave *slave = NULL;
-	int ret = NET_RX_DROP;
-
-	if (!(dev->flags & IFF_MASTER))
-		goto out;
-
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (!skb)
-		goto out;
+	if (skb->protocol != PKT_TYPE_LACPDU)
+		return;
 
 	if (!pskb_may_pull(skb, sizeof(struct lacpdu)))
-		goto out;
+		return;
 
 	read_lock(&bond->lock);
-	slave = bond_get_slave_by_dev(netdev_priv(dev), orig_dev);
-	if (!slave)
-		goto out_unlock;
-
 	bond_3ad_rx_indication((struct lacpdu *) skb->data, slave, skb->len);
-
-	ret = NET_RX_SUCCESS;
-
-out_unlock:
 	read_unlock(&bond->lock);
-out:
-	dev_kfree_skb(skb);
-
-	return ret;
 }
diff --git a/drivers/net/bonding/bond_3ad.h b/drivers/net/bonding/bond_3ad.h
index b28baff..291dbd4 100644
--- a/drivers/net/bonding/bond_3ad.h
+++ b/drivers/net/bonding/bond_3ad.h
@@ -258,7 +258,6 @@ struct ad_bond_info {
 				 * requested
 				 */
 	struct timer_list ad_timer;
-	struct packet_type ad_pkt_type;
 };
 
 struct ad_slave_info {
@@ -280,7 +279,8 @@ void bond_3ad_adapter_duplex_changed(struct slave *slave);
 void bond_3ad_handle_link_change(struct slave *slave, char link);
 int  bond_3ad_get_active_agg_info(struct bonding *bond, struct ad_info *ad_info);
 int bond_3ad_xmit_xor(struct sk_buff *skb, struct net_device *dev);
-int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type* ptype, struct net_device *orig_dev);
+void bond_3ad_lacpdu_recv(struct sk_buff *skb, struct bonding *bond,
+			  struct slave *slave);
 int bond_3ad_set_carrier(struct bonding *bond);
 #endif //__BOND_3AD_H__
 
diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index ba71582..3b7b040 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -308,49 +308,33 @@ static void rlb_update_entry_from_arp(struct bonding *bond, struct arp_pkt *arp)
 	_unlock_rx_hashtbl(bond);
 }
 
-static int rlb_arp_recv(struct sk_buff *skb, struct net_device *bond_dev, struct packet_type *ptype, struct net_device *orig_dev)
+static void rlb_arp_recv(struct sk_buff *skb, struct bonding *bond,
+			 struct slave *slave)
 {
-	struct bonding *bond;
-	struct arp_pkt *arp = (struct arp_pkt *)skb->data;
-	int res = NET_RX_DROP;
+	struct arp_pkt *arp;
 
-	while (bond_dev->priv_flags & IFF_802_1Q_VLAN)
-		bond_dev = vlan_dev_real_dev(bond_dev);
-
-	if (!(bond_dev->priv_flags & IFF_BONDING) ||
-	    !(bond_dev->flags & IFF_MASTER))
-		goto out;
+	if (skb->protocol != cpu_to_be16(ETH_P_ARP))
+		return;
 
+	arp = (struct arp_pkt *) skb->data;
 	if (!arp) {
 		pr_debug("Packet has no ARP data\n");
-		goto out;
+		return;
 	}
 
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (!skb)
-		goto out;
-
-	if (!pskb_may_pull(skb, arp_hdr_len(bond_dev)))
-		goto out;
+	if (!pskb_may_pull(skb, arp_hdr_len(bond->dev)))
+		return;
 
 	if (skb->len < sizeof(struct arp_pkt)) {
 		pr_debug("Packet is too small to be an ARP\n");
-		goto out;
+		return;
 	}
 
 	if (arp->op_code == htons(ARPOP_REPLY)) {
 		/* update rx hash table for this ARP */
-		bond = netdev_priv(bond_dev);
 		rlb_update_entry_from_arp(bond, arp);
 		pr_debug("Server received an ARP Reply from client\n");
 	}
-
-	res = NET_RX_SUCCESS;
-
-out:
-	dev_kfree_skb(skb);
-
-	return res;
 }
 
 /* Caller must hold bond lock for read */
@@ -759,7 +743,6 @@ static void rlb_init_table_entry(struct rlb_client_info *entry)
 static int rlb_initialize(struct bonding *bond)
 {
 	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
-	struct packet_type *pk_type = &(BOND_ALB_INFO(bond).rlb_pkt_type);
 	struct rlb_client_info	*new_hashtbl;
 	int size = RLB_HASH_TABLE_SIZE * sizeof(struct rlb_client_info);
 	int i;
@@ -784,13 +767,8 @@ static int rlb_initialize(struct bonding *bond)
 
 	_unlock_rx_hashtbl(bond);
 
-	/*initialize packet type*/
-	pk_type->type = cpu_to_be16(ETH_P_ARP);
-	pk_type->dev = bond->dev;
-	pk_type->func = rlb_arp_recv;
-
 	/* register to receive ARPs */
-	dev_add_pack(pk_type);
+	bond->recv_probe = rlb_arp_recv;
 
 	return 0;
 }
@@ -799,8 +777,6 @@ static void rlb_deinitialize(struct bonding *bond)
 {
 	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
 
-	dev_remove_pack(&(bond_info->rlb_pkt_type));
-
 	_lock_rx_hashtbl(bond);
 
 	kfree(bond_info->rx_hashtbl);
diff --git a/drivers/net/bonding/bond_alb.h b/drivers/net/bonding/bond_alb.h
index 8ca7158..90f140a 100644
--- a/drivers/net/bonding/bond_alb.h
+++ b/drivers/net/bonding/bond_alb.h
@@ -129,7 +129,6 @@ struct alb_bond_info {
 	int			lp_counter;
 	/* -------- rlb parameters -------- */
 	int rlb_enabled;
-	struct packet_type	rlb_pkt_type;
 	struct rlb_client_info	*rx_hashtbl;	/* Receive hash table */
 	spinlock_t		rx_hashtbl_lock;
 	u32			rx_hashtbl_head;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 4ce14bd..66d9dc6 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1439,27 +1439,17 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
 }
 
 /* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
- * ARP on active-backup slaves with arp_validate enabled.
+ * duplicates except for alb non-mcast/bcast.
  */
 static bool bond_should_deliver_exact_match(struct sk_buff *skb,
 					    struct slave *slave,
 					    struct bonding *bond)
 {
 	if (bond_is_slave_inactive(slave)) {
-		if (slave_do_arp_validate(bond, slave) &&
-		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
-			return false;
-
 		if (bond->params.mode == BOND_MODE_ALB &&
 		    skb->pkt_type != PACKET_BROADCAST &&
 		    skb->pkt_type != PACKET_MULTICAST)
-				return false;
-
-		if (bond->params.mode == BOND_MODE_8023AD &&
-		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
 			return false;
-
 		return true;
 	}
 	return false;
@@ -1483,6 +1473,15 @@ static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
 	if (bond->params.arp_interval)
 		slave->dev->last_rx = jiffies;
 
+	if (bond->recv_probe) {
+		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+		if (likely(nskb)) {
+			bond->recv_probe(nskb, bond, slave);
+			dev_kfree_skb(nskb);
+		}
+	}
+
 	if (bond_should_deliver_exact_match(skb, slave, bond)) {
 		return RX_HANDLER_EXACT;
 	}
@@ -2743,48 +2742,26 @@ static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32
 	}
 }
 
-static int bond_arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+static void bond_arp_rcv(struct sk_buff *skb, struct bonding *bond,
+			 struct slave *slave)
 {
 	struct arphdr *arp;
-	struct slave *slave;
-	struct bonding *bond;
 	unsigned char *arp_ptr;
 	__be32 sip, tip;
 
-	if (dev->priv_flags & IFF_802_1Q_VLAN) {
-		/*
-		 * When using VLANS and bonding, dev and oriv_dev may be
-		 * incorrect if the physical interface supports VLAN
-		 * acceleration.  With this change ARP validation now
-		 * works for hosts only reachable on the VLAN interface.
-		 */
-		dev = vlan_dev_real_dev(dev);
-		orig_dev = dev_get_by_index_rcu(dev_net(skb->dev),skb->skb_iif);
-	}
-
-	if (!(dev->priv_flags & IFF_BONDING) || !(dev->flags & IFF_MASTER))
-		goto out;
+	if (skb->protocol != __cpu_to_be16(ETH_P_ARP))
+		return;
 
-	bond = netdev_priv(dev);
 	read_lock(&bond->lock);
 
-	pr_debug("bond_arp_rcv: bond %s skb->dev %s orig_dev %s\n",
-		 bond->dev->name, skb->dev ? skb->dev->name : "NULL",
-		 orig_dev ? orig_dev->name : "NULL");
+	pr_debug("bond_arp_rcv: bond %s skb->dev %s\n",
+		 bond->dev->name, skb->dev->name);
 
-	slave = bond_get_slave_by_dev(bond, orig_dev);
-	if (!slave || !slave_do_arp_validate(bond, slave))
-		goto out_unlock;
-
-	skb = skb_share_check(skb, GFP_ATOMIC);
-	if (!skb)
-		goto out_unlock;
-
-	if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+	if (!pskb_may_pull(skb, arp_hdr_len(bond->dev)))
 		goto out_unlock;
 
 	arp = arp_hdr(skb);
-	if (arp->ar_hln != dev->addr_len ||
+	if (arp->ar_hln != bond->dev->addr_len ||
 	    skb->pkt_type == PACKET_OTHERHOST ||
 	    skb->pkt_type == PACKET_LOOPBACK ||
 	    arp->ar_hrd != htons(ARPHRD_ETHER) ||
@@ -2793,9 +2770,9 @@ static int bond_arp_rcv(struct sk_buff *skb, struct net_device *dev, struct pack
 		goto out_unlock;
 
 	arp_ptr = (unsigned char *)(arp + 1);
-	arp_ptr += dev->addr_len;
+	arp_ptr += bond->dev->addr_len;
 	memcpy(&sip, arp_ptr, 4);
-	arp_ptr += 4 + dev->addr_len;
+	arp_ptr += 4 + bond->dev->addr_len;
 	memcpy(&tip, arp_ptr, 4);
 
 	pr_debug("bond_arp_rcv: %s %s/%d av %d sv %d sip %pI4 tip %pI4\n",
@@ -2818,9 +2795,6 @@ static int bond_arp_rcv(struct sk_buff *skb, struct net_device *dev, struct pack
 
 out_unlock:
 	read_unlock(&bond->lock);
-out:
-	dev_kfree_skb(skb);
-	return NET_RX_SUCCESS;
 }
 
 /*
@@ -3407,48 +3381,6 @@ static struct notifier_block bond_inetaddr_notifier = {
 	.notifier_call = bond_inetaddr_event,
 };
 
-/*-------------------------- Packet type handling ---------------------------*/
-
-/* register to receive lacpdus on a bond */
-static void bond_register_lacpdu(struct bonding *bond)
-{
-	struct packet_type *pk_type = &(BOND_AD_INFO(bond).ad_pkt_type);
-
-	/* initialize packet type */
-	pk_type->type = PKT_TYPE_LACPDU;
-	pk_type->dev = bond->dev;
-	pk_type->func = bond_3ad_lacpdu_recv;
-
-	dev_add_pack(pk_type);
-}
-
-/* unregister to receive lacpdus on a bond */
-static void bond_unregister_lacpdu(struct bonding *bond)
-{
-	dev_remove_pack(&(BOND_AD_INFO(bond).ad_pkt_type));
-}
-
-void bond_register_arp(struct bonding *bond)
-{
-	struct packet_type *pt = &bond->arp_mon_pt;
-
-	if (pt->type)
-		return;
-
-	pt->type = htons(ETH_P_ARP);
-	pt->dev = bond->dev;
-	pt->func = bond_arp_rcv;
-	dev_add_pack(pt);
-}
-
-void bond_unregister_arp(struct bonding *bond)
-{
-	struct packet_type *pt = &bond->arp_mon_pt;
-
-	dev_remove_pack(pt);
-	pt->type = 0;
-}
-
 /*---------------------------- Hashing Policies -----------------------------*/
 
 /*
@@ -3542,14 +3474,14 @@ static int bond_open(struct net_device *bond_dev)
 
 		queue_delayed_work(bond->wq, &bond->arp_work, 0);
 		if (bond->params.arp_validate)
-			bond_register_arp(bond);
+			bond->recv_probe = bond_arp_rcv;
 	}
 
 	if (bond->params.mode == BOND_MODE_8023AD) {
 		INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);
 		queue_delayed_work(bond->wq, &bond->ad_work, 0);
 		/* register to receive LACPDUs */
-		bond_register_lacpdu(bond);
+		bond->recv_probe = bond_3ad_lacpdu_recv;
 		bond_3ad_initiate_agg_selection(bond, 1);
 	}
 
@@ -3560,14 +3492,6 @@ static int bond_close(struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 
-	if (bond->params.mode == BOND_MODE_8023AD) {
-		/* Unregister the receive of LACPDUs */
-		bond_unregister_lacpdu(bond);
-	}
-
-	if (bond->params.arp_validate)
-		bond_unregister_arp(bond);
-
 	write_lock_bh(&bond->lock);
 
 	/* signal timers not to re-arm */
@@ -3604,6 +3528,7 @@ static int bond_close(struct net_device *bond_dev)
 		 */
 		bond_alb_deinitialize(bond);
 	}
+	bond->recv_probe = NULL;
 
 	return 0;
 }
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 259ff32..935406a 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -422,11 +422,6 @@ static ssize_t bonding_store_arp_validate(struct device *d,
 		bond->dev->name, arp_validate_tbl[new_value].modename,
 		new_value);
 
-	if (!bond->params.arp_validate && new_value)
-		bond_register_arp(bond);
-	else if (bond->params.arp_validate && !new_value)
-		bond_unregister_arp(bond);
-
 	bond->params.arp_validate = new_value;
 
 	return count;
@@ -923,7 +918,6 @@ static ssize_t bonding_store_miimon(struct device *d,
 				bond->dev->name);
 			bond->params.arp_interval = 0;
 			if (bond->params.arp_validate) {
-				bond_unregister_arp(bond);
 				bond->params.arp_validate =
 					BOND_ARP_VALIDATE_NONE;
 			}
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 6126c6a..85fb822 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -226,6 +226,8 @@ struct bonding {
 	struct   slave *primary_slave;
 	bool     force_primary;
 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
+	void     (*recv_probe)(struct sk_buff *, struct bonding *,
+			       struct slave *);
 	rwlock_t lock;
 	rwlock_t curr_slave_lock;
 	s8       kill_timers;
@@ -399,8 +401,6 @@ void bond_set_mode_ops(struct bonding *bond, int mode);
 int bond_parse_parm(const char *mode_arg, const struct bond_parm_tbl *tbl);
 void bond_select_active_slave(struct bonding *bond);
 void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
-void bond_register_arp(struct bonding *);
-void bond_unregister_arp(struct bonding *);
 void bond_create_debugfs(void);
 void bond_destroy_debugfs(void);
 void bond_debug_register(struct bonding *bond);
diff --git a/net/core/dev.c b/net/core/dev.c
index 3871bf6..f3aed72 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3076,25 +3076,6 @@ void netdev_rx_handler_unregister(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 
-static void vlan_on_bond_hook(struct sk_buff *skb)
-{
-	/*
-	 * Make sure ARP frames received on VLAN interfaces stacked on
-	 * bonding interfaces still make their way to any base bonding
-	 * device that may have registered for a specific ptype.
-	 */
-	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
-	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
-	    skb->protocol == htons(ETH_P_ARP)) {
-		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
-
-		if (!skb2)
-			return;
-		skb2->dev = vlan_dev_real_dev(skb->dev);
-		netif_rx(skb2);
-	}
-}
-
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
@@ -3190,8 +3171,6 @@ ncls:
 			goto out;
 	}
 
-	vlan_on_bond_hook(skb);
-
 	/* deliver only exact match when indicated */
 	null_or_dev = deliver_exact ? skb->dev : NULL;
 
-- 
1.7.4.2


^ permalink raw reply related

* Re: [PATCH] net: xen-netback: convert to hw_features
From: Michał Mirosław @ 2011-04-19 13:43 UTC (permalink / raw)
  To: Ian Campbell; +Cc: netdev@vger.kernel.org, xen-devel@lists.xensource.com
In-Reply-To: <1303220340.5997.197.camel@zakaz.uk.xensource.com>

On Tue, Apr 19, 2011 at 02:39:00PM +0100, Ian Campbell wrote:
> On Tue, 2011-04-19 at 14:30 +0100, Michał Mirosław wrote:
> > On Tue, Apr 19, 2011 at 02:17:53PM +0100, Ian Campbell wrote:
> > > I fixed it with the following, I also moved the !can_sg MTU clamping
> > > into a set_features hook (like we do with netfront). Am I right that
> > > this pattern copes with changes to SG via ethtool etc better? I think
> > > it's more future proof in any case.
> > This looks wrong. Even if SG is turned on, you might get big skbs which
> > are linearized. There is a difference in SG capability and SG offload
> > status and as I see it the capability is what you need to test for MTU.
> So the existing stuff in drivers/net/xen-netfront.c is wrong too?

Looks like it. But I don't really know what are the real constraints for MTU.
What I know is that SG even if turned on needs not be used (and currently
it's not e.g. if checksum offload is disabled). So MTU setting should not
depend on SG offload state but on some capability.

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: [PATCH v2] net: xen-netback: convert to hw_features
From: Michał Mirosław @ 2011-04-19 13:39 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, Ian Campbell, xen-devel
In-Reply-To: <20110419133506.D332413909@rere.qmqm.pl>

On Tue, Apr 19, 2011 at 03:35:06PM +0200, Michał Mirosław wrote:
> Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>

David, I was too quick with this, so please wait with this patch until Ian
acks it again.

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: [PATCH] net: xen-netback: convert to hw_features
From: Ian Campbell @ 2011-04-19 13:39 UTC (permalink / raw)
  To: Michał Mirosław
  Cc: netdev@vger.kernel.org, xen-devel@lists.xensource.com
In-Reply-To: <20110419133019.GA3973@rere.qmqm.pl>

On Tue, 2011-04-19 at 14:30 +0100, Michał Mirosław wrote:
> On Tue, Apr 19, 2011 at 02:17:53PM +0100, Ian Campbell wrote:
> > On Tue, 2011-04-19 at 12:56 +0100, Michał Mirosław wrote:
> > > Signed-off-by: Michał Mirosław <mirq-linux@rere.qmqm.pl>
> > Thanks for beating me to this! However the prototype for
> > xenvif_fix_features is wrong (needs to take a net_device not a xenvif).
> 
> I'll resend v2 with this fix.
> 
> > I fixed it with the following, I also moved the !can_sg MTU clamping
> > into a set_features hook (like we do with netfront). Am I right that
> > this pattern copes with changes to SG via ethtool etc better? I think
> > it's more future proof in any case.
> 
> This looks wrong. Even if SG is turned on, you might get big skbs which
> are linearized. There is a difference in SG capability and SG offload
> status and as I see it the capability is what you need to test for MTU.

So the existing stuff in drivers/net/xen-netfront.c is wrong too?

> > NB: I'm having some issues with my test hardware at the moment so this
> > is reviewed by eye and compile tested only...
> > 
> > I'm also happy for this to be folded into the original with my
> > "Signed-off-/Acked-by Ian Campbell <ian.campbell@citrix.com>" if that is
> > preferable.
> 
> Thanks.
> 
> Best Regards,
> Michał Mirosław



^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox