Netdev List
 help / color / mirror / Atom feed
* [PATCHv2] fragment locally-generated IPsec6 packets that need it
From: David L Stevens @ 2010-12-14 18:32 UTC (permalink / raw)
  To: Herbert Xu, davem; +Cc: netdev

This patch modifies IPsec6 to fragment IPv6 packets that are
locally generated as needed.

Signed-off-by: David L Stevens <dlstevens@us.ibm.com>

diff -ruNp linux-2.6.36-rc8/include/net/ip6_route.h linux-2.6.36-rc8DLS/include/net/ip6_route.h
--- linux-2.6.36-rc8/include/net/ip6_route.h	2010-10-14 16:26:43.000000000 -0700
+++ linux-2.6.36-rc8DLS/include/net/ip6_route.h	2010-12-12 09:22:48.582141401 -0800
@@ -164,5 +164,15 @@ static inline int ipv6_unicast_destinati
 	return rt->rt6i_flags & RTF_LOCAL;
 }
 
+int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
+
+static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
+{
+	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+
+	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
+	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
+}
+
 #endif
 #endif
diff -ruNp linux-2.6.36-rc8/net/ipv6/ip6_output.c linux-2.6.36-rc8DLS/net/ipv6/ip6_output.c
--- linux-2.6.36-rc8/net/ipv6/ip6_output.c	2010-10-14 16:26:43.000000000 -0700
+++ linux-2.6.36-rc8DLS/net/ipv6/ip6_output.c	2010-12-14 09:51:45.260779308 -0800
@@ -56,7 +56,7 @@
 #include <net/checksum.h>
 #include <linux/mroute6.h>
 
-static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
+int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 int __ip6_local_out(struct sk_buff *skb)
 {
@@ -145,14 +145,6 @@ static int ip6_finish_output2(struct sk_
 	return -EINVAL;
 }
 
-static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
-{
-	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
-
-	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
-	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
-}
-
 static int ip6_finish_output(struct sk_buff *skb)
 {
 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
@@ -601,7 +593,7 @@ int ip6_find_1stfragopt(struct sk_buff *
 	return offset;
 }
 
-static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
+int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 {
 	struct sk_buff *frag;
 	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
diff -ruNp linux-2.6.36-rc8/net/ipv6/xfrm6_output.c linux-2.6.36-rc8DLS/net/ipv6/xfrm6_output.c
--- linux-2.6.36-rc8/net/ipv6/xfrm6_output.c	2010-10-14 16:26:43.000000000 -0700
+++ linux-2.6.36-rc8DLS/net/ipv6/xfrm6_output.c	2010-12-12 09:30:21.019560623 -0800
@@ -17,6 +17,7 @@
 #include <linux/netfilter_ipv6.h>
 #include <net/dst.h>
 #include <net/ipv6.h>
+#include <net/ip6_route.h>
 #include <net/xfrm.h>
 
 int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb,
@@ -88,8 +89,17 @@ static int xfrm6_output_finish(struct sk
 	return xfrm_output(skb);
 }
 
-int xfrm6_output(struct sk_buff *skb)
+static int __xfrm6_output(struct sk_buff *skb)
 {
 	return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL,
 		       skb_dst(skb)->dev, xfrm6_output_finish);
 }
+
+int xfrm6_output(struct sk_buff *skb)
+{
+	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
+		dst_allfrag(skb_dst(skb))) {
+			return ip6_fragment(skb, __xfrm6_output);
+	}
+	return __xfrm6_output(skb);
+}




^ permalink raw reply

* RE: [PATCH] ixgb: Convert to new vlan model.
From: Ben Hutchings @ 2010-12-14 18:12 UTC (permalink / raw)
  To: Tantilov, Emil S
  Cc: Jesse Gross, David Miller, netdev@vger.kernel.org,
	Kirsher, Jeffrey T, Duyck, Alexander H
In-Reply-To: <EA929A9653AAE14F841771FB1DE5A136602D675056@rrsmsx501.amr.corp.intel.com>

On Tue, 2010-12-14 at 11:09 -0700, Tantilov, Emil S wrote:
> Ben Hutchings wrote:
> > On Mon, 2010-12-13 at 19:42 -0800, Jesse Gross wrote:
> >> This switches the ixgb driver to use the new vlan interfaces.
> >> In doing this, it completes the work begun in
> >> ae54496f9e8d40c89e5668205c181dccfa9ecda1 allowing the use of
> >> hardware vlan insertion without having a vlan group configured. [...]
> >> diff --git a/drivers/net/ixgb/ixgb_ethtool.c
> >> b/drivers/net/ixgb/ixgb_ethtool.c 
> >> index 43994c1..0e4c527 100644
> >> --- a/drivers/net/ixgb/ixgb_ethtool.c
> >> +++ b/drivers/net/ixgb/ixgb_ethtool.c
> >> @@ -706,6 +706,45 @@ ixgb_get_strings(struct net_device *netdev, u32
> >>  stringset, u8 *data)  	} }
> >> 
> >> +static int ixgb_set_flags(struct net_device *netdev, u32 data) +{
> >> +	struct ixgb_adapter *adapter = netdev_priv(netdev); +	bool
> >> need_reset; +	int rc;
> >> +
> >> +	/* The hardware requires that RX vlan stripping and TX vlan
> >> insertion +	 * be configured together.  Therefore, if one setting
> >> changes adjust the +	 * other one to match. +	 */
> >> +	if (!!(data & ETH_FLAG_RXVLAN) != !!(data & ETH_FLAG_TXVLAN)) {
> >> +		if ((data & ETH_FLAG_RXVLAN) !=
> >> +		    (netdev->features & NETIF_F_HW_VLAN_RX))
> >> +			data ^= ETH_FLAG_TXVLAN;
> >> +		else if ((data & ETH_FLAG_TXVLAN) !=
> >> +		    (netdev->features & NETIF_F_HW_VLAN_TX))
> >> +			data ^= ETH_FLAG_RXVLAN;
> >> +	}
> > [...]
> > 
> > I think this should reject attempts to change just one flag with
> > -EINVAL, rather than quietly 'fixing' the setting.
> > 
> > Ben.
> 
> I'm not sure this is a good idea. At least not without some sort of 
> explanation. Since there is no way for the user to know that he needs
> to disable both.

Document the limitation in Documentation/networking/ixgb.txt.  You could
also send a patch for the ethtool manual page stating that this
restriction might exist.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* RE: [PATCH] ixgb: Convert to new vlan model.
From: Tantilov, Emil S @ 2010-12-14 18:09 UTC (permalink / raw)
  To: Ben Hutchings, Jesse Gross
  Cc: David Miller, netdev@vger.kernel.org, Kirsher, Jeffrey T,
	Duyck, Alexander H
In-Reply-To: <1292344315.20458.6.camel@bwh-desktop>

Ben Hutchings wrote:
> On Mon, 2010-12-13 at 19:42 -0800, Jesse Gross wrote:
>> This switches the ixgb driver to use the new vlan interfaces.
>> In doing this, it completes the work begun in
>> ae54496f9e8d40c89e5668205c181dccfa9ecda1 allowing the use of
>> hardware vlan insertion without having a vlan group configured. [...]
>> diff --git a/drivers/net/ixgb/ixgb_ethtool.c
>> b/drivers/net/ixgb/ixgb_ethtool.c 
>> index 43994c1..0e4c527 100644
>> --- a/drivers/net/ixgb/ixgb_ethtool.c
>> +++ b/drivers/net/ixgb/ixgb_ethtool.c
>> @@ -706,6 +706,45 @@ ixgb_get_strings(struct net_device *netdev, u32
>>  stringset, u8 *data)  	} }
>> 
>> +static int ixgb_set_flags(struct net_device *netdev, u32 data) +{
>> +	struct ixgb_adapter *adapter = netdev_priv(netdev); +	bool
>> need_reset; +	int rc;
>> +
>> +	/* The hardware requires that RX vlan stripping and TX vlan
>> insertion +	 * be configured together.  Therefore, if one setting
>> changes adjust the +	 * other one to match. +	 */
>> +	if (!!(data & ETH_FLAG_RXVLAN) != !!(data & ETH_FLAG_TXVLAN)) {
>> +		if ((data & ETH_FLAG_RXVLAN) !=
>> +		    (netdev->features & NETIF_F_HW_VLAN_RX))
>> +			data ^= ETH_FLAG_TXVLAN;
>> +		else if ((data & ETH_FLAG_TXVLAN) !=
>> +		    (netdev->features & NETIF_F_HW_VLAN_TX))
>> +			data ^= ETH_FLAG_RXVLAN;
>> +	}
> [...]
> 
> I think this should reject attempts to change just one flag with
> -EINVAL, rather than quietly 'fixing' the setting.
> 
> Ben.

I'm not sure this is a good idea. At least not without some sort of 
explanation. Since there is no way for the user to know that he needs
to disable both.

Thanks,
Emil

^ permalink raw reply

* RE: [PATCH 10/27] ixgb: Don't check for vlan group on transmit
From: Tantilov, Emil S @ 2010-12-14 18:00 UTC (permalink / raw)
  To: Jesse Gross
  Cc: Kirsher, Jeffrey T, davem@davemloft.net, netdev@vger.kernel.org,
	gospo@redhat.com, bphilips@novell.com
In-Reply-To: <AANLkTi=9+F-dxeZrvnTUXHjFniQdtxzHyCcdvMpn-ygf@mail.gmail.com>

Jesse Gross wrote:
> On Mon, Dec 13, 2010 at 10:43 AM, Tantilov, Emil S
> <emil.s.tantilov@intel.com> wrote:
>>> -----Original Message-----
>>> From: Jesse Gross [mailto:jesse@nicira.com]
>>> Sent: Sunday, December 12, 2010 5:01 PM
>>> To: Kirsher, Jeffrey T
>>> Cc: davem@davemloft.net; Tantilov, Emil S; netdev@vger.kernel.org;
>>> gospo@redhat.com; bphilips@novell.com
>>> Subject: Re: [PATCH 10/27] ixgb: Don't check for vlan group on
>>> transmit 
>>> 
>>> On Fri, Dec 10, 2010 at 10:17 PM, Jeff Kirsher
>>> <jeffrey.t.kirsher@intel.com> wrote:
>>>> From: Emil Tantilov <emil.s.tantilov@intel.com>
>>>> 
>>>> Based on a patch from Jesse Gross.
>>>> 
>>>> Enable vlan tag insertion even when vlan group is not configured.
>>>> 
>>>> For ixgb HW both CTRL0.VME and VLE bit in the Tx descriptor need
>>>> to be set in order to enable HW acceleration.
>>>> 
>>>> Introduced separate functions for enabling/disabling of vlan tag
>>>> stripping similar to ixgbe.
>>> 
>>> Thanks for working on this.  However, I don't think that this patch
>>> actually does what it says. 
>>> 
>>> In ixgb_xmit_frame() it's still checking whether adapter->vlgrp is
>>> non-null before inserting a tag, so it will drop tags unless a vlan
>>> group is configured.  Also, since it's not currently possible to
>>> toggle NETIF_F_HW_VLAN_RX, vlan stripping will never get disabled.
>>> This is actually a regression since before vlan stripping would get
>>> disabled if no vlan group was configured.  Now, vlan headers will
>>> get silently dropped if there is no vlan group.
>> I'm sorry. This patch was supposed to include your original patch
>> that 
>> removed the vlgrp check on Tx. Somehow that didn't make it (I may
>> have generated the patch from the wrong branch). 
>> 
>>> Regardless of that, I still think this is a useful change on the
>>> road towards adopting the new vlan interfaces, the problem is just
>>> that currently it's halfway in between the old and the new.  Given
>>> that, it would obviously be much better to move all the way over
>>> the new when addressing this.
>> Since this patch is already applied can you submit your change again?
> 
> Sure.  I think it's probably best to just complete the conversion to
> the new vlan interfaces, so I went ahead and did that.  It's much
> easier now that you've pulled the hardware specific bits out for
> enabling/disabling vlan offloading.  I'll send out that patch shortly
> - please take a look since I don't have the hardware to test it.

Thanks Jesse!

I'll ask Jeff to pick the patch in his tree and we'll run some tests.

Emil

^ permalink raw reply

* Re: [PATCH] fragment locally-generated IPsec6 packets that need it
From: David Stevens @ 2010-12-14 17:56 UTC (permalink / raw)
  To: Herbert Xu; +Cc: davem, netdev
In-Reply-To: <20101214010929.GA10354@gondor.apana.org.au>

Herbert Xu <herbert@gondor.apana.org.au> wrote on 12/13/2010 05:09:29 PM:

> > +EXPORT_SYMBOL_GPL(ip6_fragment);
> 
> There is no need to export this as xfrm6_output.c is always part
> of ipv6.o.

Herbert, thanks! I'll remove it and repost.

                                        +-DLS


^ permalink raw reply

* Re: [PATCH net-next-2.6] bnx2: remove cancel_work_sync() from remove_one
From: Michael Chan @ 2010-12-14 17:48 UTC (permalink / raw)
  To: Tejun Heo; +Cc: lkml, David S. Miller, netdev
In-Reply-To: <4D0796D7.3030309@kernel.org>


On Tue, 2010-12-14 at 08:09 -0800, Tejun Heo wrote:
> Michael pointed out that bnx2_close() already cancels bp->reset_task
> and thus it is guaranteed to be idle when bnx2_remove_one() is called.
> Remove the unnecessary cancel_work_sync() in remove_one.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: Michael Chan <mchan@broadcom.com>

Acked-by: Michael Chan <mchan@broadcom.com>

> ---
>  drivers/net/bnx2.c |    2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
> index 5c811f3..85fc2c8 100644
> --- a/drivers/net/bnx2.c
> +++ b/drivers/net/bnx2.c
> @@ -8393,8 +8393,6 @@ bnx2_remove_one(struct pci_dev *pdev)
>  	struct net_device *dev = pci_get_drvdata(pdev);
>  	struct bnx2 *bp = netdev_priv(dev);
> 
> -	cancel_work_sync(&bp->reset_task);
> -
>  	unregister_netdev(dev);
> 
>  	if (bp->mips_firmware)
> 

^ permalink raw reply

* [PATCH net-2.6 v2] net: fix nulls list corruptions in sk_prot_alloc
From: Octavian Purdila @ 2010-12-14 17:19 UTC (permalink / raw)
  To: netdev; +Cc: Octavian Purdila, Leonard Crestez, Eric Dumazet, stable

Special care is taken inside sk_port_alloc to avoid overwriting
skc_node/skc_nulls_node. We should also avoid overwriting
skc_bind_node/skc_portaddr_node.

The patch fixes the following crash:

 BUG: unable to handle kernel paging request at fffffffffffffff0
 IP: [<ffffffff812ec6dd>] udp4_lib_lookup2+0xad/0x370
 [<ffffffff812ecc22>] __udp4_lib_lookup+0x282/0x360
 [<ffffffff812ed63e>] __udp4_lib_rcv+0x31e/0x700
 [<ffffffff812bba45>] ? ip_local_deliver_finish+0x65/0x190
 [<ffffffff812bbbf8>] ? ip_local_deliver+0x88/0xa0
 [<ffffffff812eda35>] udp_rcv+0x15/0x20
 [<ffffffff812bba45>] ip_local_deliver_finish+0x65/0x190
 [<ffffffff812bbbf8>] ip_local_deliver+0x88/0xa0
 [<ffffffff812bb2cd>] ip_rcv_finish+0x32d/0x6f0
 [<ffffffff8128c14c>] ? netif_receive_skb+0x99c/0x11c0
 [<ffffffff812bb94b>] ip_rcv+0x2bb/0x350
 [<ffffffff8128c14c>] netif_receive_skb+0x99c/0x11c0

Signed-off-by: Leonard Crestez <lcrestez@ixiacom.com>
Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: stable@kernel.org
---
 include/net/sock.h |    3 +++
 net/core/sock.c    |   47 +++++++++++++++++++++++++++++++++++------------
 net/ipv4/udp.c     |    1 +
 net/ipv4/udplite.c |    1 +
 net/ipv6/udp.c     |    1 +
 net/ipv6/udplite.c |    1 +
 6 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 659d968..7d3f7ce 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -754,6 +754,7 @@ struct proto {
 	void			(*unhash)(struct sock *sk);
 	void			(*rehash)(struct sock *sk);
 	int			(*get_port)(struct sock *sk, unsigned short snum);
+	void			(*clear_sk)(struct sock *sk, int size);
 
 	/* Keeping track of sockets in use */
 #ifdef CONFIG_PROC_FS
@@ -852,6 +853,8 @@ static inline void __sk_prot_rehash(struct sock *sk)
 	sk->sk_prot->hash(sk);
 }
 
+void sk_prot_clear_portaddr_nulls(struct sock *sk, int size);
+
 /* About 10 seconds */
 #define SOCK_DESTROY_TIME (10*HZ)
 
diff --git a/net/core/sock.c b/net/core/sock.c
index fb60801..d931388 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1009,6 +1009,36 @@ static void sock_copy(struct sock *nsk, const struct sock *osk)
 #endif
 }
 
+/*
+ * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
+ * un-modified. Special care is taken when initializing object to zero.
+ */
+static inline void sk_prot_clear_nulls(struct sock *sk, int size)
+{
+	if (offsetof(struct sock, sk_node.next) != 0)
+		memset(sk, 0, offsetof(struct sock, sk_node.next));
+	memset(&sk->sk_node.pprev, 0,
+	       size - offsetof(struct sock, sk_node.pprev));
+}
+
+void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
+{
+	unsigned long nulls1, nulls2;
+
+	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
+	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
+	if (nulls1 > nulls2)
+		swap(nulls1, nulls2);
+
+	if (nulls1 != 0)
+		memset((char *)sk, 0, nulls1);
+	memset((char *)sk + nulls1 + sizeof(void *), 0,
+	       nulls2 - nulls1 - sizeof(void *));
+	memset((char *)sk + nulls2 + sizeof(void *), 0,
+	       size - nulls2 - sizeof(void *));
+}
+
+
 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 		int family)
 {
@@ -1021,19 +1051,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 		if (!sk)
 			return sk;
 		if (priority & __GFP_ZERO) {
-			/*
-			 * caches using SLAB_DESTROY_BY_RCU should let
-			 * sk_node.next un-modified. Special care is taken
-			 * when initializing object to zero.
-			 */
-			if (offsetof(struct sock, sk_node.next) != 0)
-				memset(sk, 0, offsetof(struct sock, sk_node.next));
-			memset(&sk->sk_node.pprev, 0,
-			       prot->obj_size - offsetof(struct sock,
-							 sk_node.pprev));
+			if (prot->clear_sk)
+				prot->clear_sk(sk, prot->obj_size);
+			else
+				sk_prot_clear_nulls(sk, prot->obj_size);
 		}
-	}
-	else
+	} else
 		sk = kmalloc(prot->obj_size, priority);
 
 	if (sk != NULL) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5e0a3a5..2d3ded4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1899,6 +1899,7 @@ struct proto udp_prot = {
 	.compat_setsockopt = compat_udp_setsockopt,
 	.compat_getsockopt = compat_udp_getsockopt,
 #endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
 };
 EXPORT_SYMBOL(udp_prot);
 
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index ab76aa9..aee9963 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -57,6 +57,7 @@ struct proto 	udplite_prot = {
 	.compat_setsockopt = compat_udp_setsockopt,
 	.compat_getsockopt = compat_udp_getsockopt,
 #endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
 };
 EXPORT_SYMBOL(udplite_prot);
 
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 91def93..cd6cb7c 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1477,6 +1477,7 @@ struct proto udpv6_prot = {
 	.compat_setsockopt = compat_udpv6_setsockopt,
 	.compat_getsockopt = compat_udpv6_getsockopt,
 #endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
 };
 
 static struct inet_protosw udpv6_protosw = {
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 5f48fad..986c4de 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -55,6 +55,7 @@ struct proto udplitev6_prot = {
 	.compat_setsockopt = compat_udpv6_setsockopt,
 	.compat_getsockopt = compat_udpv6_getsockopt,
 #endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
 };
 
 static struct inet_protosw udplite6_protosw = {
-- 
1.7.1


^ permalink raw reply related

* Re: [PATCH 05/19] c/r: documentation
From: Dan Smith @ 2010-12-14 16:46 UTC (permalink / raw)
  To: linux-api; +Cc: linux-mm, netdev, Dave Hansen
In-Reply-To: <1292343307-7870-5-git-send-email-danms@us.ibm.com>

DS> Cc: linux-api@vger.kernel.org
DS> Cc: linux-mm@kvack.org
DS> Cc: netdev@vger.kernel.org
DS> Signed-off-by: Dave Hansen <dave@linux.vnet.ibm.com>

My apologies to the above mentioned.  I didn't mean to git-send-email
this set.  Please ignore.

-- 
Dan Smith
IBM Linux Technology Center
email: danms@us.ibm.com

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] ixgb: Convert to new vlan model.
From: Ben Hutchings @ 2010-12-14 16:31 UTC (permalink / raw)
  To: Jesse Gross; +Cc: David Miller, netdev, Emil Tantilov, Jeff Kirsher, Alex Duyck
In-Reply-To: <1292298163-30343-1-git-send-email-jesse@nicira.com>

On Mon, 2010-12-13 at 19:42 -0800, Jesse Gross wrote:
> This switches the ixgb driver to use the new vlan interfaces.
> In doing this, it completes the work begun in
> ae54496f9e8d40c89e5668205c181dccfa9ecda1 allowing the use of
> hardware vlan insertion without having a vlan group configured.
[...]
> diff --git a/drivers/net/ixgb/ixgb_ethtool.c b/drivers/net/ixgb/ixgb_ethtool.c
> index 43994c1..0e4c527 100644
> --- a/drivers/net/ixgb/ixgb_ethtool.c
> +++ b/drivers/net/ixgb/ixgb_ethtool.c
> @@ -706,6 +706,45 @@ ixgb_get_strings(struct net_device *netdev, u32 stringset, u8 *data)
>  	}
>  }
>  
> +static int ixgb_set_flags(struct net_device *netdev, u32 data)
> +{
> +	struct ixgb_adapter *adapter = netdev_priv(netdev);
> +	bool need_reset;
> +	int rc;
> +
> +	/* The hardware requires that RX vlan stripping and TX vlan insertion
> +	 * be configured together.  Therefore, if one setting changes adjust the
> +	 * other one to match.
> +	 */
> +	if (!!(data & ETH_FLAG_RXVLAN) != !!(data & ETH_FLAG_TXVLAN)) {
> +		if ((data & ETH_FLAG_RXVLAN) !=
> +		    (netdev->features & NETIF_F_HW_VLAN_RX))
> +			data ^= ETH_FLAG_TXVLAN;
> +		else if ((data & ETH_FLAG_TXVLAN) !=
> +		    (netdev->features & NETIF_F_HW_VLAN_TX))
> +			data ^= ETH_FLAG_RXVLAN;
> +	}
[...]

I think this should reject attempts to change just one flag with
-EINVAL, rather than quietly 'fixing' the setting.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH net-2.6] net: fix nulls list corruptions in sk_prot_alloc
From: Octavian Purdila @ 2010-12-14 16:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Leonard Crestez
In-Reply-To: <1292343557.5934.22.camel@edumazet-laptop>

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tuesday 14 December 2010, 18:19:17

> Hmm very good catch, but why a so invasive patch ?
> 
> Only udp needs a special care.
> 
> Other protocols could use the default 'cleaner', you dont need to force
> them to use the default ;)
> 

Ah, OK now I get it. I thought that for protocols using non nulls lists we 
must clear .next but I now see that we can skip that.

So default cleaner can be the nulls cleaner and portaddr cleaner will be used 
for UDP and UDP lite. I'll rework the patch and post a new version.


^ permalink raw reply

* pull request: wireless-next-2.6 2010-12-14
From: John W. Linville @ 2010-12-14 16:19 UTC (permalink / raw)
  To: davem; +Cc: linux-wireless, netdev

Dave,

Here is another batch of updates intended for 2.6.38.  This batch
includes some mac80211 updates from Johannes Berg, Felix Fietkau,
and Helmut Schaa, some ath cleanups from Joe Perches, more b43 N-phy
updates from Rafał Miłecki, more ath5k updates from Nick Kossifidis,
some ath9k bits from the Atheros guys, and a smattering of other bits.
This also includes another wireless-2.6 pull to grab some prerequisites
for later patches.

Please let me know if there are problems!

Thanks,

John

---

The following changes since commit 0dbaee3b37e118a96bb7b8eb0d9bbaeeb46264be:

  net: Abstract default ADVMSS behind an accessor. (2010-12-13 12:52:14 -0800)

are available in the git repository at:
  git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next-2.6.git for-davem

Amitkumar Karwar (1):
      cfg80211: add some element IDs in enum ieee80211_eid

Ben Greear (3):
      ath9k: Make DMA warning in ath_stoprecv WARN_ON_ONCE.
      ath9k: Check for NULL sta in ath_tx_start
      mac80211: Show max number of probe tries in debug message.

Bruno Randolf (4):
      ath5k: Use EWMA factor of 1024 instead of 1000
      lib: Improve EWMA efficiency by using bitshifts
      nl80211/mac80211: Report signal average
      ath5k: Use capabilities information for the number of TX queues

David Kilroy (4):
      orinoco: allow IW_AUTH_MFP to pass through
      orinoco: initialise priv->hw before assigning the interrupt
      orinoco: clear countermeasure setting on commit
      orinoco: fix TKIP countermeasure behaviour

Eliad Peller (1):
      mac80211: fix dynamic-ps/pm_qos magic numbers

Felix Fietkau (4):
      mac80211: remove a redundant check
      mac80211: speed up AP probing using nullfunc frames
      ath9k: fix a DMA related race condition on reset
      mac80211: fix a compiler warning

Grumbach, Emmanuel (1):
      iwlagn: Enable PCI L1 ACTIVE state after uCode has been loaded

Helmut Schaa (4):
      mac80211: Update last_tx_rate only for data frames
      mac80211: Fix BUG in pskb_expand_head when transmitting shared skbs
      cfg80211: Add new BSS attribute ht_opmode
      mac80211: Apply ht_opmode changes in ieee80211_change_bss

Javier Cardona (5):
      nl80211/mac80211: define and allow configuring mesh element TTL
      mac80211: Fix compilation error when mesh is disabled
      ath5k: Fix beaconing in mesh mode
      ath5k: Prevent mesh interfaces from being counted as ad-hoc
      ath5k: Put the right tsf value in mesh beacons

Joe Perches (5):
      MAINTAINERS: Add ATH GENERIC UTILITIES
      ath: Add and use ath_printk and ath_<level>
      ath: Convert ath_print(.., ATH_DBG_FATAL to ath_err
      ath: Convert ath_print to ath_dbg
      ath: Fix ath_dbg access beyond array bound

Johannes Berg (4):
      mac80211: move mesh filter adjusting
      cfg80211: require add_virtual_intf to return new dev
      nl80211: refactor mesh parameter parsing
      cfg80211/mac80211: add mesh join/leave commands

John W. Linville (5):
      Merge branch 'wireless-next-2.6' of git://git.kernel.org/.../iwlwifi/iwlwifi-2.6
      ath5k: remove MODULE_VERSION
      Merge branch 'master' of git://git.kernel.org/.../linville/wireless-2.6
      ath: fix build break with ATH_DBG_WARN_ON_ONCE
      Merge branch 'master' of git://git.kernel.org/.../linville/wireless-next-2.6 into for-davem

Luis R. Rodriguez (2):
      ath9k: skip ATH9K_INT_TIM_TIMER when we are idle
      ath9k_hw: warn if we cannot change the power to the chip

Matteo Croce (1):
      ath9k: fix bug in tx power

Mohammed Shafi Shajakhan (3):
      ath9k: Properly use unlikely check macro
      ath9k: Parse DTIM period from mac80211
      ath9k: Remove dead code in recv.c

Nick Kossifidis (5):
      ath5k: Always write tx powertable on hw
      ath5k: Always free tx buffers before reset
      ath5k: Disable ANI during reset
      ath5k: Fix reporting of RX dma stop failure
      ath5k: Include tx ack reporting on hw flags

Rafał Miłecki (8):
      b43: N-PHY: update init tables
      b43: N-PHY: reorder and optimize tables initialization
      b43: N-PHY: implement own maskset
      b43: flush PHY writes when needed
      b43: N-PHY: silence warnings
      b43: set TMS to work with current band width for N-PHY
      b43: fix split of N-PHY devices into supported and not (based on PHY rev)
      b43: rename config option for N-PHY, drop BROKEN

Rajkumar Manoharan (1):
      ath9k: fix beacon resource related race condition

Sedat Dilek (1):
      ath5k: Fix modinfo does not list alias -> pci-id lines

Senthil Balasubramanian (1):
      ath9k: Fix STA disconnect issue due to received MIC failed bcast frames

Stanislaw Gruszka (2):
      iwl3945: prevent too frequent firmware resets
      iwlwifi: jiffies based tx queues watchdog

Sujith Manoharan (4):
      ath9k_htc: Cleanup device identification
      ath9k_htc: Add support for handling TX power configuration
      ath9k_htc: Fix panic on FW download failure
      ath9k_htc: Fix suspend/resume

Vasanthakumar Thiagarajan (26):
      ath9k_hw: Define hw version macros for AR9485
      ath9k_hw: Add initvals.h for AR9485
      ath9k_hw: Enable hw initialization for AR9485
      ath9k_hw: Initialize mode registers for AR9485
      ath9k_hw: Initialize tx/rx gain table from initvals.h for AR9485
      ath9k_hw: Eeeprom changes for AR9485
      ath9k_hw: Disable LDPC for AR9485
      ath9k: Disable TX STBC for AR9485
      ath9k: Enable extended synch for AR9485 to fix L0s recovery issue
      ath9k_hw: Find the maximum number of chains that hw supports
      ath9k: Configure pll control for AR9485
      ath9k_hw: Find chansel of AR_PHY_65NM_CH0_SYNTH7 for AR9485
      ath9k_hw: Add a helper function to get spur channel pointer from cal data for AR9003 family
      ath9k: Read spur channel information from eeprom for AR9485
      ath9k_hw: Configure xpa bias level for AR9485
      ath9k_hw: Read and configure antenna diversity control for AR9485
      ath9k_hw: Configure internal regulator for AR9485
      ath9k_hw: Read and configure turnning caps to regulate freq accuracy
      ath9k_hw: Configure power control only for the supported chains
      ath9k_hw: Program appropriate chianmask for AR9485 before starting AGC/IQ cal
      ath9k_hw: Define IQcal correction coefficient registers using index
      ath9k_hw: Add IQ cal changes for AR9485
      ath9k_hw: Program appropriate register for temperature compensation cal for AR9485
      ath9k_hw: Setup paprd only for supported chains
      ath9k_hw: Disable MRC CCK for AR9485
      ath9k: Add device id of AR9485 to pci table

Wey-Yi Guy (4):
      iwlagn: fix race condition when reprogram sta
      iwlagn: remove structure name reference to gen2
      iwlwifi: check for STATUS_EXIT_PENDING when send RXON command
      iwlagn: name change for bt_ch_announce module parameter

 MAINTAINERS                                      |    6 +
 drivers/net/wireless/ath/ath.h                   |  115 +++-
 drivers/net/wireless/ath/ath5k/base.c            |  103 ++-
 drivers/net/wireless/ath/ath5k/debug.c           |    1 -
 drivers/net/wireless/ath/ath5k/dma.c             |    2 +-
 drivers/net/wireless/ath/ath5k/pci.c             |    1 +
 drivers/net/wireless/ath/ath5k/phy.c             |   33 +-
 drivers/net/wireless/ath/ath5k/qcu.c             |    4 +-
 drivers/net/wireless/ath/ath9k/ahb.c             |    7 +-
 drivers/net/wireless/ath/ath9k/ani.c             |   99 ++--
 drivers/net/wireless/ath/ath9k/ar5008_phy.c      |  199 ++---
 drivers/net/wireless/ath/ath9k/ar9002_calib.c    |  220 +++---
 drivers/net/wireless/ath/ath9k/ar9002_hw.c       |    6 +-
 drivers/net/wireless/ath/ath9k/ar9002_mac.c      |   20 +-
 drivers/net/wireless/ath/ath9k/ar9003_calib.c    |  473 ++++++++---
 drivers/net/wireless/ath/ath9k/ar9003_eeprom.c   |  361 +++++----
 drivers/net/wireless/ath/ath9k/ar9003_eeprom.h   |    2 +
 drivers/net/wireless/ath/ath9k/ar9003_hw.c       |  278 +++++--
 drivers/net/wireless/ath/ath9k/ar9003_mac.c      |   16 +-
 drivers/net/wireless/ath/ath9k/ar9003_paprd.c    |   37 +-
 drivers/net/wireless/ath/ath9k/ar9003_phy.c      |  245 +++---
 drivers/net/wireless/ath/ath9k/ar9003_phy.h      |   70 ++-
 drivers/net/wireless/ath/ath9k/ar9485_initvals.h |  943 ++++++++++++++++++++++
 drivers/net/wireless/ath/ath9k/ath9k.h           |    2 +-
 drivers/net/wireless/ath/ath9k/beacon.c          |   79 +-
 drivers/net/wireless/ath/ath9k/calib.c           |   59 +-
 drivers/net/wireless/ath/ath9k/common.c          |    4 +-
 drivers/net/wireless/ath/ath9k/common.h          |    1 -
 drivers/net/wireless/ath/ath9k/eeprom.c          |    4 +-
 drivers/net/wireless/ath/ath9k/eeprom_4k.c       |   72 +-
 drivers/net/wireless/ath/ath9k/eeprom_9287.c     |   45 +-
 drivers/net/wireless/ath/ath9k/eeprom_def.c      |   79 +-
 drivers/net/wireless/ath/ath9k/gpio.c            |   20 +-
 drivers/net/wireless/ath/ath9k/hif_usb.c         |   43 +-
 drivers/net/wireless/ath/ath9k/hif_usb.h         |    2 +
 drivers/net/wireless/ath/ath9k/htc.h             |    3 +
 drivers/net/wireless/ath/ath9k/htc_drv_beacon.c  |   24 +-
 drivers/net/wireless/ath/ath9k/htc_drv_gpio.c    |   15 +-
 drivers/net/wireless/ath/ath9k/htc_drv_init.c    |   78 +-
 drivers/net/wireless/ath/ath9k/htc_drv_main.c    |  202 +++---
 drivers/net/wireless/ath/ath9k/htc_drv_txrx.c    |   33 +-
 drivers/net/wireless/ath/ath9k/hw.c              |  201 +++---
 drivers/net/wireless/ath/ath9k/hw.h              |    7 +-
 drivers/net/wireless/ath/ath9k/init.c            |   35 +-
 drivers/net/wireless/ath/ath9k/mac.c             |  120 ++--
 drivers/net/wireless/ath/ath9k/main.c            |  216 +++---
 drivers/net/wireless/ath/ath9k/pci.c             |   20 +-
 drivers/net/wireless/ath/ath9k/phy.h             |    1 +
 drivers/net/wireless/ath/ath9k/rc.c              |   18 +-
 drivers/net/wireless/ath/ath9k/recv.c            |   81 +-
 drivers/net/wireless/ath/ath9k/reg.h             |   18 +-
 drivers/net/wireless/ath/ath9k/virtual.c         |    7 +-
 drivers/net/wireless/ath/ath9k/wmi.c             |   12 +-
 drivers/net/wireless/ath/ath9k/xmit.c            |  100 +--
 drivers/net/wireless/ath/debug.c                 |   20 -
 drivers/net/wireless/ath/debug.h                 |   92 ---
 drivers/net/wireless/ath/key.c                   |   28 +-
 drivers/net/wireless/ath/main.c                  |   20 +
 drivers/net/wireless/b43/Kconfig                 |   13 +-
 drivers/net/wireless/b43/Makefile                |    8 +-
 drivers/net/wireless/b43/main.c                  |   12 +-
 drivers/net/wireless/b43/phy_common.c            |   14 +-
 drivers/net/wireless/b43/phy_common.h            |    8 +
 drivers/net/wireless/b43/phy_n.c                 |   26 +-
 drivers/net/wireless/b43/tables_nphy.c           |  224 +++---
 drivers/net/wireless/iwlwifi/iwl-1000.c          |    3 +-
 drivers/net/wireless/iwlwifi/iwl-3945.c          |    7 +-
 drivers/net/wireless/iwlwifi/iwl-4965.c          |    4 +-
 drivers/net/wireless/iwlwifi/iwl-5000.c          |    4 +-
 drivers/net/wireless/iwlwifi/iwl-6000.c          |   28 +-
 drivers/net/wireless/iwlwifi/iwl-agn-lib.c       |    1 +
 drivers/net/wireless/iwlwifi/iwl-agn-rxon.c      |    3 +
 drivers/net/wireless/iwlwifi/iwl-agn-ucode.c     |    4 +
 drivers/net/wireless/iwlwifi/iwl-agn.c           |   91 +--
 drivers/net/wireless/iwlwifi/iwl-agn.h           |   20 +-
 drivers/net/wireless/iwlwifi/iwl-core.c          |  111 ++--
 drivers/net/wireless/iwlwifi/iwl-core.h          |   10 +-
 drivers/net/wireless/iwlwifi/iwl-debugfs.c       |   24 +-
 drivers/net/wireless/iwlwifi/iwl-dev.h           |   16 +-
 drivers/net/wireless/iwlwifi/iwl-sta.c           |   13 +
 drivers/net/wireless/iwlwifi/iwl-tx.c            |    2 -
 drivers/net/wireless/iwlwifi/iwl3945-base.c      |   28 +-
 drivers/net/wireless/orinoco/main.c              |    6 +
 drivers/net/wireless/orinoco/orinoco_cs.c        |   14 +-
 drivers/net/wireless/orinoco/spectrum_cs.c       |   14 +-
 drivers/net/wireless/orinoco/wext.c              |   12 +-
 include/linux/average.h                          |    4 +-
 include/linux/ieee80211.h                        |    3 +
 include/linux/nl80211.h                          |   18 +
 include/net/cfg80211.h                           |   58 +-
 lib/average.c                                    |   20 +-
 net/mac80211/Kconfig                             |    1 +
 net/mac80211/cfg.c                               |   70 ++-
 net/mac80211/debugfs_netdev.c                    |    2 +
 net/mac80211/ieee80211_i.h                       |   16 +-
 net/mac80211/iface.c                             |   30 +-
 net/mac80211/main.c                              |    5 +-
 net/mac80211/mesh.c                              |   36 +-
 net/mac80211/mesh.h                              |   23 -
 net/mac80211/mesh_hwmp.c                         |    9 +-
 net/mac80211/mesh_pathtbl.c                      |    7 +-
 net/mac80211/mlme.c                              |  104 ++-
 net/mac80211/rx.c                                |    1 +
 net/mac80211/sta_info.c                          |    2 +
 net/mac80211/sta_info.h                          |    3 +
 net/mac80211/status.c                            |   18 +-
 net/mac80211/tx.c                                |   28 +-
 net/mac80211/work.c                              |    5 +-
 net/wireless/Makefile                            |    2 +-
 net/wireless/core.c                              |   15 +-
 net/wireless/core.h                              |   13 +
 net/wireless/mesh.c                              |  140 ++++
 net/wireless/nl80211.c                           |  212 ++++-
 net/wireless/util.c                              |    1 +
 114 files changed, 4237 insertions(+), 2331 deletions(-)
 create mode 100644 drivers/net/wireless/ath/ath9k/ar9485_initvals.h
 delete mode 100644 drivers/net/wireless/ath/debug.h
 create mode 100644 net/wireless/mesh.c

Omnibus patch is available here:

	http://www.kernel.org/pub/linux/kernel/people/linville/wireless-next-2.6-2010-12-14.patch.bz2

-- 
John W. Linville		Someday the world will need a hero, and you
linville@tuxdriver.com			might be all we have.  Be ready.

^ permalink raw reply

* Re: Possible regression: Packet drops during iptables calls
From: Eric Dumazet @ 2010-12-14 16:24 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Stephen Hemminger, netfilter-devel, netdev
In-Reply-To: <1292342958.9155.91.camel@firesoul.comx.local>

Le mardi 14 décembre 2010 à 17:09 +0100, Jesper Dangaard Brouer a
écrit :
> On Tue, 2010-12-14 at 16:31 +0100, Eric Dumazet wrote:
> > Le mardi 14 décembre 2010 à 15:46 +0100, Jesper Dangaard Brouer a
> > écrit :
> > > I'm experiencing RX packet drops during call to iptables, on my
> > > production servers.
> > > 
> > > Further investigations showed, that its only the CPU executing the
> > > iptables command that experience packet drops!?  Thus, a quick fix was
> > > to force the iptables command to run on one of the idle CPUs (This can
> > > be achieved with the "taskset" command).
> > > 
> > > I have a 2x Xeon 5550 CPU system, thus 16 CPUs (with HT enabled).  We
> > > only use 8 CPUs due to a multiqueue limitation of 8 queues in the
> > > 1Gbit/s NICs (82576 chips).  CPUs 0 to 7 is assigned for packet
> > > processing via smp_affinity.
> > > 
> > > Can someone explain why the packet drops only occur on the CPU
> > > executing the iptables command?
> > > 
> > 
> > It blocks BH
> > 
> > take a look at commits :
> > 
> > 24b36f0193467fa727b85b4c004016a8dae999b9
> > netfilter: {ip,ip6,arp}_tables: dont block bottom half more than
> > necessary 
> > 
> > 001389b9581c13fe5fc357a0f89234f85af4215d
> > netfilter: {ip,ip6,arp}_tables: avoid lockdep false positive
> > 
> > for attempts to let BH fly ...
> > 
> > Unfortunately, lockdep rules :(
> 
> Is the lockdep check a false positive?

Yes its a false positive.

> Could I run with 24b36f0193 in production, to fix my problem?
> 

Yes, but you could also run a kernel with both commits:

We now block BH for each cpu we are "summing", instead of blocking BH
for the whole 16 possible cpus summation. (so BH should be blocked for
smaller amount of time)

> I forgot to mention I run kernel 2.6.35.8-comx01+ (based on Greg's stable kernel tree).
> 
> $ git describe --contains 24b36f019346
> v2.6.36-rc1~571^2~46^2~7
> $ git describe --contains 001389b9581c1
> v2.6.36-rc3~2^2~42
> 
> 
> > > What can we do to solve this issue?
> 
> Any ideas how we can proceed?
> 
> Looking closer at the two combined code change, I see that the code path
> has been improved (a bit), as the local BH is only disabled inside the
> for_each_possible_cpu(cpu).  Before local_bh was disabled for the hole
> function.  Guess I need to reproduce this in my testlab.
> 

Yes, so current kernel is a bit better.

Note that even with the 'false positive' problem, we had to blocks BH
for the current cpu sum, so the max BH latency is probably the same with
or without 001389b9581c13fe5




^ permalink raw reply

* Re: [PATCH net-2.6] net: fix nulls list corruptions in sk_prot_alloc
From: Eric Dumazet @ 2010-12-14 16:19 UTC (permalink / raw)
  To: Octavian Purdila; +Cc: netdev, Leonard Crestez
In-Reply-To: <1292341443-18360-1-git-send-email-opurdila@ixiacom.com>

Le mardi 14 décembre 2010 à 17:44 +0200, Octavian Purdila a écrit :
> Special care is taken inside sk_port_alloc to avoid overwriting
> skc_node/skc_nulls_node. We should also avoid overwriting
> skc_bind_node/skc_portaddr_node.
> 
> The patch fixes the following crash:
> 
>  BUG: unable to handle kernel paging request at fffffffffffffff0
>  IP: [<ffffffff812ec6dd>] udp4_lib_lookup2+0xad/0x370
>  [<ffffffff812ecc22>] __udp4_lib_lookup+0x282/0x360
>  [<ffffffff812ed63e>] __udp4_lib_rcv+0x31e/0x700
>  [<ffffffff812bba45>] ? ip_local_deliver_finish+0x65/0x190
>  [<ffffffff812bbbf8>] ? ip_local_deliver+0x88/0xa0
>  [<ffffffff812eda35>] udp_rcv+0x15/0x20
>  [<ffffffff812bba45>] ip_local_deliver_finish+0x65/0x190
>  [<ffffffff812bbbf8>] ip_local_deliver+0x88/0xa0
>  [<ffffffff812bb2cd>] ip_rcv_finish+0x32d/0x6f0
>  [<ffffffff8128c14c>] ? netif_receive_skb+0x99c/0x11c0
>  [<ffffffff812bb94b>] ip_rcv+0x2bb/0x350
>  [<ffffffff8128c14c>] netif_receive_skb+0x99c/0x11c0
> 
> Signed-off-by: Leonard Crestez <lcrestez@ixiacom.com>
> Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
> ---

Hmm very good catch, but why a so invasive patch ?

Only udp needs a special care.

Other protocols could use the default 'cleaner', you dont need to force
them to use the default ;)

Unless you want to fix another bug, not mentioned in Changelog ?




^ permalink raw reply

* [PATCH 07/19] c/r: basic infrastructure for checkpoint/restart
From: Dan Smith @ 2010-12-14 16:14 UTC (permalink / raw)
  To: danms; +Cc: linux-mm, linux-fsdevel, netdev, Oren Laadan
In-Reply-To: <1292343307-7870-1-git-send-email-danms@us.ibm.com>

From: Oren Laadan <orenl@cs.columbia.edu>

Add those interfaces, as well as helpers needed to easily manage the
file format. The code is roughly broken out as follows:

kernel/checkpoint/sys.c - user/kernel data transfer, as well as setup
  of the c/r context (a per-checkpoint data structure for housekeeping)

kernel/checkpoint/checkpoint.c - output wrappers and checkpoint handling

kernel/checkpoint/restart.c - input wrappers and restart handling

kernel/checkpoint/process.c - c/r of task data

For now, we can only checkpoint the 'current' task ("self" checkpoint),
and the 'pid' argument to the syscall is ignored.

Patches to add the per-architecture support as well as the actual
work to do the memory checkpoint follow in subsequent patches.

Changelog[v21]:
  - Complain if checkpoint_hdr.h included without CONFIG_CHECKPOINT
  - Do not include checkpoint_hdr.h explicitly
  - Consolidate ckpt_read/write with kernel_read/write
  - Reorganize code:move checkpoint/* to kernel/checkpoint/*
  - [Christoffer Dall] Fix trivial bug in ckpt_msg macro
Changelog[v20]:
  - Export key symbols to enable c/r from kernel modules
Changelog[v19]:
  - [Serge Hallyn] Use ckpt_err() to for bad header values
Changelog[v19-rc3]:
  - sys_{checkpoint,restart} to use ptregs prototype
Changelog[v19-rc1]:
  - Set ctx->errno in do_ckpt_msg() if needed
  - Document prototype of ckpt_write_err in header
  - Update prototype of ckpt_read_obj()
  - Fix up headers so we can munge them for use by userspace
  - [Matt Helsley] Check for empty string for _ckpt_write_err()
  - [Matt Helsley] Add cpp definitions for enums
  - [Serge Hallyn] Add global section container to image format
  - [Matt Helsley] Fix total byte read/write count for large images
  - ckpt_read_buf_type() to accept max payload (excludes ckpt_hdr)
  - [Serge Hallyn] Define new api for error and debug logging
  - Use logfd in sys_{checkpoint,restart}
Changelog[v18]:
  - Detect error-headers in input data on restart, and abort.
  - Standard format for checkpoint error strings (and documentation)
  - [Matt Helsley] Rename headerless struct ckpt_hdr_* to struct ckpt_*
  - [Dan Smith] Add an errno validation function
  - Add ckpt_read_payload(): read a variable-length object (no header)
  - Add ckpt_read_string(): same for strings (ensures null-terminated)
  - Add ckpt_read_consume(): consumes next object without processing
Changelog[v17]:
  - Fix compilation for architectures that don't support checkpoint
  - Save/restore t->{set,clear}_child_tid
  - Restart(2) isn't idempotent: must return -EINTR if interrupted
  - ckpt_debug does not depend on DYNAMIC_DEBUG, on by default
  - Export generic checkpoint headers to userespace
  - Fix comment for prototype of sys_restart
  - Have ckpt_debug() print global-pid and __LINE__
  - Only save and test kernel constants once (in header)
Changelog[v16]:
  - Split ctx->flags to ->uflags (user flags) and ->kflags (kernel flags)
  - Introduce __ckpt_write_err() and ckpt_write_err() to report errors
  - Allow @ptr == NULL to write (or read) header only without payload
  - Introduce _ckpt_read_obj_type()
Changelog[v15]:
  - Replace header buffer in ckpt_ctx (hbuf,hpos) with kmalloc/kfree()
Changelog[v14]:
  - Cleanup interface to get/put hdr buffers
  - Merge checkpoint and restart code into a single file (per subsystem)
  - Take uts_sem around access to uts->{release,version,machine}
  - Embed ckpt_hdr in all ckpt_hdr_...., cleanup read/write helpers
  - Define sys_checkpoint(0,...) as asking for a self-checkpoint (Serge)
  - Revert use of 'pr_fmt' to avoid tainting whom includes us (Nathan Lynch)
  - Explicitly indicate length of UTS fields in header
  - Discard field 'h->parent' from ckpt_hdr
Changelog[v12]:
  - ckpt_kwrite/ckpt_kread() again use vfs_read(), vfs_write() (safer)
  - Split ckpt_write/ckpt_read() to two parts: _ckpt_write/read() helper
  - Befriend with sparse : explicit conversion to 'void __user *'
  - Redfine 'pr_fmt' instead of using special ckpt_debug()
Changelog[v10]:
  - add ckpt_write_buffer(), ckpt_read_buffer() and ckpt_read_buf_type()
  - force end-of-string in ckpt_read_string() (fix possible DoS)
Changelog[v9]:
  - ckpt_kwrite/ckpt_kread() use file->f_op->write() directly
  - Drop ckpt_uwrite/ckpt_uread() since they aren't used anywhere
Changelog[v6]:
  - Balance all calls to ckpt_hbuf_get() with matching ckpt_hbuf_put()
    (although it's not really needed)
Changelog[v5]:
  - Rename headers files s/ckpt/checkpoint/
Changelog[v2]:
  - Added utsname->{release,version,machine} to checkpoint header
  - Pad header structures to 64 bits to ensure compatibility

Cc: linux-mm@kvack.org
Cc: linux-fsdevel@vger.kernel.org
Cc: netdev@vger.kernel.org
Signed-off-by: Oren Laadan <orenl@cs.columbia.edu>
Acked-by: Serge E. Hallyn <serue@us.ibm.com>
Tested-by: Serge E. Hallyn <serue@us.ibm.com>
---
 include/linux/Kbuild             |    3 +
 include/linux/checkpoint.h       |  202 +++++++++++++++++
 include/linux/checkpoint_hdr.h   |  135 +++++++++++
 include/linux/checkpoint_types.h |   44 ++++
 include/linux/magic.h            |    3 +
 include/linux/syscalls.h         |    4 -
 kernel/checkpoint/Makefile       |    6 +-
 kernel/checkpoint/checkpoint.c   |  213 ++++++++++++++++++
 kernel/checkpoint/process.c      |  101 +++++++++
 kernel/checkpoint/restart.c      |  460 +++++++++++++++++++++++++++++++++++++
 kernel/checkpoint/sys.c          |  461 +++++++++++++++++++++++++++++++++++++-
 lib/Kconfig.debug                |   13 +
 12 files changed, 1632 insertions(+), 13 deletions(-)
 create mode 100644 include/linux/checkpoint.h
 create mode 100644 include/linux/checkpoint_hdr.h
 create mode 100644 include/linux/checkpoint_types.h
 create mode 100644 kernel/checkpoint/checkpoint.c
 create mode 100644 kernel/checkpoint/process.c
 create mode 100644 kernel/checkpoint/restart.c

diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 97319a8..1fe511b 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -81,6 +81,9 @@ header-y += cciss_ioctl.h
 header-y += cdk.h
 header-y += cdrom.h
 header-y += cgroupstats.h
+header-y += checkpoint.h
+header-y += checkpoint_hdr.h
+header-y += checkpoint_types.h
 header-y += chio.h
 header-y += cm4000_cs.h
 header-y += cn_proc.h
diff --git a/include/linux/checkpoint.h b/include/linux/checkpoint.h
new file mode 100644
index 0000000..4bb5b8d
--- /dev/null
+++ b/include/linux/checkpoint.h
@@ -0,0 +1,202 @@
+#ifndef _LINUX_CHECKPOINT_H_
+#define _LINUX_CHECKPOINT_H_
+/*
+ *  Generic checkpoint-restart
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#define CHECKPOINT_VERSION  3
+
+/* misc user visible */
+#define CHECKPOINT_FD_NONE	-1
+
+#ifdef __KERNEL__
+#ifdef CONFIG_CHECKPOINT
+
+#include <linux/checkpoint_types.h>
+#include <linux/checkpoint_hdr.h>
+#include <linux/err.h>
+
+/* sycall helpers */
+extern long do_sys_checkpoint(pid_t pid, int fd,
+			      unsigned long flags, int logfd);
+extern long do_sys_restart(pid_t pid, int fd,
+			   unsigned long flags, int logfd);
+
+/* ckpt_ctx: kflags */
+#define CKPT_CTX_CHECKPOINT_BIT		0
+#define CKPT_CTX_RESTART_BIT		1
+#define CKPT_CTX_ERROR_BIT		3
+
+#define CKPT_CTX_CHECKPOINT	(1 << CKPT_CTX_CHECKPOINT_BIT)
+#define CKPT_CTX_RESTART	(1 << CKPT_CTX_RESTART_BIT)
+#define CKPT_CTX_ERROR		(1 << CKPT_CTX_ERROR_BIT)
+
+
+extern int ckpt_kwrite(struct ckpt_ctx *ctx, void *buf, size_t count);
+extern int ckpt_kread(struct ckpt_ctx *ctx, void *buf, size_t count);
+
+extern void _ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr, int n);
+extern void ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr);
+extern void *ckpt_hdr_get(struct ckpt_ctx *ctx, int n);
+extern void *ckpt_hdr_get_type(struct ckpt_ctx *ctx, int n, int type);
+
+extern int ckpt_write_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h);
+extern int ckpt_write_obj_type(struct ckpt_ctx *ctx,
+			       void *ptr, int len, int type);
+extern int ckpt_write_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
+extern int ckpt_write_string(struct ckpt_ctx *ctx, char *str, int len);
+
+extern int _ckpt_read_obj_type(struct ckpt_ctx *ctx,
+			       void *ptr, int len, int type);
+extern int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len);
+extern int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len);
+extern void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type);
+extern void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int max, int type);
+extern int ckpt_read_payload(struct ckpt_ctx *ctx,
+			     void **ptr, int max, int type);
+extern char *ckpt_read_string(struct ckpt_ctx *ctx, int max);
+extern int ckpt_read_consume(struct ckpt_ctx *ctx, int len, int type);
+
+extern long do_checkpoint(struct ckpt_ctx *ctx, pid_t pid);
+extern long do_restart(struct ckpt_ctx *ctx, pid_t pid);
+
+/* task */
+extern int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t);
+extern int restore_task(struct ckpt_ctx *ctx);
+
+static inline int ckpt_validate_errno(int errno)
+{
+	return (errno >= 0) && (errno < MAX_ERRNO);
+}
+
+/* debugging flags */
+#define CKPT_DBASE	0x1		/* anything */
+#define CKPT_DSYS	0x2		/* generic (system) */
+#define CKPT_DRW	0x4		/* image read/write */
+
+#define CKPT_DDEFAULT	0xffff		/* default debug level */
+
+#ifndef CKPT_DFLAG
+#define CKPT_DFLAG	0xffff		/* everything */
+#endif
+
+#ifdef CONFIG_CHECKPOINT_DEBUG
+extern unsigned long ckpt_debug_level;
+
+/*
+ * This is deprecated
+ */
+/* use this to select a specific debug level */
+#define _ckpt_debug(level, fmt, args...)				\
+	do {								\
+		if (ckpt_debug_level & (level))				\
+			printk(KERN_DEBUG "[%d:%d:c/r:%s:%d] " fmt,	\
+				current->pid,				\
+				current->nsproxy ?			\
+				task_pid_vnr(current) : -1,		\
+				__func__, __LINE__, ## args);		\
+	} while (0)
+
+/*
+ * CKPT_DBASE is the base flags, doesn't change
+ * CKPT_DFLAG is to be redfined in each source file
+ */
+#define ckpt_debug(fmt, args...)  \
+	_ckpt_debug(CKPT_DBASE | CKPT_DFLAG, fmt, ## args)
+
+#else
+
+/*
+ * This is deprecated
+ */
+#define _ckpt_debug(level, fmt, args...)	do { } while (0)
+#define ckpt_debug(fmt, args...)		do { } while (0)
+
+#endif /* CONFIG_CHECKPOINT_DEBUG */
+
+/*
+ * prototypes for the new logging api
+ */
+
+extern void ckpt_msg_lock(struct ckpt_ctx *ctx);
+extern void ckpt_msg_unlock(struct ckpt_ctx *ctx);
+
+extern void _do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...);
+extern void do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...);
+
+/*
+ * Append formatted msg to ctx->msg[ctx->msg_len].
+ * Must be called after expanding format.
+ * May be called under spinlock.
+ * Must be called under ckpt_msg_lock().
+ */
+extern void _ckpt_msg_append(struct ckpt_ctx *ctx, char *fmt, ...);
+
+/*
+ * Write ctx->msg to all relevant places.
+ * Must not be called under spinlock.
+ * Must be called under ckpt_msg_lock().
+ */
+extern void _ckpt_msg_complete(struct ckpt_ctx *ctx);
+
+/*
+ * Append an enhanced formatted message to ctx->msg.
+ * This will not write the message out to the applicable files, so
+ * the caller will have to use _ckpt_msg_complete() to finish up.
+ * @ctx must be a valid checkpoint context.
+ * @fmt is the extended format
+ *
+ * Must be called with ckpt_msg_lock held.
+ */
+#define _ckpt_msg(ctx, fmt, args...) do {	\
+	_do_ckpt_msg(ctx, 0, fmt, ##args);	\
+} while (0)
+
+/*
+ * Append an enhanced formatted message to ctx->msg.
+ * This will take the ckpt_msg_lock and also write the message out
+ * to the applicable files by calling _ckpt_msg_complete().
+ * @ctx must be a valid checkpoint context.
+ * @fmt is the extended format
+ *
+ * Must not be called under spinlock.
+ */
+#define ckpt_msg(ctx, fmt, args...) do {	\
+	do_ckpt_msg(ctx, 0, fmt, ##args);	\
+} while (0)
+
+/*
+ * Report an error.
+ * This will take the ckpt_msg_lock and also write the message out
+ * to the applicable files by calling _ckpt_msg_complete().
+ * @ctx must be a valid checkpoint context.
+ * @err is the error value
+ * @fmt is the extended format
+ *
+ * Must not be called under spinlock.
+ */
+
+#define ckpt_err(ctx, err, fmt, args...) do {				\
+	do_ckpt_msg(ctx, err, "[E @ %s:%d]" fmt, __func__, __LINE__, ##args); \
+} while (0)
+
+/*
+ * Same as ckpt_err() but
+ *	must be called with ctx->msg_mutex held
+ *	can be called under spinlock
+ *	must be followed by a call to _ckpt_msg_complete()
+ */
+#define _ckpt_err(ctx, err, fmt, args...) do {				\
+	_do_ckpt_msg(ctx, err, "[E @ %s:%d]" fmt, __func__, __LINE__, ##args); \
+} while (0)
+
+#endif /* CONFIG_CHECKPOINT */
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_CHECKPOINT_H_ */
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
new file mode 100644
index 0000000..7ccebc7
--- /dev/null
+++ b/include/linux/checkpoint_hdr.h
@@ -0,0 +1,135 @@
+#ifndef _CHECKPOINT_CKPT_HDR_H_
+#define _CHECKPOINT_CKPT_HDR_H_
+/*
+ *  Generic container checkpoint-restart
+ *
+ *  Copyright (C) 2008-2010 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#ifndef __KERNEL__
+#include <sys/types.h>
+#include <linux/types.h>
+#endif
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+
+#ifndef CONFIG_CHECKPOINT
+#error linux/checkpoint_hdr.h included directly (without CONFIG_CHECKPOINT)
+#endif
+
+#endif
+
+#include <linux/utsname.h>
+
+/*
+ * To maintain compatibility between 32-bit and 64-bit architecture flavors,
+ * keep data 64-bit aligned: use padding for structure members, and use
+ * __attribute__((aligned (8))) for the entire structure.
+ *
+ * Quoting Arnd Bergmann:
+ *   "This structure has an odd multiple of 32-bit members, which means
+ *   that if you put it into a larger structure that also contains 64-bit
+ *   members, the larger structure may get different alignment on x86-32
+ *   and x86-64, which you might want to avoid. I can't tell if this is
+ *   an actual problem here. ... In this case, I'm pretty sure that
+ *   sizeof(ckpt_hdr_task) on x86-32 is different from x86-64, since it
+ *   will be 32-bit aligned on x86-32."
+ */
+
+/*
+ * header format: 'struct ckpt_hdr' must prefix all other headers. Therfore
+ * when a header is passed around, the information about it (type, size)
+ * is readily available. Structs that include a struct ckpt_hdr are named
+ * struct ckpt_hdr_* by convention (usualy the struct ckpt_hdr is the first
+ * member).
+ */
+struct ckpt_hdr {
+	__u32 type;
+	__u32 len;
+} __attribute__((aligned(8)));
+
+/* header types */
+enum {
+	CKPT_HDR_HEADER = 1,
+#define CKPT_HDR_HEADER CKPT_HDR_HEADER
+	CKPT_HDR_CONTAINER,
+#define CKPT_HDR_CONTAINER CKPT_HDR_CONTAINER
+	CKPT_HDR_BUFFER,
+#define CKPT_HDR_BUFFER CKPT_HDR_BUFFER
+	CKPT_HDR_STRING,
+#define CKPT_HDR_STRING CKPT_HDR_STRING
+
+	CKPT_HDR_TASK = 101,
+#define CKPT_HDR_TASK CKPT_HDR_TASK
+
+	CKPT_HDR_TAIL = 9001,
+#define CKPT_HDR_TAIL CKPT_HDR_TAIL
+
+	CKPT_HDR_ERROR = 9999,
+#define CKPT_HDR_ERROR CKPT_HDR_ERROR
+};
+
+/* kernel constants */
+struct ckpt_const {
+	/* task */
+	__u16 task_comm_len;
+	/* uts */
+	__u16 uts_release_len;
+	__u16 uts_version_len;
+	__u16 uts_machine_len;
+} __attribute__((aligned(8)));
+
+/* checkpoint image header */
+struct ckpt_hdr_header {
+	struct ckpt_hdr h;
+	__u64 magic;
+
+	__u16 _padding;
+
+	__u16 major;
+	__u16 minor;
+	__u16 patch;
+	__u16 rev;
+
+	struct ckpt_const constants;
+
+	__u64 time;	/* when checkpoint taken */
+	__u64 uflags;	/* uflags from checkpoint */
+
+	/*
+	 * the header is followed by three strings:
+	 *   char release[const.uts_release_len];
+	 *   char version[const.uts_version_len];
+	 *   char machine[const.uts_machine_len];
+	 */
+} __attribute__((aligned(8)));
+
+/* checkpoint image trailer */
+struct ckpt_hdr_tail {
+	struct ckpt_hdr h;
+	__u64 magic;
+} __attribute__((aligned(8)));
+
+/* container configuration section header */
+struct ckpt_hdr_container {
+	struct ckpt_hdr h;
+} __attribute__((aligned(8)));;
+
+/* task data */
+struct ckpt_hdr_task {
+	struct ckpt_hdr h;
+	__u32 state;
+	__u32 exit_state;
+	__u32 exit_code;
+	__u32 exit_signal;
+
+	__u64 set_child_tid;
+	__u64 clear_child_tid;
+} __attribute__((aligned(8)));
+
+#endif /* _CHECKPOINT_CKPT_HDR_H_ */
diff --git a/include/linux/checkpoint_types.h b/include/linux/checkpoint_types.h
new file mode 100644
index 0000000..13d6dd5
--- /dev/null
+++ b/include/linux/checkpoint_types.h
@@ -0,0 +1,44 @@
+#ifndef _LINUX_CHECKPOINT_TYPES_H_
+#define _LINUX_CHECKPOINT_TYPES_H_
+/*
+ *  Generic checkpoint-restart
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+
+struct ckpt_ctx {
+	int crid;		/* unique checkpoint id */
+
+	pid_t root_pid;		/* container identifier */
+
+	unsigned long kflags;	/* kerenl flags */
+	unsigned long uflags;	/* user flags */
+	unsigned long oflags;	/* restart: uflags from checkpoint */
+
+	struct file *file;	/* input/output file */
+	struct file *logfile;	/* status/debug log file */
+	loff_t total;		/* total read/written */
+
+	struct task_struct *tsk;/* checkpoint: current target task */
+	char err_string[256];	/* checkpoint: error string */
+
+	int errno;		/* errno that caused failure */
+
+#define CKPT_MSG_LEN 1024
+	char fmt[CKPT_MSG_LEN];
+	char msg[CKPT_MSG_LEN];
+	int msglen;
+	struct mutex msg_mutex;
+};
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_CHECKPOINT_TYPES_H_ */
diff --git a/include/linux/magic.h b/include/linux/magic.h
index ff690d0..30cd986 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -59,4 +59,7 @@
 #define SOCKFS_MAGIC		0x534F434B
 #define V9FS_MAGIC		0x01021997
 
+#define CHECKPOINT_MAGIC_HEAD  0x00feed0cc0a2d200LL
+#define CHECKPOINT_MAGIC_TAIL  0x002d2a0cc0deef00LL
+
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 20be1a6..cacc27a 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -820,10 +820,6 @@ asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags
 asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
 				  u64 mask, int fd,
 				  const char  __user *pathname);
-asmlinkage long sys_checkpoint(pid_t pid, int fd, unsigned long flags,
-			       int logfd);
-asmlinkage long sys_restart(pid_t pid, int fd, unsigned long flags,
-			    int logfd);
 
 int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]);
 
diff --git a/kernel/checkpoint/Makefile b/kernel/checkpoint/Makefile
index 8a32c6f..99364cc 100644
--- a/kernel/checkpoint/Makefile
+++ b/kernel/checkpoint/Makefile
@@ -2,4 +2,8 @@
 # Makefile for linux checkpoint/restart.
 #
 
-obj-$(CONFIG_CHECKPOINT) += sys.o
+obj-$(CONFIG_CHECKPOINT) += \
+	sys.o \
+	checkpoint.o \
+	restart.o \
+	process.o
diff --git a/kernel/checkpoint/checkpoint.c b/kernel/checkpoint/checkpoint.c
new file mode 100644
index 0000000..75b43e6
--- /dev/null
+++ b/kernel/checkpoint/checkpoint.c
@@ -0,0 +1,213 @@
+/*
+ *  Checkpoint logic and helpers
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/utsname.h>
+#include <linux/magic.h>
+#include <linux/checkpoint.h>
+
+/* unique checkpoint identifier (FIXME: should be per-container ?) */
+static atomic_t ctx_count = ATOMIC_INIT(0);
+
+/**
+ * ckpt_write_obj - write an object
+ * @ctx: checkpoint context
+ * @h: object descriptor
+ */
+int ckpt_write_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
+{
+	_ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
+	return ckpt_kwrite(ctx, h, h->len);
+}
+EXPORT_SYMBOL(ckpt_write_obj);
+
+/**
+ * ckpt_write_obj_type - write an object (from a pointer)
+ * @ctx: checkpoint context
+ * @ptr: buffer pointer
+ * @len: buffer size
+ * @type: desired type
+ *
+ * If @ptr is NULL, then write only the header (payload to follow)
+ */
+int ckpt_write_obj_type(struct ckpt_ctx *ctx, void *ptr, int len, int type)
+{
+	struct ckpt_hdr *h;
+	int ret;
+
+	h = ckpt_hdr_get(ctx, sizeof(*h));
+	if (!h)
+		return -ENOMEM;
+
+	h->type = type;
+	h->len = len + sizeof(*h);
+
+	_ckpt_debug(CKPT_DRW, "type %d len %d\n", h->type, h->len);
+	ret = ckpt_kwrite(ctx, h, sizeof(*h));
+	if (ret < 0)
+		goto out;
+	if (ptr)
+		ret = ckpt_kwrite(ctx, ptr, len);
+ out:
+	_ckpt_hdr_put(ctx, h, sizeof(*h));
+	return ret;
+}
+EXPORT_SYMBOL(ckpt_write_obj_type);
+
+/**
+ * ckpt_write_buffer - write an object of type buffer
+ * @ctx: checkpoint context
+ * @ptr: buffer pointer
+ * @len: buffer size
+ */
+int ckpt_write_buffer(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	return ckpt_write_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
+}
+EXPORT_SYMBOL(ckpt_write_buffer);
+
+/**
+ * ckpt_write_string - write an object of type string
+ * @ctx: checkpoint context
+ * @str: string pointer
+ * @len: string length
+ */
+int ckpt_write_string(struct ckpt_ctx *ctx, char *str, int len)
+{
+	return ckpt_write_obj_type(ctx, str, len, CKPT_HDR_STRING);
+}
+EXPORT_SYMBOL(ckpt_write_string);
+
+/***********************************************************************
+ * Checkpoint
+ */
+
+static void fill_kernel_const(struct ckpt_const *h)
+{
+	struct task_struct *tsk;
+	struct new_utsname *uts;
+
+	/* task */
+	h->task_comm_len = sizeof(tsk->comm);
+	/* uts */
+	h->uts_release_len = sizeof(uts->release);
+	h->uts_version_len = sizeof(uts->version);
+	h->uts_machine_len = sizeof(uts->machine);
+}
+
+/* write the checkpoint header */
+static int checkpoint_write_header(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_header *h;
+	struct new_utsname *uts;
+	struct timeval ktv;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_HEADER);
+	if (!h)
+		return -ENOMEM;
+
+	do_gettimeofday(&ktv);
+	uts = utsname();
+
+	h->magic = CHECKPOINT_MAGIC_HEAD;
+	h->major = (LINUX_VERSION_CODE >> 16) & 0xff;
+	h->minor = (LINUX_VERSION_CODE >> 8) & 0xff;
+	h->patch = (LINUX_VERSION_CODE) & 0xff;
+
+	h->rev = CHECKPOINT_VERSION;
+
+	h->uflags = ctx->uflags;
+	h->time = ktv.tv_sec;
+
+	fill_kernel_const(&h->constants);
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	down_read(&uts_sem);
+	ret = ckpt_write_buffer(ctx, uts->release, sizeof(uts->release));
+	if (ret < 0)
+		goto up;
+	ret = ckpt_write_buffer(ctx, uts->version, sizeof(uts->version));
+	if (ret < 0)
+		goto up;
+	ret = ckpt_write_buffer(ctx, uts->machine, sizeof(uts->machine));
+ up:
+	up_read(&uts_sem);
+	return ret;
+}
+
+/* write the container configuration section */
+static int checkpoint_container(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_container *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_CONTAINER);
+	if (!h)
+		return -ENOMEM;
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+/* write the checkpoint trailer */
+static int checkpoint_write_tail(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_tail *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
+	if (!h)
+		return -ENOMEM;
+
+	h->magic = CHECKPOINT_MAGIC_TAIL;
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+long do_checkpoint(struct ckpt_ctx *ctx, pid_t pid)
+{
+	long ret;
+
+	ret = checkpoint_write_header(ctx);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_container(ctx);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_task(ctx, current);
+	if (ret < 0)
+		goto out;
+	ret = checkpoint_write_tail(ctx);
+	if (ret < 0)
+		goto out;
+
+	/* on success, return (unique) checkpoint identifier */
+	ctx->crid = atomic_inc_return(&ctx_count);
+	ret = ctx->crid;
+ out:
+	return ret;
+}
diff --git a/kernel/checkpoint/process.c b/kernel/checkpoint/process.c
new file mode 100644
index 0000000..abd9025
--- /dev/null
+++ b/kernel/checkpoint/process.c
@@ -0,0 +1,101 @@
+/*
+ *  Checkpoint task structure
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/sched.h>
+#include <linux/checkpoint.h>
+
+/***********************************************************************
+ * Checkpoint
+ */
+
+/* dump the task_struct of a given task */
+static int checkpoint_task_struct(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	struct ckpt_hdr_task *h;
+	int ret;
+
+	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_TASK);
+	if (!h)
+		return -ENOMEM;
+
+	h->state = t->state;
+	h->exit_state = t->exit_state;
+	h->exit_code = t->exit_code;
+	h->exit_signal = t->exit_signal;
+
+	h->set_child_tid = (unsigned long) t->set_child_tid;
+	h->clear_child_tid = (unsigned long) t->clear_child_tid;
+
+	/* FIXME: save remaining relevant task_struct fields */
+
+	ret = ckpt_write_obj(ctx, &h->h);
+	ckpt_hdr_put(ctx, h);
+	if (ret < 0)
+		return ret;
+
+	return ckpt_write_string(ctx, t->comm, TASK_COMM_LEN);
+}
+
+/* dump the entire state of a given task */
+int checkpoint_task(struct ckpt_ctx *ctx, struct task_struct *t)
+{
+	int ret;
+
+	ctx->tsk = t;
+
+	ret = checkpoint_task_struct(ctx, t);
+	ckpt_debug("task %d\n", ret);
+
+	ctx->tsk = NULL;
+	return ret;
+}
+
+/***********************************************************************
+ * Restart
+ */
+
+/* read the task_struct into the current task */
+static int restore_task_struct(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_task *h;
+	struct task_struct *t = current;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TASK);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	memset(t->comm, 0, TASK_COMM_LEN);
+	ret = _ckpt_read_string(ctx, t->comm, TASK_COMM_LEN);
+	if (ret < 0)
+		goto out;
+
+	t->set_child_tid = (int __user *) (unsigned long) h->set_child_tid;
+	t->clear_child_tid = (int __user *) (unsigned long) h->clear_child_tid;
+
+	/* FIXME: restore remaining relevant task_struct fields */
+ out:
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/* read the entire state of the current task */
+int restore_task(struct ckpt_ctx *ctx)
+{
+	int ret;
+
+	ret = restore_task_struct(ctx);
+	ckpt_debug("task %d\n", ret);
+
+	return ret;
+}
diff --git a/kernel/checkpoint/restart.c b/kernel/checkpoint/restart.c
new file mode 100644
index 0000000..cd9945c
--- /dev/null
+++ b/kernel/checkpoint/restart.c
@@ -0,0 +1,460 @@
+/*
+ *  Restart logic and helpers
+ *
+ *  Copyright (C) 2008-2009 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/magic.h>
+#include <linux/utsname.h>
+#include <linux/checkpoint.h>
+
+static int _ckpt_read_err(struct ckpt_ctx *ctx, struct ckpt_hdr *h)
+{
+	char *ptr;
+	int len, ret;
+
+	len = h->len - sizeof(*h);
+	ptr = kzalloc(len + 1, GFP_KERNEL);
+	if (!ptr) {
+		ckpt_debug("insufficient memory to report image error\n");
+		return -ENOMEM;
+	}
+
+	ret = ckpt_kread(ctx, ptr, len);
+	if (ret >= 0) {
+		ckpt_debug("%s\n", &ptr[1]);
+		ret = -EIO;
+	}
+
+	kfree(ptr);
+	return ret;
+}
+
+/**
+ * _ckpt_read_obj - read an object (ckpt_hdr followed by payload)
+ * @ctx: checkpoint context
+ * @h: desired ckpt_hdr
+ * @ptr: desired buffer
+ * @len: desired object length (if 0, flexible)
+ * @max: maximum object length (if 0, flexible)
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ */
+static int _ckpt_read_obj(struct ckpt_ctx *ctx, struct ckpt_hdr *h,
+			  void *ptr, int len, int max)
+{
+	int ret;
+
+ again:
+	ret = ckpt_kread(ctx, h, sizeof(*h));
+	if (ret < 0)
+		return ret;
+	_ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
+		    h->type, h->len, len, max);
+	if (h->len < sizeof(*h))
+		return -EINVAL;
+
+	if (h->type == CKPT_HDR_ERROR) {
+		ret = _ckpt_read_err(ctx, h);
+		if (ret < 0)
+			return ret;
+		goto again;
+	}
+
+	/* if len specified, enforce, else if maximum specified, enforce */
+	if ((len && h->len != len) || (!len && max && h->len > max))
+		return -EINVAL;
+
+	if (ptr)
+		ret = ckpt_kread(ctx, ptr, h->len - sizeof(struct ckpt_hdr));
+	return ret;
+}
+
+/**
+ * _ckpt_read_obj_type - read an object of some type
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: buffer length
+ * @type: buffer type
+ *
+ * If @ptr is NULL, then read only the header (payload to follow).
+ * @len specifies the expected buffer length (ignored if set to 0).
+ * Returns: actual _payload_ length
+ */
+int _ckpt_read_obj_type(struct ckpt_ctx *ctx, void *ptr, int len, int type)
+{
+	struct ckpt_hdr h;
+	int ret;
+
+	if (len)
+		len += sizeof(struct ckpt_hdr);
+	ret = _ckpt_read_obj(ctx, &h, ptr, len, len);
+	if (ret < 0)
+		return ret;
+	if (h.type != type)
+		return -EINVAL;
+	return h.len - sizeof(h);
+}
+EXPORT_SYMBOL(_ckpt_read_obj_type);
+
+/**
+ * _ckpt_read_buffer - read an object of type buffer (set length)
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: buffer length
+ *
+ * If @ptr is NULL, then read only the header (payload to follow).
+ * @len specifies the expected buffer length (ignored if set to 0).
+ * Returns: _payload_ length.
+ */
+int _ckpt_read_buffer(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	BUG_ON(!len);
+	return _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_BUFFER);
+}
+EXPORT_SYMBOL(_ckpt_read_buffer);
+
+/**
+ * _ckpt_read_string - read an object of type string (set length)
+ * @ctx: checkpoint context
+ * @ptr: provided buffer
+ * @len: string length (including '\0')
+ *
+ * If @ptr is NULL, then read only the header (payload to follow)
+ */
+int _ckpt_read_string(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	int ret;
+
+	BUG_ON(!len);
+	ret = _ckpt_read_obj_type(ctx, ptr, len, CKPT_HDR_STRING);
+	if (ret < 0)
+		return ret;
+	if (ptr)
+		((char *) ptr)[len - 1] = '\0';	/* always play it safe */
+	return 0;
+}
+EXPORT_SYMBOL(_ckpt_read_string);
+
+/**
+ * ckpt_read_obj - allocate and read an object (ckpt_hdr followed by payload)
+ * @ctx: checkpoint context
+ * @h: object descriptor
+ * @len: desired total length (if 0, flexible)
+ * @max: maximum total length
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+static void *ckpt_read_obj(struct ckpt_ctx *ctx, int len, int max)
+{
+	struct ckpt_hdr hh;
+	struct ckpt_hdr *h;
+	int ret;
+
+	ret = ckpt_kread(ctx, &hh, sizeof(hh));
+	if (ret < 0)
+		return ERR_PTR(ret);
+	_ckpt_debug(CKPT_DRW, "type %d len %d(%d,%d)\n",
+		    hh.type, hh.len, len, max);
+	if (hh.len < sizeof(*h))
+		return ERR_PTR(-EINVAL);
+	/* if len specified, enforce, else if maximum specified, enforce */
+	if ((len && hh.len != len) || (!len && max && hh.len > max))
+		return ERR_PTR(-EINVAL);
+
+	h = ckpt_hdr_get(ctx, hh.len);
+	if (!h)
+		return ERR_PTR(-ENOMEM);
+
+	*h = hh;	/* yay ! */
+
+	ret = ckpt_kread(ctx, (h + 1), hh.len - sizeof(struct ckpt_hdr));
+	if (ret < 0) {
+		ckpt_hdr_put(ctx, h);
+		h = ERR_PTR(ret);
+	}
+
+	return h;
+}
+
+/**
+ * ckpt_read_obj_type - allocate and read an object of some type
+ * @ctx: checkpoint context
+ * @len: desired object length
+ * @type: desired object type
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+void *ckpt_read_obj_type(struct ckpt_ctx *ctx, int len, int type)
+{
+	struct ckpt_hdr *h;
+
+	BUG_ON(!len);
+
+	h = ckpt_read_obj(ctx, len, len);
+	if (IS_ERR(h))
+		return h;
+
+	if (h->type != type) {
+		ckpt_hdr_put(ctx, h);
+		h = ERR_PTR(-EINVAL);
+	}
+
+	return h;
+}
+EXPORT_SYMBOL(ckpt_read_obj_type);
+
+/**
+ * ckpt_read_buf_type - allocate and read an object of some type (flxible)
+ * @ctx: checkpoint context
+ * @max: maximum payload length
+ * @type: desired object type
+ *
+ * This differs from ckpt_read_obj_type() in that the length of the
+ * incoming object is flexible (up to the maximum specified by @max;
+ * unlimited if @max is 0), as determined by the ckpt_hdr data.
+ *
+ * NOTE: for symmetry with checkpoint, @max is the maximum _payload_
+ * size, excluding the header.
+ *
+ * Return: new buffer allocated on success, error pointer otherwise
+ */
+void *ckpt_read_buf_type(struct ckpt_ctx *ctx, int max, int type)
+{
+	struct ckpt_hdr *h;
+
+	if (max)
+		max += sizeof(struct ckpt_hdr);
+
+	h = ckpt_read_obj(ctx, 0, max);
+	if (IS_ERR(h))
+		return h;
+
+	if (h->type != type) {
+		ckpt_hdr_put(ctx, h);
+		h = ERR_PTR(-EINVAL);
+	}
+
+	return h;
+}
+EXPORT_SYMBOL(ckpt_read_buf_type);
+
+/**
+ * ckpt_read_payload - allocate and read the payload of an object
+ * @ctx: checkpoint context
+ * @max: maximum payload length
+ * @str: pointer to buffer to be allocated (caller must free)
+ * @type: desired object type
+ *
+ * This can be used to read a variable-length _payload_ from the checkpoint
+ * stream. @max limits the size of the resulting buffer.
+ *
+ * Return: actual _payload_ length
+ */
+int ckpt_read_payload(struct ckpt_ctx *ctx, void **ptr, int max, int type)
+{
+	int len, ret;
+
+	len = _ckpt_read_obj_type(ctx, NULL, 0, type);
+	if (len < 0)
+		return len;
+	else if (len > max)
+		return -EINVAL;
+
+	*ptr = kmalloc(len, GFP_KERNEL);
+	if (!*ptr)
+		return -ENOMEM;
+
+	ret = ckpt_kread(ctx, *ptr, len);
+	if (ret < 0) {
+		kfree(*ptr);
+		return ret;
+	}
+
+	return len;
+}
+EXPORT_SYMBOL(ckpt_read_payload);
+
+/**
+ * ckpt_read_string - allocate and read a string (variable length)
+ * @ctx: checkpoint context
+ * @max: maximum acceptable length
+ *
+ * Return: allocate string or error pointer
+ */
+char *ckpt_read_string(struct ckpt_ctx *ctx, int max)
+{
+	char *str;
+	int len;
+
+	len = ckpt_read_payload(ctx, (void **)&str, max, CKPT_HDR_STRING);
+	if (len < 0)
+		return ERR_PTR(len);
+	str[len - 1] = '\0';	/* always play it safe */
+	return str;
+}
+EXPORT_SYMBOL(ckpt_read_string);
+
+/**
+ * ckpt_read_consume - consume the next object of expected type
+ * @ctx: checkpoint context
+ * @len: desired object length
+ * @type: desired object type
+ *
+ * This can be used to skip an object in the input stream when the
+ * data is unnecessary for the restart. @len indicates the length of
+ * the object); if @len is zero the length is unconstrained.
+ */
+int ckpt_read_consume(struct ckpt_ctx *ctx, int len, int type)
+{
+	struct ckpt_hdr *h;
+	int ret = 0;
+
+	h = ckpt_read_obj(ctx, len, 0);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	if (h->type != type)
+		ret = -EINVAL;
+
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+EXPORT_SYMBOL(ckpt_read_consume);
+
+/***********************************************************************
+ * Restart
+ */
+
+static int check_kernel_const(struct ckpt_const *h)
+{
+	struct task_struct *tsk;
+	struct new_utsname *uts;
+
+	/* task */
+	if (h->task_comm_len != sizeof(tsk->comm))
+		return -EINVAL;
+	/* uts */
+	if (h->uts_release_len != sizeof(uts->release))
+		return -EINVAL;
+	if (h->uts_version_len != sizeof(uts->version))
+		return -EINVAL;
+	if (h->uts_machine_len != sizeof(uts->machine))
+		return -EINVAL;
+
+	return 0;
+}
+
+/* read the checkpoint header */
+static int restore_read_header(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_header *h;
+	struct new_utsname *uts = NULL;
+	int ret;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_HEADER);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	ret = -EINVAL;
+	if (h->magic != CHECKPOINT_MAGIC_HEAD ||
+	    h->rev != CHECKPOINT_VERSION ||
+	    h->major != ((LINUX_VERSION_CODE >> 16) & 0xff) ||
+	    h->minor != ((LINUX_VERSION_CODE >> 8) & 0xff) ||
+	    h->patch != ((LINUX_VERSION_CODE) & 0xff)) {
+		ckpt_err(ctx, ret, "incompatible kernel version");
+		goto out;
+	}
+	if (h->uflags) {
+		ckpt_err(ctx, ret, "incompatible restart user flags");
+		goto out;
+	}
+
+	ret = check_kernel_const(&h->constants);
+	if (ret < 0) {
+		ckpt_err(ctx, ret, "incompatible kernel constants");
+		goto out;
+	}
+
+	ret = -ENOMEM;
+	uts = kmalloc(sizeof(*uts), GFP_KERNEL);
+	if (!uts)
+		goto out;
+
+	ctx->oflags = h->uflags;
+
+	/* FIX: verify compatibility of release, version and machine */
+	ret = _ckpt_read_buffer(ctx, uts->release, sizeof(uts->release));
+	if (ret < 0)
+		goto out;
+	ret = _ckpt_read_buffer(ctx, uts->version, sizeof(uts->version));
+	if (ret < 0)
+		goto out;
+	ret = _ckpt_read_buffer(ctx, uts->machine, sizeof(uts->machine));
+ out:
+	kfree(uts);
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+/* read the container configuration section */
+static int restore_container(struct ckpt_ctx *ctx)
+{
+	int ret = 0;
+	struct ckpt_hdr_container *h;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_CONTAINER);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+	ckpt_hdr_put(ctx, h);
+
+	return ret;
+}
+
+/* read the checkpoint trailer */
+static int restore_read_tail(struct ckpt_ctx *ctx)
+{
+	struct ckpt_hdr_tail *h;
+	int ret = 0;
+
+	h = ckpt_read_obj_type(ctx, sizeof(*h), CKPT_HDR_TAIL);
+	if (IS_ERR(h))
+		return PTR_ERR(h);
+
+	if (h->magic != CHECKPOINT_MAGIC_TAIL)
+		ret = -EINVAL;
+
+	ckpt_hdr_put(ctx, h);
+	return ret;
+}
+
+long do_restart(struct ckpt_ctx *ctx, pid_t pid)
+{
+	long ret;
+
+	ret = restore_read_header(ctx);
+	if (ret < 0)
+		return ret;
+	ret = restore_container(ctx);
+	if (ret < 0)
+		return ret;
+	ret = restore_task(ctx);
+	if (ret < 0)
+		return ret;
+	ret = restore_read_tail(ctx);
+
+	/* on success, adjust the return value if needed [TODO] */
+	return ret;
+}
diff --git a/kernel/checkpoint/sys.c b/kernel/checkpoint/sys.c
index a81750a..af8c1bf 100644
--- a/kernel/checkpoint/sys.c
+++ b/kernel/checkpoint/sys.c
@@ -8,12 +8,398 @@
  *  distribution for more details.
  */
 
+/* default debug level for output */
+#define CKPT_DFLAG  CKPT_DSYS
+
 #include <linux/sched.h>
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/uaccess.h>
+#include <linux/capability.h>
+#include <linux/checkpoint.h>
+
+/*
+ * Helpers to write(read) from(to) kernel space to(from) the checkpoint
+ * image file descriptor (similar to how a core-dump is performed).
+ *
+ *   _ckpt_kwrite() - write a kernel-space buffer to a file
+ *   _ckpt_kread() - read from a file to a kernel-space buffer
+ *
+ *   ckpt_kread() - read from the checkpoint image to a kernel-space buffer
+ *   ckpt_kwrite() - write a kernel-space buffer to the checkpoint image
+ *
+ * They latter two succeed only if the entire read or write succeeds,
+ * and return 0, or negative error otherwise.
+ */
+
+static ssize_t _ckpt_kwrite(struct file *file, void *addr, size_t count)
+{
+	loff_t pos;
+	int ret;
+
+	pos = file_pos_read(file);
+	ret = kernel_write(file, pos, addr, count);
+	if (ret < 0)
+		return ret;
+	file_pos_write(file, pos + ret);
+	return ret;
+}
+
+/* returns 0 on success */
+int ckpt_kwrite(struct ckpt_ctx *ctx, void *addr, size_t count)
+{
+	int ret;
+
+	ret = _ckpt_kwrite(ctx->file, addr, count);
+	if (ret < 0)
+		return ret;
+
+	ctx->total += count;
+	return 0;
+}
+
+static ssize_t _ckpt_kread(struct file *file, void *addr, size_t count)
+{
+	loff_t pos;
+	int ret;
+
+	pos = file_pos_read(file);
+	ret = kernel_read(file, pos, addr, count);
+	if (ret < 0)
+		return ret;
+	file_pos_write(file, pos + ret);
+	return ret;
+}
+
+/* returns 0 on success */
+int ckpt_kread(struct ckpt_ctx *ctx, void *addr, size_t count)
+{
+	int ret;
+
+	ret = _ckpt_kread(ctx->file, addr, count);
+	if (ret < 0)
+		return ret;
+	if (ret != count)
+		return -EPIPE;
+
+	ctx->total += count;
+	return 0;
+}
+
+/**
+ * ckpt_hdr_get - get a hdr of certain size
+ * @ctx: checkpoint context
+ * @len: desired length
+ *
+ * Returns pointer to header
+ */
+void *ckpt_hdr_get(struct ckpt_ctx *ctx, int len)
+{
+	return kzalloc(len, GFP_KERNEL);
+}
+EXPORT_SYMBOL(ckpt_hdr_get);
+
+/**
+ * _ckpt_hdr_put - free a hdr allocated with ckpt_hdr_get
+ * @ctx: checkpoint context
+ * @ptr: header to free
+ * @len: header length
+ *
+ * (requiring 'ptr' makes it easily interchangable with kmalloc/kfree
+ */
+void _ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr, int len)
+{
+	kfree(ptr);
+}
+EXPORT_SYMBOL(_ckpt_hdr_put);
+
+/**
+ * ckpt_hdr_put - free a hdr allocated with ckpt_hdr_get
+ * @ctx: checkpoint context
+ * @ptr: header to free
+ *
+ * It is assumed that @ptr begins with a 'struct ckpt_hdr'.
+ */
+void ckpt_hdr_put(struct ckpt_ctx *ctx, void *ptr)
+{
+	struct ckpt_hdr *h = (struct ckpt_hdr *) ptr;
+	_ckpt_hdr_put(ctx, ptr, h->len);
+}
+EXPORT_SYMBOL(ckpt_hdr_put);
+
+/**
+ * ckpt_hdr_get_type - get a hdr of certain size
+ * @ctx: checkpoint context
+ * @len: number of bytes to reserve
+ *
+ * Returns pointer to reserved space on hbuf
+ */
+void *ckpt_hdr_get_type(struct ckpt_ctx *ctx, int len, int type)
+{
+	struct ckpt_hdr *h;
+
+	h = ckpt_hdr_get(ctx, len);
+	if (!h)
+		return NULL;
+
+	h->type = type;
+	h->len = len;
+	return h;
+}
+EXPORT_SYMBOL(ckpt_hdr_get_type);
+
+/*
+ * Helpers to manage c/r contexts: allocated for each checkpoint and/or
+ * restart operation, and persists until the operation is completed.
+ */
+
+static void ckpt_ctx_free(struct ckpt_ctx *ctx)
+{
+	if (ctx->file)
+		fput(ctx->file);
+	if (ctx->logfile)
+		fput(ctx->logfile);
+	kfree(ctx);
+}
+
+static struct ckpt_ctx *ckpt_ctx_alloc(int fd, unsigned long uflags,
+				       unsigned long kflags, int logfd)
+{
+	struct ckpt_ctx *ctx;
+	int err;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return ERR_PTR(-ENOMEM);
+
+	ctx->uflags = uflags;
+	ctx->kflags = kflags;
+
+	mutex_init(&ctx->msg_mutex);
+
+	err = -EBADF;
+	ctx->file = fget(fd);
+	if (!ctx->file)
+		goto err;
+	if (logfd == CHECKPOINT_FD_NONE)
+		goto nolog;
+	ctx->logfile = fget(logfd);
+	if (!ctx->logfile)
+		goto err;
+ nolog:
+	return ctx;
+ err:
+	ckpt_ctx_free(ctx);
+	return ERR_PTR(err);
+}
+
+static void ckpt_set_error(struct ckpt_ctx *ctx, int err)
+{
+	ctx->errno = err;
+}
+
+/* helpers to handler log/dbg/err messages */
+void ckpt_msg_lock(struct ckpt_ctx *ctx)
+{
+	if (!ctx)
+		return;
+	mutex_lock(&ctx->msg_mutex);
+	ctx->msg[0] = '\0';
+	ctx->msglen = 1;
+}
+
+void ckpt_msg_unlock(struct ckpt_ctx *ctx)
+{
+	if (!ctx)
+		return;
+	mutex_unlock(&ctx->msg_mutex);
+}
+
+static inline int is_special_flag(char *s)
+{
+	if (*s == '%' && s[1] == '(' && s[2] != '\0' && s[3] == ')')
+		return 1;
+	return 0;
+}
+
+/*
+ * _ckpt_generate_fmt - handle the special flags in the enhanced format
+ * strings used by checkpoint/restart error messages.
+ * @ctx: checkpoint context
+ * @fmt: message format
+ *
+ * The special flags are surrounded by %() to help them visually stand
+ * out.  For instance, %(O) means an objref.  The following special
+ * flags are recognized:
+ *	O: objref
+ *	P: pointer
+ *	T: task
+ *	S: string
+ *	V: variable
+ *
+ * %(O) will be expanded to "[obj %d]".  Likewise P, S, and V, will
+ * also expand to format flags requiring an argument to the subsequent
+ * sprintf or printk.  T will be expanded to a string with no flags,
+ * requiring no further arguments.
+ *
+ * These do not accept any extra flags (i.e. min field width, precision,
+ * etc).
+ *
+ * The caller of ckpt_err() and _ckpt_err() must provide
+ * the additional variabes, in order, to match the @fmt (except for
+ * the T key), e.g.:
+ *
+ *	ckpt_err(ctx, err, "%(T)FILE flags %d %(O)\n", flags, objref);
+ *
+ * May be called under spinlock.
+ * Must be called with ctx->msg_mutex held.  The expanded format
+ * will be placed in ctx->fmt.
+ */
+static void _ckpt_generate_fmt(struct ckpt_ctx *ctx, char *fmt)
+{
+	char *s = ctx->fmt;
+	int len = 0;
+
+	for (; *fmt && len < CKPT_MSG_LEN; fmt++) {
+		if (!is_special_flag(fmt)) {
+			s[len++] = *fmt;
+			continue;
+		}
+		switch (fmt[2]) {
+		case 'O':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[obj %%d]");
+			break;
+		case 'P':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[ptr %%p]");
+			break;
+		case 'V':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[sym %%pS]");
+			break;
+		case 'S':
+			len += snprintf(s+len, CKPT_MSG_LEN-len, "[str %%s]");
+			break;
+		case 'T':
+			if (ctx->tsk)
+				len += snprintf(s+len, CKPT_MSG_LEN-len,
+					"[pid %d tsk %s]",
+					task_pid_vnr(ctx->tsk), ctx->tsk->comm);
+			else
+				len += snprintf(s+len, CKPT_MSG_LEN-len,
+					"[pid -1 tsk NULL]");
+			break;
+		default:
+			printk(KERN_ERR "c/r: bad format specifier %c\n",
+					fmt[2]);
+			BUG();
+		}
+		fmt += 3;
+	}
+	if (len == CKPT_MSG_LEN)
+		s[CKPT_MSG_LEN-1] = '\0';
+	else
+		s[len] = '\0';
+}
+
+static void _ckpt_msg_appendv(struct ckpt_ctx *ctx, int err, char *fmt,
+				va_list ap)
+{
+	int len = ctx->msglen;
+
+	if (err) {
+		len += snprintf(&ctx->msg[len], CKPT_MSG_LEN-len, "[err %d]",
+				 err);
+		if (len > CKPT_MSG_LEN)
+			goto full;
+	}
+
+	len += snprintf(&ctx->msg[len], CKPT_MSG_LEN-len, "[pos %lld]",
+			ctx->total);
+	len += vsnprintf(&ctx->msg[len], CKPT_MSG_LEN-len, fmt, ap);
+	if (len > CKPT_MSG_LEN) {
+full:
+		len = CKPT_MSG_LEN;
+		ctx->msg[CKPT_MSG_LEN-1] = '\0';
+	}
+	ctx->msglen = len;
+}
+
+void _ckpt_msg_append(struct ckpt_ctx *ctx, char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	_ckpt_msg_appendv(ctx, 0, fmt, ap);
+	va_end(ap);
+}
+
+void _ckpt_msg_complete(struct ckpt_ctx *ctx)
+{
+	int ret;
+
+	/* Don't write an empty or uninitialized msg */
+	if (ctx->msglen <= 1)
+		return;
+
+	if (ctx->kflags & CKPT_CTX_CHECKPOINT && ctx->errno) {
+		ret = ckpt_write_obj_type(ctx, NULL, 0, CKPT_HDR_ERROR);
+		if (!ret)
+			ret = ckpt_write_string(ctx, ctx->msg, ctx->msglen);
+		if (ret < 0)
+			printk(KERN_NOTICE "c/r: error string unsaved (%d): %s\n",
+			       ret, ctx->msg+1);
+	}
+
+	if (ctx->logfile) {
+		struct file *logfile = ctx->logfile;
+		loff_t pos = file_pos_read(logfile);
+		ret = kernel_write(logfile, pos, ctx->msg+1, ctx->msglen-1);
+		if (ret > 0)
+			file_pos_write(logfile, pos + ret);
+	}
+
+#ifdef CONFIG_CHECKPOINT_DEBUG
+	printk(KERN_DEBUG "%s", ctx->msg+1);
+#endif
+
+	ctx->msglen = 0;
+}
+
+#define __do_ckpt_msg(ctx, err, fmt) do {		\
+	va_list ap;					\
+	_ckpt_generate_fmt(ctx, fmt);			\
+	va_start(ap, fmt);				\
+	_ckpt_msg_appendv(ctx, err, ctx->fmt, ap);	\
+	va_end(ap);					\
+} while (0)
+
+void _do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...)
+{
+	__do_ckpt_msg(ctx, err, fmt);
+}
+
+void do_ckpt_msg(struct ckpt_ctx *ctx, int err, char *fmt, ...)
+{
+	if (!ctx)
+		return;
+
+	ckpt_msg_lock(ctx);
+	__do_ckpt_msg(ctx, err, fmt);
+	_ckpt_msg_complete(ctx);
+	ckpt_msg_unlock(ctx);
+
+	if (err)
+		ckpt_set_error(ctx, err);
+}
+EXPORT_SYMBOL(do_ckpt_msg);
+
+/* checkpoint/restart syscalls */
 
 /**
- * sys_checkpoint - checkpoint a container
+ * do_sys_checkpoint - checkpoint a container
  * @pid: pid of the container init(1) process
  * @fd: file to which dump the checkpoint image
  * @flags: checkpoint operation flags
@@ -22,14 +408,32 @@
  * Returns positive identifier on success, 0 when returning from restart
  * or negative value on error
  */
-SYSCALL_DEFINE4(checkpoint, pid_t, pid, int, fd,
-		unsigned long, flags, int, logfd)
+long do_sys_checkpoint(pid_t pid, int fd, unsigned long flags, int logfd)
 {
-	return -ENOSYS;
+	struct ckpt_ctx *ctx;
+	long ret;
+
+	/* no flags for now */
+	if (flags)
+		return -EINVAL;
+
+	if (pid == 0)
+		pid = task_pid_vnr(current);
+	ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_CHECKPOINT, logfd);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = do_checkpoint(ctx, pid);
+
+	if (!ret)
+		ret = ctx->crid;
+
+	ckpt_ctx_free(ctx);
+	return ret;
 }
 
 /**
- * sys_restart - restart a container
+ * do_sys_restart - restart a container
  * @pid: pid of task root (in coordinator's namespace), or 0
  * @fd: file from which read the checkpoint image
  * @flags: restart operation flags
@@ -38,8 +442,49 @@ SYSCALL_DEFINE4(checkpoint, pid_t, pid, int, fd,
  * Returns negative value on error, or otherwise returns in the realm
  * of the original checkpoint
  */
-SYSCALL_DEFINE4(restart, pid_t, pid, int, fd,
-		unsigned long, flags, int, logfd)
+long do_sys_restart(pid_t pid, int fd, unsigned long flags, int logfd)
+{
+	struct ckpt_ctx *ctx = NULL;
+	long ret;
+
+	/* no flags for now */
+	if (flags)
+		return -EINVAL;
+
+	ctx = ckpt_ctx_alloc(fd, flags, CKPT_CTX_RESTART, logfd);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = do_restart(ctx, pid);
+
+	/* restart(2) isn't idempotent: can't restart syscall */
+	if (ret == -ERESTARTSYS || ret == -ERESTARTNOINTR ||
+	    ret == -ERESTARTNOHAND || ret == -ERESTART_RESTARTBLOCK)
+		ret = -EINTR;
+
+	ckpt_ctx_free(ctx);
+	return ret;
+}
+
+
+/* 'ckpt_debug_level' controls the verbosity level of c/r code */
+#ifdef CONFIG_CHECKPOINT_DEBUG
+
+/* FIX: allow to change during runtime */
+unsigned long __read_mostly ckpt_debug_level = CKPT_DDEFAULT;
+EXPORT_SYMBOL(ckpt_debug_level);
+
+static __init int ckpt_debug_setup(char *s)
 {
-	return -ENOSYS;
+	long val, ret;
+
+	ret = strict_strtoul(s, 10, &val);
+	if (ret < 0)
+		return ret;
+	ckpt_debug_level = val;
+	return 0;
 }
+
+__setup("ckpt_debug=", ckpt_debug_setup);
+
+#endif /* CONFIG_CHECKPOINT_DEBUG */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 28b42b9..df9a344 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1230,6 +1230,19 @@ config ASYNC_RAID6_TEST
 
 	  If unsure, say N.
 
+config CHECKPOINT_DEBUG
+	bool "Checkpoint/restart debugging (EXPERIMENTAL)"
+	depends on CHECKPOINT
+	default y
+	help
+	  This options turns on the debugging output of checkpoint/restart.
+	  The level of verbosity is controlled by 'ckpt_debug_level' and can
+	  be set at boot time with "ckpt_debug=" option.
+
+	  Turning this option off will reduce the size of the c/r code. If
+	  turned on, it is unlikely to incur visible overhead if the debug
+	  level is set to zero.
+
 source "samples/Kconfig"
 
 source "lib/Kconfig.kgdb"
-- 
1.7.2.2

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom policy in Canada: sign http://dissolvethecrtc.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH 05/19] c/r: documentation
From: Dan Smith @ 2010-12-14 16:14 UTC (permalink / raw)
  To: danms-r/Jw6+rmf7HQT0dZR+AlfA
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, Oren Laadan, Dave Hansen
In-Reply-To: <1292343307-7870-1-git-send-email-danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>

From: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>

Covers application checkpoint/restart, overall design, interfaces,
usage, shared objects, and and checkpoint image format.

Changelog[v19-rc1]:
  - Update documentation and examples for new syscalls API
  - [Liu Alexander] Fix typos
  - [Serge Hallyn] Update checkpoint image format
Changelog[v16]:
  - Update documentation
  - Unify into readme.txt and usage.txt
Changelog[v14]:
  - Discard the 'h.parent' field
  - New image format (shared objects appear before they are referenced
    unless they are compound)
Changelog[v8]:
  - Split into multiple files in Documentation/checkpoint/...
  - Extend documentation, fix typos and comments from feedback

Cc: linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg@public.gmane.org
Cc: linux-fsdevel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Signed-off-by: Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
Signed-off-by: Dave Hansen <dave-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
Acked-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
Tested-by: Serge E. Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
---
 Documentation/checkpoint/checkpoint.c      |   38 +++
 Documentation/checkpoint/readme.txt        |  370 ++++++++++++++++++++++++++++
 Documentation/checkpoint/self_checkpoint.c |   69 +++++
 Documentation/checkpoint/self_restart.c    |   40 +++
 Documentation/checkpoint/usage.txt         |  247 +++++++++++++++++++
 5 files changed, 764 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/checkpoint/checkpoint.c
 create mode 100644 Documentation/checkpoint/readme.txt
 create mode 100644 Documentation/checkpoint/self_checkpoint.c
 create mode 100644 Documentation/checkpoint/self_restart.c
 create mode 100644 Documentation/checkpoint/usage.txt

diff --git a/Documentation/checkpoint/checkpoint.c b/Documentation/checkpoint/checkpoint.c
new file mode 100644
index 0000000..8560f30
--- /dev/null
+++ b/Documentation/checkpoint/checkpoint.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include <linux/checkpoint.h>
+
+static inline int checkpoint(pid_t pid, int fd, unsigned long flags)
+{
+	return syscall(__NR_checkpoint, pid, fd, flags);
+}
+
+int main(int argc, char *argv[])
+{
+	pid_t pid;
+	int ret;
+
+	if (argc != 2) {
+		printf("usage: ckpt PID\n");
+		exit(1);
+	}
+
+	pid = atoi(argv[1]);
+	if (pid <= 0) {
+		printf("invalid pid\n");
+		exit(1);
+	}
+
+	ret = checkpoint(pid, STDOUT_FILENO, CHECKPOINT_SUBTREE);
+
+	if (ret < 0)
+		perror("checkpoint");
+	else
+		printf("checkpoint id %d\n", ret);
+
+	return (ret > 0 ? 0 : 1);
+}
diff --git a/Documentation/checkpoint/readme.txt b/Documentation/checkpoint/readme.txt
new file mode 100644
index 0000000..4fa5560
--- /dev/null
+++ b/Documentation/checkpoint/readme.txt
@@ -0,0 +1,370 @@
+
+	      Checkpoint-Restart support in the Linux kernel
+	==========================================================
+
+Copyright (C) 2008-2010 Oren Laadan
+
+Author:		Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
+
+License:	The GNU Free Documentation License, Version 1.2
+		(dual licensed under the GPL v2)
+
+Contributors:	Oren Laadan <orenl-eQaUEPhvms7ENvBUuze7eA@public.gmane.org>
+		Serge Hallyn <serue-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
+		Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
+		Matt Helsley <matthltc-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
+		Nathan Lynch <ntl-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org>
+		Sukadev Bhattiprolu <sukadev-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
+		Dave Hansen <dave-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8@public.gmane.org>
+
+
+Introduction
+============
+
+Application checkpoint/restart [C/R] is the ability to save the state
+of a running application so that it can later resume its execution
+from the time at which it was checkpointed. An application can be
+migrated by checkpointing it on one machine and restarting it on
+another. C/R can provide many potential benefits:
+
+* Failure recovery: by rolling back to a previous checkpoint
+
+* Improved response time: by restarting applications from checkpoints
+  instead of from scratch.
+
+* Improved system utilization: by suspending long running CPU
+  intensive jobs and resuming them when load decreases.
+
+* Fault resilience: by migrating applications off faulty hosts.
+
+* Dynamic load balancing: by migrating applications to less loaded
+  hosts.
+
+* Improved service availability and administration: by migrating
+  applications before host maintenance so that they continue to run
+  with minimal downtime
+
+* Time-travel: by taking periodic checkpoints and restarting from
+  any previous checkpoint.
+
+Compared to hypervisor approaches, application C/R is more lightweight
+since it need only save the state associated with applications, while
+operating system data structures (e.g. buffer cache, drivers state
+and the like) are uninteresting.
+
+
+Overall design
+==============
+
+Checkpoint and restart are done in the kernel as much as possible.
+Two new system calls are introduced to provide C/R: sys_checkpoint()
+and sys_restart(). They both operate on a process tree (hierarchy),
+either a whole container or a subtree of a container.
+
+Checkpointing entire containers ensures that there are no dependencies
+on anything outside the container, which guarantees that a matching
+restart will succeed (assuming that the file system state remains
+consistent). However, it requires that users will always run the tasks
+that they wish to checkpoint inside containers. This is ideal for,
+e.g., private virtual servers and the like.
+
+In contrast, when checkpointing a subtree of a container it is up to
+the user to ensure that dependencies either don't exist or can be
+safely ignored. This is useful, for instance, for HPC scenarios or
+even a user that would like to periodically checkpoint a long-running
+batch job.
+
+An additional system call, a la madvise(), is planned, so that tasks
+can advise the kernel how to handle specific resources. For instance,
+a task could ask to skip a memory area at checkpoint to save space,
+or to use a preset file descriptor at restart instead of restoring it
+from the checkpoint image. It will provide the flexibility that is
+particularly useful to address the needs of a diverse crowd of users
+and use-cases.
+
+Syscall sys_checkpoint() is given a pid that indicates the top of the
+hierarchy, a file descriptor to store the image, and flags. The code
+serializes internal user- and kernel-state and writes it out to the
+file descriptor. The resulting image is stream-able. The processes are
+expected to be frozen for the duration of the checkpoint.
+
+In general, a checkpoint consists of 5 steps:
+1. Pre-dump
+2. Freeze the container/subtree
+3. Save tasks' and kernel state		<-- sys_checkpoint()
+4. Thaw (or kill) the container/subtree
+5. Post-dump
+
+Step 3 is done by calling sys_checkpoint(). Steps 1 and 5 are an
+optimization to reduce application downtime. In particular, "pre-dump"
+works before freezing the container, e.g. the pre-copy for live
+migration, and "post-dump" works after the container resumes
+execution, e.g. write-back the data to secondary storage.
+
+The kernel exports a relatively opaque 'blob' of data to userspace
+which can then be handed to the new kernel at restart time.  The
+'blob' contains data and state of select portions of kernel structures
+such as VMAs and mm_structs, as well as copies of the actual memory
+that the tasks use. Any changes in this blob's format between kernel
+revisions can be handled by an in-userspace conversion program.
+
+To restart, userspace first create a process hierarchy that matches
+that of the checkpoint, and each task calls sys_restart(). The syscall
+reads the saved kernel state from a file descriptor, and re-creates
+the resources that the tasks need to resume execution. The restart
+code is executed by each task that is restored in the new hierarchy to
+reconstruct its own state.
+
+In general, a restart consists of 3 steps:
+1. Create hierarchy
+2. Restore tasks' and kernel state	<-- sys_restart()
+3. Resume userspace (or freeze tasks)
+
+Because the process hierarchy, during restart in created in userspace,
+the restarting tasks have the flexibility to prepare before calling
+sys_restart().
+
+
+Checkpoint image format
+=======================
+
+The checkpoint image format is built of records that consist of a
+pre-header identifying its contents, followed by a payload. This
+format allow userspace tools to easily parse and skip through the
+image without requiring intimate knowledge of the data. It will also
+be handy to enable parallel checkpointing in the future where multiple
+threads interleave data from multiple processes into a single stream.
+
+The pre-header is defined by 'struct ckpt_hdr' as follows: @type
+identifies the type of the payload, @len tells its length in bytes
+including the pre-header.
+
+struct ckpt_hdr {
+	__s32 type;
+	__s32 len;
+};
+
+The pre-header must be the first component in all other headers. For
+instance, the task data is saved in 'struct ckpt_hdr_task', which
+looks something like this:
+
+struct ckpt_hdr_task {
+	struct ckpt_hdr h;
+	__u32 pid;
+	...
+};
+
+THE IMAGE FORMAT IS EXPECTED TO CHANGE over time as more features are
+supported, or as existing features change in the kernel and require to
+adjust their representation. Any such changes will be be handled by
+in-userspace conversion tools.
+
+The general format of the checkpoint image is as follows:
+* Image header
+* Container configuration
+* Task hierarchy
+* Tasks' state
+* Image trailer
+
+The image always begins with a general header that holds a magic
+number, an architecture identifier (little endian format), a format
+version number (@rev), followed by information about the kernel
+(currently version and UTS data). It also holds the time of the
+checkpoint and the flags given to sys_checkpoint(). This header is
+followed by an arch-specific header.
+
+The container configuration section containers information that is
+global to the container. Security (LSM) configuration is one example.
+Network configuration and container-wide mounts may also go here, so
+that the userspace restart coordinator can re-create a suitable
+environment.
+
+The task hierarchy comes next so that userspace tools can read it
+early (even from a stream) and re-create the restarting tasks. This is
+basically an array of all checkpointed tasks, and their relationships
+(parent, siblings, threads, etc).
+
+Then the state of all tasks is saved, in the order that they appear in
+the tasks array above. For each state, we save data like task_struct,
+namespaces, open files, memory layout, memory contents, cpu state,
+signals and signal handlers, etc. For resources that are shared among
+multiple processes, we first checkpoint said resource (and only once),
+and in the task data we give a reference to it. More about shared
+resources below.
+
+Finally, the image always ends with a trailer that holds a (different)
+magic number, serving for sanity check.
+
+
+Shared objects
+==============
+
+Many resources may be shared by multiple tasks (e.g. file descriptors,
+memory address space, etc), or even have multiple references from
+other resources (e.g. a single inode that represents two ends of a
+pipe).
+
+Shared objects are tracked using a hash table (objhash) to ensure that
+they are only checkpointed or restored once. To handle a shared
+object, it is first looked up in the hash table, to determine if is
+the first encounter or a recurring appearance.  The hash table itself
+is not saved as part of the checkpoint image: it is constructed
+dynamically during both checkpoint and restart, and discarded at the
+end of the operation.
+
+During checkpoint, when a shared object is encountered for the first
+time, it is inserted to the hash table, indexed by its kernel address.
+It is assigned an identifier (@objref) in order of appearance, and
+then its state is saved. Subsequent lookups of that object in the hash
+will yield that entry, in which case only the @objref is saved, as
+opposed the entire state of the object.
+
+During restart, shared objects are indexed by their @objref as given
+during the checkpoint. On the first appearance of each shared object,
+a new resource will be created and its state restored from the image.
+Then the object is added to the hash table. Subsequent lookups of the
+same unique identifier in the hash table will yield that entry, and
+then the existing object instance is reused instead of creating
+a new one.
+
+The hash grabs a reference to each object that is inserted, and
+maintains this reference for the entire lifetime of the hash. Thus,
+it is always safe to reference an object that is stored in the hash.
+The hash is "one-way" in the sense that objects that are added are
+never deleted from the hash until the hash is discarded. This, in
+turn, happens only when the checkpoint (or restart) terminates.
+
+Shared objects are thus saved when they are first seen, and _before_
+the parent object that uses them. Therefore by the time the parent
+objects needs them, they should already be in the objhash. The one
+exception is when more than a single shared resource will be restarted
+at once (e.g. like the two ends of a pipe, or all the namespaces in an
+nsproxy). In this case the parent object is dumped first followed by
+the individual sub-resources).
+
+The checkpoint image is stream-able, meaning that restarting from it
+may not require lseek(). This is enforced at checkpoint time, by
+carefully selecting the order of shared objects, to respect the rule
+that an object is always saved before the objects that refers to it.
+
+
+Memory contents format
+======================
+
+The memory contents of a given memory address space (->mm) is dumped
+as a sequence of vma objects, represented by 'struct ckpt_hdr_vma'.
+This header details the vma properties, and a reference to a file
+(if file backed) or an inode (or shared memory) object.
+
+The vma header is followed by the actual contents - but only those
+pages that need to be saved, i.e. dirty pages. They are written in
+chunks of data, where each chunks contains a header that indicates
+that number of pages in the chunk, followed by an array of virtual
+addresses and then an array of actual page contents. The last chunk
+holds zero pages.
+
+To illustrate this, consider a single simple task with two vmas: one
+is file mapped with two dumped pages, and the other is anonymous with
+three dumped pages. The memory dump will look like this:
+
+	ckpt_hdr + ckpt_hdr_vma
+		ckpt_hdr_pgarr (nr_pages = 2)
+			addr1, addr2
+			page1, page2
+		ckpt_hdr_pgarr (nr_pages = 0)
+	ckpt_hdr + ckpt_hdr_vma
+		ckpt_hdr_pgarr (nr_pages = 3)
+		addr3, addr4, addr5
+		page3, page4, page5
+		ckpt_hdr_pgarr (nr_pages = 0)
+
+
+Error handling
+==============
+
+Both checkpoint and restart operations may fail due to a variety of
+reasons. Using a simple, single return value from the system call is
+insufficient to report the reason of a failure.
+
+Instead, both sys_checkpoint() and sys_restart() accept an additional
+argument - a file descriptor to which the kernel writes diagnostic
+and debugging information. Both the checkpoint and restart userspace
+utilities have options to specify a filename to store this log.
+
+In addition, checkpoint provides informative status report upon
+failure in the checkpoint image in the form of (one or more) error
+objects, 'struct ckpt_hdr_err'.  An error objects consists of a
+mandatory pre-header followed by a null character ('\0'), and then a
+string that describes the error. By default, if an error occurs, this
+will be the last object written to the checkpoint image.
+
+Upon failure, the caller can examine the image (e.g. with 'ckptinfo')
+and extract the detailed error message. The leading '\0' is useful if
+one wants to seek back from the end of the checkpoint image, instead
+of parsing the entire image separately.
+
+
+Security
+========
+
+The main question is whether sys_checkpoint() and sys_restart()
+require privileged or unprivileged operation.
+
+Early versions checked capable(CAP_SYS_ADMIN) assuming that we would
+attempt to remove the need for privilege, so that all users could
+safely use it. Arnd Bergmann pointed out that it'd make more sense to
+let unprivileged users use them now, so that we'll be more careful
+about the security as patches roll in.
+
+Checkpoint: the main concern is whether a task that performs the
+checkpoint of another task has sufficient privileges to access its
+state. We address this by requiring that the checkpointer task will be
+able to ptrace the target task, by means of ptrace_may_access() with
+access mode.
+
+Restart: the main concern is that we may allow an unprivileged user to
+feed the kernel with random data. To this end, the restart works in a
+way that does not skip the usual security checks. Task credentials,
+i.e. euid, reuid, and LSM security contexts currently come from the
+caller, not the checkpoint image.  As credentials are restored too,
+the ability of a task that calls sys_restore() to setresuid/setresgid
+to those values must be checked.
+
+Keeping the restart procedure to operate within the limits of the
+caller's credentials means that there various scenarios that cannot
+be supported. For instance, a setuid program that opened a protected
+log file and then dropped privileges will fail the restart, because
+the user won't have enough credentials to reopen the file. In these
+cases, we should probably treat restarting like inserting a kernel
+module: surely the user can cause havoc by providing incorrect data,
+but then again we must trust the root account.
+
+So that's why we don't want CAP_SYS_ADMIN required up-front. That way
+we will be forced to more carefully review each of those features.
+However, this can be controlled with a sysctl-variable.
+
+
+Kernel interfaces
+=================
+
+* To checkpoint a vma, the 'struct vm_operations_struct' needs to
+  provide a method ->checkpoint:
+    int checkpoint(struct ckpt_ctx *, struct vma_struct *)
+  Restart requires a matching (exported) restore:
+    int restore(struct ckpt_ctx *, struct mm_struct *, struct ckpt_hdr_vma *)
+
+* To checkpoint a file, the 'struct file_operations' needs to provide
+  the methods ->checkpoint and ->collect:
+    int checkpoint(struct ckpt_ctx *, struct file *)
+    int collect(struct ckpt_ctx *, struct file *)
+  Restart requires a matching (exported) restore:
+    int restore(struct ckpt_ctx *, struct ckpt_hdr_file *)
+  For most file systems, generic_file_{checkpoint,restore}() can be
+  used.
+
+* To checkpoint a socket, the 'struct proto_ops' needs to provide
+  the methods ->checkpoint, ->collect and ->restore:
+    int checkpoint(struct ckpt_ctx *ctx, struct socket *sock);
+    int collect(struct ckpt_ctx *ctx, struct socket *sock);
+    int restore(struct ckpt_ctx *, struct socket *sock, struct ckpt_hdr_socket *h)
+
diff --git a/Documentation/checkpoint/self_checkpoint.c b/Documentation/checkpoint/self_checkpoint.c
new file mode 100644
index 0000000..27dba0d
--- /dev/null
+++ b/Documentation/checkpoint/self_checkpoint.c
@@ -0,0 +1,69 @@
+/*
+ *  self_checkpoint.c: demonstrate self-checkpoint
+ *
+ *  Copyright (C) 2008 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <math.h>
+#include <sys/syscall.h>
+
+#include <linux/checkpoint.h>
+
+static inline int checkpoint(pid_t pid, int fd, unsigned long flags)
+{
+	return syscall(__NR_checkpoint, pid, fd, flags, CHECKPOINT_FD_NONE);
+}
+
+#define OUTFILE  "/tmp/cr-self.out"
+
+int main(int argc, char *argv[])
+{
+	pid_t pid = getpid();
+	FILE *file;
+	int i, ret;
+
+	close(0);
+	close(2);
+
+	unlink(OUTFILE);
+	file = fopen(OUTFILE, "w+");
+	if (!file) {
+		perror("open");
+		exit(1);
+	}
+	if (dup2(0, 2) < 0) {
+		perror("dup2");
+		exit(1);
+	}
+
+	fprintf(file, "hello, world!\n");
+	fflush(file);
+
+	for (i = 0; i < 1000; i++) {
+		sleep(1);
+		fprintf(file, "count %d\n", i);
+		fflush(file);
+
+		if (i != 2)
+			continue;
+		ret = checkpoint(pid, STDOUT_FILENO, CHECKPOINT_SUBTREE);
+		if (ret < 0) {
+			fprintf(file, "ckpt: %s\n", strerror(errno));
+			exit(2);
+		}
+
+		fprintf(file, "checkpoint ret: %d\n", ret);
+		fflush(file);
+	}
+
+	return 0;
+}
diff --git a/Documentation/checkpoint/self_restart.c b/Documentation/checkpoint/self_restart.c
new file mode 100644
index 0000000..647ce51
--- /dev/null
+++ b/Documentation/checkpoint/self_restart.c
@@ -0,0 +1,40 @@
+/*
+ *  self_restart.c: demonstrate self-restart
+ *
+ *  Copyright (C) 2008 Oren Laadan
+ *
+ *  This file is subject to the terms and conditions of the GNU General Public
+ *  License.  See the file COPYING in the main directory of the Linux
+ *  distribution for more details.
+ */
+
+#define _GNU_SOURCE        /* or _BSD_SOURCE or _SVID_SOURCE */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+#include <linux/checkpoint.h>
+
+static inline int restart(pid_t pid, int fd, unsigned long flags)
+{
+	return syscall(__NR_restart, pid, fd, flags, CHECKPOINT_FD_NONE);
+}
+
+int main(int argc, char *argv[])
+{
+	pid_t pid = getpid();
+	int ret;
+
+	ret = restart(pid, STDIN_FILENO, RESTART_TASKSELF);
+	if (ret < 0)
+		perror("restart");
+
+	printf("should not reach here !\n");
+
+	return 0;
+}
diff --git a/Documentation/checkpoint/usage.txt b/Documentation/checkpoint/usage.txt
new file mode 100644
index 0000000..c6fc045
--- /dev/null
+++ b/Documentation/checkpoint/usage.txt
@@ -0,0 +1,247 @@
+
+	      How to use Checkpoint-Restart
+	=========================================
+
+
+API
+===
+
+The API consists of three new system calls:
+
+* long checkpoint(pid_t pid, int fd, unsigned long flag, int logfd);
+
+ Checkpoint a (sub-)container whose root task is identified by @pid,
+ to the open file indicated by @fd. If @logfd isn't -1, it indicates
+ an open file to which error and debug messages are written. @flags
+ may be one or more of:
+   - CHECKPOINT_SUBTREE : allow checkpoint of sub-container
+ (other value are not allowed).
+
+ Returns: a positive checkpoint identifier (ckptid) upon success, 0 if
+ it returns from a restart, and -1 if an error occurs. The ckptid will
+ uniquely identify a checkpoint image, for as long as the checkpoint
+ is kept in the kernel (e.g. if one wishes to keep a checkpoint, or a
+ partial checkpoint, residing in kernel memory).
+
+* long sys_restart(pid_t pid, int fd, unsigned long flags, int logfd);
+
+ Restart a process hierarchy from a checkpoint image that is read from
+ the blob stored in the file indicated by @fd.  If @logfd isn't -1, it
+ indicates an open file to which error and debug messages are written.
+ @flags will have future meaning (must be 0 for now). @pid indicates
+ the root of the hierarchy as seen in the coordinator's pid-namespace,
+ and is expected to be a child of the coordinator. @flags may be one
+ or more of:
+   - RESTART_TASKSELF : (self) restart of a single process
+   - RESTART_FROEZN : processes remain frozen once restart completes
+   - RESTART_GHOST : process is a ghost (placeholder for a pid)
+ (Note that this argument may mean 'ckptid' to identify an in-kernel
+ checkpoint image, with some @flags in the future).
+
+ Returns: -1 if an error occurs, 0 on success when restarting from a
+ "self" checkpoint, and return value of system call at the time of the
+ checkpoint when restarting from an "external" checkpoint.
+
+ (If a process was frozen for checkpoint while in userspace, it will
+ resume running in userspace exactly where it was interrupted. If it
+ was frozen while in kernel doing a syscall, it will return what the
+ syscall returned when interrupted/completed, and proceed from there
+ as if it had only been frozen and then thawed. Finally, if it did a
+ self-checkpoint, it will resume to the first instruction after the
+ call to checkpoint(2), having returned 0, to indicate whether the
+ return is from the checkpoint or a restart).
+
+* int clone_with_pid(unsigned long clone_flags, void *news,
+		     int *parent_tidptr, int *child_tidptr,
+		     struct target_pid_set *pid_set)
+
+  struct target_pid_set {
+	 int num_pids;
+	 pid_t *target_pids;
+  }
+
+ Container restart requires that a task have the same pid it had when
+ it was checkpointed. When containers are nested the tasks within the
+ containers exist in multiple pid namespaces and hence have multiple
+ pids to specify during restart.
+
+ clone_with_pids(), intended for use during restart, is similar to
+ clone(), except that it takes a 'target_pid_set' parameter. This
+ parameter lets caller choose specific pid numbers for the child
+ process, in the process's active and ancestor pid namespaces.
+
+ Unlike clone(), clone_with_pids() needs CAP_SYS_ADMIN, at least for
+ now, to prevent unprivileged processes from misusing this interface.
+
+ If a target-pid is 0, the kernel continues to assign a pid for the
+ process in that namespace. If a requested pid is taken, the system
+ call fails with -EBUSY. If 'pid_set.num_pids' exceeds the current
+ nesting level of pid namespaces, the system call fails with -EINVAL.
+
+
+Sysctl/proc
+===========
+
+/proc/sys/kernel/ckpt_unpriv_allowed		[default = 1]
+  controls whether c/r operation is allowed for unprivileged users
+
+
+Operation
+=========
+
+The granularity of a checkpoint usually is a process hierarchy. The
+'pid' argument is interpreted in the caller's pid namespace. So to
+checkpoint a container whose init task (pid 1 in that pidns) appears
+as pid 3497 the caller's pidns, the caller must use pid 3497. Passing
+pid 1 will attempt to checkpoint the caller's container, and if the
+caller isn't privileged and init is owned by root, it will fail.
+
+Unless the CHECKPOINT_SUBTREE flag is set, if the caller passes a pid
+which does not refer to a container's init task, then sys_checkpoint()
+would return -EINVAL.
+
+We assume that during checkpoint and restart the container state is
+quiescent. During checkpoint, this means that all affected tasks are
+frozen (or otherwise stopped). During restart, this means that all
+affected tasks are executing the sys_restart() call. In both cases, if
+there are other tasks possible sharing state with the container, they
+must not modify it during the operation. It is the responsibility of
+the caller to follow this requirement.
+
+If the assumption that all tasks are frozen and that there is no other
+sharing doesn't hold - then the results of the operation are undefined
+(just as, e.g. not calling execve() immediately after vfork() produces
+undefined results). In particular, either checkpoint will fail, or it
+may produce a checkpoint image that can't be restarted, or (unlikely)
+the restart may produce a container whose state does not match that of
+the original container.
+
+
+User tools
+==========
+
+* checkpoint(1): a tool to perform a checkpoint of a container/subtree
+* restart(1): a tool to restart a container/subtree
+* ckptinfo: a tool to examine a checkpoint image
+
+It is best to use the dedicated user tools for checkpoint and restart.
+
+If you insist, then here is a code snippet that illustrates how a
+checkpoint is initiated by a process inside a container - the logic is
+similar to fork():
+	...
+	ckptid = checkpoint(0, ...);
+	switch (crid) {
+	case -1:
+		perror("checkpoint failed");
+		break;
+	default:
+		fprintf(stderr, "checkpoint succeeded, CRID=%d\n", ret);
+		/* proceed with execution after checkpoint */
+		...
+		break;
+	case 0:
+		fprintf(stderr, "returned after restart\n");
+		/* proceed with action required following a restart */
+		...
+		break;
+	}
+	...
+
+And to initiate a restart, the process in an empty container can use
+logic similar to execve():
+	...
+	if (restart(pid, ...) < 0)
+		perror("restart failed");
+	/* only get here if restart failed */
+	...
+
+Note, that the code also supports "self" checkpoint, where a process
+can checkpoint itself. This mode does not capture the relationships of
+the task with other tasks, or any shared resources. It is useful for
+application that wish to be able to save and restore their state.
+They will either not use (or care about) shared resources, or they
+will be aware of the operations and adapt suitably after a restart.
+The code above can also be used for "self" checkpoint.
+
+
+You may find the following sample programs useful:
+
+* checkpoint.c: accepts a 'pid' and checkpoint that task to stdout
+* self_checkpoint.c: a simple test program doing self-checkpoint
+* self_restart.c: restarts a (self-) checkpoint image from stdin
+
+See also the utilities 'checkpoint' and 'restart' (from user-cr).
+
+
+"External" checkpoint
+=====================
+
+To do "external" checkpoint, you need to first freeze that other task
+either using the freezer cgroup.
+
+Restart does not preserve the original PID yet, (because we haven't
+solved yet the fork-with-specific-pid issue). In a real scenario, you
+probably want to first create a new names space, and have the init
+task there call 'sys_restart()'.
+
+I tested it this way:
+	$ ./test &
+	[1] 3493
+
+	$ echo 3493 > /cgroup/0/tasks
+	$ echo FROZEN > /cgroup/0/freezer.state
+	$ ./checkpoint 3493 > ckpt.image
+
+	$ mv /tmp/cr-test.out /tmp/cr-test.out.orig
+	$ cp /tmp/cr-test.out.orig /tmp/cr-test.out
+
+	$ echo THAWED > /cgroup/0/freezer.state
+
+	$ ./self_restart < ckpt.image
+Now compare the output of the two output files.
+
+
+"Self" checkpoint
+================
+
+To do self-checkpoint, you can incorporate the code from
+self_checkpoint.c into your application.
+
+Here is how to test the self-checkpoint:
+	$ ./self_checkpoint > self.image &
+	[1] 3512
+
+	$ sleep 3
+	$ mv /tmp/cr-self.out /tmp/cr-self.out.orig
+	$ cp /tmp/cr-self.out.orig /tmp/cr-self.out
+
+	$ cat /tmp/cr-self.out
+	hello, world!
+	count 0
+	count 1
+	count 2
+	checkpoint ret: 1
+	count 3
+	...
+
+	$ sed -i 's/count/xxxxx/g' /tmp/cr-self.out
+
+	$ ./self_restart < self.image &
+
+Now compare the output of the two output files.
+	$ cat /tmp/cr-self.out
+	hello, world!
+	xxxxx 0
+	xxxxx 1
+	xxxxx 2
+	checkpoint ret: 0
+	count 3
+	...
+
+
+Note how in test.c we close stdin, stdout, stderr - that's because
+currently we only support regular files (not ttys/ptys).
+
+If you check the output of ps, you'll see that "self_restart" changed
+its name to "test" or "self_checkpoint", as expected.
-- 
1.7.2.2

^ permalink raw reply related

* [PATCH net-next-2.6] bnx2: remove cancel_work_sync() from remove_one
From: Tejun Heo @ 2010-12-14 16:09 UTC (permalink / raw)
  To: lkml, David S. Miller, Michael Chan, netdev

Michael pointed out that bnx2_close() already cancels bp->reset_task
and thus it is guaranteed to be idle when bnx2_remove_one() is called.
Remove the unnecessary cancel_work_sync() in remove_one.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Michael Chan <mchan@broadcom.com>
---
 drivers/net/bnx2.c |    2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 5c811f3..85fc2c8 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -8393,8 +8393,6 @@ bnx2_remove_one(struct pci_dev *pdev)
 	struct net_device *dev = pci_get_drvdata(pdev);
 	struct bnx2 *bp = netdev_priv(dev);

-	cancel_work_sync(&bp->reset_task);
-
 	unregister_netdev(dev);

 	if (bp->mips_firmware)

^ permalink raw reply related

* Re: Possible regression: Packet drops during iptables calls
From: Jesper Dangaard Brouer @ 2010-12-14 16:09 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Stephen Hemminger, netfilter-devel, netdev
In-Reply-To: <1292340702.5934.5.camel@edumazet-laptop>

On Tue, 2010-12-14 at 16:31 +0100, Eric Dumazet wrote:
> Le mardi 14 décembre 2010 à 15:46 +0100, Jesper Dangaard Brouer a
> écrit :
> > I'm experiencing RX packet drops during call to iptables, on my
> > production servers.
> > 
> > Further investigations showed, that its only the CPU executing the
> > iptables command that experience packet drops!?  Thus, a quick fix was
> > to force the iptables command to run on one of the idle CPUs (This can
> > be achieved with the "taskset" command).
> > 
> > I have a 2x Xeon 5550 CPU system, thus 16 CPUs (with HT enabled).  We
> > only use 8 CPUs due to a multiqueue limitation of 8 queues in the
> > 1Gbit/s NICs (82576 chips).  CPUs 0 to 7 is assigned for packet
> > processing via smp_affinity.
> > 
> > Can someone explain why the packet drops only occur on the CPU
> > executing the iptables command?
> > 
> 
> It blocks BH
> 
> take a look at commits :
> 
> 24b36f0193467fa727b85b4c004016a8dae999b9
> netfilter: {ip,ip6,arp}_tables: dont block bottom half more than
> necessary 
> 
> 001389b9581c13fe5fc357a0f89234f85af4215d
> netfilter: {ip,ip6,arp}_tables: avoid lockdep false positive
> 
> for attempts to let BH fly ...
> 
> Unfortunately, lockdep rules :(

Is the lockdep check a false positive?
Could I run with 24b36f0193 in production, to fix my problem?

I forgot to mention I run kernel 2.6.35.8-comx01+ (based on Greg's stable kernel tree).

$ git describe --contains 24b36f019346
v2.6.36-rc1~571^2~46^2~7
$ git describe --contains 001389b9581c1
v2.6.36-rc3~2^2~42


> > What can we do to solve this issue?

Any ideas how we can proceed?

Looking closer at the two combined code change, I see that the code path
has been improved (a bit), as the local BH is only disabled inside the
for_each_possible_cpu(cpu).  Before local_bh was disabled for the hole
function.  Guess I need to reproduce this in my testlab.

Thanks for your 'ninja' input ;-)
-- 
Med venlig hilsen / Best regards
  Jesper Brouer
  ComX Networks A/S
  Linux Network Kernel Developer
  Cand. Scient Datalog / MSc.CS
  Author of http://adsl-optimizer.dk
  LinkedIn: http://www.linkedin.com/in/brouer


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH 4/5 v2] ifb: add multiqueue support
From: Eric Dumazet @ 2010-12-14 15:59 UTC (permalink / raw)
  To: Changli Gao; +Cc: Jamal Hadi Salim, David S. Miller, netdev
In-Reply-To: <1292340098-25537-1-git-send-email-xiaosuo@gmail.com>

Le mardi 14 décembre 2010 à 23:21 +0800, Changli Gao a écrit :


> +static struct rtnl_link_stats64 *ifb_get_stats64(struct net_device *dev,
> +		struct rtnl_link_stats64 *stats)
> +{
> +	struct ifb_q_private *q;
> +	struct netdev_queue *txq;
> +	int cpu;
> +	u64 rx_packets, rx_bytes, rx_dropped;
> +	u64 tx_packets, tx_bytes, tx_dropped;
> +	unsigned int start;
> +
> +	for_each_possible_cpu(cpu) {
> +		q = per_cpu_ptr(ifb_priv(dev), cpu);
> +		txq = q->txq;
> +		do {
> +			start = u64_stats_fetch_begin_bh(&q->syncp);
> +			rx_packets = q->rx_packets;
> +			rx_bytes = q->rx_bytes;
> +			rx_dropped = q->rx_dropped;
> +			tx_packets = txq->tx_packets;
> +			tx_bytes = txq->tx_bytes;
> +			tx_dropped = txq->tx_dropped;
> +		} while (u64_stats_fetch_retry_bh(&q->syncp, start));
> +		stats->rx_packets += rx_packets;
> +		stats->rx_bytes += rx_bytes;
> +		stats->rx_dropped += rx_dropped;
> +		stats->tx_packets += tx_packets;
> +		stats->tx_bytes += tx_bytes;
> +		stats->tx_dropped += tx_dropped;
> +	}
> +
> +	return stats;
> +}
> +


There is a problem here.

You should sum in the loop rx_counters only, (the counters syncp
protected), and use dev_txq_stats_fold() to get the tx_counters from
core network.

dev_txq_stats_fold(dev, stats);
for_each_possible_cpu(cpu) {
	q = per_cpu_ptr(ifb_priv(dev), cpu);
	txq = q->txq;
	do {
		start = u64_stats_fetch_begin_bh(&q->syncp);
		rx_packets = q->rx_packets;
		rx_bytes   = q->rx_bytes;
		rx_dropped = q->rx_dropped;
	} while (u64_stats_fetch_retry_bh(&q->syncp, start));
	stats->rx_packets += rx_packets;
	stats->rx_bytes   += rx_bytes;
	stats->rx_dropped += rx_dropped;
}
return stats;



^ permalink raw reply

* Re: [RFC][net-next-2.6 PATCH 0/2] rtnetlink: New IFLA_PORT_PROTO_* attr
From: Arnd Bergmann @ 2010-12-14 15:47 UTC (permalink / raw)
  To: Christian Benvenuti; +Cc: davem, netdev
In-Reply-To: <20101208042925.16856.89232.stgit@savbu-pc100.cisco.com>

On Wednesday 08 December 2010, Christian Benvenuti wrote:
> 
> In order to be able to scope a port profile, as part of the 802.1Qbh
> implementation we would like to add a new attribute: IFLA_PORT_CLUSTER_UUID.
> This parameter (perhaps known under a different name) is already in use
> (or going to be added) by most Virtual Machine Managers to define migration
> domains. In the case of 802.1Qbh a port profile would most likely be scoped
> using the same ID used by VM manager to represent the migration domain.
> 
> Adding another attribute (IFLA_PORT_CLUSTER_UUID in this case) to the list of
> IFLA_PORT_* attributes is an option.

Sounds reasonable.

> However, we thought that it would be better to 1st re-arrange the current
> Netlink attribute scheme in order to better group the IFLA_PORT_* attributes
> (for example by protocol).

We don't normally rearrange protocols once they are in an upstream
release. Having to maintain compatibility to two different versions
of the API is a huge burden for maintainance, so IMHO the only
reason why we should deprecate the current API and introduce
a new one is if it is absolutely impossible to implement necessary
features without breaking compatibility. Your explanations are
very detailed and well explained, but I have not found anything
in there that describes why it cannot be done without changing the
existing interface.

> --------------------------------------------------
> 2) REASON FOR THIS CHANGE
> --------------------------------------------------
> We would like to add one more attribute (IFLA_PORT_CLUSTER_UUID), and the list 
> of IFLA_PORT_* attributes may need to grow again due to the changes that may
> be required by the two still evolving standard protocols 802.1Qbh/802.1Qbg.
> Because of that, if you see a value in the re-organization of the
> IFLA_PORT_* attributes that we are proposing, it would be better to address
> such changes sooner than later, in order to reduce the impact of backward
> compatibility issues later.

The changes you propose now seem reasonable and we could probably have
done it that way initially, but as far as I'm concerned they are too late.
The burden imposed by the change is larger than the risk of breaking
backwards compatibility later by not fixing it now, as far as I'm concerned.

To give another example, the split between IFLA_VFINFO_LIST and
IFLA_VF_PORTS is totally arbitrary, we should have merged them at the
time, but because of timing concerns of the two going in during the
merge window, we are stuck with two separate lists of VFs now, and
I don't think we should change them any more.

> --------------------------------------------------
> 4) IFLA_* versus IFLA_PORT_*
> --------------------------------------------------
> 
> Here is an alternative way to introduce the new Netlink attribute scheme.
> We personally like better the previous scheme, but I'll include this one too
> should someone find it interesting.

The impact of doing this would be even bigger.
 
> OPTION_3: According to the new Netlink attribute scheme that we are
>           proposing, each protocol has its own set of attributes and
>           therefore it would not be considered superfluous to have the
>           same (or a similar) attribute defined for both protocols.
>           (in this case it would be manager_ID for 802.1qbg and
>           cluster_uuid for 802.1qbh).
> 
> To us OPTION_3 looks like the option that offers most flexibility.

I don't see this depending on the change to split attributes per
protocol. Just introducing a new IFLA_PORT_CLUSTER_UUID should be
all you need. Since a cluster UUID is not exactly the same concept
as a vsi manager id, there is no need to share the same netlink
attribute.

> --------------------------------------------------
> 8) OPT1: MORE CONFIGURATION FLEXIBILITY
> --------------------------------------------------
> The change described in this section is orthogonal to the ones Discussed above.
> We believe it would add value to the new scheme.
> We would like to include it as part of the new Netlink scheme (but the current
> patch does not include it).
> 
> In order to allow device drivers (or a generic consumer of the Netlink messages)
> to provide extra features or simple optimizations I would suggest the
> introduction of a new nested attribute that I will call IFLA_PORT_DATA for now.
> 
> This attribute would allow the use of extra attributes that are not part of
> the official protocol specs (802.1Qbg/bh for now) or simply allow device
> drivers to start supporting pre-standard parameters that would not be included
> in the Netlink scheme before they reach some stability.

I really don't think that you should add per-driver attributes. If we
believe that we need an extension for a specific feature in the netlink
interface, it should be defined in a way that is generic enough to work
for other hardware implementing the same feature.

> Here are a couple of examples of use.
> Let's suppose that driver ABC needed to receive a couple of parameters
> more (that are not part of the official 802.1Qbh/bg protocols).
> In this case driver ABC can use the new attribute IFLA_PORT_DATA to
> receive its two additional parameters without any need to touch/modify
> the IFLA_PORT_* list of attributes.

We can in theory add features that are not part of the official
standard. IMHO it is more important that the features are of
general interest and are being actively used. They should of
course not conflict with other features or the standard.

> If in the future driver ABC needed to change any of its private
> parameters (those it receives through the IFLA_PORT_DATA attribute), it
> can do it by updating its parsing routine (of course it would need
> to implement a basic versioning scheme for its private attributes), but
> no change would be required in the core Netlink code.

A data structure being private to a driver would not save you from
maintaining backwards compatibility, you still cannot just go and
change it as you like.

> If we do not want to add IFLA_PORT_DATA, an alternative solution would
> be that of using a separate control channel to provide that extra
> info, for example based on something like the NETLINK_GENERIC Netlink
> protocol.
> This alternative approach would offer the same flexibility, but I
> can see One drawback: this solution would require some extra code
> to synchronize the two control channels
> (generic NETLINK_ROUTE/IFLA_PORT_XXX and NETLINK_GENERIC/Driver).

Right, using generic netlink for this does not help, it has all the
problems of your IFLA_PORT_DATA suggestions and is more complex.

Just don't add driver-private interfaces, make them official!

If we give driver writers a way to add their own interfaces, there
is a very realistic risk of these interface being defined in a
broken way, with people relying on them before the code gets
submitted for mainline inclusion.

	Arnd

^ permalink raw reply

* [PATCH net-2.6] net: fix nulls list corruptions in sk_prot_alloc
From: Octavian Purdila @ 2010-12-14 15:44 UTC (permalink / raw)
  To: netdev; +Cc: Octavian Purdila, Leonard Crestez

Special care is taken inside sk_port_alloc to avoid overwriting
skc_node/skc_nulls_node. We should also avoid overwriting
skc_bind_node/skc_portaddr_node.

The patch fixes the following crash:

 BUG: unable to handle kernel paging request at fffffffffffffff0
 IP: [<ffffffff812ec6dd>] udp4_lib_lookup2+0xad/0x370
 [<ffffffff812ecc22>] __udp4_lib_lookup+0x282/0x360
 [<ffffffff812ed63e>] __udp4_lib_rcv+0x31e/0x700
 [<ffffffff812bba45>] ? ip_local_deliver_finish+0x65/0x190
 [<ffffffff812bbbf8>] ? ip_local_deliver+0x88/0xa0
 [<ffffffff812eda35>] udp_rcv+0x15/0x20
 [<ffffffff812bba45>] ip_local_deliver_finish+0x65/0x190
 [<ffffffff812bbbf8>] ip_local_deliver+0x88/0xa0
 [<ffffffff812bb2cd>] ip_rcv_finish+0x32d/0x6f0
 [<ffffffff8128c14c>] ? netif_receive_skb+0x99c/0x11c0
 [<ffffffff812bb94b>] ip_rcv+0x2bb/0x350
 [<ffffffff8128c14c>] netif_receive_skb+0x99c/0x11c0

Signed-off-by: Leonard Crestez <lcrestez@ixiacom.com>
Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
---
 include/net/sock.h  |    4 ++++
 net/core/sock.c     |   49 +++++++++++++++++++++++++++++++++++++------------
 net/dccp/ipv4.c     |    1 +
 net/dccp/ipv6.c     |    1 +
 net/ipv4/tcp_ipv4.c |    1 +
 net/ipv4/udp.c      |    1 +
 net/ipv4/udplite.c  |    1 +
 net/ipv6/tcp_ipv6.c |    1 +
 net/ipv6/udp.c      |    1 +
 net/ipv6/udplite.c  |    1 +
 net/llc/af_llc.c    |    1 +
 11 files changed, 50 insertions(+), 12 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 659d968..23747a8 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -754,6 +754,7 @@ struct proto {
 	void			(*unhash)(struct sock *sk);
 	void			(*rehash)(struct sock *sk);
 	int			(*get_port)(struct sock *sk, unsigned short snum);
+	void			(*clear_sk)(struct sock *sk, int size);
 
 	/* Keeping track of sockets in use */
 #ifdef CONFIG_PROC_FS
@@ -852,6 +853,9 @@ static inline void __sk_prot_rehash(struct sock *sk)
 	sk->sk_prot->hash(sk);
 }
 
+void sk_prot_clear_nulls(struct sock *sk, int size);
+void sk_prot_clear_portaddr_nulls(struct sock *sk, int size);
+
 /* About 10 seconds */
 #define SOCK_DESTROY_TIME (10*HZ)
 
diff --git a/net/core/sock.c b/net/core/sock.c
index fb60801..35e40e0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1009,6 +1009,36 @@ static void sock_copy(struct sock *nsk, const struct sock *osk)
 #endif
 }
 
+/*
+ * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
+ * un-modified. Special care is taken when initializing object to zero.
+ */
+void sk_prot_clear_nulls(struct sock *sk, int size)
+{
+	if (offsetof(struct sock, sk_node.next) != 0)
+		memset(sk, 0, offsetof(struct sock, sk_node.next));
+	memset(&sk->sk_node.pprev, 0,
+	       size - offsetof(struct sock, sk_node.pprev));
+}
+
+void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
+{
+	unsigned long nulls1, nulls2;
+
+	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
+	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
+	if (nulls1 > nulls2)
+		swap(nulls1, nulls2);
+
+	if (nulls1 != 0)
+		memset((char *)sk, 0, nulls1);
+	memset((char *)sk + nulls1 + sizeof(void *), 0,
+	       nulls2 - nulls1 - sizeof(void *));
+	memset((char *)sk + nulls2 + sizeof(void *), 0,
+	       size - nulls2 - sizeof(void *));
+}
+
+
 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 		int family)
 {
@@ -1021,19 +1051,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
 		if (!sk)
 			return sk;
 		if (priority & __GFP_ZERO) {
-			/*
-			 * caches using SLAB_DESTROY_BY_RCU should let
-			 * sk_node.next un-modified. Special care is taken
-			 * when initializing object to zero.
-			 */
-			if (offsetof(struct sock, sk_node.next) != 0)
-				memset(sk, 0, offsetof(struct sock, sk_node.next));
-			memset(&sk->sk_node.pprev, 0,
-			       prot->obj_size - offsetof(struct sock,
-							 sk_node.pprev));
+			if (prot->clear_sk)
+				prot->clear_sk(sk, prot->obj_size);
+			else
+				memset(sk, 0, prot->obj_size);
 		}
-	}
-	else
+	} else
 		sk = kmalloc(prot->obj_size, priority);
 
 	if (sk != NULL) {
@@ -2331,6 +2354,8 @@ static inline void release_proto_idx(struct proto *prot)
 
 int proto_register(struct proto *prot, int alloc_slab)
 {
+	BUG_ON((prot->slab_flags & SLAB_DESTROY_BY_RCU) && !prot->clear_sk);
+
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 3f69ea1..6ad0efb 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -956,6 +956,7 @@ static struct proto dccp_v4_prot = {
 	.compat_setsockopt	= compat_dccp_setsockopt,
 	.compat_getsockopt	= compat_dccp_getsockopt,
 #endif
+	.clear_sk		= sk_prot_clear_nulls,
 };
 
 static const struct net_protocol dccp_v4_protocol = {
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index dca711d..738b893 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1134,6 +1134,7 @@ static struct proto dccp_v6_prot = {
 	.compat_setsockopt = compat_dccp_setsockopt,
 	.compat_getsockopt = compat_dccp_getsockopt,
 #endif
+	.clear_sk		= sk_prot_clear_nulls,
 };
 
 static const struct inet6_protocol dccp_v6_protocol = {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e13da6d..69964c5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2627,6 +2627,7 @@ struct proto tcp_prot = {
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,
 #endif
+	.clear_sk		= sk_prot_clear_nulls,
 };
 EXPORT_SYMBOL(tcp_prot);
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5e0a3a5..2d3ded4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1899,6 +1899,7 @@ struct proto udp_prot = {
 	.compat_setsockopt = compat_udp_setsockopt,
 	.compat_getsockopt = compat_udp_getsockopt,
 #endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
 };
 EXPORT_SYMBOL(udp_prot);
 
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index ab76aa9..aee9963 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -57,6 +57,7 @@ struct proto 	udplite_prot = {
 	.compat_setsockopt = compat_udp_setsockopt,
 	.compat_getsockopt = compat_udp_getsockopt,
 #endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
 };
 EXPORT_SYMBOL(udplite_prot);
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7e41e2c..ea42a2d 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2171,6 +2171,7 @@ struct proto tcpv6_prot = {
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,
 #endif
+	.clear_sk		= sk_prot_clear_nulls,
 };
 
 static const struct inet6_protocol tcpv6_protocol = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 91def93..cd6cb7c 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1477,6 +1477,7 @@ struct proto udpv6_prot = {
 	.compat_setsockopt = compat_udpv6_setsockopt,
 	.compat_getsockopt = compat_udpv6_getsockopt,
 #endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
 };
 
 static struct inet_protosw udpv6_protosw = {
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 5f48fad..986c4de 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -55,6 +55,7 @@ struct proto udplitev6_prot = {
 	.compat_setsockopt = compat_udpv6_setsockopt,
 	.compat_getsockopt = compat_udpv6_getsockopt,
 #endif
+	.clear_sk	   = sk_prot_clear_portaddr_nulls,
 };
 
 static struct inet_protosw udplite6_protosw = {
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index e35dbe5..1d54c6a 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -142,6 +142,7 @@ static struct proto llc_proto = {
 	.owner	  = THIS_MODULE,
 	.obj_size = sizeof(struct llc_sock),
 	.slab_flags = SLAB_DESTROY_BY_RCU,
+	.clear_sk = sk_prot_clear_nulls,
 };
 
 /**
-- 
1.7.1


^ permalink raw reply related

* Re: [PATCH net-2.6] be2net: use mutex instead of spin lock for mbox_lock
From: Ivan Vecera @ 2010-12-14 15:43 UTC (permalink / raw)
  To: netdev; +Cc: sathyap, subbus, sarveshwarb, ajitk, davem, bhutchings
In-Reply-To: <1292336483.20458.1.camel@bwh-desktop>

On Tue, 2010-12-14 at 14:21 +0000, Ben Hutchings wrote:
> On Tue, 2010-12-14 at 14:46 +0100, Ivan Vecera wrote:
> > Since the mbox polling uses the schedule_timeout, the mbox_lock should be
> > a semaphore and not a spin lock.
> > The commit f25b03a replaced udelay() with schedule_timeout() but didn't
> > change the mbox_lock to a semaphore or a mutex.
> [...]
> 
> I see no reason for this to be a semaphore; use a mutex instead.
> 
> Ben
Ok, Ben... the new version

Since the mbox polling uses the schedule_timeout, the mbox_lock should be
a mutex and not a spin lock.
The commit f25b03a replaced udelay() with schedule_timeout() but didn't
change mbox_lock to semaphore or mutex.

Signed-off-by: Ivan Vecera <ivecera@redhat.com>
---
 drivers/net/benet/be.h      |    2 +-
 drivers/net/benet/be_cmds.c |   75 +++++++++++++++++++++++++-----------------
 drivers/net/benet/be_main.c |    2 +-
 3 files changed, 47 insertions(+), 32 deletions(-)

diff --git a/drivers/net/benet/be.h b/drivers/net/benet/be.h
index 4594a28..d64313b 100644
--- a/drivers/net/benet/be.h
+++ b/drivers/net/benet/be.h
@@ -234,7 +234,7 @@ struct be_adapter {
 	u8 __iomem *db;		/* Door Bell */
 	u8 __iomem *pcicfg;	/* PCI config space */
 
-	spinlock_t mbox_lock;	/* For serializing mbox cmds to BE card */
+	struct mutex mbox_lock; /* For serializing mbox cmds to BE card */
 	struct be_dma_mem mbox_mem;
 	/* Mbox mem is adjusted to align to 16 bytes. The allocated addr
 	 * is stored for freeing purpose */
diff --git a/drivers/net/benet/be_cmds.c b/drivers/net/benet/be_cmds.c
index e4465d2..1c8c79c 100644
--- a/drivers/net/benet/be_cmds.c
+++ b/drivers/net/benet/be_cmds.c
@@ -462,7 +462,8 @@ int be_cmd_fw_init(struct be_adapter *adapter)
 	u8 *wrb;
 	int status;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = (u8 *)wrb_from_mbox(adapter);
 	*wrb++ = 0xFF;
@@ -476,7 +477,7 @@ int be_cmd_fw_init(struct be_adapter *adapter)
 
 	status = be_mbox_notify_wait(adapter);
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 	return status;
 }
 
@@ -491,7 +492,8 @@ int be_cmd_fw_clean(struct be_adapter *adapter)
 	if (adapter->eeh_err)
 		return -EIO;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = (u8 *)wrb_from_mbox(adapter);
 	*wrb++ = 0xFF;
@@ -505,7 +507,7 @@ int be_cmd_fw_clean(struct be_adapter *adapter)
 
 	status = be_mbox_notify_wait(adapter);
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 	return status;
 }
 int be_cmd_eq_create(struct be_adapter *adapter,
@@ -516,7 +518,8 @@ int be_cmd_eq_create(struct be_adapter *adapter,
 	struct be_dma_mem *q_mem = &eq->dma_mem;
 	int status;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -546,7 +549,7 @@ int be_cmd_eq_create(struct be_adapter *adapter,
 		eq->created = true;
 	}
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 	return status;
 }
 
@@ -558,7 +561,8 @@ int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr,
 	struct be_cmd_req_mac_query *req;
 	int status;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -583,7 +587,7 @@ int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr,
 		memcpy(mac_addr, resp->mac.addr, ETH_ALEN);
 	}
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 	return status;
 }
 
@@ -667,7 +671,8 @@ int be_cmd_cq_create(struct be_adapter *adapter,
 	void *ctxt;
 	int status;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -701,7 +706,7 @@ int be_cmd_cq_create(struct be_adapter *adapter,
 		cq->created = true;
 	}
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 
 	return status;
 }
@@ -724,7 +729,8 @@ int be_cmd_mccq_create(struct be_adapter *adapter,
 	void *ctxt;
 	int status;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -754,7 +760,7 @@ int be_cmd_mccq_create(struct be_adapter *adapter,
 		mccq->id = le16_to_cpu(resp->id);
 		mccq->created = true;
 	}
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 
 	return status;
 }
@@ -769,7 +775,8 @@ int be_cmd_txq_create(struct be_adapter *adapter,
 	void *ctxt;
 	int status;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -801,7 +808,7 @@ int be_cmd_txq_create(struct be_adapter *adapter,
 		txq->created = true;
 	}
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 
 	return status;
 }
@@ -816,7 +823,8 @@ int be_cmd_rxq_create(struct be_adapter *adapter,
 	struct be_dma_mem *q_mem = &rxq->dma_mem;
 	int status;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -843,7 +851,7 @@ int be_cmd_rxq_create(struct be_adapter *adapter,
 		*rss_id = resp->rss_id;
 	}
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 
 	return status;
 }
@@ -862,7 +870,8 @@ int be_cmd_q_destroy(struct be_adapter *adapter, struct be_queue_info *q,
 	if (adapter->eeh_err)
 		return -EIO;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -899,7 +908,7 @@ int be_cmd_q_destroy(struct be_adapter *adapter, struct be_queue_info *q,
 
 	status = be_mbox_notify_wait(adapter);
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 
 	return status;
 }
@@ -915,7 +924,8 @@ int be_cmd_if_create(struct be_adapter *adapter, u32 cap_flags, u32 en_flags,
 	struct be_cmd_req_if_create *req;
 	int status;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -941,7 +951,7 @@ int be_cmd_if_create(struct be_adapter *adapter, u32 cap_flags, u32 en_flags,
 			*pmac_id = le32_to_cpu(resp->pmac_id);
 	}
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 	return status;
 }
 
@@ -955,7 +965,8 @@ int be_cmd_if_destroy(struct be_adapter *adapter, u32 interface_id)
 	if (adapter->eeh_err)
 		return -EIO;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -970,7 +981,7 @@ int be_cmd_if_destroy(struct be_adapter *adapter, u32 interface_id)
 
 	status = be_mbox_notify_wait(adapter);
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 
 	return status;
 }
@@ -1060,7 +1071,8 @@ int be_cmd_get_fw_ver(struct be_adapter *adapter, char *fw_ver)
 	struct be_cmd_req_get_fw_version *req;
 	int status;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -1077,7 +1089,7 @@ int be_cmd_get_fw_ver(struct be_adapter *adapter, char *fw_ver)
 		strncpy(fw_ver, resp->firmware_version_string, FW_VER_LEN);
 	}
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 	return status;
 }
 
@@ -1322,7 +1334,8 @@ int be_cmd_query_fw_cfg(struct be_adapter *adapter, u32 *port_num,
 	struct be_cmd_req_query_fw_cfg *req;
 	int status;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -1341,7 +1354,7 @@ int be_cmd_query_fw_cfg(struct be_adapter *adapter, u32 *port_num,
 		*caps = le32_to_cpu(resp->function_caps);
 	}
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 	return status;
 }
 
@@ -1352,7 +1365,8 @@ int be_cmd_reset_function(struct be_adapter *adapter)
 	struct be_cmd_req_hdr *req;
 	int status;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -1365,7 +1379,7 @@ int be_cmd_reset_function(struct be_adapter *adapter)
 
 	status = be_mbox_notify_wait(adapter);
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 	return status;
 }
 
@@ -1376,7 +1390,8 @@ int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable, u16 table_size)
 	u32 myhash[10];
 	int status;
 
-	spin_lock(&adapter->mbox_lock);
+	if (mutex_lock_interruptible(&adapter->mbox_lock))
+		return -1;
 
 	wrb = wrb_from_mbox(adapter);
 	req = embedded_payload(wrb);
@@ -1396,7 +1411,7 @@ int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable, u16 table_size)
 
 	status = be_mbox_notify_wait(adapter);
 
-	spin_unlock(&adapter->mbox_lock);
+	mutex_unlock(&adapter->mbox_lock);
 	return status;
 }
 
diff --git a/drivers/net/benet/be_main.c b/drivers/net/benet/be_main.c
index 93354ee..fd251b5 100644
--- a/drivers/net/benet/be_main.c
+++ b/drivers/net/benet/be_main.c
@@ -2677,7 +2677,7 @@ static int be_ctrl_init(struct be_adapter *adapter)
 	}
 	memset(mc_cmd_mem->va, 0, mc_cmd_mem->size);
 
-	spin_lock_init(&adapter->mbox_lock);
+	mutex_init(&adapter->mbox_lock);
 	spin_lock_init(&adapter->mcc_lock);
 	spin_lock_init(&adapter->mcc_cq_lock);
 
-- 
1.7.2.2


^ permalink raw reply related

* Re: Possible regression: Packet drops during iptables calls
From: Eric Dumazet @ 2010-12-14 15:31 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Stephen Hemminger, netfilter-devel, netdev
In-Reply-To: <1292337974.9155.68.camel@firesoul.comx.local>

Le mardi 14 décembre 2010 à 15:46 +0100, Jesper Dangaard Brouer a
écrit :
> I'm experiencing RX packet drops during call to iptables, on my
> production servers.
> 
> Further investigations showed, that its only the CPU executing the
> iptables command that experience packet drops!?  Thus, a quick fix was
> to force the iptables command to run on one of the idle CPUs (This can
> be achieved with the "taskset" command).
> 
> I have a 2x Xeon 5550 CPU system, thus 16 CPUs (with HT enabled).  We
> only use 8 CPUs due to a multiqueue limitation of 8 queues in the
> 1Gbit/s NICs (82576 chips).  CPUs 0 to 7 is assigned for packet
> processing via smp_affinity.
> 
> Can someone explain why the packet drops only occur on the CPU
> executing the iptables command?
> 
> 

It blocks BH

take a look at commits :

24b36f0193467fa727b85b4c004016a8dae999b9
netfilter: {ip,ip6,arp}_tables: dont block bottom half more than
necessary 

001389b9581c13fe5fc357a0f89234f85af4215d
netfilter: {ip,ip6,arp}_tables: avoid lockdep false positive

for attempts to let BH fly ...

Unfortunately, lockdep rules :(


> What can we do to solve this issue?
> 
> 
> I should note that I have a very large ruleset on this machine, and
> the production machine is routing around 800 Mbit/s, in each
> direction.  The issue occurs on a simple iptables rule listing.
> 
> 
> I think (untested) the problem is related to kernel git commit:
> 
>  commit 942e4a2bd680c606af0211e64eb216be2e19bf61
>  Author: Stephen Hemminger <shemminger@vyatta.com>
>  Date: Tue Apr 28 22:36:33 2009 -0700
> 
>  netfilter: revised locking for x_tables
> 
>  The x_tables are organized with a table structure and a per-cpu copies
>  of the counters and rules. On older kernels there was a reader/writer
>  lock per table which was a performance bottleneck. In 2.6.30-rc, this
>  was converted to use RCU and the counters/rules which solved the performance
>  problems for do_table but made replacing rules much slower because of
>  the necessary RCU grace period.
> 
>  This version uses a per-cpu set of spinlocks and counters to allow to
>  table processing to proceed without the cache thrashing of a global
>  reader lock and keeps the same performance for table updates.
> 
>  Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>  Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
>  Signed-off-by: David S. Miller <davem@davemloft.net>
> 


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH 2/2] workqueue: deprecate cancel_rearming_delayed_work[queue]()
From: Tejun Heo @ 2010-12-14 15:23 UTC (permalink / raw)
  To: linux-kernel
  Cc: jgarzik, benh, mchehab, davem, netdev, cbou, dwmw2, zbr, gregkh,
	bfields, neilb, aelder, xfs-masters, cl, penberg, akpm,
	netfilter-devel, Trond.Myklebust, linux-nfs
In-Reply-To: <4D078B6D.5060202@kernel.org>

There's no in-kernel user left for these two obsolete functions.  Mark
them deprecated and schedule for removal during 2.6.39 cycle.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/feature-removal-schedule.txt |   10 ++++++++++
 include/linux/workqueue.h                  |    4 ++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 6c2f55e..4ff47de 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -564,3 +564,13 @@ Why:	This field is deprecated. I2C device drivers shouldn't change their
 Who:	Jean Delvare <khali@linux-fr.org>

 ----------------------------
+
+What:	cancel_rearming_delayed_work[queue]()
+When:	2.6.39
+
+Why:	The functions have been superceded by cancel_delayed_work_sync()
+	quite some time ago.  The conversion is trivial and there is no
+	in-kernel user left.
+Who:	Tejun Heo <tj@kernel.org>
+
+----------------------------
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 0c0771f..6b5193d 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -401,7 +401,7 @@ static inline bool __cancel_delayed_work(struct delayed_work *work)
 }

 /* Obsolete. use cancel_delayed_work_sync() */
-static inline
+static inline __deprecated
 void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
 					struct delayed_work *work)
 {
@@ -409,7 +409,7 @@ void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq,
 }

 /* Obsolete. use cancel_delayed_work_sync() */
-static inline
+static inline __deprecated
 void cancel_rearming_delayed_work(struct delayed_work *work)
 {
 	cancel_delayed_work_sync(work);

^ permalink raw reply related

* [PATCH 4/5 v2] ifb: add multiqueue support
From: Changli Gao @ 2010-12-14 15:21 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: David S. Miller, netdev, Changli Gao, Eric Dumazet

Each ifb NIC has nr_cpu_ids rx queues and nr_cpu_ids queues. Packets
transmitted to ifb are enqueued to the corresponding per cpu tx queues,
and processed in the corresponding per cpu tasklet latter.

The stats are converted to the u64 ones.

tq is a stack variable now. It makes ifb_q_private smaller and tx queue
locked only once in ri_tasklet.

The tx_queue_len is multiplied by the number of online CPUs.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
v2: addressed the comments from Eirc. Thank him.
 drivers/net/ifb.c |  211 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 138 insertions(+), 73 deletions(-)
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 57c5cfb..918a38e 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -37,56 +37,60 @@
 #include <net/net_namespace.h>
 
 #define TX_Q_LIMIT    32
-struct ifb_private {
-	struct tasklet_struct   ifb_tasklet;
-	int     tasklet_pending;
-	struct sk_buff_head     rq;
-	struct sk_buff_head     tq;
+struct ifb_q_private {
+	struct net_device	*dev;
+	struct netdev_queue	*txq;
+	struct tasklet_struct	ifb_tasklet;
+	struct sk_buff_head	rq;
+	struct u64_stats_sync	syncp;
+	u64			rx_packets;
+	u64			rx_bytes;
+	u64			rx_dropped;
 };
 
+#define ifb_priv(dev) ((struct ifb_q_private __percpu *)(dev)->ml_priv)
+
 static int numifbs = 2;
 
-static void ri_tasklet(unsigned long dev)
+static void ri_tasklet(unsigned long arg)
 {
-
-	struct net_device *_dev = (struct net_device *)dev;
-	struct ifb_private *dp = netdev_priv(_dev);
-	struct net_device_stats *stats = &_dev->stats;
-	struct netdev_queue *txq;
+	struct ifb_q_private *qp = (struct ifb_q_private *)arg;
+	struct net_device *dev = qp->dev;
+	struct netdev_queue *txq = qp->txq;
 	struct sk_buff *skb;
+	struct sk_buff_head tq;
 
-	txq = netdev_get_tx_queue(_dev, 0);
-	skb = skb_peek(&dp->tq);
-	if (skb == NULL) {
-		if (__netif_tx_trylock(txq)) {
-			skb_queue_splice_tail_init(&dp->rq, &dp->tq);
-			__netif_tx_unlock(txq);
-		} else {
-			/* reschedule */
-			goto resched;
-		}
+	__skb_queue_head_init(&tq);
+	if (!__netif_tx_trylock(txq)) {
+		tasklet_schedule(&qp->ifb_tasklet);
+		return;
 	}
+	skb_queue_splice_tail_init(&qp->rq, &tq);
+	if (netif_tx_queue_stopped(txq))
+		netif_tx_wake_queue(txq);
+	__netif_tx_unlock(txq);
 
-	while ((skb = skb_dequeue(&dp->tq)) != NULL) {
+	while ((skb = __skb_dequeue(&tq)) != NULL) {
 		u32 from = G_TC_FROM(skb->tc_verd);
 
 		skb->tc_verd = 0;
 		skb->tc_verd = SET_TC_NCLS(skb->tc_verd);
-		stats->tx_packets++;
-		stats->tx_bytes += skb->len;
+		u64_stats_update_begin(&qp->syncp);
+		txq->tx_packets++;
+		txq->tx_bytes += skb->len;
 
 		rcu_read_lock();
 		skb->dev = dev_get_by_index_rcu(&init_net, skb->skb_iif);
 		if (!skb->dev) {
 			rcu_read_unlock();
+			txq->tx_dropped++;
+			u64_stats_update_end(&qp->syncp);
 			dev_kfree_skb(skb);
-			stats->tx_dropped++;
-			if (skb_queue_len(&dp->tq) != 0)
-				goto resched;
-			break;
+			continue;
 		}
 		rcu_read_unlock();
-		skb->skb_iif = _dev->ifindex;
+		u64_stats_update_end(&qp->syncp);
+		skb->skb_iif = dev->ifindex;
 
 		if (from & AT_EGRESS) {
 			dev_queue_xmit(skb);
@@ -96,82 +100,135 @@ static void ri_tasklet(unsigned long dev)
 		} else
 			BUG();
 	}
-
-	if (__netif_tx_trylock(txq)) {
-		skb = skb_peek(&dp->rq);
-		if (skb == NULL) {
-			dp->tasklet_pending = 0;
-			if (netif_queue_stopped(_dev))
-				netif_wake_queue(_dev);
-		} else {
-			__netif_tx_unlock(txq);
-			goto resched;
-		}
-		__netif_tx_unlock(txq);
-	} else {
-resched:
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
-	}
-
 }
 
 static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-	struct ifb_private *dp = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
+	int qid = skb_get_queue_mapping(skb);
+	struct ifb_q_private *qp = per_cpu_ptr(ifb_priv(dev), qid);
 	u32 from = G_TC_FROM(skb->tc_verd);
 
-	stats->rx_packets++;
-	stats->rx_bytes += skb->len;
+	WARN_ON(qid != smp_processor_id());
+
+	u64_stats_update_begin(&qp->syncp);
+	qp->rx_packets++;
+	qp->rx_bytes += skb->len;
 
 	if (!(from & (AT_INGRESS|AT_EGRESS)) || !skb->skb_iif) {
+		qp->rx_dropped++;
+		u64_stats_update_end(&qp->syncp);
 		dev_kfree_skb(skb);
-		stats->rx_dropped++;
 		return NETDEV_TX_OK;
 	}
+	u64_stats_update_end(&qp->syncp);
 
-	__skb_queue_tail(&dp->rq, skb);
-	if (!dp->tasklet_pending) {
-		dp->tasklet_pending = 1;
-		tasklet_schedule(&dp->ifb_tasklet);
-	}
+	__skb_queue_tail(&qp->rq, skb);
+	if (skb_queue_len(&qp->rq) == 1)
+		tasklet_schedule(&qp->ifb_tasklet);
 
-	if (skb_queue_len(&dp->rq) >= dev->tx_queue_len)
-		netif_stop_queue(dev);
+	if (skb_queue_len(&qp->rq) >= dev->tx_queue_len)
+		netif_tx_stop_queue(qp->txq);
 
 	return NETDEV_TX_OK;
 }
 
 static int ifb_close(struct net_device *dev)
 {
-	struct ifb_private *dp = netdev_priv(dev);
-
-	tasklet_kill(&dp->ifb_tasklet);
-	netif_stop_queue(dev);
-	__skb_queue_purge(&dp->rq);
-	__skb_queue_purge(&dp->tq);
+	struct ifb_q_private *qp;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		qp = per_cpu_ptr(ifb_priv(dev), cpu);
+		tasklet_kill(&qp->ifb_tasklet);
+		netif_tx_stop_queue(qp->txq);
+		__skb_queue_purge(&qp->rq);
+	}
 
 	return 0;
 }
 
 static int ifb_open(struct net_device *dev)
 {
-	struct ifb_private *dp = netdev_priv(dev);
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		netif_tx_start_queue(netdev_get_tx_queue(dev, cpu));
 
-	tasklet_init(&dp->ifb_tasklet, ri_tasklet, (unsigned long)dev);
-	__skb_queue_head_init(&dp->rq);
-	__skb_queue_head_init(&dp->tq);
-	netif_start_queue(dev);
+	return 0;
+}
+
+static int ifb_init(struct net_device *dev)
+{
+	struct ifb_q_private *q;
+	int cpu;
+
+	dev->ml_priv = alloc_percpu(struct ifb_q_private);
+	if (!ifb_priv(dev))
+		return -ENOMEM;
+	for_each_possible_cpu(cpu) {
+		q = per_cpu_ptr(ifb_priv(dev), cpu);
+		__skb_queue_head_init(&q->rq);
+		q->txq = netdev_get_tx_queue(dev, cpu);
+		q->dev = dev;
+		tasklet_init(&q->ifb_tasklet, ri_tasklet, (unsigned long)q);
+		netdev_queue_numa_node_write(q->txq, cpu_to_node(cpu));
+	}
 
 	return 0;
 }
 
+static void ifb_uninit(struct net_device *dev)
+{
+	free_percpu(ifb_priv(dev));
+}
+
+static u16 ifb_select_queue(struct net_device *dev, struct sk_buff *skb)
+{
+	return smp_processor_id();
+}
+
+static struct rtnl_link_stats64 *ifb_get_stats64(struct net_device *dev,
+		struct rtnl_link_stats64 *stats)
+{
+	struct ifb_q_private *q;
+	struct netdev_queue *txq;
+	int cpu;
+	u64 rx_packets, rx_bytes, rx_dropped;
+	u64 tx_packets, tx_bytes, tx_dropped;
+	unsigned int start;
+
+	for_each_possible_cpu(cpu) {
+		q = per_cpu_ptr(ifb_priv(dev), cpu);
+		txq = q->txq;
+		do {
+			start = u64_stats_fetch_begin_bh(&q->syncp);
+			rx_packets = q->rx_packets;
+			rx_bytes = q->rx_bytes;
+			rx_dropped = q->rx_dropped;
+			tx_packets = txq->tx_packets;
+			tx_bytes = txq->tx_bytes;
+			tx_dropped = txq->tx_dropped;
+		} while (u64_stats_fetch_retry_bh(&q->syncp, start));
+		stats->rx_packets += rx_packets;
+		stats->rx_bytes += rx_bytes;
+		stats->rx_dropped += rx_dropped;
+		stats->tx_packets += tx_packets;
+		stats->tx_bytes += tx_bytes;
+		stats->tx_dropped += tx_dropped;
+	}
+
+	return stats;
+}
+
 static const struct net_device_ops ifb_netdev_ops = {
+	.ndo_init		= ifb_init,
+	.ndo_uninit		= ifb_uninit,
 	.ndo_open		= ifb_open,
 	.ndo_stop		= ifb_close,
 	.ndo_start_xmit		= ifb_xmit,
 	.ndo_validate_addr	= eth_validate_addr,
+	.ndo_select_queue	= ifb_select_queue,
+	.ndo_get_stats64	= ifb_get_stats64,
 };
 
 static void ifb_setup(struct net_device *dev)
@@ -202,11 +259,20 @@ static int ifb_validate(struct nlattr *tb[], struct nlattr *data[])
 	return 0;
 }
 
+static int ifb_get_tx_queues(struct net *net, struct nlattr *tb[],
+			     unsigned int *tx_queues,
+			     unsigned int *real_tx_queues)
+{
+	*real_tx_queues = *tx_queues = nr_cpu_ids;
+
+	return 0;
+}
+
 static struct rtnl_link_ops ifb_link_ops __read_mostly = {
 	.kind		= "ifb",
-	.priv_size	= sizeof(struct ifb_private),
 	.setup		= ifb_setup,
 	.validate	= ifb_validate,
+	.get_tx_queues	= ifb_get_tx_queues,
 };
 
 /* Number of ifb devices to be set up by this module. */
@@ -218,8 +284,7 @@ static int __init ifb_init_one(int index)
 	struct net_device *dev_ifb;
 	int err;
 
-	dev_ifb = alloc_netdev(sizeof(struct ifb_private), "ifb%d", ifb_setup);
-
+	dev_ifb = alloc_netdev_mq(0, "ifb%d", ifb_setup, nr_cpu_ids);
 	if (!dev_ifb)
 		return -ENOMEM;
 

^ permalink raw reply related

* [PATCH 1/2] workqueue: convert cancel_rearming_delayed_work[queue]() users to cancel_delayed_work_sync()
From: Tejun Heo @ 2010-12-14 15:21 UTC (permalink / raw)
  To: linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: jgarzik-e+AXbWqSrlAAvxtiuMwx3w,
	benh-XVmvHMARGAS8U2dJNN8I7kB+6BGkLq7r,
	mchehab-wEGCiKHe2LqWVfeAwA7xHQ, davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	netdev-u79uwXL29TY76Z2rM5mHXA, cbou-JGs/UdohzUI,
	dwmw2-wEGCiKHe2LqWVfeAwA7xHQ, zbr-i6C2adt8DTjR7s880joybQ,
	gregkh-l3A5Bk7waGM, bfields-uC3wQj2KruNg9hUCZPvPmw,
	neilb-l3A5Bk7waGM, aelder-sJ/iWh9BUns,
	xfs-masters-VZNHf3L845pBDgjK7y7TUQ,
	cl-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	penberg-bbCR+/B0CizivPeTLB3BmA,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	netfilter-devel-u79uwXL29TY76Z2rM5mHXA,
	Trond.Myklebust-HgOvQuBEEgTQT0dZR+AlfA,
	linux-nfs-u79uwXL29TY76Z2rM5mHXA

cancel_rearming_delayed_work[queue]() has been superceded by
cancel_delayed_work_sync() quite some time ago.  Convert all the
in-kernel users.  The conversions are completely equivalent and
trivial.

Signed-off-by: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
Cc: Jeff Garzik <jgarzik-e+AXbWqSrlAAvxtiuMwx3w@public.gmane.org>
Cc: Benjamin Herrenschmidt <benh-XVmvHMARGAS8U2dJNN8I7kB+6BGkLq7r@public.gmane.org>
Cc: Mauro Carvalho Chehab <mchehab-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
Cc: "David S. Miller" <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
Cc: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: Anton Vorontsov <cbou-JGs/UdohzUI@public.gmane.org>
Cc: David Woodhouse <dwmw2-wEGCiKHe2LqWVfeAwA7xHQ@public.gmane.org>
Cc: Evgeniy Polyakov <zbr-i6C2adt8DTjR7s880joybQ@public.gmane.org>
Cc: Greg Kroah-Hartman <gregkh-l3A5Bk7waGM@public.gmane.org>
Cc: "J. Bruce Fields" <bfields-uC3wQj2KruNg9hUCZPvPmw@public.gmane.org>
Cc: Neil Brown <neilb-l3A5Bk7waGM@public.gmane.org>
Cc: Alex Elder <aelder-sJ/iWh9BUns@public.gmane.org>
Cc: xfs-masters-VZNHf3L845pBDgjK7y7TUQ@public.gmane.org
Cc: Christoph Lameter <cl-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
Cc: Pekka Enberg <penberg-bbCR+/B0CizivPeTLB3BmA@public.gmane.org>
Cc: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
Cc: netfilter-devel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
Cc: Trond Myklebust <Trond.Myklebust-HgOvQuBEEgTQT0dZR+AlfA@public.gmane.org>
Cc: linux-nfs-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
---
These two patches convert all cancel_rearming users and deprecate the
two long obsolete functions.  Unless someone objects, I'll push these
two through the workqueue tree in the next merge window.  The next
patch will mark the two functions __deprecated and schedule for
removal during 2.6.39 cycle.

Thank you.

 drivers/ata/libata-core.c                  |    2 +-
 drivers/ata/libata-sff.c                   |    2 +-
 drivers/macintosh/rack-meter.c             |    4 ++--
 drivers/media/dvb/dvb-usb/dvb-usb-remote.c |    2 +-
 drivers/media/video/em28xx/em28xx-input.c  |    2 +-
 drivers/net/chelsio/my3126.c               |    2 +-
 drivers/net/ibm_newemac/core.c             |    4 ++--
 drivers/net/wireless/zd1211rw/zd_mac.c     |    3 +--
 drivers/power/ds2760_battery.c             |    6 ++----
 drivers/power/intel_mid_battery.c          |    6 ++----
 drivers/staging/pohmelfs/inode.c           |    4 ++--
 drivers/usb/atm/cxacru.c                   |    2 +-
 drivers/video/fb_defio.c                   |    2 +-
 drivers/video/omap/lcd_mipid.c             |    2 +-
 fs/nfsd/nfs4state.c                        |    2 +-
 fs/xfs/xfs_mru_cache.c                     |    2 +-
 mm/slab.c                                  |    2 +-
 mm/vmstat.c                                |    2 +-
 net/atm/lec.c                              |    2 +-
 net/core/netpoll.c                         |    2 +-
 net/netfilter/ipvs/ip_vs_ctl.c             |    2 +-
 net/sunrpc/xprtsock.c                      |    2 +-
 22 files changed, 27 insertions(+), 32 deletions(-)

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index 7f77c67..6669b44 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -6122,7 +6122,7 @@ static void ata_port_detach(struct ata_port *ap)
 	/* it better be dead now */
 	WARN_ON(!(ap->pflags & ATA_PFLAG_UNLOADED));

-	cancel_rearming_delayed_work(&ap->hotplug_task);
+	cancel_delayed_work_sync(&ap->hotplug_task);

  skip_eh:
 	if (ap->pmp_link) {
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index d05387d..8660a70 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -1320,7 +1320,7 @@ void ata_sff_flush_pio_task(struct ata_port *ap)
 {
 	DPRINTK("ENTER\n");

-	cancel_rearming_delayed_work(&ap->sff_pio_task);
+	cancel_delayed_work_sync(&ap->sff_pio_task);
 	ap->hsm_task_state = HSM_ST_IDLE;

 	if (ata_msg_ctl(ap))
diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c
index 53cce3a..39f660b 100644
--- a/drivers/macintosh/rack-meter.c
+++ b/drivers/macintosh/rack-meter.c
@@ -285,8 +285,8 @@ static void __devinit rackmeter_init_cpu_sniffer(struct rackmeter *rm)

 static void __devexit rackmeter_stop_cpu_sniffer(struct rackmeter *rm)
 {
-	cancel_rearming_delayed_work(&rm->cpu[0].sniffer);
-	cancel_rearming_delayed_work(&rm->cpu[1].sniffer);
+	cancel_delayed_work_sync(&rm->cpu[0].sniffer);
+	cancel_delayed_work_sync(&rm->cpu[1].sniffer);
 }

 static int __devinit rackmeter_setup(struct rackmeter *rm)
diff --git a/drivers/media/dvb/dvb-usb/dvb-usb-remote.c b/drivers/media/dvb/dvb-usb/dvb-usb-remote.c
index b579fed..0831469 100644
--- a/drivers/media/dvb/dvb-usb/dvb-usb-remote.c
+++ b/drivers/media/dvb/dvb-usb/dvb-usb-remote.c
@@ -298,7 +298,7 @@ int dvb_usb_remote_init(struct dvb_usb_device *d)
 int dvb_usb_remote_exit(struct dvb_usb_device *d)
 {
 	if (d->state & DVB_USB_STATE_REMOTE) {
-		cancel_rearming_delayed_work(&d->rc_query_work);
+		cancel_delayed_work_sync(&d->rc_query_work);
 		flush_scheduled_work();
 		if (d->props.rc.mode == DVB_RC_LEGACY)
 			input_unregister_device(d->rc_input_dev);
diff --git a/drivers/media/video/em28xx/em28xx-input.c b/drivers/media/video/em28xx/em28xx-input.c
index 6759cd5..99403c7 100644
--- a/drivers/media/video/em28xx/em28xx-input.c
+++ b/drivers/media/video/em28xx/em28xx-input.c
@@ -557,7 +557,7 @@ void em28xx_deregister_snapshot_button(struct em28xx *dev)
 {
 	if (dev->sbutton_input_dev != NULL) {
 		em28xx_info("Deregistering snapshot button\n");
-		cancel_rearming_delayed_work(&dev->sbutton_query_work);
+		cancel_delayed_work_sync(&dev->sbutton_query_work);
 		input_unregister_device(dev->sbutton_input_dev);
 		dev->sbutton_input_dev = NULL;
 	}
diff --git a/drivers/net/chelsio/my3126.c b/drivers/net/chelsio/my3126.c
index 4c60285..a683fd3 100644
--- a/drivers/net/chelsio/my3126.c
+++ b/drivers/net/chelsio/my3126.c
@@ -22,7 +22,7 @@ static int my3126_interrupt_enable(struct cphy *cphy)

 static int my3126_interrupt_disable(struct cphy *cphy)
 {
-	cancel_rearming_delayed_work(&cphy->phy_update);
+	cancel_delayed_work_sync(&cphy->phy_update);
 	return 0;
 }

diff --git a/drivers/net/ibm_newemac/core.c b/drivers/net/ibm_newemac/core.c
index 06bb9b7..e209efa 100644
--- a/drivers/net/ibm_newemac/core.c
+++ b/drivers/net/ibm_newemac/core.c
@@ -1279,7 +1279,7 @@ static void emac_force_link_update(struct emac_instance *dev)
 	netif_carrier_off(dev->ndev);
 	smp_rmb();
 	if (dev->link_polling) {
-		cancel_rearming_delayed_work(&dev->link_work);
+		cancel_delayed_work_sync(&dev->link_work);
 		if (dev->link_polling)
 			schedule_delayed_work(&dev->link_work,  PHY_POLL_LINK_OFF);
 	}
@@ -1294,7 +1294,7 @@ static int emac_close(struct net_device *ndev)

 	if (dev->phy.address >= 0) {
 		dev->link_polling = 0;
-		cancel_rearming_delayed_work(&dev->link_work);
+		cancel_delayed_work_sync(&dev->link_work);
 	}
 	mutex_lock(&dev->link_lock);
 	emac_netif_stop(dev);
diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c
index 43307bd..6107304 100644
--- a/drivers/net/wireless/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zd1211rw/zd_mac.c
@@ -1207,7 +1207,6 @@ static void housekeeping_enable(struct zd_mac *mac)
 static void housekeeping_disable(struct zd_mac *mac)
 {
 	dev_dbg_f(zd_mac_dev(mac), "\n");
-	cancel_rearming_delayed_workqueue(zd_workqueue,
-		&mac->housekeeping.link_led_work);
+	cancel_delayed_work_sync(&mac->housekeeping.link_led_work);
 	zd_chip_control_leds(&mac->chip, ZD_LED_OFF);
 }
diff --git a/drivers/power/ds2760_battery.c b/drivers/power/ds2760_battery.c
index b3c01c1..e7f8978 100644
--- a/drivers/power/ds2760_battery.c
+++ b/drivers/power/ds2760_battery.c
@@ -580,10 +580,8 @@ static int ds2760_battery_remove(struct platform_device *pdev)
 {
 	struct ds2760_device_info *di = platform_get_drvdata(pdev);

-	cancel_rearming_delayed_workqueue(di->monitor_wqueue,
-					  &di->monitor_work);
-	cancel_rearming_delayed_workqueue(di->monitor_wqueue,
-					  &di->set_charged_work);
+	cancel_delayed_work_sync(&di->monitor_work);
+	cancel_delayed_work_sync(&di->set_charged_work);
 	destroy_workqueue(di->monitor_wqueue);
 	power_supply_unregister(&di->bat);
 	kfree(di);
diff --git a/drivers/power/intel_mid_battery.c b/drivers/power/intel_mid_battery.c
index 2a10cd3..36cf402 100644
--- a/drivers/power/intel_mid_battery.c
+++ b/drivers/power/intel_mid_battery.c
@@ -730,8 +730,7 @@ static __devinit int probe(int irq, struct device *dev)
 power_reg_failed_1:
 	power_supply_unregister(&pbi->batt);
 power_reg_failed:
-	cancel_rearming_delayed_workqueue(pbi->monitor_wqueue,
-						&pbi->monitor_battery);
+	cancel_delayed_work_sync(&pbi->monitor_battery);
 requestirq_failed:
 	destroy_workqueue(pbi->monitor_wqueue);
 wqueue_failed:
@@ -760,8 +759,7 @@ static int __devexit platform_pmic_battery_remove(struct platform_device *pdev)
 	struct pmic_power_module_info *pbi = dev_get_drvdata(&pdev->dev);

 	free_irq(pbi->irq, pbi);
-	cancel_rearming_delayed_workqueue(pbi->monitor_wqueue,
-					&pbi->monitor_battery);
+	cancel_delayed_work_sync(&pbi->monitor_battery);
 	destroy_workqueue(pbi->monitor_wqueue);

 	power_supply_unregister(&pbi->usb);
diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c
index 61685cc..d4a1f20 100644
--- a/drivers/staging/pohmelfs/inode.c
+++ b/drivers/staging/pohmelfs/inode.c
@@ -1318,8 +1318,8 @@ static void pohmelfs_put_super(struct super_block *sb)
 	}

 	psb->trans_scan_timeout = psb->drop_scan_timeout = 0;
-	cancel_rearming_delayed_work(&psb->dwork);
-	cancel_rearming_delayed_work(&psb->drop_dwork);
+	cancel_delayed_work_sync(&psb->dwork);
+	cancel_delayed_work_sync(&psb->drop_dwork);
 	flush_scheduled_work();

 	dprintk("%s: stopped workqueues.\n", __func__);
diff --git a/drivers/usb/atm/cxacru.c b/drivers/usb/atm/cxacru.c
index f383cb4..a845f8b 100644
--- a/drivers/usb/atm/cxacru.c
+++ b/drivers/usb/atm/cxacru.c
@@ -1247,7 +1247,7 @@ static void cxacru_unbind(struct usbatm_data *usbatm_instance,
 	mutex_unlock(&instance->poll_state_serialize);

 	if (is_polling)
-		cancel_rearming_delayed_work(&instance->poll_work);
+		cancel_delayed_work_sync(&instance->poll_work);

 	usb_kill_urb(instance->snd_urb);
 	usb_kill_urb(instance->rcv_urb);
diff --git a/drivers/video/fb_defio.c b/drivers/video/fb_defio.c
index 6b93ef9..8040001 100644
--- a/drivers/video/fb_defio.c
+++ b/drivers/video/fb_defio.c
@@ -75,7 +75,7 @@ int fb_deferred_io_fsync(struct file *file, int datasync)
 		return 0;

 	/* Kill off the delayed work */
-	cancel_rearming_delayed_work(&info->deferred_work);
+	cancel_delayed_work_sync(&info->deferred_work);

 	/* Run it immediately */
 	return schedule_delayed_work(&info->deferred_work, 0);
diff --git a/drivers/video/omap/lcd_mipid.c b/drivers/video/omap/lcd_mipid.c
index 64dcc74..90e3bdd 100644
--- a/drivers/video/omap/lcd_mipid.c
+++ b/drivers/video/omap/lcd_mipid.c
@@ -396,7 +396,7 @@ static void mipid_esd_start_check(struct mipid_device *md)
 static void mipid_esd_stop_check(struct mipid_device *md)
 {
 	if (md->esd_check != NULL)
-		cancel_rearming_delayed_workqueue(md->esd_wq, &md->esd_work);
+		cancel_delayed_work_sync(&md->esd_work);
 }

 static void mipid_esd_work(struct work_struct *work)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 116cab9..fbd18c3 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4336,7 +4336,7 @@ __nfs4_state_shutdown(void)
 void
 nfs4_state_shutdown(void)
 {
-	cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work);
+	cancel_delayed_work_sync(&laundromat_work);
 	destroy_workqueue(laundry_wq);
 	locks_end_grace(&nfsd4_manager);
 	nfs4_lock_state();
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index 45ce15d..edfa178 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -408,7 +408,7 @@ xfs_mru_cache_flush(
 	spin_lock(&mru->lock);
 	if (mru->queued) {
 		spin_unlock(&mru->lock);
-		cancel_rearming_delayed_workqueue(xfs_mru_reap_wq, &mru->work);
+		cancel_delayed_work_sync(&mru->work);
 		spin_lock(&mru->lock);
 	}

diff --git a/mm/slab.c b/mm/slab.c
index b1e40da..dc98386 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1293,7 +1293,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 		 * anything expensive but will only modify reap_work
 		 * and reschedule the timer.
 		*/
-		cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
+		cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
 		/* Now the cache_reaper is guaranteed to be not running. */
 		per_cpu(slab_reap_work, cpu).work.func = NULL;
   		break;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8f62f17..33c33e7 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1033,7 +1033,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
-		cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+		cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
 		per_cpu(vmstat_work, cpu).work.func = NULL;
 		break;
 	case CPU_DOWN_FAILED:
diff --git a/net/atm/lec.c b/net/atm/lec.c
index 181d70c..96a4a4b 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -1608,7 +1608,7 @@ static void lec_arp_destroy(struct lec_priv *priv)
 	struct lec_arp_table *entry;
 	int i;

-	cancel_rearming_delayed_work(&priv->lec_arp_work);
+	cancel_delayed_work_sync(&priv->lec_arp_work);

 	/*
 	 * Remove all entries
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 4e98ffa..d291094 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -925,7 +925,7 @@ void __netpoll_cleanup(struct netpoll *np)

 		skb_queue_purge(&npinfo->arp_tx);
 		skb_queue_purge(&npinfo->txq);
-		cancel_rearming_delayed_work(&npinfo->tx_work);
+		cancel_delayed_work_sync(&npinfo->tx_work);

 		/* clean after last, unfinished work */
 		__skb_queue_purge(&npinfo->txq);
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5f5daa3..96334e0 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3432,7 +3432,7 @@ void ip_vs_control_cleanup(void)
 {
 	EnterFunction(2);
 	ip_vs_trash_cleanup();
-	cancel_rearming_delayed_work(&defense_work);
+	cancel_delayed_work_sync(&defense_work);
 	cancel_work_sync(&defense_work.work);
 	ip_vs_kill_estimator(&ip_vs_stats);
 	unregister_sysctl_table(sysctl_header);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index dfcab5a..96549df 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -770,7 +770,7 @@ static void xs_destroy(struct rpc_xprt *xprt)

 	dprintk("RPC:       xs_destroy xprt %p\n", xprt);

-	cancel_rearming_delayed_work(&transport->connect_worker);
+	cancel_delayed_work_sync(&transport->connect_worker);

 	xs_close(xprt);
 	xs_free_peer_addresses(xprt);
--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox