Netdev List
 help / color / mirror / Atom feed
* [PATCH v3 1/2] Always free struct memcg through schedule_work()
From: Glauber Costa @ 2012-04-26 21:24 UTC (permalink / raw)
  To: cgroups
  Cc: netdev, Li Zefan, Tejun Heo, kamezawa.hiroyu, linux-mm, devel,
	Glauber Costa, Johannes Weiner, Michal Hocko
In-Reply-To: <1335475463-25167-1-git-send-email-glommer@parallels.com>

Right now we free struct memcg with kfree right after a
rcu grace period, but defer it if we need to use vfree() to get
rid of that memory area. We do that by need, because we need vfree
to be called in a process context.

This patch unifies this behavior, by ensuring that even kfree will
happen in a separate thread. The goal is to have a stable place to
call the upcoming jump label destruction function outside the realm
of the complicated and quite far-reaching cgroup lock (that can't be
held when calling neither the cpu_hotplug.lock nor the jump_label_mutex)

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizefan@huawei.com>
CC: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Michal Hocko <mhocko@suse.cz>
---
 mm/memcontrol.c |   24 +++++++++++++-----------
 1 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7832b4d..b0076cc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -245,8 +245,8 @@ struct mem_cgroup {
 		 */
 		struct rcu_head rcu_freeing;
 		/*
-		 * But when using vfree(), that cannot be done at
-		 * interrupt time, so we must then queue the work.
+		 * We also need some space for a worker in deferred freeing.
+		 * By the time we call it, rcu_freeing is not longer in use.
 		 */
 		struct work_struct work_freeing;
 	};
@@ -4826,23 +4826,28 @@ out_free:
 }
 
 /*
- * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
+ * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
  * but in process context.  The work_freeing structure is overlaid
  * on the rcu_freeing structure, which itself is overlaid on memsw.
  */
-static void vfree_work(struct work_struct *work)
+static void free_work(struct work_struct *work)
 {
 	struct mem_cgroup *memcg;
+	int size = sizeof(struct mem_cgroup);
 
 	memcg = container_of(work, struct mem_cgroup, work_freeing);
-	vfree(memcg);
+	if (size < PAGE_SIZE)
+		kfree(memcg);
+	else
+		vfree(memcg);
 }
-static void vfree_rcu(struct rcu_head *rcu_head)
+
+static void free_rcu(struct rcu_head *rcu_head)
 {
 	struct mem_cgroup *memcg;
 
 	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
-	INIT_WORK(&memcg->work_freeing, vfree_work);
+	INIT_WORK(&memcg->work_freeing, free_work);
 	schedule_work(&memcg->work_freeing);
 }
 
@@ -4868,10 +4873,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 		free_mem_cgroup_per_zone_info(memcg, node);
 
 	free_percpu(memcg->stat);
-	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
-		kfree_rcu(memcg, rcu_freeing);
-	else
-		call_rcu(&memcg->rcu_freeing, vfree_rcu);
+	call_rcu(&memcg->rcu_freeing, free_rcu);
 }
 
 static void mem_cgroup_get(struct mem_cgroup *memcg)
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH v3 2/2] decrement static keys on real destroy time
From: Glauber Costa @ 2012-04-26 21:24 UTC (permalink / raw)
  To: cgroups
  Cc: netdev, Li Zefan, Tejun Heo, kamezawa.hiroyu, linux-mm, devel,
	Glauber Costa
In-Reply-To: <1335475463-25167-1-git-send-email-glommer@parallels.com>

We call the destroy function when a cgroup starts to be removed,
such as by a rmdir event.

However, because of our reference counters, some objects are still
inflight. Right now, we are decrementing the static_keys at destroy()
time, meaning that if we get rid of the last static_key reference,
some objects will still have charges, but the code to properly
uncharge them won't be run.

This becomes a problem specially if it is ever enabled again, because
now new charges will be added to the staled charges making keeping
it pretty much impossible.

We just need to be careful with the static branch activation:
since there is no particular preferred order of their activation,
we need to make sure that we only start using it after all
call sites are active. This is achieved by having a per-memcg
flag that is only updated after static_key_slow_inc() returns.
At this time, we are sure all sites are active.

This is made per-memcg, not global, for a reason:
it also has the effect of making socket accounting more
consistent. The first memcg to be limited will trigger static_key()
activation, therefore, accounting. But all the others will then be
accounted no matter what. After this patch, only limited memcgs
will have its sockets accounted.

[v2: changed a tcp limited flag for a generic proto limited flag ]
[v3: update the current active flag only after the static_key update ]
[v4: disarm_static_keys() inside free_work ]

Signed-off-by: Glauber Costa <glommer@parallels.com>
---
 include/net/sock.h        |    9 ++++++
 mm/memcontrol.c           |   31 ++++++++++++++++++-
 net/ipv4/tcp_memcontrol.c |   70 ++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 101 insertions(+), 9 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index b3ebe6b..c5a2010 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -914,6 +914,15 @@ struct cg_proto {
 	int			*memory_pressure;
 	long			*sysctl_mem;
 	/*
+	 * active means it is currently active, and new sockets should
+	 * be assigned to cgroups.
+	 *
+	 * activated means it was ever activated, and we need to
+	 * disarm the static keys on destruction
+	 */
+	bool			activated;
+	bool			active; 
+	/*
 	 * memcg field is used to find which memcg we belong directly
 	 * Each memcg struct can hold more than one cg_proto, so container_of
 	 * won't really cut.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b0076cc..53a0815 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -404,6 +404,7 @@ void sock_update_memcg(struct sock *sk)
 {
 	if (mem_cgroup_sockets_enabled) {
 		struct mem_cgroup *memcg;
+		struct cg_proto *cg_proto;
 
 		BUG_ON(!sk->sk_prot->proto_cgroup);
 
@@ -423,9 +424,10 @@ void sock_update_memcg(struct sock *sk)
 
 		rcu_read_lock();
 		memcg = mem_cgroup_from_task(current);
-		if (!mem_cgroup_is_root(memcg)) {
+		cg_proto = sk->sk_prot->proto_cgroup(memcg);
+		if (!mem_cgroup_is_root(memcg) && cg_proto->active) {
 			mem_cgroup_get(memcg);
-			sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
+			sk->sk_cgrp = cg_proto;
 		}
 		rcu_read_unlock();
 	}
@@ -442,6 +444,14 @@ void sock_release_memcg(struct sock *sk)
 	}
 }
 
+static void disarm_static_keys(struct mem_cgroup *memcg)
+{
+#ifdef CONFIG_INET
+	if (memcg->tcp_mem.cg_proto.activated)
+		static_key_slow_dec(&memcg_socket_limit_enabled);
+#endif
+}
+
 #ifdef CONFIG_INET
 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 {
@@ -452,6 +462,11 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 #endif /* CONFIG_INET */
+#else
+static inline void disarm_static_keys(struct mem_cgroup *memcg)
+{
+}
+
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
 static void drain_all_stock_async(struct mem_cgroup *memcg);
@@ -4836,6 +4851,18 @@ static void free_work(struct work_struct *work)
 	int size = sizeof(struct mem_cgroup);
 
 	memcg = container_of(work, struct mem_cgroup, work_freeing);
+	/*
+	 * We need to make sure that (at least for now), the jump label
+	 * destruction code runs outside of the cgroup lock. It is in theory
+	 * possible to call the cgroup destruction function outside of that
+	 * lock, but it is not yet done. rate limiting plus the deferred
+	 * interface for static_branch destruction guarantees that it will
+	 * run through schedule_work(), therefore, not holding any cgroup
+	 * related lock (this is, of course, until someone decides to write
+	 * a schedule_work cgroup :p )
+	 */
+
+	disarm_static_keys(memcg);
 	if (size < PAGE_SIZE)
 		kfree(memcg);
 	else
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 1517037..7790008 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -54,6 +54,8 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	cg_proto->sysctl_mem = tcp->tcp_prot_mem;
 	cg_proto->memory_allocated = &tcp->tcp_memory_allocated;
 	cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
+	cg_proto->active = false;
+	cg_proto->activated = false;
 	cg_proto->memcg = memcg;
 
 	return 0;
@@ -74,12 +76,43 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
 	percpu_counter_destroy(&tcp->tcp_sockets_allocated);
 
 	val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
-
-	if (val != RESOURCE_MAX)
-		static_key_slow_dec(&memcg_socket_limit_enabled);
 }
 EXPORT_SYMBOL(tcp_destroy_cgroup);
 
+/*
+ * This is to prevent two writes arriving at the same time
+ * at kmem.tcp.limit_in_bytes.
+ *
+ * There is a race at the first time we write to this file:
+ *
+ * - cg_proto->activated == false for all writers.
+ * - They all do a static_key_slow_inc().
+ * - When we are finally read to decrement the static_keys,
+ *   we'll do it only once per activated cgroup. So we won't
+ *   be able to disable it.
+ *
+ *   Also, after the first caller increments the static_branch
+ *   counter, all others will return right away. That does not mean,
+ *   however, that the update is finished.
+ *
+ *   Without this mutex, it would then be possible for a second writer
+ *   to get to the update site, return 
+ *
+ *   When a user updates limit of 2 cgroups at once, following happens.
+ *
+ *   	CPU A				CPU B
+ *
+ *	if (cg_proto->activated)	if (cg->proto_activated)
+ *		static_key_inc()		static_key_inc()
+ * 		=> set counter 0->1		=> set counter 1->2,
+ * 						return immediately.
+ * 		=> hold mutex			=> cg_proto->activated = true. 
+ * 		=> overwrite jmps.
+ *
+ * This race was described by Kamezawa Hiroyuki.
+ */
+static DEFINE_MUTEX(tcp_set_limit_mutex);
+
 static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 {
 	struct net *net = current->nsproxy->net_ns;
@@ -107,10 +140,33 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 		tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
 					     net->ipv4.sysctl_tcp_mem[i]);
 
-	if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX)
-		static_key_slow_dec(&memcg_socket_limit_enabled);
-	else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX)
-		static_key_slow_inc(&memcg_socket_limit_enabled);
+	if (val == RESOURCE_MAX)
+		cg_proto->active = false;
+	else if (val != RESOURCE_MAX) {
+		/*
+		 * ->activated needs to be written after the static_key update.
+		 *  This is what guarantees that the socket activation function
+		 *  is the last one to run. See sock_update_memcg() for details,
+		 *  and note that we don't mark any socket as belonging to this
+		 *  memcg until that flag is up.
+		 *
+		 *  We need to do this, because static_keys will span multiple
+		 *  sites, but we can't control their order. If we mark a socket
+		 *  as accounted, but the accounting functions are not patched in
+		 *  yet, we'll lose accounting.
+		 *
+		 *  We never race with the readers in sock_update_memcg(), because
+		 *  when this value change, the code to process it is not patched in
+		 *  yet.
+		 */
+		mutex_lock(&tcp_set_limit_mutex);
+		if (!cg_proto->activated) {
+			static_key_slow_inc(&memcg_socket_limit_enabled);
+			cg_proto->activated = true;
+		}
+		mutex_unlock(&tcp_set_limit_mutex);
+		cg_proto->active = true;
+	}
 
 	return 0;
 }
-- 
1.7.7.6

^ permalink raw reply related

* Re: [PATCH v3 2/2] decrement static keys on real destroy time
From: Tejun Heo @ 2012-04-26 21:39 UTC (permalink / raw)
  To: Glauber Costa
  Cc: cgroups-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	Li Zefan, kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, devel-GEFAQzZX7r8dnm+yROfE0A
In-Reply-To: <1335475463-25167-3-git-send-email-glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>

Hello, Glauber.

Overall, I like this approach much better.  Just some nits below.

On Thu, Apr 26, 2012 at 06:24:23PM -0300, Glauber Costa wrote:
> @@ -4836,6 +4851,18 @@ static void free_work(struct work_struct *work)
>  	int size = sizeof(struct mem_cgroup);
>  
>  	memcg = container_of(work, struct mem_cgroup, work_freeing);
> +	/*
> +	 * We need to make sure that (at least for now), the jump label
> +	 * destruction code runs outside of the cgroup lock. It is in theory
> +	 * possible to call the cgroup destruction function outside of that
> +	 * lock, but it is not yet done. rate limiting plus the deferred
> +	 * interface for static_branch destruction guarantees that it will
> +	 * run through schedule_work(), therefore, not holding any cgroup
> +	 * related lock (this is, of course, until someone decides to write
> +	 * a schedule_work cgroup :p )
> +	 */

Isn't the above a bit too verbose?  Wouldn't just stating the locking
dependency be enough?

> +	disarm_static_keys(memcg);
>  	if (size < PAGE_SIZE)
>  		kfree(memcg);
>  	else
> diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
> index 1517037..7790008 100644
> --- a/net/ipv4/tcp_memcontrol.c
> +++ b/net/ipv4/tcp_memcontrol.c
> @@ -54,6 +54,8 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
>  	cg_proto->sysctl_mem = tcp->tcp_prot_mem;
>  	cg_proto->memory_allocated = &tcp->tcp_memory_allocated;
>  	cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
> +	cg_proto->active = false;
> +	cg_proto->activated = false;

Isn't the memory zallocd?  I find 0 / NULL / false inits unnecessary
and even misleading (can the memory be non-zero here?).  Another side
effect is that it tends to get out of sync as more fields are added.

> +/*
> + * This is to prevent two writes arriving at the same time
> + * at kmem.tcp.limit_in_bytes.
> + *
> + * There is a race at the first time we write to this file:
> + *
> + * - cg_proto->activated == false for all writers.
> + * - They all do a static_key_slow_inc().
> + * - When we are finally read to decrement the static_keys,
                            ^
                            ready

> + *   we'll do it only once per activated cgroup. So we won't
> + *   be able to disable it.
> + *
> + *   Also, after the first caller increments the static_branch
> + *   counter, all others will return right away. That does not mean,
> + *   however, that the update is finished.
> + *
> + *   Without this mutex, it would then be possible for a second writer
> + *   to get to the update site, return 

I kinda don't follow the above sentence.

> + *   When a user updates limit of 2 cgroups at once, following happens.
> + *
> + *   	CPU A				CPU B
> + *
> + *	if (cg_proto->activated)	if (cg->proto_activated)
> + *		static_key_inc()		static_key_inc()
> + * 		=> set counter 0->1		=> set counter 1->2,
> + * 						return immediately.
> + * 		=> hold mutex			=> cg_proto->activated = true. 
> + * 		=> overwrite jmps.

Isn't this something which should be solved from static_keys API?  Why
is this being worked around from memcg?  Also, I again hope that the
explanation is slightly more concise.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: Heavy spin_lock contention in __udp4_lib_mcast_deliver increase
From: Shawn Bohrer @ 2012-04-26 21:44 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1335457888.2775.55.camel@edumazet-glaptop>

On Thu, Apr 26, 2012 at 06:31:28PM +0200, Eric Dumazet wrote:
> On Thu, 2012-04-26 at 11:28 -0500, Shawn Bohrer wrote:
> 
> > No in this case it is 300 unique multicast addresses, and there is one
> > socket listening to each multicast address.  So a single message is
> > only copied once to a single socket.  The bottle neck appears to be
> > that even though a single message is only going to get copied to a
> > single socket we still have to walk the list of all 300 sockets while
> > holding the spin lock to figure that out.  The incoming packet rate is
> > also roughly evenly distributed across all 300 multicast addresses so
> > even though we have multiple receive queues they are all contending
> > for the same spin lock.
> > 
> 
> I repeat my question : Are these 300 sockets bound to the same UDP
> port ?
> 
> If not, they should be spreaded in hash table.
> 
> You can make this hash table very big to reduce hash collisions
> 
> Boot parameter : uhash_entries=65536

Thanks Eric I don't know how I missed this.  In my test all 300
sockets were bound to the same UDP port so they were all falling into
the same bucket.  Switching the test to use unique ports solves the
issue.

I didn't try your other patch to increase the stack size up to 512
sockets because I don't think we need it.  We rarely have more than a
single socket per machine receiving packets on a multicast address so
I think the current stack size is sufficient for us.  Or perhaps once
again I may be misunderstanding the purpose of that patch.

--
Shawn

-- 

---------------------------------------------------------------
This email, along with any attachments, is confidential. If you 
believe you received this message in error, please contact the 
sender immediately and delete all copies of the message.  
Thank you.

^ permalink raw reply

* [PATCH] NET: smsc-ircc2: mark non-experimental
From: Linus Walleij @ 2012-04-26 21:51 UTC (permalink / raw)
  To: netdev; +Cc: Linus Walleij

From: Linus Walleij <linus.walleij@linaro.org>

This has been used by me and others for ages, let's stop calling it
experimental.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/net/irda/Kconfig      |    4 ++--
 drivers/net/irda/smsc-ircc2.c |    1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/irda/Kconfig b/drivers/net/irda/Kconfig
index 4680478..3575844 100644
--- a/drivers/net/irda/Kconfig
+++ b/drivers/net/irda/Kconfig
@@ -321,8 +321,8 @@ config AU1000_FIR
 	  Say M to build a module; it will be called au1k_ir.ko
 
 config SMC_IRCC_FIR
-	tristate "SMSC IrCC (EXPERIMENTAL)"
-	depends on EXPERIMENTAL && IRDA && ISA_DMA_API
+	tristate "SMSC IrCC"
+	depends on IRDA && ISA_DMA_API
 	help
 	  Say Y here if you want to build support for the SMC Infrared
 	  Communications Controller.  It is used in a wide variety of
diff --git a/drivers/net/irda/smsc-ircc2.c b/drivers/net/irda/smsc-ircc2.c
index 6c95d40..a926813 100644
--- a/drivers/net/irda/smsc-ircc2.c
+++ b/drivers/net/irda/smsc-ircc2.c
@@ -1,7 +1,6 @@
 /*********************************************************************
  *
  * Description:   Driver for the SMC Infrared Communications Controller
- * Status:        Experimental.
  * Author:        Daniele Peri (peri@csai.unipa.it)
  * Created at:
  * Modified at:
-- 
1.7.9.2

^ permalink raw reply related

* Re: [PATCH v3 2/2] decrement static keys on real destroy time
From: Glauber Costa @ 2012-04-26 21:58 UTC (permalink / raw)
  To: Tejun Heo; +Cc: cgroups, netdev, Li Zefan, kamezawa.hiroyu, linux-mm, devel
In-Reply-To: <20120426213916.GD27486@google.com>

On 04/26/2012 06:39 PM, Tejun Heo wrote:
> Hello, Glauber.
>
> Overall, I like this approach much better.  Just some nits below.
>
> On Thu, Apr 26, 2012 at 06:24:23PM -0300, Glauber Costa wrote:
>> @@ -4836,6 +4851,18 @@ static void free_work(struct work_struct *work)
>>   	int size = sizeof(struct mem_cgroup);
>>
>>   	memcg = container_of(work, struct mem_cgroup, work_freeing);
>> +	/*
>> +	 * We need to make sure that (at least for now), the jump label
>> +	 * destruction code runs outside of the cgroup lock. It is in theory
>> +	 * possible to call the cgroup destruction function outside of that
>> +	 * lock, but it is not yet done. rate limiting plus the deferred
>> +	 * interface for static_branch destruction guarantees that it will
>> +	 * run through schedule_work(), therefore, not holding any cgroup
>> +	 * related lock (this is, of course, until someone decides to write
>> +	 * a schedule_work cgroup :p )
>> +	 */
>
> Isn't the above a bit too verbose?  Wouldn't just stating the locking
> dependency be enough?

I used a lot of verbosity here because it is a tricky and racy issue.
I am fine with trimming the comments if this is considered too much.

>>   	cg_proto->sysctl_mem = tcp->tcp_prot_mem;
>>   	cg_proto->memory_allocated =&tcp->tcp_memory_allocated;
>>   	cg_proto->sockets_allocated =&tcp->tcp_sockets_allocated;
>> +	cg_proto->active = false;
>> +	cg_proto->activated = false;
>
> Isn't the memory zallocd?  I find 0 / NULL / false inits unnecessary
> and even misleading (can the memory be non-zero here?).  Another side
> effect is that it tends to get out of sync as more fields are added.

I can take them off.

>
>> +/*
>> + * This is to prevent two writes arriving at the same time
>> + * at kmem.tcp.limit_in_bytes.
>> + *
>> + * There is a race at the first time we write to this file:
>> + *
>> + * - cg_proto->activated == false for all writers.
>> + * - They all do a static_key_slow_inc().
>> + * - When we are finally read to decrement the static_keys,
>                              ^
>                              ready

Thanks.

>> + *   we'll do it only once per activated cgroup. So we won't
>> + *   be able to disable it.
>> + *
>> + *   Also, after the first caller increments the static_branch
>> + *   counter, all others will return right away. That does not mean,
>> + *   however, that the update is finished.
>> + *
>> + *   Without this mutex, it would then be possible for a second writer
>> + *   to get to the update site, return
>
> I kinda don't follow the above sentence.

I will try to rephrase it for more clarity. But this is the thing behind 
this patchset coming and going with so many attempts:

jump label updates are atomic given a single patch site. But they are 
*not* atomic given multiple patch sites.

In our case, they are pretty spread around. Which means that while some 
of them are already patched, some are not. If the socket marking in 
sock_update_memcg is done last, we're fine, because all the accounters 
test for that. Otherwise, we can misaccount.

To protect against that, we use the "activated" field. But it need to be 
lock-protected, otherwise a second writer can arrive here before the 
update is finished, update the accounted field, and we're down to the 
same problem as before.

>> + *   When a user updates limit of 2 cgroups at once, following happens.
>> + *
>> + *   	CPU A				CPU B
>> + *
>> + *	if (cg_proto->activated)	if (cg->proto_activated)
>> + *		static_key_inc()		static_key_inc()
>> + * 		=>  set counter 0->1		=>  set counter 1->2,
>> + * 						return immediately.
>> + * 		=>  hold mutex			=>  cg_proto->activated = true.
>> + * 		=>  overwrite jmps.
>
> Isn't this something which should be solved from static_keys API?  Why
> is this being worked around from memcg?  Also, I again hope that the
> explanation is slightly more concise.
>

At first I though that we could get rid of all this complication by 
calling stop machine from the static_branch API. This would all 
magically go away. I actually even tried it.

However, reading the code for other architectures (other than x86), I 
found that they usually rely on the fixed instruction size to just patch 
an instruction atomically and go home happy.

Using stop machine and the like would slow them down considerably. Not 
only slow down the static branch update (which is acceptable), but 
everybody else (which is horrible). It seemed to defeat the purpose of 
static branches a bit.

The other users of static branches seems to be fine coping with the fact 
that in cases with multiple-sites, they will spread in time.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH v3 2/2] decrement static keys on real destroy time
From: Tejun Heo @ 2012-04-26 22:13 UTC (permalink / raw)
  To: Glauber Costa; +Cc: cgroups, netdev, Li Zefan, kamezawa.hiroyu, linux-mm, devel
In-Reply-To: <4F99C50D.6070503@parallels.com>

Hello, Glauber.

On Thu, Apr 26, 2012 at 06:58:37PM -0300, Glauber Costa wrote:
> At first I though that we could get rid of all this complication by
> calling stop machine from the static_branch API. This would all
> magically go away. I actually even tried it.
> 
> However, reading the code for other architectures (other than x86),
> I found that they usually rely on the fixed instruction size to just
> patch an instruction atomically and go home happy.
> 
> Using stop machine and the like would slow them down considerably.
> Not only slow down the static branch update (which is acceptable),
> but everybody else (which is horrible). It seemed to defeat the
> purpose of static branches a bit.
> 
> The other users of static branches seems to be fine coping with the
> fact that in cases with multiple-sites, they will spread in time.

No, what I mean is that why can't you do about the same mutexed
activated inside static_key API function instead of requiring every
user to worry about the function returning asynchronously.
ie. synchronize inside static_key API instead of in the callers.

Thanks.

-- 
tejun

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH v3 2/2] decrement static keys on real destroy time
From: Glauber Costa @ 2012-04-26 22:17 UTC (permalink / raw)
  To: Tejun Heo; +Cc: cgroups, netdev, Li Zefan, kamezawa.hiroyu, linux-mm, devel
In-Reply-To: <20120426221324.GE27486@google.com>

[-- Attachment #1: Type: text/plain, Size: 281 bytes --]


> No, what I mean is that why can't you do about the same mutexed
> activated inside static_key API function instead of requiring every
> user to worry about the function returning asynchronously.
> ie. synchronize inside static_key API instead of in the callers.
>

Like this?



[-- Attachment #2: jump_label.patch --]
[-- Type: text/x-patch, Size: 1360 bytes --]

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 4304919..f7cdc18 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -57,10 +57,11 @@ static void jump_label_update(struct static_key *key, int enable);
 
 void static_key_slow_inc(struct static_key *key)
 {
+	jump_label_lock();
+
 	if (atomic_inc_not_zero(&key->enabled))
-		return;
+		goto out;
 
-	jump_label_lock();
 	if (atomic_read(&key->enabled) == 0) {
 		if (!jump_label_get_branch_default(key))
 			jump_label_update(key, JUMP_LABEL_ENABLE);
@@ -68,6 +69,7 @@ void static_key_slow_inc(struct static_key *key)
 			jump_label_update(key, JUMP_LABEL_DISABLE);
 	}
 	atomic_inc(&key->enabled);
+out:
 	jump_label_unlock();
 }
 EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -75,10 +77,11 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
 static void __static_key_slow_dec(struct static_key *key,
 		unsigned long rate_limit, struct delayed_work *work)
 {
-	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
+	jump_label_lock();
+	if (atomic_dec_and_test(&key->enabled)) {
 		WARN(atomic_read(&key->enabled) < 0,
 		     "jump label: negative count!\n");
-		return;
+		goto out;
 	}
 
 	if (rate_limit) {
@@ -90,6 +93,8 @@ static void __static_key_slow_dec(struct static_key *key,
 		else
 			jump_label_update(key, JUMP_LABEL_ENABLE);
 	}
+
+out:
 	jump_label_unlock();
 }
 

^ permalink raw reply related

* Re: [PATCH v3 2/2] decrement static keys on real destroy time
From: Tejun Heo @ 2012-04-26 22:22 UTC (permalink / raw)
  To: Glauber Costa
  Cc: cgroups-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	Li Zefan, kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, devel-GEFAQzZX7r8dnm+yROfE0A
In-Reply-To: <4F99C980.3030801-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>

Hello,

On Thu, Apr 26, 2012 at 3:17 PM, Glauber Costa <glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org> wrote:
>
>> No, what I mean is that why can't you do about the same mutexed
>> activated inside static_key API function instead of requiring every
>> user to worry about the function returning asynchronously.
>> ie. synchronize inside static_key API instead of in the callers.
>>
>
> Like this?

Yeah, something like that.  If keeping the inc operation a single
atomic op is important for performance or whatever reasons, you can
play some trick with large negative bias value while activation is
going on and use atomic_add_return() to determine both whether it's
the first incrementer and someone else is in the process of
activating.

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH v3 2/2] decrement static keys on real destroy time
From: Glauber Costa @ 2012-04-26 22:28 UTC (permalink / raw)
  To: Tejun Heo; +Cc: cgroups, netdev, Li Zefan, kamezawa.hiroyu, linux-mm, devel
In-Reply-To: <CAOS58YOKUq7GTTZRcw19dth+HgThoNTEcqBKeNO0ftB4rFJ97A@mail.gmail.com>

On 04/26/2012 07:22 PM, Tejun Heo wrote:
> Hello,
>
> On Thu, Apr 26, 2012 at 3:17 PM, Glauber Costa<glommer@parallels.com>  wrote:
>>
>>> No, what I mean is that why can't you do about the same mutexed
>>> activated inside static_key API function instead of requiring every
>>> user to worry about the function returning asynchronously.
>>> ie. synchronize inside static_key API instead of in the callers.
>>>
>>
>> Like this?
>
> Yeah, something like that.  If keeping the inc operation a single
> atomic op is important for performance or whatever reasons, you can
> play some trick with large negative bias value while activation is
> going on and use atomic_add_return() to determine both whether it's
> the first incrementer and someone else is in the process of
> activating.
>
> Thanks.
>
We need a broader audience for this, but if I understand the interface 
right, those functions should not be called in fast paths at all 
(contrary to the static_branch tests)

The static_branch tests can be called from irq context, so we can't just 
get rid of the atomic op and use the mutex everywhere, we'd have
to live with both.

I will repost this series, with some more people in the CC list.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH v3 2/2] decrement static keys on real destroy time
From: Tejun Heo @ 2012-04-26 22:32 UTC (permalink / raw)
  To: Glauber Costa
  Cc: cgroups-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	Li Zefan, kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A,
	linux-mm-Bw31MaZKKs3YtjvyW6yDsg, devel-GEFAQzZX7r8dnm+yROfE0A
In-Reply-To: <4F99CC17.4080006-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>

On Thu, Apr 26, 2012 at 3:28 PM, Glauber Costa <glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org> wrote:
> We need a broader audience for this, but if I understand the interface
> right, those functions should not be called in fast paths at all (contrary
> to the static_branch tests)
>
> The static_branch tests can be called from irq context, so we can't just get
> rid of the atomic op and use the mutex everywhere, we'd have
> to live with both.
>
> I will repost this series, with some more people in the CC list.

Great, thanks!

-- 
tejun

^ permalink raw reply

* [PATCH v4 0/3] fix problem with static_branch() for sock memcg
From: Glauber Costa @ 2012-04-26 22:51 UTC (permalink / raw)
  To: cgroups
  Cc: netdev, linux-kernel, Li Zefan, Tejun Heo, kamezawa.hiroyu,
	linux-mm, devel

Hi,

While trying to fulfill's Christoph's request for using static_branches
to do part of the role of number_of_cpusets in the cpuset cgroup, I took
a much more extensive look at the cpuset code (Thanks Christoph).

I started to feel that removing the cgroup_lock() from cpuset's
destroy is not as safe as I first imagined. At the very best, is not safe
enough to be bundled in a bugfix and deserves its own analysis.

I started then to consider another approach. While I voiced many times
that I would not like to do deferred updates for the static_branches, doing
that during destroy time would be perfectly acceptable IMHO (creation is
another story). In a summary, we are effectively calling the static_branch
updates only when the last reference to the memcg is gone. And that is
already asynchronous by nature, and we cope well with that.

In memcg, it turns out that we already do deferred freeing of the memcg
structure depending on the size of struct mem_cgroup.

My proposal is to always do that, and then we get a worker more or less
for free. Patch 3 is basically the same I had posted before, but without
the mutex lock protection, now in the static branch guaranteed interface.

Let me know if this is acceptable.

Thanks

Glauber Costa (3):
  make jump_labels wait while updates are in place
  Always free struct memcg through schedule_work()
  decrement static keys on real destroy time

 include/net/sock.h        |    9 ++++++++
 kernel/jump_label.c       |   13 ++++++++---
 mm/memcontrol.c           |   50 +++++++++++++++++++++++++++++++++-----------
 net/ipv4/tcp_memcontrol.c |   34 ++++++++++++++++++++++++------
 4 files changed, 82 insertions(+), 24 deletions(-)

-- 
1.7.7.6

^ permalink raw reply

* [PATCH v4 1/3] make jump_labels wait while updates are in place
From: Glauber Costa @ 2012-04-26 22:51 UTC (permalink / raw)
  To: cgroups
  Cc: netdev, linux-kernel, Li Zefan, Tejun Heo, kamezawa.hiroyu,
	linux-mm, devel, Glauber Costa, Johannes Weiner, Michal Hocko,
	Ingo Molnar, Jason Baron
In-Reply-To: <1335480667-8301-1-git-send-email-glommer@parallels.com>

In mem cgroup, we need to guarantee that two concurrent updates
of the jump_label interface wait for each other. IOW, we can't have
other updates returning while the first one is still patching the
kernel around, otherwise we'll race.

I believe this is something that can fit well in the static branch
API, without noticeable disadvantages:

* in the common case, it will be a quite simple lock/unlock operation
* Every context that calls static_branch_slow* already expects to be
  in sleeping context because it will mutex_lock the unlikely case.
* static_key_slow_inc is not expected to be called in any fast path,
  otherwise it would be expected to have quite a different name. Therefore
  the mutex + atomic combination instead of just an atomic should not kill
  us.

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizefan@huawei.com>
CC: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Michal Hocko <mhocko@suse.cz>
CC: Ingo Molnar <mingo@elte.hu>
CC: Jason Baron <jbaron@redhat.com>
---
 kernel/jump_label.c |   21 +++++++++++----------
 1 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 4304919..5d09cb4 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -57,17 +57,16 @@ static void jump_label_update(struct static_key *key, int enable);
 
 void static_key_slow_inc(struct static_key *key)
 {
+	jump_label_lock();
 	if (atomic_inc_not_zero(&key->enabled))
-		return;
+		goto out;
 
-	jump_label_lock();
-	if (atomic_read(&key->enabled) == 0) {
-		if (!jump_label_get_branch_default(key))
-			jump_label_update(key, JUMP_LABEL_ENABLE);
-		else
-			jump_label_update(key, JUMP_LABEL_DISABLE);
-	}
+	if (!jump_label_get_branch_default(key))
+		jump_label_update(key, JUMP_LABEL_ENABLE);
+	else
+		jump_label_update(key, JUMP_LABEL_DISABLE);
 	atomic_inc(&key->enabled);
+out:
 	jump_label_unlock();
 }
 EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -75,10 +74,11 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
 static void __static_key_slow_dec(struct static_key *key,
 		unsigned long rate_limit, struct delayed_work *work)
 {
-	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
+	jump_label_lock();
+	if (atomic_dec_and_test(&key->enabled)) {
 		WARN(atomic_read(&key->enabled) < 0,
 		     "jump label: negative count!\n");
-		return;
+		goto out;
 	}
 
 	if (rate_limit) {
@@ -90,6 +90,7 @@ static void __static_key_slow_dec(struct static_key *key,
 		else
 			jump_label_update(key, JUMP_LABEL_ENABLE);
 	}
+out:
 	jump_label_unlock();
 }
 
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH v4 2/3] Always free struct memcg through schedule_work()
From: Glauber Costa @ 2012-04-26 22:51 UTC (permalink / raw)
  To: cgroups
  Cc: netdev, linux-kernel, Li Zefan, Tejun Heo, kamezawa.hiroyu,
	linux-mm, devel, Glauber Costa, Johannes Weiner, Michal Hocko
In-Reply-To: <1335480667-8301-1-git-send-email-glommer@parallels.com>

Right now we free struct memcg with kfree right after a
rcu grace period, but defer it if we need to use vfree() to get
rid of that memory area. We do that by need, because we need vfree
to be called in a process context.

This patch unifies this behavior, by ensuring that even kfree will
happen in a separate thread. The goal is to have a stable place to
call the upcoming jump label destruction function outside the realm
of the complicated and quite far-reaching cgroup lock (that can't be
held when calling neither the cpu_hotplug.lock nor the jump_label_mutex)

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizefan@huawei.com>
CC: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Michal Hocko <mhocko@suse.cz>
---
 mm/memcontrol.c |   24 +++++++++++++-----------
 1 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7832b4d..b0076cc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -245,8 +245,8 @@ struct mem_cgroup {
 		 */
 		struct rcu_head rcu_freeing;
 		/*
-		 * But when using vfree(), that cannot be done at
-		 * interrupt time, so we must then queue the work.
+		 * We also need some space for a worker in deferred freeing.
+		 * By the time we call it, rcu_freeing is not longer in use.
 		 */
 		struct work_struct work_freeing;
 	};
@@ -4826,23 +4826,28 @@ out_free:
 }
 
 /*
- * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
+ * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
  * but in process context.  The work_freeing structure is overlaid
  * on the rcu_freeing structure, which itself is overlaid on memsw.
  */
-static void vfree_work(struct work_struct *work)
+static void free_work(struct work_struct *work)
 {
 	struct mem_cgroup *memcg;
+	int size = sizeof(struct mem_cgroup);
 
 	memcg = container_of(work, struct mem_cgroup, work_freeing);
-	vfree(memcg);
+	if (size < PAGE_SIZE)
+		kfree(memcg);
+	else
+		vfree(memcg);
 }
-static void vfree_rcu(struct rcu_head *rcu_head)
+
+static void free_rcu(struct rcu_head *rcu_head)
 {
 	struct mem_cgroup *memcg;
 
 	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
-	INIT_WORK(&memcg->work_freeing, vfree_work);
+	INIT_WORK(&memcg->work_freeing, free_work);
 	schedule_work(&memcg->work_freeing);
 }
 
@@ -4868,10 +4873,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 		free_mem_cgroup_per_zone_info(memcg, node);
 
 	free_percpu(memcg->stat);
-	if (sizeof(struct mem_cgroup) < PAGE_SIZE)
-		kfree_rcu(memcg, rcu_freeing);
-	else
-		call_rcu(&memcg->rcu_freeing, vfree_rcu);
+	call_rcu(&memcg->rcu_freeing, free_rcu);
 }
 
 static void mem_cgroup_get(struct mem_cgroup *memcg)
-- 
1.7.7.6

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v4 3/3] decrement static keys on real destroy time
From: Glauber Costa @ 2012-04-26 22:51 UTC (permalink / raw)
  To: cgroups
  Cc: netdev, linux-kernel, Li Zefan, Tejun Heo, kamezawa.hiroyu,
	linux-mm, devel, Glauber Costa, Johannes Weiner, Michal Hocko
In-Reply-To: <1335480667-8301-1-git-send-email-glommer@parallels.com>

We call the destroy function when a cgroup starts to be removed,
such as by a rmdir event.

However, because of our reference counters, some objects are still
inflight. Right now, we are decrementing the static_keys at destroy()
time, meaning that if we get rid of the last static_key reference,
some objects will still have charges, but the code to properly
uncharge them won't be run.

This becomes a problem specially if it is ever enabled again, because
now new charges will be added to the staled charges making keeping
it pretty much impossible.

We just need to be careful with the static branch activation:
since there is no particular preferred order of their activation,
we need to make sure that we only start using it after all
call sites are active. This is achieved by having a per-memcg
flag that is only updated after static_key_slow_inc() returns.
At this time, we are sure all sites are active.

This is made per-memcg, not global, for a reason:
it also has the effect of making socket accounting more
consistent. The first memcg to be limited will trigger static_key()
activation, therefore, accounting. But all the others will then be
accounted no matter what. After this patch, only limited memcgs
will have its sockets accounted.

[v2: changed a tcp limited flag for a generic proto limited flag ]
[v3: update the current active flag only after the static_key update ]
[v4: disarm_static_keys() inside free_work ]
[v5: got rid of tcp_limit_mutex, now in the static_key interface ]

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizefan@huawei.com>
CC: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Michal Hocko <mhocko@suse.cz>
---
 include/net/sock.h        |    9 +++++++++
 mm/memcontrol.c           |   26 ++++++++++++++++++++++++--
 net/ipv4/tcp_memcontrol.c |   34 +++++++++++++++++++++++++++-------
 3 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index b3ebe6b..c5a2010 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -914,6 +914,15 @@ struct cg_proto {
 	int			*memory_pressure;
 	long			*sysctl_mem;
 	/*
+	 * active means it is currently active, and new sockets should
+	 * be assigned to cgroups.
+	 *
+	 * activated means it was ever activated, and we need to
+	 * disarm the static keys on destruction
+	 */
+	bool			activated;
+	bool			active; 
+	/*
 	 * memcg field is used to find which memcg we belong directly
 	 * Each memcg struct can hold more than one cg_proto, so container_of
 	 * won't really cut.
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b0076cc..3d004ee 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -404,6 +404,7 @@ void sock_update_memcg(struct sock *sk)
 {
 	if (mem_cgroup_sockets_enabled) {
 		struct mem_cgroup *memcg;
+		struct cg_proto *cg_proto;
 
 		BUG_ON(!sk->sk_prot->proto_cgroup);
 
@@ -423,9 +424,10 @@ void sock_update_memcg(struct sock *sk)
 
 		rcu_read_lock();
 		memcg = mem_cgroup_from_task(current);
-		if (!mem_cgroup_is_root(memcg)) {
+		cg_proto = sk->sk_prot->proto_cgroup(memcg);
+		if (!mem_cgroup_is_root(memcg) && cg_proto->active) {
 			mem_cgroup_get(memcg);
-			sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
+			sk->sk_cgrp = cg_proto;
 		}
 		rcu_read_unlock();
 	}
@@ -442,6 +444,14 @@ void sock_release_memcg(struct sock *sk)
 	}
 }
 
+static void disarm_static_keys(struct mem_cgroup *memcg)
+{
+#ifdef CONFIG_INET
+	if (memcg->tcp_mem.cg_proto.activated)
+		static_key_slow_dec(&memcg_socket_limit_enabled);
+#endif
+}
+
 #ifdef CONFIG_INET
 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 {
@@ -452,6 +462,11 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 #endif /* CONFIG_INET */
+#else
+static inline void disarm_static_keys(struct mem_cgroup *memcg)
+{
+}
+
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
 static void drain_all_stock_async(struct mem_cgroup *memcg);
@@ -4836,6 +4851,13 @@ static void free_work(struct work_struct *work)
 	int size = sizeof(struct mem_cgroup);
 
 	memcg = container_of(work, struct mem_cgroup, work_freeing);
+	/*
+	 * We need to make sure that (at least for now), the jump label
+	 * destruction code runs outside of the cgroup lock. schedule_work()
+	 * will guarantee this happens. Be careful if you need to move this
+	 * disarm_static_keys around
+	 */
+	disarm_static_keys(memcg);
 	if (size < PAGE_SIZE)
 		kfree(memcg);
 	else
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 1517037..81004df 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -54,6 +54,8 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	cg_proto->sysctl_mem = tcp->tcp_prot_mem;
 	cg_proto->memory_allocated = &tcp->tcp_memory_allocated;
 	cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
+	cg_proto->active = false;
+	cg_proto->activated = false;
 	cg_proto->memcg = memcg;
 
 	return 0;
@@ -74,9 +76,6 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
 	percpu_counter_destroy(&tcp->tcp_sockets_allocated);
 
 	val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
-
-	if (val != RESOURCE_MAX)
-		static_key_slow_dec(&memcg_socket_limit_enabled);
 }
 EXPORT_SYMBOL(tcp_destroy_cgroup);
 
@@ -107,10 +106,31 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
 		tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
 					     net->ipv4.sysctl_tcp_mem[i]);
 
-	if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX)
-		static_key_slow_dec(&memcg_socket_limit_enabled);
-	else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX)
-		static_key_slow_inc(&memcg_socket_limit_enabled);
+	if (val == RESOURCE_MAX)
+		cg_proto->active = false;
+	else if (val != RESOURCE_MAX) {
+		/*
+		 * ->activated needs to be written after the static_key update.
+		 *  This is what guarantees that the socket activation function
+		 *  is the last one to run. See sock_update_memcg() for details,
+		 *  and note that we don't mark any socket as belonging to this
+		 *  memcg until that flag is up.
+		 *
+		 *  We need to do this, because static_keys will span multiple
+		 *  sites, but we can't control their order. If we mark a socket
+		 *  as accounted, but the accounting functions are not patched in
+		 *  yet, we'll lose accounting.
+		 *
+		 *  We never race with the readers in sock_update_memcg(), because
+		 *  when this value change, the code to process it is not patched in
+		 *  yet.
+		 */
+		if (!cg_proto->activated) {
+			static_key_slow_inc(&memcg_socket_limit_enabled);
+			cg_proto->activated = true;
+		}
+		cg_proto->active = true;
+	}
 
 	return 0;
 }
-- 
1.7.7.6

^ permalink raw reply related

* static_branch() for netpoll?
From: Stephen Hemminger @ 2012-04-26 23:22 UTC (permalink / raw)
  To: Eric Dumazet, David Miller; +Cc: netdev

It would be good to use static_branch instead of the current logic used
to detect if traffic trapping is need for netpoll. Most distro's end up
enabling NETPOLL, yet most users never need it.  Not sure about the locking
requirements since netpoll_rx() runs in both hard and soft irq context.

^ permalink raw reply

* [PATCH] e1000e:  MSI interrupt test failed, using legacy interrupt
From: prasanna.panchamukhi @ 2012-04-27  0:05 UTC (permalink / raw)
  To: bruce.w.allan, jeffrey.t.kirsher, jeffrey.e.pieper, bhutchings,
	gospo, sassmann, jesse.brandeburg
  Cc: e1000-devel, netdev, prasanna.panchamukhi

From: Prasanna S. Panchamukhi <ppanchamukhi@riverbed.com>

Following logs where seen on Systems with multiple NICs & ports,
while using MSI interrupts as shown below:

Feb 16 15:09:32 (none) user.notice kernel: 0000:00:0d.0: lan0_0: NIC Link is Up
1000 Mbps Full Duplex, Flow Control: RX/TX
Feb 16 15:09:32 (none) user.notice kernel: 0000:40:0d.0: wan0_1: NIC Link is Up
1000 Mbps Full Duplex, Flow Control: RX/TX
Feb 16 15:09:32 (none) user.notice kernel: 0000:40:0d.0: lan0_1: NIC Link is Up
1000 Mbps Full Duplex, Flow Control: RX/TX
Feb 16 15:09:32 (none) user.warn kernel: 0000:40:0e.0: wan4_0: MSI interrupt
test failed, using legacy interrupt.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Feb 16 15:09:32 (none) user.notice kernel: 0000:00:0e.0: wan1_0: NIC Link is Up
1000 Mbps Full Duplex, Flow Control: RX/TX
Feb 16 15:09:33 (none) user.notice kernel: 0000:00:0e.0: lan1_0: NIC Link is Up
1000 Mbps Full Duplex, Flow Control: RX/TX
Feb 16 15:09:33 (none) user.notice kernel: 0000:00:0f.0: wan2_0: NIC Link is Up
1000 Mbps Full Duplex, Flow Control: RX/TX
Feb 16 15:09:33 (none) user.notice kernel: 0000:00:0f.0: lan2_0: NIC Link is Up
1000 Mbps Full Duplex, Flow Control: RX/TX
Feb 16 15:09:33 (none) user.notice kernel: 0000:40:0a.0: wan3_0: NIC Link is Up
1000 Mbps Full Duplex, Flow Control: RX/TX
Feb 16 15:09:33 (none) user.notice kernel: 0000:40:0a.0: lan3_0: NIC Link is Up
1000 Mbps Full Duplex, Flow Control: RX/TX
Feb 16 15:09:34 (none) user.notice kernel: 0000:40:0e.0: lan4_0: NIC Link is Up
1000 Mbps Full Duplex, Flow Control: RX/TX
Feb 16 15:09:34 (none) user.notice kernel: 0000:40:0f.0: wan5_0: NIC Link is Up
1000 Mbps Full Duplex, Flow Control: RX/TX
Feb 16 15:09:34 (none) user.notice kernel: 0000:40:0f.0: lan5_0: NIC Link is Up
1000 Mbps Full Duplex, Flow Control: RX/TX

This patch changes the IRQ tests to use polling loops starting with a
delay of 1 tick and doubling that if necessary up to a maximum total
delay of approximately 1 second.

Signed-off-by: Prasanna S. Panchamukhi <ppanchamukhi@riverbed.com>
---
 drivers/net/ethernet/intel/e1000e/netdev.c |   27 +++++++++++++++++++++++----
 1 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 19ab215..8ff0fff 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -98,6 +98,16 @@ struct e1000_reg_info {
 #define E1000_TDFTS	0x03428	/* Tx Data FIFO Tail Saved - RW */
 #define E1000_TDFPC	0x03430	/* Tx Data FIFO Packet Count - RW */
 
+/* IRQ latency can be enormous because:
+ * - All IRQs may be disabled on a CPU for a *long* time by e.g. a
+ *   slow serial console or an old IDE driver doing error recovery
+ * - The PREEMPT_RT patches mostly deal with this, but also allow a
+ *   tasklet or normal task to be given higher priority than our IRQ
+ *   threads
+ * Try to avoid blaming the hardware for this.
+ */
+#define IRQ_TIMEOUT HZ
+
 static const struct e1000_reg_info e1000_reg_info_tbl[] = {
 
 	/* General Registers */
@@ -3767,6 +3777,7 @@ static int e1000_test_msi_interrupt(struct e1000_adapter *adapter)
 {
 	struct net_device *netdev = adapter->netdev;
 	struct e1000_hw *hw = &adapter->hw;
+	unsigned long timeout, wait;
 	int err;
 
 	/* poll_enable hasn't been called yet, so don't need disable */
@@ -3799,12 +3810,20 @@ static int e1000_test_msi_interrupt(struct e1000_adapter *adapter)
 	/* fire an unusual interrupt on the test handler */
 	ew32(ICS, E1000_ICS_RXSEQ);
 	e1e_flush();
-	msleep(50);
-
+	timeout = jiffies + IRQ_TIMEOUT;
+	wait = 1;
+	e_dbg("Waiting for MSI interrupt!\n");
+	do {
+		schedule_timeout_uninterruptible(wait);
+		rmb();
+		if (!(adapter->flags & FLAG_MSI_TEST_FAILED))
+			goto success;
+		wait *= 2;
+	} while (time_before(jiffies, timeout));
+
+success:
 	e1000_irq_disable(adapter);
 
-	rmb();
-
 	if (adapter->flags & FLAG_MSI_TEST_FAILED) {
 		adapter->int_mode = E1000E_INT_MODE_LEGACY;
 		e_info("MSI interrupt test failed, using legacy interrupt.\n");
-- 
1.7.0.4


------------------------------------------------------------------------------
Live Security Virtual Conference
Exclusive live event will cover all the ways today's security and 
threat landscape has changed and how IT managers can respond. Discussions 
will include endpoint security, mobile security and the latest in malware 
threats. http://www.accelacomm.com/jaw/sfrnl04242012/114/50122263/
_______________________________________________
E1000-devel mailing list
E1000-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/e1000-devel
To learn more about Intel&#174; Ethernet, visit http://communities.intel.com/community/wired

^ permalink raw reply related

* Re: [PATCH] e1000e:  MSI interrupt test failed, using legacy interrupt
From: Jeff Kirsher @ 2012-04-27  0:20 UTC (permalink / raw)
  To: prasanna.panchamukhi
  Cc: bruce.w.allan, jeffrey.t.kirsher, jeffrey.e.pieper, bhutchings,
	gospo, sassmann, jesse.brandeburg, e1000-devel, netdev
In-Reply-To: <1335485150-2765-1-git-send-email-prasanna.panchamukhi@riverbed.com>

[-- Attachment #1: Type: text/plain, Size: 2478 bytes --]

On 04/26/2012 05:05 PM, prasanna.panchamukhi@riverbed.com wrote:
> From: Prasanna S. Panchamukhi <ppanchamukhi@riverbed.com>
>
> Following logs where seen on Systems with multiple NICs & ports,
> while using MSI interrupts as shown below:
>
> Feb 16 15:09:32 (none) user.notice kernel: 0000:00:0d.0: lan0_0: NIC Link is Up
> 1000 Mbps Full Duplex, Flow Control: RX/TX
> Feb 16 15:09:32 (none) user.notice kernel: 0000:40:0d.0: wan0_1: NIC Link is Up
> 1000 Mbps Full Duplex, Flow Control: RX/TX
> Feb 16 15:09:32 (none) user.notice kernel: 0000:40:0d.0: lan0_1: NIC Link is Up
> 1000 Mbps Full Duplex, Flow Control: RX/TX
> Feb 16 15:09:32 (none) user.warn kernel: 0000:40:0e.0: wan4_0: MSI interrupt
> test failed, using legacy interrupt.
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> Feb 16 15:09:32 (none) user.notice kernel: 0000:00:0e.0: wan1_0: NIC Link is Up
> 1000 Mbps Full Duplex, Flow Control: RX/TX
> Feb 16 15:09:33 (none) user.notice kernel: 0000:00:0e.0: lan1_0: NIC Link is Up
> 1000 Mbps Full Duplex, Flow Control: RX/TX
> Feb 16 15:09:33 (none) user.notice kernel: 0000:00:0f.0: wan2_0: NIC Link is Up
> 1000 Mbps Full Duplex, Flow Control: RX/TX
> Feb 16 15:09:33 (none) user.notice kernel: 0000:00:0f.0: lan2_0: NIC Link is Up
> 1000 Mbps Full Duplex, Flow Control: RX/TX
> Feb 16 15:09:33 (none) user.notice kernel: 0000:40:0a.0: wan3_0: NIC Link is Up
> 1000 Mbps Full Duplex, Flow Control: RX/TX
> Feb 16 15:09:33 (none) user.notice kernel: 0000:40:0a.0: lan3_0: NIC Link is Up
> 1000 Mbps Full Duplex, Flow Control: RX/TX
> Feb 16 15:09:34 (none) user.notice kernel: 0000:40:0e.0: lan4_0: NIC Link is Up
> 1000 Mbps Full Duplex, Flow Control: RX/TX
> Feb 16 15:09:34 (none) user.notice kernel: 0000:40:0f.0: wan5_0: NIC Link is Up
> 1000 Mbps Full Duplex, Flow Control: RX/TX
> Feb 16 15:09:34 (none) user.notice kernel: 0000:40:0f.0: lan5_0: NIC Link is Up
> 1000 Mbps Full Duplex, Flow Control: RX/TX
>
> This patch changes the IRQ tests to use polling loops starting with a
> delay of 1 tick and doubling that if necessary up to a maximum total
> delay of approximately 1 second.
>
> Signed-off-by: Prasanna S. Panchamukhi <ppanchamukhi@riverbed.com>
> ---
>  drivers/net/ethernet/intel/e1000e/netdev.c |   27 +++++++++++++++++++++++----
>  1 files changed, 23 insertions(+), 4 deletions(-)
Thanks Prasanna, Dave did accept the previous patch, so does this patch
take that into account?



[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 900 bytes --]

^ permalink raw reply

* Re: [PATCH] e1000e:  MSI interrupt test failed, using legacy interrupt
From: Prasanna Panchamukhi @ 2012-04-27  0:42 UTC (permalink / raw)
  To: jeffrey.t.kirsher
  Cc: Jeff Kirsher, prasanna.panchamukhi, bruce.w.allan,
	jeffrey.e.pieper, bhutchings, gospo, sassmann, jesse.brandeburg,
	e1000-devel, netdev
In-Reply-To: <4F99E653.4000608@gmail.com>

On 04/26/2012 05:20 PM, Jeff Kirsher wrote:
> On 04/26/2012 05:05 PM, prasanna.panchamukhi@riverbed.com wrote:
>> From: Prasanna S. Panchamukhi<ppanchamukhi@riverbed.com>
>>
>> Following logs where seen on Systems with multiple NICs&  ports,
>> while using MSI interrupts as shown below:
>>
>> Feb 16 15:09:32 (none) user.notice kernel: 0000:00:0d.0: lan0_0: NIC Link is Up
>> 1000 Mbps Full Duplex, Flow Control: RX/TX
>> Feb 16 15:09:32 (none) user.notice kernel: 0000:40:0d.0: wan0_1: NIC Link is Up
>> 1000 Mbps Full Duplex, Flow Control: RX/TX
>> Feb 16 15:09:32 (none) user.notice kernel: 0000:40:0d.0: lan0_1: NIC Link is Up
>> 1000 Mbps Full Duplex, Flow Control: RX/TX
>> Feb 16 15:09:32 (none) user.warn kernel: 0000:40:0e.0: wan4_0: MSI interrupt
>> test failed, using legacy interrupt.
>> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>> Feb 16 15:09:32 (none) user.notice kernel: 0000:00:0e.0: wan1_0: NIC Link is Up
>> 1000 Mbps Full Duplex, Flow Control: RX/TX
>> Feb 16 15:09:33 (none) user.notice kernel: 0000:00:0e.0: lan1_0: NIC Link is Up
>> 1000 Mbps Full Duplex, Flow Control: RX/TX
>> Feb 16 15:09:33 (none) user.notice kernel: 0000:00:0f.0: wan2_0: NIC Link is Up
>> 1000 Mbps Full Duplex, Flow Control: RX/TX
>> Feb 16 15:09:33 (none) user.notice kernel: 0000:00:0f.0: lan2_0: NIC Link is Up
>> 1000 Mbps Full Duplex, Flow Control: RX/TX
>> Feb 16 15:09:33 (none) user.notice kernel: 0000:40:0a.0: wan3_0: NIC Link is Up
>> 1000 Mbps Full Duplex, Flow Control: RX/TX
>> Feb 16 15:09:33 (none) user.notice kernel: 0000:40:0a.0: lan3_0: NIC Link is Up
>> 1000 Mbps Full Duplex, Flow Control: RX/TX
>> Feb 16 15:09:34 (none) user.notice kernel: 0000:40:0e.0: lan4_0: NIC Link is Up
>> 1000 Mbps Full Duplex, Flow Control: RX/TX
>> Feb 16 15:09:34 (none) user.notice kernel: 0000:40:0f.0: wan5_0: NIC Link is Up
>> 1000 Mbps Full Duplex, Flow Control: RX/TX
>> Feb 16 15:09:34 (none) user.notice kernel: 0000:40:0f.0: lan5_0: NIC Link is Up
>> 1000 Mbps Full Duplex, Flow Control: RX/TX
>>
>> This patch changes the IRQ tests to use polling loops starting with a
>> delay of 1 tick and doubling that if necessary up to a maximum total
>> delay of approximately 1 second.
>>
>> Signed-off-by: Prasanna S. Panchamukhi<ppanchamukhi@riverbed.com>
>> ---
>>   drivers/net/ethernet/intel/e1000e/netdev.c |   27 +++++++++++++++++++++++----
>>   1 files changed, 23 insertions(+), 4 deletions(-)
> Thanks Prasanna, Dave did accept the previous patch, so does this patch
> take that into account?
No, If Dave has already accepted previous patch we can ignore this patch 
for now.

Thanks,
Prasanna
>
>

^ permalink raw reply

* Re: [PATCH v4 1/3] make jump_labels wait while updates are in place
From: Steven Rostedt @ 2012-04-27  0:43 UTC (permalink / raw)
  To: Glauber Costa
  Cc: cgroups, netdev, linux-kernel, Li Zefan, Tejun Heo,
	kamezawa.hiroyu, linux-mm, devel, Johannes Weiner, Michal Hocko,
	Ingo Molnar, Jason Baron
In-Reply-To: <1335480667-8301-2-git-send-email-glommer@parallels.com>

On Thu, Apr 26, 2012 at 07:51:05PM -0300, Glauber Costa wrote:
> In mem cgroup, we need to guarantee that two concurrent updates
> of the jump_label interface wait for each other. IOW, we can't have
> other updates returning while the first one is still patching the
> kernel around, otherwise we'll race.

But it shouldn't. The code as is should prevent that.

> 
> I believe this is something that can fit well in the static branch
> API, without noticeable disadvantages:
> 
> * in the common case, it will be a quite simple lock/unlock operation
> * Every context that calls static_branch_slow* already expects to be
>   in sleeping context because it will mutex_lock the unlikely case.
> * static_key_slow_inc is not expected to be called in any fast path,
>   otherwise it would be expected to have quite a different name. Therefore
>   the mutex + atomic combination instead of just an atomic should not kill
>   us.
> 
> Signed-off-by: Glauber Costa <glommer@parallels.com>
> CC: Tejun Heo <tj@kernel.org>
> CC: Li Zefan <lizefan@huawei.com>
> CC: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> CC: Johannes Weiner <hannes@cmpxchg.org>
> CC: Michal Hocko <mhocko@suse.cz>
> CC: Ingo Molnar <mingo@elte.hu>
> CC: Jason Baron <jbaron@redhat.com>
> ---
>  kernel/jump_label.c |   21 +++++++++++----------
>  1 files changed, 11 insertions(+), 10 deletions(-)
> 
> diff --git a/kernel/jump_label.c b/kernel/jump_label.c
> index 4304919..5d09cb4 100644
> --- a/kernel/jump_label.c
> +++ b/kernel/jump_label.c
> @@ -57,17 +57,16 @@ static void jump_label_update(struct static_key *key, int enable);
>  
>  void static_key_slow_inc(struct static_key *key)
>  {
> +	jump_label_lock();
>  	if (atomic_inc_not_zero(&key->enabled))
> -		return;

If key->enabled is not zero, there's nothing to be done. As the jump
label has already been enabled. Note, the key->enabled doesn't get set
until after the jump label is updated. Thus, if two tasks were to come
in, they both would be locked on the jump_label_lock().


> +		goto out;
>  
> -	jump_label_lock();
> -	if (atomic_read(&key->enabled) == 0) {
> -		if (!jump_label_get_branch_default(key))
> -			jump_label_update(key, JUMP_LABEL_ENABLE);
> -		else
> -			jump_label_update(key, JUMP_LABEL_DISABLE);
> -	}
> +	if (!jump_label_get_branch_default(key))
> +		jump_label_update(key, JUMP_LABEL_ENABLE);
> +	else
> +		jump_label_update(key, JUMP_LABEL_DISABLE);
>  	atomic_inc(&key->enabled);
> +out:
>  	jump_label_unlock();
>  }
>  EXPORT_SYMBOL_GPL(static_key_slow_inc);
> @@ -75,10 +74,11 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
>  static void __static_key_slow_dec(struct static_key *key,
>  		unsigned long rate_limit, struct delayed_work *work)
>  {
> -	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {
> +	jump_label_lock();
> +	if (atomic_dec_and_test(&key->enabled)) {
>  		WARN(atomic_read(&key->enabled) < 0,
>  		     "jump label: negative count!\n");
> -		return;

Here, it is similar. If enabled is > 1, it wouldn't need to do anything,
thus it would dec the counter and return. But if it were one, then the
lock would be taken. and set to zero. There shouldn't be a case where
two tasks came in to set it less than zero (then something is
unbalanced).

Are you hitting the WARN_ON?

-- Steve

> +		goto out;
>  	}
>  
>  	if (rate_limit) {
> @@ -90,6 +90,7 @@ static void __static_key_slow_dec(struct static_key *key,
>  		else
>  			jump_label_update(key, JUMP_LABEL_ENABLE);
>  	}
> +out:
>  	jump_label_unlock();
>  }
>  
> -- 
> 1.7.7.6

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] e1000e:  MSI interrupt test failed, using legacy interrupt
From: Jeff Kirsher @ 2012-04-27  0:46 UTC (permalink / raw)
  To: prasanna.panchamukhi
  Cc: bruce.w.allan, jeffrey.e.pieper, bhutchings, gospo, sassmann,
	jesse.brandeburg, e1000-devel, netdev
In-Reply-To: <4F99EB6B.7090000@riverbed.com>

[-- Attachment #1: Type: text/plain, Size: 2909 bytes --]

On Thu, 2012-04-26 at 17:42 -0700, Prasanna Panchamukhi wrote:
> On 04/26/2012 05:20 PM, Jeff Kirsher wrote:
> > On 04/26/2012 05:05 PM, prasanna.panchamukhi@riverbed.com wrote:
> >> From: Prasanna S. Panchamukhi<ppanchamukhi@riverbed.com>
> >>
> >> Following logs where seen on Systems with multiple NICs&  ports,
> >> while using MSI interrupts as shown below:
> >>
> >> Feb 16 15:09:32 (none) user.notice kernel: 0000:00:0d.0: lan0_0: NIC Link is Up
> >> 1000 Mbps Full Duplex, Flow Control: RX/TX
> >> Feb 16 15:09:32 (none) user.notice kernel: 0000:40:0d.0: wan0_1: NIC Link is Up
> >> 1000 Mbps Full Duplex, Flow Control: RX/TX
> >> Feb 16 15:09:32 (none) user.notice kernel: 0000:40:0d.0: lan0_1: NIC Link is Up
> >> 1000 Mbps Full Duplex, Flow Control: RX/TX
> >> Feb 16 15:09:32 (none) user.warn kernel: 0000:40:0e.0: wan4_0: MSI interrupt
> >> test failed, using legacy interrupt.
> >> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >> Feb 16 15:09:32 (none) user.notice kernel: 0000:00:0e.0: wan1_0: NIC Link is Up
> >> 1000 Mbps Full Duplex, Flow Control: RX/TX
> >> Feb 16 15:09:33 (none) user.notice kernel: 0000:00:0e.0: lan1_0: NIC Link is Up
> >> 1000 Mbps Full Duplex, Flow Control: RX/TX
> >> Feb 16 15:09:33 (none) user.notice kernel: 0000:00:0f.0: wan2_0: NIC Link is Up
> >> 1000 Mbps Full Duplex, Flow Control: RX/TX
> >> Feb 16 15:09:33 (none) user.notice kernel: 0000:00:0f.0: lan2_0: NIC Link is Up
> >> 1000 Mbps Full Duplex, Flow Control: RX/TX
> >> Feb 16 15:09:33 (none) user.notice kernel: 0000:40:0a.0: wan3_0: NIC Link is Up
> >> 1000 Mbps Full Duplex, Flow Control: RX/TX
> >> Feb 16 15:09:33 (none) user.notice kernel: 0000:40:0a.0: lan3_0: NIC Link is Up
> >> 1000 Mbps Full Duplex, Flow Control: RX/TX
> >> Feb 16 15:09:34 (none) user.notice kernel: 0000:40:0e.0: lan4_0: NIC Link is Up
> >> 1000 Mbps Full Duplex, Flow Control: RX/TX
> >> Feb 16 15:09:34 (none) user.notice kernel: 0000:40:0f.0: wan5_0: NIC Link is Up
> >> 1000 Mbps Full Duplex, Flow Control: RX/TX
> >> Feb 16 15:09:34 (none) user.notice kernel: 0000:40:0f.0: lan5_0: NIC Link is Up
> >> 1000 Mbps Full Duplex, Flow Control: RX/TX
> >>
> >> This patch changes the IRQ tests to use polling loops starting with a
> >> delay of 1 tick and doubling that if necessary up to a maximum total
> >> delay of approximately 1 second.
> >>
> >> Signed-off-by: Prasanna S. Panchamukhi<ppanchamukhi@riverbed.com>
> >> ---
> >>   drivers/net/ethernet/intel/e1000e/netdev.c |   27 +++++++++++++++++++++++----
> >>   1 files changed, 23 insertions(+), 4 deletions(-)
> > Thanks Prasanna, Dave did accept the previous patch, so does this patch
> > take that into account?
> No, If Dave has already accepted previous patch we can ignore this patch 
> for now.
> 
> Thanks,
> Prasanna

Yeah, he has accepted the previous patch so I will ignore this patch.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* [PATCH 1/5] ipvs: add check in ftp for initialized core
From: Simon Horman @ 2012-04-27  0:53 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer,
	Simon Horman
In-Reply-To: <1335488039-13471-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	Avoid crash when registering ip_vs_ftp after
the IPVS core initialization for netns fails. Do this by
checking for present core (net->ipvs).

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_ftp.c |    2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 538d74e..e39f693 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -439,6 +439,8 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
 	struct ip_vs_app *app;
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
+	if (!ipvs)
+		return -ENOENT;
 	app = kmemdup(&ip_vs_ftp, sizeof(struct ip_vs_app), GFP_KERNEL);
 	if (!app)
 		return -ENOMEM;
-- 
1.7.9.5


^ permalink raw reply related

* [PATCH 2/5] ipvs: reset ipvs pointer in netns
From: Simon Horman @ 2012-04-27  0:53 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer,
	Simon Horman
In-Reply-To: <1335488039-13471-1-git-send-email-horms@verge.net.au>

From: Julian Anastasov <ja@ssi.bg>

	Make sure net->ipvs is reset on netns cleanup or failed
initialization. It is needed for IPVS applications to know that
IPVS core is not loaded in netns.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_core.c |    2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 2555816..260b9ef 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1924,6 +1924,7 @@ protocol_fail:
 control_fail:
 	ip_vs_estimator_net_cleanup(net);
 estimator_fail:
+	net->ipvs = NULL;
 	return -ENOMEM;
 }
 
@@ -1936,6 +1937,7 @@ static void __net_exit __ip_vs_cleanup(struct net *net)
 	ip_vs_control_net_cleanup(net);
 	ip_vs_estimator_net_cleanup(net);
 	IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen);
+	net->ipvs = NULL;
 }
 
 static void __net_exit __ip_vs_dev_cleanup(struct net *net)
-- 
1.7.9.5


^ permalink raw reply related

* [PATCH 4/5] ipvs: take care of return value from protocol init_netns
From: Simon Horman @ 2012-04-27  0:53 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer,
	Simon Horman
In-Reply-To: <1335488039-13471-1-git-send-email-horms@verge.net.au>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

ip_vs_create_timeout_table() can return NULL
All functions protocol init_netns is affected of this patch.

Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h                   |    2 +-
 net/netfilter/ipvs/ip_vs_proto.c      |   11 +++++++++--
 net/netfilter/ipvs/ip_vs_proto_sctp.c |    5 ++++-
 net/netfilter/ipvs/ip_vs_proto_tcp.c  |    5 ++++-
 net/netfilter/ipvs/ip_vs_proto_udp.c  |    5 ++++-
 5 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 2bdee51..6d90dda 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -393,7 +393,7 @@ struct ip_vs_protocol {
 
 	void (*exit)(struct ip_vs_protocol *pp);
 
-	void (*init_netns)(struct net *net, struct ip_vs_proto_data *pd);
+	int (*init_netns)(struct net *net, struct ip_vs_proto_data *pd);
 
 	void (*exit_netns)(struct net *net, struct ip_vs_proto_data *pd);
 
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index a62360e..ed835e6 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -78,8 +78,15 @@ register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
 	ipvs->proto_data_table[hash] = pd;
 	atomic_set(&pd->appcnt, 0);	/* Init app counter */
 
-	if (pp->init_netns != NULL)
-		pp->init_netns(net, pd);
+	if (pp->init_netns != NULL) {
+		int ret = pp->init_netns(net, pd);
+		if (ret) {
+			/* unlink an free proto data */
+			ipvs->proto_data_table[hash] = pd->next;
+			kfree(pd);
+			return ret;
+		}
+	}
 
 	return 0;
 }
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 1fbf7a2..9f3fb75 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -1090,7 +1090,7 @@ out:
  *   timeouts is netns related now.
  * ---------------------------------------------
  */
-static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
+static int __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
@@ -1098,6 +1098,9 @@ static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
 	spin_lock_init(&ipvs->sctp_app_lock);
 	pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
 							sizeof(sctp_timeouts));
+	if (!pd->timeout_table)
+		return -ENOMEM;
+	return 0;
 }
 
 static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index ef8641f..cd609cc 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -677,7 +677,7 @@ void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
  *   timeouts is netns related now.
  * ---------------------------------------------
  */
-static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
+static int __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
@@ -685,7 +685,10 @@ static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
 	spin_lock_init(&ipvs->tcp_app_lock);
 	pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
 							sizeof(tcp_timeouts));
+	if (!pd->timeout_table)
+		return -ENOMEM;
 	pd->tcp_state_table =  tcp_states;
+	return 0;
 }
 
 static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index f4b7262..2fedb2d 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -467,7 +467,7 @@ udp_state_transition(struct ip_vs_conn *cp, int direction,
 	cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
 }
 
-static void __udp_init(struct net *net, struct ip_vs_proto_data *pd)
+static int __udp_init(struct net *net, struct ip_vs_proto_data *pd)
 {
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
@@ -475,6 +475,9 @@ static void __udp_init(struct net *net, struct ip_vs_proto_data *pd)
 	spin_lock_init(&ipvs->udp_app_lock);
 	pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
 							sizeof(udp_timeouts));
+	if (!pd->timeout_table)
+		return -ENOMEM;
+	return 0;
 }
 
 static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
-- 
1.7.9.5


^ permalink raw reply related

* [PATCH 5/5] ipvs: kernel oops - do_ip_vs_get_ctl
From: Simon Horman @ 2012-04-27  0:53 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvs-devel, netdev, netfilter-devel, Wensong Zhang,
	Julian Anastasov, Hans Schillstrom, Jesper Dangaard Brouer,
	Simon Horman
In-Reply-To: <1335488039-13471-1-git-send-email-horms@verge.net.au>

From: Hans Schillstrom <hans.schillstrom@ericsson.com>

Change order of init so netns init is ready
when register ioctl and netlink.

Ver2
	Whitespace fixes and __init added.

Reported-by: "Ryan O'Hara" <rohara@redhat.com>
Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 include/net/ip_vs.h             |    2 ++
 net/netfilter/ipvs/ip_vs_core.c |    9 +++++++
 net/netfilter/ipvs/ip_vs_ctl.c  |   52 ++++++++++++++++++++++-----------------
 3 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 6d90dda..72522f0 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -1203,6 +1203,8 @@ ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
 
 extern int ip_vs_use_count_inc(void);
 extern void ip_vs_use_count_dec(void);
+extern int ip_vs_register_nl_ioctl(void);
+extern void ip_vs_unregister_nl_ioctl(void);
 extern int ip_vs_control_init(void);
 extern void ip_vs_control_cleanup(void);
 extern struct ip_vs_dest *
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 260b9ef..00bdb1d 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1995,10 +1995,18 @@ static int __init ip_vs_init(void)
 		goto cleanup_dev;
 	}
 
+	ret = ip_vs_register_nl_ioctl();
+	if (ret < 0) {
+		pr_err("can't register netlink/ioctl.\n");
+		goto cleanup_hooks;
+	}
+
 	pr_info("ipvs loaded.\n");
 
 	return ret;
 
+cleanup_hooks:
+	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
 cleanup_dev:
 	unregister_pernet_device(&ipvs_core_dev_ops);
 cleanup_sub:
@@ -2014,6 +2022,7 @@ exit:
 
 static void __exit ip_vs_cleanup(void)
 {
+	ip_vs_unregister_nl_ioctl();
 	nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
 	unregister_pernet_device(&ipvs_core_dev_ops);
 	unregister_pernet_subsys(&ipvs_core_ops);	/* free ip_vs struct */
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 376d2b1..f558998 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3750,21 +3750,10 @@ void __net_exit ip_vs_control_net_cleanup(struct net *net)
 	free_percpu(ipvs->tot_stats.cpustats);
 }
 
-int __init ip_vs_control_init(void)
+int __init ip_vs_register_nl_ioctl(void)
 {
-	int idx;
 	int ret;
 
-	EnterFunction(2);
-
-	/* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
-	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
-		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
-		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
-	}
-
-	smp_wmb();	/* Do we really need it now ? */
-
 	ret = nf_register_sockopt(&ip_vs_sockopts);
 	if (ret) {
 		pr_err("cannot register sockopt.\n");
@@ -3776,28 +3765,47 @@ int __init ip_vs_control_init(void)
 		pr_err("cannot register Generic Netlink interface.\n");
 		goto err_genl;
 	}
-
-	ret = register_netdevice_notifier(&ip_vs_dst_notifier);
-	if (ret < 0)
-		goto err_notf;
-
-	LeaveFunction(2);
 	return 0;
 
-err_notf:
-	ip_vs_genl_unregister();
 err_genl:
 	nf_unregister_sockopt(&ip_vs_sockopts);
 err_sock:
 	return ret;
 }
 
+void ip_vs_unregister_nl_ioctl(void)
+{
+	ip_vs_genl_unregister();
+	nf_unregister_sockopt(&ip_vs_sockopts);
+}
+
+int __init ip_vs_control_init(void)
+{
+	int idx;
+	int ret;
+
+	EnterFunction(2);
+
+	/* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
+	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
+		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+	}
+
+	smp_wmb();	/* Do we really need it now ? */
+
+	ret = register_netdevice_notifier(&ip_vs_dst_notifier);
+	if (ret < 0)
+		return ret;
+
+	LeaveFunction(2);
+	return 0;
+}
+
 
 void ip_vs_control_cleanup(void)
 {
 	EnterFunction(2);
 	unregister_netdevice_notifier(&ip_vs_dst_notifier);
-	ip_vs_genl_unregister();
-	nf_unregister_sockopt(&ip_vs_sockopts);
 	LeaveFunction(2);
 }
-- 
1.7.9.5


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox