Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v7 4/8] per-cgroup tcp buffers control
From: Glauber Costa @ 2011-10-13 13:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: akpm, lizf, kamezawa.hiroyu, ebiederm, davem, paul, gthelen,
	netdev, linux-mm, kirill, avagin, devel, Glauber Costa
In-Reply-To: <1318511382-31051-1-git-send-email-glommer@parallels.com>

With all the infrastructure in place, this patch implements
per-cgroup control for tcp memory pressure handling.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Reviewed-by: KAMEZAWA Hiroyuki<kamezawa.hiroyu@jp.fujitsu.com>
CC: David S. Miller <davem@davemloft.net>
CC: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/memcontrol.h |    4 +
 include/net/sock.h         |   14 ++++
 include/net/tcp.h          |   17 +++++
 mm/memcontrol.c            |  141 ++++++++++++++++++++++++++++++++++++++++++++
 net/core/sock.c            |   39 +++++++++++-
 net/ipv4/tcp.c             |   47 +++++++--------
 net/ipv4/tcp_ipv4.c        |   11 ++++
 net/ipv6/tcp_ipv6.c        |   10 +++-
 8 files changed, 255 insertions(+), 28 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 99a8ba2..a27dad9 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -393,6 +393,10 @@ void sock_update_memcg(struct sock *sk);
 void sock_release_memcg(struct sock *sk);
 void memcg_sockets_allocated_dec(struct mem_cgroup *memcg, struct proto *prot);
 void memcg_sockets_allocated_inc(struct mem_cgroup *memcg, struct proto *prot);
+int tcp_init_cgroup(const struct proto *prot, struct cgroup *cgrp,
+		    struct cgroup_subsys *ss);
+void tcp_destroy_cgroup(const struct proto *prot, struct cgroup *cgrp,
+			struct cgroup_subsys *ss);
 #else
 /* memcontrol includes sockets.h, that includes memcontrol.h ... */
 static inline void memcg_sockets_allocated_dec(struct mem_cgroup *memcg,
diff --git a/include/net/sock.h b/include/net/sock.h
index 163f87b..efd7664 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -64,6 +64,8 @@
 #include <net/dst.h>
 #include <net/checksum.h>
 
+int sockets_populate(struct cgroup *cgrp, struct cgroup_subsys *ss);
+void sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss);
 /*
  * This structure really needs to be cleaned up.
  * Most of it is for TCP, and not used by any of
@@ -819,6 +821,18 @@ struct proto {
 	/* Pointer to the per-cgroup version of the the sysctl_mem field */
 	long			*(*prot_mem)(const struct mem_cgroup *memcg);
 
+	/*
+	 * cgroup specific init/deinit functions. Called once for all
+	 * protocols that implement it, from cgroups populate function.
+	 * This function has to setup any files the protocol want to
+	 * appear in the kmem cgroup filesystem.
+	 */
+	int			(*init_cgroup)(const struct proto *prot,
+					       struct cgroup *cgrp,
+					       struct cgroup_subsys *ss);
+	void			(*destroy_cgroup)(const struct proto *prot,
+						  struct cgroup *cgrp,
+						  struct cgroup_subsys *ss);
 	int			*sysctl_wmem;
 	int			*sysctl_rmem;
 	int			max_header;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index eac7bf6..ec57cf2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -31,6 +31,7 @@
 #include <linux/crypto.h>
 #include <linux/cryptohash.h>
 #include <linux/kref.h>
+#include <linux/res_counter.h>
 
 #include <net/inet_connection_sock.h>
 #include <net/inet_timewait_sock.h>
@@ -255,6 +256,21 @@ extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 
 struct mem_cgroup;
+struct tcp_memcontrol {
+	/* per-cgroup tcp memory pressure knobs */
+	struct res_counter tcp_memory_allocated;
+	struct percpu_counter tcp_sockets_allocated;
+	/* those two are read-mostly, leave them at the end */
+	long tcp_prot_mem[3];
+	int tcp_memory_pressure;
+};
+
+extern long *tcp_sysctl_mem_nocg(const struct mem_cgroup *memcg);
+struct percpu_counter *sockets_allocated_tcp_nocg(const struct mem_cgroup *memcg);
+int *memory_pressure_tcp_nocg(const struct mem_cgroup *memcg);
+long memory_allocated_tcp_add_nocg(struct mem_cgroup *memcg, long val,
+				   int *parent_status);
+
 extern long *tcp_sysctl_mem(const struct mem_cgroup *memcg);
 struct percpu_counter *sockets_allocated_tcp(const struct mem_cgroup *memcg);
 int *memory_pressure_tcp(const struct mem_cgroup *memcg);
@@ -1023,6 +1039,7 @@ static inline void tcp_openreq_init(struct request_sock *req,
 	ireq->loc_port = tcp_hdr(skb)->dest;
 }
 
+extern void tcp_enter_memory_pressure_nocg(struct sock *sk);
 extern void tcp_enter_memory_pressure(struct sock *sk);
 
 static inline int keepalive_intvl_when(const struct tcp_sock *tp)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4e71fd8..4e79171 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,6 +49,9 @@
 #include <linux/cpu.h>
 #include <linux/oom.h>
 #include "internal.h"
+#ifdef CONFIG_INET
+#include <net/tcp.h>
+#endif
 
 #include <asm/uaccess.h>
 
@@ -294,6 +297,10 @@ struct mem_cgroup {
 	 */
 	struct mem_cgroup_stat_cpu nocpu_base;
 	spinlock_t pcp_counter_lock;
+
+#ifdef CONFIG_INET
+	struct tcp_memcontrol tcp;
+#endif
 };
 
 /* Stuffs for move charges at task migration. */
@@ -377,10 +384,12 @@ enum mem_type {
 #define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
 
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
+static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont);
 /* Writing them here to avoid exposing memcg's inner layout */
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 #ifdef CONFIG_INET
 #include <net/sock.h>
+#include <net/ip.h>
 
 void sock_update_memcg(struct sock *sk)
 {
@@ -426,6 +435,119 @@ void memcg_sockets_allocated_inc(struct mem_cgroup *memcg, struct proto *prot)
 		percpu_counter_inc(prot->sockets_allocated(memcg));
 }
 EXPORT_SYMBOL(memcg_sockets_allocated_inc);
+
+/*
+ * Pressure flag: try to collapse.
+ * Technical note: it is used by multiple contexts non atomically.
+ * All the __sk_mem_schedule() is of this nature: accounting
+ * is strict, actions are advisory and have some latency.
+ */
+void tcp_enter_memory_pressure(struct sock *sk)
+{
+	struct mem_cgroup *memcg = sk->sk_cgrp;
+	if (!memcg->tcp.tcp_memory_pressure) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
+		memcg->tcp.tcp_memory_pressure = 1;
+	}
+}
+EXPORT_SYMBOL(tcp_enter_memory_pressure);
+
+#define CONSTCG(m) ((struct mem_cgroup *)(m))
+long *tcp_sysctl_mem(const struct mem_cgroup *memcg)
+{
+	return CONSTCG(memcg)->tcp.tcp_prot_mem;
+}
+EXPORT_SYMBOL(tcp_sysctl_mem);
+
+/*
+ * We will be passed a value in pages. But our limits are internally
+ * all in bytes. We need to convert it before testing the allocation,
+ * and convert it back when returning data to the network layer
+ */
+long memory_allocated_tcp_add(struct mem_cgroup *memcg, long val,
+			      int *parent_status)
+{
+	int ret = 0;
+	struct res_counter *failed;
+
+	if (val > 0) {
+		val <<= PAGE_SHIFT;
+		ret = res_counter_charge(&memcg->tcp.tcp_memory_allocated,
+					 val, &failed);
+		if (!ret)
+			*parent_status = UNDER_LIMIT;
+		else
+			*parent_status = OVER_LIMIT;
+	} else if (val < 0) {
+		if (*parent_status == OVER_LIMIT)
+			/*
+			 * res_counter charge already surely uncharged the
+			 * parent if something went wrong.
+			 */
+			WARN_ON(1);
+		else {
+			val = (-val) << PAGE_SHIFT;
+			res_counter_uncharge(&memcg->tcp.tcp_memory_allocated,
+					     val);
+		}
+	}
+
+	return res_counter_read_u64(&memcg->tcp.tcp_memory_allocated,
+				    RES_USAGE) >> PAGE_SHIFT;
+}
+EXPORT_SYMBOL(memory_allocated_tcp_add);
+
+int *memory_pressure_tcp(const struct mem_cgroup *memcg)
+{
+	return &CONSTCG(memcg)->tcp.tcp_memory_pressure;
+}
+EXPORT_SYMBOL(memory_pressure_tcp);
+
+struct percpu_counter *sockets_allocated_tcp(const struct mem_cgroup *memcg)
+{
+	return &CONSTCG(memcg)->tcp.tcp_sockets_allocated;
+}
+EXPORT_SYMBOL(sockets_allocated_tcp);
+
+static void tcp_create_cgroup(struct mem_cgroup *cg, struct cgroup_subsys *ss)
+{
+	struct res_counter *parent_res_counter = NULL;
+	struct mem_cgroup *parent = parent_mem_cgroup(cg);
+
+	if (parent)
+		parent_res_counter = &parent->tcp.tcp_memory_allocated;
+
+	cg->tcp.tcp_memory_pressure = 0;
+	res_counter_init(&cg->tcp.tcp_memory_allocated, parent_res_counter);
+	percpu_counter_init(&cg->tcp.tcp_sockets_allocated, 0);
+}
+
+int tcp_init_cgroup(const struct proto *prot, struct cgroup *cgrp,
+		    struct cgroup_subsys *ss)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+	/*
+	 * We need to initialize it at populate, not create time.
+	 * This is because net sysctl tables are not up until much
+	 * later
+	 */
+	memcg->tcp.tcp_prot_mem[0] = sysctl_tcp_mem[0];
+	memcg->tcp.tcp_prot_mem[1] = sysctl_tcp_mem[1];
+	memcg->tcp.tcp_prot_mem[2] = sysctl_tcp_mem[2];
+
+	return 0;
+}
+EXPORT_SYMBOL(tcp_init_cgroup);
+
+void tcp_destroy_cgroup(const struct proto *prot, struct cgroup *cgrp,
+			struct cgroup_subsys *ss)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+	percpu_counter_destroy(&memcg->tcp.tcp_sockets_allocated);
+}
+EXPORT_SYMBOL(tcp_destroy_cgroup);
+#undef CONSTCG
 #endif /* CONFIG_INET */
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
@@ -4833,14 +4955,27 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
 
 	ret = cgroup_add_files(cont, ss, kmem_cgroup_files,
 			       ARRAY_SIZE(kmem_cgroup_files));
+
+	if (!ret)
+		ret = sockets_populate(cont, ss);
 	return ret;
 };
 
+static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+	sockets_destroy(cont, ss);
+}
 #else
 static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
 {
 	return 0;
 }
+
+static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+}
 #endif
 
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
@@ -5058,6 +5193,10 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	mem->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&mem->oom_notify);
 
+#if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_INET)
+	tcp_create_cgroup(mem, ss);
+#endif
+
 	if (parent)
 		mem->swappiness = mem_cgroup_swappiness(parent);
 	atomic_set(&mem->refcnt, 1);
@@ -5083,6 +5222,8 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
 
+	kmem_cgroup_destroy(ss, cont);
+
 	mem_cgroup_put(mem);
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 22ef143..3fa3ccb 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -135,6 +135,42 @@
 #include <net/tcp.h>
 #endif
 
+static DEFINE_RWLOCK(proto_list_lock);
+static LIST_HEAD(proto_list);
+
+int sockets_populate(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+	struct proto *proto;
+	int ret = 0;
+
+	read_lock(&proto_list_lock);
+	list_for_each_entry(proto, &proto_list, node) {
+		if (proto->init_cgroup)
+			ret = proto->init_cgroup(proto, cgrp, ss);
+			if (ret)
+				goto out;
+	}
+
+	read_unlock(&proto_list_lock);
+	return ret;
+out:
+	list_for_each_entry_continue_reverse(proto, &proto_list, node)
+		if (proto->destroy_cgroup)
+			proto->destroy_cgroup(proto, cgrp, ss);
+	read_unlock(&proto_list_lock);
+	return ret;
+}
+
+void sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+	struct proto *proto;
+	read_lock(&proto_list_lock);
+	list_for_each_entry_reverse(proto, &proto_list, node)
+		if (proto->destroy_cgroup)
+			proto->destroy_cgroup(proto, cgrp, ss);
+	read_unlock(&proto_list_lock);
+}
+
 /*
  * Each address family might have different locking rules, so we have
  * one slock key per address family:
@@ -2262,9 +2298,6 @@ void sk_common_release(struct sock *sk)
 }
 EXPORT_SYMBOL(sk_common_release);
 
-static DEFINE_RWLOCK(proto_list_lock);
-static LIST_HEAD(proto_list);
-
 #ifdef CONFIG_PROC_FS
 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
 struct prot_inuse {
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dc8f01e..259f6d9 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -290,13 +290,6 @@ EXPORT_SYMBOL(sysctl_tcp_mem);
 EXPORT_SYMBOL(sysctl_tcp_rmem);
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 
-atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
-
-/*
- * Current number of TCP sockets.
- */
-struct percpu_counter tcp_sockets_allocated;
-
 /*
  * TCP splice context
  */
@@ -306,47 +299,51 @@ struct tcp_splice_state {
 	unsigned int flags;
 };
 
-/*
- * Pressure flag: try to collapse.
- * Technical note: it is used by multiple contexts non atomically.
- * All the __sk_mem_schedule() is of this nature: accounting
- * is strict, actions are advisory and have some latency.
- */
+/* Current number of TCP sockets. */
+struct percpu_counter tcp_sockets_allocated;
+atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
 int tcp_memory_pressure __read_mostly;
 
-int *memory_pressure_tcp(const struct mem_cgroup *memcg)
+int *memory_pressure_tcp_nocg(const struct mem_cgroup *memcg)
 {
 	return &tcp_memory_pressure;
 }
-EXPORT_SYMBOL(memory_pressure_tcp);
+EXPORT_SYMBOL(memory_pressure_tcp_nocg);
 
-struct percpu_counter *sockets_allocated_tcp(const struct mem_cgroup *memcg)
+struct percpu_counter
+*sockets_allocated_tcp_nocg(const struct mem_cgroup *memcg)
 {
 	return &tcp_sockets_allocated;
 }
-EXPORT_SYMBOL(sockets_allocated_tcp);
+EXPORT_SYMBOL(sockets_allocated_tcp_nocg);
 
-void tcp_enter_memory_pressure(struct sock *sk)
+/*
+ * Pressure flag: try to collapse.
+ * Technical note: it is used by multiple contexts non atomically.
+ * All the __sk_mem_schedule() is of this nature: accounting
+ * is strict, actions are advisory and have some latency.
+ */
+void tcp_enter_memory_pressure_nocg(struct sock *sk)
 {
 	if (!tcp_memory_pressure) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
 		tcp_memory_pressure = 1;
 	}
 }
-EXPORT_SYMBOL(tcp_enter_memory_pressure);
+EXPORT_SYMBOL(tcp_enter_memory_pressure_nocg);
 
-long *tcp_sysctl_mem(const struct mem_cgroup *memcg)
+long *tcp_sysctl_mem_nocg(const struct mem_cgroup *memcg)
 {
 	return sysctl_tcp_mem;
 }
-EXPORT_SYMBOL(tcp_sysctl_mem);
+EXPORT_SYMBOL(tcp_sysctl_mem_nocg);
 
-long memory_allocated_tcp_add(struct mem_cgroup *memcg, long val,
-			      int *parent_status)
+long memory_allocated_tcp_add_nocg(struct mem_cgroup *memcg, long val,
+				   int *parent_status)
 {
 	return atomic_long_add_return(val, &tcp_memory_allocated);
 }
-EXPORT_SYMBOL(memory_allocated_tcp_add);
+EXPORT_SYMBOL(memory_allocated_tcp_add_nocg);
 
 /* Convert seconds to retransmits based on initial and max timeout */
 static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
@@ -3248,7 +3245,9 @@ void __init tcp_init(void)
 
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
 
+#ifndef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 	percpu_counter_init(&tcp_sockets_allocated, 0);
+#endif
 	percpu_counter_init(&tcp_orphan_count, 0);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7072060..aac71e9 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2607,12 +2607,23 @@ struct proto tcp_prot = {
 	.hash			= inet_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
+	.orphan_count		= &tcp_orphan_count,
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	.init_cgroup		= tcp_init_cgroup,
+	.destroy_cgroup		= tcp_destroy_cgroup,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
 	.memory_pressure	= memory_pressure_tcp,
 	.sockets_allocated	= sockets_allocated_tcp,
 	.orphan_count		= &tcp_orphan_count,
 	.mem_allocated_add	= memory_allocated_tcp_add,
 	.prot_mem		= tcp_sysctl_mem,
+#else
+	.enter_memory_pressure	= tcp_enter_memory_pressure_nocg,
+	.memory_pressure	= memory_pressure_tcp_nocg,
+	.sockets_allocated	= sockets_allocated_tcp_nocg,
+	.mem_allocated_add	= memory_allocated_tcp_add_nocg,
+	.prot_mem		= tcp_sysctl_mem_nocg,
+#endif
 	.sysctl_wmem		= sysctl_tcp_wmem,
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index bdc0003..0a52587 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2200,12 +2200,20 @@ struct proto tcpv6_prot = {
 	.hash			= tcp_v6_hash,
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
+	.orphan_count		= &tcp_orphan_count,
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
 	.sockets_allocated	= sockets_allocated_tcp,
 	.mem_allocated_add	= memory_allocated_tcp_add,
 	.memory_pressure	= memory_pressure_tcp,
-	.orphan_count		= &tcp_orphan_count,
 	.prot_mem		= tcp_sysctl_mem,
+#else
+	.enter_memory_pressure	= tcp_enter_memory_pressure_nocg,
+	.sockets_allocated	= sockets_allocated_tcp_nocg,
+	.mem_allocated_add	= memory_allocated_tcp_add_nocg,
+	.memory_pressure	= memory_pressure_tcp_nocg,
+	.prot_mem		= tcp_sysctl_mem_nocg,
+#endif
 	.sysctl_wmem		= sysctl_tcp_wmem,
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
-- 
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v7 3/8] foundations of per-cgroup memory pressure controlling.
From: Glauber Costa @ 2011-10-13 13:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: akpm, lizf, kamezawa.hiroyu, ebiederm, davem, paul, gthelen,
	netdev, linux-mm, kirill, avagin, devel, Glauber Costa
In-Reply-To: <1318511382-31051-1-git-send-email-glommer@parallels.com>

This patch converts struct sock fields memory_pressure,
memory_allocated, sockets_allocated, and sysctl_mem (now prot_mem)
to function pointers, receiving a struct mem_cgroup parameter.

enter_memory_pressure is kept the same, since all its callers
have socket a context, and the kmem_cgroup can be derived from
the socket itself.

To keep things working, the patch convert all users of those fields
to use acessor functions.

In my benchmarks I didn't see a significant performance difference
with this patch applied compared to a baseline (around 1 % diff, thus
inside error margin).

Signed-off-by: Glauber Costa <glommer@parallels.com>
Reviewed-by: KAMEZAWA Hiroyuki<kamezawa.hiroyu@jp.fujitsu.com>
CC: David S. Miller <davem@davemloft.net>
CC: Eric W. Biederman <ebiederm@xmission.com>
---
 crypto/af_alg.c             |    8 +++-
 include/linux/memcontrol.h  |   22 ++++++++-
 include/net/sock.h          |  114 ++++++++++++++++++++++++++++++++++++++++--
 include/net/tcp.h           |   12 +++--
 include/net/udp.h           |    4 +-
 include/trace/events/sock.h |   10 ++--
 mm/memcontrol.c             |   19 ++++++-
 net/core/sock.c             |   62 ++++++++++++++---------
 net/decnet/af_decnet.c      |   22 +++++++-
 net/ipv4/proc.c             |    7 ++-
 net/ipv4/tcp.c              |   28 +++++++++-
 net/ipv4/tcp_input.c        |   12 ++--
 net/ipv4/tcp_ipv4.c         |   12 ++--
 net/ipv4/tcp_output.c       |    2 +-
 net/ipv4/tcp_timer.c        |    2 +-
 net/ipv4/udp.c              |   21 ++++++--
 net/ipv6/tcp_ipv6.c         |   10 ++--
 net/ipv6/udp.c              |    4 +-
 net/sctp/socket.c           |   37 +++++++++++---
 19 files changed, 320 insertions(+), 88 deletions(-)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index ac33d5f..09cdf11 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -29,10 +29,16 @@ struct alg_type_list {
 
 static atomic_long_t alg_memory_allocated;
 
+static long memory_allocated_alg(struct mem_cgroup *memcg, long val,
+				 int *parent_status)
+{
+	return atomic_long_add_return(val, &alg_memory_allocated);
+}
+
 static struct proto alg_proto = {
 	.name			= "ALG",
 	.owner			= THIS_MODULE,
-	.memory_allocated	= &alg_memory_allocated,
+	.mem_allocated_add	= memory_allocated_alg,
 	.obj_size		= sizeof(struct alg_sock),
 };
 
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 88aea1b..99a8ba2 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -361,6 +361,10 @@ static inline
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
+static inline struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+{
+	return NULL;
+}
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
@@ -377,12 +381,28 @@ mem_cgroup_print_bad_page(struct page *page)
 #endif
 
 #ifdef CONFIG_INET
+enum {
+	UNDER_LIMIT,
+	OVER_LIMIT,
+};
+
 struct sock;
+struct proto;
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 void sock_update_memcg(struct sock *sk);
 void sock_release_memcg(struct sock *sk);
-
+void memcg_sockets_allocated_dec(struct mem_cgroup *memcg, struct proto *prot);
+void memcg_sockets_allocated_inc(struct mem_cgroup *memcg, struct proto *prot);
 #else
+/* memcontrol includes sockets.h, that includes memcontrol.h ... */
+static inline void memcg_sockets_allocated_dec(struct mem_cgroup *memcg,
+					       struct proto *prot)
+{
+}
+static inline void memcg_sockets_allocated_inc(struct mem_cgroup *memcg,
+					       struct proto *prot)
+{
+}
 static inline void sock_update_memcg(struct sock *sk)
 {
 }
diff --git a/include/net/sock.h b/include/net/sock.h
index afe1467..163f87b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -54,6 +54,7 @@
 #include <linux/security.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/cgroup.h>
 
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
@@ -168,6 +169,8 @@ struct sock_common {
 	/* public: */
 };
 
+struct mem_cgroup;
+
 /**
   *	struct sock - network layer representation of sockets
   *	@__sk_common: shared layout with inet_timewait_sock
@@ -786,18 +789,36 @@ struct proto {
 	unsigned int		inuse_idx;
 #endif
 
+	/*
+	 * per-cgroup memory tracking:
+	 *
+	 * The following functions track memory consumption of network buffers
+	 * by cgroup (kmem_cgroup) for the current protocol. As of the rest
+	 * of the fields in this structure, not all protocols are required
+	 * to implement them. Protocols that don't want to do per-cgroup
+	 * memory pressure management, can just assume the root cgroup is used.
+	 *
+	 */
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
-	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
-	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
 	/*
-	 * Pressure flag: try to collapse.
+	 * Add a value in pages to the current memory allocation,
+	 * and return the current value.
+	 */
+	long			(*mem_allocated_add)(struct mem_cgroup *memcg,
+						     long val, int *parent_status);
+	/* Pointer to the current number of sockets in this cgroup. */
+	struct percpu_counter	*(*sockets_allocated)(const struct mem_cgroup *memcg);
+	/*
+	 * Per cgroup pointer to the pressure flag: try to collapse.
 	 * Technical note: it is used by multiple contexts non atomically.
 	 * All the __sk_mem_schedule() is of this nature: accounting
 	 * is strict, actions are advisory and have some latency.
 	 */
-	int			*memory_pressure;
-	long			*sysctl_mem;
+	int			*(*memory_pressure)(const struct mem_cgroup *memcg);
+	/* Pointer to the per-cgroup version of the the sysctl_mem field */
+	long			*(*prot_mem)(const struct mem_cgroup *memcg);
+
 	int			*sysctl_wmem;
 	int			*sysctl_rmem;
 	int			max_header;
@@ -856,6 +877,87 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 #define sk_refcnt_debug_release(sk) do { } while (0)
 #endif /* SOCK_REFCNT_DEBUG */
 
+#include <linux/memcontrol.h>
+static inline int *sk_memory_pressure(struct sock *sk)
+{
+	int *ret = NULL;
+	if (sk->sk_prot->memory_pressure)
+		ret = sk->sk_prot->memory_pressure(sk->sk_cgrp);
+	return ret;
+}
+
+static inline long sk_prot_mem(struct sock *sk, int index)
+{
+	long *prot = sk->sk_prot->prot_mem(sk->sk_cgrp);
+	return prot[index];
+}
+
+static inline long
+sk_memory_allocated(struct sock *sk)
+{
+	struct proto *prot = sk->sk_prot;
+	struct mem_cgroup *cg = sk->sk_cgrp;
+
+	return prot->mem_allocated_add(cg, 0, NULL);
+}
+
+static inline long
+sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status)
+{
+	struct proto *prot = sk->sk_prot;
+	struct mem_cgroup *cg = sk->sk_cgrp;
+
+	return prot->mem_allocated_add(cg, amt, parent_status);
+}
+
+static inline void
+sk_memory_allocated_sub(struct sock *sk, int amt, int parent_status)
+{
+	struct proto *prot = sk->sk_prot;
+	struct mem_cgroup *cg = sk->sk_cgrp;
+
+	prot->mem_allocated_add(cg, -amt, &parent_status);
+}
+
+static inline void sk_sockets_allocated_dec(struct sock *sk)
+{
+	struct proto *prot = sk->sk_prot;
+	struct mem_cgroup *cg = sk->sk_cgrp;
+
+	percpu_counter_dec(prot->sockets_allocated(cg));
+	memcg_sockets_allocated_dec(cg, prot);
+}
+
+static inline void sk_sockets_allocated_inc(struct sock *sk)
+{
+	struct proto *prot = sk->sk_prot;
+	struct mem_cgroup *cg = sk->sk_cgrp;
+
+	percpu_counter_inc(prot->sockets_allocated(cg));
+	memcg_sockets_allocated_inc(cg, prot);
+}
+
+static inline int
+sk_sockets_allocated_read_positive(struct sock *sk)
+{
+	struct proto *prot = sk->sk_prot;
+	struct mem_cgroup *cg = sk->sk_cgrp;
+
+	return percpu_counter_sum_positive(prot->sockets_allocated(cg));
+}
+
+static inline int
+kcg_sockets_allocated_sum_positive(struct proto *prot, struct mem_cgroup *cg)
+{
+	return percpu_counter_sum_positive(prot->sockets_allocated(cg));
+}
+
+static inline long
+kcg_memory_allocated(struct proto *prot, struct mem_cgroup *cg)
+{
+	return prot->mem_allocated_add(cg, 0, NULL);
+}
+
 
 #ifdef CONFIG_PROC_FS
 /* Called with local bh disabled */
@@ -952,7 +1054,7 @@ static inline int sk_mem_pages(int amt)
 static inline int sk_has_account(struct sock *sk)
 {
 	/* return true if protocol supports memory accounting */
-	return !!sk->sk_prot->memory_allocated;
+	return !!sk->sk_prot->mem_allocated_add;
 }
 
 static inline int sk_wmem_schedule(struct sock *sk, int size)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index acc620a..eac7bf6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -45,6 +45,7 @@
 #include <net/dst.h>
 
 #include <linux/seq_file.h>
+#include <linux/memcontrol.h>
 
 extern struct inet_hashinfo tcp_hashinfo;
 
@@ -253,9 +254,12 @@ extern int sysctl_tcp_cookie_size;
 extern int sysctl_tcp_thin_linear_timeouts;
 extern int sysctl_tcp_thin_dupack;
 
-extern atomic_long_t tcp_memory_allocated;
-extern struct percpu_counter tcp_sockets_allocated;
-extern int tcp_memory_pressure;
+struct mem_cgroup;
+extern long *tcp_sysctl_mem(const struct mem_cgroup *memcg);
+struct percpu_counter *sockets_allocated_tcp(const struct mem_cgroup *memcg);
+int *memory_pressure_tcp(const struct mem_cgroup *memcg);
+long memory_allocated_tcp_add(struct mem_cgroup *memcg, long val,
+			      int *parent_status);
 
 /*
  * The next routines deal with comparing 32 bit unsigned ints
@@ -286,7 +290,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 	}
 
 	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
-	    atomic_long_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])
+	    sk_memory_allocated(sk) > sk_prot_mem(sk, 2))
 		return true;
 	return false;
 }
diff --git a/include/net/udp.h b/include/net/udp.h
index 67ea6fc..eecd727 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -105,7 +105,9 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 
 extern struct proto udp_prot;
 
-extern atomic_long_t udp_memory_allocated;
+long memory_allocated_udp_add(struct mem_cgroup *memcg, long val,
+			      int *parent_status);
+long *udp_sysctl_mem(const struct mem_cgroup *memcg);
 
 /* sysctl variables for udp */
 extern long sysctl_udp_mem[3];
diff --git a/include/trace/events/sock.h b/include/trace/events/sock.h
index 779abb9..12a6083 100644
--- a/include/trace/events/sock.h
+++ b/include/trace/events/sock.h
@@ -37,7 +37,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
 
 	TP_STRUCT__entry(
 		__array(char, name, 32)
-		__field(long *, sysctl_mem)
+		__field(long *, prot_mem)
 		__field(long, allocated)
 		__field(int, sysctl_rmem)
 		__field(int, rmem_alloc)
@@ -45,7 +45,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
 
 	TP_fast_assign(
 		strncpy(__entry->name, prot->name, 32);
-		__entry->sysctl_mem = prot->sysctl_mem;
+		__entry->prot_mem = sk->sk_prot->prot_mem(sk->sk_cgrp);
 		__entry->allocated = allocated;
 		__entry->sysctl_rmem = prot->sysctl_rmem[0];
 		__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
@@ -54,9 +54,9 @@ TRACE_EVENT(sock_exceed_buf_limit,
 	TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld "
 		"sysctl_rmem=%d rmem_alloc=%d",
 		__entry->name,
-		__entry->sysctl_mem[0],
-		__entry->sysctl_mem[1],
-		__entry->sysctl_mem[2],
+		__entry->prot_mem[0],
+		__entry->prot_mem[1],
+		__entry->prot_mem[2],
 		__entry->allocated,
 		__entry->sysctl_rmem,
 		__entry->rmem_alloc)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 623841d..4e71fd8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -376,6 +376,7 @@ enum mem_type {
 #define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
 #define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
 
+static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
 /* Writing them here to avoid exposing memcg's inner layout */
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 #ifdef CONFIG_INET
@@ -409,13 +410,27 @@ void sock_release_memcg(struct sock *sk)
 {
 	cgroup_release_and_wakeup_rmdir(mem_cgroup_css(sk->sk_cgrp));
 }
+
+void memcg_sockets_allocated_dec(struct mem_cgroup *memcg, struct proto *prot)
+{
+	memcg = parent_mem_cgroup(memcg);
+	for (; memcg; memcg = parent_mem_cgroup(memcg))
+		percpu_counter_dec(prot->sockets_allocated(memcg));
+}
+EXPORT_SYMBOL(memcg_sockets_allocated_dec);
+
+void memcg_sockets_allocated_inc(struct mem_cgroup *memcg, struct proto *prot)
+{
+	memcg = parent_mem_cgroup(memcg);
+	for (; memcg; memcg = parent_mem_cgroup(memcg))
+		percpu_counter_inc(prot->sockets_allocated(memcg));
+}
+EXPORT_SYMBOL(memcg_sockets_allocated_inc);
 #endif /* CONFIG_INET */
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
-
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
-static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
diff --git a/net/core/sock.c b/net/core/sock.c
index 5426ba0..22ef143 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1293,7 +1293,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		newsk->sk_wq = NULL;
 
 		if (newsk->sk_prot->sockets_allocated)
-			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
+			sk_sockets_allocated_inc(newsk);
 
 		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
 		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
@@ -1684,30 +1684,33 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 	struct proto *prot = sk->sk_prot;
 	int amt = sk_mem_pages(size);
 	long allocated;
+	int *memory_pressure;
+	int parent_status = UNDER_LIMIT;
 
 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-	allocated = atomic_long_add_return(amt, prot->memory_allocated);
+
+	memory_pressure = sk_memory_pressure(sk);
+	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
+
+	/* Over hard limit (we, or our parents) */
+	if ((parent_status == OVER_LIMIT) || (allocated > sk_prot_mem(sk, 2)))
+		goto suppress_allocation;
 
 	/* Under limit. */
-	if (allocated <= prot->sysctl_mem[0]) {
-		if (prot->memory_pressure && *prot->memory_pressure)
-			*prot->memory_pressure = 0;
-		return 1;
-	}
+	if (allocated <= sk_prot_mem(sk, 0))
+		if (memory_pressure && *memory_pressure)
+			*memory_pressure = 0;
 
 	/* Under pressure. */
-	if (allocated > prot->sysctl_mem[1])
+	if (allocated > sk_prot_mem(sk, 1))
 		if (prot->enter_memory_pressure)
 			prot->enter_memory_pressure(sk);
 
-	/* Over hard limit. */
-	if (allocated > prot->sysctl_mem[2])
-		goto suppress_allocation;
-
 	/* guarantee minimum buffer size under pressure */
 	if (kind == SK_MEM_RECV) {
 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
 			return 1;
+
 	} else { /* SK_MEM_SEND */
 		if (sk->sk_type == SOCK_STREAM) {
 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
@@ -1717,13 +1720,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 				return 1;
 	}
 
-	if (prot->memory_pressure) {
+	if (memory_pressure) {
 		int alloc;
 
-		if (!*prot->memory_pressure)
+		if (!*memory_pressure)
 			return 1;
-		alloc = percpu_counter_read_positive(prot->sockets_allocated);
-		if (prot->sysctl_mem[2] > alloc *
+		alloc = sk_sockets_allocated_read_positive(sk);
+		if (sk_prot_mem(sk, 2) > alloc *
 		    sk_mem_pages(sk->sk_wmem_queued +
 				 atomic_read(&sk->sk_rmem_alloc) +
 				 sk->sk_forward_alloc))
@@ -1746,7 +1749,9 @@ suppress_allocation:
 
 	/* Alas. Undo changes. */
 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
-	atomic_long_sub(amt, prot->memory_allocated);
+
+	sk_memory_allocated_sub(sk, amt, parent_status);
+
 	return 0;
 }
 EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1757,15 +1762,15 @@ EXPORT_SYMBOL(__sk_mem_schedule);
  */
 void __sk_mem_reclaim(struct sock *sk)
 {
-	struct proto *prot = sk->sk_prot;
+	int *memory_pressure = sk_memory_pressure(sk);
 
-	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
-		   prot->memory_allocated);
+	sk_memory_allocated_sub(sk,
+				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0);
 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
 
-	if (prot->memory_pressure && *prot->memory_pressure &&
-	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
-		*prot->memory_pressure = 0;
+	if (memory_pressure && *memory_pressure &&
+	    (sk_memory_allocated(sk) < sk_prot_mem(sk, 0)))
+		*memory_pressure = 0;
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
 
@@ -2484,13 +2489,20 @@ static char proto_method_implemented(const void *method)
 
 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
 {
+	struct mem_cgroup *cg = mem_cgroup_from_task(current);
+	int *memory_pressure = NULL;
+
+	if (proto->memory_pressure)
+		memory_pressure = proto->memory_pressure(cg);
+
 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
 		   proto->name,
 		   proto->obj_size,
 		   sock_prot_inuse_get(seq_file_net(seq), proto),
-		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
-		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+		   proto->mem_allocated_add != NULL ?
+			kcg_memory_allocated(proto, cg) : -1L,
+		   memory_pressure != NULL ? *memory_pressure ? "yes" : "no" : "NI",
 		   proto->max_header,
 		   proto->slab == NULL ? "no" : "yes",
 		   module_name(proto->owner),
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 19acd00..724ac73 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -458,13 +458,29 @@ static void dn_enter_memory_pressure(struct sock *sk)
 	}
 }
 
+static long memory_allocated_dn_add(struct mem_cgroup *memcg,
+					      long val, int *parent_status)
+{
+	return atomic_long_add_return(val, &decnet_memory_allocated);
+}
+
+static int *memory_pressure_dn(const struct mem_cgroup *memcg)
+{
+	return &dn_memory_pressure;
+}
+
+static long *dn_sysctl_mem(const struct mem_cgroup *memcg)
+{
+	return sysctl_decnet_mem;
+}
+
 static struct proto dn_proto = {
 	.name			= "NSP",
 	.owner			= THIS_MODULE,
 	.enter_memory_pressure	= dn_enter_memory_pressure,
-	.memory_pressure	= &dn_memory_pressure,
-	.memory_allocated	= &decnet_memory_allocated,
-	.sysctl_mem		= sysctl_decnet_mem,
+	.memory_pressure	= memory_pressure_dn,
+	.mem_allocated_add	= memory_allocated_dn_add,
+	.prot_mem		= dn_sysctl_mem,
 	.sysctl_wmem		= sysctl_decnet_wmem,
 	.sysctl_rmem		= sysctl_decnet_rmem,
 	.max_header		= DN_MAX_NSP_DATA_HEADER + 64,
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4bfad5d..535456d 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -52,20 +52,21 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq->private;
 	int orphans, sockets;
+	struct mem_cgroup *cg = mem_cgroup_from_task(current);
 
 	local_bh_disable();
 	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
-	sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
+	sockets = kcg_sockets_allocated_sum_positive(&tcp_prot, cg);
 	local_bh_enable();
 
 	socket_seq_show(seq);
 	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
 		   sock_prot_inuse_get(net, &tcp_prot), orphans,
 		   tcp_death_row.tw_count, sockets,
-		   atomic_long_read(&tcp_memory_allocated));
+		   kcg_memory_allocated(&tcp_prot, cg));
 	seq_printf(seq, "UDP: inuse %d mem %ld\n",
 		   sock_prot_inuse_get(net, &udp_prot),
-		   atomic_long_read(&udp_memory_allocated));
+		   kcg_memory_allocated(&udp_prot, cg));
 	seq_printf(seq, "UDPLITE: inuse %d\n",
 		   sock_prot_inuse_get(net, &udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 46febca..dc8f01e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -291,13 +291,11 @@ EXPORT_SYMBOL(sysctl_tcp_rmem);
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 
 atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
-EXPORT_SYMBOL(tcp_memory_allocated);
 
 /*
  * Current number of TCP sockets.
  */
 struct percpu_counter tcp_sockets_allocated;
-EXPORT_SYMBOL(tcp_sockets_allocated);
 
 /*
  * TCP splice context
@@ -315,7 +313,18 @@ struct tcp_splice_state {
  * is strict, actions are advisory and have some latency.
  */
 int tcp_memory_pressure __read_mostly;
-EXPORT_SYMBOL(tcp_memory_pressure);
+
+int *memory_pressure_tcp(const struct mem_cgroup *memcg)
+{
+	return &tcp_memory_pressure;
+}
+EXPORT_SYMBOL(memory_pressure_tcp);
+
+struct percpu_counter *sockets_allocated_tcp(const struct mem_cgroup *memcg)
+{
+	return &tcp_sockets_allocated;
+}
+EXPORT_SYMBOL(sockets_allocated_tcp);
 
 void tcp_enter_memory_pressure(struct sock *sk)
 {
@@ -326,6 +335,19 @@ void tcp_enter_memory_pressure(struct sock *sk)
 }
 EXPORT_SYMBOL(tcp_enter_memory_pressure);
 
+long *tcp_sysctl_mem(const struct mem_cgroup *memcg)
+{
+	return sysctl_tcp_mem;
+}
+EXPORT_SYMBOL(tcp_sysctl_mem);
+
+long memory_allocated_tcp_add(struct mem_cgroup *memcg, long val,
+			      int *parent_status)
+{
+	return atomic_long_add_return(val, &tcp_memory_allocated);
+}
+EXPORT_SYMBOL(memory_allocated_tcp_add);
+
 /* Convert seconds to retransmits based on initial and max timeout */
 static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
 {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d73aab3..87520ed 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -316,7 +316,7 @@ static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
 	/* Check #1 */
 	if (tp->rcv_ssthresh < tp->window_clamp &&
 	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
-	    !tcp_memory_pressure) {
+	    !sk_memory_pressure(sk)) {
 		int incr;
 
 		/* Check #2. Increase window, if skb with such overhead
@@ -398,8 +398,8 @@ static void tcp_clamp_window(struct sock *sk)
 
 	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
-	    !tcp_memory_pressure &&
-	    atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+	    !sk_memory_pressure(sk) &&
+	    sk_memory_allocated(sk) < sk_prot_mem(sk, 0)) {
 		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
 				    sysctl_tcp_rmem[2]);
 	}
@@ -4804,7 +4804,7 @@ static int tcp_prune_queue(struct sock *sk)
 
 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		tcp_clamp_window(sk);
-	else if (tcp_memory_pressure)
+	else if (sk_memory_pressure(sk))
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
 
 	tcp_collapse_ofo_queue(sk);
@@ -4870,11 +4870,11 @@ static int tcp_should_expand_sndbuf(struct sock *sk)
 		return 0;
 
 	/* If we are under global TCP memory pressure, do not expand.  */
-	if (tcp_memory_pressure)
+	if (sk_memory_pressure(sk))
 		return 0;
 
 	/* If we are under soft global TCP memory pressure, do not expand.  */
-	if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+	if (sk_memory_allocated(sk) >= sk_prot_mem(sk, 0))
 		return 0;
 
 	/* If we filled the congestion window, do not expand.  */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7963e03..7072060 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1911,7 +1911,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
 	local_bh_disable();
-	percpu_counter_inc(&tcp_sockets_allocated);
+	sk_sockets_allocated_inc(sk);
 	local_bh_enable();
 
 	return 0;
@@ -1967,7 +1967,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 		tp->cookie_values = NULL;
 	}
 
-	percpu_counter_dec(&tcp_sockets_allocated);
+	sk_sockets_allocated_dec(sk);
 }
 EXPORT_SYMBOL(tcp_v4_destroy_sock);
 
@@ -2608,11 +2608,11 @@ struct proto tcp_prot = {
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
-	.sockets_allocated	= &tcp_sockets_allocated,
+	.memory_pressure	= memory_pressure_tcp,
+	.sockets_allocated	= sockets_allocated_tcp,
 	.orphan_count		= &tcp_orphan_count,
-	.memory_allocated	= &tcp_memory_allocated,
-	.memory_pressure	= &tcp_memory_pressure,
-	.sysctl_mem		= sysctl_tcp_mem,
+	.mem_allocated_add	= memory_allocated_tcp_add,
+	.prot_mem		= tcp_sysctl_mem,
 	.sysctl_wmem		= sysctl_tcp_wmem,
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 882e0b0..06aeb31 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1912,7 +1912,7 @@ u32 __tcp_select_window(struct sock *sk)
 	if (free_space < (full_space >> 1)) {
 		icsk->icsk_ack.quick = 0;
 
-		if (tcp_memory_pressure)
+		if (sk_memory_pressure(sk))
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
 					       4U * tp->advmss);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index ecd44b0..2c67617 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -261,7 +261,7 @@ static void tcp_delack_timer(unsigned long data)
 	}
 
 out:
-	if (tcp_memory_pressure)
+	if (sk_memory_pressure(sk))
 		sk_mem_reclaim(sk);
 out_unlock:
 	bh_unlock_sock(sk);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 1b5a193..21604b4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -120,9 +120,6 @@ EXPORT_SYMBOL(sysctl_udp_rmem_min);
 int sysctl_udp_wmem_min __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_wmem_min);
 
-atomic_long_t udp_memory_allocated;
-EXPORT_SYMBOL(udp_memory_allocated);
-
 #define MAX_UDP_PORTS 65536
 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
 
@@ -1918,6 +1915,20 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 }
 EXPORT_SYMBOL(udp_poll);
 
+static atomic_long_t udp_memory_allocated;
+long memory_allocated_udp_add(struct mem_cgroup *memcg, long val,
+			      int *parent_status)
+{
+	return atomic_long_add_return(val, &udp_memory_allocated);
+}
+EXPORT_SYMBOL(memory_allocated_udp_add);
+
+long *udp_sysctl_mem(const struct mem_cgroup *memcg)
+{
+	return sysctl_udp_mem;
+}
+EXPORT_SYMBOL(udp_sysctl_mem);
+
 struct proto udp_prot = {
 	.name		   = "UDP",
 	.owner		   = THIS_MODULE,
@@ -1936,8 +1947,8 @@ struct proto udp_prot = {
 	.unhash		   = udp_lib_unhash,
 	.rehash		   = udp_v4_rehash,
 	.get_port	   = udp_v4_get_port,
-	.memory_allocated  = &udp_memory_allocated,
-	.sysctl_mem	   = sysctl_udp_mem,
+	.mem_allocated_add = &memory_allocated_udp_add,
+	.prot_mem	   = udp_sysctl_mem,
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp_sock),
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7b8fc57..bdc0003 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1992,7 +1992,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
 	local_bh_disable();
-	percpu_counter_inc(&tcp_sockets_allocated);
+	sk_sockets_allocated_inc(sk);
 	local_bh_enable();
 
 	return 0;
@@ -2201,11 +2201,11 @@ struct proto tcpv6_prot = {
 	.unhash			= inet_unhash,
 	.get_port		= inet_csk_get_port,
 	.enter_memory_pressure	= tcp_enter_memory_pressure,
-	.sockets_allocated	= &tcp_sockets_allocated,
-	.memory_allocated	= &tcp_memory_allocated,
-	.memory_pressure	= &tcp_memory_pressure,
+	.sockets_allocated	= sockets_allocated_tcp,
+	.mem_allocated_add	= memory_allocated_tcp_add,
+	.memory_pressure	= memory_pressure_tcp,
 	.orphan_count		= &tcp_orphan_count,
-	.sysctl_mem		= sysctl_tcp_mem,
+	.prot_mem		= tcp_sysctl_mem,
 	.sysctl_wmem		= sysctl_tcp_wmem,
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index bb95e8e..0be7cbc 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1465,8 +1465,8 @@ struct proto udpv6_prot = {
 	.unhash		   = udp_lib_unhash,
 	.rehash		   = udp_v6_rehash,
 	.get_port	   = udp_v6_get_port,
-	.memory_allocated  = &udp_memory_allocated,
-	.sysctl_mem	   = sysctl_udp_mem,
+	.mem_allocated_add = memory_allocated_udp_add,
+	.prot_mem	   = udp_sysctl_mem,
 	.sysctl_wmem	   = &sysctl_udp_wmem_min,
 	.sysctl_rmem	   = &sysctl_udp_rmem_min,
 	.obj_size	   = sizeof(struct udp6_sock),
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 836aa63..8c0cde9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -119,11 +119,32 @@ static int sctp_memory_pressure;
 static atomic_long_t sctp_memory_allocated;
 struct percpu_counter sctp_sockets_allocated;
 
+static long *sctp_sysctl_mem(const struct mem_cgroup *memcg)
+{
+	return sysctl_sctp_mem;
+}
+
 static void sctp_enter_memory_pressure(struct sock *sk)
 {
 	sctp_memory_pressure = 1;
 }
 
+static int *memory_pressure_sctp(const struct mem_cgroup *memcg)
+{
+	return &sctp_memory_pressure;
+}
+
+static long memory_allocated_sctp_add(struct mem_cgroup *memcg, long val,
+				      int *parent_status)
+{
+	return atomic_long_add_return(val, &sctp_memory_allocated);
+}
+
+static struct
+percpu_counter *sockets_allocated_sctp(const struct mem_cgroup *memcg)
+{
+	return &sctp_sockets_allocated;
+}
 
 /* Get the sndbuf space available at the time on the association.  */
 static inline int sctp_wspace(struct sctp_association *asoc)
@@ -6831,13 +6852,13 @@ struct proto sctp_prot = {
 	.unhash      =	sctp_unhash,
 	.get_port    =	sctp_get_port,
 	.obj_size    =  sizeof(struct sctp_sock),
-	.sysctl_mem  =  sysctl_sctp_mem,
+	.prot_mem    =  sctp_sysctl_mem,
 	.sysctl_rmem =  sysctl_sctp_rmem,
 	.sysctl_wmem =  sysctl_sctp_wmem,
-	.memory_pressure = &sctp_memory_pressure,
+	.memory_pressure = memory_pressure_sctp,
 	.enter_memory_pressure = sctp_enter_memory_pressure,
-	.memory_allocated = &sctp_memory_allocated,
-	.sockets_allocated = &sctp_sockets_allocated,
+	.mem_allocated_add = memory_allocated_sctp_add,
+	.sockets_allocated = sockets_allocated_sctp,
 };
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
@@ -6863,12 +6884,12 @@ struct proto sctpv6_prot = {
 	.unhash		= sctp_unhash,
 	.get_port	= sctp_get_port,
 	.obj_size	= sizeof(struct sctp6_sock),
-	.sysctl_mem	= sysctl_sctp_mem,
+	.prot_mem	= sctp_sysctl_mem,
 	.sysctl_rmem	= sysctl_sctp_rmem,
 	.sysctl_wmem	= sysctl_sctp_wmem,
-	.memory_pressure = &sctp_memory_pressure,
+	.memory_pressure = memory_pressure_sctp,
 	.enter_memory_pressure = sctp_enter_memory_pressure,
-	.memory_allocated = &sctp_memory_allocated,
-	.sockets_allocated = &sctp_sockets_allocated,
+	.mem_allocated_add = memory_allocated_sctp_add,
+	.sockets_allocated = sockets_allocated_sctp,
 };
 #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
-- 
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v7 2/8] socket: initial cgroup code.
From: Glauber Costa @ 2011-10-13 13:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: akpm, lizf, kamezawa.hiroyu, ebiederm, davem, paul, gthelen,
	netdev, linux-mm, kirill, avagin, devel, Glauber Costa
In-Reply-To: <1318511382-31051-1-git-send-email-glommer@parallels.com>

We aim to control the amount of kernel memory pinned at any
time by tcp sockets. To lay the foundations for this work,
this patch adds a pointer to the kmem_cgroup to the socket
structure.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Acked-by: Kirill A. Shutemov<kirill@shutemov.name>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujtsu.com>
CC: David S. Miller <davem@davemloft.net>
CC: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/memcontrol.h |   15 +++++++++++++++
 include/net/sock.h         |    2 ++
 mm/memcontrol.c            |   37 +++++++++++++++++++++++++++++++++++++
 net/core/sock.c            |    3 +++
 4 files changed, 57 insertions(+), 0 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 343bd76..88aea1b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -376,5 +376,20 @@ mem_cgroup_print_bad_page(struct page *page)
 }
 #endif
 
+#ifdef CONFIG_INET
+struct sock;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+void sock_update_memcg(struct sock *sk);
+void sock_release_memcg(struct sock *sk);
+
+#else
+static inline void sock_update_memcg(struct sock *sk)
+{
+}
+static inline void sock_release_memcg(struct sock *sk)
+{
+}
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
+#endif /* CONFIG_INET */
 #endif /* _LINUX_MEMCONTROL_H */
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 8e4062f..afe1467 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -228,6 +228,7 @@ struct sock_common {
   *	@sk_security: used by security modules
   *	@sk_mark: generic packet mark
   *	@sk_classid: this socket's cgroup classid
+  *	@sk_cgrp: this socket's kernel memory (kmem) cgroup
   *	@sk_write_pending: a write to stream socket waits to start
   *	@sk_state_change: callback to indicate change in the state of the sock
   *	@sk_data_ready: callback to indicate there is data to be processed
@@ -339,6 +340,7 @@ struct sock {
 #endif
 	__u32			sk_mark;
 	u32			sk_classid;
+	struct mem_cgroup	*sk_cgrp;
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk, int bytes);
 	void			(*sk_write_space)(struct sock *sk);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4f8a5bb..623841d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -376,6 +376,43 @@ enum mem_type {
 #define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
 #define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
 
+/* Writing them here to avoid exposing memcg's inner layout */
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_INET
+#include <net/sock.h>
+
+void sock_update_memcg(struct sock *sk)
+{
+	/* right now a socket spends its whole life in the same cgroup */
+	if (sk->sk_cgrp) {
+		WARN_ON(1);
+		return;
+	}
+
+	rcu_read_lock();
+	sk->sk_cgrp = mem_cgroup_from_task(current);
+
+	/*
+	 * We don't need to protect against anything task-related, because
+	 * we are basically stuck with the sock pointer that won't change,
+	 * even if the task that originated the socket changes cgroups.
+	 *
+	 * What we do have to guarantee, is that the chain leading us to
+	 * the top level won't change under our noses. Incrementing the
+	 * reference count via cgroup_exclude_rmdir guarantees that.
+	 */
+	cgroup_exclude_rmdir(mem_cgroup_css(sk->sk_cgrp));
+	rcu_read_unlock();
+}
+
+void sock_release_memcg(struct sock *sk)
+{
+	cgroup_release_and_wakeup_rmdir(mem_cgroup_css(sk->sk_cgrp));
+}
+#endif /* CONFIG_INET */
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
+
+
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
diff --git a/net/core/sock.c b/net/core/sock.c
index bc745d0..5426ba0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -125,6 +125,7 @@
 #include <net/xfrm.h>
 #include <linux/ipsec.h>
 #include <net/cls_cgroup.h>
+#include <linux/memcontrol.h>
 
 #include <linux/filter.h>
 
@@ -1141,6 +1142,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
 		atomic_set(&sk->sk_wmem_alloc, 1);
 
 		sock_update_classid(sk);
+		sock_update_memcg(sk);
 	}
 
 	return sk;
@@ -1172,6 +1174,7 @@ static void __sk_free(struct sock *sk)
 		put_cred(sk->sk_peer_cred);
 	put_pid(sk->sk_peer_pid);
 	put_net(sock_net(sk));
+	sock_release_memcg(sk);
 	sk_prot_free(sk->sk_prot_creator, sk);
 }
 
-- 
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v7 1/8] Basic kernel memory functionality for the Memory Controller
From: Glauber Costa @ 2011-10-13 13:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: akpm, lizf, kamezawa.hiroyu, ebiederm, davem, paul, gthelen,
	netdev, linux-mm, kirill, avagin, devel, Glauber Costa
In-Reply-To: <1318511382-31051-1-git-send-email-glommer@parallels.com>

This patch lays down the foundation for the kernel memory component
of the Memory Controller.

As of today, I am only laying down the following files:

 * memory.independent_kmem_limit
 * memory.kmem.limit_in_bytes (currently ignored)
 * memory.kmem.usage_in_bytes (always zero)

Signed-off-by: Glauber Costa <glommer@parallels.com>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Paul Menage <paul@paulmenage.org>
CC: Greg Thelen <gthelen@google.com>
---
 Documentation/cgroups/memory.txt |   36 ++++++++++++++-
 init/Kconfig                     |   14 ++++++
 mm/memcontrol.c                  |   89 +++++++++++++++++++++++++++++++++++--
 3 files changed, 132 insertions(+), 7 deletions(-)

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 06eb6d9..0dafd70 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -44,8 +44,9 @@ Features:
  - oom-killer disable knob and oom-notifier
  - Root cgroup has no limit controls.
 
- Kernel memory and Hugepages are not under control yet. We just manage
- pages on LRU. To add more controls, we have to take care of performance.
+ Hugepages is not under control yet. We just manage pages on LRU. To add more
+ controls, we have to take care of performance. Kernel memory support is work
+ in progress, and the current version provides basically functionality.
 
 Brief summary of control files.
 
@@ -56,8 +57,11 @@ Brief summary of control files.
 				 (See 5.5 for details)
  memory.memsw.usage_in_bytes	 # show current res_counter usage for memory+Swap
 				 (See 5.5 for details)
+ memory.kmem.usage_in_bytes	 # show current res_counter usage for kmem only.
+				 (See 2.7 for details)
  memory.limit_in_bytes		 # set/show limit of memory usage
  memory.memsw.limit_in_bytes	 # set/show limit of memory+Swap usage
+ memory.kmem.limit_in_bytes	 # if allowed, set/show limit of kernel memory
  memory.failcnt			 # show the number of memory usage hits limits
  memory.memsw.failcnt		 # show the number of memory+Swap hits limits
  memory.max_usage_in_bytes	 # show max memory usage recorded
@@ -72,6 +76,9 @@ Brief summary of control files.
  memory.oom_control		 # set/show oom controls.
  memory.numa_stat		 # show the number of memory usage per numa node
 
+ memory.independent_kmem_limit	 # select whether or not kernel memory limits are
+				   independent of user limits
+
 1. History
 
 The memory controller has a long history. A request for comments for the memory
@@ -255,6 +262,31 @@ When oom event notifier is registered, event will be delivered.
   per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
   zone->lru_lock, it has no lock of its own.
 
+2.7 Kernel Memory Extension (CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
+
+With the Kernel memory extension, the Memory Controller is able to limit
+the amount of kernel memory used by the system. Kernel memory is fundamentally
+different than user memory, since it can't be swapped out, which makes it
+possible to DoS the system by consuming too much of this precious resource.
+Kernel memory limits are not imposed for the root cgroup.
+
+Memory limits as specified by the standard Memory Controller may or may not
+take kernel memory into consideration. This is achieved through the file
+memory.independent_kmem_limit. A value different than 0 will allow for kernel
+memory to be controlled separately.
+
+When kernel memory limits are not independent, the limit values set in
+memory.kmem files are ignored.
+
+Currently no soft limit is implemented for kernel memory. It is future work
+to trigger slab reclaim when those limits are reached.
+
+CAUTION: As of this writing, the kmem extention may prevent tasks from moving
+among cgroups. If a task has kmem accounting in a cgroup, the task cannot be
+moved until the kmem resource is released. Also, until the resource is fully
+released, the cgroup cannot be destroyed. So, please consider your use cases
+and set kmem extention config option carefully.
+
 3. User Interface
 
 0. Configuration
diff --git a/init/Kconfig b/init/Kconfig
index d627783..b62b9e0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -689,6 +689,20 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
 	  For those who want to have the feature enabled by default should
 	  select this option (if, for some reason, they need to disable it
 	  then swapaccount=0 does the trick).
+config CGROUP_MEM_RES_CTLR_KMEM
+	bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)"
+	depends on CGROUP_MEM_RES_CTLR && EXPERIMENTAL
+	default n
+	help
+	  The Kernel Memory extension for Memory Resource Controller can limit
+	  the amount of memory used by kernel objects in the system. Those are
+	  fundamentally different from the entities handled by the standard
+	  Memory Controller, which are page-based, and can be swapped. Users of
+	  the kmem extension can use it to guarantee that no group of processes
+	  will ever exhaust kernel resources alone.
+
+	  WARNING: The current experimental implementation does not allow a
+	  task to move among different cgroups with a kmem resource being held.
 
 config CGROUP_PERF
 	bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3508777..4f8a5bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -226,6 +226,10 @@ struct mem_cgroup {
 	 */
 	struct res_counter memsw;
 	/*
+	 * the counter to account for kmem usage.
+	 */
+	struct res_counter kmem;
+	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
 	 */
@@ -276,6 +280,11 @@ struct mem_cgroup {
 	 */
 	unsigned long 	move_charge_at_immigrate;
 	/*
+	 * Should kernel memory limits be stabilished independently
+	 * from user memory ?
+	 */
+	int		kmem_independent_accounting;
+	/*
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu *stat;
@@ -343,9 +352,14 @@ enum charge_type {
 };
 
 /* for encoding cft->private value on file */
-#define _MEM			(0)
-#define _MEMSWAP		(1)
-#define _OOM_TYPE		(2)
+
+enum mem_type {
+	_MEM = 0,
+	_MEMSWAP,
+	_OOM_TYPE,
+	_KMEM,
+};
+
 #define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
 #define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
@@ -3837,10 +3851,15 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
 	u64 val;
 
 	if (!mem_cgroup_is_root(mem)) {
+		val = 0;
+		if (!mem->kmem_independent_accounting)
+			val = res_counter_read_u64(&mem->kmem, RES_USAGE);
 		if (!swap)
-			return res_counter_read_u64(&mem->res, RES_USAGE);
+			val += res_counter_read_u64(&mem->res, RES_USAGE);
 		else
-			return res_counter_read_u64(&mem->memsw, RES_USAGE);
+			val += res_counter_read_u64(&mem->memsw, RES_USAGE);
+
+		return val;
 	}
 
 	val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
@@ -3873,6 +3892,10 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 		else
 			val = res_counter_read_u64(&mem->memsw, name);
 		break;
+	case _KMEM:
+		val = res_counter_read_u64(&mem->kmem, name);
+		break;
+
 	default:
 		BUG();
 		break;
@@ -4603,6 +4626,20 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
 }
 #endif /* CONFIG_NUMA */
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+static u64 kmem_limit_independent_read(struct cgroup *cont, struct cftype *cft)
+{
+	return mem_cgroup_from_cont(cont)->kmem_independent_accounting;
+}
+
+static int kmem_limit_independent_write(struct cgroup *cont, struct cftype *cft,
+					u64 val)
+{
+	mem_cgroup_from_cont(cont)->kmem_independent_accounting = !!val;
+	return 0;
+}
+#endif
+
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -4718,6 +4755,42 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
 }
 #endif
 
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+static struct cftype kmem_cgroup_files[] = {
+	{
+		.name = "independent_kmem_limit",
+		.read_u64 = kmem_limit_independent_read,
+		.write_u64 = kmem_limit_independent_write,
+	},
+	{
+		.name = "kmem.usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "kmem.limit_in_bytes",
+		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
+		.read_u64 = mem_cgroup_read,
+	},
+};
+
+static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+	int ret = 0;
+
+	ret = cgroup_add_files(cont, ss, kmem_cgroup_files,
+			       ARRAY_SIZE(kmem_cgroup_files));
+	return ret;
+};
+
+#else
+static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+	return 0;
+}
+#endif
+
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
 	struct mem_cgroup_per_node *pn;
@@ -4916,6 +4989,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	if (parent && parent->use_hierarchy) {
 		res_counter_init(&mem->res, &parent->res);
 		res_counter_init(&mem->memsw, &parent->memsw);
+		res_counter_init(&mem->kmem, &parent->kmem);
 		/*
 		 * We increment refcnt of the parent to ensure that we can
 		 * safely access it on res_counter_charge/uncharge.
@@ -4926,6 +5000,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	} else {
 		res_counter_init(&mem->res, NULL);
 		res_counter_init(&mem->memsw, NULL);
+		res_counter_init(&mem->kmem, NULL);
 	}
 	mem->last_scanned_child = 0;
 	mem->last_scanned_node = MAX_NUMNODES;
@@ -4969,6 +5044,10 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
 
 	if (!ret)
 		ret = register_memsw_files(cont, ss);
+
+	if (!ret)
+		ret = register_kmem_files(cont, ss);
+
 	return ret;
 }
 
-- 
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v7 0/8] Request for inclusion: tcp memory buffers
From: Glauber Costa @ 2011-10-13 13:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: akpm, lizf, kamezawa.hiroyu, ebiederm, davem, paul, gthelen,
	netdev, linux-mm, kirill, avagin, devel

Hi Andrew,

This series was extensively reviewed over the past month, and after
all major comments were merged, I feel it is ready for inclusion when
the next merge window opens. Minor fixes will be provided if they
prove to be necessary.

You can see the most recent past discussions at:

https://lkml.org/lkml/2011/10/10/116
https://lkml.org/lkml/2011/10/4/116
https://lkml.org/lkml/2011/10/3/133

It basically lays the foundation for kernel memory limitation as a
general framework, and uses it to control the existent tcp memory pressure
thresholds in a per-cgroup way.

Follow up patches are expected for soft limits, which are not handled.

Please consider this for inclusion in your tree

Thanks,

Glauber Costa (8):
  Basic kernel memory functionality for the Memory Controller
  socket: initial cgroup code.
  foundations of per-cgroup memory pressure controlling.
  per-cgroup tcp buffers control
  per-netns ipv4 sysctl_tcp_mem
  tcp buffer limitation: per-cgroup limit
  Display current tcp memory allocation in kmem cgroup
  Disable task moving when using kernel memory accounting

 Documentation/cgroups/memory.txt |   38 ++++-
 crypto/af_alg.c                  |    8 +-
 include/linux/memcontrol.h       |   49 +++++
 include/net/netns/ipv4.h         |    1 +
 include/net/sock.h               |  130 +++++++++++++-
 include/net/tcp.h                |   30 +++-
 include/net/udp.h                |    4 +-
 include/trace/events/sock.h      |   10 +-
 init/Kconfig                     |   14 ++
 mm/memcontrol.c                  |  373 ++++++++++++++++++++++++++++++++++++--
 net/core/sock.c                  |  104 ++++++++---
 net/decnet/af_decnet.c           |   22 ++-
 net/ipv4/proc.c                  |    7 +-
 net/ipv4/sysctl_net_ipv4.c       |   71 +++++++-
 net/ipv4/tcp.c                   |   60 ++++---
 net/ipv4/tcp_input.c             |   12 +-
 net/ipv4/tcp_ipv4.c              |   23 ++-
 net/ipv4/tcp_output.c            |    2 +-
 net/ipv4/tcp_timer.c             |    2 +-
 net/ipv4/udp.c                   |   21 ++-
 net/ipv6/tcp_ipv6.c              |   20 ++-
 net/ipv6/udp.c                   |    4 +-
 net/sctp/socket.c                |   37 +++-
 23 files changed, 910 insertions(+), 132 deletions(-)

-- 
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: Asserting ECN from userspace?
From: David Täht @ 2011-10-13 12:44 UTC (permalink / raw)
  To: bloat, netdev, Juliusz Chroboczek

[-- Attachment #1: Type: text/plain, Size: 3499 bytes --]

My original attempt at sending this was blocked due to the ip address
embedded in the url I'd included.... resending...

On 10/13/2011 01:30 PM, Juliusz Chroboczek wrote:
> Dave,
>
> I'm not sure what you are getting at.  ECN is designed for routers, not
> for end-points.

Which happens to be what I'm primarily working on at the moment.
Secondly, in mesh networks, all machines are routers.

>   Assering ECN congestion-experienced at the sender will
> cause the sender to react to the congestion indication after a whole RTT
> (after the ECN-echo is received).  For end-to-end flow control, it is
> both simpler and more efficient to reduce the sending rate immediately,
> without going over the network.
>
> There's a good reason why we're careful to distinguish congestion
> control (router-to-endpoint) and flow control (endpoint-to-endpoint).
> The latter is much easier.
My message was overly broad in scope and I failed to distinguish between 'asserting ECN from userspace', and 'indicating a flow was ECN capable'

>> 1) Applications such as bittorrent (transmission, etc) that are much
>> more aware of their overall environment could assert ECN on their UDP
>> streams to indicate congestion.
> The sender can react to congestion by simply reducing the sending rate.
> The receiver can react to congestion by pipelining fewer chunk requests
> to the sender.
>

And what I meant here, was more 'indicate a flow is ecn capable' than
'assert ECN', so routers in the middle can do better signalling.

...although I can still imagine circumstances where asserting ECN makes
sense at the endpoint. See for example:

http://en.wikipedia.org/wiki/Data_Center_TCP

>> 3) Web Proxies. A web proxy could note when it was experiencing
>> congestion on one side of the proxied connection (or another) and signal
>> the other side to slow down.
> It can cause the other side to slow down by simply stopping reading,
> thus causing the normal TCP flow control (not congestion control) to
> kick in.

Yes, but in this case a TCP flow will continue to scale the window and
thus sending rate until those buffers fill. Signalling ECN sooner would
allow a middlebox/web proxy/split tcp session that is blocking on send
to throttle the overall rate from the other side at the speed it is
actually capable of retransmitting.

> What would be useful, on the other hand, would be the ability to set the
> ECN enabled codepoint on UDP packets, 

Agreed.

> and have some means to reliably
> check whether the Congestion-Experienced codepoint has been set by some
> intermediate router. 
>From the previous discussion on this thread it looks as though the core
capabilities exist, if not application code... which looks simple to
play with, and then apply to the AQM work ongoing.

>  But that's different from what you appear to be
> suggesting.
>

You make my thoughts clearer, as always.

Also, as we discussed elsewhere, setting the ECN capable bit on streams
such as VOIP has a packet
preservation-through-a-single-congested-endpoint feature that may well
be useful.

Completely as a side note, I've been looking over the shortest queue
first algorithm for AQM, which is quite promising in and of itself,
without qos or signalling needed at all.

My first attempt at sending this mail failed due to the embedded ip
address for the paper in the google search - if you search for this, you
can find the paper I'm referring to:

  Self-Prioritization of Audio and Video Traffic

> -- Juliusz

-- 
Dave Täht

[-- Attachment #2: dave_taht.vcf --]
[-- Type: text/x-vcard, Size: 204 bytes --]

begin:vcard
fn;quoted-printable:Dave T=C3=A4ht
n;quoted-printable:T=C3=A4ht;Dave
email;internet:dave.taht@gmail.com
tel;home:1-239-829-5608
tel;cell:0638645374
x-mozilla-html:FALSE
version:2.1
end:vcard

^ permalink raw reply

* Re: Asserting ECN from userspace?
From: Juliusz Chroboczek @ 2011-10-13 11:30 UTC (permalink / raw)
  To: David Täht; +Cc: netdev, bloat-devel
In-Reply-To: <4E8BF6B2.6030101@gmail.com>

Dave,

I'm not sure what you are getting at.  ECN is designed for routers, not
for end-points.  Assering ECN congestion-experienced at the sender will
cause the sender to react to the congestion indication after a whole RTT
(after the ECN-echo is received).  For end-to-end flow control, it is
both simpler and more efficient to reduce the sending rate immediately,
without going over the network.

There's a good reason why we're careful to distinguish congestion
control (router-to-endpoint) and flow control (endpoint-to-endpoint).
The latter is much easier.

> 1) Applications such as bittorrent (transmission, etc) that are much
> more aware of their overall environment could assert ECN on their UDP
> streams to indicate congestion.

The sender can react to congestion by simply reducing the sending rate.
The receiver can react to congestion by pipelining fewer chunk requests
to the sender.

> 3) Web Proxies. A web proxy could note when it was experiencing
> congestion on one side of the proxied connection (or another) and signal
> the other side to slow down.

It can cause the other side to slow down by simply stopping reading,
thus causing the normal TCP flow control (not congestion control) to
kick in.

What would be useful, on the other hand, would be the ability to set the
ECN enabled codepoint on UDP packets, and have some means to reliably
check whether the Congestion-Experienced codepoint has been set by some
intermediate router.  But that's different from what you appear to be
suggesting.

-- Juliusz

^ permalink raw reply

* [PATCH] MAINTAINERS: can: the mailinglist moved to vger.kernel.org
From: Marc Kleine-Budde @ 2011-10-13 11:42 UTC (permalink / raw)
  To: linux-can; +Cc: netdev, Marc Kleine-Budde

Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
---
 MAINTAINERS |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index aac56f9..5008b08 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1671,7 +1671,7 @@ CAN NETWORK LAYER
 M:	Oliver Hartkopp <socketcan@hartkopp.net>
 M:	Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
 M:	Urs Thuermann <urs.thuermann@volkswagen.de>
-L:	socketcan-core@lists.berlios.de (subscribers-only)
+L:	linux-can@vger.kernel.org
 L:	netdev@vger.kernel.org
 W:	http://developer.berlios.de/projects/socketcan/
 S:	Maintained
@@ -1683,7 +1683,7 @@ F:	include/linux/can/raw.h
 
 CAN NETWORK DRIVERS
 M:	Wolfgang Grandegger <wg@grandegger.com>
-L:	socketcan-core@lists.berlios.de (subscribers-only)
+L:	linux-can@vger.kernel.org
 L:	netdev@vger.kernel.org
 W:	http://developer.berlios.de/projects/socketcan/
 S:	Maintained
-- 
1.7.4.1

^ permalink raw reply related

* Re: __kfree_skb eventually calls kfree_skb violating dropwatch assumption
From: Neil Horman @ 2011-10-13 11:09 UTC (permalink / raw)
  To: Suresh, Charles; +Cc: netdev@vger.kernel.org
In-Reply-To: <27F465BDABE6954AABB2A4E3599BDAC702AD1DC3A7@sausexmbp02.amd.com>

On Wed, Oct 12, 2011 at 09:17:56PM -0500, Suresh, Charles wrote:
> kfree_skb can be called from __kfree_skb through the following call chain :
> 
> kfree_skb  <- skb_drop_list <- skb_drop_fraglist <- skb_release_data <-  skb_release_all <- __kfree_skb (on the 2.38.4 kernel).
> 
> This violates the assumption in the dropwatch tool that discarded packets go through the kfree_skb path and all others must go through the consume_skb path (thus resulting in the over-counting of discarded packets in dropwatch).
> 
> Neil Horman the author of dropwatch suggested that this could be fixed by skb_drop_list calling consume_skb instead of kfree_skb.
> 
> Charles
> 
I assume theres meant to be a patch attached here?
Regards
Neil

^ permalink raw reply

* Re: [PATCH 1/4] ipv4: Fix pmtu propagating
From: Steffen Klassert @ 2011-10-13 10:09 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20111012.170218.279202034199478303.davem@davemloft.net>

On Wed, Oct 12, 2011 at 05:02:18PM -0400, David Miller wrote:
> 
> This dst_check() call will only do something if dst->obsolete is non-zero.
> 
> If dst->obsolete can be set in these circumstances, that's a bug.  The
> caller is responsible for providing either a freshly looked up route
> or a cached route which has had dst_check() or sk_dst_check() invoked
> upon it.

dst->obsolete was -1 in all cases I observed, regardless if connected or not.

> 
> I am pretty sure these rules are followed by the current code.
> 
> Again, there are only two scenerios:
> 
> 1) 'rt' is just looked up by caller (f.e. udp_sendmsg() in rt == NULL case),
>    here dst->obsolete is very unlikely to be non-zero.
> 
> 2) Connected case, and we use cached route from the socket, but here
>    we'll use sk_dst_check() to validate the route.  sk_dst_check()
>    makes the necessary dst->ops->check() call if dst->obsolete is
>    non-zero, and in fact that is it's one and only job.
> 

At least it seems that raw_sendmsg() and ping_sendmsg() don't use
a cached route, they do the route lookup in any case. I don't see
where we check if we learned a new pmtu in this cases. 

I added some debugging output and saw that peer->pmtu_learned
has the correct pmtu value, but it never ends up in the metric
of the dst_entry.

With a ping -s 1400 to a destination where the pmtu is 1000
I get for each packet 'Frag needed and DF set (mtu = 1000)'.
That's how I noticed that something changed.

With the dst_check() in ip_setup_cork() I get
'Frag needed and DF set (mtu = 1000)' for the first packet and
all further packets reach the destination, as it was before
commmit 2c8cec5c.

^ permalink raw reply

* Re: Seamless Service Restart / Port Takeover
From: Olaf van der Spek @ 2011-10-13  9:56 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev
In-Reply-To: <4E95CAD8.3050405@hp.com>

On Wed, Oct 12, 2011 at 7:14 PM, Rick Jones <rick.jones2@hp.com> wrote:
> On 10/12/2011 09:53 AM, Olaf van der Spek wrote:
>>
>> Hi,
>>
>> When restarting a service, a webserver for example, you'd like this to
>> be seamless for clients. Often, first a signal is send to the old
>> process to close listening sockets, then the new process is started.
>> This has both a race condition (sometimes 'avoided' with a sleep in
>> between) and service interruption.
>> Wouldn't it be possible to introduce mv / move like behaviour, where
>> the socket can be rebound without races and without interruption?
>
> I believe you can do that today if you write the application such that the
> old instance, before it closes the listen endpoint and terminates, instead
> passes access to it to the new instance via a Unix domain socket between the
> two.

Hi Rick,

That sounds rather complicated. What daemons implement the
functionality this way?
I think an easier way to accomplish this would be quite welcome.

Olaf

^ permalink raw reply

* [PATCH 3/3] dp83640: free packet queues on remove
From: Richard Cochran @ 2011-10-13  9:46 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Eric Dumazet, Johannes Berg, stable
In-Reply-To: <cover.1318498805.git.richard.cochran@omicron.at>

If the PHY should disappear (for example, on an USB Ethernet MAC), then
the driver would leak any undelivered time stamp packets. This commit
fixes the issue by calling the appropriate functions to free any packets
left in the transmit and receive queues.

The driver first appeared in v3.0.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Cc: <stable@kernel.org>
---
 drivers/net/phy/dp83640.c |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index 13e5713..9663e0b 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -1007,6 +1007,7 @@ static void dp83640_remove(struct phy_device *phydev)
 	struct dp83640_clock *clock;
 	struct list_head *this, *next;
 	struct dp83640_private *tmp, *dp83640 = phydev->priv;
+	struct sk_buff *skb;
 
 	if (phydev->addr == BROADCAST_ADDR)
 		return;
@@ -1014,6 +1015,12 @@ static void dp83640_remove(struct phy_device *phydev)
 	enable_status_frames(phydev, false);
 	cancel_work_sync(&dp83640->ts_work);
 
+	while ((skb = skb_dequeue(&dp83640->rx_queue)) != NULL)
+		kfree_skb(skb);
+
+	while ((skb = skb_dequeue(&dp83640->tx_queue)) != NULL)
+		skb_complete_tx_timestamp(skb, NULL);
+
 	clock = dp83640_clock_get(dp83640->clock);
 
 	if (dp83640 == clock->chosen) {
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 2/3] dp83640: use proper function to free transmit time stamping packets
From: Richard Cochran @ 2011-10-13  9:46 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Eric Dumazet, Johannes Berg, stable
In-Reply-To: <cover.1318498805.git.richard.cochran@omicron.at>

The previous commit enforces a new rule for handling the cloned packets
for transmit time stamping. These packets must not be freed using any other
function than skb_complete_tx_timestamp. This commit fixes the one and only
driver using this API.

The driver first appeared in v3.0.

Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Cc: <stable@kernel.org>
---
 drivers/net/phy/dp83640.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/dp83640.c b/drivers/net/phy/dp83640.c
index c588a16..13e5713 100644
--- a/drivers/net/phy/dp83640.c
+++ b/drivers/net/phy/dp83640.c
@@ -1192,7 +1192,7 @@ static void dp83640_txtstamp(struct phy_device *phydev,
 
 	case HWTSTAMP_TX_ONESTEP_SYNC:
 		if (is_sync(skb, type)) {
-			kfree_skb(skb);
+			skb_complete_tx_timestamp(skb, NULL);
 			return;
 		}
 		/* fall through */
@@ -1203,7 +1203,7 @@ static void dp83640_txtstamp(struct phy_device *phydev,
 
 	case HWTSTAMP_TX_OFF:
 	default:
-		kfree_skb(skb);
+		skb_complete_tx_timestamp(skb, NULL);
 		break;
 	}
 }
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 1/3] net: hold sock reference while processing tx timestamps
From: Richard Cochran @ 2011-10-13  9:46 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Eric Dumazet, Johannes Berg, stable
In-Reply-To: <cover.1318498805.git.richard.cochran@omicron.at>

The pair of functions,

 * skb_clone_tx_timestamp()
 * skb_complete_tx_timestamp()

were designed to allow timestamping in PHY devices. The first
function, called during the MAC driver's hard_xmit method, identifies
PTP protocol packets, clones them, and gives them to the PHY device
driver. The PHY driver may hold onto the packet and deliver it at a
later time using the second function, which adds the packet to the
socket's error queue.

As pointed out by Johannes, nothing prevents the socket from
disappearing while the cloned packet is sitting in the PHY driver
awaiting a timestamp. This patch fixes the issue by taking a reference
on the socket for each such packet. In addition, the comments
regarding the usage of these function are expanded to highlight the
rule that PHY drivers must use skb_complete_tx_timestamp() to release
the packet, in order to release the socket reference, too.

These functions first appeared in v2.6.36.

Reported-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Richard Cochran <richard.cochran@omicron.at>
Cc: <stable@kernel.org>
---
 include/linux/phy.h     |    2 +-
 include/linux/skbuff.h  |    7 ++++++-
 net/core/timestamping.c |    7 ++++++-
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 54fc413..79f337c 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -420,7 +420,7 @@ struct phy_driver {
 
 	/*
 	 * Requests a Tx timestamp for 'skb'. The phy driver promises
-	 * to deliver it to the socket's error queue as soon as a
+	 * to deliver it using skb_complete_tx_timestamp() as soon as a
 	 * timestamp becomes available. One of the PTP_CLASS_ values
 	 * is passed in 'type'.
 	 */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 8bd383c..0f96646 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2020,8 +2020,13 @@ static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
 /**
  * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps
  *
+ * PHY drivers may accept clones of transmitted packets for
+ * timestamping via their phy_driver.txtstamp method. These drivers
+ * must call this function to return the skb back to the stack, with
+ * or without a timestamp.
+ *
  * @skb: clone of the the original outgoing packet
- * @hwtstamps: hardware time stamps
+ * @hwtstamps: hardware time stamps, may be NULL if not available
  *
  */
 void skb_complete_tx_timestamp(struct sk_buff *skb,
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 98a5264..29e59d6 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -60,6 +60,7 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
 			clone = skb_clone(skb, GFP_ATOMIC);
 			if (!clone)
 				return;
+			sock_hold(sk);
 			clone->sk = sk;
 			phydev->drv->txtstamp(phydev, clone, type);
 		}
@@ -77,8 +78,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
 	struct sock_exterr_skb *serr;
 	int err;
 
-	if (!hwtstamps)
+	if (!hwtstamps) {
+		sock_put(sk);
+		kfree_skb(skb);
 		return;
+	}
 
 	*skb_hwtstamps(skb) = *hwtstamps;
 	serr = SKB_EXT_ERR(skb);
@@ -87,6 +91,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
 	serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
 	skb->sk = NULL;
 	err = sock_queue_err_skb(sk, skb);
+	sock_put(sk);
 	if (err)
 		kfree_skb(skb);
 }
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 0/3] net: time stamping fixes
From: Richard Cochran @ 2011-10-13  9:46 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Eric Dumazet, Johannes Berg
In-Reply-To: <56185ca8a7dc0223031ca0f0996302cac1b497eb.1318444117.git.richard.cochran@omicron.at>

The first patch fixes a bug in the time stamping code introduced in
v2.6.36 of the kernel.

The other two patches depend on the first patch and fix two bugs in a
PTP Hardware Clock driver. This driver was first introduced in Linux
version 3.0.

Richard Cochran (3):
  net: hold sock reference while processing tx timestamps
  dp83640: use proper function to free transmit time stamping packets
  dp83640: free packet queues on remove

 drivers/net/phy/dp83640.c |   11 +++++++++--
 include/linux/phy.h       |    2 +-
 include/linux/skbuff.h    |    7 ++++++-
 net/core/timestamping.c   |    7 ++++++-
 4 files changed, 22 insertions(+), 5 deletions(-)

-- 
1.7.2.5

^ permalink raw reply

* Re: e100 + VLANs?
From: Michael Tokarev @ 2011-10-13  9:22 UTC (permalink / raw)
  To: Jesse Brandeburg
  Cc: David Lamparter, Eric Dumazet, Kirsher, Jeffrey T, netdev
In-Reply-To: <20111011163855.0000646f@unknown>

On 12.10.2011 03:38, Jesse Brandeburg wrote:
[..]
>> The knowledge and code for this is actually around line 1142 of e100.c:
>>         if (nic->mac >= mac_82558_D101_A4) {
>>                 config->fc_disable = 0x1;       /* 1=Tx fc off, 0=Tx fc on */
>>                 config->mwi_enable = 0x1;       /* 1=enable, 0=disable */
>>                 config->standard_tcb = 0x0;     /* 1=standard, 0=extended */
>>                 config->rx_long_ok = 0x1;       /* 1=VLANs ok, 0=standard */
>>
>> where rx_long_ok is the configuration bit to enable frame reception
>> for >1514 byte frames. I guess your card is < mac_82558_D101_A4...
>>
>> (cf. "Intel 8255x 10/100 Mbps Ethernet Controller Family Open Source
>> Software Developer Manual" page 78/86 - "Long Receive OK. This bit is
>> reserved on the 82557 and should be set to 0. When this bit is set on
>> the 82558 or 82559, the device considers received frames that have
>> a data field longer than 1500 bytes as good frames.")
> 
> David, thank you for posting that, while you were typing I was
> researching the same thing, so FWIW, I concur with your conclusion.
> 
> ouch, OP your hardware is really really old:
>> 00:12.0 Ethernet controller: Intel Corporation 82557/8/9/0/1 Ethernet Pro 100 (rev 02)
>> Subsystem: Intel Corporation EtherExpress PRO/100B (TX)
> 
> rev 2 is D100_C, which is 82557.
> 
> the hardware is NOT capable of long receives (i.e. vlan packets).
> If it was then they should generally fit in the receive buffer and be
> handled and not discarded.

Can this knowlege be added to the driver somehow, so that others
will not hit the same trap as I did? :)  _If_ there's any value
in that (I guess there aren't many users with that hardware left),
and if that's easy to do ofcourse... ;)

Thanks!

/mjt

^ permalink raw reply

* Re: linux-next: manual merge of the staging tree with the net tree
From: Arend van Spriel @ 2011-10-13  9:08 UTC (permalink / raw)
  To: Stephen Rothwell
  Cc: Greg KH, linux-next@vger.kernel.org, linux-kernel@vger.kernel.org,
	David Miller, netdev@vger.kernel.org, Jiri Pirko, Eliad Peller,
	John W. Linville
In-Reply-To: <20111013164353.09c495e1b63d23e380cca1f7@canb.auug.org.au>


[-- Attachment #1.1: Type: text/plain, Size: 537 bytes --]

On 10/13/2011 07:43 AM, Stephen Rothwell wrote:
> Hi Greg,
> 
> Today's linux-next merge of the staging tree got a conflicts in
> drivers/staging/brcm80211/brcmfmac/dhd_linux.c and
> drivers/staging/brcm80211/brcmsmac/mac80211_if.c between commits from the
> net-next tree and commit fc2d6e573be6 ("staging: brcm80211: remove
> brcm80211 driver from the staging tree") from the staging tree.
> 
> The latter removed the files, so I did that

Great. The move to drivers/net/wireless has completed. Thanks for this.

Gr. AvS

[-- Attachment #1.2: 0xB5E1A116.asc --]
[-- Type: application/pgp-keys, Size: 3165 bytes --]

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 900 bytes --]

^ permalink raw reply

* Re: [PATCH 1/1] net: hold sock reference while processing tx timestamps
From: Johannes Berg @ 2011-10-13  8:54 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Richard Cochran, netdev, David Miller
In-Reply-To: <1318449159.2644.9.camel@edumazet-laptop>

On Wed, 2011-10-12 at 21:52 +0200, Eric Dumazet wrote:

> > The other thing I was wondering -- what if we just set truesize to 1 (we
> > don't have any truesize checks) and account the skb to the socket
> > normally. Not really a good way either though.
> > 
> 
> Changing truesize is not allowed here see below.

Oh, good point.

> > FWIW I just decided to do it the other way around in mac80211 -- keep
> > the original SKB that's charged to the socket for the error queue, and
> > use a clone to actually do the TX.
> 
> Hmm, please take a look at IFF_SKB_TX_SHARING stuff, added in commit
> d88733150 (net: add IFF_SKB_TX_SHARED flag to priv_flags)

mac80211 got that flag cleared -- and right now I see no way to change
that. 11ac/ad may require work in this area to be able to do this
better, and I have a number of ideas on how to do that, but right now I
don't see how we can do that.

johannes

^ permalink raw reply

* [iproute2 PATCH] Fix unterminated readlink() buffer usage
From: Thomas Jarosch @ 2011-10-13  8:30 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger


Signed-off-by: Thomas Jarosch <thomas.jarosch@intra2net.com>
---
 misc/ss.c |   10 ++++++++--
 1 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/misc/ss.c b/misc/ss.c
index b00841b..1353620 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -273,13 +273,19 @@ static void user_ent_hash_build(void)
 			unsigned int ino;
 			char lnk[64];
 			int fd;
+			ssize_t link_len;
 
 			if (sscanf(d1->d_name, "%d%c", &fd, &crap) != 1)
 				continue;
 
 			sprintf(name+pos, "%d", fd);
-			if (readlink(name, lnk, sizeof(lnk)-1) < 0 ||
-			    strncmp(lnk, pattern, strlen(pattern)))
+
+			link_len = readlink(name, lnk, sizeof(lnk)-1);
+			if (link_len == -1)
+				continue;
+			lnk[link_len] = '\0';
+
+			if (strncmp(lnk, pattern, strlen(pattern)))
 				continue;
 
 			sscanf(lnk, "socket:[%u]", &ino);
-- 
1.7.4.4

^ permalink raw reply related

* Re: [PATCH v6 3/8] foundations of per-cgroup memory pressure controlling.
From: Glauber Costa @ 2011-10-13  8:25 UTC (permalink / raw)
  To: KAMEZAWA Hiroyuki
  Cc: linux-kernel, paul, lizf, ebiederm, davem, gthelen, netdev,
	linux-mm, kirill, avagin, devel
In-Reply-To: <20111013145353.161009ea.kamezawa.hiroyu@jp.fujitsu.com>

On 10/13/2011 09:53 AM, KAMEZAWA Hiroyuki wrote:
> On Mon, 10 Oct 2011 14:24:23 +0400
> Glauber Costa<glommer@parallels.com>  wrote:
>
>> This patch converts struct sock fields memory_pressure,
>> memory_allocated, sockets_allocated, and sysctl_mem (now prot_mem)
>> to function pointers, receiving a struct mem_cgroup parameter.
>>
>> enter_memory_pressure is kept the same, since all its callers
>> have socket a context, and the kmem_cgroup can be derived from
>> the socket itself.
>>
>> To keep things working, the patch convert all users of those fields
>> to use acessor functions.
>>
>> In my benchmarks I didn't see a significant performance difference
>> with this patch applied compared to a baseline (around 1 % diff, thus
>> inside error margin).
>>
>> Signed-off-by: Glauber Costa<glommer@parallels.com>
>> CC: David S. Miller<davem@davemloft.net>
>> CC: Hiroyouki Kamezawa<kamezawa.hiroyu@jp.fujitsu.com>
>> CC: Eric W. Biederman<ebiederm@xmission.com>
>
> Reviewed-by: KAMEZAWA Hiroyuki<kamezawa.hiroyu@jp.fujitsu.com>
>
> a nitpick.
>
>
>>   #ifdef CONFIG_INET
>> +enum {
>> +	UNDER_LIMIT,
>> +	OVER_LIMIT,
>> +};
>> +
>
> It may be better to move this to res_counter.h or memcontrol.h
>
Sorry Kame,

It is in memcontrol.h already. What exactly do you mean here ?



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH v6 1/8] Basic kernel memory functionality for the Memory Controller
From: Glauber Costa @ 2011-10-13  8:19 UTC (permalink / raw)
  To: Greg Thelen
  Cc: linux-kernel, paul, lizf, kamezawa.hiroyu, ebiederm, davem,
	netdev, linux-mm, kirill, avagin, devel
In-Reply-To: <CAHH2K0awiPZZ9EJLyZy_p_ehf0-waQ-vGUAhAZEpdCMnYqKidA@mail.gmail.com>

On 10/13/2011 11:18 AM, Greg Thelen wrote:
> On Mon, Oct 10, 2011 at 3:24 AM, Glauber Costa<glommer@parallels.com>  wrote:
>> diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
>> index 06eb6d9..bf00cd2 100644
>> --- a/Documentation/cgroups/memory.txt
>> +++ b/Documentation/cgroups/memory.txt
> ...
>> @@ -255,6 +262,31 @@ When oom event notifier is registered, event will be delivered.
>>    per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
>>    zone->lru_lock, it has no lock of its own.
>>
>> +2.7 Kernel Memory Extension (CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
>> +
>> + With the Kernel memory extension, the Memory Controller is able to limit
>
> Extra leading space before 'With'.
>
>> +the amount of kernel memory used by the system. Kernel memory is fundamentally
>> +different than user memory, since it can't be swapped out, which makes it
>> +possible to DoS the system by consuming too much of this precious resource.
>> +Kernel memory limits are not imposed for the root cgroup.
>> +
>> +Memory limits as specified by the standard Memory Controller may or may not
>> +take kernel memory into consideration. This is achieved through the file
>> +memory.independent_kmem_limit. A Value different than 0 will allow for kernel
>
> s/Value/value/
>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 3508777..d25c5cb 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
> ...
>> +static int kmem_limit_independent_write(struct cgroup *cont, struct cftype *cft,
>> +                                       u64 val)
>> +{
>> +       cgroup_lock();
>> +       mem_cgroup_from_cont(cont)->kmem_independent_accounting = !!val;
>> +       cgroup_unlock();
>
> I do not think cgroup_lock,unlock are needed here.  The cont and
> associated cgroup should be guaranteed by the caller to be valid.
> Does this lock provide some other synchronization?
Yeah, I think I was being overcautious.

With the following comments addressed, can I add your Reviewed-by to 
this one ?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* 群发软件+买家搜索机+109届广交会买家、海关数据,B2B询盘买家500万。
From: 仅10元每天 @ 2011-10-13  8:13 UTC (permalink / raw)
  To: nesconsk, ness, ness, nessa_quilts, netdev, netdog8, netdsmait,
	netease_4370

群发软件+109届广交会买家、海关数据、搜索引擎买家,B2B询盘买家共500万,仅10元每天。 
群发软件+109届广交会买家、海关数据、搜索引擎买家,B2B询盘买家共500万,仅10元每天。 
保证每天都有买家回复。
保证每天都有买家回复。
保证每天都有买家回复。

1、群发软件： 操作简单，功能强大，模仿人工操作模式，到达率高，日发送5万封以上。 
2、500万买家资源： 赠送的500万买家资源库，更新 (可以按照您的产品提取出来，更精准开发)。 
3、超级海外买家Email搜索机： 内置了600万行业关键词，根据长尾词搜索，结果更精确匹配；每天能搜索1-2万以上买家真实EMAIL，成单率高。 
 

要的抓紧联系QQ: 1339625218    或者立即回复邮箱: 1339625218@qq.com
要的抓紧联系QQ: 1339625218    或者立即回复邮箱: 1339625218@qq.com
要的抓紧联系QQ: 1339625218    或者立即回复邮箱: 1339625218@qq.com
 
免费赠送:
一共8个包(数据是全行业的，按照行业分好类，并且可以按照关键词查询的)： 
1，2011春季109届广交会买家现场询盘数据库新鲜出炉，超级新鲜买家，新鲜数据，容易成单！ 
2，购买后可以免费更新2011秋季广交会+2012春季广交会买家数据。太超值了。
3，最新全球买家库,共451660条数据。 (最新更新日期 2011-05-16日)
4，2008年,2009年,2010年 春季+秋季广交会买家名录，103 104 105 106 107 108 共六届 共120.6万数据。
5，2010年国际促销协会（PPAI）成员名单 PPAI Members Directory，非常重要的大买家。
6，2010年到香港采购的国外客人名录(香港贸发局提供)，共7.2万数据，超级重要的买家。
7，48.68万条最新买家询盘，购买后每月更新 1-2万条，包括2部分，1，最新的询盘 2，最新的展会买家。免费更新36个月。
8，2009年海关提单数据piers版数据 1千万。


诚信为本，支持支付宝担保交易 (先发货并安装设置群发软件，然后付款) 彻底打消您的 顾虑。

 


 

精准数据-成单率极高
精准数据-成单率极高
精准数据-成单率极高
精准数据-成单率极高
精准数据-成单率极高

^ permalink raw reply

* [PATCH 5/5] sysfs: Reject with a warning invalid uses of tagged directories.
From: Eric W. Biederman @ 2011-10-13  8:02 UTC (permalink / raw)
  To: Greg Kroah-Hartman, David Miller
  Cc: linux-kernel, netdev, Tejun Heo, Jay Vosburgh, Andy Gospodarek
In-Reply-To: <m1ehyh9v6p.fsf_-_@fess.ebiederm.org>

sysfs is a core piece of ifrastructure that many people use and
few people have all of the rules in their head on how to use
it correctly.  Add warnings for people using tagged directories
improperly to that any misuses can be caught and diagnosed quickly.

A single inexpensive test in sysfs_find_dirent is almost sufficient
to catch all possible misuses.  An additional warning is needed
in sysfs_add_dirent so that we actually fail when attempting to
add an untagged dirent in a tagged directory.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/sysfs/dir.c  |   14 ++++++++++++++
 fs/sysfs/file.c |    3 ---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 352d26d..26f370a 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -384,6 +384,13 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;

+	if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
+		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
+			sysfs_ns_type(acxt->parent_sd)? "required": "invalid",
+			acxt->parent_sd->s_name, sd->s_name);
+		return -EINVAL;
+	}
+
 	if (sysfs_find_dirent(acxt->parent_sd, sd->s_ns, sd->s_name))
 		return -EEXIST;

@@ -542,6 +549,13 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 {
 	struct sysfs_dirent *sd;

+	if (!!sysfs_ns_type(parent_sd) != !!ns) {
+		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
+			sysfs_ns_type(parent_sd)? "required": "invalid",
+			parent_sd->s_name, name);
+		return NULL;
+	}
+
 	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
 		if (sd->s_ns != ns)
 			continue;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 07c1b4e..d4e6080 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -466,9 +466,6 @@ void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
 	mutex_lock(&sysfs_mutex);

 	if (sd && dir)
-		/* Only directories are tagged, so no need to pass
-		 * a tag explicitly.
-		 */
 		sd = sysfs_find_dirent(sd, NULL, dir);
 	if (sd && attr)
 		sd = sysfs_find_dirent(sd, NULL, attr);
-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 4/5] sysfs: Remove support for tagged directories with untagged members.
From: Eric W. Biederman @ 2011-10-13  8:01 UTC (permalink / raw)
  To: Greg Kroah-Hartman, David Miller
  Cc: linux-kernel, netdev, Tejun Heo, Jay Vosburgh, Andy Gospodarek
In-Reply-To: <m1mxd59vfa.fsf_-_@fess.ebiederm.org>

Now that /sys/class/net/bonding_masters is implemented as a tagged sysfs
file we can remove support for untagged files in tagged directories.

This change removes any ambiguity of what a NULL namespace value
means.  A NULL namespace parameter after this patch means
that we are talking about an untagged sysfs dirent.

This makes the sysfs code much less prone to mistakes when during
maintenance.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/sysfs/dir.c   |    6 +++---
 fs/sysfs/inode.c |    2 --
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index ea9120a..352d26d 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -543,7 +543,7 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 	struct sysfs_dirent *sd;

 	for (sd = parent_sd->s_dir.children; sd; sd = sd->s_sibling) {
-		if (ns && sd->s_ns && (sd->s_ns != ns))
+		if (sd->s_ns != ns)
 			continue;
 		if (!strcmp(sd->s_name, name))
 			return sd;
@@ -885,7 +885,7 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
 		while (pos && (ino > pos->s_ino))
 			pos = pos->s_sibling;
 	}
-	while (pos && pos->s_ns && pos->s_ns != ns)
+	while (pos && pos->s_ns != ns)
 		pos = pos->s_sibling;
 	return pos;
 }
@@ -896,7 +896,7 @@ static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
 	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
 	if (pos)
 		pos = pos->s_sibling;
-	while (pos && pos->s_ns && pos->s_ns != ns)
+	while (pos && pos->s_ns != ns)
 		pos = pos->s_sibling;
 	return pos;
 }
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index e3f091a..527f0cc 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -336,8 +336,6 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
 	sysfs_addrm_start(&acxt, dir_sd);

 	sd = sysfs_find_dirent(dir_sd, ns, name);
-	if (sd && (sd->s_ns != ns))
-		sd = NULL;
 	if (sd)
 		sysfs_remove_one(&acxt, sd);

-- 
1.7.2.5

^ permalink raw reply related

* [PATCH 3/5] bonding: Use a per netns implementation of /sys/class/net/bonding_masters.
From: Eric W. Biederman @ 2011-10-13  7:56 UTC (permalink / raw)
  To: Greg Kroah-Hartman, David Miller
  Cc: linux-kernel, netdev, Tejun Heo, Jay Vosburgh, Andy Gospodarek
In-Reply-To: <m1sjmx9vhf.fsf_-_@fess.ebiederm.org>


This fixes a network namespace misfeature that bonding_masters looked at
current instead of the remembering the context where in which
/sys/class/net/bonding_masters was opened in to see which network
namespace to act upon.

This removes the need for sysfs to handle tagged directories with
untagged members allowing for a conceptually simpler sysfs
implementation.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/net/bonding/bond_main.c  |    7 +----
 drivers/net/bonding/bond_sysfs.c |   45 ++++++++++++++++++++++++++-----------
 drivers/net/bonding/bonding.h    |    7 ++++-
 3 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 6d79b78..e084b1c 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4898,6 +4898,7 @@ static int __net_init bond_net_init(struct net *net)
 	INIT_LIST_HEAD(&bn->dev_list);
 
 	bond_create_proc_dir(bn);
+	bond_create_sysfs(bn);
 	
 	return 0;
 }
@@ -4906,6 +4907,7 @@ static void __net_exit bond_net_exit(struct net *net)
 {
 	struct bond_net *bn = net_generic(net, bond_net_id);
 
+	bond_destroy_sysfs(bn);
 	bond_destroy_proc_dir(bn);
 }
 
@@ -4943,10 +4945,6 @@ static int __init bonding_init(void)
 			goto err;
 	}
 
-	res = bond_create_sysfs();
-	if (res)
-		goto err;
-
 	register_netdevice_notifier(&bond_netdev_notifier);
 	register_inetaddr_notifier(&bond_inetaddr_notifier);
 out:
@@ -4964,7 +4962,6 @@ static void __exit bonding_exit(void)
 	unregister_netdevice_notifier(&bond_netdev_notifier);
 	unregister_inetaddr_notifier(&bond_inetaddr_notifier);
 
-	bond_destroy_sysfs();
 	bond_destroy_debugfs();
 
 	rtnl_link_unregister(&bond_link_ops);
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 2dfb4bf..6044ff8 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -55,8 +55,8 @@ static ssize_t bonding_show_bonds(struct class *cls,
 				  struct class_attribute *attr,
 				  char *buf)
 {
-	struct net *net = current->nsproxy->net_ns;
-	struct bond_net *bn = net_generic(net, bond_net_id);
+	struct bond_net *bn =
+		container_of(attr, struct bond_net, class_attr_bonding_masters);
 	int res = 0;
 	struct bonding *bond;
 
@@ -79,9 +79,8 @@ static ssize_t bonding_show_bonds(struct class *cls,
 	return res;
 }
 
-static struct net_device *bond_get_by_name(struct net *net, const char *ifname)
+static struct net_device *bond_get_by_name(struct bond_net *bn, const char *ifname)
 {
-	struct bond_net *bn = net_generic(net, bond_net_id);
 	struct bonding *bond;
 
 	list_for_each_entry(bond, &bn->dev_list, bond_list) {
@@ -103,7 +102,8 @@ static ssize_t bonding_store_bonds(struct class *cls,
 				   struct class_attribute *attr,
 				   const char *buffer, size_t count)
 {
-	struct net *net = current->nsproxy->net_ns;
+	struct bond_net *bn =
+		container_of(attr, struct bond_net, class_attr_bonding_masters);
 	char command[IFNAMSIZ + 1] = {0, };
 	char *ifname;
 	int rv, res = count;
@@ -116,7 +116,7 @@ static ssize_t bonding_store_bonds(struct class *cls,
 
 	if (command[0] == '+') {
 		pr_info("%s is being created...\n", ifname);
-		rv = bond_create(net, ifname);
+		rv = bond_create(bn->net, ifname);
 		if (rv) {
 			if (rv == -EEXIST)
 				pr_info("%s already exists.\n", ifname);
@@ -128,7 +128,7 @@ static ssize_t bonding_store_bonds(struct class *cls,
 		struct net_device *bond_dev;
 
 		rtnl_lock();
-		bond_dev = bond_get_by_name(net, ifname);
+		bond_dev = bond_get_by_name(bn, ifname);
 		if (bond_dev) {
 			pr_info("%s is being deleted...\n", ifname);
 			unregister_netdevice(bond_dev);
@@ -150,9 +150,24 @@ err_no_cmd:
 	return -EPERM;
 }
 
+static const void *bonding_namespace(struct class *cls,
+				     const struct class_attribute *attr)
+{
+	const struct bond_net *bn =
+		container_of(attr, struct bond_net, class_attr_bonding_masters);
+	return bn->net;
+}
+
 /* class attribute for bond_masters file.  This ends up in /sys/class/net */
-static CLASS_ATTR(bonding_masters,  S_IWUSR | S_IRUGO,
-		  bonding_show_bonds, bonding_store_bonds);
+static const struct class_attribute class_attr_bonding_masters = {
+	.attr = {
+		.name = "bonding_masters",
+		.mode = S_IWUSR | S_IRUGO,
+	},
+	.show = bonding_show_bonds,
+	.store = bonding_store_bonds,
+	.namespace = bonding_namespace,
+};
 
 int bond_create_slave_symlinks(struct net_device *master,
 			       struct net_device *slave)
@@ -1655,11 +1670,13 @@ static struct attribute_group bonding_group = {
  * Initialize sysfs.  This sets up the bonding_masters file in
  * /sys/class/net.
  */
-int bond_create_sysfs(void)
+int bond_create_sysfs(struct bond_net *bn)
 {
 	int ret;
 
-	ret = netdev_class_create_file(&class_attr_bonding_masters);
+	bn->class_attr_bonding_masters = class_attr_bonding_masters;
+
+	ret = netdev_class_create_file(&bn->class_attr_bonding_masters);
 	/*
 	 * Permit multiple loads of the module by ignoring failures to
 	 * create the bonding_masters sysfs file.  Bonding devices
@@ -1673,7 +1690,7 @@ int bond_create_sysfs(void)
 	 */
 	if (ret == -EEXIST) {
 		/* Is someone being kinky and naming a device bonding_master? */
-		if (__dev_get_by_name(&init_net,
+		if (__dev_get_by_name(bn->net,
 				      class_attr_bonding_masters.attr.name))
 			pr_err("network device named %s already exists in sysfs",
 			       class_attr_bonding_masters.attr.name);
@@ -1687,9 +1704,9 @@ int bond_create_sysfs(void)
 /*
  * Remove /sys/class/net/bonding_masters.
  */
-void bond_destroy_sysfs(void)
+void bond_destroy_sysfs(struct bond_net *bn)
 {
-	netdev_class_remove_file(&class_attr_bonding_masters);
+	netdev_class_remove_file(&bn->class_attr_bonding_masters);
 }
 
 /*
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 43526a2..1007f9a 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -380,11 +380,13 @@ static inline bool bond_is_slave_inactive(struct slave *slave)
 	return slave->inactive;
 }
 
+struct bond_net;
+
 struct vlan_entry *bond_next_vlan(struct bonding *bond, struct vlan_entry *curr);
 int bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev);
 int bond_create(struct net *net, const char *name);
-int bond_create_sysfs(void);
-void bond_destroy_sysfs(void);
+int bond_create_sysfs(struct bond_net *net);
+void bond_destroy_sysfs(struct bond_net *net);
 void bond_prepare_sysfs_group(struct bonding *bond);
 int bond_create_slave_symlinks(struct net_device *master, struct net_device *slave);
 void bond_destroy_slave_symlinks(struct net_device *master, struct net_device *slave);
@@ -410,6 +412,7 @@ struct bond_net {
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry *	proc_dir;
 #endif
+	struct class_attribute	class_attr_bonding_masters;
 };
 
 #ifdef CONFIG_PROC_FS
-- 
1.7.2.5

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox