Netdev List
 help / color / mirror / Atom feed
* [PATCH v6 06/10] tcp buffer limitation: per-cgroup limit
From: Glauber Costa @ 2011-11-25 17:38 UTC (permalink / raw)
  To: linux-kernel
  Cc: lizf, kamezawa.hiroyu, ebiederm, davem, paul, gthelen, netdev,
	linux-mm, kirill, avagin, devel, eric.dumazet, cgroups,
	Glauber Costa
In-Reply-To: <1322242696-27682-1-git-send-email-glommer@parallels.com>

This patch uses the "memory.kmem.tcp.limit_in_bytes" file of the kmem
cgroup to effectively control the amount of kernel memory pinned by a cgroup.

This value is ignored in the root cgroup. In all other cgroups,
it caps the value specified by the admin in the net namespace's
view of sysctl_tcp_mem.

If namespaces are being used, the admin is allowed to set a
value bigger than the cgroup's maximum, the same way the admin is
allowed to set pretty much unlimited values on a real box.

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: David S. Miller <davem@davemloft.net>
CC: Hiroyuki Kamezawa <kamezawa.hiroyu@jp.fujitsu.com>
CC: Eric W. Biederman <ebiederm@xmission.com>
---
 Documentation/cgroups/memory.txt |    1 +
 include/net/tcp_memcg.h          |    3 +
 net/ipv4/sysctl_net_ipv4.c       |   14 ++++
 net/ipv4/tcp_memcg.c             |  138 +++++++++++++++++++++++++++++++++++++-
 4 files changed, 154 insertions(+), 2 deletions(-)

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index bf00cd2..c1db134 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -78,6 +78,7 @@ Brief summary of control files.
 
  memory.independent_kmem_limit	 # select whether or not kernel memory limits are
 				   independent of user limits
+ memory.kmem.tcp.limit_in_bytes  # set/show hard limit for tcp buf memory
 
 1. History
 
diff --git a/include/net/tcp_memcg.h b/include/net/tcp_memcg.h
index 5f5e158..2c8bb6b 100644
--- a/include/net/tcp_memcg.h
+++ b/include/net/tcp_memcg.h
@@ -14,4 +14,7 @@ struct tcp_memcontrol {
 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg);
 int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
 void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
+unsigned long long tcp_max_memory(const struct mem_cgroup *memcg);
+void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx);
+int tcp_update_limit(struct mem_cgroup *memcg, u64 val);
 #endif /* _TCP_MEMCG_H */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index bbd67ab..17aaa1b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -24,6 +24,7 @@
 #include <net/cipso_ipv4.h>
 #include <net/inet_frag.h>
 #include <net/ping.h>
+#include <net/tcp_memcg.h>
 
 static int zero;
 static int tcp_retr1_max = 255;
@@ -182,6 +183,9 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
 	int ret;
 	unsigned long vec[3];
 	struct net *net = current->nsproxy->net_ns;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	struct mem_cgroup *cg;
+#endif
 
 	ctl_table tmp = {
 		.data = &vec,
@@ -198,6 +202,16 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
 	if (ret)
 		return ret;
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	rcu_read_lock();
+	cg = mem_cgroup_from_task(current);
+
+	tcp_prot_mem(cg, vec[0], 0);
+	tcp_prot_mem(cg, vec[1], 1);
+	tcp_prot_mem(cg, vec[2], 2);
+	rcu_read_unlock();
+#endif
+
 	net->ipv4.sysctl_tcp_mem[0] = vec[0];
 	net->ipv4.sysctl_tcp_mem[1] = vec[1];
 	net->ipv4.sysctl_tcp_mem[2] = vec[2];
diff --git a/net/ipv4/tcp_memcg.c b/net/ipv4/tcp_memcg.c
index 1dbc0f3..b3721c3 100644
--- a/net/ipv4/tcp_memcg.c
+++ b/net/ipv4/tcp_memcg.c
@@ -5,6 +5,19 @@
 #include <linux/nsproxy.h>
 #include <linux/memcontrol.h>
 
+static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft);
+static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft,
+			    const char *buffer);
+
+static struct cftype tcp_files[] = {
+	{
+		.name = "kmem.tcp.limit_in_bytes",
+		.write_string = tcp_cgroup_write,
+		.read_u64 = tcp_cgroup_read,
+		.private = RES_LIMIT,
+	},
+};
+
 static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
 {
 	return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
@@ -26,7 +39,7 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
 
 	cg_proto = tcp_prot.proto_cgroup(memcg);
 	if (!cg_proto)
-		return 0;
+		goto create_files;
 
 	tcp = tcp_from_cgproto(cg_proto);
 	cg_proto->parent = tcp_prot.proto_cgroup(parent);
@@ -47,7 +60,9 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
 	cg_proto->memory_allocated = &tcp->tcp_memory_allocated;
 	cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
 
-	return 0;
+create_files:
+	return cgroup_add_files(cgrp, ss, tcp_files,
+				ARRAY_SIZE(tcp_files));
 }
 EXPORT_SYMBOL(tcp_init_cgroup);
 
@@ -56,6 +71,7 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct cg_proto *cg_proto;
 	struct tcp_memcontrol *tcp;
+	u64 val;
 
 	cg_proto = tcp_prot.proto_cgroup(memcg);
 	if (!cg_proto)
@@ -63,5 +79,123 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
 
 	tcp = tcp_from_cgproto(cg_proto);
 	percpu_counter_destroy(&tcp->tcp_sockets_allocated);
+
+	val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE);
+
+	if (val != RESOURCE_MAX)
+		jump_label_dec(&memcg_socket_limit_enabled);
 }
 EXPORT_SYMBOL(tcp_destroy_cgroup);
+
+int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct tcp_memcontrol *tcp;
+	struct cg_proto *cg_proto;
+	int i;
+	int ret;
+
+	cg_proto = tcp_prot.proto_cgroup(memcg);
+	if (!cg_proto)
+		return -EINVAL;
+
+	tcp = tcp_from_cgproto(cg_proto);
+
+	ret = res_counter_set_limit(&tcp->tcp_memory_allocated, val);
+	if (ret)
+		return ret;
+
+	val >>= PAGE_SHIFT;
+
+	for (i = 0; i < 3; i++)
+		tcp->tcp_prot_mem[i] = min_t(long, val,
+					     net->ipv4.sysctl_tcp_mem[i]);
+
+	if (val == RESOURCE_MAX)
+		jump_label_dec(&memcg_socket_limit_enabled);
+	else {
+		u64 old_lim;
+		old_lim = res_counter_read_u64(&tcp->tcp_memory_allocated,
+					       RES_LIMIT);
+		if (old_lim == RESOURCE_MAX)
+			jump_label_inc(&memcg_socket_limit_enabled);
+	}
+	return 0;
+}
+
+static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft,
+			    const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long long val;
+	int ret = 0;
+
+	switch (cft->private) {
+	case RES_LIMIT:
+		/* see memcontrol.c */
+		ret = res_counter_memparse_write_strategy(buffer, &val);
+		if (ret)
+			break;
+		ret = tcp_update_limit(memcg, val);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static u64 tcp_read_stat(struct mem_cgroup *memcg, int type, u64 default_val)
+{
+	struct tcp_memcontrol *tcp;
+	struct cg_proto *cg_proto;
+
+	cg_proto = tcp_prot.proto_cgroup(memcg);
+	if (!cg_proto)
+		return default_val;
+
+	tcp = tcp_from_cgproto(cg_proto);
+	return res_counter_read_u64(&tcp->tcp_memory_allocated, type);
+}
+
+static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	u64 val;
+
+	switch (cft->private) {
+	case RES_LIMIT:
+		val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX);
+		break;
+	default:
+		BUG();
+	}
+	return val;
+}
+
+unsigned long long tcp_max_memory(const struct mem_cgroup *memcg)
+{
+	struct tcp_memcontrol *tcp;
+	struct cg_proto *cg_proto;
+
+	cg_proto = tcp_prot.proto_cgroup((struct mem_cgroup *)memcg);
+	if (!cg_proto)
+		return 0;
+
+	tcp = tcp_from_cgproto(cg_proto);
+	return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
+}
+
+void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx)
+{
+	struct tcp_memcontrol *tcp;
+	struct cg_proto *cg_proto;
+
+	cg_proto = tcp_prot.proto_cgroup(memcg);
+	if (!cg_proto)
+		return;
+
+	tcp = tcp_from_cgproto(cg_proto);
+
+	tcp->tcp_prot_mem[idx] = val;
+}
-- 
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v6 05/10] per-netns ipv4 sysctl_tcp_mem
From: Glauber Costa @ 2011-11-25 17:38 UTC (permalink / raw)
  To: linux-kernel
  Cc: lizf, kamezawa.hiroyu, ebiederm, davem, paul, gthelen, netdev,
	linux-mm, kirill, avagin, devel, eric.dumazet, cgroups,
	Glauber Costa
In-Reply-To: <1322242696-27682-1-git-send-email-glommer@parallels.com>

This patch allows each namespace to independently set up
its levels for tcp memory pressure thresholds. This patch
alone does not buy much: we need to make these values
per group of processes somehow. This is achieved in the
patches that follow in this patchset.

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: David S. Miller <davem@davemloft.net>
CC: Eric W. Biederman <ebiederm@xmission.com>
---
 include/net/netns/ipv4.h   |    1 +
 include/net/tcp.h          |    1 -
 net/ipv4/af_inet.c         |    2 +
 net/ipv4/sysctl_net_ipv4.c |   51 +++++++++++++++++++++++++++++++++++++------
 net/ipv4/tcp.c             |   11 +-------
 net/ipv4/tcp_ipv4.c        |    1 -
 net/ipv4/tcp_memcg.c       |    9 +++++--
 net/ipv6/af_inet6.c        |    2 +
 net/ipv6/tcp_ipv6.c        |    1 -
 9 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index d786b4f..bbd023a 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -55,6 +55,7 @@ struct netns_ipv4 {
 	int current_rt_cache_rebuild_count;
 
 	unsigned int sysctl_ping_group_range[2];
+	long sysctl_tcp_mem[3];
 
 	atomic_t rt_genid;
 	atomic_t dev_addr_genid;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index ccaa3b6..f3cc395 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -230,7 +230,6 @@ extern int sysctl_tcp_fack;
 extern int sysctl_tcp_reordering;
 extern int sysctl_tcp_ecn;
 extern int sysctl_tcp_dsack;
-extern long sysctl_tcp_mem[3];
 extern int sysctl_tcp_wmem[3];
 extern int sysctl_tcp_rmem[3];
 extern int sysctl_tcp_app_win;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1b5096a..a8bbcff 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1671,6 +1671,8 @@ static int __init inet_init(void)
 	ip_static_sysctl_init();
 #endif
 
+	tcp_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
+
 	/*
 	 *	Add all the base protocols.
 	 */
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 69fd720..bbd67ab 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/nsproxy.h>
+#include <linux/swap.h>
 #include <net/snmp.h>
 #include <net/icmp.h>
 #include <net/ip.h>
@@ -174,6 +175,36 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
 	return ret;
 }
 
+static int ipv4_tcp_mem(ctl_table *ctl, int write,
+			   void __user *buffer, size_t *lenp,
+			   loff_t *ppos)
+{
+	int ret;
+	unsigned long vec[3];
+	struct net *net = current->nsproxy->net_ns;
+
+	ctl_table tmp = {
+		.data = &vec,
+		.maxlen = sizeof(vec),
+		.mode = ctl->mode,
+	};
+
+	if (!write) {
+		ctl->data = &net->ipv4.sysctl_tcp_mem;
+		return proc_doulongvec_minmax(ctl, write, buffer, lenp, ppos);
+	}
+
+	ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (ret)
+		return ret;
+
+	net->ipv4.sysctl_tcp_mem[0] = vec[0];
+	net->ipv4.sysctl_tcp_mem[1] = vec[1];
+	net->ipv4.sysctl_tcp_mem[2] = vec[2];
+
+	return 0;
+}
+
 static struct ctl_table ipv4_table[] = {
 	{
 		.procname	= "tcp_timestamps",
@@ -433,13 +464,6 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec
 	},
 	{
-		.procname	= "tcp_mem",
-		.data		= &sysctl_tcp_mem,
-		.maxlen		= sizeof(sysctl_tcp_mem),
-		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax
-	},
-	{
 		.procname	= "tcp_wmem",
 		.data		= &sysctl_tcp_wmem,
 		.maxlen		= sizeof(sysctl_tcp_wmem),
@@ -721,6 +745,12 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= ipv4_ping_group_range,
 	},
+	{
+		.procname	= "tcp_mem",
+		.maxlen		= sizeof(init_net.ipv4.sysctl_tcp_mem),
+		.mode		= 0644,
+		.proc_handler	= ipv4_tcp_mem,
+	},
 	{ }
 };
 
@@ -734,6 +764,7 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
 static __net_init int ipv4_sysctl_init_net(struct net *net)
 {
 	struct ctl_table *table;
+	unsigned long limit;
 
 	table = ipv4_net_table;
 	if (!net_eq(net, &init_net)) {
@@ -769,6 +800,12 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
 
 	net->ipv4.sysctl_rt_cache_rebuild_count = 4;
 
+	limit = nr_free_buffer_pages() / 8;
+	limit = max(limit, 128UL);
+	net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
+	net->ipv4.sysctl_tcp_mem[1] = limit;
+	net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
+
 	net->ipv4.ipv4_hdr = register_net_sysctl_table(net,
 			net_ipv4_ctl_path, table);
 	if (net->ipv4.ipv4_hdr == NULL)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 89a2bfe..631e6b3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -282,11 +282,9 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
-long sysctl_tcp_mem[3] __read_mostly;
 int sysctl_tcp_wmem[3] __read_mostly;
 int sysctl_tcp_rmem[3] __read_mostly;
 
-EXPORT_SYMBOL(sysctl_tcp_mem);
 EXPORT_SYMBOL(sysctl_tcp_rmem);
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 
@@ -3274,14 +3272,9 @@ void __init tcp_init(void)
 	sysctl_tcp_max_orphans = cnt / 2;
 	sysctl_max_syn_backlog = max(128, cnt / 256);
 
-	limit = nr_free_buffer_pages() / 8;
-	limit = max(limit, 128UL);
-	sysctl_tcp_mem[0] = limit / 4 * 3;
-	sysctl_tcp_mem[1] = limit;
-	sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
-
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
-	limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7);
+	limit = ((unsigned long)init_net.ipv4.sysctl_tcp_mem[1])
+		<< (PAGE_SHIFT - 7);
 	max_share = min(4UL*1024*1024, limit);
 
 	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c517d04..8920f98 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2617,7 +2617,6 @@ struct proto tcp_prot = {
 	.orphan_count		= &tcp_orphan_count,
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
-	.sysctl_mem		= sysctl_tcp_mem,
 	.sysctl_wmem		= sysctl_tcp_wmem,
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
diff --git a/net/ipv4/tcp_memcg.c b/net/ipv4/tcp_memcg.c
index cc91918..1dbc0f3 100644
--- a/net/ipv4/tcp_memcg.c
+++ b/net/ipv4/tcp_memcg.c
@@ -1,6 +1,8 @@
 #include <net/tcp.h>
 #include <net/tcp_memcg.h>
 #include <net/sock.h>
+#include <net/ip.h>
+#include <linux/nsproxy.h>
 #include <linux/memcontrol.h>
 
 static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
@@ -20,6 +22,7 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
 	struct tcp_memcontrol *tcp;
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+	struct net *net = current->nsproxy->net_ns;
 
 	cg_proto = tcp_prot.proto_cgroup(memcg);
 	if (!cg_proto)
@@ -28,9 +31,9 @@ int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
 	tcp = tcp_from_cgproto(cg_proto);
 	cg_proto->parent = tcp_prot.proto_cgroup(parent);
 
-	tcp->tcp_prot_mem[0] = sysctl_tcp_mem[0];
-	tcp->tcp_prot_mem[1] = sysctl_tcp_mem[1];
-	tcp->tcp_prot_mem[2] = sysctl_tcp_mem[2];
+	tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
+	tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
+	tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
 	tcp->tcp_memory_pressure = 0;
 
 	if (cg_proto->parent)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index d27c797..49b2145 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1115,6 +1115,8 @@ static int __init inet6_init(void)
 	if (err)
 		goto static_sysctl_fail;
 #endif
+	tcpv6_prot.sysctl_mem = init_net.ipv4.sysctl_tcp_mem;
+
 	/*
 	 *	ipngwg API draft makes clear that the correct semantics
 	 *	for TCP and UDP is to consider one TCP and UDP instance
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index d57d7a7..b6451f2 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -2209,7 +2209,6 @@ struct proto tcpv6_prot = {
 	.memory_allocated	= &tcp_memory_allocated,
 	.memory_pressure	= &tcp_memory_pressure,
 	.orphan_count		= &tcp_orphan_count,
-	.sysctl_mem		= sysctl_tcp_mem,
 	.sysctl_wmem		= sysctl_tcp_wmem,
 	.sysctl_rmem		= sysctl_tcp_rmem,
 	.max_header		= MAX_TCP_HEADER,
-- 
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v6 04/10] Account tcp memory as kernel memory
From: Glauber Costa @ 2011-11-25 17:38 UTC (permalink / raw)
  To: linux-kernel
  Cc: lizf, kamezawa.hiroyu, ebiederm, davem, paul, gthelen, netdev,
	linux-mm, kirill, avagin, devel, eric.dumazet, cgroups,
	Glauber Costa, KAMEZAWA Hiroyuki
In-Reply-To: <1322242696-27682-1-git-send-email-glommer@parallels.com>

Now that we account and control tcp buffer memory for pressure
controlling purposes, display this information as part of the normal memcg
files and other usages.

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/memcontrol.h |    3 ++
 include/net/sock.h         |    3 ++
 include/net/tcp_memcg.h    |   17 +++++++++++
 mm/memcontrol.c            |   39 ++++++++++++++++++++++++--
 net/core/sock.c            |   42 ++++++++++++++++++++++++++--
 net/ipv4/Makefile          |    1 +
 net/ipv4/tcp_ipv4.c        |    8 +++++-
 net/ipv4/tcp_memcg.c       |   64 ++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/tcp_ipv6.c        |    4 +++
 9 files changed, 174 insertions(+), 7 deletions(-)
 create mode 100644 include/net/tcp_memcg.h
 create mode 100644 net/ipv4/tcp_memcg.c

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6644d90..1aff2f6 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -85,6 +85,9 @@ extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm);
 
+extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont);
+extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
+
 static inline
 int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
 {
diff --git a/include/net/sock.h b/include/net/sock.h
index d802761..da38de2 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -65,6 +65,9 @@
 #include <net/dst.h>
 #include <net/checksum.h>
 
+int sockets_populate(struct cgroup *cgrp, struct cgroup_subsys *ss);
+void sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss);
+
 /*
  * This structure really needs to be cleaned up.
  * Most of it is for TCP, and not used by any of
diff --git a/include/net/tcp_memcg.h b/include/net/tcp_memcg.h
new file mode 100644
index 0000000..5f5e158
--- /dev/null
+++ b/include/net/tcp_memcg.h
@@ -0,0 +1,17 @@
+#ifndef _TCP_MEMCG_H
+#define _TCP_MEMCG_H
+
+struct tcp_memcontrol {
+	struct cg_proto cg_proto;
+	/* per-cgroup tcp memory pressure knobs */
+	struct res_counter tcp_memory_allocated;
+	struct percpu_counter tcp_sockets_allocated;
+	/* those two are read-mostly, leave them at the end */
+	long tcp_prot_mem[3];
+	int tcp_memory_pressure;
+};
+
+struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg);
+int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
+void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss);
+#endif /* _TCP_MEMCG_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5f29194..2df5d3c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -49,6 +49,8 @@
 #include <linux/cpu.h>
 #include <linux/oom.h>
 #include "internal.h"
+#include <net/sock.h>
+#include <net/tcp_memcg.h>
 
 #include <asm/uaccess.h>
 
@@ -294,6 +296,10 @@ struct mem_cgroup {
 	 */
 	struct mem_cgroup_stat_cpu nocpu_base;
 	spinlock_t pcp_counter_lock;
+
+#ifdef CONFIG_INET
+	struct tcp_memcontrol tcp_mem;
+#endif
 };
 
 /* Stuffs for move charges at task migration. */
@@ -385,6 +391,7 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
 #ifdef CONFIG_INET
 #include <net/sock.h>
+#include <net/ip.h>
 
 void sock_update_memcg(struct sock *sk)
 {
@@ -406,13 +413,21 @@ void sock_update_memcg(struct sock *sk)
 	}
 }
 
+struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
+{
+	if (!memcg || mem_cgroup_is_root(memcg))
+		return NULL;
+
+	return &memcg->tcp_mem.cg_proto;
+}
+EXPORT_SYMBOL(tcp_proto_cgroup);
+
 #endif /* CONFIG_INET */
 #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
 
 
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
-static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 static void drain_all_stock_async(struct mem_cgroup *mem);
 
 static struct mem_cgroup_per_zone *
@@ -787,7 +802,7 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
 	}
 }
 
-static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
+struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 {
 	return container_of(cgroup_subsys_state(cont,
 				mem_cgroup_subsys_id), struct mem_cgroup,
@@ -4828,14 +4843,28 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
 
 	ret = cgroup_add_files(cont, ss, kmem_cgroup_files,
 			       ARRAY_SIZE(kmem_cgroup_files));
+
+	if (!ret)
+		ret = sockets_populate(cont, ss);
+
 	return ret;
 };
 
+static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+	sockets_destroy(cont, ss);
+}
 #else
 static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
 {
 	return 0;
 }
+
+static void kmem_cgroup_destroy(struct cgroup_subsys *ss,
+				struct cgroup *cont)
+{
+}
 #endif
 
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
@@ -4954,7 +4983,7 @@ static void mem_cgroup_put(struct mem_cgroup *mem)
 /*
  * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
  */
-static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
+struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
 {
 	if (!mem->res.parent)
 		return NULL;
@@ -5037,6 +5066,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		res_counter_init(&mem->res, &parent->res);
 		res_counter_init(&mem->memsw, &parent->memsw);
 		res_counter_init(&mem->kmem, &parent->kmem);
+
 		/*
 		 * We increment refcnt of the parent to ensure that we can
 		 * safely access it on res_counter_charge/uncharge.
@@ -5053,6 +5083,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	mem->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&mem->oom_notify);
 
+
 	if (parent)
 		mem->swappiness = mem_cgroup_swappiness(parent);
 	atomic_set(&mem->refcnt, 1);
@@ -5078,6 +5109,8 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
 
+	kmem_cgroup_destroy(ss, cont);
+
 	mem_cgroup_put(mem);
 }
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 8382c80..399e566 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -135,6 +135,45 @@
 #include <net/tcp.h>
 #endif
 
+static DEFINE_RWLOCK(proto_list_lock);
+static LIST_HEAD(proto_list);
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+int sockets_populate(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+	struct proto *proto;
+	int ret = 0;
+
+	read_lock(&proto_list_lock);
+	list_for_each_entry(proto, &proto_list, node) {
+		if (proto->init_cgroup)
+			ret = proto->init_cgroup(cgrp, ss);
+			if (ret)
+				goto out;
+	}
+
+	read_unlock(&proto_list_lock);
+	return ret;
+out:
+	list_for_each_entry_continue_reverse(proto, &proto_list, node)
+		if (proto->destroy_cgroup)
+			proto->destroy_cgroup(cgrp, ss);
+	read_unlock(&proto_list_lock);
+	return ret;
+}
+
+void sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+	struct proto *proto;
+
+	read_lock(&proto_list_lock);
+	list_for_each_entry_reverse(proto, &proto_list, node)
+		if (proto->destroy_cgroup)
+			proto->destroy_cgroup(cgrp, ss);
+	read_unlock(&proto_list_lock);
+}
+#endif
+
 /*
  * Each address family might have different locking rules, so we have
  * one slock key per address family:
@@ -2259,9 +2298,6 @@ void sk_common_release(struct sock *sk)
 }
 EXPORT_SYMBOL(sk_common_release);
 
-static DEFINE_RWLOCK(proto_list_lock);
-static LIST_HEAD(proto_list);
-
 #ifdef CONFIG_PROC_FS
 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
 struct prot_inuse {
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f2dc69c..393e0af 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcg.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f124a4b..c517d04 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -73,6 +73,7 @@
 #include <net/xfrm.h>
 #include <net/netdma.h>
 #include <net/secure_seq.h>
+#include <net/tcp_memcg.h>
 
 #include <linux/inet.h>
 #include <linux/ipv6.h>
@@ -1917,6 +1918,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	sk_sockets_allocated_inc(sk);
 	local_bh_enable();
 
+	sock_update_memcg(sk);
 	return 0;
 }
 
@@ -2629,10 +2631,14 @@ struct proto tcp_prot = {
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,
 #endif
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	.init_cgroup		= tcp_init_cgroup,
+	.destroy_cgroup		= tcp_destroy_cgroup,
+	.proto_cgroup		= tcp_proto_cgroup,
+#endif
 };
 EXPORT_SYMBOL(tcp_prot);
 
-
 static int __net_init tcp_sk_init(struct net *net)
 {
 	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
diff --git a/net/ipv4/tcp_memcg.c b/net/ipv4/tcp_memcg.c
new file mode 100644
index 0000000..cc91918
--- /dev/null
+++ b/net/ipv4/tcp_memcg.c
@@ -0,0 +1,64 @@
+#include <net/tcp.h>
+#include <net/tcp_memcg.h>
+#include <net/sock.h>
+#include <linux/memcontrol.h>
+
+static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
+{
+	return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
+}
+
+int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+	/*
+	 * The root cgroup does not use res_counters, but rather,
+	 * rely on the data already collected by the network
+	 * subsystem
+	 */
+	struct res_counter *res_parent = NULL;
+	struct cg_proto *cg_proto;
+	struct tcp_memcontrol *tcp;
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+
+	cg_proto = tcp_prot.proto_cgroup(memcg);
+	if (!cg_proto)
+		return 0;
+
+	tcp = tcp_from_cgproto(cg_proto);
+	cg_proto->parent = tcp_prot.proto_cgroup(parent);
+
+	tcp->tcp_prot_mem[0] = sysctl_tcp_mem[0];
+	tcp->tcp_prot_mem[1] = sysctl_tcp_mem[1];
+	tcp->tcp_prot_mem[2] = sysctl_tcp_mem[2];
+	tcp->tcp_memory_pressure = 0;
+
+	if (cg_proto->parent)
+		res_parent = cg_proto->parent->memory_allocated;
+
+	res_counter_init(&tcp->tcp_memory_allocated, res_parent);
+	percpu_counter_init(&tcp->tcp_sockets_allocated, 0);
+
+	cg_proto->memory_pressure = &tcp->tcp_memory_pressure;
+	cg_proto->sysctl_mem = tcp->tcp_prot_mem;
+	cg_proto->memory_allocated = &tcp->tcp_memory_allocated;
+	cg_proto->sockets_allocated = &tcp->tcp_sockets_allocated;
+
+	return 0;
+}
+EXPORT_SYMBOL(tcp_init_cgroup);
+
+void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+	struct cg_proto *cg_proto;
+	struct tcp_memcontrol *tcp;
+
+	cg_proto = tcp_prot.proto_cgroup(memcg);
+	if (!cg_proto)
+		return;
+
+	tcp = tcp_from_cgproto(cg_proto);
+	percpu_counter_destroy(&tcp->tcp_sockets_allocated);
+}
+EXPORT_SYMBOL(tcp_destroy_cgroup);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 3a08fcd..d57d7a7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -62,6 +62,7 @@
 #include <net/netdma.h>
 #include <net/inet_common.h>
 #include <net/secure_seq.h>
+#include <net/tcp_memcg.h>
 
 #include <asm/uaccess.h>
 
@@ -2222,6 +2223,9 @@ struct proto tcpv6_prot = {
 	.compat_setsockopt	= compat_tcp_setsockopt,
 	.compat_getsockopt	= compat_tcp_getsockopt,
 #endif
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	.proto_cgroup		= tcp_proto_cgroup,
+#endif
 };
 
 static const struct inet6_protocol tcpv6_protocol = {
-- 
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v6 03/10] socket: initial cgroup code.
From: Glauber Costa @ 2011-11-25 17:38 UTC (permalink / raw)
  To: linux-kernel
  Cc: lizf, kamezawa.hiroyu, ebiederm, davem, paul, gthelen, netdev,
	linux-mm, kirill, avagin, devel, eric.dumazet, cgroups,
	Glauber Costa
In-Reply-To: <1322242696-27682-1-git-send-email-glommer@parallels.com>

The goal of this work is to move the tcp memory pressure
controls to a cgroup, instead of just relying on global
conditions.

To avoid excessive overhead in the network fast paths,
the code that accounts allocated memory to a cgroup is
hidden inside a static_branch(). This branch is patched out
until the first non-root cgroup is created. So when nobody
is using cgroups, even if they are mounted, no significant performance
penalty should be seen.

This patch handles the generic part of the code, and has nothing
tcp-specific.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Acked-by: Kirill A. Shutemov <kirill@shutemov.name>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: David S. Miller <davem@davemloft.net>
CC: Eric W. Biederman <ebiederm@xmission.com>
CC: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/memcontrol.h |   16 ++++
 include/net/sock.h         |  169 +++++++++++++++++++++++++++++++++++++++++++-
 mm/memcontrol.c            |   40 +++++++++--
 net/core/sock.c            |   21 ++++--
 4 files changed, 230 insertions(+), 16 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ac797fa..6644d90 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -377,5 +377,21 @@ mem_cgroup_print_bad_page(struct page *page)
 }
 #endif
 
+#ifdef CONFIG_INET
+enum {
+	UNDER_LIMIT,
+	SOFT_LIMIT,
+	OVER_LIMIT,
+};
+
+struct sock;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+void sock_update_memcg(struct sock *sk);
+#else
+static inline void sock_update_memcg(struct sock *sk)
+{
+}
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
+#endif /* CONFIG_INET */
 #endif /* _LINUX_MEMCONTROL_H */
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 0d054e0..d802761 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -55,6 +55,7 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 #include <linux/memcontrol.h>
+#include <linux/res_counter.h>
 
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
@@ -169,6 +170,7 @@ struct sock_common {
 	/* public: */
 };
 
+struct cg_proto;
 /**
   *	struct sock - network layer representation of sockets
   *	@__sk_common: shared layout with inet_timewait_sock
@@ -229,6 +231,7 @@ struct sock_common {
   *	@sk_security: used by security modules
   *	@sk_mark: generic packet mark
   *	@sk_classid: this socket's cgroup classid
+  *	@sk_cgrp: this socket's cgroup-specific proto data
   *	@sk_write_pending: a write to stream socket waits to start
   *	@sk_state_change: callback to indicate change in the state of the sock
   *	@sk_data_ready: callback to indicate there is data to be processed
@@ -340,6 +343,7 @@ struct sock {
 #endif
 	__u32			sk_mark;
 	u32			sk_classid;
+	struct cg_proto		*sk_cgrp;
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk, int bytes);
 	void			(*sk_write_space)(struct sock *sk);
@@ -834,6 +838,27 @@ struct proto {
 #ifdef SOCK_REFCNT_DEBUG
 	atomic_t		socks;
 #endif
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	/*
+	 * cgroup specific init/deinit functions. Called once for all
+	 * protocols that implement it, from cgroups populate function.
+	 * This function has to set up any files the protocol wants to
+	 * appear in the kmem cgroup filesystem.
+	 */
+	int			(*init_cgroup)(struct cgroup *cgrp,
+					       struct cgroup_subsys *ss);
+	void			(*destroy_cgroup)(struct cgroup *cgrp,
+						  struct cgroup_subsys *ss);
+	struct cg_proto		*(*proto_cgroup)(struct mem_cgroup *memcg);
+#endif
+};
+
+struct cg_proto {
+	struct res_counter	*memory_allocated;	/* Current allocated memory. */
+	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
+	int			*memory_pressure;
+	long			*sysctl_mem;
+	struct cg_proto		*parent;
 };
 
 extern int proto_register(struct proto *prot, int alloc_slab);
@@ -864,47 +889,149 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 #define sk_refcnt_debug_release(sk) do { } while (0)
 #endif /* SOCK_REFCNT_DEBUG */
 
+extern struct jump_label_key memcg_socket_limit_enabled;
 static inline int *sk_memory_pressure(const struct sock *sk)
 {
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	if (static_branch(&memcg_socket_limit_enabled)) {
+		int *ret = NULL;
+		struct cg_proto *cg_proto = sk->sk_cgrp;
+
+		if (!cg_proto)
+			goto nocgroup;
+		if (cg_proto->memory_pressure)
+			ret = cg_proto->memory_pressure;
+		return ret;
+	} else
+nocgroup:
+#endif
+
 	return sk->sk_prot->memory_pressure;
 }
 
 static inline long sk_prot_mem(const struct sock *sk, int index)
 {
 	long *prot = sk->sk_prot->sysctl_mem;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	if (static_branch(&memcg_socket_limit_enabled)) {
+		struct cg_proto *cg_proto = sk->sk_cgrp;
+		if (!cg_proto) /* this handles the case with existing sockets */
+			goto nocgroup;
+
+		prot = cg_proto->sysctl_mem;
+	}
+nocgroup:
+#endif
 	return prot[index];
 }
 
+static inline void memcg_memory_allocated_add(struct cg_proto *prot,
+					      unsigned long amt,
+					      int *parent_status)
+{
+	struct res_counter *fail;
+	int ret;
+
+	ret = res_counter_charge(prot->memory_allocated,
+				 amt << PAGE_SHIFT, &fail);
+
+	if (ret < 0)
+		*parent_status = OVER_LIMIT;
+}
+
+static inline void memcg_memory_allocated_sub(struct cg_proto *prot,
+					      unsigned long amt)
+{
+	res_counter_uncharge(prot->memory_allocated, amt << PAGE_SHIFT);
+}
+
+static inline u64 memcg_memory_allocated_read(struct cg_proto *prot)
+{
+	u64 ret;
+	ret = res_counter_read_u64(prot->memory_allocated, RES_USAGE);
+	return ret >> PAGE_SHIFT;
+}
+
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
 	struct proto *prot = sk->sk_prot;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	if (static_branch(&memcg_socket_limit_enabled)) {
+		struct cg_proto *cg_proto = sk->sk_cgrp;
+		if (!cg_proto) /* this handles the case with existing sockets */
+			goto nocgroup;
+
+		return memcg_memory_allocated_read(cg_proto);
+	}
+nocgroup:
+#endif
 	return atomic_long_read(prot->memory_allocated);
 }
 
 static inline long
-sk_memory_allocated_add(struct sock *sk, int amt)
+sk_memory_allocated_add(struct sock *sk, int amt, int *parent_status)
 {
 	struct proto *prot = sk->sk_prot;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	if (static_branch(&memcg_socket_limit_enabled)) {
+		struct cg_proto *cg_proto = sk->sk_cgrp;
+
+		if (!cg_proto)
+			goto nocgroup;
+
+		memcg_memory_allocated_add(cg_proto, amt, parent_status);
+	}
+nocgroup:
+#endif
 	return atomic_long_add_return(amt, prot->memory_allocated);
 }
 
 static inline void
-sk_memory_allocated_sub(struct sock *sk, int amt)
+sk_memory_allocated_sub(struct sock *sk, int amt, int parent_status)
 {
 	struct proto *prot = sk->sk_prot;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	if (static_branch(&memcg_socket_limit_enabled)) {
+		struct cg_proto *cg_proto = sk->sk_cgrp;
+
+		if (!cg_proto)
+			goto nocgroup;
+
+		/* Otherwise it was uncharged already */
+		if (parent_status != OVER_LIMIT)
+			memcg_memory_allocated_sub(cg_proto, amt);
+	}
+nocgroup:
+#endif
 	atomic_long_sub(amt, prot->memory_allocated);
 }
 
 static inline void sk_sockets_allocated_dec(struct sock *sk)
 {
 	struct proto *prot = sk->sk_prot;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	if (static_branch(&memcg_socket_limit_enabled)) {
+		struct cg_proto *cg_proto = sk->sk_cgrp;
+
+		for (; cg_proto; cg_proto = cg_proto->parent)
+			percpu_counter_dec(cg_proto->sockets_allocated);
+	}
+#endif
 	percpu_counter_dec(prot->sockets_allocated);
 }
 
 static inline void sk_sockets_allocated_inc(struct sock *sk)
 {
 	struct proto *prot = sk->sk_prot;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	if (static_branch(&memcg_socket_limit_enabled)) {
+		struct cg_proto *cg_proto = sk->sk_cgrp;
+
+		for (; cg_proto; cg_proto = cg_proto->parent)
+			percpu_counter_inc(cg_proto->sockets_allocated);
+	}
+#endif
 	percpu_counter_inc(prot->sockets_allocated);
 }
 
@@ -912,19 +1039,57 @@ static inline int
 sk_sockets_allocated_read_positive(struct sock *sk)
 {
 	struct proto *prot = sk->sk_prot;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	if (static_branch(&memcg_socket_limit_enabled)) {
+		struct cg_proto *cg_proto = sk->sk_cgrp;
 
+		if (!cg_proto)
+			goto nocgroup;
+
+		return percpu_counter_sum_positive(cg_proto->sockets_allocated);
+	}
+nocgroup:
+#endif
 	return percpu_counter_sum_positive(prot->sockets_allocated);
 }
 
 static inline int
 kcg_sockets_allocated_sum_positive(struct proto *prot, struct mem_cgroup *cg)
 {
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	if (static_branch(&memcg_socket_limit_enabled)) {
+		struct cg_proto *cg_proto;
+		if (!prot->proto_cgroup)
+			goto nocgroup;
+
+		cg_proto = prot->proto_cgroup(cg);
+		if (!cg_proto)
+			goto nocgroup;
+
+		return percpu_counter_sum_positive(cg_proto->sockets_allocated);
+	}
+nocgroup:
+#endif
 	return percpu_counter_sum_positive(prot->sockets_allocated);
 }
 
 static inline long
 kcg_memory_allocated(struct proto *prot, struct mem_cgroup *cg)
 {
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	if (static_branch(&memcg_socket_limit_enabled)) {
+		struct cg_proto *cg_proto;
+		if (!prot->proto_cgroup)
+			goto nocgroup;
+
+		cg_proto = prot->proto_cgroup(cg);
+		if (!cg_proto)
+			goto nocgroup;
+
+		return memcg_memory_allocated_read(cg_proto);
+	}
+nocgroup:
+#endif
 	return atomic_long_read(prot->memory_allocated);
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1cb7daa..5f29194 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -376,6 +376,40 @@ enum mem_type {
 #define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
 #define MEM_CGROUP_RECLAIM_SOFT		(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
 
+static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
+{
+	return (mem == root_mem_cgroup);
+}
+
+/* Writing them here to avoid exposing memcg's inner layout */
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+#ifdef CONFIG_INET
+#include <net/sock.h>
+
+void sock_update_memcg(struct sock *sk)
+{
+	/* right now a socket spends its whole life in the same cgroup */
+	if (sk->sk_cgrp) {
+		WARN_ON(1);
+		return;
+	}
+	if (static_branch(&memcg_socket_limit_enabled)) {
+		struct mem_cgroup *memcg;
+
+		BUG_ON(!sk->sk_prot->proto_cgroup);
+
+		rcu_read_lock();
+		memcg = mem_cgroup_from_task(current);
+		if (!mem_cgroup_is_root(memcg))
+			sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
+		rcu_read_unlock();
+	}
+}
+
+#endif /* CONFIG_INET */
+#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
+
+
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
@@ -872,12 +906,6 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
 #define for_each_mem_cgroup_all(iter) \
 	for_each_mem_cgroup_tree_cond(iter, NULL, true)
 
-
-static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
-{
-	return (mem == root_mem_cgroup);
-}
-
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 	struct mem_cgroup *mem;
diff --git a/net/core/sock.c b/net/core/sock.c
index 26bdb1c..8382c80 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -111,6 +111,7 @@
 #include <linux/init.h>
 #include <linux/highmem.h>
 #include <linux/user_namespace.h>
+#include <linux/jump_label.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -141,6 +142,9 @@
 static struct lock_class_key af_family_keys[AF_MAX];
 static struct lock_class_key af_family_slock_keys[AF_MAX];
 
+struct jump_label_key memcg_socket_limit_enabled;
+EXPORT_SYMBOL(memcg_socket_limit_enabled);
+
 /*
  * Make lock validator output more readable. (we pre-construct these
  * strings build-time, so that runtime initialization of socket
@@ -1678,24 +1682,25 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 	int amt = sk_mem_pages(size);
 	long allocated;
 	int *memory_pressure;
+	int parent_status = UNDER_LIMIT;
 
 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
 
 	memory_pressure = sk_memory_pressure(sk);
-	allocated = sk_memory_allocated_add(sk, amt);
+	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
 
 	/* Under limit. */
-	if (allocated <= sk_prot_mem(sk, 0))
+	if (parent_status == UNDER_LIMIT && allocated <= sk_prot_mem(sk, 0))
 		if (memory_pressure && *memory_pressure)
 			*memory_pressure = 0;
 
-	/* Under pressure. */
-	if (allocated > sk_prot_mem(sk, 1))
+	/* Under pressure. (we or our parents) */
+	if ((parent_status > SOFT_LIMIT) || allocated > sk_prot_mem(sk, 1))
 		if (prot->enter_memory_pressure)
 			prot->enter_memory_pressure(sk);
 
-	/* Over hard limit. */
-	if (allocated > sk_prot_mem(sk, 2))
+	/* Over hard limit (we or our parents) */
+	if ((parent_status == OVER_LIMIT) || (allocated > sk_prot_mem(sk, 2)))
 		goto suppress_allocation;
 
 	/* guarantee minimum buffer size under pressure */
@@ -1742,7 +1747,7 @@ suppress_allocation:
 	/* Alas. Undo changes. */
 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
 
-	sk_memory_allocated_sub(sk, amt);
+	sk_memory_allocated_sub(sk, amt, parent_status);
 
 	return 0;
 }
@@ -1757,7 +1762,7 @@ void __sk_mem_reclaim(struct sock *sk)
 	int *memory_pressure = sk_memory_pressure(sk);
 
 	sk_memory_allocated_sub(sk,
-				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
+				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0);
 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
 
 	if (memory_pressure && *memory_pressure &&
-- 
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v6 02/10] foundations of per-cgroup memory pressure controlling.
From: Glauber Costa @ 2011-11-25 17:38 UTC (permalink / raw)
  To: linux-kernel
  Cc: lizf, kamezawa.hiroyu, ebiederm, davem, paul, gthelen, netdev,
	linux-mm, kirill, avagin, devel, eric.dumazet, cgroups,
	Glauber Costa
In-Reply-To: <1322242696-27682-1-git-send-email-glommer@parallels.com>

This patch replaces all uses of struct sock fields' memory_pressure,
memory_allocated, sockets_allocated, and sysctl_mem with accessor
macros. Those macros can either receive a socket argument, or a mem_cgroup
argument, depending on the context they live in.

Since we're only doing a macro wrapping here, no performance impact at all is
expected in the case where we don't have cgroups disabled.

Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: David S. Miller <davem@davemloft.net>
CC: Hiroyuki Kamezawa <kamezawa.hiroyu@jp.fujitsu.com>
CC: Eric W. Biederman <ebiederm@xmission.com>
CC: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/net/sock.h    |   64 +++++++++++++++++++++++++++++++++++++++++++++++++
 include/net/tcp.h     |    3 +-
 net/core/sock.c       |   55 +++++++++++++++++++++++++-----------------
 net/ipv4/proc.c       |    7 +++--
 net/ipv4/tcp.c        |    6 +++-
 net/ipv4/tcp_input.c  |   12 ++++----
 net/ipv4/tcp_ipv4.c   |    4 +-
 net/ipv4/tcp_output.c |    2 +-
 net/ipv4/tcp_timer.c  |    2 +-
 net/ipv6/tcp_ipv6.c   |    2 +-
 10 files changed, 118 insertions(+), 39 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index c6658be..0d054e0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -54,6 +54,7 @@
 #include <linux/security.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/memcontrol.h>
 
 #include <linux/filter.h>
 #include <linux/rculist_nulls.h>
@@ -863,6 +864,69 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 #define sk_refcnt_debug_release(sk) do { } while (0)
 #endif /* SOCK_REFCNT_DEBUG */
 
+static inline int *sk_memory_pressure(const struct sock *sk)
+{
+	return sk->sk_prot->memory_pressure;
+}
+
+static inline long sk_prot_mem(const struct sock *sk, int index)
+{
+	long *prot = sk->sk_prot->sysctl_mem;
+	return prot[index];
+}
+
+static inline long
+sk_memory_allocated(const struct sock *sk)
+{
+	struct proto *prot = sk->sk_prot;
+	return atomic_long_read(prot->memory_allocated);
+}
+
+static inline long
+sk_memory_allocated_add(struct sock *sk, int amt)
+{
+	struct proto *prot = sk->sk_prot;
+	return atomic_long_add_return(amt, prot->memory_allocated);
+}
+
+static inline void
+sk_memory_allocated_sub(struct sock *sk, int amt)
+{
+	struct proto *prot = sk->sk_prot;
+	atomic_long_sub(amt, prot->memory_allocated);
+}
+
+static inline void sk_sockets_allocated_dec(struct sock *sk)
+{
+	struct proto *prot = sk->sk_prot;
+	percpu_counter_dec(prot->sockets_allocated);
+}
+
+static inline void sk_sockets_allocated_inc(struct sock *sk)
+{
+	struct proto *prot = sk->sk_prot;
+	percpu_counter_inc(prot->sockets_allocated);
+}
+
+static inline int
+sk_sockets_allocated_read_positive(struct sock *sk)
+{
+	struct proto *prot = sk->sk_prot;
+
+	return percpu_counter_sum_positive(prot->sockets_allocated);
+}
+
+static inline int
+kcg_sockets_allocated_sum_positive(struct proto *prot, struct mem_cgroup *cg)
+{
+	return percpu_counter_sum_positive(prot->sockets_allocated);
+}
+
+static inline long
+kcg_memory_allocated(struct proto *prot, struct mem_cgroup *cg)
+{
+	return atomic_long_read(prot->memory_allocated);
+}
 
 #ifdef CONFIG_PROC_FS
 /* Called with local bh disabled */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e147f42..ccaa3b6 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -44,6 +44,7 @@
 #include <net/dst.h>
 
 #include <linux/seq_file.h>
+#include <linux/memcontrol.h>
 
 extern struct inet_hashinfo tcp_hashinfo;
 
@@ -285,7 +286,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 	}
 
 	if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
-	    atomic_long_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])
+	    sk_memory_allocated(sk) > sk_prot_mem(sk, 2))
 		return true;
 	return false;
 }
diff --git a/net/core/sock.c b/net/core/sock.c
index 4ed7b1d..26bdb1c 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1288,7 +1288,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		newsk->sk_wq = NULL;
 
 		if (newsk->sk_prot->sockets_allocated)
-			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
+			sk_sockets_allocated_inc(newsk);
 
 		if (sock_flag(newsk, SOCK_TIMESTAMP) ||
 		    sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
@@ -1677,30 +1677,32 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 	struct proto *prot = sk->sk_prot;
 	int amt = sk_mem_pages(size);
 	long allocated;
+	int *memory_pressure;
 
 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
-	allocated = atomic_long_add_return(amt, prot->memory_allocated);
+
+	memory_pressure = sk_memory_pressure(sk);
+	allocated = sk_memory_allocated_add(sk, amt);
 
 	/* Under limit. */
-	if (allocated <= prot->sysctl_mem[0]) {
-		if (prot->memory_pressure && *prot->memory_pressure)
-			*prot->memory_pressure = 0;
-		return 1;
-	}
+	if (allocated <= sk_prot_mem(sk, 0))
+		if (memory_pressure && *memory_pressure)
+			*memory_pressure = 0;
 
 	/* Under pressure. */
-	if (allocated > prot->sysctl_mem[1])
+	if (allocated > sk_prot_mem(sk, 1))
 		if (prot->enter_memory_pressure)
 			prot->enter_memory_pressure(sk);
 
 	/* Over hard limit. */
-	if (allocated > prot->sysctl_mem[2])
+	if (allocated > sk_prot_mem(sk, 2))
 		goto suppress_allocation;
 
 	/* guarantee minimum buffer size under pressure */
 	if (kind == SK_MEM_RECV) {
 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
 			return 1;
+
 	} else { /* SK_MEM_SEND */
 		if (sk->sk_type == SOCK_STREAM) {
 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
@@ -1710,13 +1712,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
 				return 1;
 	}
 
-	if (prot->memory_pressure) {
+	if (memory_pressure) {
 		int alloc;
 
-		if (!*prot->memory_pressure)
+		if (!*memory_pressure)
 			return 1;
-		alloc = percpu_counter_read_positive(prot->sockets_allocated);
-		if (prot->sysctl_mem[2] > alloc *
+		alloc = sk_sockets_allocated_read_positive(sk);
+		if (sk_prot_mem(sk, 2) > alloc *
 		    sk_mem_pages(sk->sk_wmem_queued +
 				 atomic_read(&sk->sk_rmem_alloc) +
 				 sk->sk_forward_alloc))
@@ -1739,7 +1741,9 @@ suppress_allocation:
 
 	/* Alas. Undo changes. */
 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
-	atomic_long_sub(amt, prot->memory_allocated);
+
+	sk_memory_allocated_sub(sk, amt);
+
 	return 0;
 }
 EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1750,15 +1754,15 @@ EXPORT_SYMBOL(__sk_mem_schedule);
  */
 void __sk_mem_reclaim(struct sock *sk)
 {
-	struct proto *prot = sk->sk_prot;
+	int *memory_pressure = sk_memory_pressure(sk);
 
-	atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
-		   prot->memory_allocated);
+	sk_memory_allocated_sub(sk,
+				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
 
-	if (prot->memory_pressure && *prot->memory_pressure &&
-	    (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
-		*prot->memory_pressure = 0;
+	if (memory_pressure && *memory_pressure &&
+	    (sk_memory_allocated(sk) < sk_prot_mem(sk, 0)))
+		*memory_pressure = 0;
 }
 EXPORT_SYMBOL(__sk_mem_reclaim);
 
@@ -2477,13 +2481,20 @@ static char proto_method_implemented(const void *method)
 
 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
 {
+	struct mem_cgroup *cg = mem_cgroup_from_task(current);
+	int *memory_pressure = NULL;
+
+	if (proto->memory_pressure)
+		memory_pressure = proto->memory_pressure;
+
 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
 		   proto->name,
 		   proto->obj_size,
 		   sock_prot_inuse_get(seq_file_net(seq), proto),
-		   proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
-		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+		   proto->memory_allocated != NULL ?
+			kcg_memory_allocated(proto, cg) : -1L,
+		   memory_pressure != NULL ? *memory_pressure ? "yes" : "no" : "NI",
 		   proto->max_header,
 		   proto->slab == NULL ? "no" : "yes",
 		   module_name(proto->owner),
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 4bfad5d..535456d 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -52,20 +52,21 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq->private;
 	int orphans, sockets;
+	struct mem_cgroup *cg = mem_cgroup_from_task(current);
 
 	local_bh_disable();
 	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
-	sockets = percpu_counter_sum_positive(&tcp_sockets_allocated);
+	sockets = kcg_sockets_allocated_sum_positive(&tcp_prot, cg);
 	local_bh_enable();
 
 	socket_seq_show(seq);
 	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
 		   sock_prot_inuse_get(net, &tcp_prot), orphans,
 		   tcp_death_row.tw_count, sockets,
-		   atomic_long_read(&tcp_memory_allocated));
+		   kcg_memory_allocated(&tcp_prot, cg));
 	seq_printf(seq, "UDP: inuse %d mem %ld\n",
 		   sock_prot_inuse_get(net, &udp_prot),
-		   atomic_long_read(&udp_memory_allocated));
+		   kcg_memory_allocated(&udp_prot, cg));
 	seq_printf(seq, "UDPLITE: inuse %d\n",
 		   sock_prot_inuse_get(net, &udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 34f5db1..89a2bfe 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -319,9 +319,11 @@ EXPORT_SYMBOL(tcp_memory_pressure);
 
 void tcp_enter_memory_pressure(struct sock *sk)
 {
-	if (!tcp_memory_pressure) {
+	int *memory_pressure = sk_memory_pressure(sk);
+
+	if (!*memory_pressure) {
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
-		tcp_memory_pressure = 1;
+		*memory_pressure = 1;
 	}
 }
 EXPORT_SYMBOL(tcp_enter_memory_pressure);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 52b5c2d..3df862d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -322,7 +322,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
 	/* Check #1 */
 	if (tp->rcv_ssthresh < tp->window_clamp &&
 	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
-	    !tcp_memory_pressure) {
+	    !sk_memory_pressure(sk)) {
 		int incr;
 
 		/* Check #2. Increase window, if skb with such overhead
@@ -411,8 +411,8 @@ static void tcp_clamp_window(struct sock *sk)
 
 	if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
 	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
-	    !tcp_memory_pressure &&
-	    atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+	    !sk_memory_pressure(sk) &&
+	    sk_memory_allocated(sk) < sk_prot_mem(sk, 0)) {
 		sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
 				    sysctl_tcp_rmem[2]);
 	}
@@ -4864,7 +4864,7 @@ static int tcp_prune_queue(struct sock *sk)
 
 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
 		tcp_clamp_window(sk);
-	else if (tcp_memory_pressure)
+	else if (sk_memory_pressure(sk))
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
 
 	tcp_collapse_ofo_queue(sk);
@@ -4930,11 +4930,11 @@ static int tcp_should_expand_sndbuf(const struct sock *sk)
 		return 0;
 
 	/* If we are under global TCP memory pressure, do not expand.  */
-	if (tcp_memory_pressure)
+	if (sk_memory_pressure(sk))
 		return 0;
 
 	/* If we are under soft global TCP memory pressure, do not expand.  */
-	if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0])
+	if (sk_memory_allocated(sk) >= sk_prot_mem(sk, 0))
 		return 0;
 
 	/* If we filled the congestion window, do not expand.  */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 0ea10ee..f124a4b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1914,7 +1914,7 @@ static int tcp_v4_init_sock(struct sock *sk)
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
 	local_bh_disable();
-	percpu_counter_inc(&tcp_sockets_allocated);
+	sk_sockets_allocated_inc(sk);
 	local_bh_enable();
 
 	return 0;
@@ -1970,7 +1970,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
 		tp->cookie_values = NULL;
 	}
 
-	percpu_counter_dec(&tcp_sockets_allocated);
+	sk_sockets_allocated_dec(sk);
 }
 EXPORT_SYMBOL(tcp_v4_destroy_sock);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 980b98f..04e229b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1919,7 +1919,7 @@ u32 __tcp_select_window(struct sock *sk)
 	if (free_space < (full_space >> 1)) {
 		icsk->icsk_ack.quick = 0;
 
-		if (tcp_memory_pressure)
+		if (sk_memory_pressure(sk))
 			tp->rcv_ssthresh = min(tp->rcv_ssthresh,
 					       4U * tp->advmss);
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 2e0f0af..c9f830c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -261,7 +261,7 @@ static void tcp_delack_timer(unsigned long data)
 	}
 
 out:
-	if (tcp_memory_pressure)
+	if (sk_memory_pressure(sk))
 		sk_mem_reclaim(sk);
 out_unlock:
 	bh_unlock_sock(sk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 10b2b31..3a08fcd 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1995,7 +1995,7 @@ static int tcp_v6_init_sock(struct sock *sk)
 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
 	local_bh_disable();
-	percpu_counter_inc(&tcp_sockets_allocated);
+	sk_sockets_allocated_inc(sk);
 	local_bh_enable();
 
 	return 0;
-- 
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v6 01/10] Basic kernel memory functionality for the Memory Controller
From: Glauber Costa @ 2011-11-25 17:38 UTC (permalink / raw)
  To: linux-kernel
  Cc: lizf, kamezawa.hiroyu, ebiederm, davem, paul, gthelen, netdev,
	linux-mm, kirill, avagin, devel, eric.dumazet, cgroups,
	Glauber Costa
In-Reply-To: <1322242696-27682-1-git-send-email-glommer@parallels.com>

This patch lays down the foundation for the kernel memory component
of the Memory Controller.

As of today, I am only laying down the following files:

 * memory.independent_kmem_limit
 * memory.kmem.limit_in_bytes (currently ignored)
 * memory.kmem.usage_in_bytes (always zero)

Signed-off-by: Glauber Costa <glommer@parallels.com>
Reviewed-by: Kirill A. Shutemov <kirill@shutemov.name>
CC: Paul Menage <paul@paulmenage.org>
CC: Greg Thelen <gthelen@google.com>
---
 Documentation/cgroups/memory.txt |   36 ++++++++++++-
 init/Kconfig                     |   14 +++++
 mm/memcontrol.c                  |  107 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 150 insertions(+), 7 deletions(-)

diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 06eb6d9..bf00cd2 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -44,8 +44,9 @@ Features:
  - oom-killer disable knob and oom-notifier
  - Root cgroup has no limit controls.
 
- Kernel memory and Hugepages are not under control yet. We just manage
- pages on LRU. To add more controls, we have to take care of performance.
+ Hugepages are not under control yet. We just manage pages on LRU. To add more
+ controls, we have to take care of performance. Kernel memory support is work
+ in progress, and the current version provides basic functionality.
 
 Brief summary of control files.
 
@@ -56,8 +57,11 @@ Brief summary of control files.
 				 (See 5.5 for details)
  memory.memsw.usage_in_bytes	 # show current res_counter usage for memory+Swap
 				 (See 5.5 for details)
+ memory.kmem.usage_in_bytes	 # show current res_counter usage for kmem only.
+				 (See 2.7 for details)
  memory.limit_in_bytes		 # set/show limit of memory usage
  memory.memsw.limit_in_bytes	 # set/show limit of memory+Swap usage
+ memory.kmem.limit_in_bytes	 # if allowed, set/show limit of kernel memory
  memory.failcnt			 # show the number of memory usage hits limits
  memory.memsw.failcnt		 # show the number of memory+Swap hits limits
  memory.max_usage_in_bytes	 # show max memory usage recorded
@@ -72,6 +76,9 @@ Brief summary of control files.
  memory.oom_control		 # set/show oom controls.
  memory.numa_stat		 # show the number of memory usage per numa node
 
+ memory.independent_kmem_limit	 # select whether or not kernel memory limits are
+				   independent of user limits
+
 1. History
 
 The memory controller has a long history. A request for comments for the memory
@@ -255,6 +262,31 @@ When oom event notifier is registered, event will be delivered.
   per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
   zone->lru_lock, it has no lock of its own.
 
+2.7 Kernel Memory Extension (CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
+
+ With the Kernel memory extension, the Memory Controller is able to limit
+the amount of kernel memory used by the system. Kernel memory is fundamentally
+different than user memory, since it can't be swapped out, which makes it
+possible to DoS the system by consuming too much of this precious resource.
+Kernel memory limits are not imposed for the root cgroup.
+
+Memory limits as specified by the standard Memory Controller may or may not
+take kernel memory into consideration. This is achieved through the file
+memory.independent_kmem_limit. A value other than 0 will allow for kernel
+memory to be controlled separately.
+
+When kernel memory limits are not independent, the limit values set in
+memory.kmem files are ignored.
+
+Currently no soft limit is implemented for kernel memory. It is future work
+to trigger slab reclaim when those limits are reached.
+
+CAUTION: As of this writing, the kmem extension may prevent tasks from moving
+among cgroups. If a task has kmem accounting in a cgroup, the task cannot be
+moved until the kmem resource is released. Also, until the resource is fully
+released, the cgroup cannot be destroyed. So, please consider your use cases
+and set kmem extension config option carefully.
+
 3. User Interface
 
 0. Configuration
diff --git a/init/Kconfig b/init/Kconfig
index 31ba0fd..e4b6246 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -689,6 +689,20 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED
 	  For those who want to have the feature enabled by default should
 	  select this option (if, for some reason, they need to disable it
 	  then swapaccount=0 does the trick).
+config CGROUP_MEM_RES_CTLR_KMEM
+	bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)"
+	depends on CGROUP_MEM_RES_CTLR && EXPERIMENTAL
+	default n
+	help
+	  The Kernel Memory extension for Memory Resource Controller can limit
+	  the amount of memory used by kernel objects in the system. Those are
+	  fundamentally different from the entities handled by the standard
+	  Memory Controller, which are page-based, and can be swapped. Users of
+	  the kmem extension can use it to guarantee that no group of processes
+	  will ever exhaust kernel resources alone.
+
+	  WARNING: The current experimental implementation does not allow a
+	  task to move among different cgroups with a kmem resource being held.
 
 config CGROUP_PERF
 	bool "Enable perf_event per-cpu per-container group (cgroup) monitoring"
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d57555..1cb7daa 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -226,6 +226,10 @@ struct mem_cgroup {
 	 */
 	struct res_counter memsw;
 	/*
+	 * the counter to account for kmem usage.
+	 */
+	struct res_counter kmem;
+	/*
 	 * Per cgroup active and inactive list, similar to the
 	 * per zone LRU lists.
 	 */
@@ -276,6 +280,11 @@ struct mem_cgroup {
 	 */
 	unsigned long 	move_charge_at_immigrate;
 	/*
+	 * Should kernel memory limits be established independently
+	 * from user memory ?
+	 */
+	int		kmem_independent_accounting;
+	/*
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu *stat;
@@ -343,9 +352,14 @@ enum charge_type {
 };
 
 /* for encoding cft->private value on file */
-#define _MEM			(0)
-#define _MEMSWAP		(1)
-#define _OOM_TYPE		(2)
+
+enum mem_type {
+	_MEM = 0,
+	_MEMSWAP,
+	_OOM_TYPE,
+	_KMEM,
+};
+
 #define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
 #define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
@@ -3838,10 +3852,17 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
 	u64 val;
 
 	if (!mem_cgroup_is_root(mem)) {
+		val = 0;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+		if (!mem->kmem_independent_accounting)
+			val = res_counter_read_u64(&mem->kmem, RES_USAGE);
+#endif
 		if (!swap)
-			return res_counter_read_u64(&mem->res, RES_USAGE);
+			val += res_counter_read_u64(&mem->res, RES_USAGE);
 		else
-			return res_counter_read_u64(&mem->memsw, RES_USAGE);
+			val += res_counter_read_u64(&mem->memsw, RES_USAGE);
+
+		return val;
 	}
 
 	val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
@@ -3874,6 +3895,11 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 		else
 			val = res_counter_read_u64(&mem->memsw, name);
 		break;
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+	case _KMEM:
+		val = res_counter_read_u64(&mem->kmem, name);
+		break;
+#endif
 	default:
 		BUG();
 		break;
@@ -4604,6 +4630,35 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
 }
 #endif /* CONFIG_NUMA */
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+static u64 kmem_limit_independent_read(struct cgroup *cgroup, struct cftype *cft)
+{
+	return mem_cgroup_from_cont(cgroup)->kmem_independent_accounting;
+}
+
+static int kmem_limit_independent_write(struct cgroup *cgroup, struct cftype *cft,
+					u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
+	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+
+	val = !!val;
+
+	if (parent && parent->use_hierarchy &&
+	   (val != parent->kmem_independent_accounting))
+		return -EINVAL;
+	/*
+	 * TODO: We need to handle the case in which we are doing
+	 * independent kmem accounting as authorized by our parent,
+	 * but then our parent changes its parameter.
+	 */
+	cgroup_lock();
+	memcg->kmem_independent_accounting = val;
+	cgroup_unlock();
+	return 0;
+}
+#endif
+
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -4719,6 +4774,42 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
 }
 #endif
 
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+static struct cftype kmem_cgroup_files[] = {
+	{
+		.name = "independent_kmem_limit",
+		.read_u64 = kmem_limit_independent_read,
+		.write_u64 = kmem_limit_independent_write,
+	},
+	{
+		.name = "kmem.usage_in_bytes",
+		.private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
+		.read_u64 = mem_cgroup_read,
+	},
+	{
+		.name = "kmem.limit_in_bytes",
+		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
+		.read_u64 = mem_cgroup_read,
+	},
+};
+
+static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+	int ret = 0;
+
+	ret = cgroup_add_files(cont, ss, kmem_cgroup_files,
+			       ARRAY_SIZE(kmem_cgroup_files));
+	return ret;
+};
+
+#else
+static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
+{
+	return 0;
+}
+#endif
+
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 {
 	struct mem_cgroup_per_node *pn;
@@ -4917,6 +5008,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	if (parent && parent->use_hierarchy) {
 		res_counter_init(&mem->res, &parent->res);
 		res_counter_init(&mem->memsw, &parent->memsw);
+		res_counter_init(&mem->kmem, &parent->kmem);
 		/*
 		 * We increment refcnt of the parent to ensure that we can
 		 * safely access it on res_counter_charge/uncharge.
@@ -4927,6 +5019,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	} else {
 		res_counter_init(&mem->res, NULL);
 		res_counter_init(&mem->memsw, NULL);
+		res_counter_init(&mem->kmem, NULL);
 	}
 	mem->last_scanned_child = 0;
 	mem->last_scanned_node = MAX_NUMNODES;
@@ -4970,6 +5063,10 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
 
 	if (!ret)
 		ret = register_memsw_files(cont, ss);
+
+	if (!ret)
+		ret = register_kmem_files(cont, ss);
+
 	return ret;
 }
 
-- 
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [PATCH v6 00/10] Request for inclusion: per-cgroup tcp memory pressure controls
From: Glauber Costa @ 2011-11-25 17:38 UTC (permalink / raw)
  To: linux-kernel
  Cc: lizf, kamezawa.hiroyu, ebiederm, davem, paul, gthelen, netdev,
	linux-mm, kirill, avagin, devel, eric.dumazet, cgroups

Hi Dave,

I hope the following series is in an acceptable state: I modified the tests
in __sk_mem_schedule() in a way that we should still leave the function pretty soon
under no pressure conditions.

Also, I managed to remove almost everything tcp related from memcontrol.c: the only
thing left is a simple function to calculate the address of the tcp sub-structure in
the main memcg structure - I hope this is acceptable, since besides being simple, it
is not related to the protocol itself.

I also believe that by now, all other comments so far have been addressed. Let me know if there
are any blocking concerns to this, and I'll address them as soon as I can.

Thanks

Glauber Costa (10):
  Basic kernel memory functionality for the Memory Controller
  foundations of per-cgroup memory pressure controlling.
  socket: initial cgroup code.
  Account tcp memory as kernel memory
  per-netns ipv4 sysctl_tcp_mem
  tcp buffer limitation: per-cgroup limit
  Display current tcp memory allocation in kmem cgroup
  Display current tcp failcnt in kmem cgroup
  Display maximum tcp memory allocation in kmem cgroup
  Disable task moving when using kernel memory accounting

 Documentation/cgroups/memory.txt |   38 +++++-
 include/linux/memcontrol.h       |   19 +++
 include/net/netns/ipv4.h         |    1 +
 include/net/sock.h               |  232 +++++++++++++++++++++++++++++++++
 include/net/tcp.h                |    4 +-
 include/net/tcp_memcg.h          |   20 +++
 init/Kconfig                     |   14 ++
 mm/memcontrol.c                  |  209 ++++++++++++++++++++++++++++--
 net/core/sock.c                  |  106 ++++++++++++----
 net/ipv4/Makefile                |    1 +
 net/ipv4/af_inet.c               |    2 +
 net/ipv4/proc.c                  |    7 +-
 net/ipv4/sysctl_net_ipv4.c       |   65 +++++++++-
 net/ipv4/tcp.c                   |   17 +--
 net/ipv4/tcp_input.c             |   12 +-
 net/ipv4/tcp_ipv4.c              |   13 ++-
 net/ipv4/tcp_memcg.c             |  263 ++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_output.c            |    2 +-
 net/ipv4/tcp_timer.c             |    2 +-
 net/ipv6/af_inet6.c              |    2 +
 net/ipv6/tcp_ipv6.c              |    7 +-
 21 files changed, 955 insertions(+), 81 deletions(-)
 create mode 100644 include/net/tcp_memcg.h
 create mode 100644 net/ipv4/tcp_memcg.c

-- 
1.7.6.4

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [v4 PATCH 1/2] NETFILTER module xt_hmark, new target for HASH based fwmark
From: Pablo Neira Ayuso @ 2011-11-25 17:36 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: kaber, jengelh, netfilter-devel, netdev, hans.schillstrom
In-Reply-To: <1322213787-25796-2-git-send-email-hans@schillstrom.com>

On Fri, Nov 25, 2011 at 10:36:26AM +0100, Hans Schillstrom wrote:
> diff --git a/include/net/ipv6.h b/include/net/ipv6.h
> index 3f0258d..9e4d4f9 100644
> --- a/include/net/ipv6.h
> +++ b/include/net/ipv6.h
> @@ -39,6 +39,7 @@
>  #define NEXTHDR_ICMP		58	/* ICMP for IPv6. */
>  #define NEXTHDR_NONE		59	/* No next header */
>  #define NEXTHDR_DEST		60	/* Destination options header. */
> +#define NEXTHDR_SCTP		132	/* Stream Control Transport Protocol */
>  #define NEXTHDR_MOBILITY	135	/* Mobility header. */
>  
>  #define NEXTHDR_MAX		255

This has to go in a separate patch. Please send it to netdev. I
think davem can pick that up for 3.2-rc.

> diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
> index 8260b13..41bee43 100644
> --- a/net/netfilter/Kconfig
> +++ b/net/netfilter/Kconfig
> @@ -471,6 +471,23 @@ config NETFILTER_XT_TARGET_HL
>  	since you can easily create immortal packets that loop
>  	forever on the network.
>  
> +config NETFILTER_XT_TARGET_HMARK
> +	tristate '"HMARK" target support'
> +	depends on NETFILTER_ADVANCED
> +	---help---
> +	This option adds the "HMARK" target.
> +
> +	The target allows you to create rules in the "raw" and "mangle" tables
> +	which alter the netfilter mark (nfmark) field within a given range.
> +	First a 32 bit hash value is generated then modulus by <limit> and
> +	finally an offset is added before it's written to nfmark.
> +
> +	Prior to routing, the nfmark can influence the routing method (see
> +	"Use netfilter MARK value as routing key") and can also be used by
> +	other subsystems to change their behavior.
> +
> +	The mark match can also be used to match nfmark produced by this module.
> +
>  config NETFILTER_XT_TARGET_IDLETIMER
>  	tristate  "IDLETIMER target support"
>  	depends on NETFILTER_ADVANCED
> diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
> index 1a02853..359eeb6 100644
> --- a/net/netfilter/Makefile
> +++ b/net/netfilter/Makefile
> @@ -56,6 +56,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
>  obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o
>  obj-$(CONFIG_NETFILTER_XT_TARGET_DSCP) += xt_DSCP.o
>  obj-$(CONFIG_NETFILTER_XT_TARGET_HL) += xt_HL.o
> +obj-$(CONFIG_NETFILTER_XT_TARGET_HMARK) += xt_hmark.o
>  obj-$(CONFIG_NETFILTER_XT_TARGET_LED) += xt_LED.o
>  obj-$(CONFIG_NETFILTER_XT_TARGET_NFLOG) += xt_NFLOG.o
>  obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o
> diff --git a/net/netfilter/xt_hmark.c b/net/netfilter/xt_hmark.c
> new file mode 100644
> index 0000000..ae33293
> --- /dev/null
> +++ b/net/netfilter/xt_hmark.c
> @@ -0,0 +1,327 @@
> +/*
> + *	xt_hmark - Netfilter module to set mark as hash value
> + *
> + *	(C) 2011 Hans Schillstrom <hans.schillstrom@ericsson.com>
> + *
> + *	Description:
> + *	This module calculates a hash value that can be modified by modulus
> + *	and an offset. The hash value is based on a direction independent
> + *	five tuple: src & dst addr src & dst ports and protocol.
> + *	However src & dst port can be masked and are not used for fragmented
> + *	packets, ESP and AH don't have ports so SPI will be used instead.
> + *	For ICMP error messages the hash mark values will be calculated on
> + *	the source packet i.e. the packet caused the error (If sufficient
> + *	amount of data exists).
> + *
> + *	This program is free software; you can redistribute it and/or modify
> + *	it under the terms of the GNU General Public License version 2 as
> + *	published by the Free Software Foundation.
> + */
> +
> +#include <linux/module.h>
> +#include <linux/skbuff.h>
> +#include <net/ip.h>
> +#include <linux/icmp.h>
> +
> +#include <linux/netfilter/xt_hmark.h>
> +#include <linux/netfilter/x_tables.h>
> +#include <net/netfilter/nf_nat.h>
> +
> +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
> +#	define WITH_IPV6 1
> +#include <net/ipv6.h>
> +#include <linux/netfilter_ipv6/ip6_tables.h>
> +#endif
> +
> +

Cosmetic: unnecessary extra line.

> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Hans Schillstrom <hans.schillstrom@ericsson.com>");
> +MODULE_DESCRIPTION("Xtables: packet range mark operations by hash value");
> +MODULE_ALIAS("ipt_HMARK");
> +MODULE_ALIAS("ip6t_HMARK");
> +
> +/*
> + * ICMP, get inner header so calc can be made on the source message
> + *       not the icmp header, i.e. same hash mark must be produced
> + *       on an icmp error message.
> + */
> +static int get_inner_hdr(struct sk_buff *skb, int iphsz, int nhoff)
> +{
> +	const struct icmphdr *icmph;
> +	struct icmphdr _ih;
> +	struct iphdr *iph = NULL;
> +
> +	/* Not enough header? */
> +	icmph = skb_header_pointer(skb, nhoff + iphsz, sizeof(_ih), &_ih);
> +	if (icmph == NULL)
> +		return nhoff;
> +
> +	if (icmph->type > NR_ICMP_TYPES)
> +		return nhoff;
> +
> +	/* Error message? */
> +	if (icmph->type != ICMP_DEST_UNREACH &&
> +	    icmph->type != ICMP_SOURCE_QUENCH &&
> +	    icmph->type != ICMP_TIME_EXCEEDED &&
> +	    icmph->type != ICMP_PARAMETERPROB &&
> +	    icmph->type != ICMP_REDIRECT)
> +		return nhoff;
> +	/* Checkin full IP header plus 8 bytes of protocol to
> +	 * avoid additional coding at protocol handlers.
> +	 */
> +	if (!pskb_may_pull(skb, nhoff + iphsz + sizeof(_ih) + 8))
> +		return nhoff;

skb_header_pointer again here, if conntrack is enabled, we can benefit
from handling fragments.

> +	iph = (struct iphdr *)(skb->data + nhoff + iphsz + sizeof(_ih));
> +	return nhoff + iphsz + sizeof(_ih);
> +}
> +/*
> + * ICMPv6
> + * Input nhoff Offset into network header
> + *       offset where ICMPv6 header starts
> + * Returns true if it's a icmp error and updates nhoff
> + */
> +#ifdef WITH_IPV6
> +static int get_inner6_hdr(struct sk_buff *skb, int *offset, int hdrlen)
> +{
> +	struct icmp6hdr *icmp6h;
> +	struct icmp6hdr _ih6;
> +
> +	icmp6h = skb_header_pointer(skb, *offset + hdrlen, sizeof(_ih6), &_ih6);
> +	if (icmp6h == NULL)
> +		return 0;
> +
> +	if (icmp6h->icmp6_type && icmp6h->icmp6_type < 128) {
> +		*offset += hdrlen + sizeof(_ih6);
> +		return 1;
> +	}
> +	return 0;
> +}
> +/*
> + * Calc hash value, special casre is taken on icmp and fragmented messages
> + * i.e. fragmented messages don't use ports.
> + */
> +__u32 hmark_v6(struct sk_buff *skb, const struct xt_action_param *par)
> +{
> +	struct xt_hmark_info *info = (struct xt_hmark_info *)par->targinfo;
> +	int nhoff, poff, hdrlen;
> +	u32 addr1, addr2, hash;
> +	struct ipv6hdr *ip6;
> +	u8 nexthdr;
> +	int frag = 0, ip6hdrlvl = 0;	/* Header level */
> +	struct ipv6_opt_hdr _hdr, *hp;
> +	union {
> +		u32 v32;
> +		u16 v16[2];
> +	} ports;
> +
> +	ports.v32 = 0;
> +	nhoff = skb_network_offset(skb);
> +
> +hdr_new:
> +	/* Get header info */
> +	ip6 = (struct ipv6hdr *) (skb->data + nhoff);
> +	nexthdr = ip6->nexthdr;
> +	hdrlen = sizeof(struct ipv6hdr);
> +	hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr), &_hdr);

you have to check return value of skb_header_pointer here.

> +	while (nexthdr) {
> +		switch (nexthdr) {
> +		case IPPROTO_ICMPV6:
> +			/* ICMP Error then move ptr to inner header */
> +			if (get_inner6_hdr(skb, &nhoff, hdrlen)) {
> +				ip6hdrlvl++;
> +				if (!pskb_may_pull(skb, sizeof(_hdr) + nhoff))
> +					return XT_CONTINUE;
> +				goto hdr_new;
> +			}
> +			nhoff += hdrlen;
> +			goto hdr_rdy;
> +
> +		case NEXTHDR_FRAGMENT:
> +			if (!ip6hdrlvl) /* Do not use ports if fragmented */
> +				frag = 1;
> +			break;
> +
> +		/* End of hdr traversing cont. with ports and hash calc. */
> +		case NEXTHDR_IPV6:	/* Do not process tunnels */
> +		case NEXTHDR_TCP:
> +		case NEXTHDR_UDP:
> +		case NEXTHDR_ESP:
> +		case NEXTHDR_AUTH:
> +		case NEXTHDR_SCTP:
> +		case NEXTHDR_NONE:	/* Last hdr of something unknown */
> +			nhoff += hdrlen;
> +			goto hdr_rdy;
> +		default:
> +			return XT_CONTINUE;
> +		}
> +		if (!hp)
> +			return XT_CONTINUE;
> +		nhoff += hdrlen;	/* eat current header */
> +		nexthdr =  hp->nexthdr;	/* Next header */
> +		hdrlen = ipv6_optlen(hp);
> +		hp = skb_header_pointer(skb, nhoff + hdrlen, sizeof(_hdr),
> +					&_hdr);

same here.

> +		if (!pskb_may_pull(skb, nhoff))

why this after skb_header_pointer?

[... trimmed off ...]
>       poff = proto_ports_offset(ip_proto);
>       nhoff += ip->ihl * 4 + poff;
>       if (frag || poff < 0 || !pskb_may_pull(skb, nhoff + 4))
>               goto noports;
>
>       ports.v32 = * (__force u32 *) (skb->data + nhoff);
>       if (ip_proto == IPPROTO_ESP || ip_proto == IPPROTO_AH) {
>               ports.v32 = (ports.v32 & info->spimask) | nfo->spiset;
>       } else {
>               if (snatport)   /* Replace nat'ed port(s) */
>                       ports.v16[1] = snatport;
>               if (dnatport)
>                       ports.v16[0] = dnatport;
>               ports.v32 = (ports.v32 & info->pmask.v32) |
>                               info->pset.v32;
>               if (ports.v16[1] < ports.v16[0])
>                       swap(ports.v16[0], ports.v16[1]);
>       }
>
>noports:
>       ip_proto &= info->prmask;
>       /* get a consistent hash (same value on both flow directions)/
>       if (addr2 < addr1)
>               swap(addr1, addr2);
>
>       hash = jhash_3words(addr1, addr2, ports.v32, info->hashrnd) ^ p_proto;
>       if (info->hmod)
>               skb->mark = (hash % info->hmod) + info->hoffs;
>       return XT_CONTINUE;
> }

Hm, I think the fragmentation handling is broken.

Say that the first fragment contains the transport header;
then the mark is calculated based on the addresses and ports.
Later fragments will then receive a mark based on the network
header only. They may have different marks.

If you don't want to use conntrack in your setup and you want to handle
fragments, then you have to configure HMARK to calculate the hashing
based on the network addresses. If you want to fully support fragments,
then enable conntrack and you can configure HMARK to calculate the
hashing based on network address + transport bits.

Fix this by removing the fragmentation handling, then assume that
people can select between two hashing configuration for HMARK. One
based for network address which is fragment-safe, one that uses the
transport layer information, that requires conntrack. Otherwise, I
don't see a sane way to handle this situation.

I think this has to be documented in the iptables manpage for HMARK.

^ permalink raw reply

* RE: [PATCH iproute2 1/2] utils: add s32 parser
From: David Laight @ 2011-11-25 17:34 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Hagen Paul Pfeifer, netdev
In-Reply-To: <20111125092451.19e0e9d4@nehalam.linuxnetplumber.net>

> From: Stephen Hemminger [mailto:shemminger@vyatta.com] 
> "David Laight" <David.Laight@ACULAB.COM> wrote:
> 
> > > +	res = strtoul(arg, &ptr, base);
> > > +	if (!ptr || ptr == arg || *ptr || res > INT32_MAX || res <  
> > INT32_MIN)
> > 
> > No need to check !ptr.
> 
> Also don't you want signed value?  Reading strtol() man page,
> the correct way is:
> 	errno = 0;
> 	res = strtol(arg, &ptr, base);
> 	if (ptr == arg || errno)
> 		return -1;
> 
> "RETURN VALUE
>        The strtol() function returns the result of the conversion,
unless  the
>        value  would  underflow  or overflow.  If an underflow occurs,
strtol()
>        returns LONG_MIN.  If an overflow occurs,  strtol() returns
LONG_MAX.
>        In  both  cases,  errno is set to ERANGE.  Precisely the same
holds for
>        strtoll()  (with  LLONG_MIN  and  LLONG_MAX  instead  of
LONG_MIN  and
>        LONG_MAX).

If you are that worried about numeric overflow (IIRC) you
have to check the result for LONG_MIN/MAX (etc) before looking
at errno.

strtoul() is defined to support -ve values, and I think the
C rules for conversion between signed and unsigned ints
DTRT even for non-two's-complement systems.

Some of these bound checks are a waste of time.
The SUS doesn't require standard utilities to perform them.

	David

^ permalink raw reply

* Re: Open vSwitch Design
From: Stephen Hemminger @ 2011-11-25 17:28 UTC (permalink / raw)
  To: jhs-jkUAjuhPggJWk0Htik3J/w
  Cc: dev-yBygre7rU0TnMu66kgdUjQ, Chris Wright, Herbert Xu,
	Eric Dumazet, netdev, hadi-fAAogVwAN2Kw5LPnMra/2Q, Fastabend,
	John-/PVsmBQoxgPKo9QCiBeYKEEOCMrvLtNR, David Miller
In-Reply-To: <1322220276.1908.75.camel@mojatatu>

On Fri, 25 Nov 2011 06:24:36 -0500
jamal <hadi-fAAogVwAN2Kw5LPnMra/2Q@public.gmane.org> wrote:

> Most hardware bridges out there support all different modes:
> You can have learning in the hardware or defer it to user/control plane
> by setting some flags. You can have broadcasting done in hardware or
> defer to user space. 
> The mods i was thinking of is to bring the Linux bridge to have the 
> same behavior. You then need to allow netlink updates of bridge MAC
> table from user space. There may be weaknesses with the current bridging
> code in relation to Vlans that may need to be addressed.
> 
> [But my concern was not so much the bridge - because changes are needed
> in that case; it is the "match, actionlist" that is already in place
> that got to me.]

The bridge module is already overly complex. Rather than adding more
modes, it should be split into separate modules. If you look at macvlan,
you will see it is already a subset of the bridge. Another example of
this is the team driver which is really just a subset of the bonding
code.

^ permalink raw reply

* Re: [PATCH iproute2 1/2] utils: add s32 parser
From: Stephen Hemminger @ 2011-11-25 17:24 UTC (permalink / raw)
  To: David Laight; +Cc: Hagen Paul Pfeifer, netdev
In-Reply-To: <AE90C24D6B3A694183C094C60CF0A2F6D8AEE2@saturn3.aculab.com>

On Fri, 25 Nov 2011 09:46:09 -0000
"David Laight" <David.Laight@ACULAB.COM> wrote:

> > +	res = strtoul(arg, &ptr, base);
> > +	if (!ptr || ptr == arg || *ptr || res > INT32_MAX || res <  
> INT32_MIN)
> 
> No need to check !ptr.

Also don't you want signed value?  Reading strtol() man page,
the correct way is:
	errno = 0;
	res = strtol(arg, &ptr, base);
	if (ptr == arg || errno)
		return -1;

"RETURN VALUE
       The strtol() function returns the result of the conversion, unless  the
       value  would  underflow  or overflow.  If an underflow occurs, strtol()
       returns LONG_MIN.  If an overflow occurs,  strtol()  returns  LONG_MAX.
       In  both  cases,  errno is set to ERANGE.  Precisely the same holds for
       strtoll()  (with  LLONG_MIN  and  LLONG_MAX  instead  of  LONG_MIN  and
       LONG_MAX).

^ permalink raw reply

* Re: TCP fast retransmit
From: Ilpo Järvinen @ 2011-11-25 16:57 UTC (permalink / raw)
  To: Esztermann, Ansgar; +Cc: netdev@vger.kernel.org
In-Reply-To: <2D9E1426-D432-4D08-BF28-FD2615AAEDBA@mpi-bpc.mpg.de>

On Fri, 25 Nov 2011, Esztermann, Ansgar wrote:

> [originally posted to lkml]
> Hello list,
> 
> is there some documentation available on TCP fast retransmit? There seem 
> to be quite a lot of descriptions -- from informal to scholarly papers 
> -- on the various algorithms available to calculate the proper size of 
> the congestion window, but I have been unable so far to find out *when* 
> a fast retransmit is triggered. RFC 2581 states the third dupACK 
> "should" do it, and this seems to be quoted fairly often. However, I can 
> easily produce connections that fail to perform fast retransmit even 
> after 5 dupACKs. Some people mention Linux uses a different (presumable 
> more sophisticated) algorithm to trigger fast retransmits, but no-one 
> seems to elaborate.

With SACK, dupACKs are meaningless (just in case you are crafting them). Instead,
the SACK blocks matter... but how exactly depends on whether FACK is in use or
not... with FACK, holes (segments not reported by SACK) below the highest
SACK also count.

-- 
 i.

^ permalink raw reply

* [PATCH] l2tp: ensure sk->dst is still valid
From: Florian Westphal @ 2011-11-25 16:47 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal, James Chapman

When using l2tp over ipsec, the tunnel will hang when rekeying
occurs. Reason is that the transformer bundle attached to the dst entry
is now in STATE_DEAD and thus xfrm_output_one() drops all packets
(XfrmOutStateExpired increases).

Fix this by calling __sk_dst_check (which drops the stale dst
if xfrm dst->check callback finds that the bundle is no longer valid).

Cc: James Chapman <jchapman@katalix.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 net/l2tp/l2tp_core.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index cf0f308..89ff8c6 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1072,7 +1072,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
 
 	/* Get routing info from the tunnel socket */
 	skb_dst_drop(skb);
-	skb_dst_set(skb, dst_clone(__sk_dst_get(sk)));
+	skb_dst_set(skb, dst_clone(__sk_dst_check(sk, 0)));
 
 	inet = inet_sk(sk);
 	fl = &inet->cork.fl;
-- 
1.7.3.4

^ permalink raw reply related

* Re: TCP fast retransmit
From: Eric Dumazet @ 2011-11-25 16:39 UTC (permalink / raw)
  To: Esztermann, Ansgar; +Cc: netdev@vger.kernel.org
In-Reply-To: <1322239016.5793.12.camel@edumazet-HP-Compaq-6005-Pro-SFF-PC>

Le vendredi 25 novembre 2011 à 17:36 +0100, Eric Dumazet a écrit :

> Could you send a sample pcap of such problem, but please include full
> tcp sesssion, from the first SYN packet, up to packets following
> restransmits.
> 
> A diff of "netstat -s" taken before your session and after your session
> on receiver would help too, if receiver is not a loaded machine of
> course.
> 
> Also, what version of linux kernel are you using in receiver ?
> 
> 

Oh well, I meant sender , not receiver !

^ permalink raw reply

* Re: TCP fast retransmit
From: Eric Dumazet @ 2011-11-25 16:36 UTC (permalink / raw)
  To: Esztermann, Ansgar; +Cc: netdev@vger.kernel.org
In-Reply-To: <2D9E1426-D432-4D08-BF28-FD2615AAEDBA@mpi-bpc.mpg.de>

Le vendredi 25 novembre 2011 à 14:33 +0100, Esztermann, Ansgar a écrit :
> [originally posted to lkml]
> Hello list,
> 
> is there some documentation available on TCP fast retransmit? There
> seem to be quite a lot of descriptions -- from informal to scholarly
> papers -- on the various algorithms available to calculate the proper
> size of the congestion window, but I have been unable so far to find
> out *when* a fast retransmit is triggered. RFC 2581 states the third
> dupACK "should" do it, and this seems to be quoted fairly often.
> However, I can easily produce connections that fail to perform fast
> retransmit even after 5 dupACKs. Some people mention Linux uses a
> different (presumable more sophisticated) algorithm to trigger fast
> retransmits, but no-one seems to elaborate.

Could you send a sample pcap of such a problem, but please include the full
tcp session, from the first SYN packet, up to the packets following the
retransmits.

A diff of "netstat -s" taken before your session and after your session
on receiver would help too, if receiver is not a loaded machine of
course.

Also, what version of linux kernel are you using in receiver ?

^ permalink raw reply

* [PATCH net-next 4/5] be2net: Use new hash key
From: Padmanabh Ratnakar @ 2011-11-25 15:48 UTC (permalink / raw)
  To: netdev; +Cc: Padmanabh Ratnakar

This new hash key gives better distribution of packets across RX
queues.

Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_cmds.c |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c
index 7988798..62868ea 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.c
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.c
@@ -1669,8 +1669,9 @@ int be_cmd_rss_config(struct be_adapter *adapter, u8 *rsstable, u16 table_size)
 {
 	struct be_mcc_wrb *wrb;
 	struct be_cmd_req_rss_config *req;
-	u32 myhash[10] = {0x0123, 0x4567, 0x89AB, 0xCDEF, 0x01EF,
-			0x0123, 0x4567, 0x89AB, 0xCDEF, 0x01EF};
+	u32 myhash[10] = {0x15d43fa5, 0x2534685a, 0x5f87693a, 0x5668494e,
+			0x33cf6a53, 0x383334c6, 0x76ac4257, 0x59b242b2,
+			0x3ea83c02, 0x4a110304};
 	int status;
 
 	if (mutex_lock_interruptible(&adapter->mbox_lock))
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH net-next 5/5] be2net: Fix non utilization of RX queues
From: Padmanabh Ratnakar @ 2011-11-25 15:48 UTC (permalink / raw)
  To: netdev; +Cc: Padmanabh Ratnakar

When a non-power-of-two number of MSI-X vectors is given to the driver,
some RX queues are not utilized. Program the RSS table in such a way that
all queues are utilized.

Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_main.c |   16 ++++++++++------
 1 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 66429ea..7236280 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -2312,8 +2312,8 @@ static int be_close(struct net_device *netdev)
 static int be_rx_queues_setup(struct be_adapter *adapter)
 {
 	struct be_rx_obj *rxo;
-	int rc, i;
-	u8 rsstable[MAX_RSS_QS];
+	int rc, i, j;
+	u8 rsstable[128];
 
 	for_all_rx_queues(adapter, rxo, i) {
 		rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id,
@@ -2325,11 +2325,15 @@ static int be_rx_queues_setup(struct be_adapter *adapter)
 	}
 
 	if (be_multi_rxq(adapter)) {
-		for_all_rss_queues(adapter, rxo, i)
-			rsstable[i] = rxo->rss_id;
+		for (j = 0; j < 128; j += adapter->num_rx_qs - 1) {
+			for_all_rss_queues(adapter, rxo, i) {
+				if ((j + i) >= 128)
+					break;
+				rsstable[j + i] = rxo->rss_id;
+			}
+		}
+		rc = be_cmd_rss_config(adapter, rsstable, 128);
 
-		rc = be_cmd_rss_config(adapter, rsstable,
-			adapter->num_rx_qs - 1);
 		if (rc)
 			return rc;
 	}
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH net-next 3/5] be2net: Add error handling for Lancer
From: Padmanabh Ratnakar @ 2011-11-25 15:48 UTC (permalink / raw)
  To: netdev; +Cc: Padmanabh Ratnakar

Detect error in Lancer by polling a HW register and
recover from this error if it is recoverable.

Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_main.c |  155 ++++++++++++++++++---------
 1 files changed, 106 insertions(+), 49 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index a1b8ebc..66429ea 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -2044,52 +2044,6 @@ void be_detect_dump_ue(struct be_adapter *adapter)
 	}
 }
 
-static void be_worker(struct work_struct *work)
-{
-	struct be_adapter *adapter =
-		container_of(work, struct be_adapter, work.work);
-	struct be_rx_obj *rxo;
-	int i;
-
-	be_detect_dump_ue(adapter);
-
-	/* when interrupts are not yet enabled, just reap any pending
-	* mcc completions */
-	if (!netif_running(adapter->netdev)) {
-		int mcc_compl, status = 0;
-
-		mcc_compl = be_process_mcc(adapter, &status);
-
-		if (mcc_compl) {
-			struct be_mcc_obj *mcc_obj = &adapter->mcc_obj;
-			be_cq_notify(adapter, mcc_obj->cq.id, false, mcc_compl);
-		}
-
-		goto reschedule;
-	}
-
-	if (!adapter->stats_cmd_sent) {
-		if (lancer_chip(adapter))
-			lancer_cmd_get_pport_stats(adapter,
-						&adapter->stats_cmd);
-		else
-			be_cmd_get_stats(adapter, &adapter->stats_cmd);
-	}
-
-	for_all_rx_queues(adapter, rxo, i) {
-		be_rx_eqd_update(adapter, rxo);
-
-		if (rxo->rx_post_starved) {
-			rxo->rx_post_starved = false;
-			be_post_rx_frags(rxo, GFP_KERNEL);
-		}
-	}
-
-reschedule:
-	adapter->work_counter++;
-	schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000));
-}
-
 static void be_msix_disable(struct be_adapter *adapter)
 {
 	if (msix_enabled(adapter)) {
@@ -3328,7 +3282,7 @@ static int be_dev_family_check(struct be_adapter *adapter)
 
 static int lancer_wait_ready(struct be_adapter *adapter)
 {
-#define SLIPORT_READY_TIMEOUT 500
+#define SLIPORT_READY_TIMEOUT 30
 	u32 sliport_status;
 	int status = 0, i;
 
@@ -3337,7 +3291,7 @@ static int lancer_wait_ready(struct be_adapter *adapter)
 		if (sliport_status & SLIPORT_STATUS_RDY_MASK)
 			break;
 
-		msleep(20);
+		msleep(1000);
 	}
 
 	if (i == SLIPORT_READY_TIMEOUT)
@@ -3374,6 +3328,104 @@ static int lancer_test_and_set_rdy_state(struct be_adapter *adapter)
 	return status;
 }
 
+static void lancer_test_and_recover_fn_err(struct be_adapter *adapter)
+{
+	int status;
+	u32 sliport_status;
+
+	if (adapter->eeh_err || adapter->ue_detected)
+		return;
+
+	sliport_status = ioread32(adapter->db + SLIPORT_STATUS_OFFSET);
+
+	if (sliport_status & SLIPORT_STATUS_ERR_MASK) {
+		dev_err(&adapter->pdev->dev,
+				"Adapter in error state."
+				"Trying to recover.\n");
+
+		status = lancer_test_and_set_rdy_state(adapter);
+		if (status)
+			goto err;
+
+		netif_device_detach(adapter->netdev);
+
+		if (netif_running(adapter->netdev))
+			be_close(adapter->netdev);
+
+		be_clear(adapter);
+
+		adapter->fw_timeout = false;
+
+		status = be_setup(adapter);
+		if (status)
+			goto err;
+
+		if (netif_running(adapter->netdev)) {
+			status = be_open(adapter->netdev);
+			if (status)
+				goto err;
+		}
+
+		netif_device_attach(adapter->netdev);
+
+		dev_err(&adapter->pdev->dev,
+				"Adapter error recovery succeeded\n");
+	}
+	return;
+err:
+	dev_err(&adapter->pdev->dev,
+			"Adapter error recovery failed\n");
+}
+
+static void be_worker(struct work_struct *work)
+{
+	struct be_adapter *adapter =
+		container_of(work, struct be_adapter, work.work);
+	struct be_rx_obj *rxo;
+	int i;
+
+	if (lancer_chip(adapter))
+		lancer_test_and_recover_fn_err(adapter);
+
+	be_detect_dump_ue(adapter);
+
+	/* when interrupts are not yet enabled, just reap any pending
+	* mcc completions */
+	if (!netif_running(adapter->netdev)) {
+		int mcc_compl, status = 0;
+
+		mcc_compl = be_process_mcc(adapter, &status);
+
+		if (mcc_compl) {
+			struct be_mcc_obj *mcc_obj = &adapter->mcc_obj;
+			be_cq_notify(adapter, mcc_obj->cq.id, false, mcc_compl);
+		}
+
+		goto reschedule;
+	}
+
+	if (!adapter->stats_cmd_sent) {
+		if (lancer_chip(adapter))
+			lancer_cmd_get_pport_stats(adapter,
+						&adapter->stats_cmd);
+		else
+			be_cmd_get_stats(adapter, &adapter->stats_cmd);
+	}
+
+	for_all_rx_queues(adapter, rxo, i) {
+		be_rx_eqd_update(adapter, rxo);
+
+		if (rxo->rx_post_starved) {
+			rxo->rx_post_starved = false;
+			be_post_rx_frags(rxo, GFP_KERNEL);
+		}
+	}
+
+reschedule:
+	adapter->work_counter++;
+	schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000));
+}
+
 static int __devinit be_probe(struct pci_dev *pdev,
 			const struct pci_device_id *pdev_id)
 {
@@ -3426,7 +3478,12 @@ static int __devinit be_probe(struct pci_dev *pdev,
 		goto disable_sriov;
 
 	if (lancer_chip(adapter)) {
-		status = lancer_test_and_set_rdy_state(adapter);
+		status = lancer_wait_ready(adapter);
+		if (!status) {
+			iowrite32(SLI_PORT_CONTROL_IP_MASK,
+					adapter->db + SLIPORT_CONTROL_OFFSET);
+			status = lancer_test_and_set_rdy_state(adapter);
+		}
 		if (status) {
 			dev_err(&pdev->dev, "Adapter in non recoverable error\n");
 			goto ctrl_clean;
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH net-next 2/5] be2net: Fix error recovery paths
From: Padmanabh Ratnakar @ 2011-11-25 15:48 UTC (permalink / raw)
  To: netdev; +Cc: Padmanabh Ratnakar

When TX queues are created again after error recovery,
netif_set_real_num_tx_queues() is invoked to update the number of real
TX queues created. The rtnl lock needs to be held when invoking this routine.

Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_main.c |    5 ++++-
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index c6fb7c3..a1b8ebc 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -1666,9 +1666,12 @@ static int be_tx_queues_create(struct be_adapter *adapter)
 	u8 i;
 
 	adapter->num_tx_qs = be_num_txqs_want(adapter);
-	if (adapter->num_tx_qs != MAX_TX_QS)
+	if (adapter->num_tx_qs != MAX_TX_QS) {
+		rtnl_lock();
 		netif_set_real_num_tx_queues(adapter->netdev,
 			adapter->num_tx_qs);
+		rtnl_unlock();
+	}
 
 	adapter->tx_eq.max_eqd = 0;
 	adapter->tx_eq.min_eqd = 0;
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH net-next 1/5] be2net: Move to new SR-IOV implementation in Lancer
From: Padmanabh Ratnakar @ 2011-11-25 15:47 UTC (permalink / raw)
  To: netdev; +Cc: Padmanabh Ratnakar, Mammatha Edhala

SR-IOV implementation in Lancer has changed in the following ways -
1) The PF driver assigns one MAC address for the VF using COMMON_SET_IFACE_MAC_LIST.
2) The VF driver queries its MAC address using the COMMON_GET_IFACE_MAC_LIST command
and assigns it to its interface.

Signed-off-by: Mammatha Edhala <mammatha.edhala@emulex.com>
Signed-off-by: Padmanabh Ratnakar <padmanabh.ratnakar@emulex.com>
---
 drivers/net/ethernet/emulex/benet/be_cmds.c |   99 ++++++++++++++++++++++++++-
 drivers/net/ethernet/emulex/benet/be_cmds.h |   37 ++++++++++-
 drivers/net/ethernet/emulex/benet/be_main.c |   89 ++++++++++++++++++------
 3 files changed, 200 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c b/drivers/net/ethernet/emulex/benet/be_cmds.c
index 64f0c1a..7988798 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.c
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.c
@@ -609,7 +609,7 @@ int be_cmd_eq_create(struct be_adapter *adapter,
 
 /* Use MCC */
 int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr,
-			u8 type, bool permanent, u32 if_handle)
+			u8 type, bool permanent, u32 if_handle, u32 pmac_id)
 {
 	struct be_mcc_wrb *wrb;
 	struct be_cmd_req_mac_query *req;
@@ -631,6 +631,7 @@ int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr,
 		req->permanent = 1;
 	} else {
 		req->if_id = cpu_to_le16((u16) if_handle);
+		req->pmac_id = cpu_to_le32(pmac_id);
 		req->permanent = 0;
 	}
 
@@ -2280,3 +2281,99 @@ err:
 	mutex_unlock(&adapter->mbox_lock);
 	return status;
 }
+
+/* Uses synchronous MCCQ */
+int be_cmd_get_mac_from_list(struct be_adapter *adapter, u32 domain,
+							u32 *pmac_id)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_get_mac_list *req;
+	int status;
+	int mac_count;
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+	req = embedded_payload(wrb);
+
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+				OPCODE_COMMON_GET_MAC_LIST, sizeof(*req),
+				wrb, NULL);
+
+	req->hdr.domain = domain;
+
+	status = be_mcc_notify_wait(adapter);
+	if (!status) {
+		struct be_cmd_resp_get_mac_list *resp =
+						embedded_payload(wrb);
+		int i;
+		u8 *ctxt = &resp->context[0][0];
+		status = -EIO;
+		mac_count = resp->mac_count;
+		be_dws_le_to_cpu(&resp->context, sizeof(resp->context));
+		for (i = 0; i < mac_count; i++) {
+			if (!AMAP_GET_BITS(struct amap_get_mac_list_context,
+					   act, ctxt)) {
+				*pmac_id = AMAP_GET_BITS
+					(struct amap_get_mac_list_context,
+					 macid, ctxt);
+				status = 0;
+				break;
+			}
+			ctxt += sizeof(struct amap_get_mac_list_context) / 8;
+		}
+	}
+
+err:
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
+
+/* Uses synchronous MCCQ */
+int be_cmd_set_mac_list(struct be_adapter *adapter, u8 *mac_array,
+			u8 mac_count, u32 domain)
+{
+	struct be_mcc_wrb *wrb;
+	struct be_cmd_req_set_mac_list *req;
+	int status;
+	struct be_dma_mem cmd;
+
+	memset(&cmd, 0, sizeof(struct be_dma_mem));
+	cmd.size = sizeof(struct be_cmd_req_set_mac_list);
+	cmd.va = dma_alloc_coherent(&adapter->pdev->dev, cmd.size,
+			&cmd.dma, GFP_KERNEL);
+	if (!cmd.va) {
+		dev_err(&adapter->pdev->dev, "Memory alloc failure\n");
+		return -ENOMEM;
+	}
+
+	spin_lock_bh(&adapter->mcc_lock);
+
+	wrb = wrb_from_mccq(adapter);
+	if (!wrb) {
+		status = -EBUSY;
+		goto err;
+	}
+
+	req = cmd.va;
+	be_wrb_cmd_hdr_prepare(&req->hdr, CMD_SUBSYSTEM_COMMON,
+				OPCODE_COMMON_SET_MAC_LIST, sizeof(*req),
+				wrb, &cmd);
+
+	req->hdr.domain = domain;
+	req->mac_count = mac_count;
+	if (mac_count)
+		memcpy(req->mac, mac_array, ETH_ALEN*mac_count);
+
+	status = be_mcc_notify_wait(adapter);
+
+err:
+	dma_free_coherent(&adapter->pdev->dev, cmd.size,
+				cmd.va, cmd.dma);
+	spin_unlock_bh(&adapter->mcc_lock);
+	return status;
+}
diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.h b/drivers/net/ethernet/emulex/benet/be_cmds.h
index ac11246..0b694c6 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.h
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.h
@@ -189,6 +189,8 @@ struct be_mcc_mailbox {
 #define OPCODE_COMMON_GET_PHY_DETAILS			102
 #define OPCODE_COMMON_SET_DRIVER_FUNCTION_CAP		103
 #define OPCODE_COMMON_GET_CNTL_ADDITIONAL_ATTRIBUTES	121
+#define OPCODE_COMMON_GET_MAC_LIST			147
+#define OPCODE_COMMON_SET_MAC_LIST			148
 #define OPCODE_COMMON_READ_OBJECT			171
 #define OPCODE_COMMON_WRITE_OBJECT			172
 
@@ -295,6 +297,7 @@ struct be_cmd_req_mac_query {
 	u8 type;
 	u8 permanent;
 	u16 if_id;
+	u32 pmac_id;
 } __packed;
 
 struct be_cmd_resp_mac_query {
@@ -1340,6 +1343,34 @@ struct be_cmd_resp_set_func_cap {
 	u8 rsvd[212];
 };
 
+/******************** GET/SET_MACLIST  **************************/
+#define BE_MAX_MAC			64
+struct amap_get_mac_list_context {
+	u8 macid[31];
+	u8 act;
+} __packed;
+
+struct be_cmd_req_get_mac_list {
+	struct be_cmd_req_hdr hdr;
+	u32 rsvd;
+} __packed;
+
+struct be_cmd_resp_get_mac_list {
+	struct be_cmd_resp_hdr hdr;
+	u8 mac_count;
+	u8 rsvd1;
+	u16 rsvd2;
+	u8 context[sizeof(struct amap_get_mac_list_context) / 8][BE_MAX_MAC];
+} __packed;
+
+struct be_cmd_req_set_mac_list {
+	struct be_cmd_req_hdr hdr;
+	u8 mac_count;
+	u8 rsvd1;
+	u16 rsvd2;
+	struct macaddr mac[BE_MAX_MAC];
+} __packed;
+
 /*************** HW Stats Get v1 **********************************/
 #define BE_TXP_SW_SZ			48
 struct be_port_rxf_stats_v1 {
@@ -1446,7 +1477,7 @@ static inline void *be_erx_stats_from_cmd(struct be_adapter *adapter)
 extern int be_pci_fnum_get(struct be_adapter *adapter);
 extern int be_cmd_POST(struct be_adapter *adapter);
 extern int be_cmd_mac_addr_query(struct be_adapter *adapter, u8 *mac_addr,
-			u8 type, bool permanent, u32 if_handle);
+			u8 type, bool permanent, u32 if_handle, u32 pmac_id);
 extern int be_cmd_pmac_add(struct be_adapter *adapter, u8 *mac_addr,
 			u32 if_id, u32 *pmac_id, u32 domain);
 extern int be_cmd_pmac_del(struct be_adapter *adapter, u32 if_id,
@@ -1542,4 +1573,8 @@ extern int be_cmd_get_cntl_attributes(struct be_adapter *adapter);
 extern int be_cmd_req_native_mode(struct be_adapter *adapter);
 extern int be_cmd_get_reg_len(struct be_adapter *adapter, u32 *log_size);
 extern void be_cmd_get_regs(struct be_adapter *adapter, u32 buf_len, void *buf);
+extern int be_cmd_get_mac_from_list(struct be_adapter *adapter, u32 domain,
+							u32 *pmac_id);
+extern int be_cmd_set_mac_list(struct be_adapter *adapter, u8 *mac_array,
+						u8 mac_count, u32 domain);
 
diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c
index 93869d4..c6fb7c3 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -237,7 +237,8 @@ static int be_mac_addr_set(struct net_device *netdev, void *p)
 		return -EADDRNOTAVAIL;
 
 	status = be_cmd_mac_addr_query(adapter, current_mac,
-			MAC_ADDRESS_TYPE_NETWORK, false, adapter->if_handle);
+				MAC_ADDRESS_TYPE_NETWORK, false,
+				adapter->if_handle, 0);
 	if (status)
 		goto err;
 
@@ -848,11 +849,18 @@ static int be_set_vf_mac(struct net_device *netdev, int vf, u8 *mac)
 	if (!is_valid_ether_addr(mac) || (vf >= num_vfs))
 		return -EINVAL;
 
-	status = be_cmd_pmac_del(adapter, adapter->vf_cfg[vf].vf_if_handle,
+	if (lancer_chip(adapter)) {
+		status = be_cmd_set_mac_list(adapter,  mac, 1, vf + 1);
+	} else {
+		status = be_cmd_pmac_del(adapter,
+				adapter->vf_cfg[vf].vf_if_handle,
 				adapter->vf_cfg[vf].vf_pmac_id, vf + 1);
 
-	status = be_cmd_pmac_add(adapter, mac, adapter->vf_cfg[vf].vf_if_handle,
+		status = be_cmd_pmac_add(adapter, mac,
+				adapter->vf_cfg[vf].vf_if_handle,
 				&adapter->vf_cfg[vf].vf_pmac_id, vf + 1);
+	}
+
 	if (status)
 		dev_err(&adapter->pdev->dev, "MAC %pM set on VF %d Failed\n",
 				mac, vf);
@@ -2465,13 +2473,18 @@ static inline int be_vf_eth_addr_config(struct be_adapter *adapter)
 	be_vf_eth_addr_generate(adapter, mac);
 
 	for (vf = 0; vf < num_vfs; vf++) {
-		status = be_cmd_pmac_add(adapter, mac,
+		if (lancer_chip(adapter)) {
+			status = be_cmd_set_mac_list(adapter,  mac, 1, vf + 1);
+		} else {
+			status = be_cmd_pmac_add(adapter, mac,
 					adapter->vf_cfg[vf].vf_if_handle,
 					&adapter->vf_cfg[vf].vf_pmac_id,
 					vf + 1);
+		}
+
 		if (status)
 			dev_err(&adapter->pdev->dev,
-				"Mac address add failed for VF %d\n", vf);
+			"Mac address assignment failed for VF %d\n", vf);
 		else
 			memcpy(adapter->vf_cfg[vf].vf_mac_addr, mac, ETH_ALEN);
 
@@ -2484,9 +2497,14 @@ static void be_vf_clear(struct be_adapter *adapter)
 {
 	u32 vf;
 
-	for (vf = 0; vf < num_vfs; vf++)
-		be_cmd_pmac_del(adapter, adapter->vf_cfg[vf].vf_if_handle,
-				adapter->vf_cfg[vf].vf_pmac_id, vf + 1);
+	for (vf = 0; vf < num_vfs; vf++) {
+		if (lancer_chip(adapter))
+			be_cmd_set_mac_list(adapter, NULL, 0, vf + 1);
+		else
+			be_cmd_pmac_del(adapter,
+					adapter->vf_cfg[vf].vf_if_handle,
+					adapter->vf_cfg[vf].vf_pmac_id, vf + 1);
+	}
 
 	for (vf = 0; vf < num_vfs; vf++)
 		be_cmd_if_destroy(adapter, adapter->vf_cfg[vf].vf_if_handle,
@@ -2527,7 +2545,9 @@ static int be_vf_setup(struct be_adapter *adapter)
 
 	be_vf_setup_init(adapter);
 
-	cap_flags = en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST;
+	cap_flags = en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST |
+				BE_IF_FLAGS_MULTICAST;
+
 	for (vf = 0; vf < num_vfs; vf++) {
 		status = be_cmd_if_create(adapter, cap_flags, en_flags, NULL,
 					&adapter->vf_cfg[vf].vf_if_handle,
@@ -2536,11 +2556,9 @@ static int be_vf_setup(struct be_adapter *adapter)
 			goto err;
 	}
 
-	if (!lancer_chip(adapter)) {
-		status = be_vf_eth_addr_config(adapter);
-		if (status)
-			goto err;
-	}
+	status = be_vf_eth_addr_config(adapter);
+	if (status)
+		goto err;
 
 	for (vf = 0; vf < num_vfs; vf++) {
 		status = be_cmd_link_status_query(adapter, NULL, &lnk_speed,
@@ -2564,6 +2582,23 @@ static void be_setup_init(struct be_adapter *adapter)
 	adapter->eq_next_idx = 0;
 }
 
+static int be_configure_mac_from_list(struct be_adapter *adapter, u8 *mac)
+{
+	u32 pmac_id;
+	int status = be_cmd_get_mac_from_list(adapter, 0, &pmac_id);
+	if (status != 0)
+		goto do_none;
+	status = be_cmd_mac_addr_query(adapter, mac,
+			MAC_ADDRESS_TYPE_NETWORK,
+			false, adapter->if_handle, pmac_id);
+	if (status != 0)
+		goto do_none;
+	status = be_cmd_pmac_add(adapter, mac, adapter->if_handle,
+			&adapter->pmac_id, 0);
+do_none:
+	return status;
+}
+
 static int be_setup(struct be_adapter *adapter)
 {
 	struct net_device *netdev = adapter->netdev;
@@ -2591,7 +2626,7 @@ static int be_setup(struct be_adapter *adapter)
 
 	memset(mac, 0, ETH_ALEN);
 	status = be_cmd_mac_addr_query(adapter, mac, MAC_ADDRESS_TYPE_NETWORK,
-			true /*permanent */, 0);
+			true /*permanent */, 0, 0);
 	if (status)
 		return status;
 	memcpy(adapter->netdev->dev_addr, mac, ETH_ALEN);
@@ -2618,12 +2653,17 @@ static int be_setup(struct be_adapter *adapter)
 			goto err;
 	}
 
-	/* For BEx, the VF's permanent mac queried from card is incorrect.
-	 * Query the mac configued by the PF using if_handle
-	 */
-	if (!be_physfn(adapter) && !lancer_chip(adapter)) {
-		status = be_cmd_mac_addr_query(adapter, mac,
-			MAC_ADDRESS_TYPE_NETWORK, false, adapter->if_handle);
+	 /* The VF's permanent mac queried from card is incorrect.
+	  * For BEx: Query the mac configued by the PF using if_handle
+	  * For Lancer: Get and use mac_list to obtain mac address.
+	  */
+	if (!be_physfn(adapter)) {
+		if (lancer_chip(adapter))
+			status = be_configure_mac_from_list(adapter, mac);
+		else
+			status = be_cmd_mac_addr_query(adapter, mac,
+					MAC_ADDRESS_TYPE_NETWORK, false,
+					adapter->if_handle, 0);
 		if (!status) {
 			memcpy(adapter->netdev->dev_addr, mac, ETH_ALEN);
 			memcpy(adapter->netdev->perm_addr, mac, ETH_ALEN);
@@ -2639,12 +2679,15 @@ static int be_setup(struct be_adapter *adapter)
 	be_set_rx_mode(adapter->netdev);
 
 	status = be_cmd_get_flow_control(adapter, &tx_fc, &rx_fc);
-	if (status)
+	/* For Lancer: It is legal for this cmd to fail on VF */
+	if (status && (be_physfn(adapter) || !lancer_chip(adapter)))
 		goto err;
+
 	if (rx_fc != adapter->rx_fc || tx_fc != adapter->tx_fc) {
 		status = be_cmd_set_flow_control(adapter, adapter->tx_fc,
 					adapter->rx_fc);
-		if (status)
+		/* For Lancer: It is legal for this cmd to fail on VF */
+		if (status && (be_physfn(adapter) || !lancer_chip(adapter)))
 			goto err;
 	}
 
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH net-next 0/5] be2net updates
From: Padmanabh Ratnakar @ 2011-11-25 15:47 UTC (permalink / raw)
  To: netdev; +Cc: Padmanabh Ratnakar

Please apply.
Thanks,
Padmanabh

Padmanabh Ratnakar (5):
  be2net: Move to new SR-IOV implementation in Lancer
  be2net: Fix error recovery paths
  be2net: Add error handling for Lancer
  be2net: Use new hash key
  be2net: Fix non utilization of RX queues

 drivers/net/ethernet/emulex/benet/be_cmds.c |  104 ++++++++++-
 drivers/net/ethernet/emulex/benet/be_cmds.h |   37 ++++-
 drivers/net/ethernet/emulex/benet/be_main.c |  265 +++++++++++++++++++--------
 3 files changed, 323 insertions(+), 83 deletions(-)

^ permalink raw reply

* RE: [v4 PATCH 2/2] NETFILTER userspace part for target HMARK
From: Jan Engelhardt @ 2011-11-25 15:44 UTC (permalink / raw)
  To: Hans Schillström
  Cc: Hans Schillstrom, kaber@trash.net, pablo@netfilter.org,
	netfilter-devel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <C8A6796DE7C66C4ABCBC18106CB6C1CC1641FB1D64@ESESSCMS0356.eemea.ericsson.se>

On Friday 2011-11-25 15:04, Hans Schillström wrote:

>
>>On Friday 2011-11-25 10:36, Hans Schillstrom wrote:
>>
>>>+Parameters:
>>>+For all masks default is all "1:s", to disable a field use mask 0
>>>+For IPv6 it's just the last 32 bits that is included in the hash
>>
>>Why limit IPv6 to 32?
>
>Performance, and the gain of adding another 192 bits to jhash ain't much.
>However there are some cases where it hurts, i.e. when you can't mask off a subnet.
>I'm not sure if it's a problem or not...

I was thinking about the case where two particular hosts have the same 
trailing 32 bits in their source address. For example, assuming IPv6 
starts to take a stronghold in the real world and home customers start 
assigning <myprefix>::1 to the little home server (i.e. the PPP 
endpoint) of theirs for remote login.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re[2]:  [v4 PATCH 1/2] NETFILTER module xt_hmark, new target for HASH based fwmark
From: Hans Schillstrom @ 2011-11-25 14:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: kaber, pablo, jengelh, netfilter-devel, netdev, hans.schillstrom

>---- Original Message ----
>From: Eric Dumazet <eric.dumazet@gmail.com>
>To: "Hans Schillstrom" <hans@schillstrom.com>
>Cc: kaber@trash.net, pablo@netfilter.org, jengelh@medozas.de, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, hans.schillstrom@ericsson.com
>Sent: Fri, Nov 25, 2011, 3:43 PM
>Subject: Re: [v4 PATCH 1/2] NETFILTER module xt_hmark, new target for HASH based fwmark
>
>Le vendredi 25 novembre 2011 à 10:36 +0100, Hans Schillstrom a écrit :
>> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
>> 
>> The target allows you to create rules in the "raw" and "mangle" tables
>> which alter the netfilter mark (nfmark) field within a given range.
>> First a 32 bit hash value is generated then modulus by <limit> and
>> finally an offset is added before it's written to nfmark.
>> Prior to routing, the nfmark can influence the routing method (see
>> "Use netfilter MARK value as routing key") and can also be used by
>> other subsystems to change their behavior.
>> 
>
>
>Oh well, yet another duplicated flow dissector ...
>
>> +/*
>> + * Calc hash value, special care is taken on icmp and fragmented messages
>> + * i.e. fragmented messages don't use ports.
>> + */
>> +__u32 hmark_v6(struct sk_buff *skb, const struct xt_action_param *par)
>> +{
>> +	struct xt_hmark_info *info = (struct xt_hmark_info *)par->targinfo;
>
>
>
>> +no6ports:
>> +	nexthdr &= info->prmask;
>> +	/* get a consistent hash (same value on both flow directions) */
>> +	if (addr2 < addr1)
>> +		swap(addr1, addr2);
>> +	hash = jhash_3words(addr1, addr2, ports.v32, info->hashrnd) ^ nexthdr;
>
>What's the point of computing the hash if info->hmod is null, since we don't set
>skb->mark?

No point at all :-)
I'll rearrange that

>
>> +	if (info->hmod)
>> +		skb->mark = (hash % info->hmod) + info->hoffs;
>> +
>> +	return XT_CONTINUE;
>> +}
>> +#endif
>> +
>
>
>Same problem/question on hmark_v4()

Thanks
Hans

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH v2] netns: fix proxy ARP entries listing on a netns
From: Jorge Boncompte [DTI2] @ 2011-11-25 14:41 UTC (permalink / raw)
  To: netdev; +Cc: Jorge Boncompte [DTI2]
In-Reply-To: <20111123.173613.1360696422325749253.davem@davemloft.net>

From: "Jorge Boncompte [DTI2]" <jorge@dti2.net>

Skip entries from foreign network namespaces.

V2:
    Fixed as suggested by David Miller to avoid a goto.

Signed-off-by: Jorge Boncompte [DTI2] <jorge@dti2.net>
---
 net/core/neighbour.c |    5 ++++-
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 2684794..27d3fef 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2414,7 +2414,10 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
 	struct net *net = seq_file_net(seq);
 	struct neigh_table *tbl = state->tbl;
 
-	pn = pn->next;
+	do {
+		pn = pn->next;
+	} while (pn && !net_eq(pneigh_net(pn), net));
+
 	while (!pn) {
 		if (++state->bucket > PNEIGH_HASHMASK)
 			break;
-- 
1.7.7.1

^ permalink raw reply related

* Re: [v4 PATCH 1/2] NETFILTER module xt_hmark, new target for HASH based fwmark
From: Eric Dumazet @ 2011-11-25 14:43 UTC (permalink / raw)
  To: Hans Schillstrom
  Cc: kaber, pablo, jengelh, netfilter-devel, netdev, hans.schillstrom
In-Reply-To: <1322213787-25796-2-git-send-email-hans@schillstrom.com>

Le vendredi 25 novembre 2011 à 10:36 +0100, Hans Schillstrom a écrit :
> From: Hans Schillstrom <hans.schillstrom@ericsson.com>
> 
> The target allows you to create rules in the "raw" and "mangle" tables
> which alter the netfilter mark (nfmark) field within a given range.
> First a 32 bit hash value is generated then modulus by <limit> and
> finally an offset is added before it's written to nfmark.
> Prior to routing, the nfmark can influence the routing method (see
> "Use netfilter MARK value as routing key") and can also be used by
> other subsystems to change their behavior.
> 


Oh well, yet another duplicated flow dissector ...

> +/*
> + * Calc hash value, special care is taken on icmp and fragmented messages
> + * i.e. fragmented messages don't use ports.
> + */
> +__u32 hmark_v6(struct sk_buff *skb, const struct xt_action_param *par)
> +{
> +	struct xt_hmark_info *info = (struct xt_hmark_info *)par->targinfo;



> +no6ports:
> +	nexthdr &= info->prmask;
> +	/* get a consistent hash (same value on both flow directions) */
> +	if (addr2 < addr1)
> +		swap(addr1, addr2);
> +	hash = jhash_3words(addr1, addr2, ports.v32, info->hashrnd) ^ nexthdr;

What's the point of computing the hash if info->hmod is null, since we don't set
skb->mark?

> +	if (info->hmod)
> +		skb->mark = (hash % info->hmod) + info->hoffs;
> +
> +	return XT_CONTINUE;
> +}
> +#endif
> +


Same problem/question on hmark_v4()


--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox