[PATCH 1/2] Traffic control cgroups subsystem

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 1/2] Traffic control cgroups subsystem
@ 2008-07-18 21:28 Ranjit Manomohan
  2008-07-21  9:26 ` Li Zefan
  2008-07-21 14:04 ` Patrick McHardy
  0 siblings, 2 replies; 19+ messages in thread
From: Ranjit Manomohan @ 2008-07-18 21:28 UTC (permalink / raw)
  To: linux-kernel, netdev; +Cc: menage

This patch adds a traffic control cgroup subsystem that is used
to tag all packets originating from tasks in this cgroup with a
specific identifier (tc_classid).

Signed-off-by: Ranjit Manomohan <ranjitm@google.com>

---

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e287745..4b12372 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -48,3 +48,9 @@ SUBSYS(devices)
  #endif

  /* */
+
+#ifdef CONFIG_CGROUP_TC
+SUBSYS(tc)
+#endif
+
+/* */
diff --git a/include/linux/cgroup_tc.h b/include/linux/cgroup_tc.h
new file mode 100644
index 0000000..fa6603f
--- /dev/null
+++ b/include/linux/cgroup_tc.h
@@ -0,0 +1,14 @@
+#ifndef __LINUX_CGROUP_TC_H
+#define __LINUX_CGROUP_TC_H
+
+/* Interface to obtain tasks cgroup identifier. */
+
+#include <linux/cgroup.h>
+
+#ifdef CONFIG_CGROUP_TC
+int cgroup_tc_classid(struct task_struct *tsk);
+#else
+#define cgroup_tc_classid(tsk) 0
+#endif /* CONFIG_CGROUP_TC */
+
+#endif /* __LINUX_CGROUP_TC_H */
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 299ec4b..e124294 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -326,6 +326,10 @@ struct sk_buff {
  	__u32			secmark;
  #endif

+#ifdef CONFIG_CGROUP_TC
+	__u32			cgroup_classid;
+#endif
+
  	__u32			mark;

  	sk_buff_data_t		transport_header;
diff --git a/include/net/sock.h b/include/net/sock.h
index dc42b44..7a4e09c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -271,6 +271,9 @@ struct sock {
  	int			sk_write_pending;
  	void			*sk_security;
  	__u32			sk_mark;
+#ifdef CONFIG_CGROUP_TC
+	__u32			sk_cgroup_classid;
+#endif
  	/* XXX 4 bytes hole on 64 bit */
  	void			(*sk_state_change)(struct sock *sk);
  	void			(*sk_data_ready)(struct sock *sk, int bytes);
diff --git a/init/Kconfig b/init/Kconfig
index 6135d07..c28fde8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -289,6 +289,17 @@ config CGROUP_DEBUG

  	  Say N if unsure

+config CGROUP_TC
+	bool "Traffic control cgroup subsystem"
+	depends on CGROUPS
+	default n
+	help
+	  This option enables a simple cgroup subsystem that
+	  allows network traffic to be classified based on the
+	  cgroup of the task originating the traffic.
+
+	  Say N if unsure
+
  config CGROUP_NS
          bool "Namespace cgroup subsystem"
          depends on CGROUPS
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938a..08b217b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
  obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
  obj-$(CONFIG_CPUSETS) += cpuset.o
  obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_CGROUP_TC) += tc_cgroup.o
  obj-$(CONFIG_UTS_NS) += utsname.o
  obj-$(CONFIG_USER_NS) += user_namespace.o
  obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/tc_cgroup.c b/kernel/tc_cgroup.c
new file mode 100644
index 0000000..3013608
--- /dev/null
+++ b/kernel/tc_cgroup.c
@@ -0,0 +1,98 @@
+/*
+ * tc_cgroup.c - traffic control cgroup subsystem
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/cgroup_tc.h>
+
+struct tc_cgroup {
+	struct cgroup_subsys_state css;
+	unsigned int classid;
+};
+
+struct cgroup_subsys tc_subsys;
+
+static inline struct tc_cgroup *cgroup_to_tc(
+		struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, tc_subsys_id),
+			    struct tc_cgroup, css);
+}
+
+int cgroup_tc_classid(struct task_struct *tsk)
+{
+	rcu_read_lock();
+	return container_of(task_subsys_state(tsk, tc_subsys_id),
+					 struct tc_cgroup, css)->classid;
+	rcu_read_unlock();
+}
+
+static struct cgroup_subsys_state *tc_create(struct cgroup_subsys *ss,
+						struct cgroup *cgroup)
+{
+	struct tc_cgroup *tc_cgroup;
+
+	tc_cgroup = kzalloc(sizeof(*tc_cgroup), GFP_KERNEL);
+
+	/* Copy parent's class id if present */
+	if (cgroup->parent)
+		tc_cgroup->classid = cgroup_to_tc(cgroup->parent)->classid;
+
+	if (!tc_cgroup)
+		return ERR_PTR(-ENOMEM);
+	return &tc_cgroup->css;
+}
+
+static void tc_destroy(struct cgroup_subsys *ss,
+			struct cgroup *cgroup)
+{
+	kfree(cgroup_to_tc(cgroup));
+}
+
+static int tc_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	struct tc_cgroup *tc = cgroup_to_tc(cgrp);
+
+	cgroup_lock();
+	if (cgroup_is_removed(cgrp)) {
+		cgroup_unlock();
+		return -ENODEV;
+	}
+
+	tc->classid = (unsigned int) (val & 0xffffffff);
+	cgroup_unlock();
+	return 0;
+}
+
+static u64 tc_read_u64(struct cgroup *cont, struct cftype *cft)
+{
+	struct tc_cgroup *tc = cgroup_to_tc(cont);
+	return tc->classid;
+}
+
+static struct cftype tc_files[] = {
+	{
+		.name = "classid",
+		.read_u64 = tc_read_u64,
+		.write_u64 = tc_write_u64,
+	}
+};
+
+static int tc_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	int err;
+	err = cgroup_add_files(cont, ss, tc_files, ARRAY_SIZE(tc_files));
+	return err;
+}
+
+struct cgroup_subsys tc_subsys = {
+	.name = "tc",
+	.create = tc_create,
+	.destroy  = tc_destroy,
+	.populate = tc_populate,
+	.subsys_id = tc_subsys_id,
+};
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e527628..7f8ceab 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -168,6 +168,11 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
  	}

  	skb->priority = sk->sk_priority;
+
+#ifdef CONFIG_CGROUP_TC
+	skb->cgroup_classid = sk->sk_cgroup_classid;
+#endif
+
  	skb->mark = sk->sk_mark;

  	/* Send it out. */
@@ -386,6 +391,9 @@ packet_routed:
  			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

  	skb->priority = sk->sk_priority;
+#ifdef CONFIG_CGROUP_TC
+	skb->cgroup_classid = sk->sk_cgroup_classid;
+#endif
  	skb->mark = sk->sk_mark;

  	return ip_local_out(skb);
@@ -1278,6 +1286,9 @@ int ip_push_pending_frames(struct sock *sk)
  	iph->daddr = rt->rt_dst;

  	skb->priority = sk->sk_priority;
+#ifdef CONFIG_CGROUP_TC
+	skb->cgroup_classid = sk->sk_cgroup_classid;
+#endif
  	skb->mark = sk->sk_mark;
  	skb->dst = dst_clone(&rt->u.dst);

@@ -1387,6 +1398,9 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
  	bh_lock_sock(sk);
  	inet->tos = ip_hdr(skb)->tos;
  	sk->sk_priority = skb->priority;
+#ifdef CONFIG_CGROUP_TC
+	skb->cgroup_classid = sk->sk_cgroup_classid;
+#endif
  	sk->sk_protocol = ip_hdr(skb)->protocol;
  	sk->sk_bound_dev_if = arg->bound_dev_if;
  	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 48cdce9..306bb37 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -257,6 +257,10 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
  	ipv6_addr_copy(&hdr->daddr, first_hop);

  	skb->priority = sk->sk_priority;
+#ifdef CONFIG_CGROUP_TC
+	skb->cgroup_classid = sk->sk_cgroup_classid;
+#endif
+
  	skb->mark = sk->sk_mark;

  	mtu = dst_mtu(dst);
@@ -1448,6 +1452,9 @@ int ip6_push_pending_frames(struct sock *sk)
  	ipv6_addr_copy(&hdr->daddr, final_dst);

  	skb->priority = sk->sk_priority;
+#ifdef CONFIG_CGROUP_TC
+	skb->cgroup_classid = sk->sk_cgroup_classid;
+#endif
  	skb->mark = sk->sk_mark;

  	skb->dst = dst_clone(&rt->u.dst);
diff --git a/net/socket.c b/net/socket.c
index 66c4a8c..7c5183c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -93,6 +93,7 @@

  #include <net/sock.h>
  #include <linux/netfilter.h>
+#include <linux/cgroup_tc.h>

  static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
  static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
@@ -1170,6 +1171,11 @@ static int __sock_create(struct net *net, int family, int type, int protocol,
  	if (err < 0)
  		goto out_module_put;

+#ifdef CONFIG_CGROUP_TC
+	if (sock->sk)
+		sock->sk->sk_cgroup_classid = cgroup_tc_classid(current);
+#endif
+
  	/*
  	 * Now to bump the refcnt of the [loadable] module that owns this
  	 * socket at sock_release time we decrement its refcnt.
@@ -1444,6 +1450,11 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
  	if (err < 0)
  		goto out_fd;

+#ifdef CONFIG_CGROUP_TC
+	if (newsock->sk)
+		newsock->sk->sk_cgroup_classid = cgroup_tc_classid(current);
+#endif
+
  	if (upeer_sockaddr) {
  		if (newsock->ops->getname(newsock, (struct sockaddr *)address,
  					  &len, 2) < 0) {

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-07-18 21:28 Ranjit Manomohan
@ 2008-07-21  9:26 ` Li Zefan
  2008-07-21 14:04 ` Patrick McHardy
  1 sibling, 0 replies; 19+ messages in thread
From: Li Zefan @ 2008-07-21  9:26 UTC (permalink / raw)
  To: Ranjit Manomohan; +Cc: linux-kernel, netdev, menage

Ranjit Manomohan wrote:
> This patch adds a traffic control cgroup subsystem that is used
> to tag all packets originating from tasks in this cgroup with a
> specific identifier (tc_classid).
> 
> Signed-off-by: Ranjit Manomohan <ranjitm@google.com>
> 
> ---
> 
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index e287745..4b12372 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -48,3 +48,9 @@ SUBSYS(devices)
>  #endif
> 
>  /* */
> +
> +#ifdef CONFIG_CGROUP_TC
> +SUBSYS(tc)
> +#endif
> +

seems tc is not a good name... I won't know it stands for traffic-control if I
didn't know beforehand.

> +int cgroup_tc_classid(struct task_struct *tsk)
> +{
> +    rcu_read_lock();
> +    return container_of(task_subsys_state(tsk, tc_subsys_id),
> +                     struct tc_cgroup, css)->classid;
> +    rcu_read_unlock();

How do you unlock after return ;)

> +}
> +
> +static struct cgroup_subsys_state *tc_create(struct cgroup_subsys *ss,
> +                        struct cgroup *cgroup)
> +{
> +    struct tc_cgroup *tc_cgroup;
> +
> +    tc_cgroup = kzalloc(sizeof(*tc_cgroup), GFP_KERNEL);
> +

The 'if (!tc_cgroup)' below should be here.

> +    /* Copy parent's class id if present */
> +    if (cgroup->parent)
> +        tc_cgroup->classid = cgroup_to_tc(cgroup->parent)->classid;
> +
> +    if (!tc_cgroup)
> +        return ERR_PTR(-ENOMEM);
> +    return &tc_cgroup->css;
> +}

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-07-18 21:28 Ranjit Manomohan
  2008-07-21  9:26 ` Li Zefan
@ 2008-07-21 14:04 ` Patrick McHardy
  1 sibling, 0 replies; 19+ messages in thread
From: Patrick McHardy @ 2008-07-21 14:04 UTC (permalink / raw)
  To: Ranjit Manomohan; +Cc: linux-kernel, netdev, menage

Ranjit Manomohan wrote:
> This patch adds a traffic control cgroup subsystem that is used
> to tag all packets originating from tasks in this cgroup with a
> specific identifier (tc_classid).
> 
> +#ifdef CONFIG_CGROUP_TC
> +    skb->cgroup_classid = sk->sk_cgroup_classid;
> +#endif

Please wrap this in an inline function so you don't have to
put the #ifdefs everywhere.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 1/2] Traffic control cgroups subsystem
@ 2008-07-24 23:35 Ranjit Manomohan
  0 siblings, 0 replies; 19+ messages in thread
From: Ranjit Manomohan @ 2008-07-24 23:35 UTC (permalink / raw)
  To: linux-kernel, netdev; +Cc: lizf, menage, kaber, akpm

[Take 4] incorporated additional comments from Patrick McHardy

This patch adds a traffic control cgroup subsystem that is used
to associate all packets originating from tasks in this cgroup with a
specific identifier (tc_classid).

Signed-off-by: Ranjit Manomohan <ranjitm@google.com>

---

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e287745..4b12372 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -48,3 +48,9 @@ SUBSYS(devices)
 #endif

 /* */
+
+#ifdef CONFIG_CGROUP_TC
+SUBSYS(tc)
+#endif
+
+/* */
diff --git a/include/linux/cgroup_tc.h b/include/linux/cgroup_tc.h
new file mode 100644
index 0000000..e4ba6a1
--- /dev/null
+++ b/include/linux/cgroup_tc.h
@@ -0,0 +1,20 @@
+#ifndef __LINUX_CGROUP_TC_H
+#define __LINUX_CGROUP_TC_H
+
+/* Interface to obtain tasks cgroup identifier. */
+
+#include <linux/cgroup.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+
+#ifdef CONFIG_CGROUP_TC
+
+void cgroup_tc_set_sock_classid(struct sock *sk);
+
+#else
+
+#define cgroup_tc_set_sock_classid(sk)
+
+#endif /* CONFIG_CGROUP_TC */
+
+#endif /* __LINUX_CGROUP_TC_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index dc42b44..7a4e09c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -271,6 +271,9 @@ struct sock {
 	int			sk_write_pending;
 	void			*sk_security;
 	__u32			sk_mark;
+#ifdef CONFIG_CGROUP_TC
+	__u32			sk_cgroup_classid;
+#endif
 	/* XXX 4 bytes hole on 64 bit */
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk, int bytes);
diff --git a/init/Kconfig b/init/Kconfig
index 6135d07..c28fde8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -289,6 +289,17 @@ config CGROUP_DEBUG

 	  Say N if unsure

+config CGROUP_TC
+	bool "Traffic control cgroup subsystem"
+	depends on CGROUPS
+	default n
+	help
+	  This option enables a simple cgroup subsystem that
+	  allows network traffic to be classified based on the
+	  cgroup of the task originating the traffic.
+
+	  Say N if unsure
+
 config CGROUP_NS
         bool "Namespace cgroup subsystem"
         depends on CGROUPS
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938a..08b217b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_CGROUP_TC) += tc_cgroup.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/tc_cgroup.c b/kernel/tc_cgroup.c
new file mode 100644
index 0000000..9286fb2
--- /dev/null
+++ b/kernel/tc_cgroup.c
@@ -0,0 +1,108 @@
+/*
+ * tc_cgroup.c - traffic control cgroup subsystem
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/cgroup_tc.h>
+
+struct tc_cgroup {
+	struct cgroup_subsys_state css;
+	unsigned int classid;
+};
+
+struct cgroup_subsys tc_subsys;
+
+static inline struct tc_cgroup *cgroup_to_tc(
+		struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, tc_subsys_id),
+			    struct tc_cgroup, css);
+}
+
+static unsigned int cgroup_tc_classid(struct task_struct *tsk)
+{
+	unsigned int tc_classid;
+
+	rcu_read_lock();
+	tc_classid =  container_of(task_subsys_state(tsk, tc_subsys_id),
+					 struct tc_cgroup, css)->classid;
+	rcu_read_unlock();
+	return tc_classid;
+}
+
+void cgroup_tc_set_sock_classid(struct sock *sk)
+{
+	if (sk)
+		sk->sk_cgroup_classid = cgroup_tc_classid(current);
+}
+
+static struct cgroup_subsys_state *tc_create(struct cgroup_subsys *ss,
+						struct cgroup *cgroup)
+{
+	struct tc_cgroup *tc_cgroup;
+
+	tc_cgroup = kzalloc(sizeof(*tc_cgroup), GFP_KERNEL);
+
+	if (!tc_cgroup)
+		return ERR_PTR(-ENOMEM);
+
+	/* Copy parent's class id if present */
+	if (cgroup->parent)
+		tc_cgroup->classid = cgroup_to_tc(cgroup->parent)->classid;
+
+	return &tc_cgroup->css;
+}
+
+static void tc_destroy(struct cgroup_subsys *ss,
+			struct cgroup *cgroup)
+{
+	kfree(cgroup_to_tc(cgroup));
+}
+
+static int tc_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	struct tc_cgroup *tc = cgroup_to_tc(cgrp);
+
+	cgroup_lock();
+	if (cgroup_is_removed(cgrp)) {
+		cgroup_unlock();
+		return -ENODEV;
+	}
+
+	tc->classid = (unsigned int) (val & 0xffffffff);
+	cgroup_unlock();
+	return 0;
+}
+
+static u64 tc_read_u64(struct cgroup *cont, struct cftype *cft)
+{
+	struct tc_cgroup *tc = cgroup_to_tc(cont);
+	return tc->classid;
+}
+
+static struct cftype tc_files[] = {
+	{
+		.name = "classid",
+		.read_u64 = tc_read_u64,
+		.write_u64 = tc_write_u64,
+	}
+};
+
+static int tc_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	int err;
+	err = cgroup_add_files(cont, ss, tc_files, ARRAY_SIZE(tc_files));
+	return err;
+}
+
+struct cgroup_subsys tc_subsys = {
+	.name = "tc",
+	.create = tc_create,
+	.destroy  = tc_destroy,
+	.populate = tc_populate,
+	.subsys_id = tc_subsys_id,
+};
diff --git a/net/socket.c b/net/socket.c
index 66c4a8c..b7421ec 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -93,6 +93,7 @@

 #include <net/sock.h>
 #include <linux/netfilter.h>
+#include <linux/cgroup_tc.h>

 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
@@ -1170,6 +1171,8 @@ static int __sock_create(struct net *net, int family, int type, int protocol,
 	if (err < 0)
 		goto out_module_put;

+	cgroup_tc_set_sock_classid(sock->sk);
+
 	/*
 	 * Now to bump the refcnt of the [loadable] module that owns this
 	 * socket at sock_release time we decrement its refcnt.
@@ -1444,6 +1447,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
 	if (err < 0)
 		goto out_fd;

+	cgroup_tc_set_sock_classid(newsock->sk);
+
 	if (upeer_sockaddr) {
 		if (newsock->ops->getname(newsock, (struct sockaddr *)address,
 					  &len, 2) < 0) {

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 1/2] Traffic control cgroups subsystem
@ 2008-08-22  0:55 Ranjit Manomohan
  2008-08-22  2:11 ` Li Zefan
  0 siblings, 1 reply; 19+ messages in thread
From: Ranjit Manomohan @ 2008-08-22  0:55 UTC (permalink / raw)
  To: linux-kernel, netdev; +Cc: lizf, menage, kaber, akpm

This patch adds a traffic control cgroup subsystem that is used
to associate all packets originating from tasks in this cgroup with a
specific identifier (tc_classid).

Signed-off-by: Ranjit Manomohan <ranjitm@google.com>

---

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e287745..4b12372 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -48,3 +48,9 @@ SUBSYS(devices)
 #endif

 /* */
+
+#ifdef CONFIG_CGROUP_TC
+SUBSYS(tc)
+#endif
+
+/* */
diff --git a/include/linux/cgroup_tc.h b/include/linux/cgroup_tc.h
new file mode 100644
index 0000000..e4ba6a1
--- /dev/null
+++ b/include/linux/cgroup_tc.h
@@ -0,0 +1,20 @@
+#ifndef __LINUX_CGROUP_TC_H
+#define __LINUX_CGROUP_TC_H
+
+/* Interface to obtain tasks cgroup identifier. */
+
+#include <linux/cgroup.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+
+#ifdef CONFIG_CGROUP_TC
+
+void cgroup_tc_set_sock_classid(struct sock *sk);
+
+#else
+
+#define cgroup_tc_set_sock_classid(sk)
+
+#endif /* CONFIG_CGROUP_TC */
+
+#endif /* __LINUX_CGROUP_TC_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index dc42b44..7a4e09c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -271,6 +271,9 @@ struct sock {
 	int			sk_write_pending;
 	void			*sk_security;
 	__u32			sk_mark;
+#ifdef CONFIG_CGROUP_TC
+	__u32			sk_cgroup_classid;
+#endif
 	/* XXX 4 bytes hole on 64 bit */
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk, int bytes);
diff --git a/init/Kconfig b/init/Kconfig
index 6135d07..c28fde8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -289,6 +289,17 @@ config CGROUP_DEBUG

 	  Say N if unsure

+config CGROUP_TC
+	bool "Traffic control cgroup subsystem"
+	depends on CGROUPS
+	default n
+	help
+	  This option enables a simple cgroup subsystem that
+	  allows network traffic to be classified based on the
+	  cgroup of the task originating the traffic.
+
+	  Say N if unsure
+
 config CGROUP_NS
         bool "Namespace cgroup subsystem"
         depends on CGROUPS
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938a..08b217b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_CGROUP_TC) += tc_cgroup.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/tc_cgroup.c b/kernel/tc_cgroup.c
new file mode 100644
index 0000000..9286fb2
--- /dev/null
+++ b/kernel/tc_cgroup.c
@@ -0,0 +1,108 @@
+/*
+ * tc_cgroup.c - traffic control cgroup subsystem
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/cgroup_tc.h>
+
+struct tc_cgroup {
+	struct cgroup_subsys_state css;
+	unsigned int classid;
+};
+
+struct cgroup_subsys tc_subsys;
+
+static inline struct tc_cgroup *cgroup_to_tc(
+		struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, tc_subsys_id),
+			    struct tc_cgroup, css);
+}
+
+static unsigned int cgroup_tc_classid(struct task_struct *tsk)
+{
+	unsigned int tc_classid;
+
+	rcu_read_lock();
+	tc_classid =  container_of(task_subsys_state(tsk, tc_subsys_id),
+					 struct tc_cgroup, css)->classid;
+	rcu_read_unlock();
+	return tc_classid;
+}
+
+void cgroup_tc_set_sock_classid(struct sock *sk)
+{
+	if (sk)
+		sk->sk_cgroup_classid = cgroup_tc_classid(current);
+}
+
+static struct cgroup_subsys_state *tc_create(struct cgroup_subsys *ss,
+						struct cgroup *cgroup)
+{
+	struct tc_cgroup *tc_cgroup;
+
+	tc_cgroup = kzalloc(sizeof(*tc_cgroup), GFP_KERNEL);
+
+	if (!tc_cgroup)
+		return ERR_PTR(-ENOMEM);
+
+	/* Copy parent's class id if present */
+	if (cgroup->parent)
+		tc_cgroup->classid = cgroup_to_tc(cgroup->parent)->classid;
+
+	return &tc_cgroup->css;
+}
+
+static void tc_destroy(struct cgroup_subsys *ss,
+			struct cgroup *cgroup)
+{
+	kfree(cgroup_to_tc(cgroup));
+}
+
+static int tc_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	struct tc_cgroup *tc = cgroup_to_tc(cgrp);
+
+	cgroup_lock();
+	if (cgroup_is_removed(cgrp)) {
+		cgroup_unlock();
+		return -ENODEV;
+	}
+
+	tc->classid = (unsigned int) (val & 0xffffffff);
+	cgroup_unlock();
+	return 0;
+}
+
+static u64 tc_read_u64(struct cgroup *cont, struct cftype *cft)
+{
+	struct tc_cgroup *tc = cgroup_to_tc(cont);
+	return tc->classid;
+}
+
+static struct cftype tc_files[] = {
+	{
+		.name = "classid",
+		.read_u64 = tc_read_u64,
+		.write_u64 = tc_write_u64,
+	}
+};
+
+static int tc_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	int err;
+	err = cgroup_add_files(cont, ss, tc_files, ARRAY_SIZE(tc_files));
+	return err;
+}
+
+struct cgroup_subsys tc_subsys = {
+	.name = "tc",
+	.create = tc_create,
+	.destroy  = tc_destroy,
+	.populate = tc_populate,
+	.subsys_id = tc_subsys_id,
+};
diff --git a/net/socket.c b/net/socket.c
index 66c4a8c..b7421ec 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -93,6 +93,7 @@

 #include <net/sock.h>
 #include <linux/netfilter.h>
+#include <linux/cgroup_tc.h>

 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
@@ -1170,6 +1171,8 @@ static int __sock_create(struct net *net, int family, int type, int protocol,
 	if (err < 0)
 		goto out_module_put;

+	cgroup_tc_set_sock_classid(sock->sk);
+
 	/*
 	 * Now to bump the refcnt of the [loadable] module that owns this
 	 * socket at sock_release time we decrement its refcnt.
@@ -1444,6 +1447,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
 	if (err < 0)
 		goto out_fd;

+	cgroup_tc_set_sock_classid(newsock->sk);
+
 	if (upeer_sockaddr) {
 		if (newsock->ops->getname(newsock, (struct sockaddr *)address,
 					  &len, 2) < 0) {

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-08-22  0:55 [PATCH 1/2] Traffic control cgroups subsystem Ranjit Manomohan
@ 2008-08-22  2:11 ` Li Zefan
  0 siblings, 0 replies; 19+ messages in thread
From: Li Zefan @ 2008-08-22  2:11 UTC (permalink / raw)
  To: Ranjit Manomohan; +Cc: linux-kernel, netdev, menage, kaber, akpm

> +static unsigned int cgroup_tc_classid(struct task_struct *tsk)
> +{
> +	unsigned int tc_classid;
> +
> +	rcu_read_lock();
> +	tc_classid =  container_of(task_subsys_state(tsk, tc_subsys_id),

trivial: 2 spaces   ^^

> +					 struct tc_cgroup, css)->classid;
> +	rcu_read_unlock();
> +	return tc_classid;
> +}
> +

...

> +static int tc_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
> +{
> +	struct tc_cgroup *tc = cgroup_to_tc(cgrp);
> +
> +	cgroup_lock();
> +	if (cgroup_is_removed(cgrp)) {
> +		cgroup_unlock();
> +		return -ENODEV;
> +	}
> +

You may use cgroup_lock_live_group():

	if (cgroup_lock_live_group(cgrp0))
		return -ENODEV;

> +	tc->classid = (unsigned int) (val & 0xffffffff);
> +	cgroup_unlock();
> +	return 0;
> +}
> +
> +static u64 tc_read_u64(struct cgroup *cont, struct cftype *cft)

'cont', 'cgrp' and 'cgroup' are used, it's better to make it consistent
to use 'cgrp' (or 'cgroup') only.

> +{
> +	struct tc_cgroup *tc = cgroup_to_tc(cont);
> +	return tc->classid;
> +}

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH 1/2] Traffic control cgroups subsystem
@ 2008-09-10 17:42 Ranjit Manomohan
  2008-09-10 22:01 ` Thomas Graf
  0 siblings, 1 reply; 19+ messages in thread
From: Ranjit Manomohan @ 2008-09-10 17:42 UTC (permalink / raw)
  To: davem, akpm, kaber, lizf, menage, tgraf; +Cc: linux-kernel, netdev

This patch adds a traffic control cgroup subsystem that is used
to associate all packets originating from tasks in this cgroup with a
specific identifier (tc_classid).

Signed-off-by: Ranjit Manomohan <ranjitm@google.com>

---

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e287745..4b12372 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -48,3 +48,9 @@ SUBSYS(devices)
 #endif

 /* */
+
+#ifdef CONFIG_CGROUP_TC
+SUBSYS(tc)
+#endif
+
+/* */
diff --git a/include/linux/cgroup_tc.h b/include/linux/cgroup_tc.h
new file mode 100644
index 0000000..e4ba6a1
--- /dev/null
+++ b/include/linux/cgroup_tc.h
@@ -0,0 +1,20 @@
+#ifndef __LINUX_CGROUP_TC_H
+#define __LINUX_CGROUP_TC_H
+
+/* Interface to obtain tasks cgroup identifier. */
+
+#include <linux/cgroup.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+
+#ifdef CONFIG_CGROUP_TC
+
+void cgroup_tc_set_sock_classid(struct sock *sk);
+
+#else
+
+#define cgroup_tc_set_sock_classid(sk)
+
+#endif /* CONFIG_CGROUP_TC */
+
+#endif /* __LINUX_CGROUP_TC_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index dc42b44..7a4e09c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -271,6 +271,9 @@ struct sock {
 	int			sk_write_pending;
 	void			*sk_security;
 	__u32			sk_mark;
+#ifdef CONFIG_CGROUP_TC
+	__u32			sk_cgroup_classid;
+#endif
 	/* XXX 4 bytes hole on 64 bit */
 	void			(*sk_state_change)(struct sock *sk);
 	void			(*sk_data_ready)(struct sock *sk, int bytes);
diff --git a/init/Kconfig b/init/Kconfig
index 6135d07..c28fde8 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -289,6 +289,17 @@ config CGROUP_DEBUG

 	  Say N if unsure

+config CGROUP_TC
+	bool "Traffic control cgroup subsystem"
+	depends on CGROUPS
+	default n
+	help
+	  This option enables a simple cgroup subsystem that
+	  allows network traffic to be classified based on the
+	  cgroup of the task originating the traffic.
+
+	  Say N if unsure
+
 config CGROUP_NS
         bool "Namespace cgroup subsystem"
         depends on CGROUPS
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938a..08b217b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_CGROUP_TC) += tc_cgroup.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
diff --git a/kernel/tc_cgroup.c b/kernel/tc_cgroup.c
new file mode 100644
index 0000000..9286fb2
--- /dev/null
+++ b/kernel/tc_cgroup.c
@@ -0,0 +1,105 @@
+/*
+ * tc_cgroup.c - traffic control cgroup subsystem
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/cgroup_tc.h>
+
+struct tc_cgroup {
+	struct cgroup_subsys_state css;
+	unsigned int classid;
+};
+
+struct cgroup_subsys tc_subsys;
+
+static inline struct tc_cgroup *cgroup_to_tc(
+		struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, tc_subsys_id),
+			    struct tc_cgroup, css);
+}
+
+static unsigned int cgroup_tc_classid(struct task_struct *tsk)
+{
+	unsigned int tc_classid;
+
+	rcu_read_lock();
+	tc_classid = container_of(task_subsys_state(tsk, tc_subsys_id),
+					 struct tc_cgroup, css)->classid;
+	rcu_read_unlock();
+	return tc_classid;
+}
+
+void cgroup_tc_set_sock_classid(struct sock *sk)
+{
+	if (sk)
+		sk->sk_cgroup_classid = cgroup_tc_classid(current);
+}
+
+static struct cgroup_subsys_state *tc_create(struct cgroup_subsys *ss,
+						struct cgroup *cgroup)
+{
+	struct tc_cgroup *tc_cgroup;
+
+	tc_cgroup = kzalloc(sizeof(*tc_cgroup), GFP_KERNEL);
+
+	if (!tc_cgroup)
+		return ERR_PTR(-ENOMEM);
+
+	/* Copy parent's class id if present */
+	if (cgroup->parent)
+		tc_cgroup->classid = cgroup_to_tc(cgroup->parent)->classid;
+
+	return &tc_cgroup->css;
+}
+
+static void tc_destroy(struct cgroup_subsys *ss,
+			struct cgroup *cgroup)
+{
+	kfree(cgroup_to_tc(cgroup));
+}
+
+static int tc_write_u64(struct cgroup *cgroup, struct cftype *cft, u64 val)
+{
+	struct tc_cgroup *tc = cgroup_to_tc(cgroup);
+
+	if (!cgroup_lock_live_group(cgroup))
+		return -ENODEV;
+
+	tc->classid = (unsigned int) (val & 0xffffffff);
+	cgroup_unlock();
+	return 0;
+}
+
+static u64 tc_read_u64(struct cgroup *cgroup, struct cftype *cft)
+{
+	struct tc_cgroup *tc = cgroup_to_tc(cgroup);
+	return tc->classid;
+}
+
+static struct cftype tc_files[] = {
+	{
+		.name = "classid",
+		.read_u64 = tc_read_u64,
+		.write_u64 = tc_write_u64,
+	}
+};
+
+static int tc_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
+{
+	int err;
+	err = cgroup_add_files(cgroup, ss, tc_files, ARRAY_SIZE(tc_files));
+	return err;
+}
+
+struct cgroup_subsys tc_subsys = {
+	.name = "tc",
+	.create = tc_create,
+	.destroy  = tc_destroy,
+	.populate = tc_populate,
+	.subsys_id = tc_subsys_id,
+};
diff --git a/net/socket.c b/net/socket.c
index 66c4a8c..b7421ec 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -93,6 +93,7 @@

 #include <net/sock.h>
 #include <linux/netfilter.h>
+#include <linux/cgroup_tc.h>

 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
@@ -1170,6 +1171,8 @@ static int __sock_create(struct net *net, int family, int type, int protocol,
 	if (err < 0)
 		goto out_module_put;

+	cgroup_tc_set_sock_classid(sock->sk);
+
 	/*
 	 * Now to bump the refcnt of the [loadable] module that owns this
 	 * socket at sock_release time we decrement its refcnt.
@@ -1444,6 +1447,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
 	if (err < 0)
 		goto out_fd;

+	cgroup_tc_set_sock_classid(newsock->sk);
+
 	if (upeer_sockaddr) {
 		if (newsock->ops->getname(newsock, (struct sockaddr *)address,
 					  &len, 2) < 0) {

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-09-10 17:42 Ranjit Manomohan
@ 2008-09-10 22:01 ` Thomas Graf
  2008-09-10 22:56   ` Ranjit Manomohan
  0 siblings, 1 reply; 19+ messages in thread
From: Thomas Graf @ 2008-09-10 22:01 UTC (permalink / raw)
  To: Ranjit Manomohan; +Cc: davem, akpm, kaber, lizf, menage, linux-kernel, netdev

* Ranjit Manomohan <ranjitm@google.com> 2008-09-10 10:42
> +void cgroup_tc_set_sock_classid(struct sock *sk)
> +{
> +	if (sk)
> +		sk->sk_cgroup_classid = cgroup_tc_classid(current);
> +}
> +
> @@ -1170,6 +1171,8 @@ static int __sock_create(struct net *net, int family, int type, int protocol,
>  	if (err < 0)
>  		goto out_module_put;
> 
> +	cgroup_tc_set_sock_classid(sock->sk);
> +
>  	/*
>  	 * Now to bump the refcnt of the [loadable] module that owns this
>  	 * socket at sock_release time we decrement its refcnt.
> @@ -1444,6 +1447,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
>  	if (err < 0)
>  		goto out_fd;
> 
> +	cgroup_tc_set_sock_classid(newsock->sk);
> +
>  	if (upeer_sockaddr) {
>  		if (newsock->ops->getname(newsock, (struct sockaddr *)address,
>  					  &len, 2) < 0) {

The big disadvantage of this method is that it does not allow to change
the classid for sockets which already exist. It inherits the classid
at socket creation time and then sticks to it. So if you want to follow
this approach I'd suggest to at least store a reference to the cgroup
state and reference count it properly.

As for the locking that you mentioned in the other thread. IMHO it is
not possible to lookup a socket without taking at least one lock, but
I might be wrong there. Actually I think it will take even more locks
as different locks are used to f.e. protect listening and established
tcp sockets.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-09-10 22:01 ` Thomas Graf
@ 2008-09-10 22:56   ` Ranjit Manomohan
  2008-09-10 23:00     ` David Miller
  2008-09-10 23:04     ` Paul Menage
  0 siblings, 2 replies; 19+ messages in thread
From: Ranjit Manomohan @ 2008-09-10 22:56 UTC (permalink / raw)
  To: Thomas Graf; +Cc: davem, akpm, kaber, lizf, menage, linux-kernel, netdev

On Wed, Sep 10, 2008 at 3:01 PM, Thomas Graf <tgraf@suug.ch> wrote:
> * Ranjit Manomohan <ranjitm@google.com> 2008-09-10 10:42
>> +void cgroup_tc_set_sock_classid(struct sock *sk)
>> +{
>> +     if (sk)
>> +             sk->sk_cgroup_classid = cgroup_tc_classid(current);
>> +}
>> +
>> @@ -1170,6 +1171,8 @@ static int __sock_create(struct net *net, int family, int type, int protocol,
>>       if (err < 0)
>>               goto out_module_put;
>>
>> +     cgroup_tc_set_sock_classid(sock->sk);
>> +
>>       /*
>>        * Now to bump the refcnt of the [loadable] module that owns this
>>        * socket at sock_release time we decrement its refcnt.
>> @@ -1444,6 +1447,8 @@ asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
>>       if (err < 0)
>>               goto out_fd;
>>
>> +     cgroup_tc_set_sock_classid(newsock->sk);
>> +
>>       if (upeer_sockaddr) {
>>               if (newsock->ops->getname(newsock, (struct sockaddr *)address,
>>                                         &len, 2) < 0) {
>
> The big disadvantage of this method is that it does not allow to change
> the classid for sockets which already exist. It inherits the classid
> at socket creation time and then sticks to it. So if you want to follow
> this approach I'd suggest to at least store a reference to the cgroup
> state and reference count it properly.
>

I had considered adding support for moving tasks between cgroups by
going through all the open fds of the task and updating the sockets
(in the cgroup attach method). It is a very heavy weight operation and
not considered a common use case (and even undesirable at times) so I
dropped the idea. I can add it back in if it is considered essential.


> As for the locking that you mentioned in the other thread. IMHO it is
> not possible to lookup a socket without taking at least one lock, but
> I might be wrong there. Actually I think it will take even more locks
> as different locks are used to f.e. protect listening and established
> tcp sockets.

That is correct for ingress, for egress the sk is already available in
the skb so should be fine.

-Thanks,
Ranjit

>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-09-10 22:56   ` Ranjit Manomohan
@ 2008-09-10 23:00     ` David Miller
  2008-09-10 23:14       ` Ranjit Manomohan
  2008-09-10 23:04     ` Paul Menage
  1 sibling, 1 reply; 19+ messages in thread
From: David Miller @ 2008-09-10 23:00 UTC (permalink / raw)
  To: ranjitm; +Cc: tgraf, akpm, kaber, lizf, menage, linux-kernel, netdev

From: "Ranjit Manomohan" <ranjitm@google.com>
Date: Wed, 10 Sep 2008 15:56:55 -0700

> That is correct for ingress, for egress the sk is already available in
> the skb so should be fine.

That is not something you can rely upon, even for egress, %100 of the time.

Some forms of reallocation and mangling might decide to orphan the SKB
and thus drop the skb->sk reference before you see the packet.  And they
are absolutely free to do this.

Just grep for skb_orphan().

Therefore, it is absolutely something you should not rely upon for
correct operation.

Like I said from the beginning, Thomas's approach is the superior one.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-09-10 22:56   ` Ranjit Manomohan
  2008-09-10 23:00     ` David Miller
@ 2008-09-10 23:04     ` Paul Menage
  2008-09-10 23:24       ` Thomas Graf
  1 sibling, 1 reply; 19+ messages in thread
From: Paul Menage @ 2008-09-10 23:04 UTC (permalink / raw)
  To: Ranjit Manomohan
  Cc: Thomas Graf, davem, akpm, kaber, lizf, linux-kernel, netdev

On Wed, Sep 10, 2008 at 3:56 PM, Ranjit Manomohan <ranjitm@google.com> wrote:
>>
>> The big disadvantage of this method is that it does not allow to change
>> the classid for sockets which already exist. It inherits the classid
>> at socket creation time and then sticks to it. So if you want to follow
>> this approach I'd suggest to at least store a reference to the cgroup
>> state and reference count it properly.
>>
>
> I had considered adding support for moving tasks between cgroups by
> going through all the open fds of the task and updating the sockets
> (in the cgroup attach method). It is a very heavy weight operation and
> not considered a common use case (and even undesirable at times) so I
> dropped the idea. I can add it back in if it is considered essential.
>

That's a bit different from what Thomas is suggesting (I think).

There are three options:

a) socket acquires class id at creation time from its creator task or
its parent socket. So the class id is fixed for the lifetime of the
socket

b) socket acquires a reference to a cgroup at creation time, from its
creator task or its parent socket. So the class id can be updated by
changing the cgroup's class id, but the cgroup of the socket can't be
changed. This can prevent the cgroup from being properly destroyed.

c) socket acquires a reference to a cgroup at creation time, and can
be moved to a different cgroup when tasks that reference it move
between cgroups.

Our patches use option a. Option c is too heavyweight IMO, and has
vague semantics for exactly when movement should occur. Option b
*could* be useful, if you wanted to be able to share class ids between
cgroups, *and* shuffle the sharing relationships around on the fly. I
think that Thomas is suggestion option b. I'm not sure that I see a
concrete use case for it though - Thomas, what use cases did you see?

Paul

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-09-10 23:00     ` David Miller
@ 2008-09-10 23:14       ` Ranjit Manomohan
  0 siblings, 0 replies; 19+ messages in thread
From: Ranjit Manomohan @ 2008-09-10 23:14 UTC (permalink / raw)
  To: David Miller; +Cc: tgraf, akpm, kaber, lizf, menage, linux-kernel, netdev

On Wed, Sep 10, 2008 at 4:00 PM, David Miller <davem@davemloft.net> wrote:
> From: "Ranjit Manomohan" <ranjitm@google.com>
> Date: Wed, 10 Sep 2008 15:56:55 -0700
>
>> That is correct for ingress, for egress the sk is already available in
>> the skb so should be fine.
>
> That is not something you can rely upon, even for egress, %100 of the time.
>
> Some forms of reallocation and mangling might decide to orphan the SKB
> and thus drop the skb->sk reference before you see the packet.  And they
> are absolutely free to do this.
>
> Just grep for skb_orphan().
>
> Therefore, it is absolutely something you should not rely upon for
> correct operation.

Thats fine and we do not rely on this. Those packets would just not be
classified, the cgroup classid is only a hint and used when available
(which is most of the time).

>
> Like I said from the beginning, Thomas's approach is the superior one.

It would leave a lot of packets (like acks) unaccounted for and these
do take up a significant portion of network packets transmitted on a
high speed link). I am ok with Thomas' simpler approach too, just
pointing out that it is not as accurate as the proposed alternative.

-Thanks,
Ranjit

>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-09-10 23:04     ` Paul Menage
@ 2008-09-10 23:24       ` Thomas Graf
  2008-09-10 23:31         ` Paul Menage
  0 siblings, 1 reply; 19+ messages in thread
From: Thomas Graf @ 2008-09-10 23:24 UTC (permalink / raw)
  To: Paul Menage
  Cc: Ranjit Manomohan, davem, akpm, kaber, lizf, linux-kernel, netdev

* Paul Menage <menage@google.com> 2008-09-10 16:04
> That's a bit different from what Thomas is suggesting (I think).
> 
> There are three options:
> 
> a) socket acquires class id at creation time from its creator task or
> its parent socket. So the class id is fixed for the lifetime of the
> socket
> 
> b) socket acquires a reference to a cgroup at creation time, from its
> creator task or its parent socket. So the class id can be updated by
> changing the cgroup's class id, but the cgroup of the socket can't be
> changed. This can prevent the cgroup from being properly destroyed.
> 
> c) socket acquires a reference to a cgroup at creation time, and can
> be moved to a different cgroup when tasks that reference it move
> between cgroups.
> 
> Our patches use option a. Option c is too heavyweight IMO, and has
> vague semantics for exactly when movement should occur. Option b
> *could* be useful, if you wanted to be able to share class ids between
> cgroups, *and* shuffle the sharing relationships around on the fly. I
> think that Thomas is suggestion option b.

That's right, I had option b) in mind.

> I'm not sure that I see a
> concrete use case for it though - Thomas, what use cases did you see?

Without a) this whole feature is very limited. It requires a process to
be registered to the cgroup before it creates any sockets. Otherwise
these sockets will not have the proper classid value and traffic from
and to this sockets will not be classified. I don't see how this is
practical since many applications create their sockets when the
application is started. F.e. a web browser is causing a bulk data
transfer, admin/user notices this and wants to put it in a restricted
cgroup, won't work.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-09-10 23:24       ` Thomas Graf
@ 2008-09-10 23:31         ` Paul Menage
  2008-09-10 23:45           ` Thomas Graf
  0 siblings, 1 reply; 19+ messages in thread
From: Paul Menage @ 2008-09-10 23:31 UTC (permalink / raw)
  To: Thomas Graf
  Cc: Ranjit Manomohan, davem, akpm, kaber, lizf, linux-kernel, netdev

On Wed, Sep 10, 2008 at 4:24 PM, Thomas Graf <tgraf@suug.ch> wrote:
> Without a) this whole feature is very limited. It requires a process to
> be registered to the cgroup before it creates any sockets.

I think that it's a bit more flexible than that - any newly created
sockets (or newly accepted sockets) will inherit the correct classid,
so it's only existing connections that don't get reclassified.

But the common case is that applications are going to be dropped in
the correct cgroup *before* they start, via mechanisms such as pam
login modules or job-control middleware.

> Otherwise
> these sockets will not have the proper classid value and traffic from
> and to this sockets will not be classified. I don't see how this is
> practical since many applications create their sockets when the
> application is started. F.e. a web browser is causing a bulk data
> transfer, admin/user notices this and wants to put it in a restricted
> cgroup, won't work.
>

Yes, for this particular case it doesn't work. But isn't it much more
likely that the admin/user will know that web-browsers tend to trigger
bulk data transfers and configure the system so that the web browser
always starts in its own cgroup, rather than trying to jump on it
after the fact?

Paul

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-09-10 23:31         ` Paul Menage
@ 2008-09-10 23:45           ` Thomas Graf
  2008-09-10 23:51             ` Paul Menage
  2008-09-10 23:53             ` Paul Menage
  0 siblings, 2 replies; 19+ messages in thread
From: Thomas Graf @ 2008-09-10 23:45 UTC (permalink / raw)
  To: Paul Menage
  Cc: Ranjit Manomohan, davem, akpm, kaber, lizf, linux-kernel, netdev

* Paul Menage <menage@google.com> 2008-09-10 16:31
> Yes, for this particular case it doesn't work. But isn't it much more
> likely that the admin/user will know that web-browsers tend to trigger
> bulk data transfers and configure the system so that the web browser
> always starts in its own cgroup, rather than trying to jump on it
> after the fact?

That's argueable I guess. Likely or not, it doesn't make sense to
add such restrictions if not required, especially not when it is
trivial to just look at the cgroup of the task directly.

If the ingress case is of such importance, I'd propose to lookup
the classid via skb -> socket -> task -> classid.

Anyways, I think we've listed all the pros and cons of either approach.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-09-10 23:45           ` Thomas Graf
@ 2008-09-10 23:51             ` Paul Menage
  2008-09-11  0:07               ` Thomas Graf
  2008-09-10 23:53             ` Paul Menage
  1 sibling, 1 reply; 19+ messages in thread
From: Paul Menage @ 2008-09-10 23:51 UTC (permalink / raw)
  To: Thomas Graf
  Cc: Ranjit Manomohan, davem, akpm, kaber, lizf, linux-kernel, netdev

On Wed, Sep 10, 2008 at 4:45 PM, Thomas Graf <tgraf@suug.ch> wrote:
>
> That's argueable I guess. Likely or not, it doesn't make sense to
> add such restrictions if not required, especially not when it is
> trivial to just look at the cgroup of the task directly.

Isn't a very large fraction of outgoing network traffic driven from
net_tx_action(), which doesn't execute in the context of the sending
process? Or am I missing something?

You claimed "In the most common case, the packet is not queued before
it hits the qdisc so this takes away a lot of complexity and is very
fast but should still be sufficient.". But is that really true on a
system where the network is busy? (Which is after all exactly the case
where you care about accurate traffic accounting/policing)

Paul

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-09-10 23:45           ` Thomas Graf
  2008-09-10 23:51             ` Paul Menage
@ 2008-09-10 23:53             ` Paul Menage
  1 sibling, 0 replies; 19+ messages in thread
From: Paul Menage @ 2008-09-10 23:53 UTC (permalink / raw)
  To: Thomas Graf
  Cc: Ranjit Manomohan, davem, akpm, kaber, lizf, linux-kernel, netdev

On Wed, Sep 10, 2008 at 4:45 PM, Thomas Graf <tgraf@suug.ch> wrote:
>
> That's argueable I guess. Likely or not, it doesn't make sense to
> add such restrictions if not required, especially not when it is
> trivial to just look at the cgroup of the task directly.

Isn't a very large fraction of outgoing network traffic driven from
net_tx_action(), which doesn't execute in the context of the sending
process? Or am I missing something?

You claimed "In the most common case, the packet is not queued before
it hits the qdisc so this takes away a lot of complexity and is very
fast but should still be sufficient.". But is that really true on a
system where the network is busy? (Which is after all exactly the case
where you care about accurate traffic accounting/policing)

Paul

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-09-10 23:51             ` Paul Menage
@ 2008-09-11  0:07               ` Thomas Graf
  2008-09-11  0:09                 ` Paul Menage
  0 siblings, 1 reply; 19+ messages in thread
From: Thomas Graf @ 2008-09-11  0:07 UTC (permalink / raw)
  To: Paul Menage
  Cc: Ranjit Manomohan, davem, akpm, kaber, lizf, linux-kernel, netdev

* Paul Menage <menage@google.com> 2008-09-10 16:51
> On Wed, Sep 10, 2008 at 4:45 PM, Thomas Graf <tgraf@suug.ch> wrote:
> >
> > That's argueable I guess. Likely or not, it doesn't make sense to
> > add such restrictions if not required, especially not when it is
> > trivial to just look at the cgroup of the task directly.
> 
> Isn't a very large fraction of outgoing network traffic driven from
> net_tx_action(), which doesn't execute in the context of the sending
> process? Or am I missing something?

net_tx_action() dequeues from the qdisc. The classification is done
before the packets is enqueued onto the qdisc.

A special case exists when the packets needs to be requeued, in this
case the enqueue() is done in softirq context but it should not invoke
the classifier again if I remember correctly.

> You claimed "In the most common case, the packet is not queued before
> it hits the qdisc so this takes away a lot of complexity and is very
> fast but should still be sufficient.". But is that really true on a
> system where the network is busy? (Which is after all exactly the case
> where you care about accurate traffic accounting/policing)

To my best knowledge it is, otherwise I wouldn't say it. Feel free to
prove me wrong.

F.e. it doesn't work if an ifb device is added to egress between
the user and the cgroup classifier. That's what I mean with common
cases.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 1/2] Traffic control cgroups subsystem
  2008-09-11  0:07               ` Thomas Graf
@ 2008-09-11  0:09                 ` Paul Menage
  0 siblings, 0 replies; 19+ messages in thread
From: Paul Menage @ 2008-09-11  0:09 UTC (permalink / raw)
  To: Thomas Graf
  Cc: Ranjit Manomohan, davem, akpm, kaber, lizf, linux-kernel, netdev

On Wed, Sep 10, 2008 at 5:07 PM, Thomas Graf <tgraf@suug.ch> wrote:
>> Isn't a very large fraction of outgoing network traffic driven from
>> net_tx_action(), which doesn't execute in the context of the sending
>> process? Or am I missing something?
>
> net_tx_action() dequeues from the qdisc. The classification is done
> before the packets is enqueued onto the qdisc.

Ah, good point.

Thanks,

Paul

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2008-09-11  0:10 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-08-22  0:55 [PATCH 1/2] Traffic control cgroups subsystem Ranjit Manomohan
2008-08-22  2:11 ` Li Zefan
  -- strict thread matches above, loose matches on Subject: below --
2008-09-10 17:42 Ranjit Manomohan
2008-09-10 22:01 ` Thomas Graf
2008-09-10 22:56   ` Ranjit Manomohan
2008-09-10 23:00     ` David Miller
2008-09-10 23:14       ` Ranjit Manomohan
2008-09-10 23:04     ` Paul Menage
2008-09-10 23:24       ` Thomas Graf
2008-09-10 23:31         ` Paul Menage
2008-09-10 23:45           ` Thomas Graf
2008-09-10 23:51             ` Paul Menage
2008-09-11  0:07               ` Thomas Graf
2008-09-11  0:09                 ` Paul Menage
2008-09-10 23:53             ` Paul Menage
2008-07-24 23:35 Ranjit Manomohan
2008-07-18 21:28 Ranjit Manomohan
2008-07-21  9:26 ` Li Zefan
2008-07-21 14:04 ` Patrick McHardy

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).