All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC][PATCH] IP address restricting cgroup subsystem
@ 2009-01-06 23:05 Grzegorz Nosek
       [not found] ` <20090106230554.GB25228-IaEwMO9oKu/77SC2UrCW1JJg/dWx8T/9@public.gmane.org>
  0 siblings, 1 reply; 33+ messages in thread
From: Grzegorz Nosek @ 2009-01-06 23:05 UTC (permalink / raw)
  To: containers-qjLDD68F18O7TbgM5vRIOg

This is a very simple cgroup subsystem to restrict IP addresses used
by member processes. Currently it is limited to IPv4 only but IPv6 (or
other protocols) should be easy to implement.

IP addresses are write-once (via /cgroup/.../ipaddr.ipv4 in dotted-quad
format) and are inherited by descendant cgroups, so a process once
restricted should never be able to get rid of the limits. Any address
may be specified in multiple cgroups. No verification is done to ensure
the addresses are actually configured on the machine, which has its
advantages (may add the addresses later) and disadvantages (if you enter
the wrong address, the cgroup will be effectively cut off from the
network).

Whenever a process inside a restricted cgroup calls bind(2), the address
is checked like this:
 - INADDR_LOOPBACK is explicitly allowed (a special case)
 - INADDR_ANY is remapped to _the_ IP address
 - _the_ IP address is passed through unharmed
 - everything else causes -EPERM

When a process calls connect(2), this subsystem calls bind(_the_IP_)
quietly behind its back, while preserving the original bound port (if
any).

Rationale (or when/why would you want it):
The use case for ipaddr_cgroup doesn't overlap with network namespaces,
which also allow IP address restrictions, because it aims to be much
lighter due to its limited scope (hopefully able to easily support
hundreds or possibly thousands of distinct cgroups). It does not attempt
to hide the existence of other IP addresses from the user.

Signed-off-by: Grzegorz Nosek <root-AfQBxy1nhrQ00sYp1HPQUA@public.gmane.org>
---

This is more of an RFC than a finished patch so any and all comments are
appreciated.

The patch is based to a significant extent on the device_cgroup code,
including bypassing the security infrastructure and hooking directly
into the networking code.

I'd also love to hear your opinion about locking--I have a version of this
patch that uses a seqlock to protect the IP address but I'm not sure this
is the Right Way to do it (and raw non-atomic lockless access looks scary,
regardless of how rarely would the address be changed, i.e. at most
once).

And of course, if the whole idea is stupid, let me know.

 include/linux/cgroup_subsys.h |    6 ++
 include/linux/ipaddr_cgroup.h |   23 +++++
 init/Kconfig                  |    7 ++
 net/socket.c                  |   16 +++-
 security/Makefile             |    1 +
 security/ipaddr_cgroup.c      |  200 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 250 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/ipaddr_cgroup.h
 create mode 100644 security/ipaddr_cgroup.c

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 9c22396..70dd375 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -54,3 +54,9 @@ SUBSYS(freezer)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_IPADDR
+SUBSYS(ipaddr)
+#endif
+
+/* */
diff --git a/include/linux/ipaddr_cgroup.h b/include/linux/ipaddr_cgroup.h
new file mode 100644
index 0000000..19dc382
--- /dev/null
+++ b/include/linux/ipaddr_cgroup.h
@@ -0,0 +1,23 @@
+#ifndef HAVE_IPADDR_CGROUP_H
+#define HAVE_IPADDR_CGROUP_H
+
+struct socket;
+struct sockaddr;
+
+#ifdef CONFIG_CGROUP_IPADDR
+int ipaddr_cgroup_connect(struct socket *sock, struct sockaddr *address, int addrlen);
+int ipaddr_cgroup_bind(struct socket *sock, struct sockaddr *address, int addrlen);
+
+#else
+static inline int ipaddr_cgroup_connect(struct socket *sock, struct sockaddr *address, int addrlen)
+{
+	return 0;
+}
+
+static inline int ipaddr_cgroup_bind(struct socket *sock, struct sockaddr *address, int addrlen)
+{
+	return 0;
+}
+
+#endif /* CONFIG_CGROUP_IPADDR */
+#endif /* HAVE_IPADDR_CGROUP_H */
diff --git a/init/Kconfig b/init/Kconfig
index 35d87b9..db43344 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -338,6 +338,13 @@ config CGROUP_DEVICE
 	  Provides a cgroup implementing whitelists for devices which
 	  a process in the cgroup can mknod or open.
 
+config CGROUP_IPADDR
+	bool "IP address controller for cgroups"
+	depends on CGROUPS && EXPERIMENTAL
+	help
+	  Provides a cgroup restricting IP addresses its member processes
+	  can use.
+
 config CPUSETS
 	bool "Cpuset support"
 	depends on SMP && CGROUPS
diff --git a/net/socket.c b/net/socket.c
index 3e8d4e3..3bd8c08 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -87,6 +87,7 @@
 #include <linux/audit.h>
 #include <linux/wireless.h>
 #include <linux/nsproxy.h>
+#include <linux/ipaddr_cgroup.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1375,9 +1376,13 @@ asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
 	if (sock) {
 		err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
 		if (err >= 0) {
-			err = security_socket_bind(sock,
-						   (struct sockaddr *)&address,
-						   addrlen);
+			err = ipaddr_cgroup_bind(sock,
+						 (struct sockaddr *)&address,
+						 addrlen);
+			if (!err)
+				err = security_socket_bind(sock,
+							   (struct sockaddr *)&address,
+							   addrlen);
 			if (!err)
 				err = sock->ops->bind(sock,
 						      (struct sockaddr *)
@@ -1600,6 +1605,11 @@ asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr,
 		goto out_put;
 
 	err =
+	    ipaddr_cgroup_connect(sock, (struct sockaddr *)&address, addrlen);
+	if (err)
+		goto out_put;
+
+	err =
 	    security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
 	if (err)
 		goto out_put;
diff --git a/security/Makefile b/security/Makefile
index f654260..aaf225e 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SECURITY_SELINUX)		+= selinux/built-in.o
 obj-$(CONFIG_SECURITY_SMACK)		+= smack/built-in.o
 obj-$(CONFIG_SECURITY_ROOTPLUG)		+= root_plug.o
 obj-$(CONFIG_CGROUP_DEVICE)		+= device_cgroup.o
+obj-$(CONFIG_CGROUP_IPADDR)		+= ipaddr_cgroup.o
diff --git a/security/ipaddr_cgroup.c b/security/ipaddr_cgroup.c
new file mode 100644
index 0000000..96ccf27
--- /dev/null
+++ b/security/ipaddr_cgroup.c
@@ -0,0 +1,200 @@
+/*
+ * IP address cgroup subsystem
+ */
+
+#include <linux/ipaddr_cgroup.h>
+
+#include <linux/cgroup.h>
+#include <linux/err.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/seq_file.h>
+#include <linux/socket.h>
+
+#include <net/inet_sock.h>
+
+struct ipaddr_cgroup {
+	struct cgroup_subsys_state css;
+	u32 ipv4_addr;
+};
+
+static inline struct ipaddr_cgroup *css_to_ipcgroup(struct cgroup_subsys_state *s)
+{
+	return container_of(s, struct ipaddr_cgroup, css);
+}
+
+static inline struct ipaddr_cgroup *cgroup_to_ipcgroup(struct cgroup *cgroup)
+{
+	return css_to_ipcgroup(cgroup_subsys_state(cgroup, ipaddr_subsys_id));
+}
+
+static inline struct ipaddr_cgroup *task_ipcgroup(struct task_struct *task)
+{
+	return css_to_ipcgroup(task_subsys_state(task, ipaddr_subsys_id));
+}
+
+struct cgroup_subsys ipaddr_subsys;
+
+static int ipcgroup_can_attach(struct cgroup_subsys *ss,
+		struct cgroup *new_cgroup, struct task_struct *task)
+{
+	struct ipaddr_cgroup *old_ipcgroup, *new_ipcgroup;
+	u32 old_ipv4;
+
+	if (current != task && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	old_ipcgroup = task_ipcgroup(task);
+	new_ipcgroup = cgroup_to_ipcgroup(new_cgroup);
+	old_ipv4 = old_ipcgroup->ipv4_addr;
+
+	if (old_ipv4 != INADDR_ANY && old_ipv4 != new_ipcgroup->ipv4_addr)
+		return -EPERM;
+
+	return 0;
+}
+
+static struct cgroup_subsys_state *ipcgroup_create(struct cgroup_subsys *ss,
+						struct cgroup *cgroup)
+{
+	struct ipaddr_cgroup *ipcgroup, *parent_ipcgroup;
+	struct cgroup *parent_cgroup;
+
+	ipcgroup = kzalloc(sizeof(*ipcgroup), GFP_KERNEL);
+	if (!ipcgroup)
+		return ERR_PTR(-ENOMEM);
+	parent_cgroup = cgroup->parent;
+
+	if (parent_cgroup == NULL) {
+		ipcgroup->ipv4_addr = htonl(INADDR_ANY);
+	} else {
+		parent_ipcgroup = cgroup_to_ipcgroup(parent_cgroup);
+		ipcgroup->ipv4_addr = parent_ipcgroup->ipv4_addr;
+	}
+
+	return &ipcgroup->css;
+}
+
+static void ipcgroup_destroy(struct cgroup_subsys *ss,
+			struct cgroup *cgroup)
+{
+	struct ipaddr_cgroup *ipcgroup;
+
+	ipcgroup = cgroup_to_ipcgroup(cgroup);
+	kfree(ipcgroup);
+}
+
+static int ipcgroup_write_ipv4(struct cgroup *cgrp, struct cftype *cft,
+			const char *buffer)
+{
+	u32 new_addr;
+	struct ipaddr_cgroup *ipcgroup;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ipcgroup = cgroup_to_ipcgroup(cgrp);
+	if (ipcgroup->ipv4_addr != htonl(INADDR_ANY))
+		return -EPERM;
+
+	ret = in4_pton(buffer, -1, (u8 *)&new_addr, '\0', NULL);
+	if (!ret)
+		return -EINVAL;
+
+	/* already network-endian */
+	ipcgroup->ipv4_addr = new_addr;
+	return 0;
+}
+
+static int ipcgroup_read_ipv4(struct cgroup *cgrp, struct cftype *cft,
+			struct seq_file *m)
+{
+	struct ipaddr_cgroup *ipcgroup;
+
+	ipcgroup = cgroup_to_ipcgroup(cgrp);
+	seq_printf(m, NIPQUAD_FMT "\n", NIPQUAD(ipcgroup->ipv4_addr));
+	return 0;
+}
+
+static struct cftype ipaddr_cgroup_files[] = {
+	{
+		.name = "ipv4",
+		.write_string = ipcgroup_write_ipv4,
+		.read_seq_string = ipcgroup_read_ipv4,
+	},
+};
+
+static int ipcgroup_populate(struct cgroup_subsys *ss,
+				struct cgroup *cgroup)
+{
+	return cgroup_add_files(cgroup, ss, ipaddr_cgroup_files,
+					ARRAY_SIZE(ipaddr_cgroup_files));
+}
+
+struct cgroup_subsys ipaddr_subsys = {
+	.name = "ipaddr",
+	.can_attach = ipcgroup_can_attach,
+	.create = ipcgroup_create,
+	.destroy = ipcgroup_destroy,
+	.populate = ipcgroup_populate,
+	.subsys_id = ipaddr_subsys_id
+};
+
+int ipaddr_cgroup_connect(struct socket *sock, struct sockaddr *address, int addrlen)
+{
+	struct sockaddr_in sa_in;
+	struct ipaddr_cgroup *ipcgroup;
+	struct inet_sock *inet;
+	int err;
+
+	if (address->sa_family != AF_INET)
+		return 0;
+
+	ipcgroup = task_ipcgroup(current);
+	if (ipcgroup->ipv4_addr == htonl(INADDR_ANY))
+		return 0;
+
+	inet = inet_sk(sock->sk);
+
+	sa_in.sin_family = AF_INET;
+	sa_in.sin_addr.s_addr = ipcgroup->ipv4_addr;
+	sa_in.sin_port = inet->sport;
+
+	err = security_socket_bind(sock, (struct sockaddr *)&sa_in, sizeof(sa_in));
+	if (err)
+		return err;
+
+	err = sock->ops->bind(sock, (struct sockaddr *)&sa_in, sizeof(sa_in));
+
+	return err;
+}
+
+int ipaddr_cgroup_bind(struct socket *sock, struct sockaddr *address, int addrlen)
+{
+	struct sockaddr_in *sa_in;
+	struct ipaddr_cgroup *ipcgroup;
+
+	if (address->sa_family != AF_INET)
+		return 0;
+
+	ipcgroup = task_ipcgroup(current);
+	if (ipcgroup->ipv4_addr == htonl(INADDR_ANY))
+		return 0;
+
+	sa_in = (struct sockaddr_in *) address;
+
+	/* remap INADDR_ANY to cgroup IP address */
+	if (sa_in->sin_addr.s_addr == htonl(INADDR_ANY))
+		sa_in->sin_addr.s_addr = ipcgroup->ipv4_addr;
+
+	/* a very special case */
+	if (sa_in->sin_addr.s_addr == htonl(INADDR_LOOPBACK))
+		return 0;
+
+	if (sa_in->sin_addr.s_addr == ipcgroup->ipv4_addr)
+		return 0;
+
+	return -EPERM;
+}
+
-- 
1.6.1

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found] ` <20090106230554.GB25228-IaEwMO9oKu/77SC2UrCW1JJg/dWx8T/9@public.gmane.org>
@ 2009-01-07  6:01   ` Li Zefan
       [not found]     ` <49644526.8030205-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
  2009-01-07 18:07   ` Serge E. Hallyn
  2009-01-09 21:58   ` [Devel] " Paul Menage
  2 siblings, 1 reply; 33+ messages in thread
From: Li Zefan @ 2009-01-07  6:01 UTC (permalink / raw)
  To: Grzegorz Nosek
  Cc: containers-qjLDD68F18O7TbgM5vRIOg, netdev-u79uwXL29TY76Z2rM5mHXA

CC: netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org

I'll review the cgroup part if this patch is regarded as useful.

Grzegorz Nosek wrote:
> This is a very simple cgroup subsystem to restrict IP addresses used
> by member processes. Currently it is limited to IPv4 only but IPv6 (or
> other protocols) should be easy to implement.
> 
> IP addresses are write-once (via /cgroup/.../ipaddr.ipv4 in dotted-quad

Why they should be write-once ?

> format) and are inherited by descendant cgroups, so a process once
> restricted should never be able to get rid of the limits. Any address
> may be specified in multiple cgroups. No verification is done to ensure
> the addresses are actually configured on the machine, which has its
> advantages (may add the addresses later) and disadvantages (if you enter
> the wrong address, the cgroup will be effectively cut off from the
> network).
> 
> Whenever a process inside a restricted cgroup calls bind(2), the address
> is checked like this:
>  - INADDR_LOOPBACK is explicitly allowed (a special case)
>  - INADDR_ANY is remapped to _the_ IP address
>  - _the_ IP address is passed through unharmed
>  - everything else causes -EPERM
> 
> When a process calls connect(2), this subsystem calls bind(_the_IP_)
> quietly behind its back, while preserving the original bound port (if
> any).
> 
> Rationale (or when/why would you want it):
> The use case for ipaddr_cgroup doesn't overlap with network namespaces,
> which also allow IP address restrictions, because it aims to be much
> lighter due to its limited scope (hopefully able to easily support
> hundreds or possibly thousands of distinct cgroups). It does not attempt
> to hide the existence of other IP addresses from the user.
> 
> Signed-off-by: Grzegorz Nosek <root-AfQBxy1nhrQ00sYp1HPQUA@public.gmane.org>
> ---
> 
> This is more of an RFC than a finished patch so any and all comments are
> appreciated.
> 
> The patch is based to a significant extent on the device_cgroup code,
> including bypassing the security infrastructure and hooking directly
> into the networking code.
> 
> I'd also love to hear your opinion about locking--I have a version of this
> patch that uses a seqlock to protect the IP address but I'm not sure this
> is the Right Way to do it (and raw non-atomic lockless access looks scary,
> regardless of how rarely would the address be changed, i.e. at most
> once).
> 
> And of course, if the whole idea is stupid, let me know.
> 
>  include/linux/cgroup_subsys.h |    6 ++
>  include/linux/ipaddr_cgroup.h |   23 +++++
>  init/Kconfig                  |    7 ++
>  net/socket.c                  |   16 +++-
>  security/Makefile             |    1 +
>  security/ipaddr_cgroup.c      |  200 +++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 250 insertions(+), 3 deletions(-)
>  create mode 100644 include/linux/ipaddr_cgroup.h
>  create mode 100644 security/ipaddr_cgroup.c
> 
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index 9c22396..70dd375 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -54,3 +54,9 @@ SUBSYS(freezer)
>  #endif
>  
>  /* */
> +
> +#ifdef CONFIG_CGROUP_IPADDR
> +SUBSYS(ipaddr)
> +#endif
> +
> +/* */
> diff --git a/include/linux/ipaddr_cgroup.h b/include/linux/ipaddr_cgroup.h
> new file mode 100644
> index 0000000..19dc382
> --- /dev/null
> +++ b/include/linux/ipaddr_cgroup.h
> @@ -0,0 +1,23 @@
> +#ifndef HAVE_IPADDR_CGROUP_H
> +#define HAVE_IPADDR_CGROUP_H
> +
> +struct socket;
> +struct sockaddr;
> +
> +#ifdef CONFIG_CGROUP_IPADDR
> +int ipaddr_cgroup_connect(struct socket *sock, struct sockaddr *address, int addrlen);
> +int ipaddr_cgroup_bind(struct socket *sock, struct sockaddr *address, int addrlen);
> +
> +#else
> +static inline int ipaddr_cgroup_connect(struct socket *sock, struct sockaddr *address, int addrlen)
> +{
> +	return 0;
> +}
> +
> +static inline int ipaddr_cgroup_bind(struct socket *sock, struct sockaddr *address, int addrlen)
> +{
> +	return 0;
> +}
> +
> +#endif /* CONFIG_CGROUP_IPADDR */
> +#endif /* HAVE_IPADDR_CGROUP_H */
> diff --git a/init/Kconfig b/init/Kconfig
> index 35d87b9..db43344 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -338,6 +338,13 @@ config CGROUP_DEVICE
>  	  Provides a cgroup implementing whitelists for devices which
>  	  a process in the cgroup can mknod or open.
>  
> +config CGROUP_IPADDR
> +	bool "IP address controller for cgroups"
> +	depends on CGROUPS && EXPERIMENTAL
> +	help
> +	  Provides a cgroup restricting IP addresses its member processes
> +	  can use.
> +
>  config CPUSETS
>  	bool "Cpuset support"
>  	depends on SMP && CGROUPS
> diff --git a/net/socket.c b/net/socket.c
> index 3e8d4e3..3bd8c08 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -87,6 +87,7 @@
>  #include <linux/audit.h>
>  #include <linux/wireless.h>
>  #include <linux/nsproxy.h>
> +#include <linux/ipaddr_cgroup.h>
>  
>  #include <asm/uaccess.h>
>  #include <asm/unistd.h>
> @@ -1375,9 +1376,13 @@ asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
>  	if (sock) {
>  		err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
>  		if (err >= 0) {
> -			err = security_socket_bind(sock,
> -						   (struct sockaddr *)&address,
> -						   addrlen);
> +			err = ipaddr_cgroup_bind(sock,
> +						 (struct sockaddr *)&address,
> +						 addrlen);
> +			if (!err)
> +				err = security_socket_bind(sock,
> +							   (struct sockaddr *)&address,
> +							   addrlen);
>  			if (!err)
>  				err = sock->ops->bind(sock,
>  						      (struct sockaddr *)
> @@ -1600,6 +1605,11 @@ asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr,
>  		goto out_put;
>  
>  	err =
> +	    ipaddr_cgroup_connect(sock, (struct sockaddr *)&address, addrlen);
> +	if (err)
> +		goto out_put;
> +
> +	err =
>  	    security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
>  	if (err)
>  		goto out_put;
> diff --git a/security/Makefile b/security/Makefile
> index f654260..aaf225e 100644
> --- a/security/Makefile
> +++ b/security/Makefile
> @@ -16,3 +16,4 @@ obj-$(CONFIG_SECURITY_SELINUX)		+= selinux/built-in.o
>  obj-$(CONFIG_SECURITY_SMACK)		+= smack/built-in.o
>  obj-$(CONFIG_SECURITY_ROOTPLUG)		+= root_plug.o
>  obj-$(CONFIG_CGROUP_DEVICE)		+= device_cgroup.o
> +obj-$(CONFIG_CGROUP_IPADDR)		+= ipaddr_cgroup.o
> diff --git a/security/ipaddr_cgroup.c b/security/ipaddr_cgroup.c
> new file mode 100644
> index 0000000..96ccf27
> --- /dev/null
> +++ b/security/ipaddr_cgroup.c
> @@ -0,0 +1,200 @@
> +/*
> + * IP address cgroup subsystem
> + */
> +
> +#include <linux/ipaddr_cgroup.h>
> +
> +#include <linux/cgroup.h>
> +#include <linux/err.h>
> +#include <linux/in.h>
> +#include <linux/inet.h>
> +#include <linux/seq_file.h>
> +#include <linux/socket.h>
> +
> +#include <net/inet_sock.h>
> +
> +struct ipaddr_cgroup {
> +	struct cgroup_subsys_state css;
> +	u32 ipv4_addr;
> +};
> +
> +static inline struct ipaddr_cgroup *css_to_ipcgroup(struct cgroup_subsys_state *s)
> +{
> +	return container_of(s, struct ipaddr_cgroup, css);
> +}
> +
> +static inline struct ipaddr_cgroup *cgroup_to_ipcgroup(struct cgroup *cgroup)
> +{
> +	return css_to_ipcgroup(cgroup_subsys_state(cgroup, ipaddr_subsys_id));
> +}
> +
> +static inline struct ipaddr_cgroup *task_ipcgroup(struct task_struct *task)
> +{
> +	return css_to_ipcgroup(task_subsys_state(task, ipaddr_subsys_id));
> +}
> +
> +struct cgroup_subsys ipaddr_subsys;
> +
> +static int ipcgroup_can_attach(struct cgroup_subsys *ss,
> +		struct cgroup *new_cgroup, struct task_struct *task)
> +{
> +	struct ipaddr_cgroup *old_ipcgroup, *new_ipcgroup;
> +	u32 old_ipv4;
> +
> +	if (current != task && !capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	old_ipcgroup = task_ipcgroup(task);
> +	new_ipcgroup = cgroup_to_ipcgroup(new_cgroup);
> +	old_ipv4 = old_ipcgroup->ipv4_addr;
> +
> +	if (old_ipv4 != INADDR_ANY && old_ipv4 != new_ipcgroup->ipv4_addr)
> +		return -EPERM;
> +
> +	return 0;
> +}
> +
> +static struct cgroup_subsys_state *ipcgroup_create(struct cgroup_subsys *ss,
> +						struct cgroup *cgroup)
> +{
> +	struct ipaddr_cgroup *ipcgroup, *parent_ipcgroup;
> +	struct cgroup *parent_cgroup;
> +
> +	ipcgroup = kzalloc(sizeof(*ipcgroup), GFP_KERNEL);
> +	if (!ipcgroup)
> +		return ERR_PTR(-ENOMEM);
> +	parent_cgroup = cgroup->parent;
> +
> +	if (parent_cgroup == NULL) {
> +		ipcgroup->ipv4_addr = htonl(INADDR_ANY);
> +	} else {
> +		parent_ipcgroup = cgroup_to_ipcgroup(parent_cgroup);
> +		ipcgroup->ipv4_addr = parent_ipcgroup->ipv4_addr;
> +	}
> +
> +	return &ipcgroup->css;
> +}
> +
> +static void ipcgroup_destroy(struct cgroup_subsys *ss,
> +			struct cgroup *cgroup)
> +{
> +	struct ipaddr_cgroup *ipcgroup;
> +
> +	ipcgroup = cgroup_to_ipcgroup(cgroup);
> +	kfree(ipcgroup);
> +}
> +
> +static int ipcgroup_write_ipv4(struct cgroup *cgrp, struct cftype *cft,
> +			const char *buffer)
> +{
> +	u32 new_addr;
> +	struct ipaddr_cgroup *ipcgroup;
> +	int ret;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	ipcgroup = cgroup_to_ipcgroup(cgrp);
> +	if (ipcgroup->ipv4_addr != htonl(INADDR_ANY))
> +		return -EPERM;
> +
> +	ret = in4_pton(buffer, -1, (u8 *)&new_addr, '\0', NULL);
> +	if (!ret)
> +		return -EINVAL;
> +
> +	/* already network-endian */
> +	ipcgroup->ipv4_addr = new_addr;
> +	return 0;
> +}
> +
> +static int ipcgroup_read_ipv4(struct cgroup *cgrp, struct cftype *cft,
> +			struct seq_file *m)
> +{
> +	struct ipaddr_cgroup *ipcgroup;
> +
> +	ipcgroup = cgroup_to_ipcgroup(cgrp);
> +	seq_printf(m, NIPQUAD_FMT "\n", NIPQUAD(ipcgroup->ipv4_addr));
> +	return 0;
> +}
> +
> +static struct cftype ipaddr_cgroup_files[] = {
> +	{
> +		.name = "ipv4",
> +		.write_string = ipcgroup_write_ipv4,
> +		.read_seq_string = ipcgroup_read_ipv4,
> +	},
> +};
> +
> +static int ipcgroup_populate(struct cgroup_subsys *ss,
> +				struct cgroup *cgroup)
> +{
> +	return cgroup_add_files(cgroup, ss, ipaddr_cgroup_files,
> +					ARRAY_SIZE(ipaddr_cgroup_files));
> +}
> +
> +struct cgroup_subsys ipaddr_subsys = {
> +	.name = "ipaddr",
> +	.can_attach = ipcgroup_can_attach,
> +	.create = ipcgroup_create,
> +	.destroy = ipcgroup_destroy,
> +	.populate = ipcgroup_populate,
> +	.subsys_id = ipaddr_subsys_id
> +};
> +
> +int ipaddr_cgroup_connect(struct socket *sock, struct sockaddr *address, int addrlen)
> +{
> +	struct sockaddr_in sa_in;
> +	struct ipaddr_cgroup *ipcgroup;
> +	struct inet_sock *inet;
> +	int err;
> +
> +	if (address->sa_family != AF_INET)
> +		return 0;
> +
> +	ipcgroup = task_ipcgroup(current);
> +	if (ipcgroup->ipv4_addr == htonl(INADDR_ANY))
> +		return 0;
> +
> +	inet = inet_sk(sock->sk);
> +
> +	sa_in.sin_family = AF_INET;
> +	sa_in.sin_addr.s_addr = ipcgroup->ipv4_addr;
> +	sa_in.sin_port = inet->sport;
> +
> +	err = security_socket_bind(sock, (struct sockaddr *)&sa_in, sizeof(sa_in));
> +	if (err)
> +		return err;
> +
> +	err = sock->ops->bind(sock, (struct sockaddr *)&sa_in, sizeof(sa_in));
> +
> +	return err;
> +}
> +
> +int ipaddr_cgroup_bind(struct socket *sock, struct sockaddr *address, int addrlen)
> +{
> +	struct sockaddr_in *sa_in;
> +	struct ipaddr_cgroup *ipcgroup;
> +
> +	if (address->sa_family != AF_INET)
> +		return 0;
> +
> +	ipcgroup = task_ipcgroup(current);
> +	if (ipcgroup->ipv4_addr == htonl(INADDR_ANY))
> +		return 0;
> +
> +	sa_in = (struct sockaddr_in *) address;
> +
> +	/* remap INADDR_ANY to cgroup IP address */
> +	if (sa_in->sin_addr.s_addr == htonl(INADDR_ANY))
> +		sa_in->sin_addr.s_addr = ipcgroup->ipv4_addr;
> +
> +	/* a very special case */
> +	if (sa_in->sin_addr.s_addr == htonl(INADDR_LOOPBACK))
> +		return 0;
> +
> +	if (sa_in->sin_addr.s_addr == ipcgroup->ipv4_addr)
> +		return 0;
> +
> +	return -EPERM;
> +}
> +

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]     ` <49644526.8030205-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
@ 2009-01-07  7:38       ` Grzegorz Nosek
  2009-01-07  8:36         ` Li Zefan
  0 siblings, 1 reply; 33+ messages in thread
From: Grzegorz Nosek @ 2009-01-07  7:38 UTC (permalink / raw)
  To: Li Zefan; +Cc: containers-qjLDD68F18O7TbgM5vRIOg, netdev-u79uwXL29TY76Z2rM5mHXA

On śro, sty 07, 2009 at 02:01:10 +0800, Li Zefan wrote:
> CC: netdev@vger.kernel.org
> 
> I'll review the cgroup part if this patch is regarded as useful.
> 
> Grzegorz Nosek wrote:
> > This is a very simple cgroup subsystem to restrict IP addresses used
> > by member processes. Currently it is limited to IPv4 only but IPv6 (or
> > other protocols) should be easy to implement.
> > 
> > IP addresses are write-once (via /cgroup/.../ipaddr.ipv4 in dotted-quad
> 
> Why they should be write-once ?

No real (technical) reason. Making it read-write would be fine with me.
I wanted to make the restriction a one-way road but I guess I can police
that in userspace (simply don't write anything to the file twice).

However, I think that the restriction should be inherited, so that if
CG1 is bound to e.g. 10.0.0.1, CG1/CG2 must be bound to the same
address. But what would I do then with descendant cgroups? Leave them as
is (breaking the inheritance)? Find them all and change their bound
address behind their back (do we have an API for that?)?

I guess I have the same problem right now, anyway (only once instead of
multiple times), so I'd really appreciate your input on this.

Best regards,
 Grzegorz Nosek
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
  2009-01-07  7:38       ` Grzegorz Nosek
@ 2009-01-07  8:36         ` Li Zefan
  2009-01-07  9:16           ` Grzegorz Nosek
  0 siblings, 1 reply; 33+ messages in thread
From: Li Zefan @ 2009-01-07  8:36 UTC (permalink / raw)
  To: Grzegorz Nosek; +Cc: containers, netdev

Grzegorz Nosek wrote:
> On śro, sty 07, 2009 at 02:01:10 +0800, Li Zefan wrote:
>> CC: netdev@vger.kernel.org
>>
>> I'll review the cgroup part if this patch is regarded as useful.
>>
>> Grzegorz Nosek wrote:
>>> This is a very simple cgroup subsystem to restrict IP addresses used
>>> by member processes. Currently it is limited to IPv4 only but IPv6 (or
>>> other protocols) should be easy to implement.
>>>
>>> IP addresses are write-once (via /cgroup/.../ipaddr.ipv4 in dotted-quad
>> Why they should be write-once ?
> 
> No real (technical) reason. Making it read-write would be fine with me.
> I wanted to make the restriction a one-way road but I guess I can police
> that in userspace (simply don't write anything to the file twice).
> 

But seems the patch makes it impossible to re-allow a restricted task to
be binded to INADDR_ANY.

> However, I think that the restriction should be inherited, so that if
> CG1 is bound to e.g. 10.0.0.1, CG1/CG2 must be bound to the same
> address. But what would I do then with descendant cgroups? Leave them as
> is (breaking the inheritance)? Find them all and change their bound
> address behind their back (do we have an API for that?)?
> 

Firstly, is inheritance necessary ?

If yes, then how about:

The root cgroup is read-only, so the tasks in it always bind to INADDR_ANY.
For other cgroups, write is allowed only if it has no children and the
parent is INADDR_ANY.

> I guess I have the same problem right now, anyway (only once instead of
> multiple times), so I'd really appreciate your input on this.
> 
> Best regards,
>  Grzegorz Nosek
> 
> 



^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
  2009-01-07  8:36         ` Li Zefan
@ 2009-01-07  9:16           ` Grzegorz Nosek
  2009-01-07  9:33             ` Li Zefan
  0 siblings, 1 reply; 33+ messages in thread
From: Grzegorz Nosek @ 2009-01-07  9:16 UTC (permalink / raw)
  To: Li Zefan; +Cc: Grzegorz Nosek, containers, netdev

On śro, sty 07, 2009 at 04:36:35 +0800, Li Zefan wrote:
> Grzegorz Nosek wrote:
> >>> IP addresses are write-once (via /cgroup/.../ipaddr.ipv4 in dotted-quad
> >> Why they should be write-once ?
> > 
> > No real (technical) reason. Making it read-write would be fine with me.
> > I wanted to make the restriction a one-way road but I guess I can police
> > that in userspace (simply don't write anything to the file twice).
> > 
> 
> But seems the patch makes it impossible to re-allow a restricted task to
> be binded to INADDR_ANY.

Yes, my goal is to disallow that but I don't insist to do that in the
kernel (I'm not currently planning to let untrusted root loose in a
container).

> Firstly, is inheritance necessary ?

It would be nice to have when the container's root is untrusted but
might want to subdivide the container's cgroup for other purposes.
Without inheritance, they would be able to circumvent the IP address
restriction. One could argue that a full untrusted-root container would
need a proper network namespace anyway (and giving CAP_SYS_ADMIN there
is probably a very bad idea), but still, I'd feel uneasy.

> If yes, then how about:
> 
> The root cgroup is read-only, so the tasks in it always bind to INADDR_ANY.
> For other cgroups, write is allowed only if it has no children and the
> parent is INADDR_ANY.

Yes, I like that. Will update the patch. I assume that I must check
list_empty(&cgroup->children)? Should I use cgroup_lock()/cgroup_unlock()
or other locking? I think it will be safe to do without locks but would
rather get some expert advice.

Thanks a lot for your comments.

Best regards,
 Grzegorz Nosek

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
  2009-01-07  9:16           ` Grzegorz Nosek
@ 2009-01-07  9:33             ` Li Zefan
  2009-01-07  9:37               ` Grzegorz Nosek
  2009-01-09 21:38               ` [Devel] " Paul Menage
  0 siblings, 2 replies; 33+ messages in thread
From: Li Zefan @ 2009-01-07  9:33 UTC (permalink / raw)
  To: Grzegorz Nosek; +Cc: containers, netdev

Grzegorz Nosek wrote:
> On śro, sty 07, 2009 at 04:36:35 +0800, Li Zefan wrote:
>> Grzegorz Nosek wrote:
>>>>> IP addresses are write-once (via /cgroup/.../ipaddr.ipv4 in dotted-quad
>>>> Why they should be write-once ?
>>> No real (technical) reason. Making it read-write would be fine with me.
>>> I wanted to make the restriction a one-way road but I guess I can police
>>> that in userspace (simply don't write anything to the file twice).
>>>
>> But seems the patch makes it impossible to re-allow a restricted task to
>> be binded to INADDR_ANY.
> 
> Yes, my goal is to disallow that but I don't insist to do that in the
> kernel (I'm not currently planning to let untrusted root loose in a
> container).
> 
>> Firstly, is inheritance necessary ?
> 
> It would be nice to have when the container's root is untrusted but
> might want to subdivide the container's cgroup for other purposes.
> Without inheritance, they would be able to circumvent the IP address
> restriction. One could argue that a full untrusted-root container would
> need a proper network namespace anyway (and giving CAP_SYS_ADMIN there
> is probably a very bad idea), but still, I'd feel uneasy.
> 
>> If yes, then how about:
>>
>> The root cgroup is read-only, so the tasks in it always bind to INADDR_ANY.
>> For other cgroups, write is allowed only if it has no children and the
>> parent is INADDR_ANY.
> 
> Yes, I like that. Will update the patch. I assume that I must check
> list_empty(&cgroup->children)?

Yes.

> Should I use cgroup_lock()/cgroup_unlock()

Yes.

> or other locking? I think it will be safe to do without locks but would
> rather get some expert advice.
> 

No. Without locks, it races with mkdir.

=============

//cgroup_lock();

if (list_empty(&cgrp->children) &&
    parent->ipv4_addr == INADDR_ANY)
					   <--- mkdir()
	ipcgroup->ipv4_addr = new_addr;

//cgroup_unlock();

==============

In the above case, ipcgroup->ipv4_addr = new_addr,
but child_cgroup->ipv4_addr == INADDR_ANY, which is not expected.

> Thanks a lot for your comments.
> 
> Best regards,
>  Grzegorz Nosek
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
  2009-01-07  9:33             ` Li Zefan
@ 2009-01-07  9:37               ` Grzegorz Nosek
  2009-01-09 21:38               ` [Devel] " Paul Menage
  1 sibling, 0 replies; 33+ messages in thread
From: Grzegorz Nosek @ 2009-01-07  9:37 UTC (permalink / raw)
  To: Li Zefan; +Cc: Grzegorz Nosek, containers, netdev

On śro, sty 07, 2009 at 05:33:49 +0800, Li Zefan wrote:
> >> The root cgroup is read-only, so the tasks in it always bind to INADDR_ANY.
> >> For other cgroups, write is allowed only if it has no children and the
> >> parent is INADDR_ANY.
> > 
> > Yes, I like that. Will update the patch. I assume that I must check
> > list_empty(&cgroup->children)?
> 
> Yes.
> 
> > Should I use cgroup_lock()/cgroup_unlock()
> 
> Yes.
> 
> > or other locking? I think it will be safe to do without locks but would
> > rather get some expert advice.
> > 
> 
> No. Without locks, it races with mkdir.
> 
> =============
> 
> //cgroup_lock();
> 
> if (list_empty(&cgrp->children) &&
>     parent->ipv4_addr == INADDR_ANY)
> 					   <--- mkdir()
> 	ipcgroup->ipv4_addr = new_addr;
> 
> //cgroup_unlock();
> 
> ==============
> 
> In the above case, ipcgroup->ipv4_addr = new_addr,
> but child_cgroup->ipv4_addr == INADDR_ANY, which is not expected.

I see. Thanks a lot!

Best regards,
 Grzegorz Nosek

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found] ` <20090106230554.GB25228-IaEwMO9oKu/77SC2UrCW1JJg/dWx8T/9@public.gmane.org>
  2009-01-07  6:01   ` Li Zefan
@ 2009-01-07 18:07   ` Serge E. Hallyn
       [not found]     ` <20090107180752.GA19153-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-01-09 21:58   ` [Devel] " Paul Menage
  2 siblings, 1 reply; 33+ messages in thread
From: Serge E. Hallyn @ 2009-01-07 18:07 UTC (permalink / raw)
  To: Grzegorz Nosek; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Grzegorz Nosek (root-AfQBxy1nhrQ00sYp1HPQUA@public.gmane.org):
> This is a very simple cgroup subsystem to restrict IP addresses used
> by member processes. Currently it is limited to IPv4 only but IPv6 (or
> other protocols) should be easy to implement.
> 
> IP addresses are write-once (via /cgroup/.../ipaddr.ipv4 in dotted-quad
> format) and are inherited by descendant cgroups, so a process once
> restricted should never be able to get rid of the limits. Any address
> may be specified in multiple cgroups. No verification is done to ensure
> the addresses are actually configured on the machine, which has its
> advantages (may add the addresses later) and disadvantages (if you enter
> the wrong address, the cgroup will be effectively cut off from the
> network).
> 
> Whenever a process inside a restricted cgroup calls bind(2), the address
> is checked like this:
>  - INADDR_LOOPBACK is explicitly allowed (a special case)
>  - INADDR_ANY is remapped to _the_ IP address
>  - _the_ IP address is passed through unharmed
>  - everything else causes -EPERM
> 
> When a process calls connect(2), this subsystem calls bind(_the_IP_)
> quietly behind its back, while preserving the original bound port (if
> any).
> 
> Rationale (or when/why would you want it):
> The use case for ipaddr_cgroup doesn't overlap with network namespaces,
> which also allow IP address restrictions, because it aims to be much
> lighter due to its limited scope (hopefully able to easily support
> hundreds or possibly thousands of distinct cgroups). It does not attempt
> to hide the existence of other IP addresses from the user.

Have you run a test, and found that in fact a network namespace
is too heavyweight to do so?  If so, some numbers here would be
far more pursuasive.

(Mind you I've written a few version of this - based on LSM -
myself in the past, but that was before network namespaces
existed)

-serge

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]     ` <20090107180752.GA19153-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-01-07 19:15       ` Grzegorz Nosek
       [not found]         ` <20090107191536.GA15159-yp6mvK3Bdd2rDJvtcaxF/A@public.gmane.org>
  0 siblings, 1 reply; 33+ messages in thread
From: Grzegorz Nosek @ 2009-01-07 19:15 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

On śro, sty 07, 2009 at 12:07:52 -0600, Serge E. Hallyn wrote:
> Have you run a test, and found that in fact a network namespace
> is too heavyweight to do so?  If so, some numbers here would be
> far more pursuasive.

Is "how long it took me to set up and document this" a valid benchmark?
No, I haven't run any tests yet. However, the overhead I'm thinking of
isn't only related to raw speed, but also includes administrative tasks.

Overall, I'd like to have an environment where users are grouped in
containers but still have them slightly isolated from each other (things
outside normal Unix restrictions include e.g. not seeing others'
processes or not being able to step on their resources--like the IP
address assigned). In the end, I'd like to have up to a dozen or a few
"big" containers and hundreds+ of per-user cgroups (without additional
namespace divisions) per machine. Do you think a bridge together with
several hundred veths in the root namespace won't confuse admin tools
(or the admins themselves)? Or should I use macvlan for that, or
possibly something else altogether?

I'll try to get some numbers but my current dev. machine is a VMware
instance on my laptop and that runs rather abysmally, so they'll be
probably skewed one way or another.

> (Mind you I've written a few version of this - based on LSM -
> myself in the past, but that was before network namespaces
> existed)

Best regards,
 Grzegorz Nosek
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]         ` <20090107191536.GA15159-yp6mvK3Bdd2rDJvtcaxF/A@public.gmane.org>
@ 2009-01-07 19:32           ` Serge E. Hallyn
       [not found]             ` <20090107193234.GA22625-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 33+ messages in thread
From: Serge E. Hallyn @ 2009-01-07 19:32 UTC (permalink / raw)
  To: Grzegorz Nosek; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Grzegorz Nosek (root@localdomain.pl):
> On śro, sty 07, 2009 at 12:07:52 -0600, Serge E. Hallyn wrote:
> > Have you run a test, and found that in fact a network namespace
> > is too heavyweight to do so?  If so, some numbers here would be
> > far more pursuasive.
> 
> Is "how long it took me to set up and document this" a valid benchmark?
> No, I haven't run any tests yet. However, the overhead I'm thinking of
> isn't only related to raw speed, but also includes administrative tasks.
> 
> Overall, I'd like to have an environment where users are grouped in
> containers but still have them slightly isolated from each other (things
> outside normal Unix restrictions include e.g. not seeing others'
> processes or not being able to step on their resources--like the IP
> address assigned). In the end, I'd like to have up to a dozen or a few
> "big" containers and hundreds+ of per-user cgroups (without additional
> namespace divisions) per machine. Do you think a bridge together with
> several hundred veths in the root namespace won't confuse admin tools
> (or the admins themselves)? Or should I use macvlan for that, or
> possibly something else altogether?
> 
> I'll try to get some numbers but my current dev. machine is a VMware
> instance on my laptop and that runs rather abysmally, so they'll be
> probably skewed one way or another.
> 
> > (Mind you I've written a few version of this - based on LSM -
> > myself in the past, but that was before network namespaces
> > existed)
> 
> Best regards,
>  Grzegorz Nosek

Does anyone else (Eric? Pavel?) have experience with hundreds
or thousands of network namespaces?

-serge
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]             ` <20090107193234.GA22625-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-01-08 12:43               ` Benny Amorsen
       [not found]                 ` <20090109144122.GA9685@megiteam.pl>
  2009-01-09 16:54               ` Dan Smith
  1 sibling, 1 reply; 33+ messages in thread
From: Benny Amorsen @ 2009-01-08 12:43 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Grzegorz Nosek,
	public-containers-qjLDD68F18O7TbgM5vRIOg-z5DuStaUktnZ+VzJOa5vwg



"Serge E. Hallyn" <serue-r/Jw6+rmf7HQT0dZR+AlfA-XMD5yJDbdMReXY1tMh2IBg@public.gmane.org>
writes:

> Does anyone else (Eric? Pavel?) have experience with hundreds
> or thousands of network namespaces?

Hundreds aren't a problem with OpenVZ (I do that in production) and
the vanilla kernel namespaces shouldn't be heavier. I don't think
performance is a good argument for the patch.

However, I do see the appeal of patch anyway. It would be tempting to
use cgroups inside a network namespace for administrative reasons,
like Grzegorz Nosek proposed. I am not sure if you can create name
spaces with the semantics he proposed:

 - INADDR_LOOPBACK is explicitly allowed (a special case)
 - INADDR_ANY is remapped to _the_ IP address
 - _the_ IP address is passed through unharmed
 - everything else causes -EPERM

If you can get those semantics (or something close) already, then the
patch isn't useful.


/Benny

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]             ` <20090107193234.GA22625-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2009-01-08 12:43               ` Benny Amorsen
@ 2009-01-09 16:54               ` Dan Smith
       [not found]                 ` <87priwifnu.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  1 sibling, 1 reply; 33+ messages in thread
From: Dan Smith @ 2009-01-09 16:54 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

SH> Does anyone else (Eric? Pavel?) have experience with hundreds or
SH> thousands of network namespaces?

I just gave it a shot on linux-next-20090108 with the following test
case:

  int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER \
                  | CLONE_NEWIPC|SIGCHLD|CLONE_NEWNET;
  
  int clone_child(void *data)
  {
          printf("Child %i\n", (int)data);
          sleep(30);
          exit(0);
  }
  
  int main(int argc, char **argv)
  {
          int i;
  
          for (i = 0; i < 100; i++) {
                  char *stack;
                  unsigned int stacksize = getpagesize() * 4;
  
                  stack = malloc(stacksize);
                  if (stack == NULL) {
                          printf("Failed to allocate %i\n", stacksize);
                          return 1;
                  }
  
                  printf("Clone %i\n", i);
                  clone(clone_child, stack + stacksize, flags, (void*)i);
          }
  
          sleep(40);
  }

The loop runs to completion, but only 18 children ever print their
message.  After the test completes, doing something else (like
bringing up a man page) consistently results in this panic:

  BUG: unable to handle kernel paging request at 00c85788
  IP: [<c0252af8>] rb_insert_color+0x28/0x100
  Oops: 0000 [#1] SMP 
  last sysfs file: /sys/devices/pci0000:00/0000:00:01.1/host0/target0:0:1/0:0:1:0/block/sr0/size
  Modules linked in: ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc nfs lockd nfs_acl auth_rpcgss sunrpc af_packet ipv6 binfmt_misc dm_mirror dm_region_hash dm_log dm_multipath scsi_dh dm_mod uinput virtio_balloon virtio_net evbug evdev pcspkr virtio_pci virtio_ring virtio i2c_piix4 i2c_core sr_mod cdrom sg thermal button processor ata_generic pata_acpi piix ide_core sd_mod crc_t10dif ext3 jbd mbcache
  
  Pid: 2865, comm: man Not tainted (2.6.28-next-20090108 #5) 
  EIP: 0060:[<c0252af8>] EFLAGS: 00010202 CPU: 0
  EIP is at rb_insert_color+0x28/0x100
  EAX: c8578088 EBX: c8578088 ECX: c8578090 EDX: 00c85780
  ESI: c8578088 EDI: 00c85780 EBP: cd93be28 ESP: cd93be14
   DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
  Process man (pid: 2865, ti=cd93a000 task=cb75bd90 task.ti=cd93a000)
  Stack:
   cb56cc00 c8578087 c857807f c8578088 c857809f cd93be40 d0afe76d cb56cc00
   c8ccb00c 0001fe43 cf785f00 cd93be74 d0b04f79 c8ccb00c 0000000c 00000000
   cf7e0e28 c845d180 c8ccbff8 00000001 00000000 cb56cc00 cf7e0e28 cf7e0e28
  Call Trace:
   [<d0afe76d>] ? ext3_htree_store_dirent+0xbd/0x110 [ext3]
   [<d0b04f79>] ? htree_dirblock_to_tree+0x109/0x180 [ext3]
   [<d0b07a11>] ? ext3_htree_fill_tree+0x61/0x210 [ext3]
   [<c01b77e3>] ? nameidata_to_filp+0x53/0x70
   [<d0afe684>] ? ext3_readdir+0x6d4/0x700 [ext3]
   [<d0afe532>] ? ext3_readdir+0x582/0x700 [ext3]
   [<c01bc8b4>] ? cp_new_stat64+0xe4/0x100
   [<c01c6690>] ? filldir+0x0/0xd0
   [<c01bcd52>] ? sys_fstat64+0x22/0x30
   [<c01c68c8>] ? vfs_readdir+0x88/0xa0
   [<c01c6690>] ? filldir+0x0/0xd0
   [<c01c69f8>] ? sys_getdents+0x68/0xb0
   [<c0103762>] ? syscall_call+0x7/0xb
  Code: 8d 76 00 55 89 e5 57 56 53 83 ec 08 89 45 f0 89 55 ec 90 8b 55 f0 8b 02 89 c3 83 e3 fc 74 3c 8b 13 f6 c2 01 75 35 89 d7 83 e7 fc <8b> 77 08 39 de 74 59 85 f6 74 35 8b 06 a8 01 75 2f 83 c8 01 89 
  EIP: [<c0252af8>] rb_insert_color+0x28/0x100 SS:ESP 0068:cd93be14
  ---[ end trace 5af0fea6439f26a1 ]---

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]                     ` <20090109162247.GA7925-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-01-09 16:57                       ` Grzegorz Nosek
  0 siblings, 0 replies; 33+ messages in thread
From: Grzegorz Nosek @ 2009-01-09 16:57 UTC (permalink / raw)
  To: Serge E. Hallyn
  Cc: Serge E. Hallyn,
	public-containers-qjLDD68F18O7TbgM5vRIOg-z5DuStaUktnZ+VzJOa5vwg,
	Benny Amorsen




On pią, sty 09, 2009 at 10:22:48 -0600, Serge E. Hallyn wrote:
> Well, I think right now we're talking about 'how to do it with network
> namespaces' mostly bc we're curious.  I've heard no strong objections
> to your patch, so please do resubmit once you've addressed Li's
> comments.

Thanks! Will resend. But I'd like to hear some comments about locking
access to the IP address. As Li mentioned, I need the cgroup_lock() to
modify it as I inspect the hierarchy there. However, I don't know what
locking would be required on the _read_ side. Can I expect the address
to be always accessed atomically? What should I use for e.g. IPv6?
A seqlock? RCU (somehow)?

Best regards,
 Grzegorz Nosek


_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]                 ` <87priwifnu.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-01-09 17:43                   ` Guenter Roeck
       [not found]                     ` <20090109174334.GA4526-gvzKVTG1yJJBDgjK7y7TUQ@public.gmane.org>
  2009-01-09 18:30                   ` Serge E. Hallyn
  1 sibling, 1 reply; 33+ messages in thread
From: Guenter Roeck @ 2009-01-09 17:43 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg@public.gmane.org

I have tried something similar, only with CLONE_FILES|CLONE_FS|CLONE_VM|CLONE_NEWNET,
and actually creating a virtual interface and controlling socket or thread in each new
network namespace. This scales to a couple of thousand interfaces, though interface creation
takes a long time if more than 1,000 interfaces or so are created.

Problems I have seen are
- name hash in kernel is bad. A test program with similar names (eg eth0 to eth1000)
  shows that only every 17th bucket or so is used at all.
- current sysfs implementation doesn't scale to thousands of interfaces.
  Sequential search through file names, especially using strcmp, doesn't work well
  if there are thousands of entries in a directory.
- Using sockets to control network namespaces starts to fail after a couple hundred 
  namespaces and attached interfaces are created. There is no error message, only 
  the socket<->interface/namespace relationship isn't always created. Some interfaces
  stay in the initial network namespace.
- the idea of attaching/associating network namespaces with sockets and/or threads
  doesn't really work well unless used strictly for virtualization. For other
  applications (eg per-customer network namespaces in switches) one can not really
  afford to "loose" a network namespace just because a controlling process dies.

I can send you the code if you like.

Guenter

On Fri, Jan 09, 2009 at 08:54:13AM -0800, Dan Smith wrote:
> SH> Does anyone else (Eric? Pavel?) have experience with hundreds or
> SH> thousands of network namespaces?
> 
> I just gave it a shot on linux-next-20090108 with the following test
> case:
> 
>   int flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER \
>                   | CLONE_NEWIPC|SIGCHLD|CLONE_NEWNET;
> 
>   int clone_child(void *data)
>   {
>           printf("Child %i\n", (int)data);
>           sleep(30);
>           exit(0);
>   }
> 
>   int main(int argc, char **argv)
>   {
>           int i;
> 
>           for (i = 0; i < 100; i++) {
>                   char *stack;
>                   unsigned int stacksize = getpagesize() * 4;
> 
>                   stack = malloc(stacksize);
>                   if (stack == NULL) {
>                           printf("Failed to allocate %i\n", stacksize);
>                           return 1;
>                   }
> 
>                   printf("Clone %i\n", i);
>                   clone(clone_child, stack + stacksize, flags, (void*)i);
>           }
> 
>           sleep(40);
>   }
> 
> The loop runs to completion, but only 18 children ever print their
> message.  After the test completes, doing something else (like
> bringing up a man page) consistently results in this panic:
> 
>   BUG: unable to handle kernel paging request at 00c85788
>   IP: [<c0252af8>] rb_insert_color+0x28/0x100
>   Oops: 0000 [#1] SMP
>   last sysfs file: /sys/devices/pci0000:00/0000:00:01.1/host0/target0:0:1/0:0:1:0/block/sr0/size
>   Modules linked in: ipt_MASQUERADE iptable_nat nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_state nf_conntrack ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp llc nfs lockd nfs_acl auth_rpcgss sunrpc af_packet ipv6 binfmt_misc dm_mirror dm_region_hash dm_log dm_multipath scsi_dh dm_mod uinput virtio_balloon virtio_net evbug evdev pcspkr virtio_pci virtio_ring virtio i2c_piix4 i2c_core sr_mod cdrom sg thermal button processor ata_generic pata_acpi piix ide_core sd_mod crc_t10dif ext3 jbd mbcache
> 
>   Pid: 2865, comm: man Not tainted (2.6.28-next-20090108 #5)
>   EIP: 0060:[<c0252af8>] EFLAGS: 00010202 CPU: 0
>   EIP is at rb_insert_color+0x28/0x100
>   EAX: c8578088 EBX: c8578088 ECX: c8578090 EDX: 00c85780
>   ESI: c8578088 EDI: 00c85780 EBP: cd93be28 ESP: cd93be14
>    DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068
>   Process man (pid: 2865, ti=cd93a000 task=cb75bd90 task.ti=cd93a000)
>   Stack:
>    cb56cc00 c8578087 c857807f c8578088 c857809f cd93be40 d0afe76d cb56cc00
>    c8ccb00c 0001fe43 cf785f00 cd93be74 d0b04f79 c8ccb00c 0000000c 00000000
>    cf7e0e28 c845d180 c8ccbff8 00000001 00000000 cb56cc00 cf7e0e28 cf7e0e28
>   Call Trace:
>    [<d0afe76d>] ? ext3_htree_store_dirent+0xbd/0x110 [ext3]
>    [<d0b04f79>] ? htree_dirblock_to_tree+0x109/0x180 [ext3]
>    [<d0b07a11>] ? ext3_htree_fill_tree+0x61/0x210 [ext3]
>    [<c01b77e3>] ? nameidata_to_filp+0x53/0x70
>    [<d0afe684>] ? ext3_readdir+0x6d4/0x700 [ext3]
>    [<d0afe532>] ? ext3_readdir+0x582/0x700 [ext3]
>    [<c01bc8b4>] ? cp_new_stat64+0xe4/0x100
>    [<c01c6690>] ? filldir+0x0/0xd0
>    [<c01bcd52>] ? sys_fstat64+0x22/0x30
>    [<c01c68c8>] ? vfs_readdir+0x88/0xa0
>    [<c01c6690>] ? filldir+0x0/0xd0
>    [<c01c69f8>] ? sys_getdents+0x68/0xb0
>    [<c0103762>] ? syscall_call+0x7/0xb
>   Code: 8d 76 00 55 89 e5 57 56 53 83 ec 08 89 45 f0 89 55 ec 90 8b 55 f0 8b 02 89 c3 83 e3 fc 74 3c 8b 13 f6 c2 01 75 35 89 d7 83 e7 fc <8b> 77 08 39 de 74 59 85 f6 74 35 8b 06 a8 01 75 2f 83 c8 01 89
>   EIP: [<c0252af8>] rb_insert_color+0x28/0x100 SS:ESP 0068:cd93be14
>   ---[ end trace 5af0fea6439f26a1 ]---
> 
> --
> Dan Smith
> IBM Linux Technology Center
> email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org
> 
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]                     ` <20090109174334.GA4526-gvzKVTG1yJJBDgjK7y7TUQ@public.gmane.org>
@ 2009-01-09 18:12                       ` Dan Smith
       [not found]                         ` <87ljtkic1j.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  0 siblings, 1 reply; 33+ messages in thread
From: Dan Smith @ 2009-01-09 18:12 UTC (permalink / raw)
  To: Guenter Roeck; +Cc: containers-qjLDD68F18O7TbgM5vRIOg@public.gmane.org

GR> I have tried something similar, only with
GR> CLONE_FILES|CLONE_FS|CLONE_VM|CLONE_NEWNET, and actually creating
GR> a virtual interface and controlling socket or thread in each new
GR> network namespace.

My initial test was to create a veth pair and move one end into the
namespace during create.  That failed in the same way, so I took the
veth's out of the equation with the posted test.

GR> This scales to a couple of thousand interfaces, though interface
GR> creation takes a long time if more than 1,000 interfaces or so are
GR> created.

Yeah, just creating a bunch of pairs starts to slow down after a
hundred veth's or so.  I think that for thousands of network
namespaces, things would be pretty painful.

GR> I can send you the code if you like.

I'd like to see it.

Thanks!

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]                 ` <87priwifnu.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
  2009-01-09 17:43                   ` Guenter Roeck
@ 2009-01-09 18:30                   ` Serge E. Hallyn
       [not found]                     ` <20090109183046.GA14063-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  1 sibling, 1 reply; 33+ messages in thread
From: Serge E. Hallyn @ 2009-01-09 18:30 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

Quoting Dan Smith (danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org):
> SH> Does anyone else (Eric? Pavel?) have experience with hundreds or
> SH> thousands of network namespaces?
> 
> I just gave it a shot on linux-next-20090108 with the following test
> case:

With Linus' tree (pulled yesterday) I get no errors.  Can you bisect?
(Assuming you don't also get the error on plain linux-2.6)

thanks,
-serge

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] Re: [RFC][PATCH] IP address restricting cgroup subsystem
  2009-01-07  9:33             ` Li Zefan
  2009-01-07  9:37               ` Grzegorz Nosek
@ 2009-01-09 21:38               ` Paul Menage
  2009-01-10  4:50                 ` Li Zefan
  1 sibling, 1 reply; 33+ messages in thread
From: Paul Menage @ 2009-01-09 21:38 UTC (permalink / raw)
  To: Li Zefan; +Cc: Grzegorz Nosek, containers, netdev

On Wed, Jan 7, 2009 at 1:33 AM, Li Zefan <lizf@cn.fujitsu.com> wrote:
>>
>> Yes, I like that. Will update the patch. I assume that I must check
>> list_empty(&cgroup->children)?
>
> Yes.
>
>> Should I use cgroup_lock()/cgroup_unlock()
>
> Yes.

For checking the "children" list, you can just lock
ipaddr_subsys.hierarchy_mutex.

Paul

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] [RFC][PATCH] IP address restricting cgroup subsystem
       [not found] ` <20090106230554.GB25228-IaEwMO9oKu/77SC2UrCW1JJg/dWx8T/9@public.gmane.org>
  2009-01-07  6:01   ` Li Zefan
  2009-01-07 18:07   ` Serge E. Hallyn
@ 2009-01-09 21:58   ` Paul Menage
       [not found]     ` <6599ad830901091358m11effdbegeff6cbb7ee28e262-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2 siblings, 1 reply; 33+ messages in thread
From: Paul Menage @ 2009-01-09 21:58 UTC (permalink / raw)
  To: Grzegorz Nosek; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

While something to allow restricting network access from tasks in a
cgroup is useful, the basic problem with the patches that have been
proposed is the userspace API.

Once an API makes it into the kernel, we have to support it more or
less indefinitely. So that means we want to come up with something
that will satisfy say 95% of users *before* it gets anywhere near
mainline.

So for example, some people might want multiple IP addresses; others
might want to specify a subnet, but exclude certain addresses;
controlling which ports or port-ranges can be bound to is also useful
(and in fact is what I'd be most interested in).

Ideally we'd avoid making up a brand new userspace API for this. It
would be great if we could somehow make use of the iptables API, which
already has support for specifying these kinds of conditions.

I did once hack together a proof-of-concept that let you use iptables
for controlling connect/accept/bind operations, but it was a complete
mess and wouldn't survive code review :-)

Paul

On Tue, Jan 6, 2009 at 3:05 PM, Grzegorz Nosek <root-AfQBxy1nhrQ00sYp1HPQUA@public.gmane.org> wrote:
> This is a very simple cgroup subsystem to restrict IP addresses used
> by member processes. Currently it is limited to IPv4 only but IPv6 (or
> other protocols) should be easy to implement.
>
> IP addresses are write-once (via /cgroup/.../ipaddr.ipv4 in dotted-quad
> format) and are inherited by descendant cgroups, so a process once
> restricted should never be able to get rid of the limits. Any address
> may be specified in multiple cgroups. No verification is done to ensure
> the addresses are actually configured on the machine, which has its
> advantages (may add the addresses later) and disadvantages (if you enter
> the wrong address, the cgroup will be effectively cut off from the
> network).
>
> Whenever a process inside a restricted cgroup calls bind(2), the address
> is checked like this:
>  - INADDR_LOOPBACK is explicitly allowed (a special case)
>  - INADDR_ANY is remapped to _the_ IP address
>  - _the_ IP address is passed through unharmed
>  - everything else causes -EPERM
>
> When a process calls connect(2), this subsystem calls bind(_the_IP_)
> quietly behind its back, while preserving the original bound port (if
> any).
>
> Rationale (or when/why would you want it):
> The use case for ipaddr_cgroup doesn't overlap with network namespaces,
> which also allow IP address restrictions, because it aims to be much
> lighter due to its limited scope (hopefully able to easily support
> hundreds or possibly thousands of distinct cgroups). It does not attempt
> to hide the existence of other IP addresses from the user.
>
> Signed-off-by: Grzegorz Nosek <root-AfQBxy1nhrQ00sYp1HPQUA@public.gmane.org>
> ---
>
> This is more of an RFC than a finished patch so any and all comments are
> appreciated.
>
> The patch is based to a significant extent on the device_cgroup code,
> including bypassing the security infrastructure and hooking directly
> into the networking code.
>
> I'd also love to hear your opinion about locking--I have a version of this
> patch that uses a seqlock to protect the IP address but I'm not sure this
> is the Right Way to do it (and raw non-atomic lockless access looks scary,
> regardless of how rarely would the address be changed, i.e. at most
> once).
>
> And of course, if the whole idea is stupid, let me know.
>
>  include/linux/cgroup_subsys.h |    6 ++
>  include/linux/ipaddr_cgroup.h |   23 +++++
>  init/Kconfig                  |    7 ++
>  net/socket.c                  |   16 +++-
>  security/Makefile             |    1 +
>  security/ipaddr_cgroup.c      |  200 +++++++++++++++++++++++++++++++++++++++++
>  6 files changed, 250 insertions(+), 3 deletions(-)
>  create mode 100644 include/linux/ipaddr_cgroup.h
>  create mode 100644 security/ipaddr_cgroup.c
>
> diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
> index 9c22396..70dd375 100644
> --- a/include/linux/cgroup_subsys.h
> +++ b/include/linux/cgroup_subsys.h
> @@ -54,3 +54,9 @@ SUBSYS(freezer)
>  #endif
>
>  /* */
> +
> +#ifdef CONFIG_CGROUP_IPADDR
> +SUBSYS(ipaddr)
> +#endif
> +
> +/* */
> diff --git a/include/linux/ipaddr_cgroup.h b/include/linux/ipaddr_cgroup.h
> new file mode 100644
> index 0000000..19dc382
> --- /dev/null
> +++ b/include/linux/ipaddr_cgroup.h
> @@ -0,0 +1,23 @@
> +#ifndef HAVE_IPADDR_CGROUP_H
> +#define HAVE_IPADDR_CGROUP_H
> +
> +struct socket;
> +struct sockaddr;
> +
> +#ifdef CONFIG_CGROUP_IPADDR
> +int ipaddr_cgroup_connect(struct socket *sock, struct sockaddr *address, int addrlen);
> +int ipaddr_cgroup_bind(struct socket *sock, struct sockaddr *address, int addrlen);
> +
> +#else
> +static inline int ipaddr_cgroup_connect(struct socket *sock, struct sockaddr *address, int addrlen)
> +{
> +       return 0;
> +}
> +
> +static inline int ipaddr_cgroup_bind(struct socket *sock, struct sockaddr *address, int addrlen)
> +{
> +       return 0;
> +}
> +
> +#endif /* CONFIG_CGROUP_IPADDR */
> +#endif /* HAVE_IPADDR_CGROUP_H */
> diff --git a/init/Kconfig b/init/Kconfig
> index 35d87b9..db43344 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -338,6 +338,13 @@ config CGROUP_DEVICE
>          Provides a cgroup implementing whitelists for devices which
>          a process in the cgroup can mknod or open.
>
> +config CGROUP_IPADDR
> +       bool "IP address controller for cgroups"
> +       depends on CGROUPS && EXPERIMENTAL
> +       help
> +         Provides a cgroup restricting IP addresses its member processes
> +         can use.
> +
>  config CPUSETS
>        bool "Cpuset support"
>        depends on SMP && CGROUPS
> diff --git a/net/socket.c b/net/socket.c
> index 3e8d4e3..3bd8c08 100644
> --- a/net/socket.c
> +++ b/net/socket.c
> @@ -87,6 +87,7 @@
>  #include <linux/audit.h>
>  #include <linux/wireless.h>
>  #include <linux/nsproxy.h>
> +#include <linux/ipaddr_cgroup.h>
>
>  #include <asm/uaccess.h>
>  #include <asm/unistd.h>
> @@ -1375,9 +1376,13 @@ asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen)
>        if (sock) {
>                err = move_addr_to_kernel(umyaddr, addrlen, (struct sockaddr *)&address);
>                if (err >= 0) {
> -                       err = security_socket_bind(sock,
> -                                                  (struct sockaddr *)&address,
> -                                                  addrlen);
> +                       err = ipaddr_cgroup_bind(sock,
> +                                                (struct sockaddr *)&address,
> +                                                addrlen);
> +                       if (!err)
> +                               err = security_socket_bind(sock,
> +                                                          (struct sockaddr *)&address,
> +                                                          addrlen);
>                        if (!err)
>                                err = sock->ops->bind(sock,
>                                                      (struct sockaddr *)
> @@ -1600,6 +1605,11 @@ asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr,
>                goto out_put;
>
>        err =
> +           ipaddr_cgroup_connect(sock, (struct sockaddr *)&address, addrlen);
> +       if (err)
> +               goto out_put;
> +
> +       err =
>            security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
>        if (err)
>                goto out_put;
> diff --git a/security/Makefile b/security/Makefile
> index f654260..aaf225e 100644
> --- a/security/Makefile
> +++ b/security/Makefile
> @@ -16,3 +16,4 @@ obj-$(CONFIG_SECURITY_SELINUX)                += selinux/built-in.o
>  obj-$(CONFIG_SECURITY_SMACK)           += smack/built-in.o
>  obj-$(CONFIG_SECURITY_ROOTPLUG)                += root_plug.o
>  obj-$(CONFIG_CGROUP_DEVICE)            += device_cgroup.o
> +obj-$(CONFIG_CGROUP_IPADDR)            += ipaddr_cgroup.o
> diff --git a/security/ipaddr_cgroup.c b/security/ipaddr_cgroup.c
> new file mode 100644
> index 0000000..96ccf27
> --- /dev/null
> +++ b/security/ipaddr_cgroup.c
> @@ -0,0 +1,200 @@
> +/*
> + * IP address cgroup subsystem
> + */
> +
> +#include <linux/ipaddr_cgroup.h>
> +
> +#include <linux/cgroup.h>
> +#include <linux/err.h>
> +#include <linux/in.h>
> +#include <linux/inet.h>
> +#include <linux/seq_file.h>
> +#include <linux/socket.h>
> +
> +#include <net/inet_sock.h>
> +
> +struct ipaddr_cgroup {
> +       struct cgroup_subsys_state css;
> +       u32 ipv4_addr;
> +};
> +
> +static inline struct ipaddr_cgroup *css_to_ipcgroup(struct cgroup_subsys_state *s)
> +{
> +       return container_of(s, struct ipaddr_cgroup, css);
> +}
> +
> +static inline struct ipaddr_cgroup *cgroup_to_ipcgroup(struct cgroup *cgroup)
> +{
> +       return css_to_ipcgroup(cgroup_subsys_state(cgroup, ipaddr_subsys_id));
> +}
> +
> +static inline struct ipaddr_cgroup *task_ipcgroup(struct task_struct *task)
> +{
> +       return css_to_ipcgroup(task_subsys_state(task, ipaddr_subsys_id));
> +}
> +
> +struct cgroup_subsys ipaddr_subsys;
> +
> +static int ipcgroup_can_attach(struct cgroup_subsys *ss,
> +               struct cgroup *new_cgroup, struct task_struct *task)
> +{
> +       struct ipaddr_cgroup *old_ipcgroup, *new_ipcgroup;
> +       u32 old_ipv4;
> +
> +       if (current != task && !capable(CAP_SYS_ADMIN))
> +               return -EPERM;
> +
> +       old_ipcgroup = task_ipcgroup(task);
> +       new_ipcgroup = cgroup_to_ipcgroup(new_cgroup);
> +       old_ipv4 = old_ipcgroup->ipv4_addr;
> +
> +       if (old_ipv4 != INADDR_ANY && old_ipv4 != new_ipcgroup->ipv4_addr)
> +               return -EPERM;
> +
> +       return 0;
> +}
> +
> +static struct cgroup_subsys_state *ipcgroup_create(struct cgroup_subsys *ss,
> +                                               struct cgroup *cgroup)
> +{
> +       struct ipaddr_cgroup *ipcgroup, *parent_ipcgroup;
> +       struct cgroup *parent_cgroup;
> +
> +       ipcgroup = kzalloc(sizeof(*ipcgroup), GFP_KERNEL);
> +       if (!ipcgroup)
> +               return ERR_PTR(-ENOMEM);
> +       parent_cgroup = cgroup->parent;
> +
> +       if (parent_cgroup == NULL) {
> +               ipcgroup->ipv4_addr = htonl(INADDR_ANY);
> +       } else {
> +               parent_ipcgroup = cgroup_to_ipcgroup(parent_cgroup);
> +               ipcgroup->ipv4_addr = parent_ipcgroup->ipv4_addr;
> +       }
> +
> +       return &ipcgroup->css;
> +}
> +
> +static void ipcgroup_destroy(struct cgroup_subsys *ss,
> +                       struct cgroup *cgroup)
> +{
> +       struct ipaddr_cgroup *ipcgroup;
> +
> +       ipcgroup = cgroup_to_ipcgroup(cgroup);
> +       kfree(ipcgroup);
> +}
> +
> +static int ipcgroup_write_ipv4(struct cgroup *cgrp, struct cftype *cft,
> +                       const char *buffer)
> +{
> +       u32 new_addr;
> +       struct ipaddr_cgroup *ipcgroup;
> +       int ret;
> +
> +       if (!capable(CAP_SYS_ADMIN))
> +               return -EPERM;
> +
> +       ipcgroup = cgroup_to_ipcgroup(cgrp);
> +       if (ipcgroup->ipv4_addr != htonl(INADDR_ANY))
> +               return -EPERM;
> +
> +       ret = in4_pton(buffer, -1, (u8 *)&new_addr, '\0', NULL);
> +       if (!ret)
> +               return -EINVAL;
> +
> +       /* already network-endian */
> +       ipcgroup->ipv4_addr = new_addr;
> +       return 0;
> +}
> +
> +static int ipcgroup_read_ipv4(struct cgroup *cgrp, struct cftype *cft,
> +                       struct seq_file *m)
> +{
> +       struct ipaddr_cgroup *ipcgroup;
> +
> +       ipcgroup = cgroup_to_ipcgroup(cgrp);
> +       seq_printf(m, NIPQUAD_FMT "\n", NIPQUAD(ipcgroup->ipv4_addr));
> +       return 0;
> +}
> +
> +static struct cftype ipaddr_cgroup_files[] = {
> +       {
> +               .name = "ipv4",
> +               .write_string = ipcgroup_write_ipv4,
> +               .read_seq_string = ipcgroup_read_ipv4,
> +       },
> +};
> +
> +static int ipcgroup_populate(struct cgroup_subsys *ss,
> +                               struct cgroup *cgroup)
> +{
> +       return cgroup_add_files(cgroup, ss, ipaddr_cgroup_files,
> +                                       ARRAY_SIZE(ipaddr_cgroup_files));
> +}
> +
> +struct cgroup_subsys ipaddr_subsys = {
> +       .name = "ipaddr",
> +       .can_attach = ipcgroup_can_attach,
> +       .create = ipcgroup_create,
> +       .destroy = ipcgroup_destroy,
> +       .populate = ipcgroup_populate,
> +       .subsys_id = ipaddr_subsys_id
> +};
> +
> +int ipaddr_cgroup_connect(struct socket *sock, struct sockaddr *address, int addrlen)
> +{
> +       struct sockaddr_in sa_in;
> +       struct ipaddr_cgroup *ipcgroup;
> +       struct inet_sock *inet;
> +       int err;
> +
> +       if (address->sa_family != AF_INET)
> +               return 0;
> +
> +       ipcgroup = task_ipcgroup(current);
> +       if (ipcgroup->ipv4_addr == htonl(INADDR_ANY))
> +               return 0;
> +
> +       inet = inet_sk(sock->sk);
> +
> +       sa_in.sin_family = AF_INET;
> +       sa_in.sin_addr.s_addr = ipcgroup->ipv4_addr;
> +       sa_in.sin_port = inet->sport;
> +
> +       err = security_socket_bind(sock, (struct sockaddr *)&sa_in, sizeof(sa_in));
> +       if (err)
> +               return err;
> +
> +       err = sock->ops->bind(sock, (struct sockaddr *)&sa_in, sizeof(sa_in));
> +
> +       return err;
> +}
> +
> +int ipaddr_cgroup_bind(struct socket *sock, struct sockaddr *address, int addrlen)
> +{
> +       struct sockaddr_in *sa_in;
> +       struct ipaddr_cgroup *ipcgroup;
> +
> +       if (address->sa_family != AF_INET)
> +               return 0;
> +
> +       ipcgroup = task_ipcgroup(current);
> +       if (ipcgroup->ipv4_addr == htonl(INADDR_ANY))
> +               return 0;
> +
> +       sa_in = (struct sockaddr_in *) address;
> +
> +       /* remap INADDR_ANY to cgroup IP address */
> +       if (sa_in->sin_addr.s_addr == htonl(INADDR_ANY))
> +               sa_in->sin_addr.s_addr = ipcgroup->ipv4_addr;
> +
> +       /* a very special case */
> +       if (sa_in->sin_addr.s_addr == htonl(INADDR_LOOPBACK))
> +               return 0;
> +
> +       if (sa_in->sin_addr.s_addr == ipcgroup->ipv4_addr)
> +               return 0;
> +
> +       return -EPERM;
> +}
> +
> --
> 1.6.1
>
> _______________________________________________
> Containers mailing list
> Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
>
> _______________________________________________
> Devel mailing list
> Devel-GEFAQzZX7r8dnm+yROfE0A@public.gmane.org
> https://openvz.org/mailman/listinfo/devel
>

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]                         ` <87ljtkic1j.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
@ 2009-01-09 22:37                           ` Guenter Roeck
       [not found]                             ` <20090109223756.GA22738-gvzKVTG1yJJBDgjK7y7TUQ@public.gmane.org>
  0 siblings, 1 reply; 33+ messages in thread
From: Guenter Roeck @ 2009-01-09 22:37 UTC (permalink / raw)
  To: Dan Smith; +Cc: containers-qjLDD68F18O7TbgM5vRIOg@public.gmane.org

[-- Attachment #1: Type: text/plain, Size: 1212 bytes --]

On Fri, Jan 09, 2009 at 10:12:24AM -0800, Dan Smith wrote:
> GR> I have tried something similar, only with
> GR> CLONE_FILES|CLONE_FS|CLONE_VM|CLONE_NEWNET, and actually creating
> GR> a virtual interface and controlling socket or thread in each new
> GR> network namespace.
> 
> My initial test was to create a veth pair and move one end into the
> namespace during create.  That failed in the same way, so I took the
> veth's out of the equation with the posted test.
> 
> GR> This scales to a couple of thousand interfaces, though interface
> GR> creation takes a long time if more than 1,000 interfaces or so are
> GR> created.
> 
This is at least to some degree due to the problems I mentioned earlier.
Enhancing the kernel name hash and the sysfs implementation improves
performance a lot.

> Yeah, just creating a bunch of pairs starts to slow down after a
> hundred veth's or so.  I think that for thousands of network
> namespaces, things would be pretty painful.
> 
> GR> I can send you the code if you like.
> 
> I'd like to see it.
> 
See attached. I used the "ctx" module in the attached code to create interfaces, 
so you'll have to compile and insmod it if you want to create interfaces.

Guenter

[-- Attachment #2: netclone.tar.gz --]
[-- Type: application/octet-stream, Size: 3621 bytes --]

[-- Attachment #3: Type: text/plain, Size: 206 bytes --]

_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]                             ` <20090109223756.GA22738-gvzKVTG1yJJBDgjK7y7TUQ@public.gmane.org>
@ 2009-01-09 22:47                               ` Serge E. Hallyn
       [not found]                                 ` <20090109224742.GA15227-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 33+ messages in thread
From: Serge E. Hallyn @ 2009-01-09 22:47 UTC (permalink / raw)
  To: Guenter Roeck; +Cc: containers-qjLDD68F18O7TbgM5vRIOg@public.gmane.org

Quoting Guenter Roeck (groeck-gvzKVTG1yJJBDgjK7y7TUQ@public.gmane.org):
> On Fri, Jan 09, 2009 at 10:12:24AM -0800, Dan Smith wrote:
> > GR> I have tried something similar, only with
> > GR> CLONE_FILES|CLONE_FS|CLONE_VM|CLONE_NEWNET, and actually creating
> > GR> a virtual interface and controlling socket or thread in each new
> > GR> network namespace.
> > 
> > My initial test was to create a veth pair and move one end into the
> > namespace during create.  That failed in the same way, so I took the
> > veth's out of the equation with the posted test.
> > 
> > GR> This scales to a couple of thousand interfaces, though interface
> > GR> creation takes a long time if more than 1,000 interfaces or so are
> > GR> created.
> > 
> This is at least to some degree due to the problems I mentioned earlier.
> Enhancing the kernel name hash and the sysfs implementation improves
> performance a lot.

Is this something you've had a chance to start addressing?  (Just wondering)

-serge

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]                                 ` <20090109224742.GA15227-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-01-09 23:37                                   ` Guenter Roeck
  0 siblings, 0 replies; 33+ messages in thread
From: Guenter Roeck @ 2009-01-09 23:37 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg@public.gmane.org

On Fri, Jan 09, 2009 at 02:47:42PM -0800, Serge E. Hallyn wrote:
> Quoting Guenter Roeck (groeck-gvzKVTG1yJJBDgjK7y7TUQ@public.gmane.org):
> > On Fri, Jan 09, 2009 at 10:12:24AM -0800, Dan Smith wrote:
> > > GR> I have tried something similar, only with
> > > GR> CLONE_FILES|CLONE_FS|CLONE_VM|CLONE_NEWNET, and actually creating
> > > GR> a virtual interface and controlling socket or thread in each new
> > > GR> network namespace.
> > >
> > > My initial test was to create a veth pair and move one end into the
> > > namespace during create.  That failed in the same way, so I took the
> > > veth's out of the equation with the posted test.
> > >
> > > GR> This scales to a couple of thousand interfaces, though interface
> > > GR> creation takes a long time if more than 1,000 interfaces or so are
> > > GR> created.
> > >
> > This is at least to some degree due to the problems I mentioned earlier.
> > Enhancing the kernel name hash and the sysfs implementation improves
> > performance a lot.
> 
> Is this something you've had a chance to start addressing?  (Just wondering)
> 
Yes - I have code for the name hash change (just one line, really), and two variants
of code for sysfs - one that uses a hash result as 1st step of comparison before
doing a strcmp, and another which uses a hash table per directory in sysfs.
The latter is of course more efficient, but also more expensive in terms of memory usage. 

If there is interest, I can submit a patchset once I find out how exactly to do it ;-).

Guenter

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] Re: [RFC][PATCH] IP address restricting cgroup subsystem
  2009-01-09 21:38               ` [Devel] " Paul Menage
@ 2009-01-10  4:50                 ` Li Zefan
  2009-01-10 16:14                   ` Paul Menage
  0 siblings, 1 reply; 33+ messages in thread
From: Li Zefan @ 2009-01-10  4:50 UTC (permalink / raw)
  To: Paul Menage; +Cc: Grzegorz Nosek, containers, netdev

Paul Menage wrote:
> On Wed, Jan 7, 2009 at 1:33 AM, Li Zefan <lizf@cn.fujitsu.com> wrote:
>>> Yes, I like that. Will update the patch. I assume that I must check
>>> list_empty(&cgroup->children)?
>> Yes.
>>
>>> Should I use cgroup_lock()/cgroup_unlock()
>> Yes.
> 
> For checking the "children" list, you can just lock
> ipaddr_subsys.hierarchy_mutex.
> 

Unfortunately hierarchy_mutex can't be used here, since hierarchy_mutex
doesn't protect subsys's create() method, and the create() will access
parent cgroup's data.



^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]     ` <6599ad830901091358m11effdbegeff6cbb7ee28e262-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2009-01-10 11:20       ` Grzegorz Nosek
       [not found]         ` <20090110112009.GA12336-yp6mvK3Bdd2rDJvtcaxF/A@public.gmane.org>
  0 siblings, 1 reply; 33+ messages in thread
From: Grzegorz Nosek @ 2009-01-10 11:20 UTC (permalink / raw)
  To: Paul Menage; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

On pią, sty 09, 2009 at 01:58:42 -0800, Paul Menage wrote:
> While something to allow restricting network access from tasks in a
> cgroup is useful, the basic problem with the patches that have been
> proposed is the userspace API.
> 
> Once an API makes it into the kernel, we have to support it more or
> less indefinitely. So that means we want to come up with something
> that will satisfy say 95% of users *before* it gets anywhere near
> mainline.

Certainly. The patch is RFC, not mainline-me-now. While it scratches my
current itch, I'd definitely want it to be more useful. I just wanted to
receive some comments on the overall idea before I invest way too much
time in it.

> So for example, some people might want multiple IP addresses; others
> might want to specify a subnet, but exclude certain addresses;
> controlling which ports or port-ranges can be bound to is also useful
> (and in fact is what I'd be most interested in).

...and some people mignt not want the loopback special case. So we'd
need a black- and whitelist of:

IP address [/netmask] [port [- port]]

right? Would that cover a reasonable set of use cases? If there are
going to be multiple addresses, we'd probably want some mechanism to
determine which one should be used for remapping INADDR_ANY. BTW, do you
want to restrict connect() source ports too?

> Ideally we'd avoid making up a brand new userspace API for this. It
> would be great if we could somehow make use of the iptables API, which
> already has support for specifying these kinds of conditions.

The iptables interface is nice but only works with network packets and
not sockets and I'd find bind() remapping via iptables rather strange.
I'm currently using iptables to SNAT outgoing connections per uid but
I find the cgroup idea rather appealing (as many resources as possible
managed from a single virtual directory with simple shell tools).

So, are you opposed to the current implementation (single IP address) or
to the interface (a file in cgroupfs)?

> I did once hack together a proof-of-concept that let you use iptables
> for controlling connect/accept/bind operations, but it was a complete
> mess and wouldn't survive code review :-)

<shudder> :)

Best regards,
 Grzegorz Nosek
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] Re: [RFC][PATCH] IP address restricting cgroup subsystem
  2009-01-10  4:50                 ` Li Zefan
@ 2009-01-10 16:14                   ` Paul Menage
  2009-01-12  2:20                     ` Li Zefan
  0 siblings, 1 reply; 33+ messages in thread
From: Paul Menage @ 2009-01-10 16:14 UTC (permalink / raw)
  To: Li Zefan; +Cc: Grzegorz Nosek, containers, netdev

On Fri, Jan 9, 2009 at 8:50 PM, Li Zefan <lizf@cn.fujitsu.com> wrote:
>>
>> For checking the "children" list, you can just lock
>> ipaddr_subsys.hierarchy_mutex.
>>
>
> Unfortunately hierarchy_mutex can't be used here, since hierarchy_mutex
> doesn't protect subsys's create() method, and the create() will access
> parent cgroup's data.
>

But that can be solved by putting a spinlock in the ipaddr_cgroup
structure and taking it in the write handler (and the connect/bind
handlers, which should also be using RCU), and taking the parent
structure's lock before copying from it in the create callback. No
need for something as heavy as cgroup_lock().

Paul

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]         ` <20090110112009.GA12336-yp6mvK3Bdd2rDJvtcaxF/A@public.gmane.org>
@ 2009-01-10 16:21           ` Paul Menage
       [not found]             ` <6599ad830901100821q2c943d38i314c00f7db51b4f0-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 33+ messages in thread
From: Paul Menage @ 2009-01-10 16:21 UTC (permalink / raw)
  To: Grzegorz Nosek; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

On Sat, Jan 10, 2009 at 3:20 AM, Grzegorz Nosek <root-AfQBxy1nhrQ00sYp1HPQUA@public.gmane.org> wrote:
>
> IP address [/netmask] [port [- port]]
>
> right? Would that cover a reasonable set of use cases?

Oh, and don't forget being able to control remote addresses/ports too.
E.g. you might not care what local port/address something binds to (or
there may only be one local address anyway) but you might want to
restrict a cgroup from e.g. connecting outside your data center, etc.
(Something that I'm interested in).

> If there are
> going to be multiple addresses, we'd probably want some mechanism to
> determine which one should be used for remapping INADDR_ANY. BTW, do you
> want to restrict connect() source ports too?

Potentially, yes.

>
> The iptables interface is nice but only works with network packets and
> not sockets

But converting a socket definition into a packet header that would be
sent/received on that socket is a fairly mechanical operation, and
after that you have the entire flexibility of the iptables API
available. So the connect()  operation would construct a fake packet
header and send it through the iptable associated with the current
cgroup; if the packet was accepted the operation was permitted, else
the operation was denied.

> So, are you opposed to the current implementation (single IP address) or
> to the interface (a file in cgroupfs)?

Primarily the interface - changing the code later is simple.

Paul

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]             ` <6599ad830901100821q2c943d38i314c00f7db51b4f0-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2009-01-11  0:25               ` Benny Amorsen
  2009-01-11 10:19               ` Grzegorz Nosek
  1 sibling, 0 replies; 33+ messages in thread
From: Benny Amorsen @ 2009-01-11  0:25 UTC (permalink / raw)
  To: Paul Menage
  Cc: Grzegorz Nosek,
	public-containers-qjLDD68F18O7TbgM5vRIOg-z5DuStaUktnZ+VzJOa5vwg



Paul Menage <menage-hpIqsD4AKlfQT0dZR+AlfA-XMD5yJDbdMReXY1tMh2IBg@public.gmane.org> writes:

> Oh, and don't forget being able to control remote addresses/ports too.
> E.g. you might not care what local port/address something binds to (or
> there may only be one local address anyway) but you might want to
> restrict a cgroup from e.g. connecting outside your data center, etc.
> (Something that I'm interested in).

If it's going to be that advanced, it will end up either like iptables
or like routing tables.

It is a bit much to expect normal applications to use either, but
iptables is especially complicated. I am a little bit tempted by
something resembling routing/rule tables, but it would obviously have
to be a bit more limited. E.g. gateway addresses should not be stored
there at all.

There is also the classic question: What happens if you invoke a
setuid or setgid executable with restrictions in effect? It is hard to
guarantee that this isn't exploitable in any way.


/Benny

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]             ` <6599ad830901100821q2c943d38i314c00f7db51b4f0-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2009-01-11  0:25               ` Benny Amorsen
@ 2009-01-11 10:19               ` Grzegorz Nosek
       [not found]                 ` <20090111101946.GA14325-yp6mvK3Bdd2rDJvtcaxF/A@public.gmane.org>
  1 sibling, 1 reply; 33+ messages in thread
From: Grzegorz Nosek @ 2009-01-11 10:19 UTC (permalink / raw)
  To: Paul Menage; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

On sob, sty 10, 2009 at 08:21:53 -0800, Paul Menage wrote:
> But converting a socket definition into a packet header that would be
> sent/received on that socket is a fairly mechanical operation, and
> after that you have the entire flexibility of the iptables API
> available. So the connect()  operation would construct a fake packet
> header and send it through the iptable associated with the current
> cgroup; if the packet was accepted the operation was permitted, else
> the operation was denied.

So if I understand you right, your proposed solution would be something
akin to ipt_cgroup (matching packets originating from a cgroup, like
ipt_owner matches uid/gid) plus netfilter hooks for blocking/remapping
addresses passed to connect() and/or bind()? Or maybe a dedicated
netfilter table with per-cgroup chains?

I'd rather not invent some new userspace tools (to use the iptables API
without /sbin/iptables) to manage the cgroup networking so I guess
extending iptables (or iproute) would be the way to go in that case.

Using the iptables API with connect() sending a fake packet, how would
you represent "allow this connection, but bind() to 10.0.0.1 first"?
Rewrite the source address in an iptables target?

Am I right or completely off the mark?

Best regards,
 Grzegorz Nosek

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] Re: [RFC][PATCH] IP address restricting cgroup subsystem
  2009-01-10 16:14                   ` Paul Menage
@ 2009-01-12  2:20                     ` Li Zefan
  2009-01-14  2:07                       ` Paul Menage
  0 siblings, 1 reply; 33+ messages in thread
From: Li Zefan @ 2009-01-12  2:20 UTC (permalink / raw)
  To: Paul Menage; +Cc: Grzegorz Nosek, containers, netdev

Paul Menage wrote:
> On Fri, Jan 9, 2009 at 8:50 PM, Li Zefan <lizf@cn.fujitsu.com> wrote:
>>> For checking the "children" list, you can just lock
>>> ipaddr_subsys.hierarchy_mutex.
>>>
>> Unfortunately hierarchy_mutex can't be used here, since hierarchy_mutex
>> doesn't protect subsys's create() method, and the create() will access
>> parent cgroup's data.
>>
> 
> But that can be solved by putting a spinlock in the ipaddr_cgroup
> structure and taking it in the write handler (and the connect/bind
> handlers, which should also be using RCU), and taking the parent
> structure's lock before copying from it in the create callback. No
> need for something as heavy as cgroup_lock().
> 

Still won't work. :(

================
lock(ipaddr_subsys.hierarchy_mutex);

if (list_empty(&cgrp->children) &&
    parent->ipv4_addr == INADDR_ANY)
						create()
						  parent->spin_lock();
						  cgrp->ipv4_addr = parent->ipv4_addr;
						  parent->spin_unlock();
	ipcgroup->spin_lock();
	ipcgroup->ipv4_addr = new_addr;
	ipcgroup->spin_unlock();

unlock(ipaddr_subsys.hierarchy_mutex);
						list_add(&cgrp->sibling, &cgrp->parent->children);
================

And neither can it work to hold hierarchy_mutex in subsys's create().

I think the only way to make hierarchy_mutex works for this issue is:

@@ -2403,16 +2403,18 @@ static long cgroup_create(struct cgroup *parent, struct
        if (notify_on_release(parent))
                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

+       cgroup_lock_hierarchy(root);
+
        for_each_subsys(root, ss) {
                struct cgroup_subsys_state *css = ss->create(ss, cgrp);
                if (IS_ERR(css)) {
+                       cgroup_unlock_hierarchy(root);
                        err = PTR_ERR(css);
                        goto err_destroy;
                }
                init_cgroup_css(css, ss, cgrp);
        }

-       cgroup_lock_hierarchy(root);
        list_add(&cgrp->sibling, &cgrp->parent->children);
        cgroup_unlock_hierarchy(root);
        root->number_of_cgroups++;



^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]                     ` <20090109183046.GA14063-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2009-01-13 16:23                       ` Dan Smith
  0 siblings, 0 replies; 33+ messages in thread
From: Dan Smith @ 2009-01-13 16:23 UTC (permalink / raw)
  To: Serge E. Hallyn; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

SH> With Linus' tree (pulled yesterday) I get no errors.  Can you
SH> bisect?  (Assuming you don't also get the error on plain
SH> linux-2.6)

As of next-20090112 the crash I was seeing seems to have been fixed.

-- 
Dan Smith
IBM Linux Technology Center
email: danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] Re: [RFC][PATCH] IP address restricting cgroup subsystem
  2009-01-12  2:20                     ` Li Zefan
@ 2009-01-14  2:07                       ` Paul Menage
  2009-01-14  2:47                         ` Li Zefan
  0 siblings, 1 reply; 33+ messages in thread
From: Paul Menage @ 2009-01-14  2:07 UTC (permalink / raw)
  To: Li Zefan; +Cc: Grzegorz Nosek, containers, netdev

>
> I think the only way to make hierarchy_mutex works for this issue is:
>
> @@ -2403,16 +2403,18 @@ static long cgroup_create(struct cgroup *parent, struct
>        if (notify_on_release(parent))
>                set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
>
> +       cgroup_lock_hierarchy(root);
> +
>        for_each_subsys(root, ss) {
>                struct cgroup_subsys_state *css = ss->create(ss, cgrp);
>                if (IS_ERR(css)) {
> +                       cgroup_unlock_hierarchy(root);
>                        err = PTR_ERR(css);
>                        goto err_destroy;
>                }
>                init_cgroup_css(css, ss, cgrp);
>        }
>
> -       cgroup_lock_hierarchy(root);
>        list_add(&cgrp->sibling, &cgrp->parent->children);
>        cgroup_unlock_hierarchy(root);
>        root->number_of_cgroups++;
>

That would be possible, but I'm not sure that extending
hierarchy_mutex across all the create calls is a good idea - it's
meant to be very lightweight.

OK, an alternative way to avoid cgroup_lock() is for the
spinlock-protected state in ipcgroup to be the address and the count
of active children.

create() does:

lock parent
css->addr = parent->addr
parent->child_count++;
unlock parent

and write does:

lock css
if (!css->child_count) {
  css->addr = new_addr
} else {
  report error;
}
unlock css

Paul

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] [RFC][PATCH] IP address restricting cgroup subsystem
       [not found]                 ` <20090111101946.GA14325-yp6mvK3Bdd2rDJvtcaxF/A@public.gmane.org>
@ 2009-01-14  2:21                   ` Paul Menage
  0 siblings, 0 replies; 33+ messages in thread
From: Paul Menage @ 2009-01-14  2:21 UTC (permalink / raw)
  To: Grzegorz Nosek; +Cc: containers-qjLDD68F18O7TbgM5vRIOg

[-- Attachment #1: Type: text/plain, Size: 1945 bytes --]

On Sun, Jan 11, 2009 at 2:19 AM, Grzegorz Nosek <root-AfQBxy1nhrQ00sYp1HPQUA@public.gmane.org> wrote:
>
> So if I understand you right, your proposed solution would be something
> akin to ipt_cgroup (matching packets originating from a cgroup, like
> ipt_owner matches uid/gid) plus netfilter hooks for blocking/remapping
> addresses passed to connect() and/or bind()? Or maybe a dedicated
> netfilter table with per-cgroup chains?

Yes, something like one of those options. But it would never need to
be actually matching real packets in the data path - just
connect/bind/accept requests in the control path.

>
> Using the iptables API with connect() sending a fake packet, how would
> you represent "allow this connection, but bind() to 10.0.0.1 first"?
> Rewrite the source address in an iptables target?

Hmm, I hadn't considered that - I'd just been thinking of permit/deny
decisions. But you're right, a rewrite rule might be a natural way to
do this.

Clearly this feature would only use a small subset of the available
iptables API, so in that sense it might be overkill. But avoiding
inventing a complex new API seems worth the potential overkill.

I've attached the vague prototype that I was playing with a few months
ago. It's missing some of the bits that it would need:

- it uses the NF_INET_LOCAL_OUT table rather than a new
NF_INET_CONTROL table, because trying to edit/recompile the iptables
userspace binary to handle a new table proved to be too painful for
this prototype. (i.e. it currently does use the fast path checks, but
it really shouldn't ...)

- it only currently handles connect() - no bind() or accept()

- it doesn't have a cgroup-specific iptables filter yet - it just
provides a system-wide control over connections. Adding a per-group
filter would be pretty easy, I think

As it stands, it's sufficient to express complex rules like "disallow
connections to a remote sshd port, except on host H", etc.

Paul

[-- Attachment #2: netfilter_control_hook.patch --]
[-- Type: text/x-patch, Size: 11694 bytes --]

---
 include/linux/netfilter.h            |    1 
 include/linux/netfilter_ipv4.h       |    7 +
 include/net/netns/ipv4.h             |    3 
 net/ipv4/netfilter/Kconfig           |    7 +
 net/ipv4/netfilter/Makefile          |    3 
 net/ipv4/netfilter/ip_tables.c       |    7 +
 net/ipv4/netfilter/iptable_control.c |  167 +++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_ipv4.c                  |    6 +
 net/netfilter/xt_tcpudp.c            |   16 +++
 9 files changed, 214 insertions(+), 3 deletions(-)

Index: netfilter-2.6.25-rc3/include/linux/netfilter.h
===================================================================
--- netfilter-2.6.25-rc3.orig/include/linux/netfilter.h
+++ netfilter-2.6.25-rc3/include/linux/netfilter.h
@@ -47,6 +47,7 @@ enum nf_inet_hooks {
 	NF_INET_FORWARD,
 	NF_INET_LOCAL_OUT,
 	NF_INET_POST_ROUTING,
+//	NF_INET_CONTROL,
 	NF_INET_NUMHOOKS
 };
 
Index: netfilter-2.6.25-rc3/include/net/netns/ipv4.h
===================================================================
--- netfilter-2.6.25-rc3.orig/include/net/netns/ipv4.h
+++ netfilter-2.6.25-rc3/include/net/netns/ipv4.h
@@ -33,5 +33,8 @@ struct netns_ipv4 {
 	struct xt_table		*iptable_raw;
 	struct xt_table		*arptable_filter;
 #endif
+#ifdef CONFIG_IP_NF_CONTROL
+	struct xt_table		*iptable_control;
+#endif     
 };
 #endif
Index: netfilter-2.6.25-rc3/net/ipv4/netfilter/Kconfig
===================================================================
--- netfilter-2.6.25-rc3.orig/net/ipv4/netfilter/Kconfig
+++ netfilter-2.6.25-rc3/net/ipv4/netfilter/Kconfig
@@ -281,6 +281,13 @@ config NF_NAT_SIP
 	depends on IP_NF_IPTABLES && NF_CONNTRACK && NF_NAT
 	default NF_NAT && NF_CONNTRACK_SIP
 
+config IP_NF_CONTROL
+	tristate "Connection control"
+	depends on IP_NF_IPTABLES
+	default m
+	help
+	  This option adds a control hook/table
+
 # mangle + specific targets
 config IP_NF_MANGLE
 	tristate "Packet mangling"
Index: netfilter-2.6.25-rc3/net/ipv4/netfilter/Makefile
===================================================================
--- netfilter-2.6.25-rc3.orig/net/ipv4/netfilter/Makefile
+++ netfilter-2.6.25-rc3/net/ipv4/netfilter/Makefile
@@ -34,11 +34,12 @@ obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
 
-# the three instances of ip_tables
+# the five instances of ip_tables
 obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
 obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
 obj-$(CONFIG_NF_NAT) += iptable_nat.o
 obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
+obj-$(CONFIG_IP_NF_CONTROL) += iptable_control.o
 
 # matches
 obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
Index: netfilter-2.6.25-rc3/net/ipv4/netfilter/iptable_control.c
===================================================================
--- /dev/null
+++ netfilter-2.6.25-rc3/net/ipv4/netfilter/iptable_control.c
@@ -0,0 +1,167 @@
+/*
+ * 'control' table, used for controlling operations such as bind() or connect()
+ *
+ * Copyright (C) 2007 Paul Menage <menage@google.com>
+ * Cloned from code originally by Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+
+#define CONTROL_VALID_HOOKS (1 << NF_INET_LOCAL_OUT)
+
+static struct
+{
+	struct ipt_replace repl;
+	struct ipt_standard entries[1];
+	struct ipt_error term;
+} initial_table __net_initdata = {
+	.repl = {
+		.name = "control",
+		.valid_hooks = CONTROL_VALID_HOOKS,
+		.num_entries = 2,
+		.size = sizeof(struct ipt_standard) * 1 + sizeof(struct ipt_error),
+		.hook_entry = {
+			[NF_INET_LOCAL_OUT] = 0,
+		},
+		.underflow = {
+			[NF_INET_LOCAL_OUT] = 0,
+		},
+	},
+	.entries = {
+		IPT_STANDARD_INIT(NF_ACCEPT),	/* CONTROL */
+	},
+	.term = IPT_ERROR_INIT,			/* ERROR */
+};
+
+static struct xt_table packet_control = {
+	.name = "control",
+	.valid_hooks =  CONTROL_VALID_HOOKS,
+	.lock = RW_LOCK_UNLOCKED,
+	.me = THIS_MODULE,
+	.af = AF_INET,
+};
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+ipt_hook(unsigned int hook,
+	 struct sk_buff *skb,
+	 const struct net_device *in,
+	 const struct net_device *out,
+	 int (*okfn)(struct sk_buff *))
+{
+	/* We don't actually want to do anything in the real hook. (We
+	 * should actually have a separate hook, but handling that
+	 * from userspace is non-trivial. */
+	return NF_ACCEPT;
+}
+
+int ipt_control_check(int protocol,
+		      struct inet_sock *sock,
+		      struct sockaddr_in *remote)
+{
+	int err = 0;
+	struct sk_buff *skb = alloc_skb(MAX_TCP_HEADER, GFP_USER);
+	struct iphdr *iph;
+	struct tcphdr *th;
+	int verdict;
+	if (skb == NULL) {
+		return -ENOMEM;
+	}
+
+	/* Allow the "owner" module to work */
+	skb->sk = &sock->sk;
+
+	/* Set up a fake TCP/UDP packet */
+	iph = (struct iphdr *)skb_put(skb, sizeof(*iph));
+	skb_reset_network_header(skb);
+	memset(iph, 0, sizeof(*iph));
+	iph->version = 4;
+	iph->protocol = protocol;
+	iph->saddr = sock->rcv_saddr;
+	iph->daddr = remote->sin_addr.s_addr;
+	iph->ihl = sizeof(*iph) / 4;
+	iph->tot_len = sizeof(*iph) + sizeof(*th);
+	th = (struct tcphdr *)skb_put(skb, sizeof(*th));
+	memset(th, 0, sizeof(*th));
+	skb_set_transport_header(skb, sizeof(*iph));
+	th->source = sock->num;
+	th->dest = remote->sin_port;
+
+#if 0
+	printk(KERN_ERR "Calling ipt_do_table for %08x:%04x -> %08x:%04x. iph = %p, th = %p, data = %p, neth = %p, transh = %p\n",
+	       sock->rcv_saddr, sock->num, remote->sin_addr.s_addr, remote->sin_port, iph, th, skb->data, skb->network_header, skb->transport_header); 
+#endif
+	verdict = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, NULL, init_net.ipv4.iptable_control);
+
+	//printk(KERN_ERR "Verdict = %d\n", verdict);
+	if (verdict != NF_ACCEPT) {
+		err = -EPERM;
+	}
+	kfree_skb(skb);
+	return err;
+}
+
+static struct nf_hook_ops control_ipt_ops[] __read_mostly = {
+	{
+		.hook = ipt_hook,
+		.pf = PF_INET,
+		.hooknum = NF_INET_LOCAL_OUT,
+		.owner = THIS_MODULE,
+	},
+};
+
+static int __net_init iptable_control_net_init(struct net *net)
+{
+	/* Register table */
+	net->ipv4.iptable_control =
+		ipt_register_table(net, &packet_control, &initial_table.repl);
+	if (IS_ERR(net->ipv4.iptable_control)) {
+		int errno = PTR_ERR(net->ipv4.iptable_control);
+		printk(KERN_ERR "Failed to register control table: %d\n",
+		       errno);
+		return errno;
+	}
+	return 0;
+}
+
+static void __net_exit iptable_control_net_exit(struct net *net)
+{
+	ipt_unregister_table(net->ipv4.iptable_control);
+}
+
+static struct pernet_operations iptable_control_net_ops = {
+	.init = iptable_control_net_init,
+	.exit = iptable_control_net_exit,
+};
+
+static int __init iptable_control_init(void)
+{
+	int ret;
+
+	ret = register_pernet_subsys(&iptable_control_net_ops);
+	if (ret < 0)
+		return ret;
+
+	/* Register hooks */
+	ret = nf_register_hooks(control_ipt_ops, ARRAY_SIZE(control_ipt_ops));
+	if (ret < 0)
+		goto cleanup_table;
+
+	return ret;
+
+ cleanup_table:
+	unregister_pernet_subsys(&iptable_control_net_ops);
+	return ret;
+}
+
+static void __exit iptable_control_fini(void)
+{
+	nf_unregister_hooks(control_ipt_ops, ARRAY_SIZE(control_ipt_ops));
+	unregister_pernet_subsys(&iptable_control_net_ops);
+}
+
+module_init(iptable_control_init);
+module_exit(iptable_control_fini);
+MODULE_LICENSE("GPL");
Index: netfilter-2.6.25-rc3/include/linux/netfilter_ipv4.h
===================================================================
--- netfilter-2.6.25-rc3.orig/include/linux/netfilter_ipv4.h
+++ netfilter-2.6.25-rc3/include/linux/netfilter_ipv4.h
@@ -48,6 +48,7 @@
 #define NF_IP_LOCAL_OUT		3
 /* Packets about to hit the wire. */
 #define NF_IP_POST_ROUTING	4
+//#define NF_IP_CONTROL           5
 #define NF_IP_NUMHOOKS		5
 #endif /* ! __KERNEL__ */
 
@@ -79,6 +80,12 @@ extern int ip_route_me_harder(struct sk_
 extern int ip_xfrm_me_harder(struct sk_buff *skb);
 extern __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
 				   unsigned int dataoff, u_int8_t protocol);
+
+struct inet_sock;
+int ipt_control_check(int protocol,
+		      struct inet_sock *sock,
+		      struct sockaddr_in *remote);				    
+
 #endif /*__KERNEL__*/
 
 #endif /*__LINUX_IP_NETFILTER_H*/
Index: netfilter-2.6.25-rc3/net/ipv4/netfilter/ip_tables.c
===================================================================
--- netfilter-2.6.25-rc3.orig/net/ipv4/netfilter/ip_tables.c
+++ netfilter-2.6.25-rc3/net/ipv4/netfilter/ip_tables.c
@@ -32,9 +32,9 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("IPv4 packet filter");
 
-/*#define DEBUG_IP_FIREWALL*/
+#define DEBUG_IP_FIREWALL
 /*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
-/*#define DEBUG_IP_FIREWALL_USER*/
+#define DEBUG_IP_FIREWALL_USER
 
 #ifdef DEBUG_IP_FIREWALL
 #define dprintf(format, args...)  printk(format , ## args)
@@ -231,6 +231,9 @@ static const char *const hooknames[] = {
 	[NF_INET_FORWARD]		= "FORWARD",
 	[NF_INET_LOCAL_OUT]		= "OUTPUT",
 	[NF_INET_POST_ROUTING]		= "POSTROUTING",
+#ifdef CONFIG_IP_NF_CONTROL
+	[NF_INET_CONTROL]		= "CONTROL",
+#endif
 };
 
 enum nf_ip_trace_comments {
Index: netfilter-2.6.25-rc3/net/ipv4/tcp_ipv4.c
===================================================================
--- netfilter-2.6.25-rc3.orig/net/ipv4/tcp_ipv4.c
+++ netfilter-2.6.25-rc3/net/ipv4/tcp_ipv4.c
@@ -82,6 +82,8 @@
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
 
+#include <linux/netfilter_ipv4.h>
+
 int sysctl_tcp_tw_reuse __read_mostly;
 int sysctl_tcp_low_latency __read_mostly;
 
@@ -166,6 +168,10 @@ int tcp_v4_connect(struct sock *sk, stru
 	if (usin->sin_family != AF_INET)
 		return -EAFNOSUPPORT;
 
+	if ((err = ipt_control_check(IPPROTO_TCP, inet, usin))) {
+		return err;
+	}
+				    
 	nexthop = daddr = usin->sin_addr.s_addr;
 	if (inet->opt && inet->opt->srr) {
 		if (!daddr)
Index: netfilter-2.6.25-rc3/net/netfilter/xt_tcpudp.c
===================================================================
--- netfilter-2.6.25-rc3.orig/net/netfilter/xt_tcpudp.c
+++ netfilter-2.6.25-rc3/net/netfilter/xt_tcpudp.c
@@ -19,6 +19,7 @@ MODULE_ALIAS("ipt_tcp");
 MODULE_ALIAS("ip6t_udp");
 MODULE_ALIAS("ip6t_tcp");
 
+#define DEBUG_IP_FIREWALL_USER
 #ifdef DEBUG_IP_FIREWALL_USER
 #define duprintf(format, args...) printk(format , ## args)
 #else
@@ -75,6 +76,8 @@ tcp_mt(const struct sk_buff *skb, const 
 	struct tcphdr _tcph, *th;
 	const struct xt_tcp *tcpinfo = matchinfo;
 
+	printk(KERN_ERR "In tcp_mt\n");
+
 	if (offset) {
 		/* To quote Alan:
 
@@ -93,6 +96,8 @@ tcp_mt(const struct sk_buff *skb, const 
 #define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg)))
 
 	th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph);
+	
+	printk(KERN_ERR "th=%p\n", th);
 	if (th == NULL) {
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
@@ -101,18 +106,29 @@ tcp_mt(const struct sk_buff *skb, const 
 		return false;
 	}
 
+	duprintf("Checking source ports\n");
+
 	if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
 			ntohs(th->source),
 			!!(tcpinfo->invflags & XT_TCP_INV_SRCPT)))
 		return false;
+
+	duprintf("Checking dest ports %d - %d vs %d\n", tcpinfo->dpts[0], tcpinfo->dpts[1], ntohs(th->dest));
+
 	if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
 			ntohs(th->dest),
 			!!(tcpinfo->invflags & XT_TCP_INV_DSTPT)))
 		return false;
+
+	duprintf("Checking flags\n");
+
 	if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask)
 		      == tcpinfo->flg_cmp,
 		      XT_TCP_INV_FLAGS))
 		return false;
+
+	duprintf("Checking options\n");
+
 	if (tcpinfo->option) {
 		if (th->doff * 4 < sizeof(_tcph)) {
 			*hotdrop = true;

[-- Attachment #3: Type: text/plain, Size: 206 bytes --]

_______________________________________________
Containers mailing list
Containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA@public.gmane.org
https://lists.linux-foundation.org/mailman/listinfo/containers

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] Re: [RFC][PATCH] IP address restricting cgroup subsystem
  2009-01-14  2:07                       ` Paul Menage
@ 2009-01-14  2:47                         ` Li Zefan
  2009-01-14  2:50                           ` Paul Menage
  0 siblings, 1 reply; 33+ messages in thread
From: Li Zefan @ 2009-01-14  2:47 UTC (permalink / raw)
  To: Paul Menage; +Cc: Grzegorz Nosek, containers, netdev

> That would be possible, but I'm not sure that extending
> hierarchy_mutex across all the create calls is a good idea - it's
> meant to be very lightweight.
> 

agree

> OK, an alternative way to avoid cgroup_lock() is for the
> spinlock-protected state in ipcgroup to be the address and the count
> of active children.
> 

This works. But:

- we put extra burden on subsystem developers.
- hierarchy_mutex can't do what we expect, and it's a bit subtle.
- there won't be performance problem or potential lock issue to use
  cgroup_mutex in subsys' simple write functions, so I don't think
  we have to avoid cgroup_mutex here.

In memcg, both mem_cgroup_hierarchy_write() and mem_cgroup_swappiness_write()
check cgrp->children list, similar to ipv4_write() here. And I'm going
to fix swappiness_write() for it doesn't hold cgroup_lock(), but if
avoiding cgroup_lock() is the direction, then I have to use this
alternative way you sugguested.

> create() does:
> 
> lock parent
> css->addr = parent->addr
> parent->child_count++;
> unlock parent
> 
> and write does:
> 
> lock css
> if (!css->child_count) {
>   css->addr = new_addr
> } else {
>   report error;
> }
> unlock css
> 
> Paul
> 
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [Devel] Re: [RFC][PATCH] IP address restricting cgroup subsystem
  2009-01-14  2:47                         ` Li Zefan
@ 2009-01-14  2:50                           ` Paul Menage
  0 siblings, 0 replies; 33+ messages in thread
From: Paul Menage @ 2009-01-14  2:50 UTC (permalink / raw)
  To: Li Zefan; +Cc: Grzegorz Nosek, containers, netdev

On Tue, Jan 13, 2009 at 6:47 PM, Li Zefan <lizf@cn.fujitsu.com> wrote:
>
> - we put extra burden on subsystem developers.

Better than putting extra burden on cgroup_lock() :-)

Anyway, if this turns out to be a common operation, we can provide
support for it in the framework rather than having each subsystem
duplicate the effort.

Paul

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2009-01-14  2:50 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-01-06 23:05 [RFC][PATCH] IP address restricting cgroup subsystem Grzegorz Nosek
     [not found] ` <20090106230554.GB25228-IaEwMO9oKu/77SC2UrCW1JJg/dWx8T/9@public.gmane.org>
2009-01-07  6:01   ` Li Zefan
     [not found]     ` <49644526.8030205-BthXqXjhjHXQFUHtdCDX3A@public.gmane.org>
2009-01-07  7:38       ` Grzegorz Nosek
2009-01-07  8:36         ` Li Zefan
2009-01-07  9:16           ` Grzegorz Nosek
2009-01-07  9:33             ` Li Zefan
2009-01-07  9:37               ` Grzegorz Nosek
2009-01-09 21:38               ` [Devel] " Paul Menage
2009-01-10  4:50                 ` Li Zefan
2009-01-10 16:14                   ` Paul Menage
2009-01-12  2:20                     ` Li Zefan
2009-01-14  2:07                       ` Paul Menage
2009-01-14  2:47                         ` Li Zefan
2009-01-14  2:50                           ` Paul Menage
2009-01-07 18:07   ` Serge E. Hallyn
     [not found]     ` <20090107180752.GA19153-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-01-07 19:15       ` Grzegorz Nosek
     [not found]         ` <20090107191536.GA15159-yp6mvK3Bdd2rDJvtcaxF/A@public.gmane.org>
2009-01-07 19:32           ` Serge E. Hallyn
     [not found]             ` <20090107193234.GA22625-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-01-08 12:43               ` Benny Amorsen
     [not found]                 ` <20090109144122.GA9685@megiteam.pl>
     [not found]                   ` <20090109162247.GA7925@us.ibm.com>
     [not found]                     ` <20090109162247.GA7925-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-01-09 16:57                       ` Grzegorz Nosek
2009-01-09 16:54               ` Dan Smith
     [not found]                 ` <87priwifnu.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-01-09 17:43                   ` Guenter Roeck
     [not found]                     ` <20090109174334.GA4526-gvzKVTG1yJJBDgjK7y7TUQ@public.gmane.org>
2009-01-09 18:12                       ` Dan Smith
     [not found]                         ` <87ljtkic1j.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
2009-01-09 22:37                           ` Guenter Roeck
     [not found]                             ` <20090109223756.GA22738-gvzKVTG1yJJBDgjK7y7TUQ@public.gmane.org>
2009-01-09 22:47                               ` Serge E. Hallyn
     [not found]                                 ` <20090109224742.GA15227-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-01-09 23:37                                   ` Guenter Roeck
2009-01-09 18:30                   ` Serge E. Hallyn
     [not found]                     ` <20090109183046.GA14063-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2009-01-13 16:23                       ` Dan Smith
2009-01-09 21:58   ` [Devel] " Paul Menage
     [not found]     ` <6599ad830901091358m11effdbegeff6cbb7ee28e262-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-01-10 11:20       ` Grzegorz Nosek
     [not found]         ` <20090110112009.GA12336-yp6mvK3Bdd2rDJvtcaxF/A@public.gmane.org>
2009-01-10 16:21           ` Paul Menage
     [not found]             ` <6599ad830901100821q2c943d38i314c00f7db51b4f0-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-01-11  0:25               ` Benny Amorsen
2009-01-11 10:19               ` Grzegorz Nosek
     [not found]                 ` <20090111101946.GA14325-yp6mvK3Bdd2rDJvtcaxF/A@public.gmane.org>
2009-01-14  2:21                   ` Paul Menage

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.