Netdev List
 help / color / mirror / Atom feed
* [PATCH 13/14] userns: net: make many network capable calls targeted
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: dhowells, ebiederm, containers, netdev, akpm, Serge E. Hallyn
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge E. Hallyn <serge.hallyn@canonical.com>

When privilege is protected a namespaced network resource, then having
the required privilege targed toward the user namespace which owns the
resource suffices.

As with other patches, a big concern here is that we be cleanly separating
the cases where privilege protects a network resource from cases where
privilege can lead to laxer constraints on input and, subsequently,
the ability to corrupt, crash, or own the host kernel.

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 net/8021q/vlan.c                  |   12 ++++++------
 net/bridge/br_ioctl.c             |   22 +++++++++++-----------
 net/bridge/br_sysfs_br.c          |    8 ++++----
 net/bridge/br_sysfs_if.c          |    2 +-
 net/bridge/netfilter/ebtables.c   |    8 ++++----
 net/core/ethtool.c                |    2 +-
 net/ipv4/arp.c                    |    2 +-
 net/ipv4/devinet.c                |    4 ++--
 net/ipv4/fib_frontend.c           |    2 +-
 net/ipv4/ip_options.c             |    6 +++---
 net/ipv4/ip_sockglue.c            |    4 ++--
 net/ipv4/ipip.c                   |    4 ++--
 net/ipv4/ipmr.c                   |    2 +-
 net/ipv4/netfilter/arp_tables.c   |    8 ++++----
 net/ipv4/netfilter/ip_tables.c    |    8 ++++----
 net/netfilter/ipset/ip_set_core.c |    2 +-
 net/netfilter/ipvs/ip_vs_ctl.c    |    4 ++--
 net/packet/af_packet.c            |    2 +-
 18 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 8970ba1..7d12f63 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -558,7 +558,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
 	switch (args.cmd) {
 	case SET_VLAN_INGRESS_PRIORITY_CMD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			break;
 		vlan_dev_set_ingress_priority(dev,
 					      args.u.skb_priority,
@@ -568,7 +568,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
 
 	case SET_VLAN_EGRESS_PRIORITY_CMD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			break;
 		err = vlan_dev_set_egress_priority(dev,
 						   args.u.skb_priority,
@@ -577,7 +577,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
 
 	case SET_VLAN_FLAG_CMD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			break;
 		err = vlan_dev_change_flags(dev,
 					    args.vlan_qos ? args.u.flag : 0,
@@ -586,7 +586,7 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
 
 	case SET_VLAN_NAME_TYPE_CMD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			break;
 		if ((args.u.name_type >= 0) &&
 		    (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) {
@@ -602,14 +602,14 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
 
 	case ADD_VLAN_CMD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			break;
 		err = register_vlan_device(dev, args.u.VID);
 		break;
 
 	case DEL_VLAN_CMD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			break;
 		unregister_vlan_dev(dev, NULL);
 		err = 0;
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 7222fe1..c82f9cb 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -88,7 +88,7 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
 	struct net_device *dev;
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	dev = __dev_get_by_index(dev_net(br->dev), ifindex);
@@ -178,25 +178,25 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	}
 
 	case BRCTL_SET_BRIDGE_FORWARD_DELAY:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
 		return br_set_forward_delay(br, args[1]);
 
 	case BRCTL_SET_BRIDGE_HELLO_TIME:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
 		return br_set_hello_time(br, args[1]);
 
 	case BRCTL_SET_BRIDGE_MAX_AGE:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
 		return br_set_max_age(br, args[1]);
 
 	case BRCTL_SET_AGEING_TIME:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
 		br->ageing_time = clock_t_to_jiffies(args[1]);
@@ -236,14 +236,14 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 	}
 
 	case BRCTL_SET_BRIDGE_STP_STATE:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
 		br_stp_set_enabled(br, args[1]);
 		return 0;
 
 	case BRCTL_SET_BRIDGE_PRIORITY:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
 		spin_lock_bh(&br->lock);
@@ -256,7 +256,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		struct net_bridge_port *p;
 		int ret;
 
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
 		spin_lock_bh(&br->lock);
@@ -273,7 +273,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		struct net_bridge_port *p;
 		int ret;
 
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
 		spin_lock_bh(&br->lock);
@@ -330,7 +330,7 @@ static int old_deviceless(struct net *net, void __user *uarg)
 	{
 		char buf[IFNAMSIZ];
 
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
 		if (copy_from_user(buf, (void __user *)args[1], IFNAMSIZ))
@@ -360,7 +360,7 @@ int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uar
 	{
 		char buf[IFNAMSIZ];
 
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
 		if (copy_from_user(buf, uarg, IFNAMSIZ))
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 68b893e..7f4fa3a 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -36,7 +36,7 @@ static ssize_t store_bridge_parm(struct device *d,
 	unsigned long val;
 	int err;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	val = simple_strtoul(buf, &endp, 0);
@@ -132,7 +132,7 @@ static ssize_t store_stp_state(struct device *d,
 	char *endp;
 	unsigned long val;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	val = simple_strtoul(buf, &endp, 0);
@@ -267,7 +267,7 @@ static ssize_t store_group_addr(struct device *d,
 	unsigned new_addr[6];
 	int i;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	if (sscanf(buf, "%x:%x:%x:%x:%x:%x",
@@ -304,7 +304,7 @@ static ssize_t store_flush(struct device *d,
 {
 	struct net_bridge *br = to_bridge(d);
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	br_fdb_flush(br);
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 6229b62..9cb4d2e 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -209,7 +209,7 @@ static ssize_t brport_store(struct kobject * kobj,
 	char *endp;
 	unsigned long val;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(dev_net(p->br->dev)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	val = simple_strtoul(buf, &endp, 0);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 2b5ca1a..c403c45 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1462,7 +1462,7 @@ static int do_ebt_set_ctl(struct sock *sk,
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch(cmd) {
@@ -1484,7 +1484,7 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	struct ebt_replace tmp;
 	struct ebt_table *t;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)))
@@ -2275,7 +2275,7 @@ static int compat_do_ebt_set_ctl(struct sock *sk,
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2298,7 +2298,7 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd,
 	struct compat_ebt_replace tmp;
 	struct ebt_table *t;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	/* try real handler in case userland supplied needed padding */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6cdba5f..56878bf 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1676,7 +1676,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GFEATURES:
 		break;
 	default:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 	}
 
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96a164a..023ad24 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1175,7 +1175,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	switch (cmd) {
 	case SIOCDARP:
 	case SIOCSARP:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 	case SIOCGARP:
 		err = copy_from_user(&r, arg, sizeof(struct arpreq));
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 37b3c18..3683d37 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -728,7 +728,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 
 	case SIOCSIFFLAGS:
 		ret = -EACCES;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			goto out;
 		break;
 	case SIOCSIFADDR:	/* Set interface address (and family) */
@@ -736,7 +736,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	case SIOCSIFDSTADDR:	/* Set the destination address */
 	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */
 		ret = -EACCES;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			goto out;
 		ret = -EINVAL;
 		if (sin->sin_family != AF_INET)
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 92fc5f6..8f34a07 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -437,7 +437,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	switch (cmd) {
 	case SIOCADDRT:		/* Add a route */
 	case SIOCDELRT:		/* Delete a route */
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 
 		if (copy_from_user(&rt, arg, sizeof(rt)))
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ec93335..21df700 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -396,7 +396,7 @@ int ip_options_compile(struct net *net,
 					optptr[2] += 8;
 					break;
 				      default:
-					if (!skb && !capable(CAP_NET_RAW)) {
+					if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
 						pp_ptr = optptr + 3;
 						goto error;
 					}
@@ -432,7 +432,7 @@ int ip_options_compile(struct net *net,
 				opt->router_alert = optptr - iph;
 			break;
 		      case IPOPT_CIPSO:
-			if ((!skb && !capable(CAP_NET_RAW)) || opt->cipso) {
+			if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) {
 				pp_ptr = optptr;
 				goto error;
 			}
@@ -445,7 +445,7 @@ int ip_options_compile(struct net *net,
 		      case IPOPT_SEC:
 		      case IPOPT_SID:
 		      default:
-			if (!skb && !capable(CAP_NET_RAW)) {
+			if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) {
 				pp_ptr = optptr;
 				goto error;
 			}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index ab0c9ef..972c65f 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -955,13 +955,13 @@ mc_msf_out:
 	case IP_IPSEC_POLICY:
 	case IP_XFRM_POLICY:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 			break;
 		err = xfrm_user_policy(sk, optname, optval, optlen);
 		break;
 
 	case IP_TRANSPARENT:
-		if (!capable(CAP_NET_ADMIN)) {
+		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
 			err = -EPERM;
 			break;
 		}
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 378b20b..6725832 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -629,7 +629,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 	case SIOCADDTUNNEL:
 	case SIOCCHGTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			goto done;
 
 		err = -EFAULT;
@@ -689,7 +689,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 
 	case SIOCDELTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			goto done;
 
 		if (dev == ipn->fb_tunnel_dev) {
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 58e8791..309aa0c 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1204,7 +1204,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 
 	if (optname != MRT_INIT) {
 		if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
-		    !capable(CAP_NET_ADMIN))
+		    !ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EACCES;
 	}
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index fd7a3f6..acc908f 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1534,7 +1534,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1678,7 +1678,7 @@ static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user,
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1699,7 +1699,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1723,7 +1723,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 24e556e..72f2cde 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1847,7 +1847,7 @@ compat_do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user,
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1962,7 +1962,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1984,7 +1984,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2009,7 +2009,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index d7e86ef..38d69a5 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -1596,7 +1596,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 	void *data;
 	int copylen = *len, ret = 0;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 	if (optval != SO_IP_SET)
 		return -EBADF;
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index be43fd8..7eda0eb 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2284,7 +2284,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 	struct ip_vs_dest_user *udest_compat;
 	struct ip_vs_dest_user_kern udest;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
@@ -2566,7 +2566,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 	struct netns_ipvs *ipvs = net_ipvs(net);
 
 	BUG_ON(!net);
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c698cec..c2e6bb6 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1793,7 +1793,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
 	int err;
 
-	if (!capable(CAP_NET_RAW))
+	if (!ns_capable(net->user_ns, CAP_NET_RAW))
 		return -EPERM;
 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
 	    sock->type != SOCK_PACKET)
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 12/14] user_ns: target af_key capability check
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: dhowells, ebiederm, containers, netdev, akpm, Serge E. Hallyn
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge E. Hallyn <serge.hallyn@canonical.com>

This presumes that it really is complete wrt network namespaces.  Looking
at the code it appears to be.

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 net/key/af_key.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/key/af_key.c b/net/key/af_key.c
index 1e733e9..1f90f4e 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -141,7 +141,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
 	struct sock *sk;
 	int err;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 	if (sock->type != SOCK_RAW)
 		return -ESOCKTNOSUPPORT;
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 11/14] userns: make some net-sysfs capable calls targeted
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: dhowells, ebiederm, containers, netdev, akpm, Serge E. Hallyn
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge E. Hallyn <serge.hallyn@canonical.com>

Changelog: jul 1: fix compilation errors (net_device != net)

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 net/core/net-sysfs.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 1683e5d..876915b 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -76,7 +76,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
 	unsigned long new;
 	int ret = -EINVAL;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(dev_net(net)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	new = simple_strtoul(buf, &endp, 0);
@@ -261,7 +261,7 @@ static ssize_t store_ifalias(struct device *dev, struct device_attribute *attr,
 	size_t count = len;
 	ssize_t ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(dev_net(netdev)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	/* ignore trailing newline */
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 10/14] net/core/scm.c: target capable() calls to user_ns owning the net_ns
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: dhowells, ebiederm, containers, netdev, akpm, Serge E. Hallyn
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge E. Hallyn <serge.hallyn@canonical.com>

The uid/gid comparisons don't have to be pulled out.  This just seemed
more easily proved correct.

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 net/core/scm.c |   41 ++++++++++++++++++++++++++++++++++-------
 1 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/net/core/scm.c b/net/core/scm.c
index 4c1ef02..21b5d0b 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -43,17 +43,44 @@
  *	setu(g)id.
  */
 
-static __inline__ int scm_check_creds(struct ucred *creds)
+static __inline__ bool uidequiv(struct cred *src, struct ucred *tgt,
+			       struct user_namespace *ns)
+{
+	if (src->user_ns != ns)
+		goto check_capable;
+	if (src->uid == tgt->uid || src->euid == tgt->uid ||
+	    src->suid == tgt->uid)
+		return true;
+check_capable:
+	if (ns_capable(ns, CAP_SETUID))
+		return true;
+	return false;
+}
+
+static __inline__ bool gidequiv(struct cred *src, struct ucred *tgt,
+			       struct user_namespace *ns)
+{
+	if (src->user_ns != ns)
+		goto check_capable;
+	if (src->gid == tgt->gid || src->egid == tgt->gid ||
+	    src->sgid == tgt->gid)
+		return true;
+check_capable:
+	if (ns_capable(ns, CAP_SETGID))
+		return true;
+	return false;
+}
+
+static __inline__ int scm_check_creds(struct ucred *creds, struct socket *sock)
 {
 	const struct cred *cred = current_cred();
+	struct user_namespace *ns = sock_net(sock->sk)->user_ns;
 
-	if ((creds->pid == task_tgid_vnr(current) || capable(CAP_SYS_ADMIN)) &&
-	    ((creds->uid == cred->uid   || creds->uid == cred->euid ||
-	      creds->uid == cred->suid) || capable(CAP_SETUID)) &&
-	    ((creds->gid == cred->gid   || creds->gid == cred->egid ||
-	      creds->gid == cred->sgid) || capable(CAP_SETGID))) {
+	if ((creds->pid == task_tgid_vnr(current) || ns_capable(ns, CAP_SYS_ADMIN)) &&
+	     uidequiv(cred, creds, ns) && gidequiv(cred, creds, ns)) {
 	       return 0;
 	}
+
 	return -EPERM;
 }
 
@@ -169,7 +196,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
 			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred)))
 				goto error;
 			memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred));
-			err = scm_check_creds(&p->creds);
+			err = scm_check_creds(&p->creds, sock);
 			if (err)
 				goto error;
 
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 09/14] user ns: convert ipv6 to targeted capabilities
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: dhowells, ebiederm, containers, netdev, akpm, Serge E. Hallyn
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge E. Hallyn <serge.hallyn@canonical.com>

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 net/ipv6/addrconf.c             |    4 ++--
 net/ipv6/af_inet6.c             |    6 ++++--
 net/ipv6/datagram.c             |    6 +++---
 net/ipv6/ip6_flowlabel.c        |   24 ++++++++++++++----------
 net/ipv6/ip6_tunnel.c           |    4 ++--
 net/ipv6/ip6mr.c                |    2 +-
 net/ipv6/ipv6_sockglue.c        |    7 ++++---
 net/ipv6/netfilter/ip6_tables.c |    8 ++++----
 net/ipv6/route.c                |    2 +-
 net/ipv6/sit.c                  |   10 +++++-----
 10 files changed, 40 insertions(+), 33 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a06c53c..6b93e3b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2228,7 +2228,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
 	struct in6_ifreq ireq;
 	int err;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
@@ -2247,7 +2247,7 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg)
 	struct in6_ifreq ireq;
 	int err;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 3b5669a..1854ffe 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -160,7 +160,8 @@ lookup_protocol:
 	}
 
 	err = -EPERM;
-	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
+	if (sock->type == SOCK_RAW && !kern &&
+	    !ns_capable(net->user_ns, CAP_NET_RAW))
 		goto out_rcu_unlock;
 
 	sock->ops = answer->ops;
@@ -281,7 +282,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		return -EINVAL;
 
 	snum = ntohs(addr->sin6_port);
-	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+	if (snum && snum < PROT_SOCK &&
+	    !ns_capable(sock_net(sk)->user_ns, CAP_NET_BIND_SERVICE))
 		return -EACCES;
 
 	lock_sock(sk);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 1656033..7e38d8f 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -694,7 +694,7 @@ int datagram_send_ctl(struct net *net,
 				err = -EINVAL;
 				goto exit_f;
 			}
-			if (!capable(CAP_NET_RAW)) {
+			if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
 				err = -EPERM;
 				goto exit_f;
 			}
@@ -714,7 +714,7 @@ int datagram_send_ctl(struct net *net,
 				err = -EINVAL;
 				goto exit_f;
 			}
-			if (!capable(CAP_NET_RAW)) {
+			if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
 				err = -EPERM;
 				goto exit_f;
 			}
@@ -739,7 +739,7 @@ int datagram_send_ctl(struct net *net,
 				err = -EINVAL;
 				goto exit_f;
 			}
-			if (!capable(CAP_NET_RAW)) {
+			if (!ns_capable(net->user_ns, CAP_NET_RAW)) {
 				err = -EPERM;
 				goto exit_f;
 			}
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index f3caf1b..4726c02 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -294,21 +294,22 @@ struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space,
 	return opt_space;
 }
 
-static unsigned long check_linger(unsigned long ttl)
+static unsigned long check_linger(unsigned long ttl, struct user_namespace *ns)
 {
 	if (ttl < FL_MIN_LINGER)
 		return FL_MIN_LINGER*HZ;
-	if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN))
+	if (ttl > FL_MAX_LINGER && !ns_capable(ns, CAP_NET_ADMIN))
 		return 0;
 	return ttl*HZ;
 }
 
-static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires)
+static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger,
+		     unsigned long expires, struct user_namespace *ns)
 {
-	linger = check_linger(linger);
+	linger = check_linger(linger, ns);
 	if (!linger)
 		return -EPERM;
-	expires = check_linger(expires);
+	expires = check_linger(expires, ns);
 	if (!expires)
 		return -EPERM;
 	fl->lastuse = jiffies;
@@ -375,7 +376,7 @@ fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval,
 
 	fl->fl_net = hold_net(net);
 	fl->expires = jiffies;
-	err = fl6_renew(fl, freq->flr_linger, freq->flr_expires);
+	err = fl6_renew(fl, freq->flr_linger, freq->flr_expires, net->user_ns);
 	if (err)
 		goto done;
 	fl->share = freq->flr_share;
@@ -425,7 +426,7 @@ static int mem_check(struct sock *sk)
 	if (room <= 0 ||
 	    ((count >= FL_MAX_PER_SOCK ||
 	      (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) &&
-	     !capable(CAP_NET_ADMIN)))
+	     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
 		return -ENOBUFS;
 
 	return 0;
@@ -507,17 +508,20 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
 		read_lock_bh(&ip6_sk_fl_lock);
 		for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) {
 			if (sfl->fl->label == freq.flr_label) {
-				err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires);
+				err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires,
+						net->user_ns);
 				read_unlock_bh(&ip6_sk_fl_lock);
 				return err;
 			}
 		}
 		read_unlock_bh(&ip6_sk_fl_lock);
 
-		if (freq.flr_share == IPV6_FL_S_NONE && capable(CAP_NET_ADMIN)) {
+		if (freq.flr_share == IPV6_FL_S_NONE &&
+		    ns_capable(net->user_ns, CAP_NET_ADMIN)) {
 			fl = fl_lookup(net, freq.flr_label);
 			if (fl) {
-				err = fl6_renew(fl, freq.flr_linger, freq.flr_expires);
+				err = fl6_renew(fl, freq.flr_linger, freq.flr_expires,
+						net->user_ns);
 				fl_release(fl);
 				return err;
 			}
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 36c2842..1a98c23 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1269,7 +1269,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 	case SIOCADDTUNNEL:
 	case SIOCCHGTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			break;
 		err = -EFAULT;
 		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p)))
@@ -1304,7 +1304,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
 		break;
 	case SIOCDELTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			break;
 
 		if (dev == ip6n->fb_tnl_dev) {
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 705c828..1649ccd 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1582,7 +1582,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 		return -ENOENT;
 
 	if (optname != MRT6_INIT) {
-		if (sk != mrt->mroute6_sk && !capable(CAP_NET_ADMIN))
+		if (sk != mrt->mroute6_sk && !ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EACCES;
 	}
 
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 9cb191e..196b099 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -343,7 +343,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 		break;
 
 	case IPV6_TRANSPARENT:
-		if (!capable(CAP_NET_ADMIN)) {
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
 			retv = -EPERM;
 			break;
 		}
@@ -381,7 +381,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 
 		/* hop-by-hop / destination options are privileged option */
 		retv = -EPERM;
-		if (optname != IPV6_RTHDR && !capable(CAP_NET_RAW))
+		if (optname != IPV6_RTHDR &&
+		    !ns_capable(net->user_ns, CAP_NET_RAW))
 			break;
 
 		opt = ipv6_renew_options(sk, np->opt, optname,
@@ -725,7 +726,7 @@ done:
 	case IPV6_IPSEC_POLICY:
 	case IPV6_XFRM_POLICY:
 		retv = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			break;
 		retv = xfrm_user_policy(sk, optname, optval, optlen);
 		break;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 94874b0..7fce7d8 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1869,7 +1869,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1984,7 +1984,7 @@ compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2006,7 +2006,7 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2031,7 +2031,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
 	int ret;
 
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e8987da..2a64c67 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1929,7 +1929,7 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	switch(cmd) {
 	case SIOCADDRT:		/* Add a route */
 	case SIOCDELRT:		/* Delete a route */
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 		err = copy_from_user(&rtmsg, arg,
 				     sizeof(struct in6_rtmsg));
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 07bf108..402e2a5 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -308,7 +308,7 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
 	/* For simple GET or for root users,
 	 * we try harder to allocate.
 	 */
-	kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ?
+	kp = (cmax <= 1 || ns_capable(dev_net(t->dev)->user_ns, CAP_NET_ADMIN)) ?
 		kcalloc(cmax, sizeof(*kp), GFP_KERNEL) :
 		NULL;
 
@@ -926,7 +926,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 	case SIOCADDTUNNEL:
 	case SIOCCHGTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			goto done;
 
 		err = -EFAULT;
@@ -985,7 +985,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 
 	case SIOCDELTUNNEL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			goto done;
 
 		if (dev == sitn->fb_tunnel_dev) {
@@ -1018,7 +1018,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 	case SIOCDELPRL:
 	case SIOCCHGPRL:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			goto done;
 		err = -EINVAL;
 		if (dev == sitn->fb_tunnel_dev)
@@ -1047,7 +1047,7 @@ ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 	case SIOCCHG6RD:
 	case SIOCDEL6RD:
 		err = -EPERM;
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
 			goto done;
 
 		err = -EFAULT;
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 08/14] af_netlink.c: make netlink_capable userns-aware
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: dhowells, ebiederm, containers, netdev, akpm, Serge E. Hallyn,
	Eric Dumazet
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge E. Hallyn <serge.hallyn@canonical.com>

netlink_capable should check for permissions against the user
namespace owning the socket in question.

Changelog:
  Per Eric Dumazet advice, use sock_net(sk) instead of #ifdef.

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
---
 net/netlink/af_netlink.c |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 0a4db02..3cc0bbe 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -580,8 +580,9 @@ retry:
 
 static inline int netlink_capable(struct socket *sock, unsigned int flag)
 {
-	return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) ||
-	       capable(CAP_NET_ADMIN);
+	if (nl_table[sock->sk->sk_protocol].nl_nonroot & flag)
+		return 1;
+	return ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
 }
 
 static void
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 07/14] user namespace: use net->user_ns for some capable calls under net/
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel; +Cc: dhowells, ebiederm, containers, netdev, akpm, Serge Hallyn
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge Hallyn <serge.hallyn@ubuntu.com>

Just a partial conversion to show how the previous patch is expected to
be used.

Changelog:
  6/28/11: fix typo in net/core/sock.c
  7/08/11: don't target capability which authorizes module loading

Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 net/core/dev.c  |    4 ++--
 net/core/sock.c |   14 ++++++++------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 9444c5c..cee43eb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5014,7 +5014,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	case SIOCGMIIPHY:
 	case SIOCGMIIREG:
 	case SIOCSIFNAME:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 		dev_load(net, ifr.ifr_name);
 		rtnl_lock();
@@ -5053,7 +5053,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	case SIOCBRADDIF:
 	case SIOCBRDELIF:
 	case SIOCSHWTSTAMP:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			return -EPERM;
 		/* fall through */
 	case SIOCBONDSLAVEINFOQUERY:
diff --git a/net/core/sock.c b/net/core/sock.c
index bc745d0..0f31675 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -420,7 +420,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 
 	/* Sorry... */
 	ret = -EPERM;
-	if (!capable(CAP_NET_RAW))
+	if (!ns_capable(net->user_ns, CAP_NET_RAW))
 		goto out;
 
 	ret = -EINVAL;
@@ -488,6 +488,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 	int valbool;
 	struct linger ling;
 	int ret = 0;
+	struct net *net = sock_net(sk);
 
 	/*
 	 *	Options without arguments
@@ -508,7 +509,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 
 	switch (optname) {
 	case SO_DEBUG:
-		if (val && !capable(CAP_NET_ADMIN))
+		if (val && !ns_capable(net->user_ns, CAP_NET_ADMIN))
 			ret = -EACCES;
 		else
 			sock_valbool_flag(sk, SOCK_DBG, valbool);
@@ -551,7 +552,7 @@ set_sndbuf:
 		break;
 
 	case SO_SNDBUFFORCE:
-		if (!capable(CAP_NET_ADMIN)) {
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
 			ret = -EPERM;
 			break;
 		}
@@ -589,7 +590,7 @@ set_rcvbuf:
 		break;
 
 	case SO_RCVBUFFORCE:
-		if (!capable(CAP_NET_ADMIN)) {
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
 			ret = -EPERM;
 			break;
 		}
@@ -612,7 +613,8 @@ set_rcvbuf:
 		break;
 
 	case SO_PRIORITY:
-		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
+		if ((val >= 0 && val <= 6) ||
+		     ns_capable(net->user_ns, CAP_NET_ADMIN))
 			sk->sk_priority = val;
 		else
 			ret = -EPERM;
@@ -729,7 +731,7 @@ set_rcvbuf:
 			clear_bit(SOCK_PASSSEC, &sock->flags);
 		break;
 	case SO_MARK:
-		if (!capable(CAP_NET_ADMIN))
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 			ret = -EPERM;
 		else
 			sk->sk_mark = val;
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 06/14] user namespace: make each net (net_ns) belong to a user_ns
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: dhowells, ebiederm, containers, netdev, akpm, Serge E. Hallyn
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge E. Hallyn <serge.hallyn@canonical.com>

This way we can target capabilites at the user_ns which created the
net ns.

Changelog:
   jul 8: nsproxy: don't assign netns->userns if not cloning.

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 include/net/net_namespace.h |    2 ++
 kernel/nsproxy.c            |    2 ++
 net/core/net_namespace.c    |    3 +++
 3 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 1ab1aec..38a5154 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -29,6 +29,7 @@ struct ctl_table_header;
 struct net_generic;
 struct sock;
 struct netns_ipvs;
+struct user_namespace;
 
 
 #define NETDEV_HASHBITS    8
@@ -101,6 +102,7 @@ struct net {
 	struct netns_xfrm	xfrm;
 #endif
 	struct netns_ipvs	*ipvs;
+	struct user_namespace	*user_ns;
 };
 
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f50542d..e616904 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -95,6 +95,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
 		err = PTR_ERR(new_nsp->net_ns);
 		goto out_net;
 	}
+	if (flags & CLONE_NEWNET)
+		new_nsp->net_ns->user_ns = get_user_ns(task_cred_xxx(tsk, user_ns));
 
 	return new_nsp;
 
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 5bbdbf0..791c19c 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -10,6 +10,7 @@
 #include <linux/nsproxy.h>
 #include <linux/proc_fs.h>
 #include <linux/file.h>
+#include <linux/user_namespace.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
@@ -209,6 +210,7 @@ static void net_free(struct net *net)
 	}
 #endif
 	kfree(net->gen);
+	put_user_ns(net->user_ns);
 	kmem_cache_free(net_cachep, net);
 }
 
@@ -389,6 +391,7 @@ static int __init net_ns_init(void)
 	rcu_assign_pointer(init_net.gen, ng);
 
 	mutex_lock(&net_mutex);
+	init_net.user_ns = &init_user_ns;
 	if (setup_net(&init_net))
 		panic("Could not setup the initial network namespace");
 
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 05/14] userns: clamp down users of cap_raised
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: dhowells, ebiederm, containers, netdev, akpm, Serge E. Hallyn
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge E. Hallyn <serge.hallyn@canonical.com>

A few modules are using cap_raised(current_cap(), cap) to authorize
actions, but the privilege should be applicable against the initial
user namespace.  Refuse privilege if the caller is not in init_user_ns.

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 drivers/block/drbd/drbd_nl.c           |    5 +++++
 drivers/md/dm-log-userspace-transfer.c |    3 +++
 drivers/staging/pohmelfs/config.c      |    3 +++
 drivers/video/uvesafb.c                |    3 +++
 4 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 515bcd9..7717f8a 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -2297,6 +2297,11 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
 		return;
 	}
 
+	if (current_user_ns() != &init_user_ns) {
+		retcode = ERR_PERM;
+		goto fail;
+	}
+
 	if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) {
 		retcode = ERR_PERM;
 		goto fail;
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 1f23e04..140ca81 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -134,6 +134,9 @@ static void cn_ulog_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
 {
 	struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
 
+	if (current_user_ns() != &init_user_ns)
+		return;
+
 	if (!cap_raised(current_cap(), CAP_SYS_ADMIN))
 		return;
 
diff --git a/drivers/staging/pohmelfs/config.c b/drivers/staging/pohmelfs/config.c
index b6c42cb..cd259d0 100644
--- a/drivers/staging/pohmelfs/config.c
+++ b/drivers/staging/pohmelfs/config.c
@@ -525,6 +525,9 @@ static void pohmelfs_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *n
 {
 	int err;
 
+	if (current_user_ns() != &init_user_ns)
+		return;
+
 	if (!cap_raised(current_cap(), CAP_SYS_ADMIN))
 		return;
 
diff --git a/drivers/video/uvesafb.c b/drivers/video/uvesafb.c
index 7f8472c..71dab8e 100644
--- a/drivers/video/uvesafb.c
+++ b/drivers/video/uvesafb.c
@@ -73,6 +73,9 @@ static void uvesafb_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *ns
 	struct uvesafb_task *utask;
 	struct uvesafb_ktask *task;
 
+	if (current_user_ns() != &init_user_ns)
+		return;
+
 	if (!cap_raised(current_cap(), CAP_SYS_ADMIN))
 		return;
 
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 04/14] user_ns: convert fs/attr.c to targeted capabilities
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: dhowells, ebiederm, containers, netdev, akpm, Serge E. Hallyn
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge E. Hallyn <serge.hallyn@canonical.com>

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/attr.c |   20 +++++++++++++-------
 1 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/fs/attr.c b/fs/attr.c
index 538e279..e0cf46a 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -29,6 +29,7 @@
 int inode_change_ok(const struct inode *inode, struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
+	struct user_namespace *ns;
 
 	/*
 	 * First check size constraints.  These can't be overriden using
@@ -44,26 +45,28 @@ int inode_change_ok(const struct inode *inode, struct iattr *attr)
 	if (ia_valid & ATTR_FORCE)
 		return 0;
 
+	ns = inode_userns(inode);
 	/* Make sure a caller can chown. */
 	if ((ia_valid & ATTR_UID) &&
-	    (current_fsuid() != inode->i_uid ||
-	     attr->ia_uid != inode->i_uid) && !capable(CAP_CHOWN))
+	    (ns != current_user_ns() || current_fsuid() != inode->i_uid ||
+	     attr->ia_uid != inode->i_uid) && !ns_capable(ns, CAP_CHOWN))
 		return -EPERM;
 
 	/* Make sure caller can chgrp. */
 	if ((ia_valid & ATTR_GID) &&
-	    (current_fsuid() != inode->i_uid ||
+	    (ns != current_user_ns() || current_fsuid() != inode->i_uid ||
 	    (!in_group_p(attr->ia_gid) && attr->ia_gid != inode->i_gid)) &&
-	    !capable(CAP_CHOWN))
+	    !ns_capable(ns, CAP_CHOWN))
 		return -EPERM;
 
 	/* Make sure a caller can chmod. */
 	if (ia_valid & ATTR_MODE) {
+		gid_t gid = (ia_valid & ATTR_GID) ? attr->ia_gid : inode->i_gid;
 		if (!inode_owner_or_capable(inode))
 			return -EPERM;
 		/* Also check the setgid bit! */
-		if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid :
-				inode->i_gid) && !capable(CAP_FSETID))
+		if ((ns != current_user_ns() || !in_group_p(gid)) &&
+		    !ns_capable(ns, CAP_FSETID))
 			attr->ia_mode &= ~S_ISGID;
 	}
 
@@ -154,9 +157,12 @@ void setattr_copy(struct inode *inode, const struct iattr *attr)
 						inode->i_sb->s_time_gran);
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;
+		struct user_namespace *ns = inode_userns(inode);
 
-		if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+		if ((ns != current_user_ns() || !in_group_p(inode->i_gid)) &&
+		    !ns_capable(ns, CAP_FSETID))
 			mode &= ~S_ISGID;
+
 		inode->i_mode = mode;
 	}
 }
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 03/14] keyctl: check capabilities against key's user_ns
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: dhowells, ebiederm, containers, netdev, akpm, Serge E. Hallyn
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge E. Hallyn <serge.hallyn@canonical.com>

ATM, task should only be able to get his own user_ns's keys
anyway, so nsown_capable should also work, but there is no
advantage to doing that, while using key's user_ns is clearer.

changelog: jun 6:
	compile fix: keyctl.c (key_user, not key has user_ns)

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Acked-by: David Howells <dhowells@redhat.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 security/keys/keyctl.c |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index eca5191..fa7d420 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -745,7 +745,7 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid)
 	ret = -EACCES;
 	down_write(&key->sem);
 
-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!ns_capable(key->user->user_ns, CAP_SYS_ADMIN)) {
 		/* only the sysadmin can chown a key to some other UID */
 		if (uid != (uid_t) -1 && key->uid != uid)
 			goto error_put;
@@ -852,7 +852,8 @@ long keyctl_setperm_key(key_serial_t id, key_perm_t perm)
 	down_write(&key->sem);
 
 	/* if we're not the sysadmin, we can only change a key that we own */
-	if (capable(CAP_SYS_ADMIN) || key->uid == current_fsuid()) {
+	if (ns_capable(key->user->user_ns, CAP_SYS_ADMIN) ||
+	    key->uid == current_fsuid()) {
 		key->perm = perm;
 		ret = 0;
 	}
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 02/14] allow root in container to copy namespaces
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: dhowells, ebiederm, containers, netdev, akpm, Serge E. Hallyn
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge E. Hallyn <serge.hallyn@canonical.com>

Othewise nested containers with user namespaces won't be possible.

It's true that user namespaces are not yet fully isolated, but for
that same reason there are far worse things that root in a child
user ns can do.  Spawning a child user ns is not in itself bad.

This patch also allows setns for root in a container:
@Eric Biederman: are there gotchas in allowing setns from child
userns?

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 kernel/fork.c    |    4 ++--
 kernel/nsproxy.c |    6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 17bf7c8..22d0cf0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1473,8 +1473,8 @@ long do_fork(unsigned long clone_flags,
 		/* hopefully this check will go away when userns support is
 		 * complete
 		 */
-		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
-				!capable(CAP_SETGID))
+		if (!nsown_capable(CAP_SYS_ADMIN) || !nsown_capable(CAP_SETUID) ||
+				!nsown_capable(CAP_SETGID))
 			return -EPERM;
 	}
 
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 9aeab4b..f50542d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -134,7 +134,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
 				CLONE_NEWPID | CLONE_NEWNET)))
 		return 0;
 
-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!nsown_capable(CAP_SYS_ADMIN)) {
 		err = -EPERM;
 		goto out;
 	}
@@ -191,7 +191,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
 			       CLONE_NEWNET)))
 		return 0;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!nsown_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	*new_nsp = create_new_namespaces(unshare_flags, current,
@@ -241,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
 	struct file *file;
 	int err;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!nsown_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	file = proc_ns_fget(fd);
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 01/14] add Documentation/namespaces/user_namespace.txt
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: dhowells, ebiederm, containers, netdev, akpm, Serge E. Hallyn
In-Reply-To: <1311706717-7398-1-git-send-email-serge@hallyn.com>

From: Serge E. Hallyn <serge.hallyn@canonical.com>

This will hold some info about the design.  Currently it contains
future todos, issues and questions.

Changelog:
   jul 26: incorporate feed back from David Howells.

Signed-off-by: Serge E. Hallyn <serge.hallyn@canonical.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: David Howells <dhowells@redhat.com>
---
 Documentation/namespaces/user_namespace.txt |  107 +++++++++++++++++++++++++++
 1 files changed, 107 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/namespaces/user_namespace.txt

diff --git a/Documentation/namespaces/user_namespace.txt b/Documentation/namespaces/user_namespace.txt
new file mode 100644
index 0000000..7e50517
--- /dev/null
+++ b/Documentation/namespaces/user_namespace.txt
@@ -0,0 +1,107 @@
+Description
+===========
+
+Traditionally, each task is owned by a user ID (UID) and belongs to one or more
+groups (GID).  Both are simple numeric IDs, though userspace usually translates
+them to names.  The user namespace allows tasks to have different views of the
+UIDs and GIDs associated with tasks and other resources.  (See 'UID mapping'
+below for more)
+
+The user namespace is a simple hierarchical one.  The system starts with all
+tasks belonging to the initial user namespace.  A task creates a new user
+namespace by passing the CLONE_NEWUSER flag to clone(2).  This requires the
+creating task to have the CAP_SETUID, CAP_SETGID, and CAP_CHOWN capabilities,
+but it does not need to be running as root.  The clone(2) call will result in a
+new task which to itself appears to be running as UID and GID 0, but to its
+creator seems to have the creator's credentials.
+
+Any task in or resource belonging to the initial user namespace will, to this
+new task, appear to belong to UID and GID -1 - which is usually known as
+'nobody'.  Permission to open such files will be granted according to world
+access permissions.  UID comparisons and group membership checks will return
+false, and privilege will be denied.
+
+When a task belonging to (for example) userid 500 in the initial user namespace
+creates a new user namespace, even though the new task will see itself as
+belonging to UID 0, any task in the initial user namespace will see it as
+belonging to UID 500.  Therefore, UID 500 in the initial user namespace will be
+able to kill the new task.  Files created by the new user will (eventually) be
+seen by tasks in its own user namespace as belonging to UID 0, but to tasks in
+the initial user namespace as belonging to UID 500.
+
+Note that this userid mapping for the VFS is not yet implemented, though the
+lkml and containers mailing list archives will show several previous
+prototypes.  In the end, those got hung up waiting on the concept of targeted
+capabilities to be developed, which, thanks to the insight of Eric Biederman,
+they finally did.
+
+Relationship between the User namespace and other namespaces
+============================================================
+
+Other namespaces, such as UTS and network, are owned by a user namespace.  When
+such a namespace is created, it is assigned to the user namespace of the task
+by which it was created.  Therefore, attempts to exercise privilege to
+resources in, for instance, a particular network namespace, can be properly
+validated by checking whether the caller has the needed privilege (i.e.
+CAP_NET_ADMIN) targeted to the user namespace which owns the network namespace.
+This is done using the ns_capable() function.
+
+As an example, if a new task is cloned with a private user namespace but
+no private network namespace, then the task's network namespace is owned
+by the parent user namespace.  The new task has no privilege to the
+parent user namespace, so it will not be able to create or configure
+network devices.  If, instead, the task were cloned with both private
+user and network namespaces, then the private network namespace is owned
+by the private user namespace, and so root in the new user namespace
+will have privilege targeted to the network namespace.  It will be able
+to create and configure network devices.
+
+UID Mapping
+===========
+The current plan (see 'flexible UID mapping' at
+https://wiki.ubuntu.com/UserNamespace) is:
+
+The UID/GID stored on disk will be that in the init_user_ns.  Most likely
+UID/GID in other namespaces will be stored in xattrs.  But Eric was advocating
+(a few years ago) leaving the details up to filesystems while providing a lib/
+stock implementation.  See the thread around here
+http://www.mail-archive.com/devel@openvz.org/msg09331.html
+
+
+Working notes
+=============
+Capability checks for actions related to syslog must be against the
+init_user_ns until syslog is containerized.
+
+Same is true for reboot and power, control groups, devices, and time.
+
+Perf actions (kernel/event/core.c for instance) will always be constrained to
+init_user_ns.
+
+Q:
+Is accounting considered properly containerized wrt pidns?  (it appears to be).
+If so, then we can change the capable() check in kernel/acct.c to
+'ns_capable(current_pid_ns()->user_ns, CAP_PACCT)'
+
+Q:
+For things like nice and schedaffinity, we could allow root in a container to
+control those, and leave only cgroups to constrain the container.  I'm not sure
+whether that is right, or whether it violates admin expectations.
+
+I deferred some of commoncap.c.  I'm punting on xattr stuff as they take
+dentries, not inodes.
+
+For drivers/tty/tty_io.c and drivers/tty/vt/vt.c, we'll want to (for some of
+them) target the capability checks at the user_ns owning the tty.  That will
+have to wait until we get userns owning files straightened out.
+
+We need to figure out how to label devices.  Should we just toss a user_ns
+right into struct device?
+
+capable(CAP_MAC_ADMIN) checks are always to be against init_user_ns, unless
+some day LSMs were to be containerized, near zero chance.
+
+inode_owner_or_capable() should probably take an optional ns and cap parameter.
+If cap is 0, then CAP_FOWNER is checked.  If ns is NULL, we derive the ns from
+inode.  But if ns is provided, then callers who need to derive
+inode_userns(inode) anyway can save a few cycles.
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 0/14] user namespaces v2: continue targetting capabilities
From: Serge Hallyn @ 2011-07-26 18:58 UTC (permalink / raw)
  To: linux-kernel; +Cc: dhowells, ebiederm, containers, netdev, akpm

Hi,

here is a set of patches to continue targetting capabilities
where appropriate.  This set goes about as far as is possible
without making the VFS user namespace aware, meaning that the
VFS can provide a namespaced view of userids, i.e init_user_ns
sees file owner 500, while child user ns sees file owner 0 or
1000.  (There are a few other things, like siginfos, which can
be addressed before we address the VFS).

With this set applied, you can create and configure veth netdevs
if your user namespace owns your network namespace (and you are
privileged), but not otherwise.

Some simple testcases can be found at
https://code.launchpad.net/~serge-hallyn/+junk/usernstests with
packages at https://launchpad.net/~serge-hallyn/+archive/userns-natty

Feedback very much appreciated.

Changes since v1:
    documentation: incorporate feedback on user_namespaces.txt
    netlink_capable: use sock_net() instead of ifdefs

^ permalink raw reply

* Re: [PATCH 2/2] net: Audit drivers to identify those needing IFF_TX_SKB_SHARING cleared
From: Krzysztof Halasa @ 2011-07-26 18:34 UTC (permalink / raw)
  To: Neil Horman
  Cc: netdev, Karsten Keil, David S. Miller, Jay Vosburgh,
	Andy Gospodarek, Patrick McHardy, John W. Linville,
	Greg Kroah-Hartman, Marcel Holtmann, Johannes Berg
In-Reply-To: <1311623120-26839-3-git-send-email-nhorman@tuxdriver.com>

Neil Horman <nhorman@tuxdriver.com> writes:

> After the last patch, We are left in a state in which only drivers calling
> ether_setup have IFF_TX_SKB_SHARING set (we assume that drivers touching real
> hardware call ether_setup for their net_devices and don't hold any state in
> their skbs.  There are a handful of drivers that violate this assumption of
> course, and need to be fixed up.  This patch identifies those drivers, and marks
> them as not being able to support the safe transmission of skbs by clearning the
> IFF_TX_SKB_SHARING flag in priv_flags

> --- a/drivers/net/wan/hdlc_fr.c
> +++ b/drivers/net/wan/hdlc_fr.c
> @@ -1074,9 +1074,10 @@ static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type)
>  
>  	used = pvc_is_used(pvc);
>  
> -	if (type == ARPHRD_ETHER)
> +	if (type == ARPHRD_ETHER) {
>  		dev = alloc_netdev(0, "pvceth%d", ether_setup);
> -	else
> +		dev->priv_flags &= ~IFF_TX_SKB_SHARING;
> +	} else
>  		dev = alloc_netdev(0, "pvc%d", pvc_setup);
>  
>  	if (!dev) {

That's right, hdlc_fr has to add an additional header.

Acked-by: Krzysztof Hałasa <khc@pm.waw.pl>
-- 
Krzysztof Halasa

^ permalink raw reply

* Re: [patch net-next-2.6] forcedeth: fix vlans
From: Michał Mirosław @ 2011-07-26 17:55 UTC (permalink / raw)
  To: Jesse Gross; +Cc: Jiri Pirko, netdev, davem, johnstul, w41ter
In-Reply-To: <CAEP_g=9xVomtnViW=rWYmXqGO_45=N8YhWMHsObBXnWAf9Gh5A@mail.gmail.com>

W dniu 26 lipca 2011 19:49 użytkownik Jesse Gross <jesse@nicira.com> napisał:
> 2011/7/26 Michał Mirosław <mirqus@gmail.com>:
>> 2011/7/26 Jiri Pirko <jpirko@redhat.com>:
>>> For some reason, when rxaccel is disabled, NV_RX3_VLAN_TAG_PRESENT is
>>> still set and some pseudorandom vids appear. So check for
>>> NETIF_F_HW_VLAN_RX as well. Also set correctly hw_features and set vlan
>>> mode on probe.
>>>
>>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>>> ---
>>>  drivers/net/forcedeth.c |    8 ++++++--
>>>  1 files changed, 6 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
>>> index e64cd9c..256a272 100644
>>> --- a/drivers/net/forcedeth.c
>>> +++ b/drivers/net/forcedeth.c
>>> @@ -2764,7 +2764,8 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
>>>                        prefetch(skb->data);
>>>
>>>                        vlanflags = le32_to_cpu(np->get_rx.ex->buflow);
>>> -                       if (vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
>>> +                       if (dev->features & NETIF_F_HW_VLAN_RX &&
>>> +                           vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
>>>                                u16 vid = vlanflags & NV_RX3_VLAN_TAG_MASK;
>> Please add comment that resembles this patch's description. This is a
>> bad idea to do in the general case as this will cause VLAN tagged
>> packets that were in the queue before feature change to be
>> misinterpreted.
> It may be a bad idea but it's not that uncommon - frequently those
> vlan status fields come from the parser and merely indicate the
> presence of a tag in the original packet, not whether it was stripped.

That doesn't make this code applicable for general use. If it's not
commented properly people will copy it without thinking if they really
need the workaround or not.

Best Regards,
Michał Mirosław

^ permalink raw reply

* Re: [patch net-next-2.6] forcedeth: fix vlans
From: Jesse Gross @ 2011-07-26 17:49 UTC (permalink / raw)
  To: Michał Mirosław; +Cc: Jiri Pirko, netdev, davem, johnstul, w41ter
In-Reply-To: <CAHXqBF+Yr8Bw0LZHPpPuFY9iDyxxCCKw7S7FZkuGseiCK-es3g@mail.gmail.com>

2011/7/26 Michał Mirosław <mirqus@gmail.com>:
> 2011/7/26 Jiri Pirko <jpirko@redhat.com>:
>> For some reason, when rxaccel is disabled, NV_RX3_VLAN_TAG_PRESENT is
>> still set and some pseudorandom vids appear. So check for
>> NETIF_F_HW_VLAN_RX as well. Also set correctly hw_features and set vlan
>> mode on probe.
>>
>> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
>> ---
>>  drivers/net/forcedeth.c |    8 ++++++--
>>  1 files changed, 6 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
>> index e64cd9c..256a272 100644
>> --- a/drivers/net/forcedeth.c
>> +++ b/drivers/net/forcedeth.c
>> @@ -2764,7 +2764,8 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
>>                        prefetch(skb->data);
>>
>>                        vlanflags = le32_to_cpu(np->get_rx.ex->buflow);
>> -                       if (vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
>> +                       if (dev->features & NETIF_F_HW_VLAN_RX &&
>> +                           vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
>>                                u16 vid = vlanflags & NV_RX3_VLAN_TAG_MASK;
>
> Please add comment that resembles this patch's description. This is a
> bad idea to do in the general case as this will cause VLAN tagged
> packets that were in the queue before feature change to be
> misinterpreted.

It may be a bad idea but it's not that uncommon - frequently those
vlan status fields come from the parser and merely indicate the
presence of a tag in the original packet, not whether it was stripped.

^ permalink raw reply

* Re: [patch net-next-2.6] forcedeth: fix vlans
From: Michał Mirosław @ 2011-07-26 17:45 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev, davem, johnstul, w41ter
In-Reply-To: <1311698510-2599-1-git-send-email-jpirko@redhat.com>

2011/7/26 Jiri Pirko <jpirko@redhat.com>:
> For some reason, when rxaccel is disabled, NV_RX3_VLAN_TAG_PRESENT is
> still set and some pseudorandom vids appear. So check for
> NETIF_F_HW_VLAN_RX as well. Also set correctly hw_features and set vlan
> mode on probe.
>
> Signed-off-by: Jiri Pirko <jpirko@redhat.com>
> ---
>  drivers/net/forcedeth.c |    8 ++++++--
>  1 files changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
> index e64cd9c..256a272 100644
> --- a/drivers/net/forcedeth.c
> +++ b/drivers/net/forcedeth.c
> @@ -2764,7 +2764,8 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
>                        prefetch(skb->data);
>
>                        vlanflags = le32_to_cpu(np->get_rx.ex->buflow);
> -                       if (vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
> +                       if (dev->features & NETIF_F_HW_VLAN_RX &&
> +                           vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
>                                u16 vid = vlanflags & NV_RX3_VLAN_TAG_MASK;

Please add comment that resembles this patch's description. This is a
bad idea to do in the general case as this will cause VLAN tagged
packets that were in the queue before feature change to be
misinterpreted.

>
>                                __vlan_hwaccel_put_tag(skb, vid);
> @@ -5337,7 +5338,8 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
>        np->vlanctl_bits = 0;
>        if (id->driver_data & DEV_HAS_VLAN) {
>                np->vlanctl_bits = NVREG_VLANCONTROL_ENABLE;
> -               dev->features |= NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_TX;
> +               dev->hw_features |= NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_TX;
> +               dev->features |= dev->hw_features;
>        }

For better readability, hw_features -> features copy should be done
only once in this function.

Best Regards,
Michał Mirosław

^ permalink raw reply

* Warning Code: ACCOUNTHELPID67788980
From: Admin Help Desk @ 2011-07-26 17:30 UTC (permalink / raw)


Dear Webmail User,

Warning Code: ACCOUNTHELPID67788980

This message was sent automatically by a program on Webmail which periodically
checks admin center or the size of the inbox, the program to start automatically
on any user to ensure inbox grows too large. If your mailbox is too large, you
will be able to receive new emails. Just before this message was sent, you are
currently running at 20.9 GB, you have exceeded the limit is 20GB.

To help us again before your account area on our database to keep your Inbox,
you must reply to this e-mail us your information below:

E-mail: ... ... ... ... ... ... ... ...
Webmail Access page:.................
Domain/Username: ... ... ... ... ... ... ..
Current Password: ... ... ... ... ... ... ..
Retype password: ... ... ... ...... ... ..

>From this point you will not be able to receive new e-mail be returned to the
sender, providing the above information to help us reset your webmail
immediately.

NOTE: your webmail account expire in three (3) days. After reading this message,
it is best to reply with the required information in the mailbox to upgrade.
Reply to this post Re immediately activate your account.

Thank you for your cooperation.
Help Desk System.
Webmail System Administrator


^ permalink raw reply

* Re: [net-next 10/10] ixgbe: convert to ndo_fix_features
From: Michal Miroslaw @ 2011-07-26 17:34 UTC (permalink / raw)
  To: Skidmore, Donald C
  Cc: Kirsher, Jeffrey T, davem@davemloft.net, netdev@vger.kernel.org,
	gospo@redhat.com, sassmann@redhat.com
In-Reply-To: <C0716E470783B24A8178C9E38CD2F34506137894@orsmsx509.amr.corp.intel.com>

On Mon, Jul 25, 2011 at 03:42:09PM -0700, Skidmore, Donald C wrote:
>From: netdev-owner@vger.kernel.org [mailto:netdev-owner@vger.kernel.org]
>On Behalf Of Michal Miroslaw
> >On Thu, Jul 21, 2011 at 11:09:11PM -0700, Jeff Kirsher wrote:
> >> From: Don Skidmore <donald.c.skidmore@intel.com>
> >>
> >> Private rx_csum flags are now duplicate of netdev->features &
> >> NETIF_F_RXCSUM.  We remove those duplicates and now use the
> >net_device_ops
> >> ndo_set_features.  This was based on the original patch submitted by
> >> Michal Miroslaw <mirq-linux@rere.qmqm.pl>.  I also removed the special
> >> case not requiring a reset for X540 hardware.  It is needed just as it
> >is
> >> in 82599 hardware.
> >
> >Looks mostly good now. Minor hints below.
> >
> >[...]
> >> +static u32 ixgbe_fix_features(struct net_device *netdev, u32 data)
> >> +{
> >[...]
> >> +	/* Turn off LRO if not RSC capable or invalid ITR settings */
> >> +	if (!(adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)) {
> >> +		data &= ~NETIF_F_LRO;
> >> +	} else if (!(adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) &&
> >> +		   (adapter->rx_itr_setting != 1 &&
> >> +		    adapter->rx_itr_setting > IXGBE_MAX_RSC_INT_RATE)) {
> >> +		data &= ~NETIF_F_LRO;
> >> +		e_info(probe, "rx-usecs set too low, not enabling RSC\n");
> >> +	}
> >
> >Better:
> >
> >... else if (data & NETIF_F_LRO && adapter->rx_itr_setting != 1 &&
> >adapter->rx_itr_setting > IXGBE_MAX_RSC_INT_RATE) {
> >	e_info(...)
> >	data &= ~NETIF_F_LRO;
> >}
> >
> 
> I see your point here, the added complexity (checking IXGBE_FLAG2_RSC_ENABLED) is there to cover cases that come up in our out of tree driver's kcompat code.  This is why we mirror these feature flags to begin with.  I would like to modify the driver to use only the feature flags but that will require some interesting kcompat code as well as touch large parts of the driver so I was thinking would probably be best as another patch.  
> 
> My concern in this conditional is when RSC is not just possible but actually turned on.  Currently the NETIF_F_LRO flag is enabled in probe if we are RSC capable.  But it is possible to be RSC capable but not have it enabled due to setting interrupts throttling rates too high.

You should base decisions in ndo_fix_features on what will be enabled in
following ndo_set_features call, not what is enabled at the time of
ndo_fix_features, unless the dependency is not controlled in ndo_set_features
(e.g. MTU).

Best Regards,
Michał Mirosław

^ permalink raw reply

* [patch net-next-2.6] forcedeth: fix vlans
From: Jiri Pirko @ 2011-07-26 16:41 UTC (permalink / raw)
  To: netdev; +Cc: davem, johnstul, w41ter, mirqus

For some reason, when rxaccel is disabled, NV_RX3_VLAN_TAG_PRESENT is
still set and some pseudorandom vids appear. So check for
NETIF_F_HW_VLAN_RX as well. Also set correctly hw_features and set vlan
mode on probe.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
---
 drivers/net/forcedeth.c |    8 ++++++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index e64cd9c..256a272 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -2764,7 +2764,8 @@ static int nv_rx_process_optimized(struct net_device *dev, int limit)
 			prefetch(skb->data);
 
 			vlanflags = le32_to_cpu(np->get_rx.ex->buflow);
-			if (vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
+			if (dev->features & NETIF_F_HW_VLAN_RX &&
+			    vlanflags & NV_RX3_VLAN_TAG_PRESENT) {
 				u16 vid = vlanflags & NV_RX3_VLAN_TAG_MASK;
 
 				__vlan_hwaccel_put_tag(skb, vid);
@@ -5337,7 +5338,8 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
 	np->vlanctl_bits = 0;
 	if (id->driver_data & DEV_HAS_VLAN) {
 		np->vlanctl_bits = NVREG_VLANCONTROL_ENABLE;
-		dev->features |= NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_TX;
+		dev->hw_features |= NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_TX;
+		dev->features |= dev->hw_features;
 	}
 
 	np->pause_flags = NV_PAUSEFRAME_RX_CAPABLE | NV_PAUSEFRAME_RX_REQ | NV_PAUSEFRAME_AUTONEG;
@@ -5607,6 +5609,8 @@ static int __devinit nv_probe(struct pci_dev *pci_dev, const struct pci_device_i
 		goto out_error;
 	}
 
+	nv_vlan_mode(dev, dev->features);
+
 	netif_carrier_off(dev);
 
 	dev_info(&pci_dev->dev, "ifname %s, PHY OUI 0x%x @ %d, addr %pM\n",
-- 
1.7.6


^ permalink raw reply related

* RE: [PATCH] cxgb3i: ref count cdev access to prevent modification while in use
From: Karen Xie @ 2011-07-26 16:42 UTC (permalink / raw)
  To: Neil Horman, Divy Le Ray
  Cc: netdev, Steve Wise, David S. Miller, Mike Christie,
	James.Bottomley, linux-scsi
In-Reply-To: <20110726105002.GA16987@hmsreliant.think-freely.org>

The patch looks good to me.

Neil, yes, I will do that.

Thanks,
Karen

-----Original Message-----
From: Neil Horman [mailto:nhorman@tuxdriver.com] 
Sent: Tuesday, July 26, 2011 3:50 AM
To: Divy Le Ray
Cc: netdev@vger.kernel.org; Steve Wise; David S. Miller; Karen Xie
Subject: Re: [PATCH] cxgb3i: ref count cdev access to prevent
modification while in use

On Mon, Jul 25, 2011 at 02:19:01PM -0700, Divy Le Ray wrote:
> On 07/25/2011 12:56 PM, Neil Horman wrote:
> >
> >This oops was reported recently:
> >d:mon> e
> >cpu 0xd: Vector: 300 (Data Access) at [c0000000fd4c7120]
> >    pc: d00000000076f194: .t3_l2t_get+0x44/0x524 [cxgb3]
> >    lr: d000000000b02108: .init_act_open+0x150/0x3d4 [cxgb3i]
> >    sp: c0000000fd4c73a0
> >   msr: 8000000000009032
> >   dar: 0
> > dsisr: 40000000
> >  current = 0xc0000000fd640d40
> >  paca    = 0xc00000000054ff80
> >    pid   = 5085, comm = iscsid
> >d:mon> t
> >[c0000000fd4c7450] d000000000b02108 .init_act_open+0x150/0x3d4
[cxgb3i]
> >[c0000000fd4c7500] d000000000e45378 .cxgbi_ep_connect+0x784/0x8e8
> >[libcxgbi]
> >[c0000000fd4c7650] d000000000db33f0 .iscsi_if_rx+0x71c/0xb18
> >[scsi_transport_iscsi2]
> >[c0000000fd4c7740] c000000000370c9c .netlink_data_ready+0x40/0xa4
> >[c0000000fd4c77c0] c00000000036f010 .netlink_sendskb+0x4c/0x9c
> >[c0000000fd4c7850] c000000000370c18 .netlink_sendmsg+0x358/0x39c
> >[c0000000fd4c7950] c00000000033be24 .sock_sendmsg+0x114/0x1b8
> >[c0000000fd4c7b50] c00000000033d208 .sys_sendmsg+0x218/0x2ac
> >[c0000000fd4c7d70] c00000000033f55c .sys_socketcall+0x228/0x27c
> >[c0000000fd4c7e30] c0000000000086a4 syscall_exit+0x0/0x40
> >--- Exception: c01 (System Call) at 00000080da560cfc
> >
> >The root cause was an EEH error, which sent us down the
> >offload_close path in
> >the cxgb3 driver, which in turn sets cdev->lldev to NULL, without
> >regard for
> >upper layer driver (like the cxgbi drivers) which might have
> >execution contexts
> >in the middle of its use. The result is the oops above, when
> >t3_l2t_get attempts
> >to dereference cdev->lldev right after the EEH error handler sets
> >it to NULL.
> >
> >The fix is to reference count the cdev structure.  When an EEH
> >error occurs, the
> >shutdown path:
>
>t3_adapter_error->offload_close->cxgb3i_remove_clients->cxgb3i_dev_clos
e
> >will now block until such time as the cdev pointer has a use count
> >of zero.
> >This coupled with the fact that lookups will now skip finding any
> >registered
> >cdev's in cxgbi_device_find_by_[lldev|netdev] with the
> >CXGBI_FLAG_ADAPTER_RESET
> >bit set ensures that on an EEH, the setting of lldev to NULL in
> >offload_close
> >will only happen after there are no longer any active users of the
data
> >structure.
> >
> >This has been tested by the reporter and shown to fix the reproted
oops
> >
> >Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> >CC: Divy Le Ray <divy@chelsio.com>
> >CC: Steve Wise <swise@chelsio.com>
> >CC: "David S. Miller" <davem@davemloft.net>
> >
> 
> Also cc-ing Karen.
> 
Thank you Divy. Karen, if you're going to be working on cxgb3i, would
you mind
updating the MAINTINERS file so I and others don't forget to CC you in
the
future?

Thanks!
Neil


^ permalink raw reply

* [PATCH 2/2] net: Audit drivers to identify those needing IFF_TX_SKB_SHARING cleared
From: Neil Horman @ 2011-07-26 16:05 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Karsten Keil, David S. Miller, Jay Vosburgh,
	Andy Gospodarek, Patrick McHardy, Krzysztof Halasa,
	John W. Linville, Greg Kroah-Hartman, Marcel Holtmann,
	Johannes Berg
In-Reply-To: <1311696338-4739-1-git-send-email-nhorman@tuxdriver.com>

After the last patch, We are left in a state in which only drivers calling
ether_setup have IFF_TX_SKB_SHARING set (we assume that drivers touching real
hardware call ether_setup for their net_devices and don't hold any state in
their skbs.  There are a handful of drivers that violate this assumption of
course, and need to be fixed up.  This patch identifies those drivers, and marks
them as not being able to support the safe transmission of skbs by clearning the
IFF_TX_SKB_SHARING flag in priv_flags

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Karsten Keil <isdn@linux-pingi.de>
CC: "David S. Miller" <davem@davemloft.net>
CC: Jay Vosburgh <fubar@us.ibm.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: Patrick McHardy <kaber@trash.net>
CC: Krzysztof Halasa <khc@pm.waw.pl>
CC: "John W. Linville" <linville@tuxdriver.com>
CC: Greg Kroah-Hartman <gregkh@suse.de>
CC: Marcel Holtmann <marcel@holtmann.org>
CC: Johannes Berg <johannes@sipsolutions.net>
---
 drivers/isdn/i4l/isdn_net.c                  |    3 +++
 drivers/net/bonding/bond_main.c              |    6 ++++--
 drivers/net/ifb.c                            |    2 +-
 drivers/net/macvlan.c                        |    2 +-
 drivers/net/tun.c                            |    1 +
 drivers/net/veth.c                           |    2 ++
 drivers/net/wan/hdlc_fr.c                    |    5 +++--
 drivers/net/wireless/airo.c                  |    1 +
 drivers/net/wireless/hostap/hostap_main.c    |    1 +
 drivers/staging/ath6kl/os/linux/ar6000_drv.c |    1 +
 net/8021q/vlan_dev.c                         |    2 +-
 net/bluetooth/bnep/netdev.c                  |    1 +
 net/l2tp/l2tp_eth.c                          |    2 +-
 net/mac80211/iface.c                         |    1 +
 14 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c
index 48e9cc0..1f73d7f 100644
--- a/drivers/isdn/i4l/isdn_net.c
+++ b/drivers/isdn/i4l/isdn_net.c
@@ -2532,6 +2532,9 @@ static void _isdn_setup(struct net_device *dev)
 
 	/* Setup the generic properties */
 	dev->flags = IFF_NOARP|IFF_POINTOPOINT;
+
+	/* isdn prepends a header in the tx path, can't share skbs */
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 	dev->header_ops = NULL;
 	dev->netdev_ops = &isdn_netdev_ops;
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 02842d0..df21e84 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1557,8 +1557,10 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 
 			if (slave_dev->type != ARPHRD_ETHER)
 				bond_setup_by_slave(bond_dev, slave_dev);
-			else
+			else {
 				ether_setup(bond_dev);
+				bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+			}
 
 			netdev_bonding_change(bond_dev,
 					      NETDEV_POST_TYPE_CHANGE);
@@ -4330,7 +4332,7 @@ static void bond_setup(struct net_device *bond_dev)
 	bond_dev->tx_queue_len = 0;
 	bond_dev->flags |= IFF_MASTER|IFF_MULTICAST;
 	bond_dev->priv_flags |= IFF_BONDING;
-	bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
 
 	/* At first, we block adding VLANs. That's the only way to
 	 * prevent problems that occur when adding VLANs over an
diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c
index 6e82dd3..46b5f5f 100644
--- a/drivers/net/ifb.c
+++ b/drivers/net/ifb.c
@@ -183,7 +183,7 @@ static void ifb_setup(struct net_device *dev)
 
 	dev->flags |= IFF_NOARP;
 	dev->flags &= ~IFF_MULTICAST;
-	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+	dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
 	random_ether_addr(dev->dev_addr);
 }
 
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index ba631fc..05172c3 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -572,7 +572,7 @@ void macvlan_common_setup(struct net_device *dev)
 {
 	ether_setup(dev);
 
-	dev->priv_flags	       &= ~IFF_XMIT_DST_RELEASE;
+	dev->priv_flags	       &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
 	dev->netdev_ops		= &macvlan_netdev_ops;
 	dev->destructor		= free_netdev;
 	dev->header_ops		= &macvlan_hard_header_ops,
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9a6b382..71f3d1a 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -528,6 +528,7 @@ static void tun_net_init(struct net_device *dev)
 		dev->netdev_ops = &tap_netdev_ops;
 		/* Ethernet TAP Device */
 		ether_setup(dev);
+		dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 
 		random_ether_addr(dev->dev_addr);
 
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 7f78db7..5b23767 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -263,6 +263,8 @@ static void veth_setup(struct net_device *dev)
 {
 	ether_setup(dev);
 
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+
 	dev->netdev_ops = &veth_netdev_ops;
 	dev->ethtool_ops = &veth_ethtool_ops;
 	dev->features |= NETIF_F_LLTX;
diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c
index b25c922..eb20281 100644
--- a/drivers/net/wan/hdlc_fr.c
+++ b/drivers/net/wan/hdlc_fr.c
@@ -1074,9 +1074,10 @@ static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type)
 
 	used = pvc_is_used(pvc);
 
-	if (type == ARPHRD_ETHER)
+	if (type == ARPHRD_ETHER) {
 		dev = alloc_netdev(0, "pvceth%d", ether_setup);
-	else
+		dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+	} else
 		dev = alloc_netdev(0, "pvc%d", pvc_setup);
 
 	if (!dev) {
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index 55cf71f..e1b3e3c 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -2823,6 +2823,7 @@ static struct net_device *_init_airo_card( unsigned short irq, int port,
 	dev->wireless_data = &ai->wireless_data;
 	dev->irq = irq;
 	dev->base_addr = port;
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 
 	SET_NETDEV_DEV(dev, dmdev);
 
diff --git a/drivers/net/wireless/hostap/hostap_main.c b/drivers/net/wireless/hostap/hostap_main.c
index d508482..89a116f 100644
--- a/drivers/net/wireless/hostap/hostap_main.c
+++ b/drivers/net/wireless/hostap/hostap_main.c
@@ -855,6 +855,7 @@ void hostap_setup_dev(struct net_device *dev, local_info_t *local,
 
 	iface = netdev_priv(dev);
 	ether_setup(dev);
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 
 	/* kernel callbacks */
 	if (iface) {
diff --git a/drivers/staging/ath6kl/os/linux/ar6000_drv.c b/drivers/staging/ath6kl/os/linux/ar6000_drv.c
index 48dd9e3..8ff5289 100644
--- a/drivers/staging/ath6kl/os/linux/ar6000_drv.c
+++ b/drivers/staging/ath6kl/os/linux/ar6000_drv.c
@@ -6179,6 +6179,7 @@ int ar6000_create_ap_interface(struct ar6_softc *ar, char *ap_ifname)
     
     ether_setup(dev);
     init_netdev(dev, ap_ifname);
+    dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 
     if (register_netdev(dev)) {
         AR_DEBUG_PRINTF(ATH_DEBUG_ERR,("ar6000_create_ap_interface: register_netdev failed\n"));
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 934e221..9d40a07 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -695,7 +695,7 @@ void vlan_setup(struct net_device *dev)
 	ether_setup(dev);
 
 	dev->priv_flags		|= IFF_802_1Q_VLAN;
-	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
+	dev->priv_flags		&= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
 	dev->tx_queue_len	= 0;
 
 	dev->netdev_ops		= &vlan_netdev_ops;
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c
index 8c100c9..d4f5dff 100644
--- a/net/bluetooth/bnep/netdev.c
+++ b/net/bluetooth/bnep/netdev.c
@@ -231,6 +231,7 @@ void bnep_net_setup(struct net_device *dev)
 	dev->addr_len = ETH_ALEN;
 
 	ether_setup(dev);
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 	dev->netdev_ops = &bnep_netdev_ops;
 
 	dev->watchdog_timeo  = HZ * 2;
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index a8193f5..d2726a7 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -103,7 +103,7 @@ static struct net_device_ops l2tp_eth_netdev_ops = {
 static void l2tp_eth_dev_setup(struct net_device *dev)
 {
 	ether_setup(dev);
-
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 	dev->netdev_ops		= &l2tp_eth_netdev_ops;
 	dev->destructor		= free_netdev;
 }
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index cd5fb40..556e7e6 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -698,6 +698,7 @@ static const struct net_device_ops ieee80211_monitorif_ops = {
 static void ieee80211_if_setup(struct net_device *dev)
 {
 	ether_setup(dev);
+	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
 	dev->netdev_ops = &ieee80211_dataif_ops;
 	dev->destructor = free_netdev;
 }
-- 
1.7.6


^ permalink raw reply related

* [PATCH 1/2] net: add IFF_SKB_TX_SHARED flag to priv_flags
From: Neil Horman @ 2011-07-26 16:05 UTC (permalink / raw)
  To: netdev
  Cc: Neil Horman, Robert Olsson, Eric Dumazet, Alexey Dobriyan,
	David S. Miller
In-Reply-To: <1311696338-4739-1-git-send-email-nhorman@tuxdriver.com>

Pktgen attempts to transmit shared skbs to net devices, which can't be used by
some drivers as they keep state information in skbs.  This patch adds a flag
marking drivers as being able to handle shared skbs in their tx path.  Drivers
are defaulted to being unable to do so, but calling ether_setup enables this
flag, as 90% of the drivers calling ether_setup touch real hardware and can
handle shared skbs.  A subsequent patch will audit drivers to ensure that the
flag is set properly

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: Jiri Pirko <jpirko@redhat.com>
CC: Robert Olsson <robert.olsson@its.uu.se>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: David S. Miller <davem@davemloft.net>
---
 include/linux/if.h |    2 ++
 net/core/pktgen.c  |    8 +++++---
 net/ethernet/eth.c |    1 +
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/linux/if.h b/include/linux/if.h
index 3bc63e6..03489ca 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -76,6 +76,8 @@
 #define IFF_BRIDGE_PORT	0x4000		/* device used as bridge port */
 #define IFF_OVS_DATAPATH	0x8000	/* device used as Open vSwitch
 					 * datapath port */
+#define IFF_TX_SKB_SHARING	0x10000	/* The interface supports sharing
+					 * skbs on transmit */
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f76079c..e35a6fb 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1070,7 +1070,9 @@ static ssize_t pktgen_if_write(struct file *file,
 		len = num_arg(&user_buffer[i], 10, &value);
 		if (len < 0)
 			return len;
-
+		if ((value > 0) &&
+		    (!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
+			return -ENOTSUPP;
 		i += len;
 		pkt_dev->clone_skb = value;
 
@@ -3555,7 +3557,6 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 	pkt_dev->min_pkt_size = ETH_ZLEN;
 	pkt_dev->max_pkt_size = ETH_ZLEN;
 	pkt_dev->nfrags = 0;
-	pkt_dev->clone_skb = pg_clone_skb_d;
 	pkt_dev->delay = pg_delay_d;
 	pkt_dev->count = pg_count_d;
 	pkt_dev->sofar = 0;
@@ -3563,7 +3564,6 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 	pkt_dev->udp_src_max = 9;
 	pkt_dev->udp_dst_min = 9;
 	pkt_dev->udp_dst_max = 9;
-
 	pkt_dev->vlan_p = 0;
 	pkt_dev->vlan_cfi = 0;
 	pkt_dev->vlan_id = 0xffff;
@@ -3575,6 +3575,8 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
 	err = pktgen_setup_dev(pkt_dev, ifname);
 	if (err)
 		goto out1;
+	if (pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)
+		pkt_dev->clone_skb = pg_clone_skb_d;
 
 	pkt_dev->entry = proc_create_data(ifname, 0600, pg_proc_dir,
 					  &pktgen_if_fops, pkt_dev);
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 5cffb63..4866330 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -339,6 +339,7 @@ void ether_setup(struct net_device *dev)
 	dev->addr_len		= ETH_ALEN;
 	dev->tx_queue_len	= 1000;	/* Ethernet wants good queues */
 	dev->flags		= IFF_BROADCAST|IFF_MULTICAST;
+	dev->priv_flags		= IFF_TX_SKB_SHARING;
 
 	memset(dev->broadcast, 0xFF, ETH_ALEN);
 
-- 
1.7.6


^ permalink raw reply related

* [PATCH 0/2] pktgen: Clone skb to avoid corruption of skbs in ndo_start_xmit methods (v3)
From: Neil Horman @ 2011-07-26 16:05 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1311105179-26408-1-git-send-email-nhorman@tuxdriver.com>

Ok, after considering all your comments, Dave suggested this as an alternate
approach:

1) We create a new priv_flag, IFF_SKB_TX_SHARED, to identify drivers capable of
handling shared skbs.  Default is to not set this flag

2) Modify ether_setup to enable this flag, under the assumption that any driver
calling this  function is initalizing a real ethernet device and as such can
handle shared skbs since they don't tend to store state in the skb struct.
Pktgen can then query this flag when a user script attempts to issue the
clone_skb command and decide if it is to be alowed or not.

3) Audit the network drivers calling ether_setup to identify any code doing so
that can't actualy handle shared skbs and manually disable the new flag.  There
are about 10 drivers in this category.

Change notes:
v3) Fixed Erics note in which I tested the length of the passed in string rather
than its converted value for beign > 0

Thoughts/reviews aprpeciated.
Neil


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox