public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] lockfree rtcache lookup using RCU
@ 2002-05-08  7:27 Dipankar Sarma
  2002-05-08  8:10 ` David S. Miller
  2002-05-08  9:40 ` [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1) Dax Kelson
  0 siblings, 2 replies; 18+ messages in thread
From: Dipankar Sarma @ 2002-05-08  7:27 UTC (permalink / raw)
  To: linux-kernel

During a past thread about lockfree hashtable lookups using RCU,
merits and complexities of such an approach were discussed.
Here is a patch that uses RCU to do lockfree ipv4 route cache
lookup.

I decided that I should do some microbenchmarking to see
if it really makes a difference or not. Since, we know that
lock contention is not an issue (rwlock with rare updates), I
thought I should concentrate on the cache line bouncing of
the per-bucket rwlock. I figured that if we have lots of
hits in the route cache, the corresponding rwlock cache lines
are going to bounce. So I wrote a test that
uses about 8 processes to send a large number of randomly sized packets to
a single destination. For 1 to 8 CPUs I used the test script
to send a fixed number of packets to a single destination address.
The results show that time needed for lookup continuously increases
for 2.5.3 wherease for rt_rcu-2.5.3, it remains constant.

http://lse.sourceforge.net/locking/rcu/rtcache/rttest/rttest.png

The results are in -

http://lse.sourceforge.net/locking/rcu/rtcache/rttest/

The measurements were done in an 8way PIII xeon with 1MB L2 cache
and 6GB memory.

Comments/suggestions are welcome.

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.


diff -urN linux-2.5.3/include/net/dst.h linux-2.5.3-rt_rcu/include/net/dst.h
--- linux-2.5.3/include/net/dst.h	Wed Jan 30 11:11:52 2002
+++ linux-2.5.3-rt_rcu/include/net/dst.h	Fri Feb  8 11:45:15 2002
@@ -9,6 +9,7 @@
 #define _NET_DST_H
 
 #include <linux/config.h>
+#include <linux/rcupdate.h>
 #include <net/neighbour.h>
 
 /*
@@ -62,6 +63,7 @@
 #endif
 
 	struct  dst_ops	        *ops;
+	struct rcu_head		rcu_head;
 		
 	char			info[0];
 };
diff -urN linux-2.5.3/net/ipv4/route.c linux-2.5.3-rt_rcu/net/ipv4/route.c
--- linux-2.5.3/net/ipv4/route.c	Wed Jan 16 00:26:35 2002
+++ linux-2.5.3-rt_rcu/net/ipv4/route.c	Fri Feb  8 11:37:22 2002
@@ -85,6 +85,7 @@
 #include <linux/mroute.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/random.h>
+#include <linux/rcupdate.h>
 #include <net/protocol.h>
 #include <net/ip.h>
 #include <net/route.h>
@@ -188,7 +189,7 @@
 
 struct rt_hash_bucket {
 	struct rtable	*chain;
-	rwlock_t	lock;
+	spinlock_t	lock;
 } __attribute__((__aligned__(8)));
 
 static struct rt_hash_bucket 	*rt_hash_table;
@@ -227,7 +228,6 @@
   	}
 	
 	for (i = rt_hash_mask; i >= 0; i--) {
-		read_lock_bh(&rt_hash_table[i].lock);
 		for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
 			/*
 			 *	Spin through entries until we are ready
@@ -238,6 +238,8 @@
 				len = 0;
 				continue;
 			}
+			/* read_barrier_depends() here */
+			rmb();
 			sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 				"%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 				r->u.dst.dev ? r->u.dst.dev->name : "*",
@@ -262,11 +264,9 @@
 			sprintf(buffer + len, "%-127s\n", temp);
 			len += 128;
 			if (pos >= offset+length) {
-				read_unlock_bh(&rt_hash_table[i].lock);
 				goto done;
 			}
 		}
-		read_unlock_bh(&rt_hash_table[i].lock);
         }
 
 done:
@@ -314,13 +314,13 @@
   
 static __inline__ void rt_free(struct rtable *rt)
 {
-	dst_free(&rt->u.dst);
+	call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
 }
 
 static __inline__ void rt_drop(struct rtable *rt)
 {
 	ip_rt_put(rt);
-	dst_free(&rt->u.dst);
+	call_rcu(&rt->u.dst.rcu_head, (void (*)(void *))dst_free, &rt->u.dst);
 }
 
 static __inline__ int rt_fast_clean(struct rtable *rth)
@@ -373,7 +373,7 @@
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
 
-		write_lock(&rt_hash_table[i].lock);
+		spin_lock(&rt_hash_table[i].lock);
 		while ((rth = *rthp) != NULL) {
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
@@ -392,7 +392,7 @@
 			*rthp = rth->u.rt_next;
 			rt_free(rth);
 		}
-		write_unlock(&rt_hash_table[i].lock);
+		spin_unlock(&rt_hash_table[i].lock);
 
 		/* Fallback loop breaker. */
 		if ((jiffies - now) > 0)
@@ -415,11 +415,11 @@
 	rt_deadline = 0;
 
 	for (i = rt_hash_mask; i >= 0; i--) {
-		write_lock_bh(&rt_hash_table[i].lock);
+		spin_lock_bh(&rt_hash_table[i].lock);
 		rth = rt_hash_table[i].chain;
 		if (rth)
 			rt_hash_table[i].chain = NULL;
-		write_unlock_bh(&rt_hash_table[i].lock);
+		spin_unlock_bh(&rt_hash_table[i].lock);
 
 		for (; rth; rth = next) {
 			next = rth->u.rt_next;
@@ -538,7 +538,7 @@
 
 			k = (k + 1) & rt_hash_mask;
 			rthp = &rt_hash_table[k].chain;
-			write_lock_bh(&rt_hash_table[k].lock);
+			spin_lock_bh(&rt_hash_table[k].lock);
 			while ((rth = *rthp) != NULL) {
 				if (!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
@@ -549,7 +549,7 @@
 				rt_free(rth);
 				goal--;
 			}
-			write_unlock_bh(&rt_hash_table[k].lock);
+			spin_unlock_bh(&rt_hash_table[k].lock);
 			if (goal <= 0)
 				break;
 		}
@@ -607,18 +607,20 @@
 restart:
 	rthp = &rt_hash_table[hash].chain;
 
-	write_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(&rt_hash_table[hash].lock);
 	while ((rth = *rthp) != NULL) {
 		if (memcmp(&rth->key, &rt->key, sizeof(rt->key)) == 0) {
 			/* Put it first */
 			*rthp = rth->u.rt_next;
+			wmb();
 			rth->u.rt_next = rt_hash_table[hash].chain;
+			wmb();
 			rt_hash_table[hash].chain = rth;
 
 			rth->u.dst.__use++;
 			dst_hold(&rth->u.dst);
 			rth->u.dst.lastuse = now;
-			write_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(&rt_hash_table[hash].lock);
 
 			rt_drop(rt);
 			*rp = rth;
@@ -634,7 +636,7 @@
 	if (rt->rt_type == RTN_UNICAST || rt->key.iif == 0) {
 		int err = arp_bind_neighbour(&rt->u.dst);
 		if (err) {
-			write_unlock_bh(&rt_hash_table[hash].lock);
+			spin_unlock_bh(&rt_hash_table[hash].lock);
 
 			if (err != -ENOBUFS) {
 				rt_drop(rt);
@@ -675,7 +677,7 @@
 	}
 #endif
 	rt_hash_table[hash].chain = rt;
-	write_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(&rt_hash_table[hash].lock);
 	*rp = rt;
 	return 0;
 }
@@ -742,7 +744,7 @@
 {
 	struct rtable **rthp;
 
-	write_lock_bh(&rt_hash_table[hash].lock);
+	spin_lock_bh(&rt_hash_table[hash].lock);
 	ip_rt_put(rt);
 	for (rthp = &rt_hash_table[hash].chain; *rthp;
 	     rthp = &(*rthp)->u.rt_next)
@@ -751,7 +753,7 @@
 			rt_free(rt);
 			break;
 		}
-	write_unlock_bh(&rt_hash_table[hash].lock);
+	spin_unlock_bh(&rt_hash_table[hash].lock);
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
@@ -790,10 +792,10 @@
 
 			rthp=&rt_hash_table[hash].chain;
 
-			read_lock(&rt_hash_table[hash].lock);
 			while ((rth = *rthp) != NULL) {
 				struct rtable *rt;
-
+				/* read_barrier_depends() here */
+				rmb();
 				if (rth->key.dst != daddr ||
 				    rth->key.src != skeys[i] ||
 				    rth->key.tos != tos ||
@@ -811,7 +813,6 @@
 					break;
 
 				dst_clone(&rth->u.dst);
-				read_unlock(&rt_hash_table[hash].lock);
 
 				rt = dst_alloc(&ipv4_dst_ops);
 				if (rt == NULL) {
@@ -822,6 +823,8 @@
 
 				/* Copy all the information. */
 				*rt = *rth;
+ 				memset(&rt->u.dst.rcu_head, 0, 
+ 					sizeof(struct rcu_head));
 				rt->u.dst.__use		= 1;
 				atomic_set(&rt->u.dst.__refcnt, 1);
 				if (rt->u.dst.dev)
@@ -857,7 +860,6 @@
 					ip_rt_put(rt);
 				goto do_next;
 			}
-			read_unlock(&rt_hash_table[hash].lock);
 		do_next:
 			;
 		}
@@ -1037,9 +1039,10 @@
 	for (i = 0; i < 2; i++) {
 		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
 
-		read_lock(&rt_hash_table[hash].lock);
 		for (rth = rt_hash_table[hash].chain; rth;
 		     rth = rth->u.rt_next) {
+			/* read_barrier_depends() here */
+			rmb();
 			if (rth->key.dst == daddr &&
 			    rth->key.src == skeys[i] &&
 			    rth->rt_dst  == daddr &&
@@ -1075,7 +1078,6 @@
 				}
 			}
 		}
-		read_unlock(&rt_hash_table[hash].lock);
 	}
 	return est_mtu ? : new_mtu;
 }
@@ -1629,8 +1631,9 @@
 	tos &= IPTOS_RT_MASK;
 	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
 
-	read_lock(&rt_hash_table[hash].lock);
 	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
+		/* read_barrier_depends() here */
+		rmb();
 		if (rth->key.dst == daddr &&
 		    rth->key.src == saddr &&
 		    rth->key.iif == iif &&
@@ -1643,12 +1646,10 @@
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;
 			rt_cache_stat[smp_processor_id()].in_hit++;
-			read_unlock(&rt_hash_table[hash].lock);
 			skb->dst = (struct dst_entry*)rth;
 			return 0;
 		}
 	}
-	read_unlock(&rt_hash_table[hash].lock);
 
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
@@ -1988,8 +1989,9 @@
 
 	hash = rt_hash_code(key->dst, key->src ^ (key->oif << 5), key->tos);
 
-	read_lock_bh(&rt_hash_table[hash].lock);
 	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
+		/* read_barrier_depends() here */
+		rmb();
 		if (rth->key.dst == key->dst &&
 		    rth->key.src == key->src &&
 		    rth->key.iif == 0 &&
@@ -2003,12 +2005,10 @@
 			dst_hold(&rth->u.dst);
 			rth->u.dst.__use++;
 			rt_cache_stat[smp_processor_id()].out_hit++;
-			read_unlock_bh(&rt_hash_table[hash].lock);
 			*rp = rth;
 			return 0;
 		}
 	}
-	read_unlock_bh(&rt_hash_table[hash].lock);
 
 	return ip_route_output_slow(rp, key);
 }	
@@ -2194,9 +2194,10 @@
 		if (h < s_h) continue;
 		if (h > s_h)
 			s_idx = 0;
-		read_lock_bh(&rt_hash_table[h].lock);
 		for (rt = rt_hash_table[h].chain, idx = 0; rt;
 		     rt = rt->u.rt_next, idx++) {
+			/* read_barrier_depends() here */
+			rmb();
 			if (idx < s_idx)
 				continue;
 			skb->dst = dst_clone(&rt->u.dst);
@@ -2204,12 +2205,10 @@
 					 cb->nlh->nlmsg_seq,
 					 RTM_NEWROUTE, 1) <= 0) {
 				dst_release(xchg(&skb->dst, NULL));
-				read_unlock_bh(&rt_hash_table[h].lock);
 				goto done;
 			}
 			dst_release(xchg(&skb->dst, NULL));
 		}
-		read_unlock_bh(&rt_hash_table[h].lock);
 	}
 
 done:
@@ -2496,7 +2495,7 @@
 
 	rt_hash_mask--;
 	for (i = 0; i <= rt_hash_mask; i++) {
-		rt_hash_table[i].lock = RW_LOCK_UNLOCKED;
+		rt_hash_table[i].lock = SPIN_LOCK_UNLOCKED;
 		rt_hash_table[i].chain = NULL;
 	}
 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] lockfree rtcache lookup using RCU
  2002-05-08  7:27 [PATCH] lockfree rtcache lookup using RCU Dipankar Sarma
@ 2002-05-08  8:10 ` David S. Miller
  2002-05-08  8:54   ` Dipankar Sarma
  2002-05-08  9:40 ` [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1) Dax Kelson
  1 sibling, 1 reply; 18+ messages in thread
From: David S. Miller @ 2002-05-08  8:10 UTC (permalink / raw)
  To: dipankar; +Cc: linux-kernel

   From: Dipankar Sarma <dipankar@in.ibm.com>
   Date: Wed, 8 May 2002 12:57:11 +0530
   
   For 1 to 8 CPUs I used the test script
   to send a fixed number of packets to a single destination address.
   The results show that time needed for lookup continuously increases
   for 2.5.3 wherease for rt_rcu-2.5.3, it remains constant.

How does it perform for a write-heavy workload such
as a massive route flap?

Both are equally important.

Also, workload for single destination isn't all that interesting
since such a workload isn't all that common except in benchmarking.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] lockfree rtcache lookup using RCU
  2002-05-08  8:10 ` David S. Miller
@ 2002-05-08  8:54   ` Dipankar Sarma
  2002-05-08  9:09     ` David S. Miller
  0 siblings, 1 reply; 18+ messages in thread
From: Dipankar Sarma @ 2002-05-08  8:54 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-kernel

On Wed, May 08, 2002 at 01:10:08AM -0700, David S. Miller wrote:
>    From: Dipankar Sarma <dipankar@in.ibm.com>
>    Date: Wed, 8 May 2002 12:57:11 +0530
>    
>    For 1 to 8 CPUs I used the test script
>    to send a fixed number of packets to a single destination address.
>    The results show that time needed for lookup continuously increases
>    for 2.5.3 wherease for rt_rcu-2.5.3, it remains constant.
> 
> How does it perform for a write-heavy workload such
> as a massive route flap?

Can you suggest a test to do this ? I can't think of anything other
than manually flushing the route cache.

> 
> Both are equally important.

Yes. With RCU, the write-side doesn't affect the performance
of the read-side. However, there is the additional overhead
of calling call_rcu() and queueing up of the RCU callback
for every dst entry that is freed. The other aspect is of
latency, how quickly the deferred rcu callbacks are invoked.
We have done some studies in this regard under various
degrees of workloads. I will put up the graphs in lse. They
will also be presented at OLS2002.

> 
> Also, workload for single destination isn't all that interesting
> since such a workload isn't all that common except in benchmarking.

A heavily loaded webserver with NATed ip addresses. Would this not
result in many server processes looking up the same ip address ?

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] lockfree rtcache lookup using RCU
  2002-05-08  8:54   ` Dipankar Sarma
@ 2002-05-08  9:09     ` David S. Miller
  2002-05-08 13:24       ` Dipankar Sarma
  0 siblings, 1 reply; 18+ messages in thread
From: David S. Miller @ 2002-05-08  9:09 UTC (permalink / raw)
  To: dipankar; +Cc: linux-kernel

   From: Dipankar Sarma <dipankar@in.ibm.com>
   Date: Wed, 8 May 2002 14:24:33 +0530

   On Wed, May 08, 2002 at 01:10:08AM -0700, David S. Miller wrote:
   > Also, workload for single destination isn't all that interesting
   > since such a workload isn't all that common except in benchmarking.
   
   A heavily loaded webserver with NATed ip addresses. Would this not
   result in many server processes looking up the same ip address ?

The more common situation is server N IP (where N is 1 or a very small
number), destination clients == thousands of IPs.

So looking up the same dst cache entry with each benchmark client
is very unrealistic.  Try a unique IP address for every single lookup.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1)
  2002-05-08  7:27 [PATCH] lockfree rtcache lookup using RCU Dipankar Sarma
  2002-05-08  8:10 ` David S. Miller
@ 2002-05-08  9:40 ` Dax Kelson
  2002-05-08 13:42   ` Keith Owens
  2002-05-08 19:38   ` chris
  1 sibling, 2 replies; 18+ messages in thread
From: Dax Kelson @ 2002-05-08  9:40 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org; +Cc: Linus Torvalds, Alan Cox, marcelo


Originally when a process set*uided all capabilities bits were cleared.  
Then sometime later (wish BK went back 3 years), the behaviour was
modified according to the comment "A process may, via prctl(), elect to
keep its capabilites when it calls setuid() and switches away from
uid==0. Both permitted and effective sets will be retained."

The current behavior/implementation doesn't match the comment. Only 
permitted capabilities are retained.

This patch against 2.4.18-3 (RHL7.3 kernel, should apply against stock)  
fixes it.  Now both permitted and effective capabilities are retained.

--- kernel/sys.c-org	Wed May  8 01:20:37 2002
+++ kernel/sys.c	Wed May  8 01:35:04 2002
@@ -482,7 +482,7 @@
 		cap_clear(current->cap_permitted);
 		cap_clear(current->cap_effective);
 	}
-	if (old_euid == 0 && current->euid != 0) {
+	if ((old_euid == 0 && current->euid != 0) && !current->keep_capabilities) {
 		cap_clear(current->cap_effective);
 	}
 	if (old_euid != 0 && current->euid == 0) {





^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] lockfree rtcache lookup using RCU
  2002-05-08  9:09     ` David S. Miller
@ 2002-05-08 13:24       ` Dipankar Sarma
  2002-05-08 13:45         ` David S. Miller
  0 siblings, 1 reply; 18+ messages in thread
From: Dipankar Sarma @ 2002-05-08 13:24 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-kernel

On Wed, May 08, 2002 at 02:09:32AM -0700, David S. Miller wrote:
> The more common situation is server N IP (where N is 1 or a very small
> number), destination clients == thousands of IPs.
> 
> So looking up the same dst cache entry with each benchmark client
> is very unrealistic.  Try a unique IP address for every single lookup.

Ok, how about this then -

A large number of processes of which small sets may look up the same
ip address. dst ip addresses change after every 50 packets or
so.

Is this more realistic ?

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1)
  2002-05-08  9:40 ` [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1) Dax Kelson
@ 2002-05-08 13:42   ` Keith Owens
  2002-05-08 19:28     ` chris
  2002-05-08 19:38   ` chris
  1 sibling, 1 reply; 18+ messages in thread
From: Keith Owens @ 2002-05-08 13:42 UTC (permalink / raw)
  To: Dax Kelson; +Cc: linux-kernel

On Wed, 8 May 2002 03:40:11 -0600 (MDT), 
Dax Kelson <dax@gurulabs.com> wrote:
>Originally when a process set*uided all capabilities bits were cleared.  
>Then sometime later (wish BK went back 3 years), the behaviour was
>modified according to the comment "A process may, via prctl(), elect to
>keep its capabilites when it calls setuid() and switches away from
>uid==0. Both permitted and effective sets will be retained."

FWIW, the change was in 2.2.18-pre18, between October 26 and 29, 2000.

I have all the kernel versions from 2.0.21 (1997) through 2.5.14 in a
set of PRCS repositories.  A binary chop on 2.2 found the change in a
few minutes.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] lockfree rtcache lookup using RCU
  2002-05-08 13:24       ` Dipankar Sarma
@ 2002-05-08 13:45         ` David S. Miller
  2002-05-14 11:42           ` Dipankar Sarma
  0 siblings, 1 reply; 18+ messages in thread
From: David S. Miller @ 2002-05-08 13:45 UTC (permalink / raw)
  To: dipankar; +Cc: linux-kernel

   From: Dipankar Sarma <dipankar@in.ibm.com>
   Date: Wed, 8 May 2002 18:54:57 +0530
   
   A large number of processes of which small sets may look up the same
   ip address. dst ip addresses change after every 50 packets or
   so.
   
   Is this more realistic ?

More like every 4 or 5 packets.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1)
  2002-05-08 13:42   ` Keith Owens
@ 2002-05-08 19:28     ` chris
  0 siblings, 0 replies; 18+ messages in thread
From: chris @ 2002-05-08 19:28 UTC (permalink / raw)
  To: Keith Owens; +Cc: Dax Kelson, linux-kernel


On Wed, 8 May 2002, Keith Owens wrote:

> On Wed, 8 May 2002 03:40:11 -0600 (MDT),
> Dax Kelson <dax@gurulabs.com> wrote:
> >Originally when a process set*uided all capabilities bits were cleared.
> >Then sometime later (wish BK went back 3 years), the behaviour was
> >modified according to the comment "A process may, via prctl(), elect to
> >keep its capabilites when it calls setuid() and switches away from
> >uid==0. Both permitted and effective sets will be retained."
>
> FWIW, the change was in 2.2.18-pre18, between October 26 and 29, 2000.

I did the original change in 2.3.x. Since it is so important to get useful
capability functionality, someone (Chris Wing?) backported the change to
2.2.x.

I'll reply to the original e-mail shortly..

Cheers
Chris



^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1)
  2002-05-08  9:40 ` [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1) Dax Kelson
  2002-05-08 13:42   ` Keith Owens
@ 2002-05-08 19:38   ` chris
  2002-05-08 19:55     ` Dax Kelson
  2002-05-08 20:21     ` [RFC] Making capabilites useful with legacy apps Dax Kelson
  1 sibling, 2 replies; 18+ messages in thread
From: chris @ 2002-05-08 19:38 UTC (permalink / raw)
  To: Dax Kelson
  Cc: linux-kernel@vger.kernel.org, Linus Torvalds, Alan Cox, marcelo


Hi,

On Wed, 8 May 2002, Dax Kelson wrote:

> Originally when a process set*uided all capabilities bits were cleared.
> Then sometime later (wish BK went back 3 years), the behaviour was
> modified according to the comment "A process may, via prctl(), elect to
> keep its capabilites when it calls setuid() and switches away from
> uid==0. Both permitted and effective sets will be retained."
>
> The current behavior/implementation doesn't match the comment. Only
> permitted capabilities are retained.
>
> This patch against 2.4.18-3 (RHL7.3 kernel, should apply against stock)
> fixes it.  Now both permitted and effective capabilities are retained.

This is a change of behaviour in a fairly security sensitive area, so I'd
like us to step back and ask - should we fix the code or the comment?

An application using prctl()[1] is capability aware. I think it is fair
(and more secure) if we require these applications to explicitly request
raising capabilities in the effective set, after the switch from euid == 0
to euid != 0.

Comments?

Cheers
Chris

[1] There are quite a few now - search google for PR_SET_KEEPCAPS.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1)
  2002-05-08 19:38   ` chris
@ 2002-05-08 19:55     ` Dax Kelson
  2002-05-08 20:17       ` Alan Cox
  2002-05-08 21:20       ` chris
  2002-05-08 20:21     ` [RFC] Making capabilites useful with legacy apps Dax Kelson
  1 sibling, 2 replies; 18+ messages in thread
From: Dax Kelson @ 2002-05-08 19:55 UTC (permalink / raw)
  To: chris@scary.beasts.org
  Cc: linux-kernel@vger.kernel.org, Linus Torvalds, Alan Cox,
	marcelo@conectiva.com.br

On Wed, 8 May 2002, chris@scary.beasts.org wrote:

> This is a change of behaviour in a fairly security sensitive area, so I'd
> like us to step back and ask - should we fix the code or the comment?
> 
> An application using prctl()[1] is capability aware. I think it is fair
> (and more secure) if we require these applications to explicitly request
> raising capabilities in the effective set, after the switch from euid == 0
> to euid != 0.
> 
> Comments?

With the current behaviour an app has to link against libcap and do:

prtctl(PR_SET_KEEPCAPS, 1)
pre_caps = (capgetp(0, cap_init())  // save our caps into a struct
setuid(uid)  
setgid(gid)
capsetp(0,pre_caps)  // Restore them

With this patch, the app does:

prtctl(PR_SET_KEEPCAPS, 1)
setuid(uid)
setgid(gid)

The end result is exactly the same.  I'm trying to envision how this patch
would change security in a negative way.  I keep coming back to the Unix
idea of "don't be smarter than the admin; don't try to protect root from
himself". 

Maybe we could enchance PR_SET_KEEPCAPS so that:

prtctl(PR_SET_KEEPCAPS, 2) kept both the permitted and the effective caps.

Dax Kelson
Guru Labs


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1)
  2002-05-08 19:55     ` Dax Kelson
@ 2002-05-08 20:17       ` Alan Cox
  2002-05-08 21:20       ` chris
  1 sibling, 0 replies; 18+ messages in thread
From: Alan Cox @ 2002-05-08 20:17 UTC (permalink / raw)
  To: Dax Kelson
  Cc: chris@scary.beasts.org, linux-kernel@vger.kernel.org,
	Linus Torvalds, Alan Cox, marcelo@conectiva.com.br

> prtctl(PR_SET_KEEPCAPS, 1)
> setuid(uid)
> setgid(gid)
> 
> The end result is exactly the same.  I'm trying to envision how this patch
> would change security in a negative way.  I keep coming back to the Unix
> idea of "don't be smarter than the admin; don't try to protect root from
> himself". 
> 
> Maybe we could enchance PR_SET_KEEPCAPS so that:
> 
> prtctl(PR_SET_KEEPCAPS, 2) kept both the permitted and the effective caps.

If thats the concern (and its a fair one) then just fix 2.5 not 2.4. 2.4
stuff will need to do the usermode fixes anyway due to old releases

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [RFC] Making capabilites useful with legacy apps
  2002-05-08 19:38   ` chris
  2002-05-08 19:55     ` Dax Kelson
@ 2002-05-08 20:21     ` Dax Kelson
  2002-05-13 12:55       ` Pavel Machek
  1 sibling, 1 reply; 18+ messages in thread
From: Dax Kelson @ 2002-05-08 20:21 UTC (permalink / raw)
  To: linux-kernel@vger.kernel.org
  Cc: torvalds@transmeta.com, alan@lxorguk.ukuu.org.uk,
	marcelo@conectiva.com.br


In attempt to make capabilites more useful before the filesytem support 
arrives, I would like to "wrap" non-capabilities aware apps.

For example:

# capwrap --user nobody --grp nobody --cap CAP_NET_BIND_SERVICE nc -l -p 80

The wrapper would look like:

1 prtctl(PR_SET_KEEPCAPS, 1)
2 setreuid(uid,uid)  
3 setregid(gid,gid)
4 desired_caps = cap_from_text(argv[caps])
5 capsetp(0,desired_caps)
6 execvp(argv[legacyapp])

This wrapper[1] (that would increase security) won't work with the current 
kernel though, because at step 6, all capabilities are cleared.

It looks like when a non uid 0 process execs, all capabilities are 
cleared.

The wrapper could also support chroot and setrlimit.

Dax Kelson
Guru Labs

[1] Marc Heuse wrote "compartment" that does caps OR set*uid, but not both 
(see my discussion above)


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1)
  2002-05-08 19:55     ` Dax Kelson
  2002-05-08 20:17       ` Alan Cox
@ 2002-05-08 21:20       ` chris
  2002-05-08 21:29         ` Dax Kelson
  1 sibling, 1 reply; 18+ messages in thread
From: chris @ 2002-05-08 21:20 UTC (permalink / raw)
  To: Dax Kelson
  Cc: linux-kernel@vger.kernel.org, Linus Torvalds, Alan Cox,
	marcelo@conectiva.com.br


On Wed, 8 May 2002, Dax Kelson wrote:

> With the current behaviour an app has to link against libcap and do:
>
> prtctl(PR_SET_KEEPCAPS, 1)
> pre_caps = (capgetp(0, cap_init())  // save our caps into a struct
> setuid(uid)
> setgid(gid)
> capsetp(0,pre_caps)  // Restore them
>
> With this patch, the app does:
>
> prtctl(PR_SET_KEEPCAPS, 1)
> setuid(uid)
> setgid(gid)

Are you sure about the current behaviour? Taking vsftpd as an example, it
does

prctl()
setuid()
setgid()
cap_set_proc()

i.e. it only fiddles with capabilities after dropping euid == 0. Of
course, someone is welcome to fiddle with capabilities while euid == 0.
But, euid == 0 is hugely privileged even without any capabilities. So, the
benefit of running with euid == 0 and less than full capabilities is a bit
limited.

Cheers
Chris


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1)
  2002-05-08 21:20       ` chris
@ 2002-05-08 21:29         ` Dax Kelson
  0 siblings, 0 replies; 18+ messages in thread
From: Dax Kelson @ 2002-05-08 21:29 UTC (permalink / raw)
  To: chris@scary.beasts.org
  Cc: linux-kernel@vger.kernel.org, Linus Torvalds, Alan Cox,
	marcelo@conectiva.com.br

On Wed, 8 May 2002, chris@scary.beasts.org wrote:

> Are you sure about the current behaviour? Taking vsftpd as an example, it
> does
> 
1 prctl()
2 setuid()
3 setgid()
4 cap_set_proc()
 
> i.e. it only fiddles with capabilities after dropping euid == 0. Of
> course, someone is welcome to fiddle with capabilities while euid == 0.

Sure this can be done before and after the proposed patch, end results are
the same.  The difference would be what the effective caps are at step
3.5.

The point is when doing PR_SET_KEEPCAPS, one would expect not to have my
caps fiddled with at all.

Dax


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Making capabilites useful with legacy apps
  2002-05-08 20:21     ` [RFC] Making capabilites useful with legacy apps Dax Kelson
@ 2002-05-13 12:55       ` Pavel Machek
  2002-05-13 20:28         ` Neil Schemenauer
  0 siblings, 1 reply; 18+ messages in thread
From: Pavel Machek @ 2002-05-13 12:55 UTC (permalink / raw)
  To: Dax Kelson
  Cc: linux-kernel@vger.kernel.org, torvalds@transmeta.com,
	alan@lxorguk.ukuu.org.uk, marcelo@conectiva.com.br

Hi!

> In attempt to make capabilites more useful before the filesytem support 
> arrives, I would like to "wrap" non-capabilities aware apps.
> 
> For example:
> 
> # capwrap --user nobody --grp nobody --cap CAP_NET_BIND_SERVICE nc -l -p 80

That looks pretty nice...

> This wrapper[1] (that would increase security) won't work with the current 
> kernel though, because at step 6, all capabilities are cleared.

This should be fixed, then.
									Pavel
PS: you could ptrace attach yourself, fork and exec on root, and then force
newly exec-ed app to give up id... But that's ugly and complicated hack.

-- 
Philips Velo 1: 1"x4"x8", 300gram, 60, 12MB, 40bogomips, linux, mutt,
details at http://atrey.karlin.mff.cuni.cz/~pavel/velo/index.html.


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC] Making capabilites useful with legacy apps
  2002-05-13 12:55       ` Pavel Machek
@ 2002-05-13 20:28         ` Neil Schemenauer
  0 siblings, 0 replies; 18+ messages in thread
From: Neil Schemenauer @ 2002-05-13 20:28 UTC (permalink / raw)
  To: Dax Kelson; +Cc: linux-kernel@vger.kernel.org

You might want to look at my "capwrap" kernel module.  You can get it
from:

    http://arctrix.com/nas/linux/capwrap.tar.gz

It's pretty simple and does the job for me.  Comments appreciated.

  Neil

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH] lockfree rtcache lookup using RCU
  2002-05-08 13:45         ` David S. Miller
@ 2002-05-14 11:42           ` Dipankar Sarma
  0 siblings, 0 replies; 18+ messages in thread
From: Dipankar Sarma @ 2002-05-14 11:42 UTC (permalink / raw)
  To: David S. Miller; +Cc: linux-kernel

On Wed, May 08, 2002 at 06:45:28AM -0700, David S. Miller wrote:
>    From: Dipankar Sarma <dipankar@in.ibm.com>
>    Date: Wed, 8 May 2002 18:54:57 +0530
>    
>    A large number of processes of which small sets may look up the same
>    ip address. dst ip addresses change after every 50 packets or
>    so.
>    
>    Is this more realistic ?
> 
> More like every 4 or 5 packets.

Ok, here are some results from a test like that on a 8-way PIII
xeon with 1MB L2 cache. The test has 32 processes sending
random sized packets with dst ip changing every 5 packets.
There results are probably a bit skewed because of the neighbor
table overflows as seen in by the high profile count in
__write_lock_failed and neigh_forced_gc. It does seem that
the packet rate in this test is very high.

ip_route_output_key() is 22% faster with rt_rcu. I will publish 
the results on a 16way NUMA later.

Thanks
-- 
Dipankar Sarma  <dipankar@in.ibm.com> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.



base (2.5.3)
----
__write_lock_failed [c0107af0]: 602749
stext_lock [c024eb18]: 178874
neigh_forced_gc [c020a430]: 104295
USER [c01263d0]: 49856
qdisc_restart [c020f070]: 45435
__read_lock_failed [c0107b10]: 35868
dev_queue_xmit [c0207d00]: 27545
__generic_copy_from_user [c0193b10]: 18680
ace_start_xmit [c01bf670]: 14331
nf_hook_slow [c020e940]: 8365
__kfree_skb [c0204180]: 6919
default_idle [c0106ed0]: 6511
sock_alloc_send_pskb [c02032d0]: 6420
ace_interrupt [c01bf180]: 6254
pfifo_fast_dequeue [c020f550]: 4466
net_tx_action [c0208230]: 4228
ip_finish_output2 [c0219520]: 3747
kfree_skbmem [c0204110]: 3436
ip_route_output_key [c0214470]: 3034



rt_rcu (2.5.3)
-------

__write_lock_failed [c0107af0]: 558354
stext_lock [c024ef58]: 227526
neigh_forced_gc [c020aaa0]: 100893
USER [c0126a40]: 49559
qdisc_restart [c020f6e0]: 45950
dev_queue_xmit [c0208370]: 27353
__read_lock_failed [c0107b10]: 24113
__generic_copy_from_user [c0194180]: 18375
ace_start_xmit [c01bfce0]: 15677
nf_hook_slow [c020efb0]: 8865
__kfree_skb [c02047f0]: 6890
sock_alloc_send_pskb [c0203940]: 6239
ace_interrupt [c01bf7f0]: 5982
default_idle [c0106ed0]: 4679
pfifo_fast_dequeue [c020fbc0]: 4067
ip_finish_output2 [c0219940]: 3886
kfree_skbmem [c0204780]: 3496 
pfifo_fast_enqueue [c020fb40]: 3003
net_tx_action [c02088a0]: 2994
sock_def_write_space [c0204230]: 2552
sock_wfree [c0203700]: 2408
skb_release_data [c02046f0]: 2395
ip_route_output_key [c0214990]: 2358



^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2002-05-14 11:40 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2002-05-08  7:27 [PATCH] lockfree rtcache lookup using RCU Dipankar Sarma
2002-05-08  8:10 ` David S. Miller
2002-05-08  8:54   ` Dipankar Sarma
2002-05-08  9:09     ` David S. Miller
2002-05-08 13:24       ` Dipankar Sarma
2002-05-08 13:45         ` David S. Miller
2002-05-14 11:42           ` Dipankar Sarma
2002-05-08  9:40 ` [PATCH] Completely honor prctl(PR_SET_KEEPCAPS, 1) Dax Kelson
2002-05-08 13:42   ` Keith Owens
2002-05-08 19:28     ` chris
2002-05-08 19:38   ` chris
2002-05-08 19:55     ` Dax Kelson
2002-05-08 20:17       ` Alan Cox
2002-05-08 21:20       ` chris
2002-05-08 21:29         ` Dax Kelson
2002-05-08 20:21     ` [RFC] Making capabilites useful with legacy apps Dax Kelson
2002-05-13 12:55       ` Pavel Machek
2002-05-13 20:28         ` Neil Schemenauer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox