Netdev List
 help / color / mirror / Atom feed
* [PATCH 1/1] TCP: increase default initial receive window.
From: Nandita Dukkipati @ 2010-12-18  3:20 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Tom Herbert, Laurent Chavey, Yuchung Cheng,
	Nandita Dukkipati

This patch changes the default initial receive window to 10 mss
(defined constant). The default window is limited to the maximum
of 10*1460 and 2*mss (when mss > 1460).

Signed-off-by: Nandita Dukkipati <nanditad@google.com>
---
 include/net/tcp.h     |    3 +++
 net/ipv4/tcp_output.c |   11 ++++++++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2ab6c9c..6c25ba8 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -60,6 +60,9 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo);
  */
 #define MAX_TCP_WINDOW		32767U
 
+/* Offer an initial receive window of 10 mss. */
+#define TCP_DEFAULT_INIT_RCVWND	10
+
 /* Minimal accepted MSS. It is (60+60+8) - (20+20). */
 #define TCP_MIN_MSS		88U
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2d39066..dc7c096 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -228,10 +228,15 @@ void tcp_select_initial_window(int __space, __u32 mss,
 		}
 	}
 
-	/* Set initial window to value enough for senders, following RFC5681. */
+	/* Set initial window to a value enough for senders starting with
+	 * initial congestion window of TCP_DEFAULT_INIT_RCVWND. Place
+	 * a limit on the initial window when mss is larger than 1460.
+	 */
 	if (mss > (1 << *rcv_wscale)) {
-		int init_cwnd = rfc3390_bytes_to_packets(mss);
-
+		int init_cwnd = TCP_DEFAULT_INIT_RCVWND;
+		if (mss > 1460)
+			init_cwnd =
+			max_t(u32, (1460 * TCP_DEFAULT_INIT_RCVWND) / mss, 2);
 		/* when initializing use the value from init_rcv_wnd
 		 * rather than the default from above
 		 */
-- 
1.7.3.1


^ permalink raw reply related

* Re: [PATCH 1/1] TCP: increase default initial receive window.
From: Stephen Hemminger @ 2010-12-18  3:59 UTC (permalink / raw)
  To: Nandita Dukkipati
  Cc: David S. Miller, netdev, Tom Herbert, Laurent Chavey,
	Yuchung Cheng
In-Reply-To: <1292642451-892-1-git-send-email-nanditad@google.com>

On Fri, 17 Dec 2010 19:20:51 -0800
Nandita Dukkipati <nanditad@google.com> wrote:

> This patch changes the default initial receive window to 10 mss
> (defined constant). The default window is limited to the maximum
> of 10*1460 and 2*mss (when mss > 1460).
> 
> Signed-off-by: Nandita Dukkipati <nanditad@google.com>

This needs way more discussion because it makes Linux non-RFC
breaking the RFC behavior should require explicit user override.
I also wonder if the magic value 10 should be a sysctl.

^ permalink raw reply

* [PATCH v4 net-next-2.6] netfilter: x_tables: dont block BH while reading counters
From: Eric Dumazet @ 2010-12-18  4:29 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Jesper Dangaard Brouer, netfilter-devel, netdev,
	Stephen Hemminger
In-Reply-To: <1292521986.2883.472.camel@edumazet-laptop>

Using "iptables -L" with a lot of rules have a too big BH latency.
Jesper mentioned ~6 ms and worried of frame drops.

Switch to a per_cpu seqlock scheme, so that taking a snapshot of
counters doesnt need to block BH (for this cpu, but also other cpus).

This adds two increments on seqlock sequence per ipt_do_table() call,
its a reasonable cost for allowing "iptables -L" not block BH
processing.

Reported-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
---
v4: I forgot net/netfilter/x_tables.c change in v3

 include/linux/netfilter/x_tables.h |   10 +++---
 net/ipv4/netfilter/arp_tables.c    |   45 ++++++++-------------------
 net/ipv4/netfilter/ip_tables.c     |   45 ++++++++-------------------
 net/ipv6/netfilter/ip6_tables.c    |   45 ++++++++-------------------
 net/netfilter/x_tables.c           |    3 +
 5 files changed, 49 insertions(+), 99 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 742bec0..6712e71 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -472,7 +472,7 @@ extern void xt_free_table_info(struct xt_table_info *info);
  *  necessary for reading the counters.
  */
 struct xt_info_lock {
-	spinlock_t lock;
+	seqlock_t lock;
 	unsigned char readers;
 };
 DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
@@ -497,7 +497,7 @@ static inline void xt_info_rdlock_bh(void)
 	local_bh_disable();
 	lock = &__get_cpu_var(xt_info_locks);
 	if (likely(!lock->readers++))
-		spin_lock(&lock->lock);
+		write_seqlock(&lock->lock);
 }
 
 static inline void xt_info_rdunlock_bh(void)
@@ -505,7 +505,7 @@ static inline void xt_info_rdunlock_bh(void)
 	struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
 
 	if (likely(!--lock->readers))
-		spin_unlock(&lock->lock);
+		write_sequnlock(&lock->lock);
 	local_bh_enable();
 }
 
@@ -516,12 +516,12 @@ static inline void xt_info_rdunlock_bh(void)
  */
 static inline void xt_info_wrlock(unsigned int cpu)
 {
-	spin_lock(&per_cpu(xt_info_locks, cpu).lock);
+	write_seqlock(&per_cpu(xt_info_locks, cpu).lock);
 }
 
 static inline void xt_info_wrunlock(unsigned int cpu)
 {
-	spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
+	write_sequnlock(&per_cpu(xt_info_locks, cpu).lock);
 }
 
 /*
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3fac340..e855fff 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -710,42 +710,25 @@ static void get_counters(const struct xt_table_info *t,
 	struct arpt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqbegin(lock);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqretry(lock, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -759,7 +742,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	 * about).
 	 */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1007,7 +990,7 @@ static int __do_replace(struct net *net, const char *name,
 	struct arpt_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a846d63..652efea 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -884,42 +884,25 @@ get_counters(const struct xt_table_info *t,
 	struct ipt_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU.
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqbegin(lock);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqretry(lock, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i; /* macro does multi eval of i */
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -932,7 +915,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1203,7 +1186,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ipt_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 4555823..7d227c6 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -897,42 +897,25 @@ get_counters(const struct xt_table_info *t,
 	struct ip6t_entry *iter;
 	unsigned int cpu;
 	unsigned int i;
-	unsigned int curcpu = get_cpu();
-
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 *
-	 * Bottom half has to be disabled to prevent deadlock
-	 * if new softirq were to run and call ipt_do_table
-	 */
-	local_bh_disable();
-	i = 0;
-	xt_entry_foreach(iter, t->entries[curcpu], t->size) {
-		SET_COUNTER(counters[i], iter->counters.bcnt,
-			    iter->counters.pcnt);
-		++i;
-	}
-	local_bh_enable();
-	/* Processing counters from other cpus, we can let bottom half enabled,
-	 * (preemption is disabled)
-	 */
 
 	for_each_possible_cpu(cpu) {
-		if (cpu == curcpu)
-			continue;
+		seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
 		i = 0;
-		local_bh_disable();
-		xt_info_wrlock(cpu);
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
-			ADD_COUNTER(counters[i], iter->counters.bcnt,
-				    iter->counters.pcnt);
+			u64 bcnt, pcnt;
+			unsigned int start;
+
+			do {
+				start = read_seqbegin(lock);
+				bcnt = iter->counters.bcnt;
+				pcnt = iter->counters.pcnt;
+			} while (read_seqretry(lock, start));
+
+			ADD_COUNTER(counters[i], bcnt, pcnt);
 			++i;
 		}
-		xt_info_wrunlock(cpu);
-		local_bh_enable();
 	}
-	put_cpu();
 }
 
 static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -945,7 +928,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vmalloc(countersize);
+	counters = vzalloc(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1216,7 +1199,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ip6t_entry *iter;
 
 	ret = 0;
-	counters = vmalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8046350..c942376 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1325,7 +1325,8 @@ static int __init xt_init(void)
 
 	for_each_possible_cpu(i) {
 		struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
-		spin_lock_init(&lock->lock);
+
+		seqlock_init(&lock->lock);
 		lock->readers = 0;
 	}
 



^ permalink raw reply related

* Re: [RFC PATCH 3/3] igb: example of how to update igb to make use of in-kernel Toeplitz hashing
From: David Miller @ 2010-12-18  5:09 UTC (permalink / raw)
  To: alexander.h.duyck; +Cc: netdev
In-Reply-To: <20101218010048.28602.49776.stgit@gitlad.jf.intel.com>

From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 17 Dec 2010 17:00:48 -0800

> @@ -1691,6 +1692,7 @@ static const struct net_device_ops igb_netdev_ops = {
>  	.ndo_open		= igb_open,
>  	.ndo_stop		= igb_close,
>  	.ndo_start_xmit		= igb_xmit_frame_adv,
> +	.ndo_select_queue	= toeplitz_select_queue,
>  	.ndo_get_stats64	= igb_get_stats64,
>  	.ndo_set_rx_mode	= igb_set_rx_mode,
>  	.ndo_set_multicast_list	= igb_set_rx_mode,

Adding a NETIF_F_TX_TOEPLITZ flag that skb_tx_hash() keys off of would
be a lot simpler.

We want less overriding of ->ndo_select_queue(), not more, and this case
is definitely gratuitous.

^ permalink raw reply

* Re: [PATCH 1/1] TCP: increase default initial receive window.
From: David Miller @ 2010-12-18  5:13 UTC (permalink / raw)
  To: nanditad; +Cc: netdev, therbert, chavey, ycheng
In-Reply-To: <1292642451-892-1-git-send-email-nanditad@google.com>

From: Nandita Dukkipati <nanditad@google.com>
Date: Fri, 17 Dec 2010 19:20:51 -0800

> This patch changes the default initial receive window to 10 mss
> (defined constant). The default window is limited to the maximum
> of 10*1460 and 2*mss (when mss > 1460).
> 
> Signed-off-by: Nandita Dukkipati <nanditad@google.com>

That's an incredibly terse explanation for a very non-trivial
change with very non-trivial implications.

What analysis have you performed to lead you to decide that this
was a reasonable change to make?  Where can people see that
analysis and look over it to see if they agree with your
assesment of the data?

We can't apply a patch like that without any form of analysis or
reasoning.

You don't say "why" you're doing this, and frankly that really
ticks me off.

^ permalink raw reply

* Re: [PATCH 1/1] TCP: increase default initial receive window.
From: David Miller @ 2010-12-18  5:14 UTC (permalink / raw)
  To: shemminger; +Cc: nanditad, netdev, therbert, chavey, ycheng
In-Reply-To: <20101217195917.33140980@nehalam>

From: Stephen Hemminger <shemminger@vyatta.com>
Date: Fri, 17 Dec 2010 19:59:17 -0800

> On Fri, 17 Dec 2010 19:20:51 -0800
> Nandita Dukkipati <nanditad@google.com> wrote:
> 
>> This patch changes the default initial receive window to 10 mss
>> (defined constant). The default window is limited to the maximum
>> of 10*1460 and 2*mss (when mss > 1460).
>> 
>> Signed-off-by: Nandita Dukkipati <nanditad@google.com>
> 
> This needs way more discussion because it makes Linux non-RFC
> breaking the RFC behavior should require explicit user override.
> I also wonder if the magic value 10 should be a sysctl.

He's changing the default receive window not the transmit
congestion window, so there is nothing RFC about this.

But yes he does have to explain himself why this change is
being made, in exhaustive detail.

^ permalink raw reply

* Re: [PATCH v2] kptr_restrict for hiding kernel pointers from unprivileged users
From: Dan Rosenberg @ 2010-12-18  5:22 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, linux-security-module, netdev, jmorris, eugeneteo,
	kees.cook, mingo, davem
In-Reply-To: <20101217172231.8842f5cc.akpm@linux-foundation.org>

On Fri, 2010-12-17 at 17:22 -0800, Andrew Morton wrote:
> On Fri, 17 Dec 2010 20:12:39 -0500
> Dan Rosenberg <drosenberg@vsecurity.com> wrote:
> 
> > > 
> > > So what's next?  We need to convert 1,000,000 %p callsites to use %pK? 
> > > That'll be fun.  Please consider adding a new checkpatch rule which
> > > detects %p and asks people whether they should have used %pK.
> > 
> > The goal of this format specifier is specifically for pointers that are
> > exposed to unprivileged users.  I agree that hiding all kernel pointers
> > would be nice, but I don't expect the angry masses to ever agree to
> > that.  For now, I'll isolate specific cases, especially in /proc, that
> > are clear risks in terms of information leakage.  I'll also be skipping
> > over pointers written to the syslog, since I think hiding that
> > information is dmesg_restrict's job.
> 
> Well...  some administrators may wish to hide the pointer values even
> for privileged callers.  That's a pretty trivial add-on for the code
> which you have, and means that those admins can also suppress the
> pointers for IRQ-time callers.  More /proc knobs :)
> 

I can add a "2" setting that hides %pK pointers regardless of privilege
level, which I agree is a useful option.  But because it would be built
into the same format specifier, you still couldn't use %pK in interrupt
context (in case the sysctl wasn't set to 2).

> Then again, perhaps those admins would be OK if we simply disabled
> plain old %p everywhere.  In which case we're looking at a separate
> patch, I suggest.  

I would be happy to do this from a security perspective, but I'd imagine
there's a pretty high risk of things breaking by doing such a sweeping
change.

-Dan


^ permalink raw reply

* Re: [RFC PATCH 3/3] igb: example of how to update igb to make use of in-kernel Toeplitz hashing
From: Alexander Duyck @ 2010-12-18  6:53 UTC (permalink / raw)
  To: David Miller; +Cc: alexander.h.duyck, netdev
In-Reply-To: <20101217.210923.193721885.davem@davemloft.net>

On Fri, Dec 17, 2010 at 9:09 PM, David Miller <davem@davemloft.net> wrote:
> From: Alexander Duyck <alexander.h.duyck@intel.com>
> Date: Fri, 17 Dec 2010 17:00:48 -0800
>
>> @@ -1691,6 +1692,7 @@ static const struct net_device_ops igb_netdev_ops = {
>>       .ndo_open               = igb_open,
>>       .ndo_stop               = igb_close,
>>       .ndo_start_xmit         = igb_xmit_frame_adv,
>> +     .ndo_select_queue       = toeplitz_select_queue,
>>       .ndo_get_stats64        = igb_get_stats64,
>>       .ndo_set_rx_mode        = igb_set_rx_mode,
>>       .ndo_set_multicast_list = igb_set_rx_mode,
>
> Adding a NETIF_F_TX_TOEPLITZ flag that skb_tx_hash() keys off of would
> be a lot simpler

Thats probably true.  I hadn't put much thought into it at the time.

> We want less overriding of ->ndo_select_queue(), not more, and this case
> is definitely gratuitous.

I kind of figured that was the approach was flawed.  As I said in the
cover page, this is a somewhat hastily thrown together set of patches
to demonstrate a proof of concept.  To me this appeared to be the
quickest way to come up with the means to alter the queue selection.
The main piece I am interested in is any input on the first patch that
added the Toeplitz hash functionality.  Specifically would a 16 bit
key/hash work for queue selection or would there be cases where we
need a 32 bit key/hash result?

Thanks,

Alex

^ permalink raw reply

* Re: [PATCH] netlink: fix gcc -Wconversion compilation warning
From: Eric W. Biederman @ 2010-12-18  6:58 UTC (permalink / raw)
  To: David Miller; +Cc: kirill, netdev, pablo, ldv, linux-kernel
In-Reply-To: <20101217.120234.59681524.davem@davemloft.net>

David Miller <davem@davemloft.net> writes:

> From: ebiederm@xmission.com (Eric W. Biederman)
> Date: Mon, 13 Dec 2010 13:35:25 -0800
>
>> "Kirill A. Shutsemov" <kirill@shutemov.name> writes:
>> 
>>> From: Dmitry V. Levin <ldv@altlinux.org>
>>>
>>> $ cat << EOF | gcc -Wconversion -xc -S -o/dev/null -
>>> unsigned f(void) {return NLMSG_HDRLEN;}
>>> EOF
>>> <stdin>: In function 'f':
>>> <stdin>:3:26: warning: negative integer implicitly converted to unsigned type
>>>
>> This doesn't look like a bad fix, but I believe things will fail if
>> we give NLMSG_ALIGN an unsigned long like size_t.  Say like sizeof.
>
> What are you talking about? That's exactly his test case,
> look at what NLMSG_HDRLEN is defined to, it's exactly the
> case you're worried "will fail", it passes sizeof() to
> NLMSG_ALIGN.
>
> I think I'll apply Kirill's original patch, it's good enough
> and simpler.

Probably.  The case I was worried about was masks that become
0xffffffxx on 64bit instead of 0xffffffffffffxx.  Especially
when mixing those with ints.

It is possible to get some really weird things.

Eric

^ permalink raw reply

* Re: [RFC PATCH 3/3] igb: example of how to update igb to make use of in-kernel Toeplitz hashing
From: David Miller @ 2010-12-18  6:59 UTC (permalink / raw)
  To: alexander.duyck; +Cc: alexander.h.duyck, netdev
In-Reply-To: <AANLkTimuXo+ZJKmz4U38mLd8x-KymJ0BQxTTjDT2bjgy@mail.gmail.com>

From: Alexander Duyck <alexander.duyck@gmail.com>
Date: Fri, 17 Dec 2010 22:53:24 -0800

> Specifically would a 16 bit key/hash work for queue selection or
> would there be cases where we need a 32 bit key/hash result?

Currently for TX 16-bit ought to be enough.

^ permalink raw reply

* Re: [PATCH 1/1] TCP: increase default initial receive window.
From: Nandita Dukkipati @ 2010-12-18  9:08 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, therbert, chavey, ycheng
In-Reply-To: <20101217.211309.226761639.davem@davemloft.net>

resend in plain text.

On Fri, Dec 17, 2010 at 9:13 PM, David Miller <davem@davemloft.net> wrote:
> From: Nandita Dukkipati <nanditad@google.com>
> Date: Fri, 17 Dec 2010 19:20:51 -0800
>
>> This patch changes the default initial receive window to 10 mss
>> (defined constant). The default window is limited to the maximum
>> of 10*1460 and 2*mss (when mss > 1460).
>>
>> Signed-off-by: Nandita Dukkipati <nanditad@google.com>
>
> That's an incredibly terse explanation for a very non-trivial
> change with very non-trivial implications.
>
> What analysis have you performed to lead you to decide that this
> was a reasonable change to make?  Where can people see that
> analysis and look over it to see if they agree with your
> assesment of the data?

Apologies for the terse comment. Here's a longer explanation.

Background:
A recent proposal to the IETF [Ref: 5] recommends increasing TCP's
initial congestion window to 10 mss or about 15KB. This proposal,
backed with data from several large-scale live experiments as well as
controlled testbed experiments, is under active discussion in the TCPM
working group.

Analysis performed:
Leading up to this proposal were several large-scale Internet
experiments [Ref: 2] with an initial congestion window of 10 mss
(IW10), where we showed that the average latency of HTTP responses
improved by approximately 10%. This was accompanied by a slight
increase in retransmission rate (0.5%), most of which is coming from
applications opening multiple simultaneous connections. To understand
the extreme
worst case scenarios, as well as fairness issues with IW10 versus IW3
traffic, we further conducted controlled testbed experiments
(end-hosts are all Linux based). We came away finding minimal negative
impact even under low link bandwidths (dial-ups) and small buffers
[Ref: 3]. These results are extremely encouraging to adopting IW10.

But obviously, an initial congestion window of 10 mss is useless
unless a TCP receiver advertises an initial receive window of at least
10 mss. Fortunately, in the large-scale Internet experiments we found
that most of the operating systems advertised a large enough initial
receive window, allowing us to experiment with various values of
initial congestion windows. Linux systems were among the few
exceptions that advertised a small receive window. This patch intends
to fix that.

References:

1. This site has a comprehensive list of all IW10 references to date.
http://code.google.com/speed/protocols/tcpm-IW10.html

2. Paper describing results from large-scale Internet experiments with IW10.
http://ccr.sigcomm.org/drupal/?q=node/621

3. Controlled testbed experiments with IW10 under worst case scenarios
http://www.ietf.org/proceedings/79/slides/tcpm-0.pdf

4. Raw test data from testbed experiments (Linux senders/receivers)
with initial congestion window and initial receive window of 10 mss.
http://research.csc.ncsu.edu/netsrv/?q=content/iw10

5. Internet-Draft. Increasing TCP's Initial Window.
https://datatracker.ietf.org/doc/draft-ietf-tcpm-initcwnd/

^ permalink raw reply

* [PATCH 1/2] net: phy: balance disable/enable irq on change
From: Jean-Michel Hautbois @ 2010-12-18 15:55 UTC (permalink / raw)
  To: davem
  Cc: richard.cochran, shemminger, tj, randy.dunlap, netdev,
	linux-kernel, Jean-Michel Hautbois

When phy interface changes its status, it calls phy_change() function.
This function calls the interrupt disabling functions for the driver registered, but if this driver doesn't implement it, there is no IRQ disabling. After doing the work, we call enable_irq and not the respective driver function. This fixes it, as it could lead to an unbalanced IRQ.

Signed-off-by: Jean-Michel Hautbois <jhautbois@gmail.com>
---
 drivers/net/phy/phy.c |   12 ++++++++++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 7670aac..b28f2ac 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -89,7 +89,8 @@ static int phy_config_interrupt(struct phy_device *phydev, u32 interrupts)
 	phydev->interrupts = interrupts;
 	if (phydev->drv->config_intr)
 		err = phydev->drv->config_intr(phydev);
-
+	else
+		err = -ENOSYS;
 	return err;
 }
 
@@ -541,6 +542,10 @@ static int phy_enable_interrupts(struct phy_device *phydev)
 		return err;
 
 	err = phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED);
+	if (err == -ENOSYS) {
+		err = 0;
+		enable_irq(phydev->irq);
+	}
 
 	return err;
 }
@@ -556,7 +561,10 @@ static int phy_disable_interrupts(struct phy_device *phydev)
 	/* Disable PHY interrupts */
 	err = phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED);
 
-	if (err)
+	if (err == -ENOSYS) {
+		err = 0;
+		disable_irq(phydev->irq);
+	} else if (err != 0)
 		goto phy_err;
 
 	/* Clear the interrupt */
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH 2/2] net: phy: Fixed some checkpatch errors
From: Jean-Michel Hautbois @ 2010-12-18 15:55 UTC (permalink / raw)
  To: davem
  Cc: richard.cochran, shemminger, tj, randy.dunlap, netdev,
	linux-kernel, Jean-Michel Hautbois
In-Reply-To: <1292687723-31981-1-git-send-email-jhautbois@gmail.com>

Fixes some coding style issues (errors and warnings).

Signed-off-by: Jean-Michel Hautbois <jhautbois@gmail.com>
---
 drivers/net/phy/phy.c |   20 ++++++++++----------
 1 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index b28f2ac..69758b4 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -33,10 +33,10 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 
-#include <asm/atomic.h>
-#include <asm/io.h>
+#include <linux/atomic.h>
+#include <linux/io.h>
 #include <asm/irq.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 /**
  * phy_print_status - Convenience function to print out the current phy status
@@ -47,11 +47,11 @@ void phy_print_status(struct phy_device *phydev)
 	pr_info("PHY: %s - Link is %s", dev_name(&phydev->dev),
 			phydev->link ? "Up" : "Down");
 	if (phydev->link)
-		printk(" - %d/%s", phydev->speed,
+		pr_info(" - %d/%s", phydev->speed,
 				DUPLEX_FULL == phydev->duplex ?
 				"Full" : "Half");
 
-	printk("\n");
+	pr_info("\n");
 }
 EXPORT_SYMBOL(phy_print_status);
 
@@ -325,7 +325,7 @@ int phy_mii_ioctl(struct phy_device *phydev,
 
 	case SIOCSMIIREG:
 		if (mii_data->phy_id == phydev->addr) {
-			switch(mii_data->reg_num) {
+			switch (mii_data->reg_num) {
 			case MII_BMCR:
 				if ((val & (BMCR_RESET|BMCR_ANENABLE)) == 0)
 					phydev->autoneg = AUTONEG_DISABLE;
@@ -352,7 +352,7 @@ int phy_mii_ioctl(struct phy_device *phydev,
 		}
 
 		phy_write(phydev, mii_data->reg_num, val);
-		
+
 		if (mii_data->reg_num == MII_BMCR &&
 		    val & BMCR_RESET &&
 		    phydev->drv->config_init) {
@@ -471,7 +471,7 @@ static void phy_force_reduction(struct phy_device *phydev)
 	int idx;
 
 	idx = phy_find_setting(phydev->speed, phydev->duplex);
-	
+
 	idx++;
 
 	idx = phy_find_valid(idx, phydev->supported);
@@ -732,6 +732,7 @@ out_unlock:
 	 * will not reenable interrupts.
 	 */
 }
+EXPORT_SYMBOL(phy_stop);
 
 
 /**
@@ -762,7 +763,6 @@ void phy_start(struct phy_device *phydev)
 	}
 	mutex_unlock(&phydev->lock);
 }
-EXPORT_SYMBOL(phy_stop);
 EXPORT_SYMBOL(phy_start);
 
 /**
@@ -782,7 +782,7 @@ void phy_state_machine(struct work_struct *work)
 	if (phydev->adjust_state)
 		phydev->adjust_state(phydev->attached_dev);
 
-	switch(phydev->state) {
+	switch (phydev->state) {
 		case PHY_DOWN:
 		case PHY_STARTING:
 		case PHY_READY:
-- 
1.7.0.4

^ permalink raw reply related

* Re: [PATCH 1/2] net: phy: balance disable/enable irq on change
From: Ben Hutchings @ 2010-12-18 16:47 UTC (permalink / raw)
  To: Jean-Michel Hautbois
  Cc: davem, richard.cochran, shemminger, tj, randy.dunlap, netdev,
	linux-kernel
In-Reply-To: <1292687723-31981-1-git-send-email-jhautbois@gmail.com>

On Sat, 2010-12-18 at 16:55 +0100, Jean-Michel Hautbois wrote:
> When phy interface changes its status, it calls phy_change() function.
> This function calls the interrupt disabling functions for the driver registered, but if this driver doesn't implement it, there is no IRQ disabling. After doing the work, we call enable_irq and not the respective driver function. This fixes it, as it could lead to an unbalanced IRQ.
> 
> Signed-off-by: Jean-Michel Hautbois <jhautbois@gmail.com>
> ---
>  drivers/net/phy/phy.c |   12 ++++++++++--
>  1 files changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
> index 7670aac..b28f2ac 100644
> --- a/drivers/net/phy/phy.c
> +++ b/drivers/net/phy/phy.c
> @@ -89,7 +89,8 @@ static int phy_config_interrupt(struct phy_device *phydev, u32 interrupts)
>  	phydev->interrupts = interrupts;
>  	if (phydev->drv->config_intr)
>  		err = phydev->drv->config_intr(phydev);
> -
> +	else
> +		err = -ENOSYS;
[...]

ENOSYS means missing system call.  Perhaps EOPNOTSUPP is the appropriate
error code.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* [PATCH v3] kptr_restrict for hiding kernel pointers
From: Dan Rosenberg @ 2010-12-18 17:20 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-security-module
  Cc: jmorris, eric.dumazet, tgraf, eugeneteo, kees.cook, mingo, davem,
	a.p.zijlstra, akpm

Add the %pK printk format specifier and
the /proc/sys/kernel/kptr_restrict sysctl.

The %pK format specifier is designed to hide exposed kernel pointers,
specifically via /proc interfaces.  Exposing these pointers provides an
easy target for kernel write vulnerabilities, since they reveal the
locations of writable structures containing easily triggerable function
pointers.  The behavior of %pK depends on the kptr_restrict sysctl.

If kptr_restrict is set to 0, no deviation from the standard %p behavior
occurs.  If kptr_restrict is set to 1, if the current user (intended to
be a reader via seq_printf(), etc.) does not have CAP_SYSLOG (which is
currently in the LSM tree), kernel pointers using %pK are printed as
0's.  If kptr_restrict is set to 2, kernel pointers using %pK are
printed as 0's regardless of privileges.  Replacing with 0's was chosen
over the default "(null)", which cannot be parsed by userland %p, which
expects "(nil)".


v3 adds the "2" setting, cleans up documentation, removes the CONFIG,
and incorporates changes and suggestions from Andrew Morton.

v2 improves checking for inappropriate context, on suggestion by Peter
Zijlstra.  Thanks to Thomas Graf for suggesting use of a centralized
format specifier.


Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com>
Cc: James Morris <jmorris@namei.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Thomas Graf <tgraf@infradead.org>
Cc: Eugene Teo <eugeneteo@kernel.org>
Cc: Kees Cook <kees.cook@canonical.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David S. Miller <davem@davemloft.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
---

 Documentation/sysctl/kernel.txt |   14 ++++++++++++++
 include/linux/kernel.h          |    2 ++
 kernel/sysctl.c                 |    9 +++++++++
 lib/vsprintf.c                  |   23 +++++++++++++++++++++++
 4 files changed, 48 insertions(+), 0 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 209e158..3cff47e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -34,6 +34,7 @@ show up in /proc/sys/kernel:
 - hotplug
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
+- kptr_restrict
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
 - modprobe                    ==> Documentation/debugging-modules.txt
@@ -261,6 +262,19 @@ This flag controls the L2 cache of G3 processor boards. If
 
 ==============================================================
 
+kptr_restrict:
+
+This toggle indicates whether restrictions are placed on
+exposing kernel addresses via /proc and other interfaces.  When
+kptr_restrict is set to (0), the default, there are no
+restrictions.  When kptr_restrict is set to (1), kernel pointers
+printed using the %pK format specifier will be replaced with 0's
+unless the user has CAP_SYSLOG.  When kptr_restrict is set to
+(2), kernel pointers printed using %pK will be replaced with 0's
+regardless of privileges.
+
+==============================================================
+
 kstack_depth_to_print: (X86 only)
 
 Controls the number of words to print when dumping the raw
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index b6de9a6..b4f4863 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -201,6 +201,8 @@ extern int sscanf(const char *, const char *, ...)
 extern int vsscanf(const char *, const char *, va_list)
 	__attribute__ ((format (scanf, 2, 0)));
 
+extern int kptr_restrict;	/* for sysctl */
+
 extern int get_option(char **str, int *pint);
 extern char *get_options(const char *str, int nints, int *ints);
 extern unsigned long long memparse(const char *ptr, char **retptr);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abfa15..236fa91 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -713,6 +713,15 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 	{
+		.procname	= "kptr_restrict",
+		.data		= &kptr_restrict,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &two,
+	},
+	{
 		.procname	= "ngroups_max",
 		.data		= &ngroups_max,
 		.maxlen		= sizeof (int),
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index c150d3d..b116aeb 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -936,6 +936,8 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
 	return string(buf, end, uuid, spec);
 }
 
+int kptr_restrict;
+
 /*
  * Show a '%p' thing.  A kernel extension is that the '%p' is followed
  * by an extra set of alphanumeric characters that are extended format
@@ -979,6 +981,7 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
  *       Implements a "recursive vsnprintf".
  *       Do not use this feature without some mechanism to verify the
  *       correctness of the format string and va_list arguments.
+ * - 'K' For a kernel pointer that should be hidden from unprivileged users
  *
  * Note: The difference between 'S' and 'F' is that on ia64 and ppc64
  * function pointers are really function descriptors, which contain a
@@ -1035,6 +1038,26 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
 		return buf + vsnprintf(buf, end - buf,
 				       ((struct va_format *)ptr)->fmt,
 				       *(((struct va_format *)ptr)->va));
+	case 'K':
+		/*
+		 * %pK cannot be used in IRQ context because it tests
+		 * CAP_SYSLOG.
+		 */
+		if (in_irq() || in_serving_softirq() || in_nmi())
+			WARN_ONCE(1, "%%pK used in interrupt context.\n");
+
+		if (!kptr_restrict)
+			break;		/* %pK does not obscure pointers */
+
+		if ((kptr_restrict != 2) && capable(CAP_SYSLOG))
+			break;		/* privileged apps expose pointers,
+					   unless kptr_restrict is 2 */
+
+		if (spec.field_width == -1) {
+			spec.field_width = 2 * sizeof(void *);
+			spec.flags |= ZEROPAD;
+		}
+		return number(buf, end, 0, spec);
 	}
 	spec.flags |= SMALL;
 	if (spec.field_width == -1) {



^ permalink raw reply related

* [PATCH net-next] netfilter: x_table: speedup compat operations
From: Eric Dumazet @ 2010-12-18 17:35 UTC (permalink / raw)
  To: Patrick McHardy, David Miller; +Cc: Netfilter Development Mailinglist, netdev

One iptables invocation with 135000 rules takes 35 seconds of cpu time
on a recent server, using a 32bit distro and a 64bit kernel.

We eventually trigger NMI/RCU watchdog.

INFO: rcu_sched_state detected stall on CPU 3 (t=6000 jiffies)

COMPAT mode has quadratic behavior and consume 16 bytes of memory per
rule.

Switch the xt_compat algos to use an array instead of list, and use a
binary search to locate an offset in the sorted array.

This halves memory need (8 bytes per rule), and removes quadratic
behavior [ O(N*N) -> O(N*log2(N)) ]

Time of iptables goes from 35 s to 150 ms.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 include/linux/netfilter/x_tables.h |    3 
 net/bridge/netfilter/ebtables.c    |    1 
 net/ipv4/netfilter/arp_tables.c    |    2 
 net/ipv4/netfilter/ip_tables.c     |    2 
 net/ipv6/netfilter/ip6_tables.c    |    2 
 net/netfilter/x_tables.c           |   82 +++++++++++++++------------
 6 files changed, 57 insertions(+), 35 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 742bec0..0f04d98 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -611,8 +611,9 @@ struct _compat_xt_align {
 extern void xt_compat_lock(u_int8_t af);
 extern void xt_compat_unlock(u_int8_t af);
 
-extern int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta);
+extern int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta);
 extern void xt_compat_flush_offsets(u_int8_t af);
+extern void xt_compat_init_offsets(u_int8_t af, unsigned int number);
 extern int xt_compat_calc_jump(u_int8_t af, unsigned int offset);
 
 extern int xt_compat_match_offset(const struct xt_match *match);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index cbc9f39..2bf2fb2 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1764,6 +1764,7 @@ static int compat_table_info(const struct ebt_table_info *info,
 
 	newinfo->entries_size = size;
 
+	xt_compat_init_offsets(AF_INET, info->nentries);
 	return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info,
 							entries, newinfo);
 }
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3fac340..47e5178 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -883,6 +883,7 @@ static int compat_table_info(const struct xt_table_info *info,
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
 	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	xt_compat_init_offsets(NFPROTO_ARP, info->number);
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
 		if (ret != 0)
@@ -1350,6 +1351,7 @@ static int translate_compat_table(const char *name,
 	duprintf("translate_compat_table: size %u\n", info->size);
 	j = 0;
 	xt_compat_lock(NFPROTO_ARP);
+	xt_compat_init_offsets(NFPROTO_ARP, number);
 	/* Walk through entries, checking offsets. */
 	xt_entry_foreach(iter0, entry0, total_size) {
 		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a846d63..c5a75d7 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1080,6 +1080,7 @@ static int compat_table_info(const struct xt_table_info *info,
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
 	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	xt_compat_init_offsets(AF_INET, info->number);
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
 		if (ret != 0)
@@ -1681,6 +1682,7 @@ translate_compat_table(struct net *net,
 	duprintf("translate_compat_table: size %u\n", info->size);
 	j = 0;
 	xt_compat_lock(AF_INET);
+	xt_compat_init_offsets(AF_INET, number);
 	/* Walk through entries, checking offsets. */
 	xt_entry_foreach(iter0, entry0, total_size) {
 		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 4555823..0c9973a 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1093,6 +1093,7 @@ static int compat_table_info(const struct xt_table_info *info,
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
 	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	xt_compat_init_offsets(AF_INET6, info->number);
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
 		if (ret != 0)
@@ -1696,6 +1697,7 @@ translate_compat_table(struct net *net,
 	duprintf("translate_compat_table: size %u\n", info->size);
 	j = 0;
 	xt_compat_lock(AF_INET6);
+	xt_compat_init_offsets(AF_INET6, number);
 	/* Walk through entries, checking offsets. */
 	xt_entry_foreach(iter0, entry0, total_size) {
 		ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8046350..75182ed 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -38,9 +38,8 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
 #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
 
 struct compat_delta {
-	struct compat_delta *next;
-	unsigned int offset;
-	int delta;
+	unsigned int offset; /* offset in kernel */
+	int delta; /* delta in 32bit user land */
 };
 
 struct xt_af {
@@ -49,7 +48,9 @@ struct xt_af {
 	struct list_head target;
 #ifdef CONFIG_COMPAT
 	struct mutex compat_mutex;
-	struct compat_delta *compat_offsets;
+	struct compat_delta *compat_tab;
+	unsigned int number; /* number of slots in compat_tab[] */
+	unsigned int cur; /* number of used slots in compat_tab[] */
 #endif
 };
 
@@ -414,54 +415,67 @@ int xt_check_match(struct xt_mtchk_param *par,
 EXPORT_SYMBOL_GPL(xt_check_match);
 
 #ifdef CONFIG_COMPAT
-int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta)
+int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
 {
-	struct compat_delta *tmp;
+	struct xt_af *xp = &xt[af];
 
-	tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL);
-	if (!tmp)
-		return -ENOMEM;
+	if (!xp->compat_tab) {
+		if (!xp->number)
+			return -EINVAL;
+		xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number);
+		if (!xp->compat_tab)
+			return -ENOMEM;
+		xp->cur = 0;
+	}
 
-	tmp->offset = offset;
-	tmp->delta = delta;
+	if (xp->cur >= xp->number)
+		return -EINVAL;
 
-	if (xt[af].compat_offsets) {
-		tmp->next = xt[af].compat_offsets->next;
-		xt[af].compat_offsets->next = tmp;
-	} else {
-		xt[af].compat_offsets = tmp;
-		tmp->next = NULL;
-	}
+	if (xp->cur)
+		delta += xp->compat_tab[xp->cur - 1].delta;
+	xp->compat_tab[xp->cur].offset = offset;
+	xp->compat_tab[xp->cur].delta = delta;
+	xp->cur++;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(xt_compat_add_offset);
 
 void xt_compat_flush_offsets(u_int8_t af)
 {
-	struct compat_delta *tmp, *next;
-
-	if (xt[af].compat_offsets) {
-		for (tmp = xt[af].compat_offsets; tmp; tmp = next) {
-			next = tmp->next;
-			kfree(tmp);
-		}
-		xt[af].compat_offsets = NULL;
+	if (xt[af].compat_tab) {
+		vfree(xt[af].compat_tab);
+		xt[af].compat_tab = NULL;
+		xt[af].number = 0;
 	}
 }
 EXPORT_SYMBOL_GPL(xt_compat_flush_offsets);
 
 int xt_compat_calc_jump(u_int8_t af, unsigned int offset)
 {
-	struct compat_delta *tmp;
-	int delta;
-
-	for (tmp = xt[af].compat_offsets, delta = 0; tmp; tmp = tmp->next)
-		if (tmp->offset < offset)
-			delta += tmp->delta;
-	return delta;
+	struct compat_delta *tmp = xt[af].compat_tab;
+	int mid, left = 0, right = xt[af].cur - 1;
+
+	while (left <= right) {
+		mid = (left + right) >> 1;
+		if (offset > tmp[mid].offset)
+			left = mid + 1;
+		else if (offset < tmp[mid].offset)
+			right = mid - 1;
+		else
+			return mid ? tmp[mid - 1].delta : 0;
+	}
+	WARN_ON_ONCE(1);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(xt_compat_calc_jump);
 
+void xt_compat_init_offsets(u_int8_t af, unsigned int number)
+{
+	xt[af].number = number;
+	xt[af].cur = 0;
+}
+EXPORT_SYMBOL(xt_compat_init_offsets);
+
 int xt_compat_match_offset(const struct xt_match *match)
 {
 	u_int16_t csize = match->compatsize ? : match->matchsize;
@@ -1337,7 +1351,7 @@ static int __init xt_init(void)
 		mutex_init(&xt[i].mutex);
 #ifdef CONFIG_COMPAT
 		mutex_init(&xt[i].compat_mutex);
-		xt[i].compat_offsets = NULL;
+		xt[i].compat_tab = NULL;
 #endif
 		INIT_LIST_HEAD(&xt[i].target);
 		INIT_LIST_HEAD(&xt[i].match);



^ permalink raw reply related

* e1000e crash with 82574L  2.6.37-0.rc5
From: Brian Neu @ 2010-12-18 18:12 UTC (permalink / raw)
  To: netdev

Is this a known bug?
Any fix out there?

[162536.704103] WARNING: at net/sched/sch_generic.c:258  
dev_watchdog+0x111/0x185()  [162536.704110] Hardware name: H8SGL  
[162536.704116] NETDEV WATCHDOG: eth0 (e1000e): transmit queue 0 timed out  
[162536.704122] Modules linked in: ip6table_filter ebtable_nat ebtables  
act_police cls_flow cls_fw cls_u32 sch_htb sch_hfsc sch_ingress sch_sfq bridge  
stp llc xt_time xt_connlimit xt_realm iptable_raw xt_comment xt_recent  
xt_policy ipt_ULOG ipt_REDIRECT ipt_NETMAP ipt_MASQUERADE ipt_ECN ipt_ecn  
ipt_CLUSTERIP ipt_ah ipt_addrtype nf_nat_tftp nf_nat_snmp_basic nf_nat_sip  
nf_nat_pptp nf_nat_proto_gre nf_nat_irc nf_nat_h323 nf_nat_ftp nf_nat_amanda  
ts_kmp nf_conntrack_amanda nf_conntrack_sane nf_conntrack_tftp nf_conntrack_sip  
nf_conntrack_proto_sctp nf_conntrack_pptp nf_conntrack_proto_gre  
nf_conntrack_netlink nf_conntrack_netbios_ns nf_conntrack_irc nf_conntrack_h323  
nf_conntrack_ftp xt_TPROXY nf_tproxy_core ip6_tables nf_defrag_ipv6 xt_tcpmss  
xt_pkttype xt_physdev xt_owner xt_NFQUEUE xt_NFLOG nfnetlink_log xt_multiport  
xt_mark xt_mac xt_limit xt_length xt_iprange xt_helper xt_hashlimit xt_DSCP  
xt_dscp xt_dccp xt_connmark xt_CLASSIFY ipt_LOG iptable_nat nf_nat  
iptable_mangle nfnetlink  fuse tun sunrpc cpufreq_ondemand powernow_k8 
freq_table mperf ipv6 kvm_amd kvm  uinput amd64_edac_mod edac_core e1000e 
edac_mce_amd i2c_piix4 i2c_core k10temp  ghes hed joydev microcode btrfs 
zlib_deflate libcrc32c raid1 pata_acpi  ata_generic sata_promise pata_atiixp 
sata_sil24 uas usb_storage [last unloaded:  scsi_wait_scan]  [162536.704368] 
Pid: 0, comm: kworker/0:0 Not tainted  2.6.37-0.rc5.git2.1.fc13.x86_64 #1  
[162536.704375] Call Trace:  [162536.704380]  <IRQ>  [<ffffffff81052e29>] 
warn_slowpath_common+0x85/0x9d  [162536.704406]  [<ffffffff81052ee4>] 
warn_slowpath_fmt+0x46/0x48  [162536.704418]  [<ffffffff81419f70>] 
dev_watchdog+0x111/0x185  [162536.704430]  [<ffffffff8105fdcf>] 
run_timer_softirq+0x246/0x33f  [162536.704441]  [<ffffffff8105fd29>] ? 
run_timer_softirq+0x1a0/0x33f  [162536.704452]  [<ffffffff81419e5f>] ? 
dev_watchdog+0x0/0x185  [162536.704463]  [<ffffffff81059746>] 
__do_softirq+0x101/0x20c  [162536.704474]  [<ffffffff81011aa6>] ? 
native_sched_clock+0x2d/0x5f  [162536.704484]  [<ffffffff8100bc1c>] 
call_softirq+0x1c/0x30  [162536.704493]  [<ffffffff8100d29f>] 
do_softirq+0x4b/0xa3  [162536.704502]  [<ffffffff81059480>] irq_exit+0x57/0x9b  
[162536.704514]  [<ffffffff814bf93c>] smp_apic_timer_interrupt+0x8d/0x9b  
[162536.704523]  [<ffffffff8100b6d3>] apic_timer_interrupt+0x13/0x20  
[162536.704529]  <EOI>  [<ffffffff81012686>] ? default_idle+0x3e/0x65  
[162536.704549]  [<ffffffff8102d159>] ? native_safe_halt+0xb/0xd  
[162536.704560]  [<ffffffff810816ee>] ? trace_hardirqs_on+0xd/0xf  
[162536.704571]  [<ffffffff8101268b>] default_idle+0x43/0x65  [162536.704581]  
[<ffffffff81009c81>] cpu_idle+0xbe/0x132  [162536.704592]  [<ffffffff814af9ea>] 
start_secondary+0x242/0x244


^ permalink raw reply

* Re: [PATCH v3] kptr_restrict for hiding kernel pointers
From: Kees Cook @ 2010-12-18 18:27 UTC (permalink / raw)
  To: Dan Rosenberg
  Cc: linux-kernel, netdev, linux-security-module, jmorris,
	eric.dumazet, tgraf, eugeneteo, mingo, davem, a.p.zijlstra, akpm
In-Reply-To: <1292692835.10804.67.camel@dan>

On Sat, Dec 18, 2010 at 12:20:34PM -0500, Dan Rosenberg wrote:
> v3 adds the "2" setting, cleans up documentation, removes the CONFIG,
> and incorporates changes and suggestions from Andrew Morton.

Acked-by: Kees Cook <kees.cook@canonical.com>

-- 
Kees Cook
Ubuntu Security Team

^ permalink raw reply

* [PATCH 1/2] net: phy: balance disable/enable irq on change
From: Jean-Michel Hautbois @ 2010-12-18 18:38 UTC (permalink / raw)
  To: davem
  Cc: richard.cochran, shemminger, tj, randy.dunlap, netdev,
	linux-kernel, Jean-Michel Hautbois

When phy interface changes its status, it calls phy_change() function.
This function calls the interrupt disabling functions for the driver registered, but if this driver doesn't implement it, there is no IRQ disabling. After doing the work, we call enable_irq and not the respective driver function. This fixes it, as it could lead to an unbalanced IRQ.

Signed-off-by: Jean-Michel Hautbois <jhautbois@gmail.com>
---
 drivers/net/phy/phy.c |   12 ++++++++++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 7670aac..b28f2ac 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -89,7 +89,8 @@ static int phy_config_interrupt(struct phy_device *phydev, u32 interrupts)
 	phydev->interrupts = interrupts;
 	if (phydev->drv->config_intr)
 		err = phydev->drv->config_intr(phydev);
-
+	else
+		err = -ENOSYS;
 	return err;
 }
 
@@ -541,6 +542,10 @@ static int phy_enable_interrupts(struct phy_device *phydev)
 		return err;
 
 	err = phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED);
+	if (err == -ENOSYS) {
+		err = 0;
+		enable_irq(phydev->irq);
+	}
 
 	return err;
 }
@@ -556,7 +561,10 @@ static int phy_disable_interrupts(struct phy_device *phydev)
 	/* Disable PHY interrupts */
 	err = phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED);
 
-	if (err)
+	if (err == -ENOSYS) {
+		err = 0;
+		disable_irq(phydev->irq);
+	} else if (err != 0)
 		goto phy_err;
 
 	/* Clear the interrupt */
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH 2/2] net: phy: Fixed some checkpatch errors
From: Jean-Michel Hautbois @ 2010-12-18 18:39 UTC (permalink / raw)
  To: davem
  Cc: richard.cochran, shemminger, tj, randy.dunlap, netdev,
	linux-kernel, Jean-Michel Hautbois
In-Reply-To: <1292697540-2723-1-git-send-email-jhautbois@gmail.com>

Fixes some coding style issues (errors and warnings).

Signed-off-by: Jean-Michel Hautbois <jhautbois@gmail.com>
---
 drivers/net/phy/phy.c |   20 ++++++++++----------
 1 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index b28f2ac..69758b4 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -33,10 +33,10 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 
-#include <asm/atomic.h>
-#include <asm/io.h>
+#include <linux/atomic.h>
+#include <linux/io.h>
 #include <asm/irq.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 /**
  * phy_print_status - Convenience function to print out the current phy status
@@ -47,11 +47,11 @@ void phy_print_status(struct phy_device *phydev)
 	pr_info("PHY: %s - Link is %s", dev_name(&phydev->dev),
 			phydev->link ? "Up" : "Down");
 	if (phydev->link)
-		printk(" - %d/%s", phydev->speed,
+		pr_info(" - %d/%s", phydev->speed,
 				DUPLEX_FULL == phydev->duplex ?
 				"Full" : "Half");
 
-	printk("\n");
+	pr_info("\n");
 }
 EXPORT_SYMBOL(phy_print_status);
 
@@ -325,7 +325,7 @@ int phy_mii_ioctl(struct phy_device *phydev,
 
 	case SIOCSMIIREG:
 		if (mii_data->phy_id == phydev->addr) {
-			switch(mii_data->reg_num) {
+			switch (mii_data->reg_num) {
 			case MII_BMCR:
 				if ((val & (BMCR_RESET|BMCR_ANENABLE)) == 0)
 					phydev->autoneg = AUTONEG_DISABLE;
@@ -352,7 +352,7 @@ int phy_mii_ioctl(struct phy_device *phydev,
 		}
 
 		phy_write(phydev, mii_data->reg_num, val);
-		
+
 		if (mii_data->reg_num == MII_BMCR &&
 		    val & BMCR_RESET &&
 		    phydev->drv->config_init) {
@@ -471,7 +471,7 @@ static void phy_force_reduction(struct phy_device *phydev)
 	int idx;
 
 	idx = phy_find_setting(phydev->speed, phydev->duplex);
-	
+
 	idx++;
 
 	idx = phy_find_valid(idx, phydev->supported);
@@ -732,6 +732,7 @@ out_unlock:
 	 * will not reenable interrupts.
 	 */
 }
+EXPORT_SYMBOL(phy_stop);
 
 
 /**
@@ -762,7 +763,6 @@ void phy_start(struct phy_device *phydev)
 	}
 	mutex_unlock(&phydev->lock);
 }
-EXPORT_SYMBOL(phy_stop);
 EXPORT_SYMBOL(phy_start);
 
 /**
@@ -782,7 +782,7 @@ void phy_state_machine(struct work_struct *work)
 	if (phydev->adjust_state)
 		phydev->adjust_state(phydev->attached_dev);
 
-	switch(phydev->state) {
+	switch (phydev->state) {
 		case PHY_DOWN:
 		case PHY_STARTING:
 		case PHY_READY:
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH 1/2] net: phy: balance disable/enable irq on change
From: Jean-Michel Hautbois @ 2010-12-18 18:41 UTC (permalink / raw)
  To: davem
  Cc: richard.cochran, shemminger, tj, randy.dunlap, netdev,
	linux-kernel, Jean-Michel Hautbois

When phy interface changes its status, it calls phy_change() function.
This function calls the interrupt disabling functions for the driver
registered, but if this driver doesn't implement it, there is no IRQ
disabling. After doing the work, we call enable_irq and not the
respective driver function. This fixes it, as it could lead to an
unbalanced IRQ. Error code changed to EOPNOTSUPP.

Signed-off-by: Jean-Michel Hautbois <jhautbois@gmail.com>
---
 drivers/net/phy/phy.c |   12 ++++++++++--
 1 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 7670aac..5f23e8e 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -89,7 +89,8 @@ static int phy_config_interrupt(struct phy_device *phydev, u32 interrupts)
 	phydev->interrupts = interrupts;
 	if (phydev->drv->config_intr)
 		err = phydev->drv->config_intr(phydev);
-
+	else
+		err = -EOPNOTSUPP;
 	return err;
 }
 
@@ -541,6 +542,10 @@ static int phy_enable_interrupts(struct phy_device *phydev)
 		return err;
 
 	err = phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED);
+	if (err == -EOPNOTSUPP) {
+		err = 0;
+		enable_irq(phydev->irq);
+	}
 
 	return err;
 }
@@ -556,7 +561,10 @@ static int phy_disable_interrupts(struct phy_device *phydev)
 	/* Disable PHY interrupts */
 	err = phy_config_interrupt(phydev, PHY_INTERRUPT_DISABLED);
 
-	if (err)
+	if (err == -EOPNOTSUPP) {
+		err = 0;
+		disable_irq(phydev->irq);
+	} else if (err != 0)
 		goto phy_err;
 
 	/* Clear the interrupt */
-- 
1.7.0.4

^ permalink raw reply related

* [PATCH 2/2] net: phy: Fixed some checkpatch errors
From: Jean-Michel Hautbois @ 2010-12-18 18:41 UTC (permalink / raw)
  To: davem
  Cc: richard.cochran, shemminger, tj, randy.dunlap, netdev,
	linux-kernel, Jean-Michel Hautbois
In-Reply-To: <1292697704-2886-1-git-send-email-jhautbois@gmail.com>

Fixes some coding style issues (errors and warnings).

Signed-off-by: Jean-Michel Hautbois <jhautbois@gmail.com>
---
 drivers/net/phy/phy.c |   20 ++++++++++----------
 1 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 5f23e8e..d6a9b29 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -33,10 +33,10 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 
-#include <asm/atomic.h>
-#include <asm/io.h>
+#include <linux/atomic.h>
+#include <linux/io.h>
 #include <asm/irq.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 /**
  * phy_print_status - Convenience function to print out the current phy status
@@ -47,11 +47,11 @@ void phy_print_status(struct phy_device *phydev)
 	pr_info("PHY: %s - Link is %s", dev_name(&phydev->dev),
 			phydev->link ? "Up" : "Down");
 	if (phydev->link)
-		printk(" - %d/%s", phydev->speed,
+		pr_info(" - %d/%s", phydev->speed,
 				DUPLEX_FULL == phydev->duplex ?
 				"Full" : "Half");
 
-	printk("\n");
+	pr_info("\n");
 }
 EXPORT_SYMBOL(phy_print_status);
 
@@ -325,7 +325,7 @@ int phy_mii_ioctl(struct phy_device *phydev,
 
 	case SIOCSMIIREG:
 		if (mii_data->phy_id == phydev->addr) {
-			switch(mii_data->reg_num) {
+			switch (mii_data->reg_num) {
 			case MII_BMCR:
 				if ((val & (BMCR_RESET|BMCR_ANENABLE)) == 0)
 					phydev->autoneg = AUTONEG_DISABLE;
@@ -352,7 +352,7 @@ int phy_mii_ioctl(struct phy_device *phydev,
 		}
 
 		phy_write(phydev, mii_data->reg_num, val);
-		
+
 		if (mii_data->reg_num == MII_BMCR &&
 		    val & BMCR_RESET &&
 		    phydev->drv->config_init) {
@@ -471,7 +471,7 @@ static void phy_force_reduction(struct phy_device *phydev)
 	int idx;
 
 	idx = phy_find_setting(phydev->speed, phydev->duplex);
-	
+
 	idx++;
 
 	idx = phy_find_valid(idx, phydev->supported);
@@ -732,6 +732,7 @@ out_unlock:
 	 * will not reenable interrupts.
 	 */
 }
+EXPORT_SYMBOL(phy_stop);
 
 
 /**
@@ -762,7 +763,6 @@ void phy_start(struct phy_device *phydev)
 	}
 	mutex_unlock(&phydev->lock);
 }
-EXPORT_SYMBOL(phy_stop);
 EXPORT_SYMBOL(phy_start);
 
 /**
@@ -782,7 +782,7 @@ void phy_state_machine(struct work_struct *work)
 	if (phydev->adjust_state)
 		phydev->adjust_state(phydev->attached_dev);
 
-	switch(phydev->state) {
+	switch (phydev->state) {
 		case PHY_DOWN:
 		case PHY_STARTING:
 		case PHY_READY:
-- 
1.7.0.4

^ permalink raw reply related

* Re: [PATCH v3] kptr_restrict for hiding kernel pointers
From: Eric Paris @ 2010-12-18 21:07 UTC (permalink / raw)
  To: Dan Rosenberg
  Cc: linux-kernel, netdev, linux-security-module, jmorris,
	eric.dumazet, tgraf, eugeneteo, kees.cook, mingo, davem,
	a.p.zijlstra, akpm
In-Reply-To: <1292692835.10804.67.camel@dan>

On Sat, Dec 18, 2010 at 12:20 PM, Dan Rosenberg
<drosenberg@vsecurity.com> wrote:

> @@ -1035,6 +1038,26 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
>                return buf + vsnprintf(buf, end - buf,
>                                       ((struct va_format *)ptr)->fmt,
>                                       *(((struct va_format *)ptr)->va));
> +       case 'K':
> +               /*
> +                * %pK cannot be used in IRQ context because it tests
> +                * CAP_SYSLOG.
> +                */
> +               if (in_irq() || in_serving_softirq() || in_nmi())
> +                       WARN_ONCE(1, "%%pK used in interrupt context.\n");
> +
> +               if (!kptr_restrict)
> +                       break;          /* %pK does not obscure pointers */
> +
> +               if ((kptr_restrict != 2) && capable(CAP_SYSLOG))
> +                       break;          /* privileged apps expose pointers,
> +                                          unless kptr_restrict is 2 */

I would suggest has_capability_noaudit() since a failure here is not a
security policy violation it is just a code path choice.

I was confused also by the comment about CAP_SYSLOG and IRQ context.
You can check CAP_SYSLOG in IRQ context, it's just that the result is
not going to have any relation to the work being done.  This function
in general doesn't make sense in that context and I don't think saying
that has anything to do with CAP_SYSLOG makes that clear....  Unless
I'm misunderstanding...

^ permalink raw reply

* Re: [PATCH v3] kptr_restrict for hiding kernel pointers
From: Eric Paris @ 2010-12-18 21:10 UTC (permalink / raw)
  To: Dan Rosenberg
  Cc: linux-kernel, netdev, linux-security-module, jmorris,
	eric.dumazet, tgraf, eugeneteo, kees.cook, mingo, davem,
	a.p.zijlstra, akpm
In-Reply-To: <AANLkTimqpKem=rcfk5gsk0bMdWk45EuEThe0wAYiogxc@mail.gmail.com>

On Sat, Dec 18, 2010 at 4:07 PM, Eric Paris <eparis@parisplace.org> wrote:
> On Sat, Dec 18, 2010 at 12:20 PM, Dan Rosenberg
> <drosenberg@vsecurity.com> wrote:
>
>> @@ -1035,6 +1038,26 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
>>                return buf + vsnprintf(buf, end - buf,
>>                                       ((struct va_format *)ptr)->fmt,
>>                                       *(((struct va_format *)ptr)->va));
>> +       case 'K':
>> +               /*
>> +                * %pK cannot be used in IRQ context because it tests
>> +                * CAP_SYSLOG.
>> +                */
>> +               if (in_irq() || in_serving_softirq() || in_nmi())
>> +                       WARN_ONCE(1, "%%pK used in interrupt context.\n");
>> +
>> +               if (!kptr_restrict)
>> +                       break;          /* %pK does not obscure pointers */
>> +
>> +               if ((kptr_restrict != 2) && capable(CAP_SYSLOG))
>> +                       break;          /* privileged apps expose pointers,
>> +                                          unless kptr_restrict is 2 */
>
> I would suggest has_capability_noaudit() since a failure here is not a
> security policy violation it is just a code path choice.
>
> I was confused also by the comment about CAP_SYSLOG and IRQ context.
> You can check CAP_SYSLOG in IRQ context, it's just that the result is
> not going to have any relation to the work being done.  This function
> in general doesn't make sense in that context and I don't think saying
> that has anything to do with CAP_SYSLOG makes that clear....  Unless
> I'm misunderstanding...

Just went back and reread akpm's comments on -v2.  I guess we see it
the same way, I just thought this comment on first glance indicated
that capable() wasn't IRQ safe (it is) not that it just was
meaningless...   I don't think rewriting the comment is necessary.
Sorry for that half of the message....

^ permalink raw reply

* [PATCH v4] kptr_restrict for hiding kernel pointers
From: Dan Rosenberg @ 2010-12-18 21:41 UTC (permalink / raw)
  To: linux-kernel, netdev, linux-security-module
  Cc: jmorris, eric.dumazet, tgraf, eugeneteo, kees.cook, mingo, davem,
	a.p.zijlstra, akpm, eparis

Add the %pK printk format specifier and
the /proc/sys/kernel/kptr_restrict sysctl.

The %pK format specifier is designed to hide exposed kernel pointers,
specifically via /proc interfaces.  Exposing these pointers provides an
easy target for kernel write vulnerabilities, since they reveal the
locations of writable structures containing easily triggerable function
pointers.  The behavior of %pK depends on the kptr_restrict sysctl.

If kptr_restrict is set to 0, no deviation from the standard %p behavior
occurs.  If kptr_restrict is set to 1, if the current user (intended to
be a reader via seq_printf(), etc.) does not have CAP_SYSLOG (which is
currently in the LSM tree), kernel pointers using %pK are printed as
0's.  If kptr_restrict is set to 2, kernel pointers using %pK are
printed as 0's regardless of privileges.  Replacing with 0's was chosen
over the default "(null)", which cannot be parsed by userland %p, which
expects "(nil)".


v4 incorporates Eric Paris' suggestion of using
has_capability_noaudit(), since failing this capability check is not a
policy violation but rather a code path choice and shouldn't generate
potentially excessive log noise.  Adjusted IRQ comment for clarity.

v3 adds the "2" setting, cleans up documentation, removes the CONFIG,
and incorporates changes and suggestions from Andrew Morton.

v2 improves checking for inappropriate context, on suggestion by Peter
Zijlstra.  Thanks to Thomas Graf for suggesting use of a centralized
format specifier.


Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com>
Cc: James Morris <jmorris@namei.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Thomas Graf <tgraf@infradead.org>
Cc: Eugene Teo <eugeneteo@kernel.org>
Cc: Kees Cook <kees.cook@canonical.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David S. Miller <davem@davemloft.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Eric Paris <eparis@parisplace.org>
---

 Documentation/sysctl/kernel.txt |   14 ++++++++++++++
 include/linux/kernel.h          |    2 ++
 kernel/sysctl.c                 |    9 +++++++++
 lib/vsprintf.c                  |   24 ++++++++++++++++++++++++
 4 files changed, 49 insertions(+), 0 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 209e158..3cff47e 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -34,6 +34,7 @@ show up in /proc/sys/kernel:
 - hotplug
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
+- kptr_restrict
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
 - modprobe                    ==> Documentation/debugging-modules.txt
@@ -261,6 +262,19 @@ This flag controls the L2 cache of G3 processor boards. If
 
 ==============================================================
 
+kptr_restrict:
+
+This toggle indicates whether restrictions are placed on
+exposing kernel addresses via /proc and other interfaces.  When
+kptr_restrict is set to (0), the default, there are no
+restrictions.  When kptr_restrict is set to (1), kernel pointers
+printed using the %pK format specifier will be replaced with 0's
+unless the user has CAP_SYSLOG.  When kptr_restrict is set to
+(2), kernel pointers printed using %pK will be replaced with 0's
+regardless of privileges.
+
+==============================================================
+
 kstack_depth_to_print: (X86 only)
 
 Controls the number of words to print when dumping the raw
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index b6de9a6..b4f4863 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -201,6 +201,8 @@ extern int sscanf(const char *, const char *, ...)
 extern int vsscanf(const char *, const char *, va_list)
 	__attribute__ ((format (scanf, 2, 0)));
 
+extern int kptr_restrict;	/* for sysctl */
+
 extern int get_option(char **str, int *pint);
 extern char *get_options(const char *str, int nints, int *ints);
 extern unsigned long long memparse(const char *ptr, char **retptr);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abfa15..236fa91 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -713,6 +713,15 @@ static struct ctl_table kern_table[] = {
 	},
 #endif
 	{
+		.procname	= "kptr_restrict",
+		.data		= &kptr_restrict,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &two,
+	},
+	{
 		.procname	= "ngroups_max",
 		.data		= &ngroups_max,
 		.maxlen		= sizeof (int),
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index c150d3d..313f767 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -936,6 +936,8 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
 	return string(buf, end, uuid, spec);
 }
 
+int kptr_restrict;
+
 /*
  * Show a '%p' thing.  A kernel extension is that the '%p' is followed
  * by an extra set of alphanumeric characters that are extended format
@@ -979,6 +981,7 @@ char *uuid_string(char *buf, char *end, const u8 *addr,
  *       Implements a "recursive vsnprintf".
  *       Do not use this feature without some mechanism to verify the
  *       correctness of the format string and va_list arguments.
+ * - 'K' For a kernel pointer that should be hidden from unprivileged users
  *
  * Note: The difference between 'S' and 'F' is that on ia64 and ppc64
  * function pointers are really function descriptors, which contain a
@@ -1035,6 +1038,27 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr,
 		return buf + vsnprintf(buf, end - buf,
 				       ((struct va_format *)ptr)->fmt,
 				       *(((struct va_format *)ptr)->va));
+	case 'K':
+		/*
+		 * %pK cannot be used in IRQ context because its test
+		 * for CAP_SYSLOG would be meaningless.
+		 */
+		if (in_irq() || in_serving_softirq() || in_nmi())
+			WARN_ONCE(1, "%%pK used in interrupt context.\n");
+
+		if (!kptr_restrict)
+			break;		/* %pK does not obscure pointers */
+
+		if ((kptr_restrict != 2) &&
+		    has_capability_noaudit(current, CAP_SYSLOG))
+			break;		/* privileged apps expose pointers,
+					   unless kptr_restrict is 2 */
+
+		if (spec.field_width == -1) {
+			spec.field_width = 2 * sizeof(void *);
+			spec.flags |= ZEROPAD;
+		}
+		return number(buf, end, 0, spec);
 	}
 	spec.flags |= SMALL;
 	if (spec.field_width == -1) {



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox