Netdev List
 help / color / mirror / Atom feed
* Re: something is wrong in commit 971f10eca1 - tcp: better TCP_SKB_CB layout to reduce cache line misses
From: Cong Wang @ 2014-10-15 21:25 UTC (permalink / raw)
  To: Krzysztof Kolasa; +Cc: Eric Dumazet, David Miller, Eric Dumazet, netdev
In-Reply-To: <CAHA+R7PK=oXswV0PAkKtAqaeJh+Rx7ug0aaLKoJzJovR98dJpg@mail.gmail.com>

On Wed, Oct 15, 2014 at 2:22 PM, Cong Wang <cwang@twopensource.com> wrote:
>
> Meanwhile Eric is debugging it, do you mind to try a followup quick
> fix on your 32bit system?
>
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index e13d778..12bd3f6 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -1006,8 +1006,7 @@ static int tcp_transmit_skb(struct sock *sk,
> struct sk_buff *skb, int clone_it,
>         skb->tstamp.tv64 = 0;
>
>         /* Cleanup our debris for IP stacks */
> -       memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
> -                              sizeof(struct inet6_skb_parm)));
> +       memset(TCPCB(skb), 0, sizeof(*TCPCB(skb)));


Of course, s/TCPCB/TCP_SKB_CB/. :)

>
>         err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);

^ permalink raw reply

* [Patch net 1/3] ipv4: call __ip_options_echo() in cookie_v4_check()
From: Cong Wang @ 2014-10-15 21:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, Cong Wang, Krzysztof Kolasa, Eric Dumazet, Cong Wang

From: Cong Wang <cwang@twopensource.com>

commit 971f10eca186cab238c49da ("tcp: better TCP_SKB_CB layout to reduce cache line misses")
missed that cookie_v4_check() still calls ip_options_echo() which uses
IPCB(). It should use TCPCB() at TCP layer, so call __ip_options_echo()
instead.

Fixes: commit 971f10eca186cab238c49da ("tcp: better TCP_SKB_CB layout to reduce cache line misses")
Cc: Krzysztof Kolasa <kkolasa@winsoft.pl>
Cc: Eric Dumazet <edumazet@google.com>
Reported-by: Krzysztof Kolasa <kkolasa@winsoft.pl>
Tested-by: Krzysztof Kolasa <kkolasa@winsoft.pl>
Signed-off-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 net/ipv4/syncookies.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 0431a8f..7e7401c 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -321,7 +321,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 		int opt_size = sizeof(struct ip_options_rcu) + opt->optlen;
 
 		ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
-		if (ireq->opt != NULL && ip_options_echo(&ireq->opt->opt, skb)) {
+		if (ireq->opt != NULL && __ip_options_echo(&ireq->opt->opt, skb, opt)) {
 			kfree(ireq->opt);
 			ireq->opt = NULL;
 		}
-- 
1.8.3.1

^ permalink raw reply related

* [Patch net 2/3] ipv4: share tcp_v4_save_options() with cookie_v4_check()
From: Cong Wang @ 2014-10-15 21:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, Cong Wang, Krzysztof Kolasa, Eric Dumazet, Cong Wang
In-Reply-To: <1413408802-21052-1-git-send-email-xiyou.wangcong@gmail.com>

From: Cong Wang <cwang@twopensource.com>

cookie_v4_check() allocates ip_options_rcu in the same way
with tcp_v4_save_options(), we can just make it a helper function.

Cc: Krzysztof Kolasa <kkolasa@winsoft.pl>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 include/net/tcp.h     | 20 ++++++++++++++++++++
 net/ipv4/syncookies.c | 10 +---------
 net/ipv4/tcp_ipv4.c   | 20 --------------------
 3 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 74efeda..869637a 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1666,4 +1666,24 @@ int tcpv4_offload_init(void);
 void tcp_v4_init(void);
 void tcp_init(void);
 
+/*
+ * Save and compile IPv4 options, return a pointer to it
+ */
+static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
+{
+	const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
+	struct ip_options_rcu *dopt = NULL;
+
+	if (opt && opt->optlen) {
+		int opt_size = sizeof(*dopt) + opt->optlen;
+
+		dopt = kmalloc(opt_size, GFP_ATOMIC);
+		if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) {
+			kfree(dopt);
+			dopt = NULL;
+		}
+	}
+	return dopt;
+}
+
 #endif	/* _TCP_H */
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 7e7401c..c68d0a1 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -317,15 +317,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
 	 */
-	if (opt && opt->optlen) {
-		int opt_size = sizeof(struct ip_options_rcu) + opt->optlen;
-
-		ireq->opt = kmalloc(opt_size, GFP_ATOMIC);
-		if (ireq->opt != NULL && __ip_options_echo(&ireq->opt->opt, skb, opt)) {
-			kfree(ireq->opt);
-			ireq->opt = NULL;
-		}
-	}
+	ireq->opt = tcp_v4_save_options(skb);
 
 	if (security_inet_conn_request(sk, skb, req)) {
 		reqsk_free(req);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 552e87e..6a2a7d6 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -880,26 +880,6 @@ bool tcp_syn_flood_action(struct sock *sk,
 }
 EXPORT_SYMBOL(tcp_syn_flood_action);
 
-/*
- * Save and compile IPv4 options into the request_sock if needed.
- */
-static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
-{
-	const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
-	struct ip_options_rcu *dopt = NULL;
-
-	if (opt && opt->optlen) {
-		int opt_size = sizeof(*dopt) + opt->optlen;
-
-		dopt = kmalloc(opt_size, GFP_ATOMIC);
-		if (dopt && __ip_options_echo(&dopt->opt, skb, opt)) {
-			kfree(dopt);
-			dopt = NULL;
-		}
-	}
-	return dopt;
-}
-
 #ifdef CONFIG_TCP_MD5SIG
 /*
  * RFC2385 MD5 checksumming requires a mapping of
-- 
1.8.3.1

^ permalink raw reply related

* [Patch net 3/3] ipv4: clean up cookie_v4_check()
From: Cong Wang @ 2014-10-15 21:33 UTC (permalink / raw)
  To: netdev; +Cc: davem, Cong Wang, Krzysztof Kolasa, Eric Dumazet, Cong Wang
In-Reply-To: <1413408802-21052-1-git-send-email-xiyou.wangcong@gmail.com>

From: Cong Wang <cwang@twopensource.com>

We can retrieve opt from skb, no need to pass it as a parameter.
And opt should always be non-NULL, no need to check.

Cc: Krzysztof Kolasa <kkolasa@winsoft.pl>
Cc: Eric Dumazet <edumazet@google.com>
Tested-by: Krzysztof Kolasa <kkolasa@winsoft.pl>
Signed-off-by: Cong Wang <cwang@twopensource.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
---
 include/net/tcp.h     | 5 ++---
 net/ipv4/syncookies.c | 6 +++---
 net/ipv4/tcp_ipv4.c   | 2 +-
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 869637a..3a4bbbf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -468,8 +468,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
 /* From syncookies.c */
 int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
 		      u32 cookie);
-struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
-			     struct ip_options *opt);
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
 #ifdef CONFIG_SYN_COOKIES
 
 /* Syncookies use a monotonic timer which increments every 60 seconds.
@@ -1674,7 +1673,7 @@ static inline struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
 	const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
 	struct ip_options_rcu *dopt = NULL;
 
-	if (opt && opt->optlen) {
+	if (opt->optlen) {
 		int opt_size = sizeof(*dopt) + opt->optlen;
 
 		dopt = kmalloc(opt_size, GFP_ATOMIC);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c68d0a1..d346303 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -255,9 +255,9 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt,
 }
 EXPORT_SYMBOL(cookie_check_timestamp);
 
-struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
-			     struct ip_options *opt)
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 {
+	struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
 	struct tcp_options_received tcp_opt;
 	struct inet_request_sock *ireq;
 	struct tcp_request_sock *treq;
@@ -336,7 +336,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
 	flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
 			   inet_sk_flowi_flags(sk),
-			   (opt && opt->srr) ? opt->faddr : ireq->ir_rmt_addr,
+			   opt->srr ? opt->faddr : ireq->ir_rmt_addr,
 			   ireq->ir_loc_addr, th->source, th->dest);
 	security_req_classify_flow(req, flowi4_to_flowi(&fl4));
 	rt = ip_route_output_key(sock_net(sk), &fl4);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 6a2a7d6..94d1a77 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1408,7 +1408,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
 
 #ifdef CONFIG_SYN_COOKIES
 	if (!th->syn)
-		sk = cookie_v4_check(sk, skb, &TCP_SKB_CB(skb)->header.h4.opt);
+		sk = cookie_v4_check(sk, skb);
 #endif
 	return sk;
 }
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH linux v3 1/1] fs/proc: use a rb tree for the directory entries
From: Andrew Morton @ 2014-10-15 21:37 UTC (permalink / raw)
  To: Nicolas Dichtel
  Cc: netdev, linux-kernel, davem, ebiederm, adobriyan, rui.xiang, viro,
	oleg, gorcunov, kirill.shutemov, grant.likely, tytso
In-Reply-To: <1412672559-5256-2-git-send-email-nicolas.dichtel@6wind.com>

On Tue,  7 Oct 2014 11:02:39 +0200 Nicolas Dichtel <nicolas.dichtel@6wind.com> wrote:

> The current implementation for the directories in /proc is using a single
> linked list. This is slow when handling directories with large numbers of
> entries (eg netdevice-related entries when lots of tunnels are opened).
> 
> This patch replaces this linked list by a red-black tree.
> 
> ...
>
> --- a/fs/proc/root.c
> +++ b/fs/proc/root.c
> @@ -166,6 +166,7 @@ void __init proc_root_init(void)
>  {
>  	int err;
>  
> +	proc_root.subdir = RB_ROOT;
>  	proc_init_inodecache();
>  	err = register_filesystem(&proc_fs_type);
>  	if (err)

This can be done at compile time can't it?

--- a/fs/proc/root.c~fs-proc-use-a-rb-tree-for-the-directory-entries-fix
+++ a/fs/proc/root.c
@@ -166,7 +166,6 @@ void __init proc_root_init(void)
 {
 	int err;
 
-	proc_root.subdir = RB_ROOT;
 	proc_init_inodecache();
 	err = register_filesystem(&proc_fs_type);
 	if (err)
@@ -252,6 +251,7 @@ struct proc_dir_entry proc_root = {
 	.proc_iops	= &proc_root_inode_operations, 
 	.proc_fops	= &proc_root_operations,
 	.parent		= &proc_root,
+	.subdir		= RB_ROOT,
 	.name		= "/proc",
 };
 
_

^ permalink raw reply

* Re: something is wrong in commit 971f10eca1 - tcp: better TCP_SKB_CB layout to reduce cache line misses
From: Eric Dumazet @ 2014-10-15 21:39 UTC (permalink / raw)
  To: Cong Wang; +Cc: Krzysztof Kolasa, David Miller, Eric Dumazet, netdev
In-Reply-To: <CAHA+R7OEvTcZOm-OepWYvGP+pC_CJRoSen351nw8VJGdLTo0uA@mail.gmail.com>

On Wed, 2014-10-15 at 13:36 -0700, Cong Wang wrote:
> On Wed, Oct 15, 2014 at 3:35 AM, Krzysztof Kolasa <kkolasa@winsoft.pl> wrote:
> >
> > one-line patch not resolve problem, fix created by Cong Wang resolves
> > problem !!!
> >
> > tested on 64bit system and working OK
> >
> 
> So my patch fixes your problem on 64bit but not on 32bit,
> is this correct?
> 
> I will send out the patch. And as Eric said, the problem on 32bit
> might be a different issue, we can definitely fix it separately.
> 
> Thanks for testing!

I think more fixes are needed.

This is most probably an IPv6 issue.

^ permalink raw reply

* [PATCH] net: m_can: add CONFIG_HAS_IOMEM dependence
From: David Cohen @ 2014-10-15 21:41 UTC (permalink / raw)
  To: wg, mkl; +Cc: linux-can, netdev, linux-kernel, trivial, David Cohen

m_can uses io memory which makes it not compilable on architectures
without HAS_IOMEM such as UML:

drivers/built-in.o: In function `m_can_plat_probe':
m_can.c:(.text+0x218cc5): undefined reference to `devm_ioremap_resource'
m_can.c:(.text+0x218df9): undefined reference to `devm_ioremap'

Signed-off-by: David Cohen <david.a.cohen@linux.intel.com>
---
 drivers/net/can/m_can/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/can/m_can/Kconfig b/drivers/net/can/m_can/Kconfig
index fca5482c09ac..04f20dd39007 100644
--- a/drivers/net/can/m_can/Kconfig
+++ b/drivers/net/can/m_can/Kconfig
@@ -1,4 +1,5 @@
 config CAN_M_CAN
+	depends on HAS_IOMEM
 	tristate "Bosch M_CAN devices"
 	---help---
 	  Say Y here if you want to support for Bosch M_CAN controller.
-- 
2.1.0


^ permalink raw reply related

* Re: something is wrong in commit 971f10eca1 - tcp: better TCP_SKB_CB layout to reduce cache line misses
From: Eric Dumazet @ 2014-10-15 21:42 UTC (permalink / raw)
  To: Cong Wang; +Cc: Krzysztof Kolasa, David Miller, Eric Dumazet, netdev
In-Reply-To: <1413409168.17186.0.camel@edumazet-glaptop2.roam.corp.google.com>

On Wed, 2014-10-15 at 14:39 -0700, Eric Dumazet wrote:
> On Wed, 2014-10-15 at 13:36 -0700, Cong Wang wrote:
> > On Wed, Oct 15, 2014 at 3:35 AM, Krzysztof Kolasa <kkolasa@winsoft.pl> wrote:
> > >
> > > one-line patch not resolve problem, fix created by Cong Wang resolves
> > > problem !!!
> > >
> > > tested on 64bit system and working OK
> > >
> > 
> > So my patch fixes your problem on 64bit but not on 32bit,
> > is this correct?
> > 
> > I will send out the patch. And as Eric said, the problem on 32bit
> > might be a different issue, we can definitely fix it separately.
> > 
> > Thanks for testing!
> 
> I think more fixes are needed.
> 
> This is most probably an IPv6 issue.
> 

Yes, ipv6 uses inet6_iif() a lot. We need to use a different accessor.

^ permalink raw reply

* Re: Regarding tx-nocache-copy in the Sheevaplug
From: Benjamin Poirier @ 2014-10-15 21:57 UTC (permalink / raw)
  To: Lluís Batlle i Rossell
  Cc: linux-kernel, netdev, Carles Pagès, linux-arm-kernel
In-Reply-To: <20141013105246.GD1972@vicerveza.homeunix.net>

On 2014/10/13 12:52, Lluís Batlle i Rossell wrote:
> Hello,
> 
> on the 7th of January 2014 ths patch was applied:
> https://lkml.org/lkml/2014/1/7/307
> 
> [PATCH v2] net: Do not enable tx-nocache-copy by default
>         
> In the Sheevaplug (ARM Feroceon 88FR131 from Marvell) this made packets to be
> sent corrupted. I think this machine has something special about the cache.
> 
> Enabling back this tx-nocache-copy (as it used to be before the patch) the
> transfers work fine again. I think that most people, encountering this problem,
> completely disable the tx offload instead of enabling back this setting.
> 
> Is this an ARM kernel problem regarding this platform?

This is odd, only x86 defines ARCH_HAS_NOCACHE_UACCESS. On arm,
skb_do_copy_data_nocache() should end up using __copy_from_user()
regardless of tx-nocache-copy.

^ permalink raw reply

* Re: feature suggestion: implement SO_PEERCRED on local AF_INET/AF_INET6 sockets (allow uid-based identification on localhost)
From: David Madore @ 2014-10-15 22:30 UTC (permalink / raw)
  To: Linux Kernel mailing-list, Linux network mailing-list; +Cc: Andy Lutomirski
In-Reply-To: <543E87AC.5090402@amacapital.net>

On Wed, Oct 15, 2014 at 07:41:48AM -0700, Andy Lutomirski wrote:
> On 10/15/2014 06:35 AM, David Madore wrote:
> > Given an AF_UNIX socket, the getsockopt(, SOL_SOCKET, SO_PEERCRED,,)
> > call allows one endpoint to authenticate the other endpoint's pid, uid
> > and gid.
> > 
> > The call is valid on AF_INET and AF_INET6 sockets but returns no data
> > (pid=0, uid=-1, gid=-1).  Obviously it is meaningless to try to get
> > such credentials from a INET/INET6 socket in general, but there is one
> > case where it would make sense: namely, when the endpoint is local
> > (i.e., when the socket is a connection to the same machine, e.g., when
> > connecting to 127.0.0.0/8 or ::1/32).
> 
> I will object to adding it as described, for the same reason that I
> object to anything that extends the current model of socket-based
> credential passing.  Ideally, credentials would *never* be implicitly
> captured by socket syscalls.  We live in the real world, and SO_****CRED
> exists, so I think the best we can do is to try to minimize its use.
> 
> I can elaborate further, or you can IIRC search the archives for
> SCM_IDENTITY, and you can also look at CVE-2013-1979 for a nasty example
> of why this model is broken.

>From what I understand, what was broken is mainly that the credentials
were evaluated when the write() system call took place rather than
when socket() or bind(): this violates the Unix security model
(privilege control occurs when the file descriptor is created, not
when it is used).  On the contrary, it is conform to Unix security
principles that credentials are checked implicitly when binding a
socket (this happens when permissions are being checked on the path
when binding or connecting on a Unix domain socket; and to allow
binding to secure ports in the INET domain; and so on).  It seems to
me that a suid program that is willing to create or bind a socket on
behalf of its caller without knowing exactly what it will be
connecting to, it should intrinsically be treated as a security
vulnerability, even when it is not obviously exploitable.

Also, to go along the real world examples, identd exists and is used
for identification on local networks (e.g. localhost), so the capture
of credentials already takes place.  Unix programmers are aware of
this, and know that a privileged program should not bind a socket if
they don't want to leak privileges.  (Another example is the use of -m
owner in iptables.)

And, of course, if Solaris already has this feature, there is some
experience for it.  Has there been any documented vulnerability
relating to the fact that Solaris allows getpeerucred() to
authenticate locally connected AF_INET sockets?

Note that since the possibility of using SO_PEERCRED on AF_INET
sockets does not hitherto exist on Linux, we can be sure that nobody
uses it, so it's not like it might open vulnerabilities in existing
code.  If you think it's insecure, it can be documented as such (by
comparing it with identd): I still think it's better than having no
control at all when binding to localhost, which is the present
situation (causing, e.g., CVE-2014-2914).

Because SO_PEERCRED currently returns {pid=0,uid=-1,gid=-1} on
AF_INET, we might still return this value if there is any risk that
the endpoint would be unwilling to share its credentials: for example,
this value might be returned if the other endpoint is not ptraceable
by the caller - this would still cover the essential use case, which
is for unprivileged users to authenticate the connections from their
own processes.  Would this limitation assuage your worries about the
proposed feature?

The thing is, I don't see any other way the ssh port forwarding mess
can ever be improved.  Do you have another solution in mind that?

Any attempt to have some kind of authentication of local sockets that
required participation on the client (authenticatee)'s part is doomed:
if modifying the protocol and/or client code is an option, we might as
well use some form of crypto / TLS.  Or Unix-domain sockets.  But what
are we supposed to do when modifying the client (to make it send
credentials, use crypto or connect on AF_UNIX) is not an option?

-- 
     David A. Madore
   ( http://www.madore.org/~david/ )

^ permalink raw reply

* Re: Regarding tx-nocache-copy in the Sheevaplug
From: Eric Dumazet @ 2014-10-15 22:45 UTC (permalink / raw)
  To: Benjamin Poirier
  Cc: Lluís Batlle i Rossell, linux-kernel, netdev,
	Carles Pagès, linux-arm-kernel
In-Reply-To: <20141015215701.GA4109@f1.synalogic.ca>

On Wed, 2014-10-15 at 14:57 -0700, Benjamin Poirier wrote:
> On 2014/10/13 12:52, Lluís Batlle i Rossell wrote:
> > Hello,
> > 
> > on the 7th of January 2014 ths patch was applied:
> > https://lkml.org/lkml/2014/1/7/307
> > 
> > [PATCH v2] net: Do not enable tx-nocache-copy by default
> >         
> > In the Sheevaplug (ARM Feroceon 88FR131 from Marvell) this made packets to be
> > sent corrupted. I think this machine has something special about the cache.
> > 
> > Enabling back this tx-nocache-copy (as it used to be before the patch) the
> > transfers work fine again. I think that most people, encountering this problem,
> > completely disable the tx offload instead of enabling back this setting.
> > 
> > Is this an ARM kernel problem regarding this platform?
> 
> This is odd, only x86 defines ARCH_HAS_NOCACHE_UACCESS. On arm,
> skb_do_copy_data_nocache() should end up using __copy_from_user()
> regardless of tx-nocache-copy.

 kmap_atomic()/kunmap_atomic() is missing, so we lack
__cpuc_flush_dcache_area() operations.

^ permalink raw reply

* RE: [PATCH v3 net] ixgbe: check adapter->vfinfo before dereference
From: Tantilov, Emil S @ 2014-10-15 22:50 UTC (permalink / raw)
  To: Thierry Herbelot, Kirsher, Jeffrey T, Brandeburg, Jesse,
	Allan, Bruce W, netdev@vger.kernel.org
In-Reply-To: <1413367080-31540-1-git-send-email-thierry.herbelot@6wind.com>

>-----Original Message-----
>From: Thierry Herbelot [mailto:thierry.herbelot@6wind.com]
>Sent: Wednesday, October 15, 2014 2:58 AM
>To: Kirsher, Jeffrey T; Brandeburg, Jesse; Allan, Bruce W;
>netdev@vger.kernel.org; Tantilov, Emil S
>Cc: Thierry Herbelot
>Subject: [PATCH v3 net] ixgbe: check adapter->vfinfo before dereference
>
>this protects against the following panic:
>(before a VF was actually created on p96p1 PF Ethernet port)
>
>ip link set p96p1 vf 0 spoofchk off
>BUG: unable to handle kernel NULL pointer dereference at 0000000000000052
>IP: [<ffffffffa044a1c1>]
>ixgbe_ndo_set_vf_spoofchk+0x51/0x150 [ixgbe]
>
>Signed-off-by: Thierry Herbelot <thierry.herbelot@6wind.com>
>---
>
>v2:
>  compilation fixes
>
>v3:
>  remove checks in functions where vfinfo is known not to be NULL
>  return -EINVAL as error code
>
> drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c |   42
>++++++++++++++++++++++--
> 1 file changed, 40 insertions(+), 2 deletions(-)

Actually looking into this a bit more, the check for vfinfo is not sufficient
because it does not protect against specifying vf that is outside of sriov_num_vfs range.

All of the ndo functions have a check for it except for ixgbevf_ndo_set_spoofcheck().

The following patch should be all we need to protect against this panic:

This patch adds a check to return -EINVAL when setting spoofcheck on
VF that is not configured.

Reported-by: Thierry Herbelot <thierry.herbelot@6wind.com>
Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c |    3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 706fc69..97c85b8 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -1261,6 +1261,9 @@ int ixgbe_ndo_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting)
 	struct ixgbe_hw *hw = &adapter->hw;
 	u32 regval;
 
+	if (vf >= adapter->num_vfs)
+		return -EINVAL;
+
 	adapter->vfinfo[vf].spoofchk_enabled = setting;
 
 	regval = IXGBE_READ_REG(hw, IXGBE_PFVFSPOOF(vf_target_reg));

^ permalink raw reply related

* Re: feature suggestion: implement SO_PEERCRED on local AF_INET/AF_INET6 sockets (allow uid-based identification on localhost)
From: Andy Lutomirski @ 2014-10-15 22:54 UTC (permalink / raw)
  To: David Madore; +Cc: Linux Kernel mailing-list, Linux network mailing-list
In-Reply-To: <20141015223033.GA11458@achernar.madore.org>

On Wed, Oct 15, 2014 at 3:30 PM, David Madore <david+ml@madore.org> wrote:
> On Wed, Oct 15, 2014 at 07:41:48AM -0700, Andy Lutomirski wrote:
>> On 10/15/2014 06:35 AM, David Madore wrote:
>> > Given an AF_UNIX socket, the getsockopt(, SOL_SOCKET, SO_PEERCRED,,)
>> > call allows one endpoint to authenticate the other endpoint's pid, uid
>> > and gid.
>> >
>> > The call is valid on AF_INET and AF_INET6 sockets but returns no data
>> > (pid=0, uid=-1, gid=-1).  Obviously it is meaningless to try to get
>> > such credentials from a INET/INET6 socket in general, but there is one
>> > case where it would make sense: namely, when the endpoint is local
>> > (i.e., when the socket is a connection to the same machine, e.g., when
>> > connecting to 127.0.0.0/8 or ::1/32).
>>
>> I will object to adding it as described, for the same reason that I
>> object to anything that extends the current model of socket-based
>> credential passing.  Ideally, credentials would *never* be implicitly
>> captured by socket syscalls.  We live in the real world, and SO_****CRED
>> exists, so I think the best we can do is to try to minimize its use.
>>
>> I can elaborate further, or you can IIRC search the archives for
>> SCM_IDENTITY, and you can also look at CVE-2013-1979 for a nasty example
>> of why this model is broken.
>
> From what I understand, what was broken is mainly that the credentials
> were evaluated when the write() system call took place rather than
> when socket() or bind(): this violates the Unix security model
> (privilege control occurs when the file descriptor is created, not
> when it is used).  On the contrary, it is conform to Unix security
> principles that credentials are checked implicitly when binding a
> socket (this happens when permissions are being checked on the path
> when binding or connecting on a Unix domain socket; and to allow
> binding to secure ports in the INET domain; and so on).  It seems to
> me that a suid program that is willing to create or bind a socket on
> behalf of its caller without knowing exactly what it will be
> connecting to, it should intrinsically be treated as a security
> vulnerability, even when it is not obviously exploitable.

socket has little precedent for checking credentials and none in
POSIX.  And you're talking about connect, not bind.

>
> Also, to go along the real world examples, identd exists and is used
> for identification on local networks (e.g. localhost), so the capture
> of credentials already takes place.  Unix programmers are aware of
> this, and know that a privileged program should not bind a socket if
> they don't want to leak privileges.  (Another example is the use of -m
> owner in iptables.)

Ugh.

Identd is completely insecure.  Quite a few years ago personally broke
Stanford's entire single sign-on mechanism by exploiting it, against a
hardened, kerberized version, so less.  And iptables -m owner is all
about what you can connect to, not what is assumed by the recipient.
(And it's probably insecure, too, in many cases.)

>
> And, of course, if Solaris already has this feature, there is some
> experience for it.  Has there been any documented vulnerability
> relating to the fact that Solaris allows getpeerucred() to
> authenticate locally connected AF_INET sockets?

Does it matter?  CVE-2013-1979 existed for many years before anyone noticed.

>
> Note that since the possibility of using SO_PEERCRED on AF_INET
> sockets does not hitherto exist on Linux, we can be sure that nobody
> uses it, so it's not like it might open vulnerabilities in existing
> code.  If you think it's insecure, it can be documented as such (by
> comparing it with identd): I still think it's better than having no
> control at all when binding to localhost, which is the present
> situation (causing, e.g., CVE-2014-2914).

This doesn't follow.  *Everybody* uses connect on AF_INET.

IMO anything that sends a caller's credentials needs to be explicit and opt-in.

>
> Because SO_PEERCRED currently returns {pid=0,uid=-1,gid=-1} on
> AF_INET, we might still return this value if there is any risk that
> the endpoint would be unwilling to share its credentials: for example,
> this value might be returned if the other endpoint is not ptraceable
> by the caller - this would still cover the essential use case, which
> is for unprivileged users to authenticate the connections from their
> own processes.  Would this limitation assuage your worries about the
> proposed feature?
>
> The thing is, I don't see any other way the ssh port forwarding mess
> can ever be improved.  Do you have another solution in mind that?

UNIX sockets.  Firewall rules.  An opt-in mechanism like SCM_IDENTITY
that has explicit support in OpenSSH and doesn't happen unless the
administrator actually requests it in sshd_config.

>
> Any attempt to have some kind of authentication of local sockets that
> required participation on the client (authenticatee)'s part is doomed:
> if modifying the protocol and/or client code is an option, we might as
> well use some form of crypto / TLS.  Or Unix-domain sockets.  But what
> are we supposed to do when modifying the client (to make it send
> credentials, use crypto or connect on AF_UNIX) is not an option?

Exactly.

I believe that there is no secure way to authenticate clients that
currently don't authenticate themselves without changing the clients.
That's the whole point: currently-secure are written under the
assumption that they are not exercising their credentials.  You can't
safely change that without making it opt-in.

--Andy

^ permalink raw reply

* tcpdump's capture filter: "vlan" doesn't match
From: Lukas Tribus @ 2014-10-15 22:58 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: John Fastabend, Michał Mirosław, Jiri Pirko,
	Ben Hutchings, Atzm Watanabe, Patrick McHardy, Jesse Gross

Hi,



since 2.6.39 (including -rc1), tcpdump "vlan" capture filters don't match
anymore. All 2.6.38 and older kernels are fine.


I reproduced this specifically on a r8169 NIC on 2.6.39-rc1, but I found
this problem initially on bnx2 and e1000e nics.


Howto reproduce: just tcpdump with a "not vlan", "vlan" or "vlan <vlanid>"
capture filter on a passive eth interface (dot1q/vlan/ip config not necessary).

Actual behavior is that a "vlan [vlanid]" capture filter doesn't match the
(tagged) packet, and a "not vlan" capture filter matches everything.


Disabling rx-vlan-offloading via
ethtool -K eth0 rxvlan off

doesn't change anything.


Here we are filtering for "not vlan" and we can see that the matched frame
is vlan tagged:

# tcpdump -Uenc1 not vlan
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth0, link-type EN10MB (Ethernet), capture size 65535 bytes
22:03:39.077584 70:ca:9b:01:23:34> 00:18:f8:01:23:34, \
*ethertype 802.1Q (0x8100), length 70: vlan 7, p 0*, ethertype IPv4, \
192.168.47.9.443> 192.168.32.30.39436: Flags [.], ack 255248912, \
[...]
1 packet captured
169 packets received by filter
0 packets dropped by kernel
59 packets dropped by interface
#




As suggested here [1], we can pipe everything through another tcpdump
instance:
tcpdump -Uw - | tcpdump -en -r - vlan <vlanid>


But that is not something that works for my specific use-case (dedicated
sniffer box, dedicated interface connected to a Cisco SPAN/mirror port,
un/single/double-tagged packets, remotely accessible via remote-pcap [2]).


The sniffer should also be able to:
- maintain the frame as-is, including dot1q, dot1p (preferably
  without artificial recreation of header fields/values and including CFI/DEI)
- "direct" capture filter based on vlan (not through multiple userspace
  instances)

Kernel <= 2.6.38 perfectly satisfies those requirements.


Isn't disabling rx-vlan-offloading supposed to remedy those problems?




Thanks,

Lukas




[1] https://bugzilla.redhat.com/show_bug.cgi?id=498981
[2] https://github.com/frgtn/rpcapd-linux

 		 	   		  

^ permalink raw reply

* Re: [PATCH net 2/3] net: sctp: fix panic on duplicate ASCONF chunks
From: Daniel Borkmann @ 2014-10-15 23:13 UTC (permalink / raw)
  To: Neil Horman; +Cc: davem, linux-sctp, netdev, Vlad Yasevich
In-Reply-To: <20141015025833.GA6493@localhost.localdomain>

On 10/15/2014 04:58 AM, Neil Horman wrote:
> On Mon, Oct 13, 2014 at 01:25:11AM +0200, Daniel Borkmann wrote:
>> On 10/12/2014 03:42 AM, Neil Horman wrote:
>>> On Sat, Oct 11, 2014 at 12:02:31AM +0200, Daniel Borkmann wrote:
>>>> On 10/10/2014 05:39 PM, Neil Horman wrote:
>>>> ...
>>>>> Is it worth adding a WARN_ON, to indicate that two ASCONF chunks have been
>>>>> received with duplicate serials?
>>>>
>>>> Don't think so, as this would be triggerable from outside.
>>>>
>>> WARN_ON_ONCE then, per serial number?
>>
>> Sorry, but no. If someone seriously runs that in production and it
>> triggers a WARN() from outside, admins will start sending us bug
>> reports that apparently something with the kernel code is wrong.
>>
>> WARN() should only be used if we have some *internal* unexpected bug,
>> but can still fail gracefully. This would neither be an actual code bug
>> nor would it be an internally triggered one, plus we add unnecessary
>> complexity to the code. Similarly, for those reasons we don't WARN()
>> and throw a stack trace when we receive, say, an skb of invalid length
>> elsewhere.
>>
>> I'd also like to avoid any additional pr_debug().
>>
>> I don't think people enable them in production, and if they really do,
>> it's too late anyway as we already have received this chunk. If anything,
>> I'd rather like to see debugging code further removed as we have already
>> different facilities in the kernel for runtime debugging that are much
>> more powerful.
 >
> What do you suggest then?  It seems like this is a protocol error that an
> administrator will want to be made aware of.  I'm open to other options, but
> just saying "no" isn't sufficient for me.

So what do you want an admin to do with this information?

Currently, we don't throw WARN() stack traces or whatever on other 'malformed'
inputs, we either discard them silently in SCTP or send an ABORT, for example.

Do you suggest the kernel should now start throwing a WARN() on every possible
skb that we discard in order to inform the admin?

Given that we can handle this gracefully by basically ignoring the 2nd identical
ASCONF chunk in a packet and send out a single reply, that's just handled fine.

^ permalink raw reply

* Re: Netlink mmap tx security?
From: Daniel Borkmann @ 2014-10-15 23:45 UTC (permalink / raw)
  To: David Miller; +Cc: luto, torvalds, kaber, netdev, tgraf
In-Reply-To: <20141014.220111.179628329028952302.davem@davemloft.net>

On 10/15/2014 04:01 AM, David Miller wrote:
> From: Andy Lutomirski <luto@amacapital.net>
> Date: Tue, 14 Oct 2014 15:16:46 -0700
>
>> It's at least remotely possible that there's something that assumes
>> that assumes that the availability of NETLINK_RX_RING implies
>> NETLINK_TX_RING, which would be unfortunate.
>
> I already found one such case, nlmon :-/

Hmm, can you elaborate? I currently don't think that nlmon cares
actually.

^ permalink raw reply

* RE: ixgbe: Question about Flow Control on 10G
From: Tantilov, Emil S @ 2014-10-15 23:46 UTC (permalink / raw)
  To: dom, Skidmore, Donald C, netdev@vger.kernel.org
In-Reply-To: <543EC362.5060507@citrix.com>

>-----Original Message-----
>From: dom [mailto:dominic.curran@citrix.com]
>Sent: Wednesday, October 15, 2014 11:57 AM
>To: Tantilov, Emil S; Skidmore, Donald C;
>netdev@vger.kernel.org
>Subject: Re: ixgbe: Question about Flow Control on 10G
>
>On 10/14/2014 01:54 PM, Tantilov, Emil S wrote:
>>> -----Original Message-----
>>> From: netdev-owner@vger.kernel.org [mailto:netdev-
>>> owner@vger.kernel.org] On Behalf Of dom
>>> Sent: Tuesday, October 14, 2014 12:01 PM
>>> To: Skidmore, Donald C; netdev@vger.kernel.org
>>> Subject: ixgbe: Question about Flow Control on 10G
>>>
>>> Hi
>>>
>>> I have a question about the ixgbe driver's handling of
>>> 'ethtool -a ethX'
>>> when the NIC is using fibre.
>>>
>>> Specifically I don't understand the code introduced by this
>>> commit:
>>>
>>> commit 73d80953dfd1d5a92948005798c857c311c2834b
>>> Author: Don Skidmore <donald.c.skidmore@intel.com>
>>> Date:   Wed Jul 31 02:19:24 2013 +0000
>>> Subject: ixgbe: fix fc autoneg ethtool reporting.
>>>
>>> The function introduced the function:
>>>         ixgbe_device_supports_autoneg_fc()
>>>
>>> which gets called by
>>> ixgbe_get_pauseparam()/ixgbe_set_pauseparam().
>>>
>>> specifically there is a  case in
>>> ixgbe_device_supports_autoneg_fc()
>>>
>>>      case ixgbe_media_type_fiber_qsfp:
>>>      case ixgbe_media_type_fiber:
>>>          hw->mac.ops.check_link(hw, &speed, &link_up,
>>> false);
>>>          /* if link is down, assume supported */
>>>          if (link_up)
>>>              supported = speed == IXGBE_LINK_SPEED_1GB_FULL ?
>>>                  true : false;
>>>
>>> If link_up=1 then why is supported only true for a
>>> speed=IXGBE_LINK_SPEED_1GB_FULL ?
>>>
>>> Why is Flow Control not supported for IXGBE_LINK_SPEED_10GB_FULL ?
>> For SFP modules (media_type_fiber) flow control autoneg is not supported at 10gig.
>> You can still set flow control manually to enabled/disabled, just not autoneg.
>>
>> Thanks,
>> Emil
>Hi Emil
>
>Thank you for the quick answer.  I have a one follow-up question if I may...
>
>We noticed that back in 3.2.9 (before 73d80953dfd basically) the
>behaviour was different for 10G fibre.  i.e.  autonegotiate showed 'on'.
>
># ethtool -a eth1
>Pause parameters for eth1:
>Autonegotiate:  on
>RX:             on
>TX:             on
>
>
>The code:
>     if (hw->fc.disable_fc_autoneg ||
>         (hw->fc.current_mode == ixgbe_fc_none))
>         pause->autoneg = 0;
>     else
>         pause->autoneg = 1;
>
>So I assume this old output from 'ethtool -a' for autogen was just
>wrong, is that correct ?

Correct.

Thanks,
Emil

>
>[I'm asking cos I _know_ my h/w collegues are going to ask why the change.]
>
>Thanks again
>dom

^ permalink raw reply

* Re: Netlink mmap tx security?
From: David Miller @ 2014-10-15 23:57 UTC (permalink / raw)
  To: dborkman; +Cc: luto, torvalds, kaber, netdev, tgraf
In-Reply-To: <543F0712.8080503@redhat.com>

From: Daniel Borkmann <dborkman@redhat.com>
Date: Thu, 16 Oct 2014 01:45:22 +0200

> On 10/15/2014 04:01 AM, David Miller wrote:
>> From: Andy Lutomirski <luto@amacapital.net>
>> Date: Tue, 14 Oct 2014 15:16:46 -0700
>>
>>> It's at least remotely possible that there's something that assumes
>>> that assumes that the availability of NETLINK_RX_RING implies
>>> NETLINK_TX_RING, which would be unfortunate.
>>
>> I already found one such case, nlmon :-/
> 
> Hmm, can you elaborate? I currently don't think that nlmon cares
> actually.

nlmon cares, openvswitch cares, etc:

http://openvswitch.org/pipermail/dev/2013-December/034496.html

^ permalink raw reply

* Re: Netlink mmap tx security?
From: Andy Lutomirski @ 2014-10-15 23:58 UTC (permalink / raw)
  To: David Miller
  Cc: Daniel Borkmann, Linus Torvalds, Patrick McHardy,
	Network Development, Thomas Graf
In-Reply-To: <20141015.195737.1429281929513331763.davem@davemloft.net>

On Wed, Oct 15, 2014 at 4:57 PM, David Miller <davem@davemloft.net> wrote:
> From: Daniel Borkmann <dborkman@redhat.com>
> Date: Thu, 16 Oct 2014 01:45:22 +0200
>
>> On 10/15/2014 04:01 AM, David Miller wrote:
>>> From: Andy Lutomirski <luto@amacapital.net>
>>> Date: Tue, 14 Oct 2014 15:16:46 -0700
>>>
>>>> It's at least remotely possible that there's something that assumes
>>>> that assumes that the availability of NETLINK_RX_RING implies
>>>> NETLINK_TX_RING, which would be unfortunate.
>>>
>>> I already found one such case, nlmon :-/
>>
>> Hmm, can you elaborate? I currently don't think that nlmon cares
>> actually.
>
> nlmon cares, openvswitch cares, etc:

It'll still work, just slower, right?

+    if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_RX_RING, &req,
sizeof(req)) < 0
+        || setsockopt(sock->fd, SOL_NETLINK, NETLINK_TX_RING, &req,
sizeof(req)) < 0) {
+        VLOG_INFO("mmap netlink is not supported");
+        return 0;
+    }
+

--Andy

^ permalink raw reply

* Re: [PATCH 1/1 net-next] openvswitch: kerneldoc warning fix
From: David Miller @ 2014-10-16  0:01 UTC (permalink / raw)
  To: fabf; +Cc: linux-kernel, pshelar, dev, netdev
In-Reply-To: <1413399798-8514-1-git-send-email-fabf@skynet.be>


Pravin could you please review these two openvswitch kerneldoc
fixes from Fabian?

Thanks.

^ permalink raw reply

* Re: feature suggestion: implement SO_PEERCRED on local AF_INET/AF_INET6 sockets (allow uid-based identification on localhost)
From: David Madore @ 2014-10-16  0:07 UTC (permalink / raw)
  To: Linux Kernel mailing-list, Linux network mailing-list; +Cc: Andy Lutomirski
In-Reply-To: <CALCETrXnfU6cGaYNBy-X=GKwBZ07Wf8VvnUz1R-qgKVtGrS=hQ@mail.gmail.com>

On Wed, Oct 15, 2014 at 03:54:08PM -0700, Andy Lutomirski wrote:
> On Wed, Oct 15, 2014 at 3:30 PM, David Madore <david+ml@madore.org> wrote:
> > Note that since the possibility of using SO_PEERCRED on AF_INET
> > sockets does not hitherto exist on Linux, we can be sure that nobody
> > uses it, so it's not like it might open vulnerabilities in existing
> > code.  If you think it's insecure, it can be documented as such (by
> > comparing it with identd): I still think it's better than having no
> > control at all when binding to localhost, which is the present
> > situation (causing, e.g., CVE-2014-2914).
> 
> This doesn't follow.  *Everybody* uses connect on AF_INET.
> 
> IMO anything that sends a caller's credentials needs to be explicit and opt-in.

I'm confused as to whether you mean "opt-in" on the side of the caller
(=process requesting the endpoint's credentials), or on that of the
endpoint (=authenticated process).  On the one hand I don't understand
what it could mean on the caller side, on the other hand you mention
explicit support in OpenSSH, which would be the caller in my scenario.

So, in case I haven't been clear enough, the situation I have in mind
is: on "thishost", I run "ssh -L 14321:remotehost:4321 somehost" to
forward connexions on from the local port 14321 of thishost (where ssh
listens on the loopback) to the port 4321 of remotehost.
Unfortunately, now everyone with an acccount on thishost can connect
to port 14321 and effectively emit a connection from somehost to
remotehost on my behalf.  I think everyone agrees that this is a huge
problem.  But I don't understand how you propose to remedy this.

Patching ssh is an option, but I don't see how to do it (ssh needs to
make sure that the connections it receives on 14321 are from the same
uid, and this seems impossible without the feature I'm discussing).
Patching the kernel is an option.  Patching clients that connect to
14321, on the other hand, is not, because there are many different
ones, and their protocol is defined by immutable Internet standards,
so we have no latitude there (for example, we can't ask a Web browser
to connect to Unix domain sockets: there simply isn't a URL scheme to
refer to them).  Adding iptables rules is not an option if I'm not the
system administrator on thishost.

So, how can we solve this problem securely?

> I believe that there is no secure way to authenticate clients that
> currently don't authenticate themselves without changing the clients.
> That's the whole point: currently-secure are written under the
> assumption that they are not exercising their credentials.  You can't
> safely change that without making it opt-in.

Then what are we to do, given that modifying the clients is
impossible?

What about my proposal that user credentials would be returned only if
they refer to the same user as the caller user and that the caller is
permitted to ptrace the endpoint?  This answers your objection of
leaking credentials: the caller could do anything at all with the
other side since it could ptrace it - we're just permitting a user to
authenticate their own sockets.  A further sysctl could enable the use
of the call in more general cases, for those administrators who think
it should be allowed.

-- 
     David A. Madore
   ( http://www.madore.org/~david/ )

^ permalink raw reply

* Re: feature suggestion: implement SO_PEERCRED on local AF_INET/AF_INET6 sockets (allow uid-based identification on localhost)
From: Andy Lutomirski @ 2014-10-16  0:11 UTC (permalink / raw)
  To: David Madore; +Cc: Linux Kernel mailing-list, Linux network mailing-list
In-Reply-To: <20141016000705.GA12208@achernar.madore.org>

On Wed, Oct 15, 2014 at 5:07 PM, David Madore <david+ml@madore.org> wrote:
> On Wed, Oct 15, 2014 at 03:54:08PM -0700, Andy Lutomirski wrote:
>> On Wed, Oct 15, 2014 at 3:30 PM, David Madore <david+ml@madore.org> wrote:
>> > Note that since the possibility of using SO_PEERCRED on AF_INET
>> > sockets does not hitherto exist on Linux, we can be sure that nobody
>> > uses it, so it's not like it might open vulnerabilities in existing
>> > code.  If you think it's insecure, it can be documented as such (by
>> > comparing it with identd): I still think it's better than having no
>> > control at all when binding to localhost, which is the present
>> > situation (causing, e.g., CVE-2014-2914).
>>
>> This doesn't follow.  *Everybody* uses connect on AF_INET.
>>
>> IMO anything that sends a caller's credentials needs to be explicit and opt-in.
>
> I'm confused as to whether you mean "opt-in" on the side of the caller
> (=process requesting the endpoint's credentials), or on that of the
> endpoint (=authenticated process).  On the one hand I don't understand
> what it could mean on the caller side, on the other hand you mention
> explicit support in OpenSSH, which would be the caller in my scenario.

I mean the authenticated process, not the process doing the authentication.

>
> So, in case I haven't been clear enough, the situation I have in mind
> is: on "thishost", I run "ssh -L 14321:remotehost:4321 somehost" to
> forward connexions on from the local port 14321 of thishost (where ssh
> listens on the loopback) to the port 4321 of remotehost.
> Unfortunately, now everyone with an acccount on thishost can connect
> to port 14321 and effectively emit a connection from somehost to
> remotehost on my behalf.  I think everyone agrees that this is a huge
> problem.  But I don't understand how you propose to remedy this.

Unfortunately, I think that you need client changes.  These could be
semi-transparent (using LD_PRELOAD) or almost completely transparent
(using network namespaces).

Actually, a network namespace-based proxying tool could be very useful.

>
> Patching ssh is an option, but I don't see how to do it (ssh needs to
> make sure that the connections it receives on 14321 are from the same
> uid, and this seems impossible without the feature I'm discussing).
> Patching the kernel is an option.  Patching clients that connect to
> 14321, on the other hand, is not, because there are many different
> ones, and their protocol is defined by immutable Internet standards,
> so we have no latitude there (for example, we can't ask a Web browser
> to connect to Unix domain sockets: there simply isn't a URL scheme to
> refer to them).  Adding iptables rules is not an option if I'm not the
> system administrator on thishost.

I misunderstood.  I though that you wanted a server-side solution.

>
> So, how can we solve this problem securely?
>
>> I believe that there is no secure way to authenticate clients that
>> currently don't authenticate themselves without changing the clients.
>> That's the whole point: currently-secure are written under the
>> assumption that they are not exercising their credentials.  You can't
>> safely change that without making it opt-in.
>
> Then what are we to do, given that modifying the clients is
> impossible?
>
> What about my proposal that user credentials would be returned only if
> they refer to the same user as the caller user and that the caller is
> permitted to ptrace the endpoint?  This answers your objection of
> leaking credentials: the caller could do anything at all with the
> other side since it could ptrace it - we're just permitting a user to
> authenticate their own sockets.  A further sysctl could enable the use
> of the call in more general cases, for those administrators who think
> it should be allowed.
>

Ugh.

That's probably safe, but it's quite disgusting IMO.

--Andy

^ permalink raw reply

* [PATCH] vxlan: fix a use after free in vxlan_encap_bypass
From: roy.qing.li @ 2014-10-16  0:49 UTC (permalink / raw)
  To: netdev

From: Li RongQing <roy.qing.li@gmail.com>

when netif_rx() is done, the netif_rx handled skb maybe be freed,
and should not be used.

Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
---
 drivers/net/vxlan.c |    8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 6f83be7..f677cd0 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1668,6 +1668,8 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 	struct pcpu_sw_netstats *tx_stats, *rx_stats;
 	union vxlan_addr loopback;
 	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
+	struct net_device *dev = skb->dev;
+	int len = skb->len;
 
 	tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
 	rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
@@ -1691,16 +1693,16 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
 
 	u64_stats_update_begin(&tx_stats->syncp);
 	tx_stats->tx_packets++;
-	tx_stats->tx_bytes += skb->len;
+	tx_stats->tx_bytes += len;
 	u64_stats_update_end(&tx_stats->syncp);
 
 	if (netif_rx(skb) == NET_RX_SUCCESS) {
 		u64_stats_update_begin(&rx_stats->syncp);
 		rx_stats->rx_packets++;
-		rx_stats->rx_bytes += skb->len;
+		rx_stats->rx_bytes += len;
 		u64_stats_update_end(&rx_stats->syncp);
 	} else {
-		skb->dev->stats.rx_dropped++;
+		dev->stats.rx_dropped++;
 	}
 }
 
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH 1/1 net-next] openvswitch: use vport instead of p
From: Pravin Shelar @ 2014-10-16  1:11 UTC (permalink / raw)
  To: Fabian Frederick; +Cc: LKML, David S. Miller, dev@openvswitch.org, netdev
In-Reply-To: <1413399821-8558-1-git-send-email-fabf@skynet.be>

On Wed, Oct 15, 2014 at 12:03 PM, Fabian Frederick <fabf@skynet.be> wrote:
> All functions used struct vport *vport except
> ovs_vport_find_upcall_portid.
>
> This fixes 1 kerneldoc warning
>
> Signed-off-by: Fabian Frederick <fabf@skynet.be>


Acked-by: Pravin B Shelar <pshelar@nicira.com>

Thanks.

> ---
>  net/openvswitch/vport.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
> index 53001b0..6015802 100644
> --- a/net/openvswitch/vport.c
> +++ b/net/openvswitch/vport.c
> @@ -408,13 +408,13 @@ int ovs_vport_get_upcall_portids(const struct vport *vport,
>   *
>   * Returns the portid of the target socket.  Must be called with rcu_read_lock.
>   */
> -u32 ovs_vport_find_upcall_portid(const struct vport *p, struct sk_buff *skb)
> +u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
>  {
>         struct vport_portids *ids;
>         u32 ids_index;
>         u32 hash;
>
> -       ids = rcu_dereference(p->upcall_portids);
> +       ids = rcu_dereference(vport->upcall_portids);
>
>         if (ids->n_ids == 1 && ids->ids[0] == 0)
>                 return 0;
> --
> 1.9.3
>

^ permalink raw reply

* Re: [PATCH 1/1 net-next] openvswitch: kerneldoc warning fix
From: Pravin Shelar @ 2014-10-16  1:12 UTC (permalink / raw)
  To: Fabian Frederick; +Cc: LKML, David S. Miller, dev@openvswitch.org, netdev
In-Reply-To: <1413399798-8514-1-git-send-email-fabf@skynet.be>

On Wed, Oct 15, 2014 at 12:03 PM, Fabian Frederick <fabf@skynet.be> wrote:
> s/sock/gs
>
> Signed-off-by: Fabian Frederick <fabf@skynet.be>

Looks good.

Acked-by: Pravin B Shelar <pshelar@nicira.com>

Thanks.

> ---
>  net/openvswitch/vport-geneve.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
> index 910b3ef..106a9d8 100644
> --- a/net/openvswitch/vport-geneve.c
> +++ b/net/openvswitch/vport-geneve.c
> @@ -30,7 +30,7 @@
>
>  /**
>   * struct geneve_port - Keeps track of open UDP ports
> - * @sock: The socket created for this port number.
> + * @gs: The socket created for this port number.
>   * @name: vport name.
>   */
>  struct geneve_port {
> --
> 1.9.3
>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox