Netdev List
 help / color / mirror / Atom feed
* [PATCH net 0/2] Two BPF fixes
From: Daniel Borkmann @ 2016-12-17  0:54 UTC (permalink / raw)
  To: davem; +Cc: ast, netdev, Daniel Borkmann

This set contains two BPF fixes for net, one that addresses the
complaint from Geert wrt static allocations, and the other is a
fix wrt mem accounting that I found recently during testing.

Thanks!

Daniel Borkmann (2):
  bpf: dynamically allocate digest scratch buffer
  bpf: fix overflow in prog accounting

 include/linux/bpf.h    |  2 +-
 include/linux/filter.h | 17 ++++++++--
 kernel/bpf/core.c      | 88 ++++++++++++++++++++++++++++++++++++++++++--------
 kernel/bpf/syscall.c   | 27 +---------------
 kernel/bpf/verifier.c  |  6 ++--
 5 files changed, 94 insertions(+), 46 deletions(-)

-- 
1.9.3

^ permalink raw reply

* [PATCH net 1/2] bpf: dynamically allocate digest scratch buffer
From: Daniel Borkmann @ 2016-12-17  0:54 UTC (permalink / raw)
  To: davem; +Cc: ast, netdev, Daniel Borkmann
In-Reply-To: <cover.1481935106.git.daniel@iogearbox.net>

Geert rightfully complained that 7bd509e311f4 ("bpf: add prog_digest
and expose it via fdinfo/netlink") added a too large allocation of
variable 'raw' from bss section, and should instead be done dynamically:

  # ./scripts/bloat-o-meter kernel/bpf/core.o.1 kernel/bpf/core.o.2
  add/remove: 3/0 grow/shrink: 0/0 up/down: 33291/0 (33291)
  function                                     old     new   delta
  raw                                            -   32832  +32832
  [...]

Since this is only relevant during program creation path, which can be
considered slow-path anyway, lets allocate that dynamically and be not
implicitly dependent on verifier mutex. Move bpf_prog_calc_digest() at
the beginning of replace_map_fd_with_map_ptr() and also error handling
stays straight forward.

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/bpf.h    |  2 +-
 include/linux/filter.h | 14 +++++++++++---
 kernel/bpf/core.c      | 27 ++++++++++++++++-----------
 kernel/bpf/syscall.c   |  2 +-
 kernel/bpf/verifier.c  |  6 ++++--
 5 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 8796ff0..201eb48 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -216,7 +216,7 @@ struct bpf_event_entry {
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
-void bpf_prog_calc_digest(struct bpf_prog *fp);
+int bpf_prog_calc_digest(struct bpf_prog *fp);
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6a16583..a0934e6 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -57,9 +57,6 @@
 /* BPF program can access up to 512 bytes of stack space. */
 #define MAX_BPF_STACK	512
 
-/* Maximum BPF program size in bytes. */
-#define MAX_BPF_SIZE	(BPF_MAXINSNS * sizeof(struct bpf_insn))
-
 /* Helper macros for filter block array initializers. */
 
 /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
@@ -517,6 +514,17 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 	return BPF_PROG_RUN(prog, xdp);
 }
 
+static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
+{
+	return prog->len * sizeof(struct bpf_insn);
+}
+
+static inline u32 bpf_prog_digest_scratch_size(const struct bpf_prog *prog)
+{
+	return round_up(bpf_prog_insn_size(prog) +
+			sizeof(__be64) + 1, SHA_MESSAGE_BYTES);
+}
+
 static inline unsigned int bpf_prog_size(unsigned int proglen)
 {
 	return max(sizeof(struct bpf_prog),
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 83e0d15..75c08b8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -136,28 +136,29 @@ void __bpf_prog_free(struct bpf_prog *fp)
 	vfree(fp);
 }
 
-#define SHA_BPF_RAW_SIZE						\
-	round_up(MAX_BPF_SIZE + sizeof(__be64) + 1, SHA_MESSAGE_BYTES)
-
-/* Called under verifier mutex. */
-void bpf_prog_calc_digest(struct bpf_prog *fp)
+int bpf_prog_calc_digest(struct bpf_prog *fp)
 {
 	const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64);
-	static u32 ws[SHA_WORKSPACE_WORDS];
-	static u8 raw[SHA_BPF_RAW_SIZE];
-	struct bpf_insn *dst = (void *)raw;
+	u32 raw_size = bpf_prog_digest_scratch_size(fp);
+	u32 ws[SHA_WORKSPACE_WORDS];
 	u32 i, bsize, psize, blocks;
+	struct bpf_insn *dst;
 	bool was_ld_map;
-	u8 *todo = raw;
+	u8 *raw, *todo;
 	__be32 *result;
 	__be64 *bits;
 
+	raw = vmalloc(raw_size);
+	if (!raw)
+		return -ENOMEM;
+
 	sha_init(fp->digest);
 	memset(ws, 0, sizeof(ws));
 
 	/* We need to take out the map fd for the digest calculation
 	 * since they are unstable from user space side.
 	 */
+	dst = (void *)raw;
 	for (i = 0, was_ld_map = false; i < fp->len; i++) {
 		dst[i] = fp->insnsi[i];
 		if (!was_ld_map &&
@@ -177,12 +178,13 @@ void bpf_prog_calc_digest(struct bpf_prog *fp)
 		}
 	}
 
-	psize = fp->len * sizeof(struct bpf_insn);
-	memset(&raw[psize], 0, sizeof(raw) - psize);
+	psize = bpf_prog_insn_size(fp);
+	memset(&raw[psize], 0, raw_size - psize);
 	raw[psize++] = 0x80;
 
 	bsize  = round_up(psize, SHA_MESSAGE_BYTES);
 	blocks = bsize / SHA_MESSAGE_BYTES;
+	todo   = raw;
 	if (bsize - psize >= sizeof(__be64)) {
 		bits = (__be64 *)(todo + bsize - sizeof(__be64));
 	} else {
@@ -199,6 +201,9 @@ void bpf_prog_calc_digest(struct bpf_prog *fp)
 	result = (__force __be32 *)fp->digest;
 	for (i = 0; i < SHA_DIGEST_WORDS; i++)
 		result[i] = cpu_to_be32(fp->digest[i]);
+
+	vfree(raw);
+	return 0;
 }
 
 static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4819ec9..35d674c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -811,7 +811,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 
 	err = -EFAULT;
 	if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns),
-			   prog->len * sizeof(struct bpf_insn)) != 0)
+			   bpf_prog_insn_size(prog)) != 0)
 		goto free_prog;
 
 	prog->orig_prog = NULL;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 81e267b..64b7b1a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2931,6 +2931,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 	int insn_cnt = env->prog->len;
 	int i, j, err;
 
+	err = bpf_prog_calc_digest(env->prog);
+	if (err)
+		return err;
+
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		if (BPF_CLASS(insn->code) == BPF_LDX &&
 		    (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
@@ -3178,8 +3182,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		log_level = 0;
 	}
 
-	bpf_prog_calc_digest(env->prog);
-
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
 		goto skip_full_check;
-- 
1.9.3

^ permalink raw reply related

* [PATCH net 2/2] bpf: fix overflow in prog accounting
From: Daniel Borkmann @ 2016-12-17  0:54 UTC (permalink / raw)
  To: davem; +Cc: ast, netdev, Daniel Borkmann
In-Reply-To: <cover.1481935106.git.daniel@iogearbox.net>

Commit aaac3ba95e4c ("bpf: charge user for creation of BPF maps and
programs") made a wrong assumption of charging against prog->pages.
Unlike map->pages, prog->pages are still subject to change when we
need to expand the program through bpf_prog_realloc().

This can for example happen during verification stage when we need to
expand and rewrite parts of the program. Should the required space
cross a page boundary, then prog->pages is not the same anymore as
its original value that we used to bpf_prog_charge_memlock() on. Thus,
we'll hit a wrap-around during bpf_prog_uncharge_memlock() when prog
is freed eventually. I noticed this that despite having unlimited
memlock, programs suddenly refused to load with EPERM error due to
insufficient memlock.

There are two ways to fix this issue. One would be to add a cached
variable to struct bpf_prog that takes a snapshot of prog->pages at the
time of charging. The other approach is to also account for resizes. I
chose to go with the latter for a couple of reasons: i) We want accounting
rather to be more accurate instead of further fooling limits, ii) adding
yet another page counter on struct bpf_prog would also be a waste just
for this purpose. We also do want to charge as early as possible to
avoid going into the verifier just to find out later on that we crossed
limits. The only place that needs to be fixed is bpf_prog_realloc(),
since only here we expand the program, so we try to account for the
needed delta and should we fail, call-sites check for outcome anyway.
On cBPF to eBPF migrations, we don't grab a reference to the user as
they are charged differently. With that in place, my test case worked
fine.

Fixes: aaac3ba95e4c ("bpf: charge user for creation of BPF maps and programs")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/filter.h |  3 +++
 kernel/bpf/core.c      | 61 +++++++++++++++++++++++++++++++++++++++++++++++---
 kernel/bpf/syscall.c   | 25 ---------------------
 3 files changed, 61 insertions(+), 28 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index a0934e6..496a8c0 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -577,6 +577,9 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 				  gfp_t gfp_extra_flags);
 void __bpf_prog_free(struct bpf_prog *fp);
 
+int bpf_prog_charge_memlock(struct bpf_prog *prog);
+void bpf_prog_uncharge_memlock(struct bpf_prog *prog);
+
 static inline void bpf_prog_unlock_free(struct bpf_prog *fp)
 {
 	bpf_prog_unlock_ro(fp);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 75c08b8..1f9a146 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -71,6 +71,51 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
 	return NULL;
 }
 
+static int __bpf_prog_charge(struct user_struct *user, u32 pages)
+{
+	unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+	unsigned long user_bufs;
+
+	if (user) {
+		user_bufs = atomic_long_add_return(pages, &user->locked_vm);
+		if (user_bufs > memlock_limit) {
+			atomic_long_sub(pages, &user->locked_vm);
+			return -EPERM;
+		}
+	}
+
+	return 0;
+}
+
+static void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
+{
+	if (user)
+		atomic_long_sub(pages, &user->locked_vm);
+}
+
+int bpf_prog_charge_memlock(struct bpf_prog *prog)
+{
+	struct user_struct *user = get_current_user();
+	int ret;
+
+	ret = __bpf_prog_charge(user, prog->pages);
+	if (ret) {
+		free_uid(user);
+		return ret;
+	}
+
+	prog->aux->user = user;
+	return 0;
+}
+
+void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
+{
+	struct user_struct *user = prog->aux->user;
+
+	__bpf_prog_uncharge(user, prog->pages);
+	free_uid(user);
+}
+
 struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 {
 	gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
@@ -105,19 +150,29 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 	gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
 			  gfp_extra_flags;
 	struct bpf_prog *fp;
+	u32 pages, delta;
+	int ret;
 
 	BUG_ON(fp_old == NULL);
 
 	size = round_up(size, PAGE_SIZE);
-	if (size <= fp_old->pages * PAGE_SIZE)
+	pages = size / PAGE_SIZE;
+	if (pages <= fp_old->pages)
 		return fp_old;
 
+	delta = pages - fp_old->pages;
+	ret = __bpf_prog_charge(fp_old->aux->user, delta);
+	if (ret)
+		return NULL;
+
 	fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
-	if (fp != NULL) {
+	if (fp == NULL) {
+		__bpf_prog_uncharge(fp_old->aux->user, delta);
+	} else {
 		kmemcheck_annotate_bitfield(fp, meta);
 
 		memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
-		fp->pages = size / PAGE_SIZE;
+		fp->pages = pages;
 		fp->aux->prog = fp;
 
 		/* We keep fp->aux from fp_old around in the new
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35d674c..016382b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -615,31 +615,6 @@ static void free_used_maps(struct bpf_prog_aux *aux)
 	kfree(aux->used_maps);
 }
 
-static int bpf_prog_charge_memlock(struct bpf_prog *prog)
-{
-	struct user_struct *user = get_current_user();
-	unsigned long memlock_limit;
-
-	memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-	atomic_long_add(prog->pages, &user->locked_vm);
-	if (atomic_long_read(&user->locked_vm) > memlock_limit) {
-		atomic_long_sub(prog->pages, &user->locked_vm);
-		free_uid(user);
-		return -EPERM;
-	}
-	prog->aux->user = user;
-	return 0;
-}
-
-static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
-{
-	struct user_struct *user = prog->aux->user;
-
-	atomic_long_sub(prog->pages, &user->locked_vm);
-	free_uid(user);
-}
-
 static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 {
 	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
-- 
1.9.3

^ permalink raw reply related

* [PATCH] net: mv643xx_eth: fix build failure
From: Sudip Mukherjee @ 2016-12-17  0:45 UTC (permalink / raw)
  To: Sebastian Hesselbarth; +Cc: linux-kernel, netdev, Sudip Mukherjee

The build of sparc allmodconfig fails with the error:
"of_irq_to_resource" [drivers/net/ethernet/marvell/mv643xx_eth.ko]
	undefined!

of_irq_to_resource() is defined when CONFIG_OF_IRQ is defined. And also
CONFIG_OF_IRQ can only be defined if CONFIG_IRQ is defined. So we can
safely use #if defined(CONFIG_OF_IRQ) in the code.

Signed-off-by: Sudip Mukherjee <sudip.mukherjee@codethink.co.uk>
---

build log of next-20161216 is at:
https://travis-ci.org/sudipm-mukherjee/parport/jobs/184652820

 drivers/net/ethernet/marvell/mv643xx_eth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c
index 5f62c3d..1fa7c03 100644
--- a/drivers/net/ethernet/marvell/mv643xx_eth.c
+++ b/drivers/net/ethernet/marvell/mv643xx_eth.c
@@ -2713,7 +2713,7 @@ static void infer_hw_params(struct mv643xx_eth_shared_private *msp)
 MODULE_DEVICE_TABLE(of, mv643xx_eth_shared_ids);
 #endif
 
-#if defined(CONFIG_OF) && !defined(CONFIG_MV64X60)
+#if defined(CONFIG_OF_IRQ) && !defined(CONFIG_MV64X60)
 #define mv643xx_eth_property(_np, _name, _v)				\
 	do {								\
 		u32 tmp;						\
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH 1/2] bpf: do not use KMALLOC_SHIFT_MAX
From: Alexei Starovoitov @ 2016-12-17  0:28 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Cristopher Lameter, Andrew Morton, Alexei Starovoitov,
	netdev, Daniel Borkmann
In-Reply-To: <20161216233917.GB23392@dhcp22.suse.cz>

On Sat, Dec 17, 2016 at 12:39:17AM +0100, Michal Hocko wrote:
> On Fri 16-12-16 15:23:42, Alexei Starovoitov wrote:
> > On Fri, Dec 16, 2016 at 11:02:35PM +0100, Michal Hocko wrote:
> > > On Fri 16-12-16 10:02:10, Alexei Starovoitov wrote:
> > > > On Thu, Dec 15, 2016 at 05:47:21PM +0100, Michal Hocko wrote:
> > > > > From: Michal Hocko <mhocko@suse.com>
> > > > > 
> > > > > 01b3f52157ff ("bpf: fix allocation warnings in bpf maps and integer
> > > > > overflow") has added checks for the maximum allocateable size. It
> > > > > (ab)used KMALLOC_SHIFT_MAX for that purpose. While this is not incorrect
> > > > > it is not very clean because we already have KMALLOC_MAX_SIZE for this
> > > > > very reason so let's change both checks to use KMALLOC_MAX_SIZE instead.
> > > > > 
> > > > > Cc: Alexei Starovoitov <ast@kernel.org>
> > > > > Signed-off-by: Michal Hocko <mhocko@suse.com>
> > > > 
> > > > Nack until the patches 1 and 2 are reversed.
> > > 
> > > I do not insist on ordering. The thing is that it shouldn't matter all
> > > that much. Or are you worried about bisectability?
> > 
> > This patch 1 strongly depends on patch 2 !
> > Therefore order matters.
> > The patch 1 by itself is broken.
> > The commit log is saying
> > '(ab)used KMALLOC_SHIFT_MAX for that purpose .. use KMALLOC_MAX_SIZE instead'
> > that is also incorrect. We cannot do that until KMALLOC_MAX_SIZE is fixed.
> > So please change the order
> 
> Yes, I agree that using KMALLOC_MAX_SIZE could lead to a warning with
> the current ordering. Why that matters all that much is less clear to
> me. The allocation would simply fail and you would return ENOMEM rather
> than E2BIG. Does this really matter?
> 
> Anyway, as I've said, I do not really insist on the current ordering and
> the will ask Andrew to reorder them. I am just really wondering about
> such a strong pushback about something that barely matters. Or maybe I
> am just missing your point and checking KMALLOC_MAX_SIZE without an
> update would lead to a wrong behavior, user space breakage, crash or
> anything similar.

if admin set ulimit for locked memory high enough for the particular user,
that non-root user will be able to trigger warn_on_once in __alloc_pages_slowpath
which is not acceptable.
Also see the comment in hashtab.c
  if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
      MAX_BPF_STACK - sizeof(struct htab_elem))
          /* if value_size is bigger, the user space won't be able to
           * access the elements via bpf syscall. This check also makes
           * sure that the elem_size doesn't overflow and it's
           * kmalloc-able later in htab_map_update_elem()
           */
          goto free_htab;

> > and fix the commit log to say that KMALLOC_MAX_SIZE
> > is actually valid limit now.
> 
> KMALLOC_MAX_SIZE has always been the right limit. It's value has been
> incorrect but that is to be fixed now. Using KMALLOC_SHIFT_MAX is simply
> abusing an internal constant. So I am not sure what should be fixed in
> the changelog.

that's exactly my problem with this patch and the commit log.
You think it's abusing KMALLOC_SHIFT_MAX whereas it's doing so
for reasons stated above.
That piece of code cannot use KMALLOC_MAX_SIZE until it's fixed.
So commit log should say something like:
"now since KMALLOC_MAX_SIZE is fixed and size < KMALLOC_MAX_SIZE condition
guarantees warn free allocation in kmalloc(value_size, GFP_USER | __GFP_NOWARN);
we can safely use KMALLOC_MAX_SIZE instead of KMALLOC_SHIFT_MAX"

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH] net: wan: Use dma_pool_zalloc
From: Joe Perches @ 2016-12-17  0:17 UTC (permalink / raw)
  To: Souptick Joarder; +Cc: Krzysztof Hałasa, netdev, Rameshwar Sahu
In-Reply-To: <CAFqt6zYasTqZP1GOXJ7u7=L9U8bgM3SnKb-L7yyVVeDTbv+q0g@mail.gmail.com>

On Fri, 2016-12-16 at 19:25 +0530, Souptick Joarder wrote:
> On Fri, Dec 16, 2016 at 11:40 AM, Joe Perches <joe@perches.com> wrote:
> > On Fri, 2016-12-16 at 11:33 +0530, Souptick Joarder wrote:
> > > On Thu, Dec 15, 2016 at 10:18 PM, Joe Perches <joe@perches.com> wrote:
> > > > On Thu, 2016-12-15 at 10:41 +0530, Souptick Joarder wrote:
> > > > > On Mon, Dec 12, 2016 at 10:12 AM, Souptick Joarder <jrdr.linux@gmail.com> wrote:
> > > > > > On Fri, Dec 9, 2016 at 6:33 PM, Krzysztof Hałasa <khalasa@piap.pl> wrote:
> > > > > > > Souptick Joarder <jrdr.linux@gmail.com> writes:
> > > > > > > 
> > > > > > > > We should use dma_pool_zalloc instead of dma_pool_alloc/memset
> > > > 
> > > > []
> > > > > > > > diff --git a/drivers/net/wan/ixp4xx_hss.c b/drivers/net/wan/ixp4xx_hss.c
> > > > 
> > > > []
> > > > > > > > @@ -976,10 +976,9 @@ static int init_hdlc_queues(struct port *port)
> > > > > > > >                       return -ENOMEM;
> > > > > > > >       }
> > > > > > > > 
> > > > > > > > -     if (!(port->desc_tab = dma_pool_alloc(dma_pool, GFP_KERNEL,
> > > > > > > > -                                           &port->desc_tab_phys)))
> > > > > > > > +     if (!(port->desc_tab = dma_pool_zalloc(dma_pool, GFP_KERNEL,
> > > > > > > > +                                            &port->desc_tab_phys)))
> > > > > > > >               return -ENOMEM;
> > > > > > > > -     memset(port->desc_tab, 0, POOL_ALLOC_SIZE);
> > > > > > > >       memset(port->rx_buff_tab, 0, sizeof(port->rx_buff_tab)); /* tables */
> > > > > > > >       memset(port->tx_buff_tab, 0, sizeof(port->tx_buff_tab));
> > > > > > > 
> > > > > > > This look fine, feel free to send it to the netdev mailing list for
> > > > > > > inclusion.
> > > > > > 
> > > > > > Including netdev mailing list based as requested.
> > > > > > > Acked-by: Krzysztof Halasa <khalasa@piap.pl>
> > > > 
> > > > []
> > > > > Any comment on this patch ?
> > > > 
> > > > Shouldn't the one in drivers/net/ethernet/xscale/ixp4xx_eth.c
> > > > also be changed?
> > > 
> > > Yes, you are right.   Do you want me to include it in same patch?
> > 
> > Your choice.  I would use a single patch.
> 
> There are few other places where the same change is applicable.
> I am planning to put all those changes in a single patch. It includes
> changes in drivers/net/ethernet/xscale/ixp4xx_eth.c
> 
> You can review this patch separately.

If you are spanning multiple drivers maintained by different
groups, it's probably better to create a patch series, one for
each driver, to allow the various maintainers to apply the
patches to their individually maintained drivers.

Joe

^ permalink raw reply

* Re: [PATCH v5 1/4] siphash: add cryptographically secure PRF
From: George Spelvin @ 2016-12-16 23:44 UTC (permalink / raw)
  To: Jason, linux
  Cc: ak, davem, David.Laight, djb, ebiggers3, hannes,
	jeanphilippe.aumasson, kernel-hardening, linux-crypto,
	linux-kernel, luto, netdev, tom, torvalds, tytso, vegard.nossum
In-Reply-To: <CAHmME9oEhmqW3320Ch+Rczu_=CxQyUQXCGLnYjDm-CYbWugnSw@mail.gmail.com>

> 64-bit security for an RNG is not reasonable even with rekeying. No no
> no. Considering we already have a massive speed-up here with the
> secure version, there's zero reason to start weakening the security
> because we're trigger happy with our benchmarks. No no no.

Just to clarify, I was discussing the idea with Ted (who's in charge of
the whole thing, not me), not trying to make any sort of final decision
on the subject.  I need to look at the various users (46 non-trivial ones
for get_random_int, 15 for get_random_long) and see what their security
requirements actually are.

I'm also trying to see if HalfSipHash can be used in a way that gives
slightly more than 64 bits of effective security.

The problem is that the old MD5-based transform had unclear, but
obviously ample, security.  There were 64 bytes of global secret and
16 chaining bytes per CPU.  Adapting SipHash (even the full version)
takes more thinking.

An actual HalfSipHash-based equivalent to the existing code would be:

#define RANDOM_INT_WORDS (64 / sizeof(long))	/* 16 or 8 */

static u32 random_int_secret[RANDOM_INT_WORDS]
	____cacheline_aligned __read_mostly;
static DEFINE_PER_CPU(unsigned long[4], get_random_int_hash)
		__aligned(sizeof(unsigned long));

unsigned long get_random_long(void)
{
	unsigned long *hash = get_cpu_var(get_random_int_hash);
	unsigned long v0 = hash[0], v1 = hash[1], v2 = hash[2], v3 = hash[3];
	int i;

	/* This could be improved, but it's equivalent */
	v0 += current->pid + jiffies + random_get_entropy();

	for (i = 0; i < RANDOM_INT_WORDS; i++) {
		v3 ^= random_int_secret[i];
		HSIPROUND;
		HSIPROUND;
		v0 ^= random_int_secret[i];
	}
	/* To be equivalent, we *don't* finalize the transform */

	hash[0] = v0; hash[1] = v1; hash[2] = v2; hash[3] = v3;
	put_cpu_var(get_random_int_hash);

	return v0 ^ v1 ^ v2 ^ v3;
}

I don't think there's a 2^64 attack on that.

But 64 bytes of global secret is ridiculous if the hash function
doesn't require that minimum block size.  It'll take some thinking.


Ths advice I'd give now is:
- Implement
unsigned long hsiphash(const void *data, size_t len, const unsigned long key[2])
  .. as SipHash on 64-bit (maybe SipHash-1-3, still being discussed) and
  HalfSipHash on 32-bit.
- Document when it may or may not be used carefully.
- #define get_random_int (unsigned)get_random_long
- Ted, Andy Lutorminski and I will try to figure out a construction of
  get_random_long() that we all like.


('scuse me for a few hours, I have some unrelated things I really *should*
be working on...)

^ permalink raw reply

* Re: [PATCH 1/2] bpf: do not use KMALLOC_SHIFT_MAX
From: Daniel Borkmann @ 2016-12-16 23:40 UTC (permalink / raw)
  To: Alexei Starovoitov, Michal Hocko
  Cc: linux-mm, Cristopher Lameter, Andrew Morton, Alexei Starovoitov,
	netdev
In-Reply-To: <20161216232340.GA99159@ast-mbp.thefacebook.com>

On 12/17/2016 12:23 AM, Alexei Starovoitov wrote:
> On Fri, Dec 16, 2016 at 11:02:35PM +0100, Michal Hocko wrote:
>> On Fri 16-12-16 10:02:10, Alexei Starovoitov wrote:
>>> On Thu, Dec 15, 2016 at 05:47:21PM +0100, Michal Hocko wrote:
>>>> From: Michal Hocko <mhocko@suse.com>
>>>>
>>>> 01b3f52157ff ("bpf: fix allocation warnings in bpf maps and integer
>>>> overflow") has added checks for the maximum allocateable size. It
>>>> (ab)used KMALLOC_SHIFT_MAX for that purpose. While this is not incorrect
>>>> it is not very clean because we already have KMALLOC_MAX_SIZE for this
>>>> very reason so let's change both checks to use KMALLOC_MAX_SIZE instead.
>>>>
>>>> Cc: Alexei Starovoitov <ast@kernel.org>
>>>> Signed-off-by: Michal Hocko <mhocko@suse.com>
>>>
>>> Nack until the patches 1 and 2 are reversed.
>>
>> I do not insist on ordering. The thing is that it shouldn't matter all
>> that much. Or are you worried about bisectability?
>
> This patch 1 strongly depends on patch 2 !
> Therefore order matters.
> The patch 1 by itself is broken.
> The commit log is saying
> '(ab)used KMALLOC_SHIFT_MAX for that purpose .. use KMALLOC_MAX_SIZE instead'
> that is also incorrect. We cannot do that until KMALLOC_MAX_SIZE is fixed.
> So please change the order and fix the commit log to say that KMALLOC_MAX_SIZE
> is actually valid limit now.

Michal, please also Cc netdev on your v2. Looks like the set
originally didn't Cc it (at least I didn't see 2/2). Thanks.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH 1/2] bpf: do not use KMALLOC_SHIFT_MAX
From: Michal Hocko @ 2016-12-16 23:39 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-mm, Cristopher Lameter, Andrew Morton, Alexei Starovoitov,
	netdev, Daniel Borkmann
In-Reply-To: <20161216232340.GA99159@ast-mbp.thefacebook.com>

On Fri 16-12-16 15:23:42, Alexei Starovoitov wrote:
> On Fri, Dec 16, 2016 at 11:02:35PM +0100, Michal Hocko wrote:
> > On Fri 16-12-16 10:02:10, Alexei Starovoitov wrote:
> > > On Thu, Dec 15, 2016 at 05:47:21PM +0100, Michal Hocko wrote:
> > > > From: Michal Hocko <mhocko@suse.com>
> > > > 
> > > > 01b3f52157ff ("bpf: fix allocation warnings in bpf maps and integer
> > > > overflow") has added checks for the maximum allocateable size. It
> > > > (ab)used KMALLOC_SHIFT_MAX for that purpose. While this is not incorrect
> > > > it is not very clean because we already have KMALLOC_MAX_SIZE for this
> > > > very reason so let's change both checks to use KMALLOC_MAX_SIZE instead.
> > > > 
> > > > Cc: Alexei Starovoitov <ast@kernel.org>
> > > > Signed-off-by: Michal Hocko <mhocko@suse.com>
> > > 
> > > Nack until the patches 1 and 2 are reversed.
> > 
> > I do not insist on ordering. The thing is that it shouldn't matter all
> > that much. Or are you worried about bisectability?
> 
> This patch 1 strongly depends on patch 2 !
> Therefore order matters.
> The patch 1 by itself is broken.
> The commit log is saying
> '(ab)used KMALLOC_SHIFT_MAX for that purpose .. use KMALLOC_MAX_SIZE instead'
> that is also incorrect. We cannot do that until KMALLOC_MAX_SIZE is fixed.
> So please change the order

Yes, I agree that using KMALLOC_MAX_SIZE could lead to a warning with
the current ordering. Why that matters all that much is less clear to
me. The allocation would simply fail and you would return ENOMEM rather
than E2BIG. Does this really matter?

Anyway, as I've said, I do not really insist on the current ordering and
the will ask Andrew to reorder them. I am just really wondering about
such a strong pushback about something that barely matters. Or maybe I
am just missing your point and checking KMALLOC_MAX_SIZE without an
update would lead to a wrong behavior, user space breakage, crash or
anything similar.

> and fix the commit log to say that KMALLOC_MAX_SIZE
> is actually valid limit now.

KMALLOC_MAX_SIZE has always been the right limit. It's value has been
incorrect but that is to be fixed now. Using KMALLOC_SHIFT_MAX is simply
abusing an internal constant. So I am not sure what should be fixed in
the changelog.

-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: Soft lockup in inet_put_port on 4.6
From: Josef Bacik @ 2016-12-16 22:50 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Hannes Frederic Sowa, Craig Gallek, Eric Dumazet,
	Linux Kernel Network Developers
In-Reply-To: <CALx6S34qacbD-zxvEJyDVsp1_U_tkL_t0hEXrtfffBALsDMxEw@mail.gmail.com>

On Fri, Dec 16, 2016 at 5:18 PM, Tom Herbert <tom@herbertland.com> 
wrote:
> On Fri, Dec 16, 2016 at 2:08 PM, Josef Bacik <jbacik@fb.com> wrote:
>>  On Fri, Dec 16, 2016 at 10:21 AM, Josef Bacik <jbacik@fb.com> wrote:
>>> 
>>>  On Fri, Dec 16, 2016 at 9:54 AM, Josef Bacik <jbacik@fb.com> wrote:
>>>> 
>>>>  On Thu, Dec 15, 2016 at 7:07 PM, Hannes Frederic Sowa
>>>>  <hannes@stressinduktion.org> wrote:
>>>>> 
>>>>>  Hi Josef,
>>>>> 
>>>>>  On 15.12.2016 19:53, Josef Bacik wrote:
>>>>>> 
>>>>>>   On Tue, Dec 13, 2016 at 6:32 PM, Tom Herbert 
>>>>>> <tom@herbertland.com>
>>>>>>  wrote:
>>>>>>> 
>>>>>>>   On Tue, Dec 13, 2016 at 3:03 PM, Craig Gallek 
>>>>>>> <kraigatgoog@gmail.com>
>>>>>>>   wrote:
>>>>>>>> 
>>>>>>>>    On Tue, Dec 13, 2016 at 3:51 PM, Tom Herbert 
>>>>>>>> <tom@herbertland.com>
>>>>>>>>   wrote:
>>>>>>>>> 
>>>>>>>>>    I think there may be some suspicious code in 
>>>>>>>>> inet_csk_get_port. At
>>>>>>>>>    tb_found there is:
>>>>>>>>> 
>>>>>>>>>                    if (((tb->fastreuse > 0 && reuse) ||
>>>>>>>>>                         (tb->fastreuseport > 0 &&
>>>>>>>>>                          
>>>>>>>>> !rcu_access_pointer(sk->sk_reuseport_cb) &&
>>>>>>>>>                          sk->sk_reuseport && 
>>>>>>>>> uid_eq(tb->fastuid,
>>>>>>>>>   uid))) &&
>>>>>>>>>                        smallest_size == -1)
>>>>>>>>>                            goto success;
>>>>>>>>>                    if 
>>>>>>>>> (inet_csk(sk)->icsk_af_ops->bind_conflict(sk,
>>>>>>>>>   tb, true)) {
>>>>>>>>>                            if ((reuse ||
>>>>>>>>>                                 (tb->fastreuseport > 0 &&
>>>>>>>>>                                  sk->sk_reuseport &&
>>>>>>>>> 
>>>>>>>>>   !rcu_access_pointer(sk->sk_reuseport_cb) &&
>>>>>>>>>                                  uid_eq(tb->fastuid, uid))) &&
>>>>>>>>>                                smallest_size != -1 && 
>>>>>>>>> --attempts >=
>>>>>>>>>  0) {
>>>>>>>>>                                    
>>>>>>>>> spin_unlock_bh(&head->lock);
>>>>>>>>>                                    goto again;
>>>>>>>>>                            }
>>>>>>>>>                            goto fail_unlock;
>>>>>>>>>                    }
>>>>>>>>> 
>>>>>>>>>    AFAICT there is redundancy in these two conditionals.  The 
>>>>>>>>> same
>>>>>>>>>  clause
>>>>>>>>>    is being checked in both: (tb->fastreuseport > 0 &&
>>>>>>>>>    !rcu_access_pointer(sk->sk_reuseport_cb) && 
>>>>>>>>> sk->sk_reuseport &&
>>>>>>>>>    uid_eq(tb->fastuid, uid))) && smallest_size == -1. If this 
>>>>>>>>> is true
>>>>>>>>>  the
>>>>>>>>>    first conditional should be hit, goto done,  and the 
>>>>>>>>> second will
>>>>>>>>>  never
>>>>>>>>>    evaluate that part to true-- unless the sk is changed (do 
>>>>>>>>> we need
>>>>>>>>>    READ_ONCE for sk->sk_reuseport_cb?).
>>>>>>>> 
>>>>>>>>    That's an interesting point... It looks like this function 
>>>>>>>> also
>>>>>>>>    changed in 4.6 from using a single local_bh_disable() at the
>>>>>>>>  beginning
>>>>>>>>    with several spin_lock(&head->lock) to exclusively
>>>>>>>>    spin_lock_bh(&head->lock) at each locking point.  Perhaps 
>>>>>>>> the full
>>>>>>>>  bh
>>>>>>>>    disable variant was preventing the timers in your stack 
>>>>>>>> trace from
>>>>>>>>    running interleaved with this function before?
>>>>>>> 
>>>>>>> 
>>>>>>>   Could be, although dropping the lock shouldn't be able to 
>>>>>>> affect the
>>>>>>>   search state. TBH, I'm a little lost in reading function, the
>>>>>>>   SO_REUSEPORT handling is pretty complicated. For instance,
>>>>>>>   rcu_access_pointer(sk->sk_reuseport_cb) is checked three 
>>>>>>> times in
>>>>>>>  that
>>>>>>>   function and also in every call to inet_csk_bind_conflict. I 
>>>>>>> wonder
>>>>>>>  if
>>>>>>>   we can simply this under the assumption that SO_REUSEPORT is 
>>>>>>> only
>>>>>>>   allowed if the port number (snum) is explicitly specified.
>>>>>> 
>>>>>> 
>>>>>>   Ok first I have data for you Hannes, here's the time 
>>>>>> distributions
>>>>>>   before during and after the lockup (with all the debugging in 
>>>>>> place
>>>>>>  the
>>>>>>   box eventually recovers).  I've attached it as a text file 
>>>>>> since it is
>>>>>>   long.
>>>>> 
>>>>> 
>>>>>  Thanks a lot!
>>>>> 
>>>>>>   Second is I was thinking about why we would spend so much time 
>>>>>> doing
>>>>>>  the
>>>>>>   ->owners list, and obviously it's because of the massive 
>>>>>> amount of
>>>>>>   timewait sockets on the owners list.  I wrote the following 
>>>>>> dumb patch
>>>>>>   and tested it and the problem has disappeared completely.  Now 
>>>>>> I don't
>>>>>>   know if this is right at all, but I thought it was weird we 
>>>>>> weren't
>>>>>>   copying the soreuseport option from the original socket onto 
>>>>>> the twsk.
>>>>>>   Is there are reason we aren't doing this currently?  Does this 
>>>>>> help
>>>>>>   explain what is happening?  Thanks,
>>>>> 
>>>>> 
>>>>>  The patch is interesting and a good clue, but I am immediately a 
>>>>> bit
>>>>>  concerned that we don't copy/tag the socket with the uid also to 
>>>>> keep
>>>>>  the security properties for SO_REUSEPORT. I have to think a bit 
>>>>> more
>>>>>  about this.
>>>>> 
>>>>>  We have seen hangs during connect. I am afraid this patch 
>>>>> wouldn't help
>>>>>  there while also guaranteeing uniqueness.
>>>> 
>>>> 
>>>> 
>>>>  Yeah so I looked at the code some more and actually my patch is 
>>>> really
>>>>  bad.  If sk2->sk_reuseport is set we'll look at 
>>>> sk2->sk_reuseport_cb, which
>>>>  is outside of the timewait sock, so that's definitely bad.
>>>> 
>>>>  But we should at least be setting it to 0 so that we don't do this
>>>>  normally.  Unfortunately simply setting it to 0 doesn't fix the 
>>>> problem.  So
>>>>  for some reason having ->sk_reuseport set to 1 on a timewait 
>>>> socket makes
>>>>  this problem non-existent, which is strange.
>>>> 
>>>>  So back to the drawing board I guess.  I wonder if doing what 
>>>> craig
>>>>  suggested and batching the timewait timer expires so it hurts 
>>>> less would
>>>>  accomplish the same results.  Thanks,
>>> 
>>> 
>>>  Wait no I lied, we access the sk->sk_reuseport_cb, not sk2's.  
>>> This is the
>>>  code
>>> 
>>>                         if ((!reuse || !sk2->sk_reuse ||
>>>                             sk2->sk_state == TCP_LISTEN) &&
>>>                             (!reuseport || !sk2->sk_reuseport ||
>>>                              
>>> rcu_access_pointer(sk->sk_reuseport_cb) ||
>>>                              (sk2->sk_state != TCP_TIME_WAIT &&
>>>                              !uid_eq(uid, sock_i_uid(sk2))))) {
>>> 
>>>                                 if (!sk2->sk_rcv_saddr || 
>>> !sk->sk_rcv_saddr
>>>  ||
>>>                                     sk2->sk_rcv_saddr == 
>>> sk->sk_rcv_saddr)
>>>                                         break;
>>>                         }
>>> 
>>>  so in my patches case we now have reuseport == 1, 
>>> sk2->sk_reuseport == 1.
>>>  But now we are using reuseport, so sk->sk_reuseport_cb should be 
>>> non-NULL
>>>  right?  So really setting the timewait sock's sk_reuseport should 
>>> have no
>>>  bearing on how this loop plays out right?  Thanks,
>> 
>> 
>> 
>>  So more messing around and I noticed that we basically don't do the
>>  tb->fastreuseport logic at all if we've ended up with a non 
>> SO_REUSEPORT
>>  socket on that tb.  So before I fully understood what I was doing I 
>> fixed it
>>  so that after we go through ->bind_conflict() once with a 
>> SO_REUSEPORT
>>  socket, we reset tb->fastreuseport to 1 and set the uid to match 
>> the uid of
>>  the socket.  This made the problem go away.  Tom pointed out that 
>> if we bind
>>  to the same port on a different address and we have a non 
>> SO_REUSEPORT
>>  socket with the same address on this tb then we'd be screwed with 
>> my code.
>> 
>>  Which brings me to his proposed solution.  We need another hash 
>> table that
>>  is indexed based on the binding address.  Then each node 
>> corresponds to one
>>  address/port binding, with non-SO_REUSEPORT entries having only one 
>> entry,
>>  and normal SO_REUSEPORT entries having many.  This cleans up the 
>> need to
>>  search all the possible sockets on any given tb, we just go and 
>> look at the
>>  one we care about.  Does this make sense?  Thanks,
>> 
> Hi Josef,
> 
> Thinking about it some more the hash table won't work because of the
> rules of binding different addresses to the same port. What I think we
> can do is to change inet_bind_bucket to be structure that contains all
> the information used to detect conflicts (reuse*, if, address, uid,
> etc.) and a list of sockets that share that exact same information--
> for instance all socket in timewait state create through some listener
> socket should wind up on single bucket. When we do the bind_conflict
> function we only should have to walk this buckets, not the full list
> of sockets.
> 
> Thoughts on this?

This sounds good, maybe tb->owners be a list of say

struct inet_unique_shit {
	struct sock_common sk;
	struct hlist socks;
};

Then we make inet_unique_shit like twsks', just copy the relevant 
information, then hang the real sockets off of the socks hlist.  
Something like that?  Thanks,

Josef

^ permalink raw reply

* Re: [PATCH 1/2] bpf: do not use KMALLOC_SHIFT_MAX
From: Alexei Starovoitov @ 2016-12-16 23:23 UTC (permalink / raw)
  To: Michal Hocko
  Cc: linux-mm, Cristopher Lameter, Andrew Morton, Alexei Starovoitov,
	netdev, Daniel Borkmann
In-Reply-To: <20161216220235.GD7645@dhcp22.suse.cz>

On Fri, Dec 16, 2016 at 11:02:35PM +0100, Michal Hocko wrote:
> On Fri 16-12-16 10:02:10, Alexei Starovoitov wrote:
> > On Thu, Dec 15, 2016 at 05:47:21PM +0100, Michal Hocko wrote:
> > > From: Michal Hocko <mhocko@suse.com>
> > > 
> > > 01b3f52157ff ("bpf: fix allocation warnings in bpf maps and integer
> > > overflow") has added checks for the maximum allocateable size. It
> > > (ab)used KMALLOC_SHIFT_MAX for that purpose. While this is not incorrect
> > > it is not very clean because we already have KMALLOC_MAX_SIZE for this
> > > very reason so let's change both checks to use KMALLOC_MAX_SIZE instead.
> > > 
> > > Cc: Alexei Starovoitov <ast@kernel.org>
> > > Signed-off-by: Michal Hocko <mhocko@suse.com>
> > 
> > Nack until the patches 1 and 2 are reversed.
> 
> I do not insist on ordering. The thing is that it shouldn't matter all
> that much. Or are you worried about bisectability?

This patch 1 strongly depends on patch 2 !
Therefore order matters.
The patch 1 by itself is broken.
The commit log is saying
'(ab)used KMALLOC_SHIFT_MAX for that purpose .. use KMALLOC_MAX_SIZE instead'
that is also incorrect. We cannot do that until KMALLOC_MAX_SIZE is fixed.
So please change the order and fix the commit log to say that KMALLOC_MAX_SIZE
is actually valid limit now.

^ permalink raw reply

* Re: [kernel-hardening] Re: [PATCH v5 1/4] siphash: add cryptographically secure PRF
From: George Spelvin @ 2016-12-16 22:41 UTC (permalink / raw)
  To: Jason, kernel-hardening
  Cc: ak, davem, David.Laight, djb, ebiggers3, hannes,
	jeanphilippe.aumasson, linux-crypto, linux-kernel, linux, luto,
	netdev, tom, torvalds, tytso, vegard.nossum
In-Reply-To: <CAHmME9qPx3WUHF3__3wNOXr-AUti4WPO1qDiFus3Zr133FyV1g@mail.gmail.com>

An idea I had which mght be useful:

You could perhaps save two rounds in siphash_*u64.

The final word with the length (called "b" in your implementation)
only needs to be there if the input is variable-sized.

If every use of a given key is of a fixed-size input, you don't need
a length suffix.  When the input is an even number of words, that can
save you two rounds.

This requires an audit of callers (e.g. you have to use different
keys for IPv4 and IPv6 ISNs), but can save time.

(This is crypto 101; search "MD-strengthening" or see the remark on
p. 101 on Damgaard's 1989 paper "A design principle for hash functions" at
http://saluc.engr.uconn.edu/refs/algorithms/hashalg/damgard89adesign.pdf
but I'm sure that Ted, Jean-Philippe, and/or DJB will confirm if you'd
like.)

Jason A. Donenfeld wrote:
> Oh, okay, that is exactly what I thought was going on. I just thought
> you were implying that jiffies could be moved inside the hash, which
> then confused my understanding of how things should be. In any case,
> thanks for the explanation.

No, the rekeying procedure is cleverer.

The thing is, all that matters is that the ISN increments fast enough,
but not wrap too soon.

It *is* permitted to change the random base, as long as it only
increases, and slower than the timestamp does.

So what you do is every few minutes, you increment the high 4 bits of the
random base and change the key used to generate the low 28 bits.

The base used for any particular host might change from 0x10000000
to 0x2fffffff, or from 0x1fffffff to 0x20000000, but either way, it's
increasing, and not too fast.

This has the downside that an attacker can see 4 bits of the base,
so only needs to send send 2^28 = 256 MB to flood the connection,
but the upside that the key used to generate the low bits changes
faster than it can be broken.

^ permalink raw reply

* Re: [PATCH v6 3/5] random: use SipHash in place of MD5
From: Jason A. Donenfeld @ 2016-12-16 22:23 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Netdev, kernel-hardening@lists.openwall.com, LKML,
	Linux Crypto Mailing List, David Laight, Ted Tso,
	Hannes Frederic Sowa, Linus Torvalds, Eric Biggers, Tom Herbert,
	George Spelvin, Vegard Nossum, Andi Kleen, David S. Miller,
	Jean-Philippe Aumasson
In-Reply-To: <CALCETrX9v=Uwd1zZub=QpD73Lq0LM67NEi1qwqRUjtD5U1bHYw@mail.gmail.com>

Hi Andy,

> Agreed.  A simpler contruction would be:
>
> chaining++;
> output = H(chaining, secret);
>
> And this looks a whole lot like Ted's ChaCha20 construction.

In that simpler construction with counter-based secret rekeying and in
Ted's ChaCha20 construction, the issue is that every X hits, there's a
call to get_random_bytes, which has variable performance and entropy
issues. Doing it my way with it being time based, in the event that
somebody runs ` :(){ :|:& };:`, system performance doesn't suffer
because ASLR is making repeated calls to get_random_bytes every 128 or
so process creations. In the time based way, the system performance
will not suffer.

Jason

^ permalink raw reply

* Re: Soft lockup in inet_put_port on 4.6
From: Tom Herbert @ 2016-12-16 22:18 UTC (permalink / raw)
  To: Josef Bacik
  Cc: Hannes Frederic Sowa, Craig Gallek, Eric Dumazet,
	Linux Kernel Network Developers
In-Reply-To: <1481926135.24490.8@smtp.office365.com>

On Fri, Dec 16, 2016 at 2:08 PM, Josef Bacik <jbacik@fb.com> wrote:
> On Fri, Dec 16, 2016 at 10:21 AM, Josef Bacik <jbacik@fb.com> wrote:
>>
>> On Fri, Dec 16, 2016 at 9:54 AM, Josef Bacik <jbacik@fb.com> wrote:
>>>
>>> On Thu, Dec 15, 2016 at 7:07 PM, Hannes Frederic Sowa
>>> <hannes@stressinduktion.org> wrote:
>>>>
>>>> Hi Josef,
>>>>
>>>> On 15.12.2016 19:53, Josef Bacik wrote:
>>>>>
>>>>>  On Tue, Dec 13, 2016 at 6:32 PM, Tom Herbert <tom@herbertland.com>
>>>>> wrote:
>>>>>>
>>>>>>  On Tue, Dec 13, 2016 at 3:03 PM, Craig Gallek <kraigatgoog@gmail.com>
>>>>>>  wrote:
>>>>>>>
>>>>>>>   On Tue, Dec 13, 2016 at 3:51 PM, Tom Herbert <tom@herbertland.com>
>>>>>>>  wrote:
>>>>>>>>
>>>>>>>>   I think there may be some suspicious code in inet_csk_get_port. At
>>>>>>>>   tb_found there is:
>>>>>>>>
>>>>>>>>                   if (((tb->fastreuse > 0 && reuse) ||
>>>>>>>>                        (tb->fastreuseport > 0 &&
>>>>>>>>                         !rcu_access_pointer(sk->sk_reuseport_cb) &&
>>>>>>>>                         sk->sk_reuseport && uid_eq(tb->fastuid,
>>>>>>>>  uid))) &&
>>>>>>>>                       smallest_size == -1)
>>>>>>>>                           goto success;
>>>>>>>>                   if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk,
>>>>>>>>  tb, true)) {
>>>>>>>>                           if ((reuse ||
>>>>>>>>                                (tb->fastreuseport > 0 &&
>>>>>>>>                                 sk->sk_reuseport &&
>>>>>>>>
>>>>>>>>  !rcu_access_pointer(sk->sk_reuseport_cb) &&
>>>>>>>>                                 uid_eq(tb->fastuid, uid))) &&
>>>>>>>>                               smallest_size != -1 && --attempts >=
>>>>>>>> 0) {
>>>>>>>>                                   spin_unlock_bh(&head->lock);
>>>>>>>>                                   goto again;
>>>>>>>>                           }
>>>>>>>>                           goto fail_unlock;
>>>>>>>>                   }
>>>>>>>>
>>>>>>>>   AFAICT there is redundancy in these two conditionals.  The same
>>>>>>>> clause
>>>>>>>>   is being checked in both: (tb->fastreuseport > 0 &&
>>>>>>>>   !rcu_access_pointer(sk->sk_reuseport_cb) && sk->sk_reuseport &&
>>>>>>>>   uid_eq(tb->fastuid, uid))) && smallest_size == -1. If this is true
>>>>>>>> the
>>>>>>>>   first conditional should be hit, goto done,  and the second will
>>>>>>>> never
>>>>>>>>   evaluate that part to true-- unless the sk is changed (do we need
>>>>>>>>   READ_ONCE for sk->sk_reuseport_cb?).
>>>>>>>
>>>>>>>   That's an interesting point... It looks like this function also
>>>>>>>   changed in 4.6 from using a single local_bh_disable() at the
>>>>>>> beginning
>>>>>>>   with several spin_lock(&head->lock) to exclusively
>>>>>>>   spin_lock_bh(&head->lock) at each locking point.  Perhaps the full
>>>>>>> bh
>>>>>>>   disable variant was preventing the timers in your stack trace from
>>>>>>>   running interleaved with this function before?
>>>>>>
>>>>>>
>>>>>>  Could be, although dropping the lock shouldn't be able to affect the
>>>>>>  search state. TBH, I'm a little lost in reading function, the
>>>>>>  SO_REUSEPORT handling is pretty complicated. For instance,
>>>>>>  rcu_access_pointer(sk->sk_reuseport_cb) is checked three times in
>>>>>> that
>>>>>>  function and also in every call to inet_csk_bind_conflict. I wonder
>>>>>> if
>>>>>>  we can simply this under the assumption that SO_REUSEPORT is only
>>>>>>  allowed if the port number (snum) is explicitly specified.
>>>>>
>>>>>
>>>>>  Ok first I have data for you Hannes, here's the time distributions
>>>>>  before during and after the lockup (with all the debugging in place
>>>>> the
>>>>>  box eventually recovers).  I've attached it as a text file since it is
>>>>>  long.
>>>>
>>>>
>>>> Thanks a lot!
>>>>
>>>>>  Second is I was thinking about why we would spend so much time doing
>>>>> the
>>>>>  ->owners list, and obviously it's because of the massive amount of
>>>>>  timewait sockets on the owners list.  I wrote the following dumb patch
>>>>>  and tested it and the problem has disappeared completely.  Now I don't
>>>>>  know if this is right at all, but I thought it was weird we weren't
>>>>>  copying the soreuseport option from the original socket onto the twsk.
>>>>>  Is there are reason we aren't doing this currently?  Does this help
>>>>>  explain what is happening?  Thanks,
>>>>
>>>>
>>>> The patch is interesting and a good clue, but I am immediately a bit
>>>> concerned that we don't copy/tag the socket with the uid also to keep
>>>> the security properties for SO_REUSEPORT. I have to think a bit more
>>>> about this.
>>>>
>>>> We have seen hangs during connect. I am afraid this patch wouldn't help
>>>> there while also guaranteeing uniqueness.
>>>
>>>
>>>
>>> Yeah so I looked at the code some more and actually my patch is really
>>> bad.  If sk2->sk_reuseport is set we'll look at sk2->sk_reuseport_cb, which
>>> is outside of the timewait sock, so that's definitely bad.
>>>
>>> But we should at least be setting it to 0 so that we don't do this
>>> normally.  Unfortunately simply setting it to 0 doesn't fix the problem.  So
>>> for some reason having ->sk_reuseport set to 1 on a timewait socket makes
>>> this problem non-existent, which is strange.
>>>
>>> So back to the drawing board I guess.  I wonder if doing what craig
>>> suggested and batching the timewait timer expires so it hurts less would
>>> accomplish the same results.  Thanks,
>>
>>
>> Wait no I lied, we access the sk->sk_reuseport_cb, not sk2's.  This is the
>> code
>>
>>                        if ((!reuse || !sk2->sk_reuse ||
>>                            sk2->sk_state == TCP_LISTEN) &&
>>                            (!reuseport || !sk2->sk_reuseport ||
>>                             rcu_access_pointer(sk->sk_reuseport_cb) ||
>>                             (sk2->sk_state != TCP_TIME_WAIT &&
>>                             !uid_eq(uid, sock_i_uid(sk2))))) {
>>
>>                                if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr
>> ||
>>                                    sk2->sk_rcv_saddr == sk->sk_rcv_saddr)
>>                                        break;
>>                        }
>>
>> so in my patches case we now have reuseport == 1, sk2->sk_reuseport == 1.
>> But now we are using reuseport, so sk->sk_reuseport_cb should be non-NULL
>> right?  So really setting the timewait sock's sk_reuseport should have no
>> bearing on how this loop plays out right?  Thanks,
>
>
>
> So more messing around and I noticed that we basically don't do the
> tb->fastreuseport logic at all if we've ended up with a non SO_REUSEPORT
> socket on that tb.  So before I fully understood what I was doing I fixed it
> so that after we go through ->bind_conflict() once with a SO_REUSEPORT
> socket, we reset tb->fastreuseport to 1 and set the uid to match the uid of
> the socket.  This made the problem go away.  Tom pointed out that if we bind
> to the same port on a different address and we have a non SO_REUSEPORT
> socket with the same address on this tb then we'd be screwed with my code.
>
> Which brings me to his proposed solution.  We need another hash table that
> is indexed based on the binding address.  Then each node corresponds to one
> address/port binding, with non-SO_REUSEPORT entries having only one entry,
> and normal SO_REUSEPORT entries having many.  This cleans up the need to
> search all the possible sockets on any given tb, we just go and look at the
> one we care about.  Does this make sense?  Thanks,
>
Hi Josef,

Thinking about it some more the hash table won't work because of the
rules of binding different addresses to the same port. What I think we
can do is to change inet_bind_bucket to be structure that contains all
the information used to detect conflicts (reuse*, if, address, uid,
etc.) and a list of sockets that share that exact same information--
for instance all socket in timewait state create through some listener
socket should wind up on single bucket. When we do the bind_conflict
function we only should have to walk this buckets, not the full list
of sockets.

Thoughts on this?

Thanks,
Tom

> Josef
>

^ permalink raw reply

* Re: [PATCH v5 1/4] siphash: add cryptographically secure PRF
From: Jason A. Donenfeld @ 2016-12-16 22:18 UTC (permalink / raw)
  To: George Spelvin
  Cc: Theodore Ts'o, Andi Kleen, David Miller, David Laight,
	Daniel J . Bernstein, Eric Biggers, Hannes Frederic Sowa,
	Jean-Philippe Aumasson, kernel-hardening,
	Linux Crypto Mailing List, LKML, Andy Lutomirski, Netdev,
	Tom Herbert, Linus Torvalds, Vegard Nossum
In-Reply-To: <20161216221352.26899.qmail@ns.sciencehorizons.net>

On Fri, Dec 16, 2016 at 11:13 PM, George Spelvin
<linux@sciencehorizons.net> wrote:
> Remembering that on "real" machines it's full SipHash, then I'd say that
> 64-bit security + rekeying seems reasonable.

64-bit security for an RNG is not reasonable even with rekeying. No no
no. Considering we already have a massive speed-up here with the
secure version, there's zero reason to start weakening the security
because we're trigger happy with our benchmarks. No no no.

^ permalink raw reply

* Re: [PATCH v5 1/4] siphash: add cryptographically secure PRF
From: Andy Lutomirski @ 2016-12-16 22:15 UTC (permalink / raw)
  To: George Spelvin
  Cc: Ted Ts'o, Andi Kleen, David S. Miller, David Laight,
	D. J. Bernstein, Eric Biggers, Hannes Frederic Sowa,
	Jason A. Donenfeld, Jean-Philippe Aumasson,
	kernel-hardening@lists.openwall.com, Linux Crypto Mailing List,
	linux-kernel@vger.kernel.org, Network Development, Tom Herbert,
	Linus Torvalds, Vegard Nossum
In-Reply-To: <20161216221352.26899.qmail@ns.sciencehorizons.net>

On Fri, Dec 16, 2016 at 2:13 PM, George Spelvin
<linux@sciencehorizons.net> wrote:
>> What should we do with get_random_int() and get_random_long()?  In
>> some cases it's being used in performance sensitive areas, and where
>> anti-DoS protection might be enough.  In others, maybe not so much.
>
> This is tricky.  The entire get_random_int() structure is an abuse of
> the hash function and will need to be thoroughly rethought to convert
> it to SipHash.  Remember, SipHash's security goals are very different
> from MD5, so there's no obvious way to do the conversion.
>
> (It's *documented* as "not cryptographically secure", but we know
> where that goes.)
>
>> If we rekeyed the secret used by get_random_int() and
>> get_random_long() frequently (say, every minute or every 5 minutes),
>> would that be sufficient for current and future users of these
>> interfaces?
>
> Remembering that on "real" machines it's full SipHash, then I'd say that
> 64-bit security + rekeying seems reasonable.
>
> The question is, the idea has recently been floated to make hsiphash =
> SipHash-1-3 on 64-bit machines.  Is *that* okay?
>
>
> The annoying thing about the currently proposed patch is that the *only*
> chaining is the returned value.  What I'd *like* to do is the same
> pattern as we do with md5, and remember v[0..3] between invocations.
> But there's no partial SipHash primitive; we only get one word back.
>
> Even
>         *chaining += ret = siphash_3u64(...)
>
> would be an improvement.

This is almost exactly what I suggested in my email on the other
thread from a few seconds ago :)

--Andy

^ permalink raw reply

* Re: [PATCH v5 1/4] siphash: add cryptographically secure PRF
From: George Spelvin @ 2016-12-16 22:13 UTC (permalink / raw)
  To: linux, tytso
  Cc: ak, davem, David.Laight, djb, ebiggers3, hannes, Jason,
	jeanphilippe.aumasson, kernel-hardening, linux-crypto,
	linux-kernel, luto, netdev, tom, torvalds, vegard.nossum
In-Reply-To: <20161216204358.nlwifgcqnu6pitxs@thunk.org>

> What should we do with get_random_int() and get_random_long()?  In
> some cases it's being used in performance sensitive areas, and where
> anti-DoS protection might be enough.  In others, maybe not so much.

This is tricky.  The entire get_random_int() structure is an abuse of
the hash function and will need to be thoroughly rethought to convert
it to SipHash.  Remember, SipHash's security goals are very different
from MD5, so there's no obvious way to do the conversion.

(It's *documented* as "not cryptographically secure", but we know
where that goes.)

> If we rekeyed the secret used by get_random_int() and
> get_random_long() frequently (say, every minute or every 5 minutes),
> would that be sufficient for current and future users of these
> interfaces?

Remembering that on "real" machines it's full SipHash, then I'd say that
64-bit security + rekeying seems reasonable.

The question is, the idea has recently been floated to make hsiphash =
SipHash-1-3 on 64-bit machines.  Is *that* okay?


The annoying thing about the currently proposed patch is that the *only*
chaining is the returned value.  What I'd *like* to do is the same
pattern as we do with md5, and remember v[0..3] between invocations.
But there's no partial SipHash primitive; we only get one word back.

Even
	*chaining += ret = siphash_3u64(...)

would be an improvement.

Although we could do something like

	c0 = chaining[0];
	chaining[0] = c1 = chaining[1];

	ret = hsiphash(c0, c1, ...)

	chaining[1] = c0 + ret;

^ permalink raw reply

* Re: [PATCH v6 3/5] random: use SipHash in place of MD5
From: Andy Lutomirski @ 2016-12-16 22:13 UTC (permalink / raw)
  To: Jason A. Donenfeld
  Cc: Netdev, kernel-hardening@lists.openwall.com, LKML,
	Linux Crypto Mailing List, David Laight, Ted Tso,
	Hannes Frederic Sowa, Linus Torvalds, Eric Biggers, Tom Herbert,
	George Spelvin, Vegard Nossum, Andi Kleen, David S. Miller,
	Jean-Philippe Aumasson
In-Reply-To: <CAHmME9rbKi3O1SS89LRMEUeMdKyrdutXAfjb9QmW3KNoCuE-wg@mail.gmail.com>

On Fri, Dec 16, 2016 at 1:45 PM, Jason A. Donenfeld <Jason@zx2c4.com> wrote:
> Hi Andy,
>
> On Fri, Dec 16, 2016 at 10:31 PM, Andy Lutomirski <luto@amacapital.net> wrote:
>> I think it would be nice to try to strenghen the PRNG construction.
>> FWIW, I'm not an expert in PRNGs, and there's fairly extensive
>> literature, but I can at least try.
>
> In an effort to keep this patchset as initially as uncontroversial as
> possible, I kept the same same construction as before and just swapped
> out slow MD5 for fast Siphash. Additionally, the function
> documentation says that it isn't cryptographically secure. But in the
> end I certainly agree with you; we should most definitely improve
> things, and seeing the eyeballs now on this series, I think we now
> might have a mandate to do so.
>
>> 1. A one-time leak of memory contents doesn't ruin security until
>> reboot.  This is especially value across suspend and/or hibernation.
>
> Ted and I were discussing this in another thread, and it sounds like
> he wants the same thing. I'll add re-generation of the secret every
> once in a while. Perhaps time-based makes more sense than
> counter-based for rekeying frequency?

Counter may be faster -- you don't need to read a timer.  Lots of CPUs
are surprisingly slow at timing.  OTOH jiffies are fast.

>
>> 2. An attack with a low work factor (2^64?) shouldn't break the scheme
>> until reboot.
>
> It won't. The key is 128-bits.

Whoops, I thought the key was 64-bit...

>
>> This is effectively doing:
>>
>> output = H(prev_output, weak "entropy", per-boot secret);
>>
>> One unfortunately downside is that, if used in a context where an
>> attacker can see a single output, the attacker learns the chaining
>> value.  If the attacker can guess the entropy, then, with 2^64 work,
>> they learn the secret, and they can predict future outputs.
>
> No, the secret is 128-bits, which isn't feasibly guessable. The secret
> also isn't part of the hash, but rather is the generator of the hash
> function. A keyed hash (a PRF) is a bit of a different construction
> than just hashing a secret value into a hash function.

I was thinking in the random oracle model, in which case the whole
function is just a PRF that takes a few input parameters, one of which
is a key.

>
>> Second, change the mode so that an attacker doesn't learn so much
>> internal state.  For example:
>>
>> output = H(old_chain, entropy, secret);
>> new_chain = old_chain + entropy + output;
>
> Happy to make this change, with making the chaining value additive
> rather than a replacement.
>
> However, I'm not sure adding entropy to the new_chain makes a
> different. That entropy is basically just the cycle count plus the
> jiffies count. If an attacker can already guess them, then adding them
> again to the chaining value doesn't really add much.

Agreed.  A simpler contruction would be:

chaining++;
output = H(chaining, secret);

And this looks a whole lot like Ted's ChaCha20 construction.

The benefit of my construction is that (in the random oracle model,
assuming my intuition is right), if an attacker misses ~128 samples
and entropy has at least one bit of independent min-entropy per
sample, then the attacker needs ~2^128 work to brute-force the
chaining value even fi the attacker knew both the original chaining
value and the secret.

--Andy

^ permalink raw reply

* Re: [PATCH v6 3/5] random: use SipHash in place of MD5
From: Jason A. Donenfeld @ 2016-12-16 22:12 UTC (permalink / raw)
  To: Andy Lutomirski, Ted Tso
  Cc: Netdev, kernel-hardening@lists.openwall.com, LKML,
	Linux Crypto Mailing List, David Laight, Hannes Frederic Sowa,
	Linus Torvalds, Eric Biggers, Tom Herbert, George Spelvin,
	Vegard Nossum, Andi Kleen, David S. Miller,
	Jean-Philippe Aumasson
In-Reply-To: <CAHmME9rbKi3O1SS89LRMEUeMdKyrdutXAfjb9QmW3KNoCuE-wg@mail.gmail.com>

Hi Andy, Ted,

I've made the requested changes. Keys now rotate and are per-CPU
based. The chaining value is now additive instead of replacing.

DavidL suggested I lower the velocity of `git-send-email` triggers, so
if you'd like to take a look at this before I post v7, you can follow
along at my git tree here:

    https://git.zx2c4.com/linux-dev/log/?h=siphash

Choose the commit entitled "random: use SipHash in place of MD5"

Thanks,
Jason

^ permalink raw reply

* Re: [TSN RFC v2 5/9] Add TSN header for the driver
From: Richard Cochran @ 2016-12-16 22:09 UTC (permalink / raw)
  To: henrik
  Cc: linux-kernel, Henrik Austad, linux-media, alsa-devel, netdev,
	David S. Miller
In-Reply-To: <1481911153-549-6-git-send-email-henrik@austad.us>

On Fri, Dec 16, 2016 at 06:59:09PM +0100, henrik@austad.us wrote:
> +/*
> + * List of current subtype fields in the common header of AVTPDU
> + *
> + * Note: AVTPDU is a remnant of the standards from when it was AVB.
> + *
> + * The list has been updated with the recent values from IEEE 1722, draft 16.
> + */
> +enum avtp_subtype {
> +	TSN_61883_IIDC = 0,	/* IEC 61883/IIDC Format */
> +	TSN_MMA_STREAM,		/* MMA Streams */
> +	TSN_AAF,		/* AVTP Audio Format */
> +	TSN_CVF,		/* Compressed Video Format */
> +	TSN_CRF,		/* Clock Reference Format */
> +	TSN_TSCF,		/* Time-Synchronous Control Format */
> +	TSN_SVF,		/* SDI Video Format */
> +	TSN_RVF,		/* Raw Video Format */
> +	/* 0x08 - 0x6D reserved */
> +	TSN_AEF_CONTINOUS = 0x6e, /* AES Encrypted Format Continous */
> +	TSN_VSF_STREAM,		/* Vendor Specific Format Stream */
> +	/* 0x70 - 0x7e reserved */
> +	TSN_EF_STREAM = 0x7f,	/* Experimental Format Stream */
> +	/* 0x80 - 0x81 reserved */
> +	TSN_NTSCF = 0x82,	/* Non Time-Synchronous Control Format */
> +	/* 0x83 - 0xed reserved */
> +	TSN_ESCF = 0xec,	/* ECC Signed Control Format */
> +	TSN_EECF,		/* ECC Encrypted Control Format */
> +	TSN_AEF_DISCRETE,	/* AES Encrypted Format Discrete */
> +	/* 0xef - 0xf9 reserved */
> +	TSN_ADP = 0xfa,		/* AVDECC Discovery Protocol */
> +	TSN_AECP,		/* AVDECC Enumeration and Control Protocol */
> +	TSN_ACMP,		/* AVDECC Connection Management Protocol */
> +	/* 0xfd reserved */
> +	TSN_MAAP = 0xfe,	/* MAAP Protocol */
> +	TSN_EF_CONTROL,		/* Experimental Format Control */
> +};

The kernel shouldn't be in the business of assembling media packets.

Thanks,
Richard

^ permalink raw reply

* Re: Soft lockup in inet_put_port on 4.6
From: Josef Bacik @ 2016-12-16 22:08 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: Tom Herbert, Craig Gallek, Eric Dumazet,
	Linux Kernel Network Developers
In-Reply-To: <1481901684.24490.7@smtp.office365.com>

On Fri, Dec 16, 2016 at 10:21 AM, Josef Bacik <jbacik@fb.com> wrote:
> On Fri, Dec 16, 2016 at 9:54 AM, Josef Bacik <jbacik@fb.com> wrote:
>> On Thu, Dec 15, 2016 at 7:07 PM, Hannes Frederic Sowa 
>> <hannes@stressinduktion.org> wrote:
>>> Hi Josef,
>>> 
>>> On 15.12.2016 19:53, Josef Bacik wrote:
>>>>  On Tue, Dec 13, 2016 at 6:32 PM, Tom Herbert 
>>>> <tom@herbertland.com> wrote:
>>>>>  On Tue, Dec 13, 2016 at 3:03 PM, Craig Gallek 
>>>>> <kraigatgoog@gmail.com>
>>>>>  wrote:
>>>>>>   On Tue, Dec 13, 2016 at 3:51 PM, Tom Herbert 
>>>>>> <tom@herbertland.com>
>>>>>>  wrote:
>>>>>>>   I think there may be some suspicious code in 
>>>>>>> inet_csk_get_port. At
>>>>>>>   tb_found there is:
>>>>>>> 
>>>>>>>                   if (((tb->fastreuse > 0 && reuse) ||
>>>>>>>                        (tb->fastreuseport > 0 &&
>>>>>>>                         
>>>>>>> !rcu_access_pointer(sk->sk_reuseport_cb) &&
>>>>>>>                         sk->sk_reuseport && uid_eq(tb->fastuid,
>>>>>>>  uid))) &&
>>>>>>>                       smallest_size == -1)
>>>>>>>                           goto success;
>>>>>>>                   if 
>>>>>>> (inet_csk(sk)->icsk_af_ops->bind_conflict(sk,
>>>>>>>  tb, true)) {
>>>>>>>                           if ((reuse ||
>>>>>>>                                (tb->fastreuseport > 0 &&
>>>>>>>                                 sk->sk_reuseport &&
>>>>>>> 
>>>>>>>  !rcu_access_pointer(sk->sk_reuseport_cb) &&
>>>>>>>                                 uid_eq(tb->fastuid, uid))) &&
>>>>>>>                               smallest_size != -1 && --attempts 
>>>>>>> >= 0) {
>>>>>>>                                   spin_unlock_bh(&head->lock);
>>>>>>>                                   goto again;
>>>>>>>                           }
>>>>>>>                           goto fail_unlock;
>>>>>>>                   }
>>>>>>> 
>>>>>>>   AFAICT there is redundancy in these two conditionals.  The 
>>>>>>> same clause
>>>>>>>   is being checked in both: (tb->fastreuseport > 0 &&
>>>>>>>   !rcu_access_pointer(sk->sk_reuseport_cb) && sk->sk_reuseport 
>>>>>>> &&
>>>>>>>   uid_eq(tb->fastuid, uid))) && smallest_size == -1. If this is 
>>>>>>> true the
>>>>>>>   first conditional should be hit, goto done,  and the second 
>>>>>>> will never
>>>>>>>   evaluate that part to true-- unless the sk is changed (do we 
>>>>>>> need
>>>>>>>   READ_ONCE for sk->sk_reuseport_cb?).
>>>>>>   That's an interesting point... It looks like this function also
>>>>>>   changed in 4.6 from using a single local_bh_disable() at the 
>>>>>> beginning
>>>>>>   with several spin_lock(&head->lock) to exclusively
>>>>>>   spin_lock_bh(&head->lock) at each locking point.  Perhaps the 
>>>>>> full bh
>>>>>>   disable variant was preventing the timers in your stack trace 
>>>>>> from
>>>>>>   running interleaved with this function before?
>>>>> 
>>>>>  Could be, although dropping the lock shouldn't be able to affect 
>>>>> the
>>>>>  search state. TBH, I'm a little lost in reading function, the
>>>>>  SO_REUSEPORT handling is pretty complicated. For instance,
>>>>>  rcu_access_pointer(sk->sk_reuseport_cb) is checked three times 
>>>>> in that
>>>>>  function and also in every call to inet_csk_bind_conflict. I 
>>>>> wonder if
>>>>>  we can simply this under the assumption that SO_REUSEPORT is only
>>>>>  allowed if the port number (snum) is explicitly specified.
>>>> 
>>>>  Ok first I have data for you Hannes, here's the time distributions
>>>>  before during and after the lockup (with all the debugging in 
>>>> place the
>>>>  box eventually recovers).  I've attached it as a text file since 
>>>> it is
>>>>  long.
>>> 
>>> Thanks a lot!
>>> 
>>>>  Second is I was thinking about why we would spend so much time 
>>>> doing the
>>>>  ->owners list, and obviously it's because of the massive amount of
>>>>  timewait sockets on the owners list.  I wrote the following dumb 
>>>> patch
>>>>  and tested it and the problem has disappeared completely.  Now I 
>>>> don't
>>>>  know if this is right at all, but I thought it was weird we 
>>>> weren't
>>>>  copying the soreuseport option from the original socket onto the 
>>>> twsk.
>>>>  Is there are reason we aren't doing this currently?  Does this 
>>>> help
>>>>  explain what is happening?  Thanks,
>>> 
>>> The patch is interesting and a good clue, but I am immediately a bit
>>> concerned that we don't copy/tag the socket with the uid also to 
>>> keep
>>> the security properties for SO_REUSEPORT. I have to think a bit more
>>> about this.
>>> 
>>> We have seen hangs during connect. I am afraid this patch wouldn't 
>>> help
>>> there while also guaranteeing uniqueness.
>> 
>> 
>> Yeah so I looked at the code some more and actually my patch is 
>> really bad.  If sk2->sk_reuseport is set we'll look at 
>> sk2->sk_reuseport_cb, which is outside of the timewait sock, so 
>> that's definitely bad.
>> 
>> But we should at least be setting it to 0 so that we don't do this 
>> normally.  Unfortunately simply setting it to 0 doesn't fix the 
>> problem.  So for some reason having ->sk_reuseport set to 1 on a 
>> timewait socket makes this problem non-existent, which is strange.
>> 
>> So back to the drawing board I guess.  I wonder if doing what craig 
>> suggested and batching the timewait timer expires so it hurts less 
>> would accomplish the same results.  Thanks,
> 
> Wait no I lied, we access the sk->sk_reuseport_cb, not sk2's.  This 
> is the code
> 
>                        if ((!reuse || !sk2->sk_reuse ||
>                            sk2->sk_state == TCP_LISTEN) &&
>                            (!reuseport || !sk2->sk_reuseport ||
>                             rcu_access_pointer(sk->sk_reuseport_cb) ||
>                             (sk2->sk_state != TCP_TIME_WAIT &&
>                             !uid_eq(uid, sock_i_uid(sk2))))) {
> 
>                                if (!sk2->sk_rcv_saddr || 
> !sk->sk_rcv_saddr ||
>                                    sk2->sk_rcv_saddr == 
> sk->sk_rcv_saddr)
>                                        break;
>                        }
> 
> so in my patches case we now have reuseport == 1, sk2->sk_reuseport 
> == 1.  But now we are using reuseport, so sk->sk_reuseport_cb should 
> be non-NULL right?  So really setting the timewait sock's 
> sk_reuseport should have no bearing on how this loop plays out right? 
>  Thanks,


So more messing around and I noticed that we basically don't do the 
tb->fastreuseport logic at all if we've ended up with a non 
SO_REUSEPORT socket on that tb.  So before I fully understood what I 
was doing I fixed it so that after we go through ->bind_conflict() once 
with a SO_REUSEPORT socket, we reset tb->fastreuseport to 1 and set the 
uid to match the uid of the socket.  This made the problem go away.  
Tom pointed out that if we bind to the same port on a different address 
and we have a non SO_REUSEPORT socket with the same address on this tb 
then we'd be screwed with my code.

Which brings me to his proposed solution.  We need another hash table 
that is indexed based on the binding address.  Then each node 
corresponds to one address/port binding, with non-SO_REUSEPORT entries 
having only one entry, and normal SO_REUSEPORT entries having many.  
This cleans up the need to search all the possible sockets on any given 
tb, we just go and look at the one we care about.  Does this make 
sense?  Thanks,

Josef

^ permalink raw reply

* Re: [TSN RFC v2 0/9] TSN driver for the kernel
From: Richard Cochran @ 2016-12-16 22:05 UTC (permalink / raw)
  To: henrik; +Cc: linux-kernel, Henrik Austad, linux-media, alsa-devel, netdev
In-Reply-To: <1481911153-549-1-git-send-email-henrik@austad.us>

On Fri, Dec 16, 2016 at 06:59:04PM +0100, henrik@austad.us wrote:
> The driver is directed via ConfigFS as we need userspace to handle
> stream-reservation (MSRP), discovery and enumeration (IEEE 1722.1) and
> whatever other management is needed.

I complained about configfs before, but you didn't listen.

> 2 new fields in netdev_ops have been introduced, and the Intel
> igb-driver has been updated (as this an AVB-capable NIC which is
> available as a PCI-e card).

The igb hacks show that you are on the wrong track.  We can and should
be able to support TSN without resorting to driver specific hacks and
module parameters.

> Before reading on - this is not even beta, but I'd really appreciate if
> people would comment on the overall architecture and perhaps provide
> some pointers to where I should improve/fix/update

As I said before about V1, this architecture stinks.  You appear to
have continued hacking along and posted the same design again.  Did
you even address any of the points I raised back then?

You are trying to put tons of code into the kernel that really belongs
in user space, and at the same time, you omit critical functions that
only the kernel can provide.

> There are at least one AVB-driver (the AV-part of TSN) in the kernel
> already.

And which driver is that?

Thanks,
Richard

^ permalink raw reply

* Re: [PATCH 1/2] bpf: do not use KMALLOC_SHIFT_MAX
From: Michal Hocko @ 2016-12-16 22:02 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: linux-mm, Cristopher Lameter, Andrew Morton, Alexei Starovoitov,
	netdev, Daniel Borkmann
In-Reply-To: <20161216180209.GA77597@ast-mbp.thefacebook.com>

On Fri 16-12-16 10:02:10, Alexei Starovoitov wrote:
> On Thu, Dec 15, 2016 at 05:47:21PM +0100, Michal Hocko wrote:
> > From: Michal Hocko <mhocko@suse.com>
> > 
> > 01b3f52157ff ("bpf: fix allocation warnings in bpf maps and integer
> > overflow") has added checks for the maximum allocateable size. It
> > (ab)used KMALLOC_SHIFT_MAX for that purpose. While this is not incorrect
> > it is not very clean because we already have KMALLOC_MAX_SIZE for this
> > very reason so let's change both checks to use KMALLOC_MAX_SIZE instead.
> > 
> > Cc: Alexei Starovoitov <ast@kernel.org>
> > Signed-off-by: Michal Hocko <mhocko@suse.com>
> 
> Nack until the patches 1 and 2 are reversed.

I do not insist on ordering. The thing is that it shouldn't matter all
that much. Or are you worried about bisectability?
-- 
Michal Hocko
SUSE Labs

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: [PATCH v6 3/5] random: use SipHash in place of MD5
From: Jason A. Donenfeld @ 2016-12-16 21:45 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Netdev, kernel-hardening@lists.openwall.com, LKML,
	Linux Crypto Mailing List, David Laight, Ted Tso,
	Hannes Frederic Sowa, Linus Torvalds, Eric Biggers, Tom Herbert,
	George Spelvin, Vegard Nossum, Andi Kleen, David S. Miller,
	Jean-Philippe Aumasson

Hi Andy,

On Fri, Dec 16, 2016 at 10:31 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> I think it would be nice to try to strenghen the PRNG construction.
> FWIW, I'm not an expert in PRNGs, and there's fairly extensive
> literature, but I can at least try.

In an effort to keep this patchset as initially as uncontroversial as
possible, I kept the same same construction as before and just swapped
out slow MD5 for fast Siphash. Additionally, the function
documentation says that it isn't cryptographically secure. But in the
end I certainly agree with you; we should most definitely improve
things, and seeing the eyeballs now on this series, I think we now
might have a mandate to do so.

> 1. A one-time leak of memory contents doesn't ruin security until
> reboot.  This is especially value across suspend and/or hibernation.

Ted and I were discussing this in another thread, and it sounds like
he wants the same thing. I'll add re-generation of the secret every
once in a while. Perhaps time-based makes more sense than
counter-based for rekeying frequency?

> 2. An attack with a low work factor (2^64?) shouldn't break the scheme
> until reboot.

It won't. The key is 128-bits.

> This is effectively doing:
>
> output = H(prev_output, weak "entropy", per-boot secret);
>
> One unfortunately downside is that, if used in a context where an
> attacker can see a single output, the attacker learns the chaining
> value.  If the attacker can guess the entropy, then, with 2^64 work,
> they learn the secret, and they can predict future outputs.

No, the secret is 128-bits, which isn't feasibly guessable. The secret
also isn't part of the hash, but rather is the generator of the hash
function. A keyed hash (a PRF) is a bit of a different construction
than just hashing a secret value into a hash function.

> Second, change the mode so that an attacker doesn't learn so much
> internal state.  For example:
>
> output = H(old_chain, entropy, secret);
> new_chain = old_chain + entropy + output;

Happy to make this change, with making the chaining value additive
rather than a replacement.

However, I'm not sure adding entropy to the new_chain makes a
different. That entropy is basically just the cycle count plus the
jiffies count. If an attacker can already guess them, then adding them
again to the chaining value doesn't really add much.

Jason

^ permalink raw reply

* [PATCH] isdn: Constify some function parameters
From: Kees Cook @ 2016-12-16 21:40 UTC (permalink / raw)
  To: Karsten Keil; +Cc: linux-kernel, Emese Revfy, netdev

From: Emese Revfy <re.emese@gmail.com>

The coming initify gcc plugin expects const pointer types, and caught
some __printf arguments that weren't const yet. This fixes those.

Signed-off-by: Emese Revfy <re.emese@gmail.com>
[kees: expanded commit message]
Signed-off-by: Kees Cook <keescook@chromium.org>
---
 drivers/isdn/hisax/config.c | 16 ++++++++--------
 drivers/isdn/hisax/hisax.h  |  4 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/isdn/hisax/config.c b/drivers/isdn/hisax/config.c
index bf04d2a3cf4a..2d12c6ceeb89 100644
--- a/drivers/isdn/hisax/config.c
+++ b/drivers/isdn/hisax/config.c
@@ -659,7 +659,7 @@ int jiftime(char *s, long mark)
 
 static u_char tmpbuf[HISAX_STATUS_BUFSIZE];
 
-void VHiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt,
+void VHiSax_putstatus(struct IsdnCardState *cs, char *head, const char *fmt,
 		      va_list args)
 {
 	/* if head == NULL the fmt contains the full info */
@@ -669,23 +669,24 @@ void VHiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt,
 	u_char		*p;
 	isdn_ctrl	ic;
 	int		len;
+	const u_char	*data;
 
 	if (!cs) {
 		printk(KERN_WARNING "HiSax: No CardStatus for message");
 		return;
 	}
 	spin_lock_irqsave(&cs->statlock, flags);
-	p = tmpbuf;
 	if (head) {
+		p = tmpbuf;
 		p += jiftime(p, jiffies);
 		p += sprintf(p, " %s", head);
 		p += vsprintf(p, fmt, args);
 		*p++ = '\n';
 		*p = 0;
 		len = p - tmpbuf;
-		p = tmpbuf;
+		data = tmpbuf;
 	} else {
-		p = fmt;
+		data = fmt;
 		len = strlen(fmt);
 	}
 	if (len > HISAX_STATUS_BUFSIZE) {
@@ -699,13 +700,12 @@ void VHiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt,
 	if (i >= len)
 		i = len;
 	len -= i;
-	memcpy(cs->status_write, p, i);
+	memcpy(cs->status_write, data, i);
 	cs->status_write += i;
 	if (cs->status_write > cs->status_end)
 		cs->status_write = cs->status_buf;
-	p += i;
 	if (len) {
-		memcpy(cs->status_write, p, len);
+		memcpy(cs->status_write, data + i, len);
 		cs->status_write += len;
 	}
 #ifdef KERNELSTACK_DEBUG
@@ -729,7 +729,7 @@ void VHiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt,
 	}
 }
 
-void HiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt, ...)
+void HiSax_putstatus(struct IsdnCardState *cs, char *head, const char *fmt, ...)
 {
 	va_list args;
 
diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h
index 6ead6314e6d2..338d0408b377 100644
--- a/drivers/isdn/hisax/hisax.h
+++ b/drivers/isdn/hisax/hisax.h
@@ -1288,9 +1288,9 @@ int jiftime(char *s, long mark);
 int HiSax_command(isdn_ctrl *ic);
 int HiSax_writebuf_skb(int id, int chan, int ack, struct sk_buff *skb);
 __printf(3, 4)
-void HiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt, ...);
+void HiSax_putstatus(struct IsdnCardState *cs, char *head, const char *fmt, ...);
 __printf(3, 0)
-void VHiSax_putstatus(struct IsdnCardState *cs, char *head, char *fmt, va_list args);
+void VHiSax_putstatus(struct IsdnCardState *cs, char *head, const char *fmt, va_list args);
 void HiSax_reportcard(int cardnr, int sel);
 int QuickHex(char *txt, u_char *p, int cnt);
 void LogFrame(struct IsdnCardState *cs, u_char *p, int size);
-- 
2.7.4


-- 
Kees Cook
Nexus Security

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox