Netdev List
 help / color / mirror / Atom feed
* [RFC v3 net-next 1/5] eBPF: Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER
From: Tushar Dave @ 2018-08-17 23:08 UTC (permalink / raw)
  To: john.fastabend, ast, daniel, davem, sowmini.varadhan,
	santosh.shilimkar, jakub.kicinski, quentin.monnet, jiong.wang,
	sandipan, kafai, rdna, yhs, netdev
In-Reply-To: <1534547305-25140-1-git-send-email-tushar.n.dave@oracle.com>

Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER which uses the
existing socket filter infrastructure for bpf program attach and load.
SOCKET_SG_FILTER eBPF program receives struct scatterlist as bpf context
contrast to SOCKET_FILTER which deals with struct skb. This is useful
for kernel entities that don't have skb to represent packet data but
want to run eBPF socket filter on packet data that is in form of struct
scatterlist e.g. IB/RDMA

Signed-off-by: Tushar Dave <tushar.n.dave@oracle.com>
Acked-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
---
 include/linux/bpf_types.h      |  1 +
 include/uapi/linux/bpf.h       |  1 +
 kernel/bpf/syscall.c           |  1 +
 kernel/bpf/verifier.c          |  1 +
 net/core/filter.c              | 55 ++++++++++++++++++++++++++++++++++++++++--
 samples/bpf/bpf_load.c         | 11 ++++++---
 tools/bpf/bpftool/prog.c       |  1 +
 tools/include/uapi/linux/bpf.h |  1 +
 tools/lib/bpf/libbpf.c         |  3 +++
 tools/lib/bpf/libbpf.h         |  2 ++
 10 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index cd26c09..7dc1503 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -16,6 +16,7 @@
 BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
+BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_SG_FILTER, socksg_filter)
 #endif
 #ifdef CONFIG_BPF_EVENTS
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 66917a4..6ec1e32 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -152,6 +152,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 	BPF_PROG_TYPE_LIRC_MODE2,
 	BPF_PROG_TYPE_SK_REUSEPORT,
+	BPF_PROG_TYPE_SOCKET_SG_FILTER,
 };
 
 enum bpf_attach_type {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8339d81..160cdb2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1362,6 +1362,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
+	    type != BPF_PROG_TYPE_SOCKET_SG_FILTER &&
 	    !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ca90679..5abc788 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1321,6 +1321,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 	case BPF_PROG_TYPE_LWT_XMIT:
 	case BPF_PROG_TYPE_SK_SKB:
 	case BPF_PROG_TYPE_SK_MSG:
+	case BPF_PROG_TYPE_SOCKET_SG_FILTER:
 		if (meta)
 			return meta->pkt_access;
 
diff --git a/net/core/filter.c b/net/core/filter.c
index fd423ce..cec3807 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1140,7 +1140,8 @@ static void bpf_release_orig_filter(struct bpf_prog *fp)
 
 static void __bpf_prog_release(struct bpf_prog *prog)
 {
-	if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) {
+	if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER ||
+	    prog->type == BPF_PROG_TYPE_SOCKET_SG_FILTER) {
 		bpf_prog_put(prog);
 	} else {
 		bpf_release_orig_filter(prog);
@@ -1539,10 +1540,16 @@ int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 
 static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
 {
+	struct bpf_prog *prog;
+
 	if (sock_flag(sk, SOCK_FILTER_LOCKED))
 		return ERR_PTR(-EPERM);
 
-	return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
+	prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
+	if (IS_ERR(prog))
+		prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_SG_FILTER);
+
+	return prog;
 }
 
 int sk_attach_bpf(u32 ufd, struct sock *sk)
@@ -4920,6 +4927,17 @@ bool bpf_helper_changes_pkt_data(void *func)
 }
 
 static const struct bpf_func_proto *
+socksg_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+	case BPF_FUNC_msg_pull_data:
+		return &bpf_msg_pull_data_proto;
+	default:
+		return bpf_base_func_proto(func_id);
+	}
+}
+
+static const struct bpf_func_proto *
 tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
@@ -6738,6 +6756,30 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
+static u32 socksg_filter_convert_ctx_access(enum bpf_access_type type,
+					    const struct bpf_insn *si,
+					    struct bpf_insn *insn_buf,
+					    struct bpf_prog *prog,
+					    u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct sk_msg_md, data):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, data));
+		break;
+	case offsetof(struct sk_msg_md, data_end):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct sk_msg_buff, data_end));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 				     const struct bpf_insn *si,
 				     struct bpf_insn *insn_buf,
@@ -6876,6 +6918,15 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 	.test_run		= bpf_prog_test_run_skb,
 };
 
+const struct bpf_verifier_ops socksg_filter_verifier_ops = {
+	.get_func_proto         = socksg_filter_func_proto,
+	.is_valid_access        = sk_msg_is_valid_access,
+	.convert_ctx_access     = socksg_filter_convert_ctx_access,
+};
+
+const struct bpf_prog_ops socksg_filter_prog_ops = {
+};
+
 const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
 	.get_func_proto		= tc_cls_act_func_proto,
 	.is_valid_access	= tc_cls_act_is_valid_access,
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 904e775..3b1697d 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -69,6 +69,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	bool is_sockops = strncmp(event, "sockops", 7) == 0;
 	bool is_sk_skb = strncmp(event, "sk_skb", 6) == 0;
 	bool is_sk_msg = strncmp(event, "sk_msg", 6) == 0;
+	bool is_socksg = strncmp(event, "socksg", 6) == 0;
+
 	size_t insns_cnt = size / sizeof(struct bpf_insn);
 	enum bpf_prog_type prog_type;
 	char buf[256];
@@ -102,6 +104,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		prog_type = BPF_PROG_TYPE_SK_SKB;
 	} else if (is_sk_msg) {
 		prog_type = BPF_PROG_TYPE_SK_MSG;
+	} else if (is_socksg) {
+		prog_type = BPF_PROG_TYPE_SOCKET_SG_FILTER;
 	} else {
 		printf("Unknown event '%s'\n", event);
 		return -1;
@@ -122,8 +126,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	if (is_xdp || is_perf_event || is_cgroup_skb || is_cgroup_sk)
 		return 0;
 
-	if (is_socket || is_sockops || is_sk_skb || is_sk_msg) {
-		if (is_socket)
+	if (is_socket || is_sockops || is_sk_skb || is_sk_msg || is_socksg) {
+		if (is_socket || is_socksg)
 			event += 6;
 		else
 			event += 7;
@@ -627,7 +631,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map)
 		    memcmp(shname, "cgroup/", 7) == 0 ||
 		    memcmp(shname, "sockops", 7) == 0 ||
 		    memcmp(shname, "sk_skb", 6) == 0 ||
-		    memcmp(shname, "sk_msg", 6) == 0) {
+		    memcmp(shname, "sk_msg", 6) == 0 ||
+		    memcmp(shname, "socksg", 6) == 0) {
 			ret = load_and_attach(shname, data->d_buf,
 					      data->d_size);
 			if (ret != 0)
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index dce960d..9c57c4e 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -74,6 +74,7 @@
 	[BPF_PROG_TYPE_RAW_TRACEPOINT]	= "raw_tracepoint",
 	[BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr",
 	[BPF_PROG_TYPE_LIRC_MODE2]	= "lirc_mode2",
+	[BPF_PROG_TYPE_SOCKET_SG_FILTER] = "socket_sg_filter",
 };
 
 static void print_boot_time(__u64 nsecs, char *buf, unsigned int size)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 66917a4..6ec1e32 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -152,6 +152,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_LWT_SEG6LOCAL,
 	BPF_PROG_TYPE_LIRC_MODE2,
 	BPF_PROG_TYPE_SK_REUSEPORT,
+	BPF_PROG_TYPE_SOCKET_SG_FILTER,
 };
 
 enum bpf_attach_type {
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 2abd0f1..a7ac51c 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1502,6 +1502,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
 	case BPF_PROG_TYPE_LIRC_MODE2:
 	case BPF_PROG_TYPE_SK_REUSEPORT:
+	case BPF_PROG_TYPE_SOCKET_SG_FILTER:
 		return false;
 	case BPF_PROG_TYPE_UNSPEC:
 	case BPF_PROG_TYPE_KPROBE:
@@ -2077,6 +2078,7 @@ static bool bpf_program__is_type(struct bpf_program *prog,
 BPF_PROG_TYPE_FNS(raw_tracepoint, BPF_PROG_TYPE_RAW_TRACEPOINT);
 BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP);
 BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT);
+BPF_PROG_TYPE_FNS(socket_sg_filter, BPF_PROG_TYPE_SOCKET_SG_FILTER);
 
 void bpf_program__set_expected_attach_type(struct bpf_program *prog,
 					   enum bpf_attach_type type)
@@ -2129,6 +2131,7 @@ void bpf_program__set_expected_attach_type(struct bpf_program *prog,
 	BPF_SA_PROG_SEC("cgroup/sendmsg6", BPF_CGROUP_UDP6_SENDMSG),
 	BPF_S_PROG_SEC("cgroup/post_bind4", BPF_CGROUP_INET4_POST_BIND),
 	BPF_S_PROG_SEC("cgroup/post_bind6", BPF_CGROUP_INET6_POST_BIND),
+	BPF_PROG_SEC("socksg",          BPF_PROG_TYPE_SOCKET_SG_FILTER),
 };
 
 #undef BPF_PROG_SEC
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 96c55fa..7527ea4 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -208,6 +208,7 @@ int bpf_program__set_prep(struct bpf_program *prog, int nr_instance,
 void bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type);
 void bpf_program__set_expected_attach_type(struct bpf_program *prog,
 					   enum bpf_attach_type type);
+int bpf_program__set_socket_sg_filter(struct bpf_program *prog);
 
 bool bpf_program__is_socket_filter(struct bpf_program *prog);
 bool bpf_program__is_tracepoint(struct bpf_program *prog);
@@ -217,6 +218,7 @@ void bpf_program__set_expected_attach_type(struct bpf_program *prog,
 bool bpf_program__is_sched_act(struct bpf_program *prog);
 bool bpf_program__is_xdp(struct bpf_program *prog);
 bool bpf_program__is_perf_event(struct bpf_program *prog);
+bool bpf_program__is_socket_sg_filter(struct bpf_program *prog);
 
 /*
  * No need for __attribute__((packed)), all members of 'bpf_map_def'
-- 
1.8.3.1

^ permalink raw reply related

* [RFC v3 net-next 0/5] eBPF and struct scatterlist
From: Tushar Dave @ 2018-08-17 23:08 UTC (permalink / raw)
  To: john.fastabend, ast, daniel, davem, sowmini.varadhan,
	santosh.shilimkar, jakub.kicinski, quentin.monnet, jiong.wang,
	sandipan, kafai, rdna, yhs, netdev

This is v3 of the RFC sent earlier,
(https://patchwork.ozlabs.org/cover/931785/).

v2->v3:
- As per the review feedback received, this patchset reuses as much code
as possible from sockmap/sk_msg. e.g. it uses existing struct
sk_msg_buff, struct sk_msg_md, sk_msg_convert_ctx_access and part of
code from sk_msg_convert_ctx_access.

- bpf helper bpf_msg_pull_data() is used to access packet data. Some
issues found with bpf_msg_pull_data() are therefore fixed in patch 3.

- A feedback was given that unprivileged user can attach a new
BPF_PROG_TYPE_SOCKET_SG_FILTER to a non-rds socket e.g. normal tcp/udp
through the SO_ATTACH_BPF sockopt, where input context is skb instead of
sg list and can cause issues. However, I found that as an unprivileged,
user can attach any kind of eBPF program to socket using SO_ATTACH_BPF,
not only socksg. But if eBPF program is faulty, kernel BPF verifier take
care of it and invalidate any access to kernel data, doesn't let eBPF
program to run.

- socksg programs now returns action code (e.g. SOCKSG_PASS etc,.).


Background:
The motivation for this work is to allow eBPF based firewalling for
kernel modules that do not always get their packet as an sk_buff from
their downlink drivers. One such instance of this use-case is RDS, which
can be run both over IB (driver RDMA's a scatterlist to the RDS module)
or over TCP (TCP passes an sk_buff to the RDS module).

This patchset uses exiting socket filter infrastructure and extend it
with new eBPF program type that deals with struct scatterlist.
Existing bpf helper bpf_msg_pull_data() is used to inspect packet data
that are in form struct scatterlist. For RDS, the integrated approach
treats the scatterlist as the common denominator, and allows the
application to write a filter for processing a scatterlist.


Details:
Patch 1 adds new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER which
uses the existing socket filter infrastructure for bpf program attach
and load. eBPF program of type BPF_PROG_TYPE_SOCKET_SG_FILTER deals with
struct scatterlist as bpf context contrast to
BPF_PROG_TYPE_SOCKET_FILTER which deals with struct skb. This new eBPF
program type allow socket filter to run on packet data that is in form
of struct scatterlist.

Patch 2 adds sg_filter_run() that runs BPF_PROG_TYPE_SOCKET_SG_FILTER.

Patch 3 fixes bpf_msg_pull_data() for the bugs that were found while
doing some experiment with different size of packets.

patch 4 allows rds_recv_incoming to invoke socket filter program which
deals with struct scatterlist.

Patch 5 adds socket filter eBPF sample program that uses patches 1 to 4.
The sample program opens an rds socket, attach ebpf program
(socksg i.e. BPF_PROG_TYPE_SOCKET_SG_FILTER) to rds socket and uses
bpf_msg_pull_data() helper to inspect RDS packet data. For a test,
current sample program only prints first few bytes of packet data.


Testing:
To confirm data accuracy and results, RDS packets of various sizes has
been tested with socksg program along with various start and end values
for bpf_msg_pull_data(). All such tests shows accurate results.

Thanks.

-Tushar



Tushar Dave (5):
  eBPF: Add new eBPF prog type BPF_PROG_TYPE_SOCKET_SG_FILTER
  ebpf: Add sg_filter_run()
  ebpf: fix bpf_msg_pull_data
  rds: invoke socket sg filter attached to rds socket
  ebpf: Add sample ebpf program for SOCKET_SG_FILTER

 include/linux/bpf_types.h      |   1 +
 include/linux/filter.h         |   8 +
 include/uapi/linux/bpf.h       |   7 +
 kernel/bpf/syscall.c           |   1 +
 kernel/bpf/verifier.c          |   1 +
 net/core/filter.c              | 140 +++++++++++++----
 net/rds/ib.c                   |   1 +
 net/rds/ib.h                   |   1 +
 net/rds/ib_recv.c              |  12 ++
 net/rds/rds.h                  |   2 +
 net/rds/recv.c                 |  17 +++
 net/rds/tcp.c                  |   2 +
 net/rds/tcp.h                  |   2 +
 net/rds/tcp_recv.c             |  38 +++++
 samples/bpf/Makefile           |   3 +
 samples/bpf/bpf_load.c         |  11 +-
 samples/bpf/rds_filter_kern.c  |  42 +++++
 samples/bpf/rds_filter_user.c  | 339 +++++++++++++++++++++++++++++++++++++++++
 tools/bpf/bpftool/prog.c       |   1 +
 tools/include/uapi/linux/bpf.h |   7 +
 tools/lib/bpf/libbpf.c         |   3 +
 tools/lib/bpf/libbpf.h         |   2 +
 22 files changed, 607 insertions(+), 34 deletions(-)
 create mode 100644 samples/bpf/rds_filter_kern.c
 create mode 100644 samples/bpf/rds_filter_user.c

-- 
1.8.3.1

^ permalink raw reply

* Re: [PATCH bpf] bpf: fix redirect to map under tail calls
From: Alexei Starovoitov @ 2018-08-17 22:59 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: john.fastabend, netdev
In-Reply-To: <20180817212614.2423-1-daniel@iogearbox.net>

On Fri, Aug 17, 2018 at 11:26:14PM +0200, Daniel Borkmann wrote:
> Commits 109980b894e9 ("bpf: don't select potentially stale ri->map
> from buggy xdp progs") and 7c3001313396 ("bpf: fix ri->map_owner
> pointer on bpf_prog_realloc") tried to mitigate that buggy programs
> using bpf_redirect_map() helper call do not leave stale maps behind.
> Idea was to add a map_owner cookie into the per CPU struct redirect_info
> which was set to prog->aux by the prog making the helper call as a
> proof that the map is not stale since the prog is implicitly holding
> a reference to it. This owner cookie could later on get compared with
> the program calling into BPF whether they match and therefore the
> redirect could proceed with processing the map safely.
> 
> In (obvious) hindsight, this approach breaks down when tail calls are
> involved since the original caller's prog->aux pointer does not have
> to match the one from one of the progs out of the tail call chain,
> and therefore the xdp buffer will be dropped instead of redirected.
> A way around that would be to fix the issue differently (which also
> allows to remove related work in fast path at the same time): once
> the life-time of a redirect map has come to its end we use it's map
> free callback where we need to wait on synchronize_rcu() for current
> outstanding xdp buffers and remove such a map pointer from the
> redirect info if found to be present. At that time no program is
> using this map anymore so we simply invalidate the map pointers to
> NULL iff they previously pointed to that instance while making sure
> that the redirect path only reads out the map once.
> 
> Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine")
> Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs")
> Reported-by: Sebastiano Miano <sebastiano.miano@polito.it>
> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
> Acked-by: John Fastabend <john.fastabend@gmail.com>

Applied, Thanks

^ permalink raw reply

* Re: [PATCH net-next v1] net/tls: Add support for async decryption of tls records
From: Dave Watson @ 2018-08-17 22:12 UTC (permalink / raw)
  To: Vakul Garg; +Cc: netdev, borisp, aviadye, davem
In-Reply-To: <20180816151906.17624-1-vakul.garg@nxp.com>

On 08/16/18 08:49 PM, Vakul Garg wrote:
> Changes since RFC version:
> 	1) Improved commit message.
> 	2) Fixed dequeued record offset handling because of which few of
> 	   tls selftests 'recv_partial, recv_peek, recv_peek_multiple' were failing.

Thanks! Commit message much more clear, tests work great for me also,
only minor comments on clarity

> -			if (tls_sw_advance_skb(sk, skb, chunk)) {
> +			if (async) {
> +				/* Finished with current record, pick up next */
> +				ctx->recv_pkt = NULL;
> +				__strp_unpause(&ctx->strp);
> +				goto mark_eor_chk_ctrl;

Control flow is a little hard to follow here, maybe just pass an async
flag to tls_sw_advance_skb?  It already does strp_unpause and recv_pkt
= NULL.  

> +			} else if (tls_sw_advance_skb(sk, skb, chunk)) {
>  				/* Return full control message to
>  				 * userspace before trying to parse
>  				 * another message type
>  				 */
> +mark_eor_chk_ctrl:
>  				msg->msg_flags |= MSG_EOR;
>  				if (control != TLS_RECORD_TYPE_DATA)
>  					goto recv_end;
> +			} else {
> +				break;

I don't see the need for the else { break; }, isn't this already
covered by while(len); below as before?

^ permalink raw reply

* Re: [offlist] Re: Crash in netlink/sk_filter_trim_cap on ARMv7 on 4.18rc1
From: Daniel Borkmann @ 2018-08-17 22:06 UTC (permalink / raw)
  To: Peter Robinson
  Cc: Russell King - ARM Linux, Marc Haber, linux-arm-kernel, netdev,
	labbott, Eric Dumazet
In-Reply-To: <CALeDE9N=EjaHAqOrw8Hi=EK15f65j0bKFOSbFZaLtjSG20Aytg@mail.gmail.com>

On 08/17/2018 11:13 PM, Peter Robinson wrote:
> On Fri, Aug 17, 2018 at 7:30 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>> On 08/17/2018 06:17 PM, Russell King - ARM Linux wrote:
>>> On Fri, Aug 17, 2018 at 02:40:19PM +0200, Daniel Borkmann wrote:
>>>> I'd have one potential bug suspicion, for the 4.18 one you were trying,
>>>> could you run with the below patch to see whether it would help?
>>>
>>> I think this is almost certainly the problem - looking at the history,
>>> it seems that the "-4" was assumed to be part of the scratch stuff in
>>> commit 38ca93060163 ("bpf, arm32: save 4 bytes of unneeded stack space")
>>> but it isn't - it's because "off" of zero refers to the top word in the
>>> stack (iow at STACK_SIZE-4).
>>
>> Yeah agree, my thinking as well (albeit bit late, sigh, sorry about that).
>> Waiting for Peter to get back with results for definite confirmation. Your
>> rework in 1c35ba122d4a ("ARM: net: bpf: use negative numbers for stacked
>> registers") and 96cced4e774a ("ARM: net: bpf: access eBPF scratch space using
>> ARM FP register") fixes this in mainline, so unless I'm missing something this
>> would only need a stand-alone fix for 4.18/stable which I can cook up and
>> submit then.
> 
> I can confirm that fixes the problems I was seeing on Fedora 29.
> 
> Feel free to add a tested by from me:
> 
> Tested-by: Peter Robinson <pbrobinson@gmail.com>

Great, thanks everyone! Will get it out asap.

^ permalink raw reply

* RE: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask
From: Steve Wise @ 2018-08-17 21:28 UTC (permalink / raw)
  To: 'Sagi Grimberg', 'Max Gurtovoy',
	'Jason Gunthorpe'
  Cc: 'Leon Romanovsky', 'Doug Ledford',
	'RDMA mailing list', 'Saeed Mahameed',
	'linux-netdev'
In-Reply-To: <ed1ea546-e7aa-3c17-710b-8c95fe90deea@grimberg.me>

> 
> 
> > Hey Sagi,
> >
> > The patch works allowing connections for the various affinity mappings
> below:
> >
> > One comp_vector per core across all cores, starting with numa-local cores:
> 
> Thanks Steve, is this your "Tested by:" tag?

Sure:

Tested-by: Steve Wise <swise@opengridcomputing.com>

^ permalink raw reply

* [PATCH bpf] bpf: fix redirect to map under tail calls
From: Daniel Borkmann @ 2018-08-17 21:26 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: john.fastabend, netdev, Daniel Borkmann

Commits 109980b894e9 ("bpf: don't select potentially stale ri->map
from buggy xdp progs") and 7c3001313396 ("bpf: fix ri->map_owner
pointer on bpf_prog_realloc") tried to mitigate that buggy programs
using bpf_redirect_map() helper call do not leave stale maps behind.
Idea was to add a map_owner cookie into the per CPU struct redirect_info
which was set to prog->aux by the prog making the helper call as a
proof that the map is not stale since the prog is implicitly holding
a reference to it. This owner cookie could later on get compared with
the program calling into BPF whether they match and therefore the
redirect could proceed with processing the map safely.

In (obvious) hindsight, this approach breaks down when tail calls are
involved since the original caller's prog->aux pointer does not have
to match the one from one of the progs out of the tail call chain,
and therefore the xdp buffer will be dropped instead of redirected.
A way around that would be to fix the issue differently (which also
allows to remove related work in fast path at the same time): once
the life-time of a redirect map has come to its end we use it's map
free callback where we need to wait on synchronize_rcu() for current
outstanding xdp buffers and remove such a map pointer from the
redirect info if found to be present. At that time no program is
using this map anymore so we simply invalidate the map pointers to
NULL iff they previously pointed to that instance while making sure
that the redirect path only reads out the map once.

Fixes: 97f91a7cf04f ("bpf: add bpf_redirect_map helper routine")
Fixes: 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs")
Reported-by: Sebastiano Miano <sebastiano.miano@polito.it>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
---
 include/linux/filter.h     |  3 +-
 include/trace/events/xdp.h |  5 ++--
 kernel/bpf/cpumap.c        |  2 ++
 kernel/bpf/devmap.c        |  1 +
 kernel/bpf/verifier.c      | 21 --------------
 kernel/bpf/xskmap.c        |  1 +
 net/core/filter.c          | 68 ++++++++++++++++++++--------------------------
 7 files changed, 38 insertions(+), 63 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 5d565c5..6791a0a 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -543,7 +543,6 @@ struct bpf_redirect_info {
 	u32 flags;
 	struct bpf_map *map;
 	struct bpf_map *map_to_flush;
-	unsigned long   map_owner;
 	u32 kern_flags;
 };
 
@@ -781,6 +780,8 @@ static inline bool bpf_dump_raw_ok(void)
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 
+void bpf_clear_redirect_map(struct bpf_map *map);
+
 static inline bool xdp_return_frame_no_direct(void)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 1ecf4c6..e95cb86 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -147,9 +147,8 @@ struct _bpf_dtab_netdev {
 
 #define devmap_ifindex(fwd, map)				\
 	(!fwd ? 0 :						\
-	 (!map ? 0 :						\
-	  ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
-	   ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)))
+	 ((map->map_type == BPF_MAP_TYPE_DEVMAP) ?		\
+	  ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0))
 
 #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx)		\
 	 trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map),	\
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 620bc50..24aac0d 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -479,6 +479,8 @@ static void cpu_map_free(struct bpf_map *map)
 	 * It does __not__ ensure pending flush operations (if any) are
 	 * complete.
 	 */
+
+	bpf_clear_redirect_map(map);
 	synchronize_rcu();
 
 	/* To ensure all pending flush operations have completed wait for flush
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ac1df79..141710b 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -161,6 +161,7 @@ static void dev_map_free(struct bpf_map *map)
 	list_del_rcu(&dtab->list);
 	spin_unlock(&dev_map_lock);
 
+	bpf_clear_redirect_map(map);
 	synchronize_rcu();
 
 	/* To ensure all pending flush operations have completed wait for flush
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ca90679..9224611 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5844,27 +5844,6 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			goto patch_call_imm;
 		}
 
-		if (insn->imm == BPF_FUNC_redirect_map) {
-			/* Note, we cannot use prog directly as imm as subsequent
-			 * rewrites would still change the prog pointer. The only
-			 * stable address we can use is aux, which also works with
-			 * prog clones during blinding.
-			 */
-			u64 addr = (unsigned long)prog->aux;
-			struct bpf_insn r4_ld[] = {
-				BPF_LD_IMM64(BPF_REG_4, addr),
-				*insn,
-			};
-			cnt = ARRAY_SIZE(r4_ld);
-
-			new_prog = bpf_patch_insn_data(env, i + delta, r4_ld, cnt);
-			if (!new_prog)
-				return -ENOMEM;
-
-			delta    += cnt - 1;
-			env->prog = prog = new_prog;
-			insn      = new_prog->insnsi + i + delta;
-		}
 patch_call_imm:
 		fn = env->ops->get_func_proto(insn->imm, env->prog);
 		/* all functions that have prototype and verifier allowed
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 4ddf61e1..9f8463a 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -75,6 +75,7 @@ static void xsk_map_free(struct bpf_map *map)
 	struct xsk_map *m = container_of(map, struct xsk_map, map);
 	int i;
 
+	bpf_clear_redirect_map(map);
 	synchronize_net();
 
 	for (i = 0; i < map->max_entries; i++) {
diff --git a/net/core/filter.c b/net/core/filter.c
index fd423ce..c25eb36 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3246,31 +3246,33 @@ static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
 	}
 }
 
-static inline bool xdp_map_invalid(const struct bpf_prog *xdp_prog,
-				   unsigned long aux)
+void bpf_clear_redirect_map(struct bpf_map *map)
 {
-	return (unsigned long)xdp_prog->aux != aux;
+	struct bpf_redirect_info *ri;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		ri = per_cpu_ptr(&bpf_redirect_info, cpu);
+		/* Avoid polluting remote cacheline due to writes if
+		 * not needed. Once we pass this test, we need the
+		 * cmpxchg() to make sure it hasn't been changed in
+		 * the meantime by remote CPU.
+		 */
+		if (unlikely(READ_ONCE(ri->map) == map))
+			cmpxchg(&ri->map, map, NULL);
+	}
 }
 
 static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
-			       struct bpf_prog *xdp_prog)
+			       struct bpf_prog *xdp_prog, struct bpf_map *map)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-	unsigned long map_owner = ri->map_owner;
-	struct bpf_map *map = ri->map;
 	u32 index = ri->ifindex;
 	void *fwd = NULL;
 	int err;
 
 	ri->ifindex = 0;
-	ri->map = NULL;
-	ri->map_owner = 0;
-
-	if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
-		err = -EFAULT;
-		map = NULL;
-		goto err;
-	}
+	WRITE_ONCE(ri->map, NULL);
 
 	fwd = __xdp_map_lookup_elem(map, index);
 	if (!fwd) {
@@ -3296,12 +3298,13 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
 		    struct bpf_prog *xdp_prog)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+	struct bpf_map *map = READ_ONCE(ri->map);
 	struct net_device *fwd;
 	u32 index = ri->ifindex;
 	int err;
 
-	if (ri->map)
-		return xdp_do_redirect_map(dev, xdp, xdp_prog);
+	if (map)
+		return xdp_do_redirect_map(dev, xdp, xdp_prog, map);
 
 	fwd = dev_get_by_index_rcu(dev_net(dev), index);
 	ri->ifindex = 0;
@@ -3325,24 +3328,17 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect);
 static int xdp_do_generic_redirect_map(struct net_device *dev,
 				       struct sk_buff *skb,
 				       struct xdp_buff *xdp,
-				       struct bpf_prog *xdp_prog)
+				       struct bpf_prog *xdp_prog,
+				       struct bpf_map *map)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-	unsigned long map_owner = ri->map_owner;
-	struct bpf_map *map = ri->map;
 	u32 index = ri->ifindex;
 	void *fwd = NULL;
 	int err = 0;
 
 	ri->ifindex = 0;
-	ri->map = NULL;
-	ri->map_owner = 0;
+	WRITE_ONCE(ri->map, NULL);
 
-	if (unlikely(xdp_map_invalid(xdp_prog, map_owner))) {
-		err = -EFAULT;
-		map = NULL;
-		goto err;
-	}
 	fwd = __xdp_map_lookup_elem(map, index);
 	if (unlikely(!fwd)) {
 		err = -EINVAL;
@@ -3379,13 +3375,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 			    struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+	struct bpf_map *map = READ_ONCE(ri->map);
 	u32 index = ri->ifindex;
 	struct net_device *fwd;
 	int err = 0;
 
-	if (ri->map)
-		return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog);
-
+	if (map)
+		return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
+						   map);
 	ri->ifindex = 0;
 	fwd = dev_get_by_index_rcu(dev_net(dev), index);
 	if (unlikely(!fwd)) {
@@ -3416,8 +3413,7 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
 
 	ri->ifindex = ifindex;
 	ri->flags = flags;
-	ri->map = NULL;
-	ri->map_owner = 0;
+	WRITE_ONCE(ri->map, NULL);
 
 	return XDP_REDIRECT;
 }
@@ -3430,8 +3426,8 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags,
-	   unsigned long, map_owner)
+BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
+	   u64, flags)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 
@@ -3440,15 +3436,11 @@ BPF_CALL_4(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags
 
 	ri->ifindex = ifindex;
 	ri->flags = flags;
-	ri->map = map;
-	ri->map_owner = map_owner;
+	WRITE_ONCE(ri->map, map);
 
 	return XDP_REDIRECT;
 }
 
-/* Note, arg4 is hidden from users and populated by the verifier
- * with the right pointer.
- */
 static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
 	.func           = bpf_xdp_redirect_map,
 	.gpl_only       = false,
-- 
2.9.5

^ permalink raw reply related

* Re: [GIT PULL] 9p updates for 4.19
From: Linus Torvalds @ 2018-08-18  0:31 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Dominique Martinet, V9FS Developers, Linux Kernel Mailing List,
	Network Development
In-Reply-To: <CA+55aFxzuLnK7E3hz_YZSNjjJ7TeMTmPrpn4rhHGjgrY2DgTtQ@mail.gmail.com>

On Fri, Aug 17, 2018 at 4:41 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Fri, Aug 17, 2018 at 3:41 PM Andrew Morton <akpm@linux-foundation.org> wrote:
> >
> > Please do.  I'll actually be sending you the MAINTAINERS update in
> > about 55 seconds.
>
> Heh. Ok, that resolves my biggest issue with the pull request.

.. and with that patch-bomb from Andrew out of the way, I've now
merged the 9p pull request. It's going through my basic build tests
before I push it out, but expect that in minutes.

                   Linus

^ permalink raw reply

* Re: [offlist] Re: Crash in netlink/sk_filter_trim_cap on ARMv7 on 4.18rc1
From: Peter Robinson @ 2018-08-17 21:15 UTC (permalink / raw)
  To: Stefan Wahren
  Cc: Daniel Borkmann, Marc Haber, labbott, Eric Dumazet, netdev,
	Russell King - ARM Linux, linux-arm-kernel
In-Reply-To: <1425110527.374417.1534531889127@email.1und1.de>

Hi Stefan,

>> On 08/17/2018 06:17 PM, Russell King - ARM Linux wrote:
>> > On Fri, Aug 17, 2018 at 02:40:19PM +0200, Daniel Borkmann wrote:
>> >> I'd have one potential bug suspicion, for the 4.18 one you were trying,
>> >> could you run with the below patch to see whether it would help?
>> >
>> > I think this is almost certainly the problem - looking at the history,
>> > it seems that the "-4" was assumed to be part of the scratch stuff in
>> > commit 38ca93060163 ("bpf, arm32: save 4 bytes of unneeded stack space")
>> > but it isn't - it's because "off" of zero refers to the top word in the
>> > stack (iow at STACK_SIZE-4).
>>
>> Yeah agree, my thinking as well (albeit bit late, sigh, sorry about that).
>> Waiting for Peter to get back with results for definite confirmation. Your
>> rework in 1c35ba122d4a ("ARM: net: bpf: use negative numbers for stacked
>> registers") and 96cced4e774a ("ARM: net: bpf: access eBPF scratch space using
>> ARM FP register") fixes this in mainline, so unless I'm missing something this
>> would only need a stand-alone fix for 4.18/stable which I can cook up and
>> submit then.
>
> i was able to reproduce this issue on RPi 3 with Linux 4.18.1 + multi_v7_defconfig and the following  config changes:
>
>  --- a/arch/arm/configs/multi_v7_defconfig
> +++ b/arch/arm/configs/multi_v7_defconfig
> @@ -2,7 +2,10 @@ CONFIG_SYSVIPC=y
>  CONFIG_NO_HZ=y
>  CONFIG_HIGH_RES_TIMERS=y
>  CONFIG_CGROUPS=y
> +CONFIG_CGROUP_BPF=y
>  CONFIG_BLK_DEV_INITRD=y
> +CONFIG_BPF_SYSCALL=y
> +CONFIG_BPF_JIT_ALWAYS_ON=y
>  CONFIG_EMBEDDED=y
>  CONFIG_PERF_EVENTS=y
>  CONFIG_MODULES=y
> @@ -153,6 +156,8 @@ CONFIG_IPV6_MIP6=m
>  CONFIG_IPV6_TUNNEL=m
>  CONFIG_IPV6_MULTIPLE_TABLES=y
>  CONFIG_NET_DSA=m
> +CONFIG_BPF_JIT=y
> +CONFIG_BPF_STREAM_PARSER=y
>  CONFIG_CAN=y
>  CONFIG_CAN_AT91=m
>  CONFIG_CAN_FLEXCAN=m
>
> After applying the "-4" patch the oopses doesn't appear during boot anymore.

Would be fab to get that into the kernel so this is widely tested
moving forward.

Peter

^ permalink raw reply

* Re: [offlist] Re: Crash in netlink/sk_filter_trim_cap on ARMv7 on 4.18rc1
From: Peter Robinson @ 2018-08-17 21:13 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Russell King - ARM Linux, Marc Haber, linux-arm-kernel, netdev,
	labbott, Eric Dumazet
In-Reply-To: <adf49ea9-09a3-80f8-8c85-a62d028e21a3@iogearbox.net>

On Fri, Aug 17, 2018 at 7:30 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> On 08/17/2018 06:17 PM, Russell King - ARM Linux wrote:
>> On Fri, Aug 17, 2018 at 02:40:19PM +0200, Daniel Borkmann wrote:
>>> I'd have one potential bug suspicion, for the 4.18 one you were trying,
>>> could you run with the below patch to see whether it would help?
>>
>> I think this is almost certainly the problem - looking at the history,
>> it seems that the "-4" was assumed to be part of the scratch stuff in
>> commit 38ca93060163 ("bpf, arm32: save 4 bytes of unneeded stack space")
>> but it isn't - it's because "off" of zero refers to the top word in the
>> stack (iow at STACK_SIZE-4).
>
> Yeah agree, my thinking as well (albeit bit late, sigh, sorry about that).
> Waiting for Peter to get back with results for definite confirmation. Your
> rework in 1c35ba122d4a ("ARM: net: bpf: use negative numbers for stacked
> registers") and 96cced4e774a ("ARM: net: bpf: access eBPF scratch space using
> ARM FP register") fixes this in mainline, so unless I'm missing something this
> would only need a stand-alone fix for 4.18/stable which I can cook up and
> submit then.

I can confirm that fixes the problems I was seeing on Fedora 29.

Feel free to add a tested by from me:

Tested-by: Peter Robinson <pbrobinson@gmail.com>

^ permalink raw reply

* Re: [offlist] Re: Crash in netlink/sk_filter_trim_cap on ARMv7 on 4.18rc1
From: Peter Robinson @ 2018-08-17 21:12 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Daniel Borkmann, Marc Haber, linux-arm-kernel, netdev, labbott,
	Eric Dumazet
In-Reply-To: <20180817161743.GX30658@n2100.armlinux.org.uk>

On Fri, Aug 17, 2018 at 5:17 PM, Russell King - ARM Linux
<linux@armlinux.org.uk> wrote:
> On Fri, Aug 17, 2018 at 02:40:19PM +0200, Daniel Borkmann wrote:
>> I'd have one potential bug suspicion, for the 4.18 one you were trying,
>> could you run with the below patch to see whether it would help?
>
> I think this is almost certainly the problem - looking at the history,
> it seems that the "-4" was assumed to be part of the scratch stuff in
> commit 38ca93060163 ("bpf, arm32: save 4 bytes of unneeded stack space")
> but it isn't - it's because "off" of zero refers to the top word in the
> stack (iow at STACK_SIZE-4).

I can confirm that patch fixes the problem I was seeing.

Peter

^ permalink raw reply

* Re: [PATCH] net: lan743x_ptp: convert to ktime_get_clocktai_ts64
From: Richard Cochran @ 2018-08-18  0:09 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Bryan.Whitehead, UNGLinuxDriver, David Miller, YueHaibing,
	Networking, Linux Kernel Mailing List
In-Reply-To: <CAK8P3a1XEDzGtxyZ3Q6=jZe+w67r_ZakksAEQVDC8pYSCne55A@mail.gmail.com>

On Fri, Aug 17, 2018 at 09:29:56PM +0200, Arnd Bergmann wrote:
> This certainly seems to be an "interesting" problem, given that the Linux
> implementations (other than the new lan743x) then start out with UTC,
> while the PTP spec mandates TAI. So even if the system clock is
> perfectly synchronized to UTC at boot,

When the system boots, it is not synchronized.  Only after ntpd or
ptp4l start their work can we say that.

> we set the PTP hardware 37 
> seconds slow.

s/slow/behind/
 
> It would not be hard to change all PTP drivers to explicitly initialize to
> TAI, but that might create its own set of problems if random code
> depends on the current behavior.

Right. (But there was never any guarantee.)

Also, devices that don't have an RTC (like many embedded) start with
1970 anyhow.  So you really can't have "correct" time at boot.  The
question is, which incorrect time would you prefer?

IHMO a clearly wrong value is more useful than one that might be
mistaken as correct by a casual glance from the sysadmin.

> I also see that "phc_ctl /dev/ptp0 set" defaults to CLOCK_REALTIME
> and has no option to use CLOCK_TAI instead. How is this meant to
> work? I see a lot of other code that tries to deal with leap seconds and
> the tai offset, so I hope I was just misreading that code.

You *can* set UTC and then jump the clock by 37 in two steps.

But I don't see an simple solution for setting the TAI-UTC offset at
boot without actually consulting an outside source.  Even if you have
the NTP leap seconds file, how does the system know the file is up to
date?

For PTP and PHC devices, there are two use cases.

1. The device is a slave.  In this case, applications need to wait
   until the PTP status bits say that the time is valid.  The invalid
   time before shouldn't be trusted at all.

2. The device is a grand master.  In this case, the PTP stack has to
   wait until its global time source (like GPS) is ready.  Then it
   will synchronize the local PHC to the global source, and only then
   should it advertise the valid bits.

So I don't think the particular kind of wrong start-up value makes
much difference in practice.

You could argue that if

  a) the system has an RTC, and
  b) the battery isn't dead, and
  c) there is a leap seconds file that isn't out of date,

then the initial PHC time will be less wrong (for some definition of
wrong) using TAI than if the driver had used UTC or 1970.

I myself can't get too excited about having "less wrong" time, but I
wouldn't mind trying to set TAI in the PHC layer if people feel
strongly about it.

Thanks,
Richard

^ permalink raw reply

* Re: [GIT PULL] 9p updates for 4.19
From: Linus Torvalds @ 2018-08-17 23:41 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Dominique Martinet, V9FS Developers, Linux Kernel Mailing List,
	Network Development
In-Reply-To: <20180817154155.ebe6ffee995a37ade029db2f@linux-foundation.org>

On Fri, Aug 17, 2018 at 3:41 PM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> Please do.  I'll actually be sending you the MAINTAINERS update in
> about 55 seconds.

Heh. Ok, that resolves my biggest issue with the pull request.

            Linus

^ permalink raw reply

* Re: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask
From: Sagi Grimberg @ 2018-08-17 20:26 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: Steve Wise, 'Max Gurtovoy', 'Leon Romanovsky',
	'Doug Ledford', 'RDMA mailing list',
	'Saeed Mahameed', 'linux-netdev'
In-Reply-To: <20180817201701.GJ28676@mellanox.com>

Hi Jason,

> The new patchworks doesn't grab patches inlined in messages, so you
> will need to resend it.

Yes, just wanted to to add Steve's tested by as its going to
lists that did not follow this thread.

> Also, can someone remind me what the outcome is here? Does it
> supersede Leon's patch:
> 
> https://patchwork.kernel.org/patch/10526167/

Leon's patch is exposing the breakage so I think it would be
wise to have it go in after this lands mainline.

^ permalink raw reply

* Re: [iproute PATCH v5 1/2] Make colored output configurable
From: David Ahern @ 2018-08-17 20:22 UTC (permalink / raw)
  To: Phil Sutter, Stephen Hemminger; +Cc: netdev, Till Maas
In-Reply-To: <20180817163846.27578-1-phil@nwl.cc>

On 8/17/18 10:38 AM, Phil Sutter wrote:
> Allow for -color={never,auto,always} to have colored output disabled,
> enabled only if stdout is a terminal or enabled regardless of stdout
> state.
> 
> Signed-off-by: Phil Sutter <phil@nwl.cc>
> ---
> Changes since v1:
> - Allow to override isatty() check by specifying '-color' flag more than
>   once.
> - Document new behaviour in man pages.
> 
> Changes since v2:
> - Implement new -color=foo syntax.
> - Update commit message and man page texts accordingly.
> 
> Changes since v3:
> - Fix typo in tc/tc.c causing compile error.
> 
> Changes since v4:
> - Make matches_color() return boolean.
> ---
>  bridge/bridge.c   |  3 +--
>  include/color.h   |  9 +++++++++
>  ip/ip.c           |  3 +--
>  lib/color.c       | 33 ++++++++++++++++++++++++++++++++-
>  man/man8/bridge.8 | 13 +++++++++++--
>  man/man8/ip.8     | 13 +++++++++++--
>  man/man8/tc.8     | 13 +++++++++++--
>  tc/tc.c           |  3 +--
>  8 files changed, 77 insertions(+), 13 deletions(-)
> 

LGTM.

Reviewed-by: David Ahern <dsahern@gmail.com>

^ permalink raw reply

* Re: [PATCH] sunhme: convert printk to pr_cont
From: David Miller @ 2018-08-17 20:21 UTC (permalink / raw)
  To: mpatocka; +Cc: stephen, sparclinux, netdev
In-Reply-To: <alpine.LRH.2.02.1808171606520.2385@file01.intranet.prod.int.rdu2.redhat.com>

From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 17 Aug 2018 16:08:49 -0400 (EDT)

> I'm not an expert on networking code - you can change it if it is more 
> appropriate this way.

What Stephen is asking of you doesn't require networking expertiece
and he even gave you an example of how to do it.  All you would need
to do is test is suggestion and make sure it works properly.

^ permalink raw reply

* Re: [PATCH iproute2-next] iproute_lwtunnel: allow specifying 'src' for 'encap ip' / 'encap ip6'
From: David Ahern @ 2018-08-17 20:20 UTC (permalink / raw)
  To: Shmulik Ladkani, stephen; +Cc: netdev, shmulik.ladkani, Shmulik Ladkani
In-Reply-To: <20180817073134.19569-1-shmulik.ladkani@gmail.com>

On 8/17/18 1:31 AM, Shmulik Ladkani wrote:
> This allows the user to specify the LWTUNNEL_IP_SRC/LWTUNNEL_IP6_SRC
> when setting an lwtunnel encapsulation route.
> 
> Signed-off-by: Shmulik Ladkani <shmulik.ladkani@gmail.com>
> ---
>  ip/iproute_lwtunnel.c | 22 ++++++++++++++++++++--
>  1 file changed, 20 insertions(+), 2 deletions(-)
> 

applied to iproute2-next. Thanks

^ permalink raw reply

* Re: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask
From: Jason Gunthorpe @ 2018-08-17 20:17 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Steve Wise, 'Max Gurtovoy', 'Leon Romanovsky',
	'Doug Ledford', 'RDMA mailing list',
	'Saeed Mahameed', 'linux-netdev'
In-Reply-To: <ed1ea546-e7aa-3c17-710b-8c95fe90deea@grimberg.me>

On Fri, Aug 17, 2018 at 01:03:20PM -0700, Sagi Grimberg wrote:
> 
> > Hey Sagi,
> > 
> > The patch works allowing connections for the various affinity mappings below:
> > 
> > One comp_vector per core across all cores, starting with numa-local cores:
> 
> Thanks Steve, is this your "Tested by:" tag?

The new patchworks doesn't grab patches inlined in messages, so you
will need to resend it.

Also, can someone remind me what the outcome is here? Does it
supersede Leon's patch:

https://patchwork.kernel.org/patch/10526167/

?

Thanks,
Jason

^ permalink raw reply

* Re: [PATCH] sunhme: convert printk to pr_cont
From: Mikulas Patocka @ 2018-08-17 20:08 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David S. Miller, sparclinux, netdev
In-Reply-To: <20180817125228.25cf47ce@xeon-e3>



On Fri, 17 Aug 2018, Stephen Hemminger wrote:

> On Fri, 17 Aug 2018 15:12:22 -0400 (EDT)
> Mikulas Patocka <mpatocka@redhat.com> wrote:
> 
> > ===================================================================
> > --- linux-stable.orig/drivers/net/ethernet/sun/sunhme.c	2018-04-20 18:11:00.000000000 +0200
> > +++ linux-stable/drivers/net/ethernet/sun/sunhme.c	2018-08-13 22:01:08.000000000 +0200
> > @@ -572,21 +572,21 @@ static void display_link_mode(struct hap
> >  {
> >  	printk(KERN_INFO "%s: Link is up using ", hp->dev->name);
> >  	if (hp->tcvr_type == external)
> > -		printk("external ");
> > +		pr_cont("external ");
> >  	else
> > -		printk("internal ");
> > -	printk("transceiver at ");
> > +		pr_cont("internal ");
> > +	pr_cont("transceiver at ");
> >  	hp->sw_lpa = happy_meal_tcvr_read(hp, tregs, MII_LPA);
> >  	if (hp->sw_lpa & (LPA_100HALF | LPA_100FULL)) {
> >  		if (hp->sw_lpa & LPA_100FULL)
> > -			printk("100Mb/s, Full Duplex.\n");
> > +			pr_cont("100Mb/s, Full Duplex.\n");
> >  		else
> > -			printk("100Mb/s, Half Duplex.\n");
> > +			pr_cont("100Mb/s, Half Duplex.\n");
> >  	} else {
> >  		if (hp->sw_lpa & LPA_10FULL)
> > -			printk("10Mb/s, Full Duplex.\n");
> > +			pr_cont("10Mb/s, Full Duplex.\n");
> >  		else
> > -			printk("10Mb/s, Half Duplex.\n");
> > +			pr_cont("10Mb/s, Half Duplex.\n");
> >  	}
> >  }
> 
> Why not just  use a single netdev_info (or drop the useless message altogether).
> 
> I.e
> 	netdev_info(hp->dev, "Link is up using %s transceiver at %dMb/s %s Duplex\n",
> 		(hp->tcvr->type == external) ? "external" : "internal",
> 		(hp->sw_lpa & (LPA_100HALF | LPA_100FULL)) ? 100 : 10,
> 		(hw->sw_lpa & (LPA_100FULL | LPA_10FULL)) ? "Full" : "Half"));

I'm not an expert on networking code - you can change it if it is more 
appropriate this way.

Mikulas

^ permalink raw reply

* Re: [PATCH mlx5-next] RDMA/mlx5: Don't use cached IRQ affinity mask
From: Sagi Grimberg @ 2018-08-17 20:03 UTC (permalink / raw)
  To: Steve Wise, 'Max Gurtovoy', 'Jason Gunthorpe'
  Cc: 'Leon Romanovsky', 'Doug Ledford',
	'RDMA mailing list', 'Saeed Mahameed',
	'linux-netdev'
In-Reply-To: <000001d43645$d0fdbc00$72f93400$@opengridcomputing.com>


> Hey Sagi,
> 
> The patch works allowing connections for the various affinity mappings below:
> 
> One comp_vector per core across all cores, starting with numa-local cores:

Thanks Steve, is this your "Tested by:" tag?

^ permalink raw reply

* Re: [PATCH] sunhme: convert printk to pr_cont
From: Stephen Hemminger @ 2018-08-17 19:52 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: David S. Miller, sparclinux, netdev
In-Reply-To: <alpine.LRH.2.02.1808171510510.31883@file01.intranet.prod.int.rdu2.redhat.com>

On Fri, 17 Aug 2018 15:12:22 -0400 (EDT)
Mikulas Patocka <mpatocka@redhat.com> wrote:

> ===================================================================
> --- linux-stable.orig/drivers/net/ethernet/sun/sunhme.c	2018-04-20 18:11:00.000000000 +0200
> +++ linux-stable/drivers/net/ethernet/sun/sunhme.c	2018-08-13 22:01:08.000000000 +0200
> @@ -572,21 +572,21 @@ static void display_link_mode(struct hap
>  {
>  	printk(KERN_INFO "%s: Link is up using ", hp->dev->name);
>  	if (hp->tcvr_type == external)
> -		printk("external ");
> +		pr_cont("external ");
>  	else
> -		printk("internal ");
> -	printk("transceiver at ");
> +		pr_cont("internal ");
> +	pr_cont("transceiver at ");
>  	hp->sw_lpa = happy_meal_tcvr_read(hp, tregs, MII_LPA);
>  	if (hp->sw_lpa & (LPA_100HALF | LPA_100FULL)) {
>  		if (hp->sw_lpa & LPA_100FULL)
> -			printk("100Mb/s, Full Duplex.\n");
> +			pr_cont("100Mb/s, Full Duplex.\n");
>  		else
> -			printk("100Mb/s, Half Duplex.\n");
> +			pr_cont("100Mb/s, Half Duplex.\n");
>  	} else {
>  		if (hp->sw_lpa & LPA_10FULL)
> -			printk("10Mb/s, Full Duplex.\n");
> +			pr_cont("10Mb/s, Full Duplex.\n");
>  		else
> -			printk("10Mb/s, Half Duplex.\n");
> +			pr_cont("10Mb/s, Half Duplex.\n");
>  	}
>  }

Why not just  use a single netdev_info (or drop the useless message altogether).

I.e
	netdev_info(hp->dev, "Link is up using %s transceiver at %dMb/s %s Duplex\n",
		(hp->tcvr->type == external) ? "external" : "internal",
		(hp->sw_lpa & (LPA_100HALF | LPA_100FULL)) ? 100 : 10,
		(hw->sw_lpa & (LPA_100FULL | LPA_10FULL)) ? "Full" : "Half"));

^ permalink raw reply

* Re: [endianness bug] cxgb4: mk_act_open_req() buggers ->{local,peer}_ip on big-endian hosts
From: Al Viro @ 2018-08-17 19:44 UTC (permalink / raw)
  To: Ganesh Goudar; +Cc: Rahul Lakkireddy, David Miller, netdev@vger.kernel.org
In-Reply-To: <20180817185944.GG6515@ZenIV.linux.org.uk>

On Fri, Aug 17, 2018 at 07:59:44PM +0100, Al Viro wrote:
> On Fri, Aug 17, 2018 at 07:58:41PM +0100, Al Viro wrote:
> > On Fri, Aug 17, 2018 at 07:09:49PM +0100, Al Viro wrote:
> > 
> > > Re that code - are you sure it doesn't need le64_to_cpu(*src)?  Because from what
> > > I understand about PCI (which matches just fine to the comments in the same driver),
> > > you probably do need that.  Again, the only real way to find out is to test on
> > > big-endian host...
> > 
> > BTW, would that, by any chance, be an open-coded
> > 	_iowrite64_copy(dst, src, EQ_UNIT/sizeof(u64))
> 
> __iowrite64_copy, even...

FWIW, it looks like the confusion had been between the endianness of the data structures
(b-e both on host and NIC side) and the fact that PCI is l-e.  *IF* that code wants to
copy data from host data structures to iomem as-is, it needs to use __raw_writeq() and
its ilk or writeq(le64_to_cpu(...)) to compensate.  The latter would, indeed, confuse
sparse - we are accessing b-e data as if it was l-e.

If we want copying that wouldn't affect the endianness, we need memcpy_toio() or similar
beasts.  And AFAICS that code is very close to
                /* If we're only writing a single Egress Unit and the BAR2
                 * Queue ID is 0, we can use the Write Combining Doorbell
                 * Gather Buffer; otherwise we use the simple doorbell.
                 */
                if (n == 1 && tq->bar2_qid == 0) {
                        unsigned int index = (tq->pidx ?: tq->size) - 1;
                        /* Copy the TX Descriptor in a tight loop in order to
                         * try to get it to the adapter in a single Write
                         * Combined transfer on the PCI-E Bus.  If the Write
                         * Combine fails (say because of an interrupt, etc.)
                         * the hardware will simply take the last write as a
                         * simple doorbell write with a PIDX Increment of 1
                         * and will fetch the TX Descriptor from memory via
                         * DMA.
                         */
			__iowrite64_copy(tq->bar2_addr + SGE_UDB_WCDOORBELL,
					 &tq->desc[index], EQ_UNIT/sizeof(u64))
		} else {
                        writel(val | QID_V(tq->bar2_qid),
                               tq->bar2_addr + SGE_UDB_KDOORBELL);
		}
                /* This Write Memory Barrier will force the write to the User
                 * Doorbell area to be flushed.  This is needed to prevent
                 * writes on different CPUs for the same queue from hitting
                 * the adapter out of order.  This is required when some Work
                 * Requests take the Write Combine Gather Buffer path (user
                 * doorbell area offset [SGE_UDB_WCDOORBELL..+63]) and some
                 * take the traditional path where we simply increment the
                 * PIDX (User Doorbell area SGE_UDB_KDOORBELL) and have the
                 * hardware DMA read the actual Work Request.
                 */
                wmb();

which wouldn't have looked unusual...  Again, that really needs review from
the folks familiar with the hardware in question, as well as testing - I'm
not trying to push patches like that.  If the current mainline variant
really works on b-e, I'd like to understand how does it manage that, though...

^ permalink raw reply

* [PATCH 15/15] netfilter: nft_dynset: allow dynamic updates of non-anonymous set
From: Pablo Neira Ayuso @ 2018-08-17 19:41 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180817194106.2878-1-pablo@netfilter.org>

This check is superfluous since it breaks valid configurations, remove it.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_dynset.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 81184c244d1a..6e91a37d57f2 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -187,8 +187,6 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
 	if (tb[NFTA_DYNSET_EXPR] != NULL) {
 		if (!(set->flags & NFT_SET_EVAL))
 			return -EINVAL;
-		if (!nft_set_is_anonymous(set))
-			return -EOPNOTSUPP;
 
 		priv->expr = nft_expr_init(ctx, tb[NFTA_DYNSET_EXPR]);
 		if (IS_ERR(priv->expr))
-- 
2.11.0

^ permalink raw reply related

* [PATCH 14/15] netfilter: nft_tproxy: Fix missing-braces warning
From: Pablo Neira Ayuso @ 2018-08-17 19:41 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180817194106.2878-1-pablo@netfilter.org>

From: Máté Eckl <ecklm94@gmail.com>

This patch fixes a warning reported by the kbuild test robot (from linux-next
tree):
   net/netfilter/nft_tproxy.c: In function 'nft_tproxy_eval_v6':
>> net/netfilter/nft_tproxy.c:85:9: warning: missing braces around initializer [-Wmissing-braces]
     struct in6_addr taddr = {0};
            ^
   net/netfilter/nft_tproxy.c:85:9: warning: (near initialization for 'taddr.in6_u') [-Wmissing-braces]

This warning is actually caused by a gcc bug already resolved in newer
versions (kbuild used 4.9) so this kind of initialization is omitted and
memset is used instead.

Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support")
Signed-off-by: Máté Eckl <ecklm94@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_tproxy.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
index eff99dffc842..f92a82c73880 100644
--- a/net/netfilter/nft_tproxy.c
+++ b/net/netfilter/nft_tproxy.c
@@ -82,13 +82,15 @@ static void nft_tproxy_eval_v6(const struct nft_expr *expr,
 	const struct nft_tproxy *priv = nft_expr_priv(expr);
 	struct sk_buff *skb = pkt->skb;
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
-	struct in6_addr taddr = {0};
+	struct in6_addr taddr;
 	int thoff = pkt->xt.thoff;
 	struct udphdr _hdr, *hp;
 	__be16 tport = 0;
 	struct sock *sk;
 	int l4proto;
 
+	memset(&taddr, 0, sizeof(taddr));
+
 	if (!pkt->tprot_set) {
 		regs->verdict.code = NFT_BREAK;
 		return;
-- 
2.11.0

^ permalink raw reply related

* [PATCH 13/15] netfilter: uapi: fix linux/netfilter/nf_osf.h userspace compilation errors
From: Pablo Neira Ayuso @ 2018-08-17 19:41 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20180817194106.2878-1-pablo@netfilter.org>

From: "Dmitry V. Levin" <ldv@altlinux.org>

Move inclusion of <linux/ip.h> and <linux/tcp.h> from
linux/netfilter/xt_osf.h to linux/netfilter/nf_osf.h to fix
the following linux/netfilter/nf_osf.h userspace compilation errors:

/usr/include/linux/netfilter/nf_osf.h:59:24: error: 'MAX_IPOPTLEN' undeclared here (not in a function)
  struct nf_osf_opt opt[MAX_IPOPTLEN];
/usr/include/linux/netfilter/nf_osf.h:64:17: error: field 'ip' has incomplete type
  struct iphdr   ip;
/usr/include/linux/netfilter/nf_osf.h:65:18: error: field 'tcp' has incomplete type
  struct tcphdr   tcp;

Fixes: bfb15f2a95cb ("netfilter: extract Passive OS fingerprint infrastructure from xt_osf")
Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nfnetlink_osf.h | 2 ++
 include/uapi/linux/netfilter/xt_osf.h        | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/netfilter/nfnetlink_osf.h b/include/uapi/linux/netfilter/nfnetlink_osf.h
index 76a3527df5dd..272bc3195f2d 100644
--- a/include/uapi/linux/netfilter/nfnetlink_osf.h
+++ b/include/uapi/linux/netfilter/nfnetlink_osf.h
@@ -2,6 +2,8 @@
 #define _NF_OSF_H
 
 #include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
 
 #define MAXGENRELEN	32
 
diff --git a/include/uapi/linux/netfilter/xt_osf.h b/include/uapi/linux/netfilter/xt_osf.h
index 24102b5286ec..6e466236ca4b 100644
--- a/include/uapi/linux/netfilter/xt_osf.h
+++ b/include/uapi/linux/netfilter/xt_osf.h
@@ -21,8 +21,6 @@
 #define _XT_OSF_H
 
 #include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
 #include <linux/netfilter/nfnetlink_osf.h>
 
 #define XT_OSF_GENRE		NF_OSF_GENRE
-- 
2.11.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox