* [PATCH bpf-next v3 3/4] bpf: sync bpf.h to tools/
From: Stanislav Fomichev @ 2019-08-13 16:26 UTC (permalink / raw)
To: netdev, bpf
Cc: davem, ast, daniel, Stanislav Fomichev, Martin KaFai Lau,
Yonghong Song
In-Reply-To: <20190813162630.124544-1-sdf@google.com>
Sync new sk storage clone flag.
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
tools/include/uapi/linux/bpf.h | 3 +++
1 file changed, 3 insertions(+)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4393bd4b2419..0ef594ac3899 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -337,6 +337,9 @@ enum bpf_attach_type {
#define BPF_F_RDONLY_PROG (1U << 7)
#define BPF_F_WRONLY_PROG (1U << 8)
+/* Clone map from listener for newly accepted socket */
+#define BPF_F_CLONE (1U << 9)
+
/* flags for BPF_PROG_QUERY */
#define BPF_F_QUERY_EFFECTIVE (1U << 0)
--
2.23.0.rc1.153.gdeed80330f-goog
^ permalink raw reply related
* [PATCH bpf-next v3 4/4] selftests/bpf: add sockopt clone/inheritance test
From: Stanislav Fomichev @ 2019-08-13 16:26 UTC (permalink / raw)
To: netdev, bpf
Cc: davem, ast, daniel, Stanislav Fomichev, Martin KaFai Lau,
Yonghong Song
In-Reply-To: <20190813162630.124544-1-sdf@google.com>
Add a test that calls setsockopt on the listener socket which triggers
BPF program. This BPF program writes to the sk storage and sets
clone flag. Make sure that sk storage is cloned for a newly
accepted connection.
We have two cloned maps in the tests to make sure we hit both cases
in bpf_sk_storage_clone: first element (sk_storage_alloc) and
non-first element(s) (selem_link_map).
Cc: Martin KaFai Lau <kafai@fb.com>
Cc: Yonghong Song <yhs@fb.com>
Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
tools/testing/selftests/bpf/.gitignore | 1 +
tools/testing/selftests/bpf/Makefile | 3 +-
.../selftests/bpf/progs/sockopt_inherit.c | 97 +++++++
.../selftests/bpf/test_sockopt_inherit.c | 253 ++++++++++++++++++
4 files changed, 353 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/bpf/progs/sockopt_inherit.c
create mode 100644 tools/testing/selftests/bpf/test_sockopt_inherit.c
diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 90f70d2c7c22..60c9338cd9b4 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -42,4 +42,5 @@ xdping
test_sockopt
test_sockopt_sk
test_sockopt_multi
+test_sockopt_inherit
test_tcp_rtt
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 3bd0f4a0336a..c875763a851a 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
test_cgroup_storage test_select_reuseport test_section_names \
test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
test_btf_dump test_cgroup_attach xdping test_sockopt test_sockopt_sk \
- test_sockopt_multi test_tcp_rtt
+ test_sockopt_multi test_sockopt_inherit test_tcp_rtt
BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
TEST_GEN_FILES = $(BPF_OBJ_FILES)
@@ -110,6 +110,7 @@ $(OUTPUT)/test_cgroup_attach: cgroup_helpers.c
$(OUTPUT)/test_sockopt: cgroup_helpers.c
$(OUTPUT)/test_sockopt_sk: cgroup_helpers.c
$(OUTPUT)/test_sockopt_multi: cgroup_helpers.c
+$(OUTPUT)/test_sockopt_inherit: cgroup_helpers.c
$(OUTPUT)/test_tcp_rtt: cgroup_helpers.c
.PHONY: force
diff --git a/tools/testing/selftests/bpf/progs/sockopt_inherit.c b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
new file mode 100644
index 000000000000..dede0fcd6102
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sockopt_inherit.c
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
+
+#define SOL_CUSTOM 0xdeadbeef
+#define CUSTOM_INHERIT1 0
+#define CUSTOM_INHERIT2 1
+#define CUSTOM_LISTENER 2
+
+struct sockopt_inherit {
+ __u8 val;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
+ __type(key, int);
+ __type(value, struct sockopt_inherit);
+} cloned1_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC | BPF_F_CLONE);
+ __type(key, int);
+ __type(value, struct sockopt_inherit);
+} cloned2_map SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_SK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct sockopt_inherit);
+} listener_only_map SEC(".maps");
+
+static __inline struct sockopt_inherit *get_storage(struct bpf_sockopt *ctx)
+{
+ if (ctx->optname == CUSTOM_INHERIT1)
+ return bpf_sk_storage_get(&cloned1_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ else if (ctx->optname == CUSTOM_INHERIT2)
+ return bpf_sk_storage_get(&cloned2_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+ else
+ return bpf_sk_storage_get(&listener_only_map, ctx->sk, 0,
+ BPF_SK_STORAGE_GET_F_CREATE);
+}
+
+SEC("cgroup/getsockopt")
+int _getsockopt(struct bpf_sockopt *ctx)
+{
+ __u8 *optval_end = ctx->optval_end;
+ struct sockopt_inherit *storage;
+ __u8 *optval = ctx->optval;
+
+ if (ctx->level != SOL_CUSTOM)
+ return 1; /* only interested in SOL_CUSTOM */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ storage = get_storage(ctx);
+ if (!storage)
+ return 0; /* EPERM, couldn't get sk storage */
+
+ ctx->retval = 0; /* Reset system call return value to zero */
+
+ optval[0] = storage->val;
+ ctx->optlen = 1;
+
+ return 1;
+}
+
+SEC("cgroup/setsockopt")
+int _setsockopt(struct bpf_sockopt *ctx)
+{
+ __u8 *optval_end = ctx->optval_end;
+ struct sockopt_inherit *storage;
+ __u8 *optval = ctx->optval;
+
+ if (ctx->level != SOL_CUSTOM)
+ return 1; /* only interested in SOL_CUSTOM */
+
+ if (optval + 1 > optval_end)
+ return 0; /* EPERM, bounds check */
+
+ storage = get_storage(ctx);
+ if (!storage)
+ return 0; /* EPERM, couldn't get sk storage */
+
+ storage->val = optval[0];
+ ctx->optlen = -1;
+
+ return 1;
+}
diff --git a/tools/testing/selftests/bpf/test_sockopt_inherit.c b/tools/testing/selftests/bpf/test_sockopt_inherit.c
new file mode 100644
index 000000000000..1bf699815b9b
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_sockopt_inherit.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <error.h>
+#include <errno.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <pthread.h>
+
+#include <linux/filter.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "bpf_rlimit.h"
+#include "bpf_util.h"
+#include "cgroup_helpers.h"
+
+#define CG_PATH "/sockopt_inherit"
+#define SOL_CUSTOM 0xdeadbeef
+#define CUSTOM_INHERIT1 0
+#define CUSTOM_INHERIT2 1
+#define CUSTOM_LISTENER 2
+
+static int connect_to_server(int server_fd)
+{
+ struct sockaddr_storage addr;
+ socklen_t len = sizeof(addr);
+ int fd;
+
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (fd < 0) {
+ log_err("Failed to create client socket");
+ return -1;
+ }
+
+ if (getsockname(server_fd, (struct sockaddr *)&addr, &len)) {
+ log_err("Failed to get server addr");
+ goto out;
+ }
+
+ if (connect(fd, (const struct sockaddr *)&addr, len) < 0) {
+ log_err("Fail to connect to server");
+ goto out;
+ }
+
+ return fd;
+
+out:
+ close(fd);
+ return -1;
+}
+
+static int verify_sockopt(int fd, int optname, const char *msg, char expected)
+{
+ socklen_t optlen = 1;
+ char buf = 0;
+ int err;
+
+ err = getsockopt(fd, SOL_CUSTOM, optname, &buf, &optlen);
+ if (err) {
+ log_err("%s: failed to call getsockopt", msg);
+ return 1;
+ }
+
+ printf("%s %d: got=0x%x ? expected=0x%x\n", msg, optname, buf, expected);
+
+ if (buf != expected) {
+ log_err("%s: unexpected getsockopt value %d != %d", msg,
+ buf, expected);
+ return 1;
+ }
+
+ return 0;
+}
+
+static void *server_thread(void *arg)
+{
+ struct sockaddr_storage addr;
+ socklen_t len = sizeof(addr);
+ int fd = *(int *)arg;
+ int client_fd;
+ int err = 0;
+
+ if (listen(fd, 1) < 0)
+ error(1, errno, "Failed to listed on socket");
+
+ err += verify_sockopt(fd, CUSTOM_INHERIT1, "listen", 1);
+ err += verify_sockopt(fd, CUSTOM_INHERIT2, "listen", 1);
+ err += verify_sockopt(fd, CUSTOM_LISTENER, "listen", 1);
+
+ client_fd = accept(fd, (struct sockaddr *)&addr, &len);
+ if (client_fd < 0)
+ error(1, errno, "Failed to accept client");
+
+ err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "accept", 1);
+ err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "accept", 1);
+ err += verify_sockopt(client_fd, CUSTOM_LISTENER, "accept", 0);
+
+ close(client_fd);
+
+ return (void *)(long)err;
+}
+
+static int start_server(void)
+{
+ struct sockaddr_in addr = {
+ .sin_family = AF_INET,
+ .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
+ };
+ char buf;
+ int err;
+ int fd;
+ int i;
+
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (fd < 0) {
+ log_err("Failed to create server socket");
+ return -1;
+ }
+
+ for (i = CUSTOM_INHERIT1; i <= CUSTOM_LISTENER; i++) {
+ buf = 0x01;
+ err = setsockopt(fd, SOL_CUSTOM, i, &buf, 1);
+ if (err) {
+ log_err("Failed to call setsockopt(%d)", i);
+ close(fd);
+ return -1;
+ }
+ }
+
+ if (bind(fd, (const struct sockaddr *)&addr, sizeof(addr)) < 0) {
+ log_err("Failed to bind socket");
+ close(fd);
+ return -1;
+ }
+
+ return fd;
+}
+
+static int prog_attach(struct bpf_object *obj, int cgroup_fd, const char *title)
+{
+ enum bpf_attach_type attach_type;
+ enum bpf_prog_type prog_type;
+ struct bpf_program *prog;
+ int err;
+
+ err = libbpf_prog_type_by_name(title, &prog_type, &attach_type);
+ if (err) {
+ log_err("Failed to deduct types for %s BPF program", title);
+ return -1;
+ }
+
+ prog = bpf_object__find_program_by_title(obj, title);
+ if (!prog) {
+ log_err("Failed to find %s BPF program", title);
+ return -1;
+ }
+
+ err = bpf_prog_attach(bpf_program__fd(prog), cgroup_fd,
+ attach_type, 0);
+ if (err) {
+ log_err("Failed to attach %s BPF program", title);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int run_test(int cgroup_fd)
+{
+ struct bpf_prog_load_attr attr = {
+ .file = "./sockopt_inherit.o",
+ };
+ int server_fd = -1, client_fd;
+ struct bpf_object *obj;
+ void *server_err;
+ pthread_t tid;
+ int ignored;
+ int err;
+
+ err = bpf_prog_load_xattr(&attr, &obj, &ignored);
+ if (err) {
+ log_err("Failed to load BPF object");
+ return -1;
+ }
+
+ err = prog_attach(obj, cgroup_fd, "cgroup/getsockopt");
+ if (err)
+ goto close_bpf_object;
+
+ err = prog_attach(obj, cgroup_fd, "cgroup/setsockopt");
+ if (err)
+ goto close_bpf_object;
+
+ server_fd = start_server();
+ if (server_fd < 0) {
+ err = -1;
+ goto close_bpf_object;
+ }
+
+ pthread_create(&tid, NULL, server_thread, (void *)&server_fd);
+
+ client_fd = connect_to_server(server_fd);
+ if (client_fd < 0) {
+ err = -1;
+ goto close_server_fd;
+ }
+
+ err += verify_sockopt(client_fd, CUSTOM_INHERIT1, "connect", 0);
+ err += verify_sockopt(client_fd, CUSTOM_INHERIT2, "connect", 0);
+ err += verify_sockopt(client_fd, CUSTOM_LISTENER, "connect", 0);
+
+ pthread_join(tid, &server_err);
+
+ err += (int)(long)server_err;
+
+ close(client_fd);
+
+close_server_fd:
+ close(server_fd);
+close_bpf_object:
+ bpf_object__close(obj);
+ return err;
+}
+
+int main(int args, char **argv)
+{
+ int cgroup_fd;
+ int err = EXIT_SUCCESS;
+
+ if (setup_cgroup_environment())
+ return err;
+
+ cgroup_fd = create_and_get_cgroup(CG_PATH);
+ if (cgroup_fd < 0)
+ goto cleanup_cgroup_env;
+
+ if (join_cgroup(CG_PATH))
+ goto cleanup_cgroup;
+
+ if (run_test(cgroup_fd))
+ err = EXIT_FAILURE;
+
+ printf("test_sockopt_inherit: %s\n",
+ err == EXIT_SUCCESS ? "PASSED" : "FAILED");
+
+cleanup_cgroup:
+ close(cgroup_fd);
+cleanup_cgroup_env:
+ cleanup_cgroup_environment();
+ return err;
+}
--
2.23.0.rc1.153.gdeed80330f-goog
^ permalink raw reply related
* Re: [PATCH net] sctp: fix the transport error_count check
From: Marcelo Ricardo Leitner @ 2019-08-13 16:27 UTC (permalink / raw)
To: Xin Long; +Cc: network dev, linux-sctp, davem, Neil Horman
In-Reply-To: <55b2fe3e5123958ccd7983e0892bc604aa717132.1565614152.git.lucien.xin@gmail.com>
On Mon, Aug 12, 2019 at 08:49:12PM +0800, Xin Long wrote:
> As the annotation says in sctp_do_8_2_transport_strike():
>
> "If the transport error count is greater than the pf_retrans
> threshold, and less than pathmaxrtx ..."
>
> It should be transport->error_count checked with pathmaxrxt,
> instead of asoc->pf_retrans.
>
> Fixes: 5aa93bcf66f4 ("sctp: Implement quick failover draft from tsvwg")
> Signed-off-by: Xin Long <lucien.xin@gmail.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Dave, please consider this one for stable. Thanks.
> ---
> net/sctp/sm_sideeffect.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
> index a554d6d..1cf5bb5 100644
> --- a/net/sctp/sm_sideeffect.c
> +++ b/net/sctp/sm_sideeffect.c
> @@ -546,7 +546,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands,
> */
> if (net->sctp.pf_enable &&
> (transport->state == SCTP_ACTIVE) &&
> - (asoc->pf_retrans < transport->pathmaxrxt) &&
> + (transport->error_count < transport->pathmaxrxt) &&
> (transport->error_count > asoc->pf_retrans)) {
>
> sctp_assoc_control_transport(asoc, transport,
> --
> 2.1.0
>
^ permalink raw reply
* Re: [PATCH rdma-next 0/4] Add XRQ and SRQ support to DEVX interface
From: Doug Ledford @ 2019-08-13 16:28 UTC (permalink / raw)
To: Leon Romanovsky
Cc: Jason Gunthorpe, RDMA mailing list, Edward Srouji, Saeed Mahameed,
Yishai Hadas, linux-netdev
In-Reply-To: <20190813100642.GE29138@mtr-leonro.mtl.com>
[-- Attachment #1: Type: text/plain, Size: 1025 bytes --]
On Tue, 2019-08-13 at 10:06 +0000, Leon Romanovsky wrote:
> On Mon, Aug 12, 2019 at 11:43:58AM -0400, Doug Ledford wrote:
> > On Thu, 2019-08-08 at 10:11 +0000, Leon Romanovsky wrote:
> > > On Thu, Aug 08, 2019 at 11:43:54AM +0300, Leon Romanovsky wrote:
> > > > From: Leon Romanovsky <leonro@mellanox.com>
> > > >
> > > > Hi,
> > > >
> > > > This small series extends DEVX interface with SRQ and XRQ legacy
> > > > commands.
> > >
> > > Sorry for typo in cover letter, there is no SRQ here.
> >
> > Series looks fine to me. Are you planning on the first two via
> > mlx5-
> > next and the remainder via RDMA tree?
> >
>
> Thanks, applied to mlx5-next
>
> b1635ee6120c net/mlx5: Add XRQ legacy commands opcodes
> 647d58a989b3 net/mlx5: Use debug message instead of warn
Merged mlx5-next, then applied remaining two patches to for-next.
Thanks.
--
Doug Ledford <dledford@redhat.com>
GPG KeyID: B826A3330E572FDD
Fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 2FDD
[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 833 bytes --]
^ permalink raw reply
* Re: [PATCH net-next v2 6/9] net: macsec: hardware offloading infrastructure
From: Andrew Lunn @ 2019-08-13 16:28 UTC (permalink / raw)
To: Igor Russkikh
Cc: Antoine Tenart, davem@davemloft.net, sd@queasysnail.net,
f.fainelli@gmail.com, hkallweit1@gmail.com,
netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
thomas.petazzoni@bootlin.com, alexandre.belloni@bootlin.com,
allan.nielsen@microchip.com, camelia.groza@nxp.com,
Simon Edelhaus, Pavel Belous
In-Reply-To: <2e3c2307-d414-a531-26cb-064e05fa01fc@aquantia.com>
> 1) With current implementation it's impossible to install SW macsec engine onto
> the device which supports HW offload. That could be a strong limitation in
> cases when user sees HW macsec offload is broken or work differently, and he/she
> wants to replace it with SW one.
> MACSec is a complex feature, and it may happen something is missing in HW.
> Trivial example is 256bit encryption, which is not always a musthave in HW
> implementations.
Ideally, we want the driver to return EOPNOTSUPP if it does not
support something and the software implement should be used.
If the offload is broken, we want a bug report! And if it works
differently, it suggests there is also a bug we need to fix, or the
standard is ambiguous.
It would also be nice to add extra information to the netlink API to
indicate if HW or SW is being used. In other places where we offload
to accelerators we have such additional information.
Andrew
^ permalink raw reply
* Re: [PATCH] sctp: fix memleak in sctp_send_reset_streams
From: Marcelo Ricardo Leitner @ 2019-08-13 16:29 UTC (permalink / raw)
To: zhengbin; +Cc: vyasevich, nhorman, davem, linux-sctp, netdev, yi.zhang
In-Reply-To: <1565705150-17242-1-git-send-email-zhengbin13@huawei.com>
On Tue, Aug 13, 2019 at 10:05:50PM +0800, zhengbin wrote:
> If the stream outq is not empty, need to kfree nstr_list.
>
> Fixes: d570a59c5b5f ("sctp: only allow the out stream reset when the stream outq is empty")
> Reported-by: Hulk Robot <hulkci@huawei.com>
> Signed-off-by: zhengbin <zhengbin13@huawei.com>
Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> ---
> net/sctp/stream.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/net/sctp/stream.c b/net/sctp/stream.c
> index 2594660..e83cdaa 100644
> --- a/net/sctp/stream.c
> +++ b/net/sctp/stream.c
> @@ -316,6 +316,7 @@ int sctp_send_reset_streams(struct sctp_association *asoc,
> nstr_list[i] = htons(str_list[i]);
>
> if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) {
> + kfree(nstr_list);
> retval = -EAGAIN;
> goto out;
> }
> --
> 2.7.4
>
^ permalink raw reply
* Re: [PATCH net-next] net/ncsi: allow to customize BMC MAC Address offset
From: Terry Duncan @ 2019-08-13 16:31 UTC (permalink / raw)
To: Tao Ren
Cc: Jakub Kicinski, Andrew Lunn, netdev, openbmc, linux-kernel,
Samuel Mendoza-Jonas, David S.Miller, William Kennington
In-Reply-To: <bc9da695-3fd3-6643-8e06-562cc08fbc62@linux.intel.com>
Tao, in your new patch will it be possible to disable the setting of the
BMC MAC? I would like to be able to send NCSI_OEM_GET_MAC perhaps with
netlink (TBD) to get the system address without it affecting the BMC
address.
I was about to send patches to add support for the Intel adapters when I
saw this thread.
Thanks,
Terry
>>> After giving it more thought, I'm thinking about adding ncsi dt node
>>> with following structure (mac/ncsi similar to mac/mdio/phy):
>>>
>>> &mac0 {
>>> /* MAC properties... */
>>>
>>> use-ncsi;
>> This property seems to be specific to Faraday FTGMAC100. Are you going
>> to make it more generic?
> I'm also using ftgmac100 on my platform, and I don't have plan to change this property.
>
>>> ncsi {
>>> /* ncsi level properties if any */
>>>
>>> package@0 {
>> You should get Rob Herring involved. This is not really describing
>> hardware, so it might get rejected by the device tree maintainer.
> Got it. Thank you for the sharing, and let me think it over :-)
>
>>> 1) mac driver doesn't need to parse "mac-offset" stuff: these
>>> ncsi-network-controller specific settings should be parsed in ncsi
>>> stack.
>>> 2) get_bmc_mac_address command is a channel specific command, and
>>> technically people can configure different offset/formula for
>>> different channels.
>> Does that mean the NCSA code puts the interface into promiscuous mode?
>> Or at least adds these unicast MAC addresses to the MAC receive
>> filter? Humm, ftgmac100 only seems to support multicast address
>> filtering, not unicast filters, so it must be using promisc mode, if
>> you expect to receive frames using this MAC address.
> Uhh, I actually didn't think too much about this: basically it's how to configure frame filtering when there are multiple packages/channels active: single BMC MAC or multiple BMC MAC is also allowed?
> I don't have the answer yet, but will talk to NCSI expert and figure it out.
>
>
> Thanks,
>
> Tao
>
>
^ permalink raw reply
* Re: [PATCH V5 0/9] Fixes for vhost metadata acceleration
From: Christoph Hellwig @ 2019-08-13 16:41 UTC (permalink / raw)
To: Jason Gunthorpe
Cc: Jason Wang, Michael S. Tsirkin, kvm, virtualization, netdev,
linux-kernel, linux-mm
In-Reply-To: <20190813115707.GC29508@ziepe.ca>
On Tue, Aug 13, 2019 at 08:57:07AM -0300, Jason Gunthorpe wrote:
> On Tue, Aug 13, 2019 at 04:31:07PM +0800, Jason Wang wrote:
>
> > What kind of issues do you see? Spinlock is to synchronize GUP with MMU
> > notifier in this series.
>
> A GUP that can't sleep can't pagefault which makes it a really weird
> pattern
get_user_pages/get_user_pages_fast must not be called under a spinlock.
We have the somewhat misnamed __get_user_page_fast that just does a
lookup for existing pages and never faults for a few places that need
to do that lookup from contexts where we can't sleep.
^ permalink raw reply
* Re: tc - mirred ingress not supported at the moment
From: Cong Wang @ 2019-08-13 16:47 UTC (permalink / raw)
To: Martin Olsson; +Cc: netdev
In-Reply-To: <CAAT+qEbOx8Jh3aFS-e7U6FyHo03sdcY6UoeGzwYQbO6WRjc3PQ@mail.gmail.com>
On Tue, Aug 13, 2019 at 4:05 AM Martin Olsson
<martin.olsson+netdev@sentorsecurity.com> wrote:
> Q1: Why was 'ingress' not implemented at the same time as 'egress'?
Because you are using an old iproute2.
ingress support is added by:
https://git.kernel.org/pub/scm/network/iproute2/iproute2.git/commit/?id=5eca0a3701223619a513c7209f7d9335ca1b4cfa
> 2)
> Ok, so I have to use 'egress':
> # tc filter add dev eno2 parent ffff: prio 999 protocol all matchall
> action mirred egress redirect dev mon0
So you redirect packets from eno2's ingress to mon0's egress.
>
> Since the mirred action forces me to use 'egress' as the direction on
> the dest interface, all kinds of network statistics tools show
> incorrect counters. :-(
> eno2 is a pure sniffer interface (it is connected to the SPAN dest
> port of a switch).
> All packets (matchall) on eno2 are mirrored to mon0.
>
> # ip -s link show dev eno2
> ...
> ...
> RX: bytes packets errors dropped overrun mcast
> 13660757 16329 0 0 0 0
> TX: bytes packets errors dropped carrier collsns
> 0 0 0 0 0 0
> # ip -s link show dev mon0
> ...
> ...
> RX: bytes packets errors dropped overrun mcast
> 0 0 0 0 0 0
> TX: bytes packets errors dropped carrier collsns
> 13660757 16329 0 0 0 0
>
> eno2 and mon0 should be identical, but they are inverted.
Yes, this behavior is correct. The keyword "egress" in your cmdline
already says so.
>
> Q2: So... Can the 'ingress' option please be implemented? (I'm no
> programmer, so unfortunetly I can't do it myself).
It is completed, you need to update your iproute2 and kernel.
Thanks.
^ permalink raw reply
* Re: [PATCH bpf-next v3 2/4] bpf: support cloning sk storage on accept()
From: Yonghong Song @ 2019-08-13 16:58 UTC (permalink / raw)
To: Stanislav Fomichev, netdev@vger.kernel.org, bpf@vger.kernel.org
Cc: davem@davemloft.net, ast@kernel.org, daniel@iogearbox.net,
Martin Lau
In-Reply-To: <20190813162630.124544-3-sdf@google.com>
On 8/13/19 9:26 AM, Stanislav Fomichev wrote:
> Add new helper bpf_sk_storage_clone which optionally clones sk storage
> and call it from sk_clone_lock.
>
> Cc: Martin KaFai Lau <kafai@fb.com>
> Cc: Yonghong Song <yhs@fb.com>
> Acked-by: Yonghong Song <yhs@fb.com>
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
> include/net/bpf_sk_storage.h | 10 ++++
> include/uapi/linux/bpf.h | 3 +
> net/core/bpf_sk_storage.c | 103 ++++++++++++++++++++++++++++++++++-
> net/core/sock.c | 9 ++-
> 4 files changed, 119 insertions(+), 6 deletions(-)
>
> diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
> index b9dcb02e756b..8e4f831d2e52 100644
> --- a/include/net/bpf_sk_storage.h
> +++ b/include/net/bpf_sk_storage.h
> @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
> extern const struct bpf_func_proto bpf_sk_storage_get_proto;
> extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
>
> +#ifdef CONFIG_BPF_SYSCALL
> +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
> +#else
> +static inline int bpf_sk_storage_clone(const struct sock *sk,
> + struct sock *newsk)
> +{
> + return 0;
> +}
> +#endif
> +
> #endif /* _BPF_SK_STORAGE_H */
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 4393bd4b2419..0ef594ac3899 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -337,6 +337,9 @@ enum bpf_attach_type {
> #define BPF_F_RDONLY_PROG (1U << 7)
> #define BPF_F_WRONLY_PROG (1U << 8)
>
> +/* Clone map from listener for newly accepted socket */
> +#define BPF_F_CLONE (1U << 9)
> +
> /* flags for BPF_PROG_QUERY */
> #define BPF_F_QUERY_EFFECTIVE (1U << 0)
>
> diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> index 94c7f77ecb6b..1bc7de7e18ba 100644
> --- a/net/core/bpf_sk_storage.c
> +++ b/net/core/bpf_sk_storage.c
> @@ -12,6 +12,9 @@
>
> static atomic_t cache_idx;
>
> +#define SK_STORAGE_CREATE_FLAG_MASK \
> + (BPF_F_NO_PREALLOC | BPF_F_CLONE)
> +
> struct bucket {
> struct hlist_head list;
> raw_spinlock_t lock;
> @@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
> kfree_rcu(sk_storage, rcu);
> }
>
> -/* sk_storage->lock must be held and sk_storage->list cannot be empty */
> static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
> struct bpf_sk_storage_elem *selem)
> {
> @@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
> return 0;
> }
>
> -/* Called by __sk_destruct() */
> +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
> void bpf_sk_storage_free(struct sock *sk)
> {
> struct bpf_sk_storage_elem *selem;
> @@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
>
> smap = (struct bpf_sk_storage_map *)map;
>
> + /* Note that this map might be concurrently cloned from
> + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
> + * RCU read section to finish before proceeding. New RCU
> + * read sections should be prevented via bpf_map_inc_not_zero.
> + */
> synchronize_rcu();
>
> /* bpf prog and the userspace can no longer access this map
> @@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
>
> static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
> {
> - if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries ||
> + if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
> + !(attr->map_flags & BPF_F_NO_PREALLOC) ||
> + attr->max_entries ||
> attr->key_size != sizeof(int) || !attr->value_size ||
> /* Enforce BTF for userspace sk dumping */
> !attr->btf_key_type_id || !attr->btf_value_type_id)
> @@ -739,6 +748,94 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
> return err;
> }
>
> +static struct bpf_sk_storage_elem *
> +bpf_sk_storage_clone_elem(struct sock *newsk,
> + struct bpf_sk_storage_map *smap,
> + struct bpf_sk_storage_elem *selem)
> +{
> + struct bpf_sk_storage_elem *copy_selem;
> +
> + copy_selem = selem_alloc(smap, newsk, NULL, true);
> + if (!copy_selem)
> + return NULL;
> +
> + if (map_value_has_spin_lock(&smap->map))
> + copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
> + SDATA(selem)->data, true);
> + else
> + copy_map_value(&smap->map, SDATA(copy_selem)->data,
> + SDATA(selem)->data);
> +
> + return copy_selem;
> +}
> +
> +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
> +{
> + struct bpf_sk_storage *new_sk_storage = NULL;
> + struct bpf_sk_storage *sk_storage;
> + struct bpf_sk_storage_elem *selem;
> + int ret;
> +
> + RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> +
> + rcu_read_lock();
> + sk_storage = rcu_dereference(sk->sk_bpf_storage);
> +
> + if (!sk_storage || hlist_empty(&sk_storage->list))
> + goto out;
> +
> + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
> + struct bpf_sk_storage_elem *copy_selem;
> + struct bpf_sk_storage_map *smap;
> + struct bpf_map *map;
> +
> + smap = rcu_dereference(SDATA(selem)->smap);
> + if (!(smap->map.map_flags & BPF_F_CLONE))
> + continue;
> +
> + map = bpf_map_inc_not_zero(&smap->map, false);
> + if (IS_ERR(map))
> + continue;
> +
> + copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
> + if (!copy_selem) {
> + ret = -ENOMEM;
> + bpf_map_put(map);
> + goto err;
> + }
> +
> + if (new_sk_storage) {
> + selem_link_map(smap, copy_selem);
> + __selem_link_sk(new_sk_storage, copy_selem);
> + } else {
> + ret = sk_storage_alloc(newsk, smap, copy_selem);
> + if (ret) {
> + kfree(copy_selem);
> + atomic_sub(smap->elem_size,
> + &newsk->sk_omem_alloc);
> + bpf_map_put(map);
> + goto err;
> + }
> +
> + new_sk_storage = rcu_dereference(copy_selem->sk_storage);
> + }
> + bpf_map_put(map);
> + }
> +
> +out:
> + rcu_read_unlock();
> + return 0;
> +
> +err:
> + rcu_read_unlock();
> +
> + /* Don't free anything explicitly here, caller is responsible to
> + * call bpf_sk_storage_free in case of an error.
> + */
> +
> + return ret;
A nit.
If you set ret = 0 initially, you do not need the above two
rcu_read_unlock(). One "return ret" should be enough.
The comment can be changed to
/* In case of an error, ... */
> +}
> +
> BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> void *, value, u64, flags)
> {
> diff --git a/net/core/sock.c b/net/core/sock.c
> index d57b0cc995a0..f5e801a9cea4 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
> goto out;
> }
> RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
> -#ifdef CONFIG_BPF_SYSCALL
> - RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> -#endif
> +
> + if (bpf_sk_storage_clone(sk, newsk)) {
> + sk_free_unlock_clone(newsk);
> + newsk = NULL;
> + goto out;
> + }
>
> newsk->sk_err = 0;
> newsk->sk_err_soft = 0;
>
^ permalink raw reply
* Re: [patch net-next v3 1/3] net: devlink: allow to change namespaces
From: Jakub Kicinski @ 2019-08-13 17:04 UTC (permalink / raw)
To: Jiri Pirko; +Cc: netdev, davem, stephen, dsahern, mlxsw
In-Reply-To: <20190813061355.GF2428@nanopsycho>
On Tue, 13 Aug 2019 08:13:55 +0200, Jiri Pirko wrote:
> Tue, Aug 13, 2019 at 03:21:22AM CEST, jakub.kicinski@netronome.com wrote:
> >On Mon, 12 Aug 2019 15:47:49 +0200, Jiri Pirko wrote:
> >> @@ -6953,9 +7089,33 @@ int devlink_compat_switch_id_get(struct net_device *dev,
> >> return 0;
> >> }
> >>
> >> +static void __net_exit devlink_pernet_exit(struct net *net)
> >> +{
> >> + struct devlink *devlink;
> >> +
> >> + mutex_lock(&devlink_mutex);
> >> + list_for_each_entry(devlink, &devlink_list, list)
> >> + if (net_eq(devlink_net(devlink), net))
> >> + devlink_netns_change(devlink, &init_net);
> >> + mutex_unlock(&devlink_mutex);
> >> +}
> >
> >Just to be sure - this will not cause any locking issues?
> >Usually the locking order goes devlink -> rtnl
>
> rtnl is not taken. Do I miss something?
Probably not, just double checking.
^ permalink raw reply
* Re: [PATCH 12/16] arm64: prefer __section from compiler_attributes.h
From: Will Deacon @ 2019-08-13 17:08 UTC (permalink / raw)
To: Miguel Ojeda
Cc: Nick Desaulniers, Andrew Morton, Sedat Dilek, Josh Poimboeuf, yhs,
clang-built-linux, Catalin Marinas, Alexei Starovoitov,
Daniel Borkmann, Martin KaFai Lau, Song Liu, Andrey Konovalov,
Greg Kroah-Hartman, Enrico Weigelt, Suzuki K Poulose,
Thomas Gleixner, Masayoshi Mizuma, Shaokun Zhang, Alexios Zavras,
Allison Randal, Linux ARM, linux-kernel, Network Development, bpf
In-Reply-To: <CANiq72mAfJ23PyWzZAELgbKQDCX2nvY0z+dmOMe14qz=wa6eFg@mail.gmail.com>
On Tue, Aug 13, 2019 at 02:36:06PM +0200, Miguel Ojeda wrote:
> On Tue, Aug 13, 2019 at 10:27 AM Will Deacon <will@kernel.org> wrote:
> > On Mon, Aug 12, 2019 at 02:50:45PM -0700, Nick Desaulniers wrote:
> > > GCC unescapes escaped string section names while Clang does not. Because
> > > __section uses the `#` stringification operator for the section name, it
> > > doesn't need to be escaped.
> > >
> > > This antipattern was found with:
> > > $ grep -e __section\(\" -e __section__\(\" -r
> > >
> > > Reported-by: Sedat Dilek <sedat.dilek@gmail.com>
> > > Suggested-by: Josh Poimboeuf <jpoimboe@redhat.com>
> > > Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
> > > ---
> > > arch/arm64/include/asm/cache.h | 2 +-
> > > arch/arm64/kernel/smp_spin_table.c | 2 +-
> > > 2 files changed, 2 insertions(+), 2 deletions(-)
> >
> > Does this fix a build issue, or is it just cosmetic or do we end up with
> > duplicate sections or something else?
>
> This should be cosmetic -- basically we are trying to move all users
> of current available __attribute__s in compiler_attributes.h to the
> __attr forms. I am also adding (slowly) new attributes that are
> already used but we don't have them yet in __attr form.
>
> > Happy to route it via arm64, just having trouble working out whether it's
> > 5.3 material!
>
> As you prefer! Those that are not taken by a maintainer I will pick up
> and send via compiler-attributes.
>
> I would go for 5.4, since there is no particular rush anyway.
Okey doke, I'll pick this one up for 5.4 then. Thanks for the explanation!
Will
^ permalink raw reply
* [RFC PATCH] bpf: handle 32-bit zext during constant blinding
From: Naveen N. Rao @ 2019-08-13 17:10 UTC (permalink / raw)
To: Alexei Starovoitov, Daniel Borkmann, Jiong Wang
Cc: Michael Ellerman, bpf, linuxppc-dev, netdev, linux-kernel
Since BPF constant blinding is performed after the verifier pass, there
are certain ALU32 instructions inserted which don't have a corresponding
zext instruction inserted after. This is causing a kernel oops on
powerpc and can be reproduced by running 'test_cgroup_storage' with
bpf_jit_harden=2.
Fix this by emitting BPF_ZEXT during constant blinding if
prog->aux->verifier_zext is set.
Fixes: a4b1d3c1ddf6cb ("bpf: verifier: insert zero extension according to analysis result")
Reported-by: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
---
This approach (the location where zext is being introduced below, in
particular) works for powerpc, but I am not entirely sure if this is
sufficient for other architectures as well. This is broken on v5.3-rc4.
- Naveen
kernel/bpf/core.c | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8191a7db2777..d84146e6fd9e 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -890,7 +890,8 @@ int bpf_jit_get_func_addr(const struct bpf_prog *prog,
static int bpf_jit_blind_insn(const struct bpf_insn *from,
const struct bpf_insn *aux,
- struct bpf_insn *to_buff)
+ struct bpf_insn *to_buff,
+ bool emit_zext)
{
struct bpf_insn *to = to_buff;
u32 imm_rnd = get_random_int();
@@ -939,6 +940,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
*to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX);
+ if (emit_zext)
+ *to++ = BPF_ZEXT_REG(from->dst_reg);
break;
case BPF_ALU64 | BPF_ADD | BPF_K:
@@ -992,6 +995,10 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
off -= 2;
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm);
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ if (emit_zext) {
+ *to++ = BPF_ZEXT_REG(BPF_REG_AX);
+ off--;
+ }
*to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX,
off);
break;
@@ -1005,6 +1012,8 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. */
*to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm);
*to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd);
+ if (emit_zext)
+ *to++ = BPF_ZEXT_REG(BPF_REG_AX);
*to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX);
break;
@@ -1088,7 +1097,8 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
insn[1].code == 0)
memcpy(aux, insn, sizeof(aux));
- rewritten = bpf_jit_blind_insn(insn, aux, insn_buff);
+ rewritten = bpf_jit_blind_insn(insn, aux, insn_buff,
+ clone->aux->verifier_zext);
if (!rewritten)
continue;
--
2.22.0
^ permalink raw reply related
* [PATCH net-next] net: dsa: mv88e6xxx: check for mode change in port_setup_mac
From: Marek Behún @ 2019-08-13 17:12 UTC (permalink / raw)
To: netdev; +Cc: Andrew Lunn, Vivien Didelot, Heiner Kallweit, Marek Behún
The mv88e6xxx_port_setup_mac checks if the requested MAC settings are
different from the current ones, and if not, does nothing (since chaning
them requires putting the link down).
In this check it only looks if the triplet [link, speed, duplex] is
being changed.
This patch adds support to also check if the mode parameter (of type
phy_interface_t) is requested to be changed. The current mode is
computed by the ->port_link_state() method, and if it is different from
PHY_INTERFACE_MODE_NA, we check for equality with the requested mode.
In the implementations of the mv88e6250_port_link_state() method we set
the current mode to PHY_INTERFACE_MODE_NA - so the code does not check
for mode change on 6250.
In the mv88e6352_port_link_state() method, we use the cached cmode of
the port to determine the mode as phy_interface_t (and if it is not
enough, eg. for RGMII, we also look at the port control register for
RX/TX timings).
Signed-off-by: Marek Behún <marek.behun@nic.cz>
---
drivers/net/dsa/mv88e6xxx/chip.c | 4 +++-
drivers/net/dsa/mv88e6xxx/port.c | 40 +++++++++++++++++++++++++++++++-
drivers/net/dsa/mv88e6xxx/port.h | 1 +
3 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 818a83eb2dcb..9b3ad22a5b98 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -417,7 +417,9 @@ int mv88e6xxx_port_setup_mac(struct mv88e6xxx_chip *chip, int port, int link,
*/
if (state.link == link &&
state.speed == speed &&
- state.duplex == duplex)
+ state.duplex == duplex &&
+ (state.interface == mode ||
+ state.interface == PHY_INTERFACE_MODE_NA))
return 0;
/* Port's MAC control must not be changed unless the link is down */
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index 04309ef0a1cc..304e5b118b08 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -590,6 +590,7 @@ int mv88e6250_port_link_state(struct mv88e6xxx_chip *chip, int port,
state->link = !!(reg & MV88E6250_PORT_STS_LINK);
state->an_enabled = 1;
state->an_complete = state->link;
+ state->interface = PHY_INTERFACE_MODE_NA;
return 0;
}
@@ -598,12 +599,49 @@ int mv88e6352_port_link_state(struct mv88e6xxx_chip *chip, int port,
struct phylink_link_state *state)
{
int err;
- u16 reg;
+ u16 reg, mac;
err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_STS, ®);
if (err)
return err;
+ switch (chip->ports[port].cmode) {
+ case MV88E6XXX_PORT_STS_CMODE_RGMII:
+ err = mv88e6xxx_port_read(chip, port, MV88E6XXX_PORT_MAC_CTL,
+ &mac);
+ if (err)
+ return err;
+
+ if ((mac & MV88E6XXX_PORT_MAC_CTL_RGMII_DELAY_RXCLK) &&
+ (mac & MV88E6XXX_PORT_MAC_CTL_RGMII_DELAY_TXCLK))
+ state->interface = PHY_INTERFACE_MODE_RGMII_ID;
+ else if (mac & MV88E6XXX_PORT_MAC_CTL_RGMII_DELAY_RXCLK)
+ state->interface = PHY_INTERFACE_MODE_RGMII_RXID;
+ else if (mac & MV88E6XXX_PORT_MAC_CTL_RGMII_DELAY_TXCLK)
+ state->interface = PHY_INTERFACE_MODE_RGMII_TXID;
+ else
+ state->interface = PHY_INTERFACE_MODE_RGMII;
+ break;
+ case MV88E6XXX_PORT_STS_CMODE_1000BASE_X:
+ state->interface = PHY_INTERFACE_MODE_1000BASEX;
+ break;
+ case MV88E6XXX_PORT_STS_CMODE_SGMII:
+ state->interface = PHY_INTERFACE_MODE_SGMII;
+ break;
+ case MV88E6XXX_PORT_STS_CMODE_2500BASEX:
+ state->interface = PHY_INTERFACE_MODE_2500BASEX;
+ break;
+ case MV88E6XXX_PORT_STS_CMODE_XAUI:
+ state->interface = PHY_INTERFACE_MODE_XAUI;
+ break;
+ case MV88E6XXX_PORT_STS_CMODE_RXAUI:
+ state->interface = PHY_INTERFACE_MODE_RXAUI;
+ break;
+ default:
+ /* we do not support other cmode values here */
+ state->interface = PHY_INTERFACE_MODE_NA;
+ }
+
switch (reg & MV88E6XXX_PORT_STS_SPEED_MASK) {
case MV88E6XXX_PORT_STS_SPEED_10:
state->speed = SPEED_10;
diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
index ceec771f8bfc..1abf5ea033e2 100644
--- a/drivers/net/dsa/mv88e6xxx/port.h
+++ b/drivers/net/dsa/mv88e6xxx/port.h
@@ -42,6 +42,7 @@
#define MV88E6XXX_PORT_STS_TX_PAUSED 0x0020
#define MV88E6XXX_PORT_STS_FLOW_CTL 0x0010
#define MV88E6XXX_PORT_STS_CMODE_MASK 0x000f
+#define MV88E6XXX_PORT_STS_CMODE_RGMII 0x0007
#define MV88E6XXX_PORT_STS_CMODE_100BASE_X 0x0008
#define MV88E6XXX_PORT_STS_CMODE_1000BASE_X 0x0009
#define MV88E6XXX_PORT_STS_CMODE_SGMII 0x000a
--
2.21.0
^ permalink raw reply related
* Re: [PATCH bpf-next v3 2/4] bpf: support cloning sk storage on accept()
From: Stanislav Fomichev @ 2019-08-13 17:16 UTC (permalink / raw)
To: Yonghong Song
Cc: Stanislav Fomichev, netdev@vger.kernel.org, bpf@vger.kernel.org,
davem@davemloft.net, ast@kernel.org, daniel@iogearbox.net,
Martin Lau
In-Reply-To: <173b3736-97af-cad5-9432-8e6422a89d05@fb.com>
On 08/13, Yonghong Song wrote:
>
>
> On 8/13/19 9:26 AM, Stanislav Fomichev wrote:
> > Add new helper bpf_sk_storage_clone which optionally clones sk storage
> > and call it from sk_clone_lock.
> >
> > Cc: Martin KaFai Lau <kafai@fb.com>
> > Cc: Yonghong Song <yhs@fb.com>
> > Acked-by: Yonghong Song <yhs@fb.com>
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> > include/net/bpf_sk_storage.h | 10 ++++
> > include/uapi/linux/bpf.h | 3 +
> > net/core/bpf_sk_storage.c | 103 ++++++++++++++++++++++++++++++++++-
> > net/core/sock.c | 9 ++-
> > 4 files changed, 119 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
> > index b9dcb02e756b..8e4f831d2e52 100644
> > --- a/include/net/bpf_sk_storage.h
> > +++ b/include/net/bpf_sk_storage.h
> > @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
> > extern const struct bpf_func_proto bpf_sk_storage_get_proto;
> > extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
> >
> > +#ifdef CONFIG_BPF_SYSCALL
> > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
> > +#else
> > +static inline int bpf_sk_storage_clone(const struct sock *sk,
> > + struct sock *newsk)
> > +{
> > + return 0;
> > +}
> > +#endif
> > +
> > #endif /* _BPF_SK_STORAGE_H */
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 4393bd4b2419..0ef594ac3899 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -337,6 +337,9 @@ enum bpf_attach_type {
> > #define BPF_F_RDONLY_PROG (1U << 7)
> > #define BPF_F_WRONLY_PROG (1U << 8)
> >
> > +/* Clone map from listener for newly accepted socket */
> > +#define BPF_F_CLONE (1U << 9)
> > +
> > /* flags for BPF_PROG_QUERY */
> > #define BPF_F_QUERY_EFFECTIVE (1U << 0)
> >
> > diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> > index 94c7f77ecb6b..1bc7de7e18ba 100644
> > --- a/net/core/bpf_sk_storage.c
> > +++ b/net/core/bpf_sk_storage.c
> > @@ -12,6 +12,9 @@
> >
> > static atomic_t cache_idx;
> >
> > +#define SK_STORAGE_CREATE_FLAG_MASK \
> > + (BPF_F_NO_PREALLOC | BPF_F_CLONE)
> > +
> > struct bucket {
> > struct hlist_head list;
> > raw_spinlock_t lock;
> > @@ -209,7 +212,6 @@ static void selem_unlink_sk(struct bpf_sk_storage_elem *selem)
> > kfree_rcu(sk_storage, rcu);
> > }
> >
> > -/* sk_storage->lock must be held and sk_storage->list cannot be empty */
> > static void __selem_link_sk(struct bpf_sk_storage *sk_storage,
> > struct bpf_sk_storage_elem *selem)
> > {
> > @@ -509,7 +511,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
> > return 0;
> > }
> >
> > -/* Called by __sk_destruct() */
> > +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
> > void bpf_sk_storage_free(struct sock *sk)
> > {
> > struct bpf_sk_storage_elem *selem;
> > @@ -557,6 +559,11 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
> >
> > smap = (struct bpf_sk_storage_map *)map;
> >
> > + /* Note that this map might be concurrently cloned from
> > + * bpf_sk_storage_clone. Wait for any existing bpf_sk_storage_clone
> > + * RCU read section to finish before proceeding. New RCU
> > + * read sections should be prevented via bpf_map_inc_not_zero.
> > + */
> > synchronize_rcu();
> >
> > /* bpf prog and the userspace can no longer access this map
> > @@ -601,7 +608,9 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
> >
> > static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
> > {
> > - if (attr->map_flags != BPF_F_NO_PREALLOC || attr->max_entries ||
> > + if (attr->map_flags & ~SK_STORAGE_CREATE_FLAG_MASK ||
> > + !(attr->map_flags & BPF_F_NO_PREALLOC) ||
> > + attr->max_entries ||
> > attr->key_size != sizeof(int) || !attr->value_size ||
> > /* Enforce BTF for userspace sk dumping */
> > !attr->btf_key_type_id || !attr->btf_value_type_id)
> > @@ -739,6 +748,94 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
> > return err;
> > }
> >
> > +static struct bpf_sk_storage_elem *
> > +bpf_sk_storage_clone_elem(struct sock *newsk,
> > + struct bpf_sk_storage_map *smap,
> > + struct bpf_sk_storage_elem *selem)
> > +{
> > + struct bpf_sk_storage_elem *copy_selem;
> > +
> > + copy_selem = selem_alloc(smap, newsk, NULL, true);
> > + if (!copy_selem)
> > + return NULL;
> > +
> > + if (map_value_has_spin_lock(&smap->map))
> > + copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
> > + SDATA(selem)->data, true);
> > + else
> > + copy_map_value(&smap->map, SDATA(copy_selem)->data,
> > + SDATA(selem)->data);
> > +
> > + return copy_selem;
> > +}
> > +
> > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
> > +{
> > + struct bpf_sk_storage *new_sk_storage = NULL;
> > + struct bpf_sk_storage *sk_storage;
> > + struct bpf_sk_storage_elem *selem;
> > + int ret;
> > +
> > + RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > +
> > + rcu_read_lock();
> > + sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > +
> > + if (!sk_storage || hlist_empty(&sk_storage->list))
> > + goto out;
> > +
> > + hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
> > + struct bpf_sk_storage_elem *copy_selem;
> > + struct bpf_sk_storage_map *smap;
> > + struct bpf_map *map;
> > +
> > + smap = rcu_dereference(SDATA(selem)->smap);
> > + if (!(smap->map.map_flags & BPF_F_CLONE))
> > + continue;
> > +
> > + map = bpf_map_inc_not_zero(&smap->map, false);
> > + if (IS_ERR(map))
> > + continue;
> > +
> > + copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
> > + if (!copy_selem) {
> > + ret = -ENOMEM;
> > + bpf_map_put(map);
> > + goto err;
> > + }
> > +
> > + if (new_sk_storage) {
> > + selem_link_map(smap, copy_selem);
> > + __selem_link_sk(new_sk_storage, copy_selem);
> > + } else {
> > + ret = sk_storage_alloc(newsk, smap, copy_selem);
> > + if (ret) {
> > + kfree(copy_selem);
> > + atomic_sub(smap->elem_size,
> > + &newsk->sk_omem_alloc);
> > + bpf_map_put(map);
> > + goto err;
> > + }
> > +
> > + new_sk_storage = rcu_dereference(copy_selem->sk_storage);
> > + }
> > + bpf_map_put(map);
> > + }
> > +
> > +out:
> > + rcu_read_unlock();
> > + return 0;
> > +
> > +err:
> > + rcu_read_unlock();
> > +
> > + /* Don't free anything explicitly here, caller is responsible to
> > + * call bpf_sk_storage_free in case of an error.
> > + */
> > +
> > + return ret;
>
> A nit.
> If you set ret = 0 initially, you do not need the above two
> rcu_read_unlock(). One "return ret" should be enough.
> The comment can be changed to
> /* In case of an error, ... */
I thought about it, but decided to keep them separate because of
this comment. OTOH, adding "In case of an error," might help with
that. Can do another respin once Martin gets to review this
version (don't want to spam).
> > +}
> > +
> > BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> > void *, value, u64, flags)
> > {
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index d57b0cc995a0..f5e801a9cea4 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
> > goto out;
> > }
> > RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
> > -#ifdef CONFIG_BPF_SYSCALL
> > - RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > -#endif
> > +
> > + if (bpf_sk_storage_clone(sk, newsk)) {
> > + sk_free_unlock_clone(newsk);
> > + newsk = NULL;
> > + goto out;
> > + }
> >
> > newsk->sk_err = 0;
> > newsk->sk_err_soft = 0;
> >
^ permalink raw reply
* Re: general protection fault in tls_write_space
From: John Fastabend @ 2019-08-13 17:17 UTC (permalink / raw)
To: Hillf Danton, syzbot
Cc: aviadye, borisp, daniel, davejwatson, davem, jakub.kicinski,
john.fastabend, linux-kernel, netdev, oss-drivers, syzkaller-bugs,
willemb
In-Reply-To: <20190810135900.2820-1-hdanton@sina.com>
Hillf Danton wrote:
>
> On Sat, 10 Aug 2019 01:23:06 -0700
> >
> > syzbot has found a reproducer for the following crash on:
> >
> > HEAD commit: ca497fb6 taprio: remove unused variable 'entry_list_policy'
> > git tree: net-next
> > console output: https://syzkaller.appspot.com/x/log.txt?x=109f3802600000
> > kernel config: https://syzkaller.appspot.com/x/.config?x=d4cf1ffb87d590d7
> > dashboard link: https://syzkaller.appspot.com/bug?extid=dcdc9deefaec44785f32
> > compiler: gcc (GCC) 9.0.0 20181231 (experimental)
> > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=11c78cd2600000
> >
> > IMPORTANT: if you fix the bug, please add the following tag to the commit:
> > Reported-by: syzbot+dcdc9deefaec44785f32@syzkaller.appspotmail.com
> >
> > kasan: CONFIG_KASAN_INLINE enabled
> > kasan: GPF could be caused by NULL-ptr deref or user memory access
> > general protection fault: 0000 [#1] PREEMPT SMP KASAN
> > CPU: 0 PID: 9 Comm: ksoftirqd/0 Not tainted 5.3.0-rc3+ #125
> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> > Google 01/01/2011
> > RIP: 0010:tls_write_space+0x51/0x170 net/tls/tls_main.c:239
> > Code: c1 ea 03 80 3c 02 00 0f 85 26 01 00 00 49 8b 9c 24 b0 06 00 00 48 b8
> > 00 00 00 00 00 fc ff df 48 8d 7b 6a 48 89 fa 48 c1 ea 03 <0f> b6 04 02 48
> > 89 fa 83 e2 07 38 d0 7f 08 84 c0 0f 85 df 00 00 00
> > RSP: 0018:ffff8880a98b74c8 EFLAGS: 00010202
> > RAX: dffffc0000000000 RBX: 0000000000000000 RCX: ffffffff860a27a2
> > RDX: 000000000000000d RSI: ffffffff862c86c1 RDI: 000000000000006a
> > RBP: ffff8880a98b74e0 R08: ffff8880a98a2240 R09: fffffbfff167c289
> > R10: fffffbfff167c288 R11: ffffffff8b3e1447 R12: ffff8880a4de41c0
> > R13: ffff8880a4de45b8 R14: 000000000000000a R15: 0000000000000000
> > FS: 0000000000000000(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
> > CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> > CR2: 0000000000000000 CR3: 000000008c9d1000 CR4: 00000000001406f0
> > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> > DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
> > Call Trace:
> > tcp_new_space net/ipv4/tcp_input.c:5151 [inline]
> > tcp_check_space+0x191/0x760 net/ipv4/tcp_input.c:5162
> > tcp_data_snd_check net/ipv4/tcp_input.c:5172 [inline]
> > tcp_rcv_state_process+0xe24/0x4e48 net/ipv4/tcp_input.c:6303
> > tcp_v6_do_rcv+0x7d7/0x12c0 net/ipv6/tcp_ipv6.c:1381
> > tcp_v6_rcv+0x31f1/0x3500 net/ipv6/tcp_ipv6.c:1588
> > ip6_protocol_deliver_rcu+0x2fe/0x1660 net/ipv6/ip6_input.c:397
> > ip6_input_finish+0x84/0x170 net/ipv6/ip6_input.c:438
> > NF_HOOK include/linux/netfilter.h:305 [inline]
> > NF_HOOK include/linux/netfilter.h:299 [inline]
> > ip6_input+0xe4/0x3f0 net/ipv6/ip6_input.c:447
> > dst_input include/net/dst.h:442 [inline]
> > ip6_rcv_finish+0x1de/0x2f0 net/ipv6/ip6_input.c:76
> > NF_HOOK include/linux/netfilter.h:305 [inline]
> > NF_HOOK include/linux/netfilter.h:299 [inline]
> > ipv6_rcv+0x10e/0x420 net/ipv6/ip6_input.c:272
> > __netif_receive_skb_one_core+0x113/0x1a0 net/core/dev.c:5006
> > __netif_receive_skb+0x2c/0x1d0 net/core/dev.c:5120
> > process_backlog+0x206/0x750 net/core/dev.c:5951
> > napi_poll net/core/dev.c:6388 [inline]
> > net_rx_action+0x4d6/0x1080 net/core/dev.c:6456
> > __do_softirq+0x262/0x98c kernel/softirq.c:292
> > run_ksoftirqd kernel/softirq.c:603 [inline]
> > run_ksoftirqd+0x8e/0x110 kernel/softirq.c:595
> > smpboot_thread_fn+0x6a3/0xa40 kernel/smpboot.c:165
> > kthread+0x361/0x430 kernel/kthread.c:255
> > ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
> > Modules linked in:
> > ---[ end trace c21a83505707bb9d ]---
>
> Followup of commit 95fa145479fb
> ("bpf: sockmap/tls, close can race with map free")
>
> --- a/net/tls/tls_main.c
> +++ b/net/tls/tls_main.c
> @@ -308,6 +308,9 @@ static void tls_sk_proto_close(struct so
> if (free_ctx)
> icsk->icsk_ulp_data = NULL;
> sk->sk_prot = ctx->sk_proto;
> + /* tls will go; restore sock callback before enabling bh */
> + if (sk->sk_write_space == tls_write_space)
> + sk->sk_write_space = ctx->sk_write_space;
> write_unlock_bh(&sk->sk_callback_lock);
> release_sock(sk);
> if (ctx->tx_conf == TLS_SW)
Hi Hillf,
We need this patch (although slightly updated for bpf tree) do
you want to send it? Otherwise I can. We should only set this if
TX path was enabled otherwise we null it. Checking against
tls_write_space seems best to me as well.
Against bpf this patch should fix it.
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index ce6ef56a65ef..43252a801c3f 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -308,7 +308,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
if (free_ctx)
icsk->icsk_ulp_data = NULL;
sk->sk_prot = ctx->sk_proto;
- sk->sk_write_space = ctx->sk_write_space;
+ if (sk->sk_write_space == tls_write_space)
+ sk->sk_write_space = ctx->sk_write_space;
write_unlock_bh(&sk->sk_callback_lock);
release_sock(sk);
if (ctx->tx_conf == TLS_SW)
^ permalink raw reply related
* Re: general protection fault in tls_write_space
From: Jakub Kicinski @ 2019-08-13 17:27 UTC (permalink / raw)
To: John Fastabend
Cc: Hillf Danton, syzbot, aviadye, borisp, daniel, davejwatson, davem,
linux-kernel, netdev, oss-drivers, syzkaller-bugs, willemb
In-Reply-To: <5d52f09299e91_40c72adb748b25c0d3@john-XPS-13-9370.notmuch>
On Tue, 13 Aug 2019 10:17:06 -0700, John Fastabend wrote:
> > Followup of commit 95fa145479fb
> > ("bpf: sockmap/tls, close can race with map free")
> >
> > --- a/net/tls/tls_main.c
> > +++ b/net/tls/tls_main.c
> > @@ -308,6 +308,9 @@ static void tls_sk_proto_close(struct so
> > if (free_ctx)
> > icsk->icsk_ulp_data = NULL;
> > sk->sk_prot = ctx->sk_proto;
> > + /* tls will go; restore sock callback before enabling bh */
> > + if (sk->sk_write_space == tls_write_space)
> > + sk->sk_write_space = ctx->sk_write_space;
> > write_unlock_bh(&sk->sk_callback_lock);
> > release_sock(sk);
> > if (ctx->tx_conf == TLS_SW)
>
> Hi Hillf,
>
> We need this patch (although slightly updated for bpf tree) do
> you want to send it? Otherwise I can. We should only set this if
> TX path was enabled otherwise we null it. Checking against
> tls_write_space seems best to me as well.
>
> Against bpf this patch should fix it.
>
> diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
> index ce6ef56a65ef..43252a801c3f 100644
> --- a/net/tls/tls_main.c
> +++ b/net/tls/tls_main.c
> @@ -308,7 +308,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
> if (free_ctx)
> icsk->icsk_ulp_data = NULL;
> sk->sk_prot = ctx->sk_proto;
> - sk->sk_write_space = ctx->sk_write_space;
> + if (sk->sk_write_space == tls_write_space)
> + sk->sk_write_space = ctx->sk_write_space;
> write_unlock_bh(&sk->sk_callback_lock);
> release_sock(sk);
> if (ctx->tx_conf == TLS_SW)
This is already in net since Friday:
commit 57c722e932cfb82e9820bbaae1b1f7222ea97b52
Author: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Fri Aug 9 18:36:23 2019 -0700
net/tls: swap sk_write_space on close
Now that we swap the original proto and clear the ULP pointer
on close we have to make sure no callback will try to access
the freed state. sk_write_space is not part of sk_prot, remember
to swap it.
Reported-by: syzbot+dcdc9deefaec44785f32@syzkaller.appspotmail.com
Fixes: 95fa145479fb ("bpf: sockmap/tls, close can race with map free")
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 9cbbae606ced..ce6ef56a65ef 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -308,6 +308,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
if (free_ctx)
icsk->icsk_ulp_data = NULL;
sk->sk_prot = ctx->sk_proto;
+ sk->sk_write_space = ctx->sk_write_space;
write_unlock_bh(&sk->sk_callback_lock);
release_sock(sk);
if (ctx->tx_conf == TLS_SW)
^ permalink raw reply related
* Re: [PATCH bpf-next 1/3] libbpf: add asm/unistd.h to xsk to get __NR_mmap2
From: Jonathan Lemon @ 2019-08-13 17:36 UTC (permalink / raw)
To: Ivan Khoronzhuk
Cc: magnus.karlsson, bjorn.topel, davem, hawk, john.fastabend,
jakub.kicinski, daniel, netdev, bpf, xdp-newbies, linux-kernel
In-Reply-To: <20190813102318.5521-2-ivan.khoronzhuk@linaro.org>
On 13 Aug 2019, at 3:23, Ivan Khoronzhuk wrote:
> That's needed to get __NR_mmap2 when mmap2 syscall is used.
>
> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
Acked-by: Jonathan Lemon <jonathan.lemon@gmail.com>
^ permalink raw reply
* Re: [PATCH net] netdevsim: Restore per-network namespace accounting for fib entries
From: David Miller @ 2019-08-13 17:40 UTC (permalink / raw)
To: jiri; +Cc: dsahern, netdev, dsahern
In-Reply-To: <20190813071445.GL2428@nanopsycho>
From: Jiri Pirko <jiri@resnulli.us>
Date: Tue, 13 Aug 2019 09:14:45 +0200
> Mon, Aug 12, 2019 at 05:28:02PM CEST, davem@davemloft.net wrote:
>>From: Jiri Pirko <jiri@resnulli.us>
>>Date: Mon, 12 Aug 2019 10:36:35 +0200
>>
>>> I understand it with real devices, but dummy testing device, who's
>>> purpose is just to test API. Why?
>>
>>Because you'll break all of the wonderful testing infrastructure
>>people like David have created.
>
> Are you referring to selftests? There is no such test there :(
> But if it would be, could implement the limitations
> properly (like using cgroups), change the tests and remove this
> code from netdevsim?
What about older kernels? Can't you see how illogical this is?
^ permalink raw reply
* Re: [PATCH bpf-next 3/3] samples: bpf: syscal_nrs: use mmap2 if defined
From: Jonathan Lemon @ 2019-08-13 17:41 UTC (permalink / raw)
To: Ivan Khoronzhuk
Cc: magnus.karlsson, bjorn.topel, davem, hawk, john.fastabend,
jakub.kicinski, daniel, netdev, bpf, xdp-newbies, linux-kernel
In-Reply-To: <20190813102318.5521-4-ivan.khoronzhuk@linaro.org>
On 13 Aug 2019, at 3:23, Ivan Khoronzhuk wrote:
> For arm32 xdp sockets mmap2 is preferred, so use it if it's defined.
>
> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
Doesn't this change the application API?
--
Jonathan
> ---
> samples/bpf/syscall_nrs.c | 5 +++++
> samples/bpf/tracex5_kern.c | 11 +++++++++++
> 2 files changed, 16 insertions(+)
>
> diff --git a/samples/bpf/syscall_nrs.c b/samples/bpf/syscall_nrs.c
> index 516e255cbe8f..2dec94238350 100644
> --- a/samples/bpf/syscall_nrs.c
> +++ b/samples/bpf/syscall_nrs.c
> @@ -9,5 +9,10 @@ void syscall_defines(void)
> COMMENT("Linux system call numbers.");
> SYSNR(__NR_write);
> SYSNR(__NR_read);
> +#ifdef __NR_mmap2
> + SYSNR(__NR_mmap2);
> +#else
> SYSNR(__NR_mmap);
> +#endif
> +
> }
> diff --git a/samples/bpf/tracex5_kern.c b/samples/bpf/tracex5_kern.c
> index f57f4e1ea1ec..300350ad299a 100644
> --- a/samples/bpf/tracex5_kern.c
> +++ b/samples/bpf/tracex5_kern.c
> @@ -68,12 +68,23 @@ PROG(SYS__NR_read)(struct pt_regs *ctx)
> return 0;
> }
>
> +#ifdef __NR_mmap2
> +PROG(SYS__NR_mmap2)(struct pt_regs *ctx)
> +{
> + char fmt[] = "mmap2\n";
> +
> + bpf_trace_printk(fmt, sizeof(fmt));
> + return 0;
> +}
> +#else
> PROG(SYS__NR_mmap)(struct pt_regs *ctx)
> {
> char fmt[] = "mmap\n";
> +
> bpf_trace_printk(fmt, sizeof(fmt));
> return 0;
> }
> +#endif
>
> char _license[] SEC("license") = "GPL";
> u32 _version SEC("version") = LINUX_VERSION_CODE;
> --
> 2.17.1
^ permalink raw reply
* Re: [PATCH bpf-next 2/3] xdp: xdp_umem: replace kmap on vmap for umem map
From: Jonathan Lemon @ 2019-08-13 17:42 UTC (permalink / raw)
To: Ivan Khoronzhuk
Cc: magnus.karlsson, bjorn.topel, davem, hawk, john.fastabend,
jakub.kicinski, daniel, netdev, bpf, xdp-newbies, linux-kernel
In-Reply-To: <20190813102318.5521-3-ivan.khoronzhuk@linaro.org>
On 13 Aug 2019, at 3:23, Ivan Khoronzhuk wrote:
> For 64-bit there is no reason to use vmap/vunmap, so use page_address
> as it was initially. For 32 bits, in some apps, like in samples
> xdpsock_user.c when number of pgs in use is quite big, the kmap
> memory can be not enough, despite on this, kmap looks like is
> deprecated in such cases as it can block and should be used rather
> for dynamic mm.
>
> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
Seems a bit overkill - if not high memory, kmap() falls back
to just page_address(), unlike vmap().
--
Jonathan
> ---
> net/xdp/xdp_umem.c | 16 ++++++++++++----
> 1 file changed, 12 insertions(+), 4 deletions(-)
>
> diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
> index a0607969f8c0..907c9019fe21 100644
> --- a/net/xdp/xdp_umem.c
> +++ b/net/xdp/xdp_umem.c
> @@ -14,7 +14,7 @@
> #include <linux/netdevice.h>
> #include <linux/rtnetlink.h>
> #include <linux/idr.h>
> -#include <linux/highmem.h>
> +#include <linux/vmalloc.h>
>
> #include "xdp_umem.h"
> #include "xsk_queue.h"
> @@ -167,10 +167,12 @@ void xdp_umem_clear_dev(struct xdp_umem *umem)
>
> static void xdp_umem_unmap_pages(struct xdp_umem *umem)
> {
> +#if BITS_PER_LONG == 32
> unsigned int i;
>
> for (i = 0; i < umem->npgs; i++)
> - kunmap(umem->pgs[i]);
> + vunmap(umem->pages[i].addr);
> +#endif
> }
>
> static void xdp_umem_unpin_pages(struct xdp_umem *umem)
> @@ -378,8 +380,14 @@ static int xdp_umem_reg(struct xdp_umem *umem,
> struct xdp_umem_reg *mr)
> goto out_account;
> }
>
> - for (i = 0; i < umem->npgs; i++)
> - umem->pages[i].addr = kmap(umem->pgs[i]);
> + for (i = 0; i < umem->npgs; i++) {
> +#if BITS_PER_LONG == 32
> + umem->pages[i].addr = vmap(&umem->pgs[i], 1, VM_MAP,
> + PAGE_KERNEL);
> +#else
> + umem->pages[i].addr = page_address(umem->pgs[i]);
> +#endif
> + }
>
> return 0;
>
> --
> 2.17.1
^ permalink raw reply
* [PATCH net-next] page_pool: fix logic in __page_pool_get_cached
From: Jonathan Lemon @ 2019-08-13 17:45 UTC (permalink / raw)
To: netdev, davem
Cc: brouer, ilias.apalodimas, saeedm, ttoukan.linux, kernel-team
__page_pool_get_cached() will return NULL when the ring is
empty, even if there are pages present in the lookaside cache.
It is also possible to refill the cache, and then return a
NULL page.
Restructure the logic so eliminate both cases.
Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
---
net/core/page_pool.c | 39 ++++++++++++++++-----------------------
1 file changed, 16 insertions(+), 23 deletions(-)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 68510eb869ea..de09a74a39a4 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -82,12 +82,9 @@ EXPORT_SYMBOL(page_pool_create);
static struct page *__page_pool_get_cached(struct page_pool *pool)
{
struct ptr_ring *r = &pool->ring;
+ bool refill = false;
struct page *page;
- /* Quicker fallback, avoid locks when ring is empty */
- if (__ptr_ring_empty(r))
- return NULL;
-
/* Test for safe-context, caller should provide this guarantee */
if (likely(in_serving_softirq())) {
if (likely(pool->alloc.count)) {
@@ -95,27 +92,23 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
page = pool->alloc.cache[--pool->alloc.count];
return page;
}
- /* Slower-path: Alloc array empty, time to refill
- *
- * Open-coded bulk ptr_ring consumer.
- *
- * Discussion: the ring consumer lock is not really
- * needed due to the softirq/NAPI protection, but
- * later need the ability to reclaim pages on the
- * ring. Thus, keeping the locks.
- */
- spin_lock(&r->consumer_lock);
- while ((page = __ptr_ring_consume(r))) {
- if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
- break;
- pool->alloc.cache[pool->alloc.count++] = page;
- }
- spin_unlock(&r->consumer_lock);
- return page;
+ refill = true;
}
- /* Slow-path: Get page from locked ring queue */
- page = ptr_ring_consume(&pool->ring);
+ /* Quicker fallback, avoid locks when ring is empty */
+ if (__ptr_ring_empty(r))
+ return NULL;
+
+ /* Slow-path: Get page from locked ring queue,
+ * refill alloc array if requested.
+ */
+ spin_lock(&r->consumer_lock);
+ page = __ptr_ring_consume(r);
+ if (refill)
+ pool->alloc.count = __ptr_ring_consume_batched(r,
+ pool->alloc.cache,
+ PP_ALLOC_CACHE_REFILL);
+ spin_unlock(&r->consumer_lock);
return page;
}
--
2.17.1
^ permalink raw reply related
* Re: [patch net-next v3 0/3] net: devlink: Finish network namespace support
From: Jakub Kicinski @ 2019-08-13 17:45 UTC (permalink / raw)
To: David Ahern; +Cc: Jiri Pirko, netdev, davem, stephen, mlxsw
In-Reply-To: <a9fa6f7f-7981-6077-106d-fa2abfc7397c@gmail.com>
On Mon, 12 Aug 2019 19:46:57 -0600, David Ahern wrote:
> On 8/12/19 7:11 PM, Jakub Kicinski wrote:
> > If the devlink instance just disappeared - that'd be a very very strange
> > thing. Only software objects disappear with the namespace.
> > Netdevices without ->rtnl_link_ops go back to init_net.
>
> netdevsim still has rtnl_link_ops:
>
> static struct rtnl_link_ops nsim_link_ops __read_mostly = {
> .kind = DRV_NAME,
> .validate = nsim_validate,
> };
The test harness is the only devlink instance which may
conceivably have link ops. And implementing the behaviour
you ask for would require core changes.
We are back to the precedent by test harness argument :(
^ permalink raw reply
* Re: [RFC PATCH 4/5] PTP: Add flag for non-periodic output
From: Richard Cochran @ 2019-08-13 17:48 UTC (permalink / raw)
To: Felipe Balbi
Cc: netdev, Thomas Gleixner, Ingo Molnar, Borislav Petkov,
H . Peter Anvin, x86, linux-kernel, Christopher S . Hall
In-Reply-To: <87tvalxzzi.fsf@gmail.com>
On Tue, Aug 13, 2019 at 10:53:53AM +0300, Felipe Balbi wrote:
> before I send a new series built on top of this change, I thought I'd
> check with you if I'm on the right path. Below you can find my current
> take at the new IOCTLs. I maintained the same exact structures so that
> there's no maintenance burden. Also introduce a new IOCTL for every
> single one of the previously existing ones even though not all of them
> needed changes. The reason for that was just to make it easier for
> libary authors to update their library by a simple sed script adding '2'
> to the end of the IOCTL macro.
Sounds good. I have a few comments, below...
> Let me know if you want anything to be changed or had a different idea
> about any of this. Also, if you prefer that I finish the entire series
> before you review, no worries either ;-)
>
> Cheers, patch follows:
>
> From bc2aa511d4c2e2228590fb29604c6c33b56527ad Mon Sep 17 00:00:00 2001
> From: Felipe Balbi <felipe.balbi@linux.intel.com>
> Date: Tue, 13 Aug 2019 10:32:35 +0300
> Subject: [PATCH] PTP: introduce new versions of IOCTLs
>
> The current version of the IOCTL have a small problem which prevents
> us from extending the API by making use of reserved fields. In these
> new IOCTLs, we are now making sure that flags and rsv fields are zero
> which will allow us to extend the API in the future.
>
> Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
> ---
> drivers/ptp/ptp_chardev.c | 105 +++++++++++++++++++++++++++++++++
> include/uapi/linux/ptp_clock.h | 12 ++++
> 2 files changed, 117 insertions(+)
>
> diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
> index 18ffe449efdf..94775073527b 100644
> --- a/drivers/ptp/ptp_chardev.c
> +++ b/drivers/ptp/ptp_chardev.c
> @@ -126,6 +126,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
> switch (cmd) {
>
> case PTP_CLOCK_GETCAPS:
> + case PTP_CLOCK_GETCAPS2:
> memset(&caps, 0, sizeof(caps));
> caps.max_adj = ptp->info->max_adj;
> caps.n_alarm = ptp->info->n_alarm;
> @@ -153,6 +154,28 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
> err = ops->enable(ops, &req, enable);
> break;
>
> + case PTP_EXTTS_REQUEST2:
> + memset(&req, 0, sizeof(req));
This memset not needed, AFAICT. Oh wait, you want to keep drivers
from seeing stack data in the unused parts of the union. That is
fine, but please just do that unconditionally at the top of the
function.
> + if (copy_from_user(&req.extts, (void __user *)arg,
> + sizeof(req.extts))) {
> + err = -EFAULT;
> + break;
> + }
> + if (req.extts.flags || req.extts.rsv[0]
> + || req.extts.rsv[1]) {
> + err = -EINVAL;
Since the code is mostly the same as in the PTP_EXTTS_REQUEST case,
maybe just double up the case statements (like in the other) and add
an extra test for (cmd == PTP_EXTTS_REQUEST2) for this if-block.
> + break;
> + }
> +
> + if (req.extts.index >= ops->n_ext_ts) {
> + err = -EINVAL;
> + break;
> + }
> + req.type = PTP_CLK_REQ_EXTTS;
> + enable = req.extts.flags & PTP_ENABLE_FEATURE ? 1 : 0;
> + err = ops->enable(ops, &req, enable);
> + break;
> +
> case PTP_PEROUT_REQUEST:
> if (copy_from_user(&req.perout, (void __user *)arg,
> sizeof(req.perout))) {
> @@ -168,6 +191,28 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
> err = ops->enable(ops, &req, enable);
> break;
>
> + case PTP_PEROUT_REQUEST2:
> + memset(&req, 0, sizeof(req));
> + if (copy_from_user(&req.perout, (void __user *)arg,
> + sizeof(req.perout))) {
> + err = -EFAULT;
> + break;
> + }
> + if (req.perout.flags || req.perout.rsv[0]
> + || req.perout.rsv[1] || req.perout.rsv[2]
> + || req.perout.rsv[3]) {
> + err = -EINVAL;
> + break;
> + }
Also this could share code with the legacy ioctl.
> + if (req.perout.index >= ops->n_per_out) {
> + err = -EINVAL;
> + break;
> + }
> + req.type = PTP_CLK_REQ_PEROUT;
> + enable = req.perout.period.sec || req.perout.period.nsec;
> + err = ops->enable(ops, &req, enable);
> + break;
> +
> case PTP_ENABLE_PPS:
> if (!capable(CAP_SYS_TIME))
> return -EPERM;
> @@ -176,7 +221,17 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
> err = ops->enable(ops, &req, enable);
> break;
>
> + case PTP_ENABLE_PPS2:
> + if (!capable(CAP_SYS_TIME))
> + return -EPERM;
> + memset(&req, 0, sizeof(req));
Clearing 'req' unconditionally will make this case the same as the
legacy case.
> + req.type = PTP_CLK_REQ_PPS;
> + enable = arg ? 1 : 0;
> + err = ops->enable(ops, &req, enable);
> + break;
> +
> case PTP_SYS_OFFSET_PRECISE:
> + case PTP_SYS_OFFSET_PRECISE2:
> if (!ptp->info->getcrosststamp) {
> err = -EOPNOTSUPP;
> break;
> @@ -201,6 +256,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
> break;
>
> case PTP_SYS_OFFSET_EXTENDED:
> + case PTP_SYS_OFFSET_EXTENDED2:
> if (!ptp->info->gettimex64) {
> err = -EOPNOTSUPP;
> break;
> @@ -232,6 +288,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
> break;
>
> case PTP_SYS_OFFSET:
> + case PTP_SYS_OFFSET2:
> sysoff = memdup_user((void __user *)arg, sizeof(*sysoff));
> if (IS_ERR(sysoff)) {
> err = PTR_ERR(sysoff);
> @@ -284,6 +341,31 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
> err = -EFAULT;
> break;
>
> + case PTP_PIN_GETFUNC2:
> + memset(&pd, 0, sizeof(pd));
This memset is pointless because of the following copy_from_user().
> + if (copy_from_user(&pd, (void __user *)arg, sizeof(pd))) {
> + err = -EFAULT;
> + break;
> + }
> + if (pd.rsv[0] || pd.rsv[1] || pd.rsv[2]
> + || pd.rsv[3] || pd.rsv[4]) {
> + err = -EINVAL;
> + break;
> + }
Again maybe share the code?
> + pin_index = pd.index;
> + if (pin_index >= ops->n_pins) {
> + err = -EINVAL;
> + break;
> + }
> + pin_index = array_index_nospec(pin_index, ops->n_pins);
> + if (mutex_lock_interruptible(&ptp->pincfg_mux))
> + return -ERESTARTSYS;
> + pd = ops->pin_config[pin_index];
> + mutex_unlock(&ptp->pincfg_mux);
> + if (!err && copy_to_user((void __user *)arg, &pd, sizeof(pd)))
> + err = -EFAULT;
> + break;
> +
> case PTP_PIN_SETFUNC:
> if (copy_from_user(&pd, (void __user *)arg, sizeof(pd))) {
> err = -EFAULT;
> @@ -301,6 +383,29 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
> mutex_unlock(&ptp->pincfg_mux);
> break;
>
> + case PTP_PIN_SETFUNC2:
> + memset(&pd, 0, sizeof(pd));
memset not needed here either.
> + if (copy_from_user(&pd, (void __user *)arg, sizeof(pd))) {
> + err = -EFAULT;
> + break;
> + }
> + if (pd.rsv[0] || pd.rsv[1] || pd.rsv[2]
> + || pd.rsv[3] || pd.rsv[4]) {
> + err = -EINVAL;
> + break;
> + }
also shareable.
Thanks,
Richard
> + pin_index = pd.index;
> + if (pin_index >= ops->n_pins) {
> + err = -EINVAL;
> + break;
> + }
> + pin_index = array_index_nospec(pin_index, ops->n_pins);
> + if (mutex_lock_interruptible(&ptp->pincfg_mux))
> + return -ERESTARTSYS;
> + err = ptp_set_pinfunc(ptp, pin_index, pd.func, pd.chan);
> + mutex_unlock(&ptp->pincfg_mux);
> + break;
> +
> default:
> err = -ENOTTY;
> break;
> diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
> index 1bc794ad957a..039cd62ec706 100644
> --- a/include/uapi/linux/ptp_clock.h
> +++ b/include/uapi/linux/ptp_clock.h
> @@ -149,6 +149,18 @@ struct ptp_pin_desc {
> #define PTP_SYS_OFFSET_EXTENDED \
> _IOWR(PTP_CLK_MAGIC, 9, struct ptp_sys_offset_extended)
>
> +#define PTP_CLOCK_GETCAPS2 _IOR(PTP_CLK_MAGIC, 10, struct ptp_clock_caps)
> +#define PTP_EXTTS_REQUEST2 _IOW(PTP_CLK_MAGIC, 11, struct ptp_extts_request)
> +#define PTP_PEROUT_REQUEST2 _IOW(PTP_CLK_MAGIC, 12, struct ptp_perout_request)
> +#define PTP_ENABLE_PPS2 _IOW(PTP_CLK_MAGIC, 13, int)
> +#define PTP_SYS_OFFSET2 _IOW(PTP_CLK_MAGIC, 14, struct ptp_sys_offset)
> +#define PTP_PIN_GETFUNC2 _IOWR(PTP_CLK_MAGIC, 15, struct ptp_pin_desc)
> +#define PTP_PIN_SETFUNC2 _IOW(PTP_CLK_MAGIC, 16, struct ptp_pin_desc)
> +#define PTP_SYS_OFFSET_PRECISE2 \
> + _IOWR(PTP_CLK_MAGIC, 17, struct ptp_sys_offset_precise)
> +#define PTP_SYS_OFFSET_EXTENDED2 \
> + _IOWR(PTP_CLK_MAGIC, 18, struct ptp_sys_offset_extended)
> +
> struct ptp_extts_event {
> struct ptp_clock_time t; /* Time event occured. */
> unsigned int index; /* Which channel produced the event. */
> --
> 2.22.0
>
>
>
> --
> balbi
^ permalink raw reply
* Re: [RFC PATCH 0/5] PTP: add support for Intel's TGPIO controller
From: Richard Cochran @ 2019-08-13 17:49 UTC (permalink / raw)
To: Felipe Balbi
Cc: Andrew Lunn, netdev, Thomas Gleixner, Ingo Molnar,
Borislav Petkov, H . Peter Anvin, x86, linux-kernel,
Christopher S . Hall
In-Reply-To: <87wofhy05t.fsf@gmail.com>
On Tue, Aug 13, 2019 at 10:50:06AM +0300, Felipe Balbi wrote:
> If we do that we make it difficult for those reading specification and
> trying to find the matching driver.
+1
Thanks,
Richard
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox