* [PATCH net-next 09/10] selftests: udp gso with corking
From: Willem de Bruijn @ 2018-04-25 18:56 UTC (permalink / raw)
To: netdev; +Cc: davem, alexander.duyck, dmichail, Willem de Bruijn
In-Reply-To: <20180425185601.55511-1-willemdebruijn.kernel@gmail.com>
From: Willem de Bruijn <willemb@google.com>
Corked sockets take a different path to construct a udp datagram than
the lockless fast path. Test this alternate path.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
tools/testing/selftests/net/udpgso.c | 42 ++++++++++++++++++++-------
tools/testing/selftests/net/udpgso.sh | 6 ++++
2 files changed, 38 insertions(+), 10 deletions(-)
diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
index a47dca025346..6b1998b9f7df 100644
--- a/tools/testing/selftests/net/udpgso.c
+++ b/tools/testing/selftests/net/udpgso.c
@@ -49,6 +49,7 @@ static bool cfg_do_ipv4;
static bool cfg_do_ipv6;
static bool cfg_do_connected;
static bool cfg_do_connectionless;
+static bool cfg_do_msgmore;
static bool cfg_do_setsockopt;
static int cfg_specific_test_id = -1;
@@ -369,6 +370,23 @@ static void set_route_mtu(int mtu, bool is_ipv4)
fprintf(stderr, "route mtu (test): %u\n", mtu);
}
+static bool __send_one(int fd, struct msghdr *msg, int flags)
+{
+ int ret;
+
+ ret = sendmsg(fd, msg, flags);
+ if (ret == -1 && (errno == EMSGSIZE || errno == ENOMEM))
+ return false;
+ if (ret == -1)
+ error(1, errno, "sendmsg");
+ if (ret != msg->msg_iov->iov_len)
+ error(1, 0, "sendto: %d != %lu", ret, msg->msg_iov->iov_len);
+ if (msg->msg_flags)
+ error(1, 0, "sendmsg: return flags 0x%x\n", msg->msg_flags);
+
+ return true;
+}
+
static bool send_one(int fd, int len, int gso_len,
struct sockaddr *addr, socklen_t alen)
{
@@ -376,7 +394,6 @@ static bool send_one(int fd, int len, int gso_len,
struct msghdr msg = {0};
struct iovec iov = {0};
struct cmsghdr *cm;
- int ret;
iov.iov_base = buf;
iov.iov_len = len;
@@ -398,15 +415,17 @@ static bool send_one(int fd, int len, int gso_len,
*((uint16_t *) CMSG_DATA(cm)) = gso_len;
}
- ret = sendmsg(fd, &msg, 0);
- if (ret == -1 && (errno == EMSGSIZE || errno == ENOMEM))
- return false;
- if (ret == -1)
- error(1, errno, "sendmsg");
- if (ret != len)
- error(1, 0, "sendto: %d != %u", ret, len);
+ /* If MSG_MORE, send 1 byte followed by remainder */
+ if (cfg_do_msgmore && len > 1) {
+ iov.iov_len = 1;
+ if (!__send_one(fd, &msg, MSG_MORE))
+ error(1, 0, "send 1B failed");
- return true;
+ iov.iov_base++;
+ iov.iov_len = len - 1;
+ }
+
+ return __send_one(fd, &msg, 0);
}
static int recv_one(int fd, int flags)
@@ -558,7 +577,7 @@ static void parse_opts(int argc, char **argv)
{
int c;
- while ((c = getopt(argc, argv, "46cCst:")) != -1) {
+ while ((c = getopt(argc, argv, "46cCmst:")) != -1) {
switch (c) {
case '4':
cfg_do_ipv4 = true;
@@ -572,6 +591,9 @@ static void parse_opts(int argc, char **argv)
case 'C':
cfg_do_connectionless = true;
break;
+ case 'm':
+ cfg_do_msgmore = true;
+ break;
case 's':
cfg_do_setsockopt = true;
break;
diff --git a/tools/testing/selftests/net/udpgso.sh b/tools/testing/selftests/net/udpgso.sh
index 7cdf0e7c1dde..fec24f584fe9 100755
--- a/tools/testing/selftests/net/udpgso.sh
+++ b/tools/testing/selftests/net/udpgso.sh
@@ -21,3 +21,9 @@ echo "ipv4 connected"
# blocked on 2nd loopback address
# echo "ipv6 connected"
# ./in_netns.sh ./udpgso -6 -c
+
+echo "ipv4 msg_more"
+./in_netns.sh ./udpgso -4 -C -m
+
+echo "ipv6 msg_more"
+./in_netns.sh ./udpgso -6 -C -m
--
2.17.0.441.gb46fe60e1d-goog
^ permalink raw reply related
* [PATCH net-next 10/10] selftests: udp gso benchmark
From: Willem de Bruijn @ 2018-04-25 18:56 UTC (permalink / raw)
To: netdev; +Cc: davem, alexander.duyck, dmichail, Willem de Bruijn
In-Reply-To: <20180425185601.55511-1-willemdebruijn.kernel@gmail.com>
From: Willem de Bruijn <willemb@google.com>
Send udp data between a source and sink, optionally with udp gso.
The two processes are expected to be run on separate hosts.
A script is included that runs them together over loopback in a
single namespace for functionality testing.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
tools/testing/selftests/net/.gitignore | 2 +
tools/testing/selftests/net/Makefile | 3 +-
tools/testing/selftests/net/udpgso_bench.sh | 74 +++
tools/testing/selftests/net/udpgso_bench_rx.c | 265 +++++++++++
tools/testing/selftests/net/udpgso_bench_tx.c | 420 ++++++++++++++++++
5 files changed, 763 insertions(+), 1 deletion(-)
create mode 100755 tools/testing/selftests/net/udpgso_bench.sh
create mode 100644 tools/testing/selftests/net/udpgso_bench_rx.c
create mode 100644 tools/testing/selftests/net/udpgso_bench_tx.c
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 67a26240407c..f0e6c35a93ae 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -9,3 +9,5 @@ reuseport_dualstack
reuseaddr_conflict
tcp_mmap
udpgso
+udpgso_bench_rx
+udpgso_bench_tx
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index db3303348315..df9102ec7b7a 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -6,12 +6,13 @@ CFLAGS += -I../../../../usr/include/
TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
TEST_PROGS += fib_tests.sh fib-onlink-tests.sh in_netns.sh pmtu.sh udpgso.sh
+TEST_PROGS += udpgso_bench.sh
TEST_GEN_FILES = socket
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
TEST_GEN_FILES += tcp_mmap
TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict
-TEST_GEN_PROGS += udpgso
+TEST_GEN_PROGS += udpgso udpgso_bench_tx udpgso_bench_rx
include ../lib.mk
diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh
new file mode 100755
index 000000000000..792fa4d0285e
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso_bench.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of udpgso benchmarks
+
+wake_children() {
+ local -r jobs="$(jobs -p)"
+
+ if [[ "${jobs}" != "" ]]; then
+ kill -1 ${jobs} 2>/dev/null
+ fi
+}
+trap wake_children EXIT
+
+run_one() {
+ local -r args=$@
+
+ ./udpgso_bench_rx &
+ ./udpgso_bench_rx -t &
+
+ ./udpgso_bench_tx ${args}
+}
+
+run_in_netns() {
+ local -r args=$@
+
+ ./in_netns.sh $0 __subprocess ${args}
+}
+
+run_udp() {
+ local -r args=$@
+
+ echo "udp"
+ run_in_netns ${args}
+
+ echo "udp gso"
+ run_in_netns ${args} -S
+
+ echo "udp gso zerocopy"
+ run_in_netns ${args} -S -z
+}
+
+run_tcp() {
+ local -r args=$@
+
+ echo "tcp"
+ run_in_netns ${args} -t
+
+ echo "tcp zerocopy"
+ run_in_netns ${args} -t -z
+}
+
+run_all() {
+ local -r core_args="-l 4"
+ local -r ipv4_args="${core_args} -4 -D 127.0.0.1"
+ local -r ipv6_args="${core_args} -6 -D ::1"
+
+ echo "ipv4"
+ run_tcp "${ipv4_args}"
+ run_udp "${ipv4_args}"
+
+ echo "ipv6"
+ run_tcp "${ipv4_args}"
+ run_udp "${ipv6_args}"
+}
+
+if [[ $# -eq 0 ]]; then
+ run_all
+elif [[ $1 == "__subprocess" ]]; then
+ shift
+ run_one $@
+else
+ run_in_netns $@
+fi
diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c
new file mode 100644
index 000000000000..727cf67a3f75
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso_bench_rx.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <error.h>
+#include <errno.h>
+#include <limits.h>
+#include <linux/errqueue.h>
+#include <linux/if_packet.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+static int cfg_port = 8000;
+static bool cfg_tcp;
+static bool cfg_verify;
+
+static bool interrupted;
+static unsigned long packets, bytes;
+
+static void sigint_handler(int signum)
+{
+ if (signum == SIGINT)
+ interrupted = true;
+}
+
+static unsigned long gettimeofday_ms(void)
+{
+ struct timeval tv;
+
+ gettimeofday(&tv, NULL);
+ return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
+}
+
+static void do_poll(int fd)
+{
+ struct pollfd pfd;
+ int ret;
+
+ pfd.events = POLLIN;
+ pfd.revents = 0;
+ pfd.fd = fd;
+
+ do {
+ ret = poll(&pfd, 1, 10);
+ if (ret == -1)
+ error(1, errno, "poll");
+ if (ret == 0)
+ continue;
+ if (pfd.revents != POLLIN)
+ error(1, errno, "poll: 0x%x expected 0x%x\n",
+ pfd.revents, POLLIN);
+ } while (!ret && !interrupted);
+}
+
+static int do_socket(bool do_tcp)
+{
+ struct sockaddr_in6 addr = {0};
+ int fd, val;
+
+ fd = socket(PF_INET6, cfg_tcp ? SOCK_STREAM : SOCK_DGRAM, 0);
+ if (fd == -1)
+ error(1, errno, "socket");
+
+ val = 1 << 21;
+ if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)))
+ error(1, errno, "setsockopt rcvbuf");
+ val = 1;
+ if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val)))
+ error(1, errno, "setsockopt reuseport");
+
+ addr.sin6_family = PF_INET6;
+ addr.sin6_port = htons(cfg_port);
+ addr.sin6_addr = in6addr_any;
+ if (bind(fd, (void *) &addr, sizeof(addr)))
+ error(1, errno, "bind");
+
+ if (do_tcp) {
+ int accept_fd = fd;
+
+ if (listen(accept_fd, 1))
+ error(1, errno, "listen");
+
+ do_poll(accept_fd);
+
+ fd = accept(accept_fd, NULL, NULL);
+ if (fd == -1)
+ error(1, errno, "accept");
+ if (close(accept_fd))
+ error(1, errno, "close accept fd");
+ }
+
+ return fd;
+}
+
+/* Flush all outstanding bytes for the tcp receive queue */
+static void do_flush_tcp(int fd)
+{
+ int ret;
+
+ while (true) {
+ /* MSG_TRUNC flushes up to len bytes */
+ ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
+ if (ret == -1 && errno == EAGAIN)
+ return;
+ if (ret == -1)
+ error(1, errno, "flush");
+ if (ret == 0) {
+ /* client detached */
+ exit(0);
+ }
+
+ packets++;
+ bytes += ret;
+ }
+
+}
+
+static char sanitized_char(char val)
+{
+ return (val >= 'a' && val <= 'z') ? val : '.';
+}
+
+static void do_verify_udp(const char *data, int len)
+{
+ char cur = data[0];
+ int i;
+
+ /* verify contents */
+ if (cur < 'a' || cur > 'z')
+ error(1, 0, "data initial byte out of range");
+
+ for (i = 1; i < len; i++) {
+ if (cur == 'z')
+ cur = 'a';
+ else
+ cur++;
+
+ if (data[i] != cur)
+ error(1, 0, "data[%d]: len %d, %c(%hhu) != %c(%hhu)\n",
+ i, len,
+ sanitized_char(data[i]), data[i],
+ sanitized_char(cur), cur);
+ }
+}
+
+/* Flush all outstanding datagrams. Verify first few bytes of each. */
+static void do_flush_udp(int fd)
+{
+ static char rbuf[ETH_DATA_LEN];
+ int ret, len, budget = 256;
+
+ len = cfg_verify ? sizeof(rbuf) : 0;
+ while (budget--) {
+ /* MSG_TRUNC will make return value full datagram length */
+ ret = recv(fd, rbuf, len, MSG_TRUNC | MSG_DONTWAIT);
+ if (ret == -1 && errno == EAGAIN)
+ return;
+ if (ret == -1)
+ error(1, errno, "recv");
+ if (len) {
+ if (ret == 0)
+ error(1, errno, "recv: 0 byte datagram\n");
+
+ do_verify_udp(rbuf, ret);
+ }
+
+ packets++;
+ bytes += ret;
+ }
+}
+
+static void usage(const char *filepath)
+{
+ error(1, 0, "Usage: %s [-tv] [-p port]", filepath);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+ int c;
+
+ while ((c = getopt(argc, argv, "ptv")) != -1) {
+ switch (c) {
+ case 'p':
+ cfg_port = htons(strtoul(optarg, NULL, 0));
+ break;
+ case 't':
+ cfg_tcp = true;
+ break;
+ case 'v':
+ cfg_verify = true;
+ break;
+ }
+ }
+
+ if (optind != argc)
+ usage(argv[0]);
+
+ if (cfg_tcp && cfg_verify)
+ error(1, 0, "TODO: implement verify mode for tcp");
+}
+
+static void do_recv(void)
+{
+ unsigned long tnow, treport;
+ int fd;
+
+ fd = do_socket(cfg_tcp);
+
+ treport = gettimeofday_ms() + 1000;
+ do {
+ do_poll(fd);
+
+ if (cfg_tcp)
+ do_flush_tcp(fd);
+ else
+ do_flush_udp(fd);
+
+ tnow = gettimeofday_ms();
+ if (tnow > treport) {
+ if (packets)
+ fprintf(stderr,
+ "%s rx: %6lu MB/s %8lu calls/s\n",
+ cfg_tcp ? "tcp" : "udp",
+ bytes >> 20, packets);
+ bytes = packets = 0;
+ treport = tnow + 1000;
+ }
+
+ } while (!interrupted);
+
+ if (close(fd))
+ error(1, errno, "close");
+}
+
+int main(int argc, char **argv)
+{
+ parse_opts(argc, argv);
+
+ signal(SIGINT, sigint_handler);
+
+ do_recv();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c
new file mode 100644
index 000000000000..e821564053cf
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso_bench_tx.c
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <netinet/if_ether.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifndef ETH_MAX_MTU
+#define ETH_MAX_MTU 0xFFFFU
+#endif
+
+#ifndef UDP_SEGMENT
+#define UDP_SEGMENT 103
+#endif
+
+#ifndef SO_ZEROCOPY
+#define SO_ZEROCOPY 60
+#endif
+
+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY 0x4000000
+#endif
+
+#define NUM_PKT 100
+
+static bool cfg_cache_trash;
+static int cfg_cpu = -1;
+static int cfg_connected = true;
+static int cfg_family = PF_UNSPEC;
+static uint16_t cfg_mss;
+static int cfg_payload_len = (1472 * 42);
+static int cfg_port = 8000;
+static int cfg_runtime_ms = -1;
+static bool cfg_segment;
+static bool cfg_sendmmsg;
+static bool cfg_tcp;
+static bool cfg_zerocopy;
+
+static socklen_t cfg_alen;
+static struct sockaddr_storage cfg_dst_addr;
+
+static bool interrupted;
+static char buf[NUM_PKT][ETH_MAX_MTU];
+
+static void sigint_handler(int signum)
+{
+ if (signum == SIGINT)
+ interrupted = true;
+}
+
+static unsigned long gettimeofday_ms(void)
+{
+ struct timeval tv;
+
+ gettimeofday(&tv, NULL);
+ return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
+}
+
+static int set_cpu(int cpu)
+{
+ cpu_set_t mask;
+
+ CPU_ZERO(&mask);
+ CPU_SET(cpu, &mask);
+ if (sched_setaffinity(0, sizeof(mask), &mask))
+ error(1, 0, "setaffinity %d", cpu);
+
+ return 0;
+}
+
+static void setup_sockaddr(int domain, const char *str_addr, void *sockaddr)
+{
+ struct sockaddr_in6 *addr6 = (void *) sockaddr;
+ struct sockaddr_in *addr4 = (void *) sockaddr;
+
+ switch (domain) {
+ case PF_INET:
+ addr4->sin_family = AF_INET;
+ addr4->sin_port = htons(cfg_port);
+ if (inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
+ error(1, 0, "ipv4 parse error: %s", str_addr);
+ break;
+ case PF_INET6:
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_port = htons(cfg_port);
+ if (inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
+ error(1, 0, "ipv6 parse error: %s", str_addr);
+ break;
+ default:
+ error(1, 0, "illegal domain");
+ }
+}
+
+static void flush_zerocopy(int fd)
+{
+ struct msghdr msg = {0}; /* flush */
+ int ret;
+
+ while (1) {
+ ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+ if (ret == -1 && errno == EAGAIN)
+ break;
+ if (ret == -1)
+ error(1, errno, "errqueue");
+ if (msg.msg_flags != (MSG_ERRQUEUE | MSG_CTRUNC))
+ error(1, 0, "errqueue: flags 0x%x\n", msg.msg_flags);
+ msg.msg_flags = 0;
+ }
+}
+
+static int send_tcp(int fd, char *data)
+{
+ int ret, done = 0, count = 0;
+
+ while (done < cfg_payload_len) {
+ ret = send(fd, data + done, cfg_payload_len - done,
+ cfg_zerocopy ? MSG_ZEROCOPY : 0);
+ if (ret == -1)
+ error(1, errno, "write");
+
+ done += ret;
+ count++;
+ }
+
+ return count;
+}
+
+static int send_udp(int fd, char *data)
+{
+ int ret, total_len, len, count = 0;
+
+ total_len = cfg_payload_len;
+
+ while (total_len) {
+ len = total_len < cfg_mss ? total_len : cfg_mss;
+
+ ret = sendto(fd, data, len, cfg_zerocopy ? MSG_ZEROCOPY : 0,
+ cfg_connected ? NULL : (void *)&cfg_dst_addr,
+ cfg_connected ? 0 : cfg_alen);
+ if (ret == -1)
+ error(1, errno, "write");
+ if (ret != len)
+ error(1, errno, "write: %uB != %uB\n", ret, len);
+
+ total_len -= len;
+ count++;
+ }
+
+ return count;
+}
+
+static int send_udp_sendmmsg(int fd, char *data)
+{
+ const int max_nr_msg = ETH_MAX_MTU / ETH_DATA_LEN;
+ struct mmsghdr mmsgs[max_nr_msg];
+ struct iovec iov[max_nr_msg];
+ unsigned int off = 0, left;
+ int i = 0, ret;
+
+ memset(mmsgs, 0, sizeof(mmsgs));
+
+ left = cfg_payload_len;
+ while (left) {
+ if (i == max_nr_msg)
+ error(1, 0, "sendmmsg: exceeds max_nr_msg");
+
+ iov[i].iov_base = data + off;
+ iov[i].iov_len = cfg_mss < left ? cfg_mss : left;
+
+ mmsgs[i].msg_hdr.msg_iov = iov + i;
+ mmsgs[i].msg_hdr.msg_iovlen = 1;
+
+ off += iov[i].iov_len;
+ left -= iov[i].iov_len;
+ i++;
+ }
+
+ ret = sendmmsg(fd, mmsgs, i, cfg_zerocopy ? MSG_ZEROCOPY : 0);
+ if (ret == -1)
+ error(1, errno, "sendmmsg");
+
+ return ret;
+}
+
+static void send_udp_segment_cmsg(struct cmsghdr *cm)
+{
+ uint16_t *valp;
+
+ cm->cmsg_level = SOL_UDP;
+ cm->cmsg_type = UDP_SEGMENT;
+ cm->cmsg_len = CMSG_LEN(sizeof(cfg_mss));
+ valp = (void *)CMSG_DATA(cm);
+ *valp = cfg_mss;
+}
+
+static int send_udp_segment(int fd, char *data)
+{
+ char control[CMSG_SPACE(sizeof(cfg_mss))] = {0};
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ int ret;
+
+ iov.iov_base = data;
+ iov.iov_len = cfg_payload_len;
+
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+ send_udp_segment_cmsg(CMSG_FIRSTHDR(&msg));
+
+ msg.msg_name = (void *)&cfg_dst_addr;
+ msg.msg_namelen = cfg_alen;
+
+ ret = sendmsg(fd, &msg, cfg_zerocopy ? MSG_ZEROCOPY : 0);
+ if (ret == -1)
+ error(1, errno, "sendmsg");
+ if (ret != iov.iov_len)
+ error(1, 0, "sendmsg: %u != %lu\n", ret, iov.iov_len);
+
+ return 1;
+}
+
+static void usage(const char *filepath)
+{
+ error(1, 0, "Usage: %s [-46cmStuz] [-C cpu] [-D dst ip] [-l secs] [-p port] [-s sendsize]",
+ filepath);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+ int max_len, hdrlen;
+ int c;
+
+ while ((c = getopt(argc, argv, "46cC:D:l:mp:s:Stuz")) != -1) {
+ switch (c) {
+ case '4':
+ if (cfg_family != PF_UNSPEC)
+ error(1, 0, "Pass one of -4 or -6");
+ cfg_family = PF_INET;
+ cfg_alen = sizeof(struct sockaddr_in);
+ break;
+ case '6':
+ if (cfg_family != PF_UNSPEC)
+ error(1, 0, "Pass one of -4 or -6");
+ cfg_family = PF_INET6;
+ cfg_alen = sizeof(struct sockaddr_in6);
+ break;
+ case 'c':
+ cfg_cache_trash = true;
+ break;
+ case 'C':
+ cfg_cpu = strtol(optarg, NULL, 0);
+ break;
+ case 'D':
+ setup_sockaddr(cfg_family, optarg, &cfg_dst_addr);
+ break;
+ case 'l':
+ cfg_runtime_ms = strtoul(optarg, NULL, 10) * 1000;
+ break;
+ case 'm':
+ cfg_sendmmsg = true;
+ break;
+ case 'p':
+ cfg_port = strtoul(optarg, NULL, 0);
+ break;
+ case 's':
+ cfg_payload_len = strtoul(optarg, NULL, 0);
+ break;
+ case 'S':
+ cfg_segment = true;
+ break;
+ case 't':
+ cfg_tcp = true;
+ break;
+ case 'u':
+ cfg_connected = false;
+ break;
+ case 'z':
+ cfg_zerocopy = true;
+ break;
+ }
+ }
+
+ if (optind != argc)
+ usage(argv[0]);
+
+ if (cfg_family == PF_UNSPEC)
+ error(1, 0, "must pass one of -4 or -6");
+ if (cfg_tcp && !cfg_connected)
+ error(1, 0, "connectionless tcp makes no sense");
+ if (cfg_segment && cfg_sendmmsg)
+ error(1, 0, "cannot combine segment offload and sendmmsg");
+
+ if (cfg_family == PF_INET)
+ hdrlen = sizeof(struct iphdr) + sizeof(struct udphdr);
+ else
+ hdrlen = sizeof(struct ip6_hdr) + sizeof(struct udphdr);
+
+ cfg_mss = ETH_DATA_LEN - hdrlen;
+ max_len = ETH_MAX_MTU - hdrlen;
+
+ if (cfg_payload_len > max_len)
+ error(1, 0, "payload length %u exceeds max %u",
+ cfg_payload_len, max_len);
+}
+
+static void set_pmtu_discover(int fd, bool is_ipv4)
+{
+ int level, name, val;
+
+ if (is_ipv4) {
+ level = SOL_IP;
+ name = IP_MTU_DISCOVER;
+ val = IP_PMTUDISC_DO;
+ } else {
+ level = SOL_IPV6;
+ name = IPV6_MTU_DISCOVER;
+ val = IPV6_PMTUDISC_DO;
+ }
+
+ if (setsockopt(fd, level, name, &val, sizeof(val)))
+ error(1, errno, "setsockopt path mtu");
+}
+
+int main(int argc, char **argv)
+{
+ unsigned long num_msgs, num_sends;
+ unsigned long tnow, treport, tstop;
+ int fd, i, val;
+
+ parse_opts(argc, argv);
+
+ if (cfg_cpu > 0)
+ set_cpu(cfg_cpu);
+
+ for (i = 0; i < sizeof(buf[0]); i++)
+ buf[0][i] = 'a' + (i % 26);
+ for (i = 1; i < NUM_PKT; i++)
+ memcpy(buf[i], buf[0], sizeof(buf[0]));
+
+ signal(SIGINT, sigint_handler);
+
+ fd = socket(cfg_family, cfg_tcp ? SOCK_STREAM : SOCK_DGRAM, 0);
+ if (fd == -1)
+ error(1, errno, "socket");
+
+ if (cfg_zerocopy) {
+ val = 1;
+ if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val)))
+ error(1, errno, "setsockopt zerocopy");
+ }
+
+ if (cfg_connected &&
+ connect(fd, (void *)&cfg_dst_addr, cfg_alen))
+ error(1, errno, "connect");
+
+ if (cfg_segment)
+ set_pmtu_discover(fd, cfg_family == PF_INET);
+
+ num_msgs = num_sends = 0;
+ tnow = gettimeofday_ms();
+ tstop = tnow + cfg_runtime_ms;
+ treport = tnow + 1000;
+
+ i = 0;
+ do {
+ if (cfg_tcp)
+ num_sends += send_tcp(fd, buf[i]);
+ else if (cfg_segment)
+ num_sends += send_udp_segment(fd, buf[i]);
+ else if (cfg_sendmmsg)
+ num_sends += send_udp_sendmmsg(fd, buf[i]);
+ else
+ num_sends += send_udp(fd, buf[i]);
+ num_msgs++;
+
+ if (cfg_zerocopy && ((num_msgs & 0xF) == 0))
+ flush_zerocopy(fd);
+
+ tnow = gettimeofday_ms();
+ if (tnow > treport) {
+ fprintf(stderr,
+ "%s tx: %6lu MB/s %8lu calls/s %6lu msg/s\n",
+ cfg_tcp ? "tcp" : "udp",
+ (num_msgs * cfg_payload_len) >> 20,
+ num_sends, num_msgs);
+ num_msgs = num_sends = 0;
+ treport = tnow + 1000;
+ }
+
+ /* cold cache when writing buffer */
+ if (cfg_cache_trash)
+ i = ++i < NUM_PKT ? i : 0;
+
+ } while (!interrupted && (cfg_runtime_ms == -1 || tnow < tstop));
+
+ if (close(fd))
+ error(1, errno, "close");
+
+ return 0;
+}
--
2.17.0.441.gb46fe60e1d-goog
^ permalink raw reply related
* Re: [PATCH bpf-next 13/15] xsk: support for Tx
From: Willem de Bruijn @ 2018-04-25 19:00 UTC (permalink / raw)
To: Magnus Karlsson
Cc: Björn Töpel, Karlsson, Magnus, Alexander Duyck,
Alexander Duyck, John Fastabend, Alexei Starovoitov,
Jesper Dangaard Brouer, Daniel Borkmann, Michael S. Tsirkin,
Network Development, michael.lundkvist, Brandeburg, Jesse,
Singhai, Anjali, Zhang, Qi Z
In-Reply-To: <CAJ8uoz1crwU_no=dbQsZsjxjRX+iDVv0pxC6+pPhVzNkCxh-3A@mail.gmail.com>
>>> +{
>>> + struct net_device *dev = skb->dev;
>>> + struct sk_buff *orig_skb = skb;
>>> + struct netdev_queue *txq;
>>> + int ret = NETDEV_TX_BUSY;
>>> + bool again = false;
>>> +
>>> + if (unlikely(!netif_running(dev) || !netif_carrier_ok(dev)))
>>> + goto drop;
>>> +
>>> + skb = validate_xmit_skb_list(skb, dev, &again);
>>> + if (skb != orig_skb)
>>> + return NET_XMIT_DROP;
>>
>> Need to free generated segment list on error, see packet_direct_xmit.
>
> I do not use segments in the TX code for reasons of simplicity and the
> free is in the calling function. But as I will create a common
> packet_direct_xmit according to your suggestion, it will have a
> kfree_skb_list() there as in af_packet.c.
Ah yes. For these sockets it is guaranteed that sbks are not gso skbs.
Of course, makes sense.
>> static inline struct xdp_desc *xskq_peek_desc(struct xsk_queue *q,
>> + struct xdp_desc *desc)
>> +{
>> + struct xdp_rxtx_ring *ring;
>> +
>> + if (q->cons_tail == q->cons_head) {
>> + WRITE_ONCE(q->ring->consumer, q->cons_tail);
>> + q->cons_head = q->cons_tail + xskq_nb_avail(q, RX_BATCH_SIZE);
>> +
>> + /* Order consumer and data */
>> + smp_rmb();
>> +
>> + return xskq_validate_desc(q, desc);
>> + }
>> +
>> + ring = (struct xdp_rxtx_ring *)q->ring;
>> + *desc = ring->desc[q->cons_tail & q->ring_mask];
>> + return desc;
>>
>> This only validates descriptors if taking the branch.
>
> Yes, that is because we only want to validate the descriptors once
> even if we call this function multiple times for the same entry.
Then I am probably misreading this function. But isn't head increased
by up to RX_BATCH_SIZE frames at once. If so, then for many frames
the branch is not taken.
^ permalink raw reply
* Re: [PATCH 0/3] Introduce LSM-hook for socketpair(2)
From: James Morris @ 2018-04-25 19:02 UTC (permalink / raw)
To: Paul Moore
Cc: David Herrmann, linux-kernel, teg, Stephen Smalley, selinux,
linux-security-module, Eric Paris, Serge E. Hallyn,
David S. Miller, netdev
In-Reply-To: <CAHC9VhR-V1R2BpCt1aA56q4yT_PWv1kzoWtksbNkm08stcOGPw@mail.gmail.com>
On Wed, 25 Apr 2018, Paul Moore wrote:
> On Wed, Apr 25, 2018 at 2:44 PM, James Morris <jmorris@namei.org> wrote:
> > On Mon, 23 Apr 2018, David Herrmann wrote:
> >> This patch series tries to close this gap and makes both behave the
> >> same. A new LSM-hook is added which allows LSMs to cache the correct
> >> peer information on newly created socket-pairs.
> >
> > Looks okay to me.
> >
> > Once it's respun with the Smack backend and maybe the hook name change,
> > I'll merge this unless DaveM wants it to go in via his networking tree.
>
> Note my objection to the hook placement in patch 2/3; I think we
> should move the hook out of the AF_UNIX layer and up into the socket
> layer.
I vote for this as it maintains the intended abstraction of the socket
API.
--
James Morris
<jmorris@namei.org>
^ permalink raw reply
* Re: [Cake] [PATCH net-next v3] Add Common Applications Kept Enhanced (cake) qdisc
From: Eric Dumazet @ 2018-04-25 19:02 UTC (permalink / raw)
To: Toke Høiland-Jørgensen, Eric Dumazet, Jonathan Morton
Cc: netdev, cake
In-Reply-To: <87lgdb5e4w.fsf@toke.dk>
On 04/25/2018 11:34 AM, Toke Høiland-Jørgensen wrote:
> Eric Dumazet <eric.dumazet@gmail.com> writes:
>
>> On 04/25/2018 09:52 AM, Jonathan Morton wrote:
>>>> We can see here the high cost of forcing software GSO :/
>>>>
>>>> Really, this should be done only :
>>>> 1) If requested by the admin ( tc .... gso ....)
>>>>
>>>> 2) If packet size is above a threshold.
>>>> The threshold could be set by the admin, and/or based on a fraction of the bandwidth parameter.
>>>>
>>>> I totally understand why you prefer to segment yourself for < 100 Mbit links.
>>>>
>>>> But this makes no sense on 10Gbit+
>>>
>>> It is absolutely necessary, so far as I can see, to segment GSO
>>> superpackets when overhead compensation is selected - as it very
>>> often should be, even on pure Ethernet links. Without that, the
>>> calculation of link occupancy time will be wrong. (The actual
>>> transmission time of an Ethernet frame is rather more than just 14
>>> bytes longer than the underlying IP packet.)
>>
>> Just fix the overhead compensation computation in the code.
>>
>> skb in a qdisc have everything you need.
>>
>> qdisc_pkt_len_init() has initialized qdisc_skb_cb(skb)->pkt_len with
>> the exact bytes on the wire, and you have gso_segs to perform any
>> adjustement you need to do.
>
> The problem is that may not be the right values. For example, in many
> CPEs there's a built-in switch that strips VLAN tags before the packet
> actually hits the wire. So we do need to be able to get the actual
> packet size. Is it possible to get the sizes of the individual segments
> of a GSO packet? That way we could do the calculation for the whole
> super-packet...
All segments of GSO packets have the same size, by definition.
Only the last segment might be smaller, and again this can be inferred from gso_size and gso_segs
>
>> Do not kill GSO only because you do not want to deal with it.
>
> It's not (just) that we don't want to deal with it, it is also a very
> aggressive optimisation for latency, which makes a lot of sense at
> residential internet bandwidths.
>
If you keep saying this old urban legend, I will NACK your patch.
I am tired of people pretending GSO/TSO are bad for latencies.
This is not true, quite the opposite, when done well.
If you find issues, please share so that we can fix them.
Trying to hide them is not what we want for fresh code.
^ permalink raw reply
* [PATCH v2 net-next] lan78xx: Lan7801 Support for Fixed PHY
From: Raghuram Chary J @ 2018-04-25 19:07 UTC (permalink / raw)
To: davem; +Cc: netdev, unglinuxdriver, woojung.huh, raghuramchary.jallipalli
Adding Fixed PHY support to the lan78xx driver.
Signed-off-by: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
---
v0->v1:
* Remove driver version #define
* Modify netdev_info to netdev_dbg
* Move lan7801 specific to new routine and add switch case
* Minor cleanup
v1->v2:
* Removed fixedphy variable and used phy_is_pseudo_fixed_link() check.
---
drivers/net/usb/Kconfig | 1 +
drivers/net/usb/lan78xx.c | 111 +++++++++++++++++++++++++++++++---------------
2 files changed, 77 insertions(+), 35 deletions(-)
diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig
index f28bd74ac275..418b0904cecb 100644
--- a/drivers/net/usb/Kconfig
+++ b/drivers/net/usb/Kconfig
@@ -111,6 +111,7 @@ config USB_LAN78XX
select MII
select PHYLIB
select MICROCHIP_PHY
+ select FIXED_PHY
help
This option adds support for Microchip LAN78XX based USB 2
& USB 3 10/100/1000 Ethernet adapters.
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 0867f7275852..48925ed71555 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -36,13 +36,12 @@
#include <linux/irq.h>
#include <linux/irqchip/chained_irq.h>
#include <linux/microchipphy.h>
-#include <linux/phy.h>
+#include <linux/phy_fixed.h>
#include "lan78xx.h"
#define DRIVER_AUTHOR "WOOJUNG HUH <woojung.huh@microchip.com>"
#define DRIVER_DESC "LAN78XX USB 3.0 Gigabit Ethernet Devices"
#define DRIVER_NAME "lan78xx"
-#define DRIVER_VERSION "1.0.6"
#define TX_TIMEOUT_JIFFIES (5 * HZ)
#define THROTTLE_JIFFIES (HZ / 8)
@@ -1477,7 +1476,6 @@ static void lan78xx_get_drvinfo(struct net_device *net,
struct lan78xx_net *dev = netdev_priv(net);
strncpy(info->driver, DRIVER_NAME, sizeof(info->driver));
- strncpy(info->version, DRIVER_VERSION, sizeof(info->version));
usb_make_path(dev->udev, info->bus_info, sizeof(info->bus_info));
}
@@ -2003,52 +2001,90 @@ static int ksz9031rnx_fixup(struct phy_device *phydev)
return 1;
}
-static int lan78xx_phy_init(struct lan78xx_net *dev)
+static int lan7801_phy_init(struct phy_device **phydev,
+ struct lan78xx_net *dev)
{
+ u32 buf;
int ret;
- u32 mii_adv;
- struct phy_device *phydev;
-
- phydev = phy_find_first(dev->mdiobus);
- if (!phydev) {
- netdev_err(dev->net, "no PHY found\n");
- return -EIO;
- }
-
- if ((dev->chipid == ID_REV_CHIP_ID_7800_) ||
- (dev->chipid == ID_REV_CHIP_ID_7850_)) {
- phydev->is_internal = true;
- dev->interface = PHY_INTERFACE_MODE_GMII;
-
- } else if (dev->chipid == ID_REV_CHIP_ID_7801_) {
- if (!phydev->drv) {
+ struct fixed_phy_status fphy_status = {
+ .link = 1,
+ .speed = SPEED_1000,
+ .duplex = DUPLEX_FULL,
+ };
+
+ if (!*phydev) {
+ netdev_dbg(dev->net, "PHY Not Found!! Registering Fixed PHY\n");
+ *phydev = fixed_phy_register(PHY_POLL, &fphy_status, -1,
+ NULL);
+ if (IS_ERR(*phydev)) {
+ netdev_err(dev->net, "No PHY/fixed_PHY found\n");
+ return -ENODEV;
+ }
+ netdev_dbg(dev->net, "Registered FIXED PHY\n");
+ dev->interface = PHY_INTERFACE_MODE_RGMII;
+ ret = lan78xx_write_reg(dev, MAC_RGMII_ID,
+ MAC_RGMII_ID_TXC_DELAY_EN_);
+ ret = lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00);
+ ret = lan78xx_read_reg(dev, HW_CFG, &buf);
+ buf |= HW_CFG_CLK125_EN_;
+ buf |= HW_CFG_REFCLK25_EN_;
+ ret = lan78xx_write_reg(dev, HW_CFG, buf);
+ } else {
+ if (!(*phydev)->drv) {
netdev_err(dev->net, "no PHY driver found\n");
return -EIO;
}
-
dev->interface = PHY_INTERFACE_MODE_RGMII;
-
/* external PHY fixup for KSZ9031RNX */
ret = phy_register_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0,
ksz9031rnx_fixup);
if (ret < 0) {
- netdev_err(dev->net, "fail to register fixup\n");
+ netdev_err(dev->net, "fail to register fixup PHY_KSZ9031RNX\n");
return ret;
}
/* external PHY fixup for LAN8835 */
ret = phy_register_fixup_for_uid(PHY_LAN8835, 0xfffffff0,
lan8835_fixup);
if (ret < 0) {
- netdev_err(dev->net, "fail to register fixup\n");
+ netdev_err(dev->net, "fail to register fixup for PHY_LAN8835\n");
return ret;
}
/* add more external PHY fixup here if needed */
- phydev->is_internal = false;
- } else {
- netdev_err(dev->net, "unknown ID found\n");
- ret = -EIO;
- goto error;
+ (*phydev)->is_internal = false;
+ }
+ return 0;
+}
+
+static int lan78xx_phy_init(struct lan78xx_net *dev)
+{
+ int ret;
+ u32 mii_adv;
+ struct phy_device *phydev;
+
+ phydev = phy_find_first(dev->mdiobus);
+ switch (dev->chipid) {
+ case ID_REV_CHIP_ID_7801_:
+ ret = lan7801_phy_init(&phydev, dev);
+ if (ret < 0) {
+ netdev_err(dev->net, "lan7801: PHY Init Failed");
+ return ret;
+ }
+ break;
+
+ case ID_REV_CHIP_ID_7800_:
+ case ID_REV_CHIP_ID_7850_:
+ if (!phydev) {
+ netdev_err(dev->net, "no PHY found\n");
+ return -EIO;
+ }
+ phydev->is_internal = true;
+ dev->interface = PHY_INTERFACE_MODE_GMII;
+ break;
+
+ default:
+ netdev_err(dev->net, "Unknown CHIP ID found\n");
+ return -EIO;
}
/* if phyirq is not set, use polling mode in phylib */
@@ -2067,6 +2103,12 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
if (ret) {
netdev_err(dev->net, "can't attach PHY to %s\n",
dev->mdiobus->id);
+ if (dev->chipid == ID_REV_CHIP_ID_7801_) {
+ phy_unregister_fixup_for_uid(PHY_KSZ9031RNX,
+ 0xfffffff0);
+ phy_unregister_fixup_for_uid(PHY_LAN8835,
+ 0xfffffff0);
+ }
return -EIO;
}
@@ -2084,12 +2126,6 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
dev->fc_autoneg = phydev->autoneg;
return 0;
-
-error:
- phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0);
- phy_unregister_fixup_for_uid(PHY_LAN8835, 0xfffffff0);
-
- return ret;
}
static int lan78xx_set_rx_max_frame_length(struct lan78xx_net *dev, int size)
@@ -3487,6 +3523,7 @@ static void lan78xx_disconnect(struct usb_interface *intf)
struct lan78xx_net *dev;
struct usb_device *udev;
struct net_device *net;
+ struct phy_device *phydev;
dev = usb_get_intfdata(intf);
usb_set_intfdata(intf, NULL);
@@ -3495,12 +3532,16 @@ static void lan78xx_disconnect(struct usb_interface *intf)
udev = interface_to_usbdev(intf);
net = dev->net;
+ phydev = net->phydev;
phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0);
phy_unregister_fixup_for_uid(PHY_LAN8835, 0xfffffff0);
phy_disconnect(net->phydev);
+ if (phy_is_pseudo_fixed_link(phydev))
+ fixed_phy_unregister(phydev);
+
unregister_netdev(net);
cancel_delayed_work_sync(&dev->wq);
--
2.16.2
^ permalink raw reply related
* RE: [PATCH net-next] lan78xx: Lan7801 Support for Fixed PHY
From: RaghuramChary.Jallipalli @ 2018-04-25 19:05 UTC (permalink / raw)
To: f.fainelli, andrew; +Cc: davem, netdev, UNGLinuxDriver, Woojung.Huh
In-Reply-To: <7b563a2e-fff5-0d4b-aea2-3d98dda3c192@gmail.com>
Hi Florian,
> It still is completely unnecessary, you can do something like the following:
>
> struct phy_device *phydev = netdev->phydev;
>
> phy_disconnect(phydev);
> if (phy_is_pseudo_fixed_link(phydev))
> fixed_phy_unregister(phydev);
>
> while netdev->phydev becomes NULL after phy_disconnect(), you retained
> a reference to the original PHY device before disconnecting, in order to
> unregister it. Can you see if that works?
> --
Done. Thanks.
Thanks,
Raghu
^ permalink raw reply
* Re: [Cake] [PATCH net-next v3] Add Common Applications Kept Enhanced (cake) qdisc
From: Toke Høiland-Jørgensen @ 2018-04-25 19:15 UTC (permalink / raw)
To: Eric Dumazet, Jonathan Morton; +Cc: netdev, cake
In-Reply-To: <688ac6e7-dd6d-d0ff-ee37-a0e2a0ea761c@gmail.com>
Eric Dumazet <eric.dumazet@gmail.com> writes:
> On 04/25/2018 11:34 AM, Toke Høiland-Jørgensen wrote:
>> Eric Dumazet <eric.dumazet@gmail.com> writes:
>>
>>> On 04/25/2018 09:52 AM, Jonathan Morton wrote:
>>>>> We can see here the high cost of forcing software GSO :/
>>>>>
>>>>> Really, this should be done only :
>>>>> 1) If requested by the admin ( tc .... gso ....)
>>>>>
>>>>> 2) If packet size is above a threshold.
>>>>> The threshold could be set by the admin, and/or based on a fraction of the bandwidth parameter.
>>>>>
>>>>> I totally understand why you prefer to segment yourself for < 100 Mbit links.
>>>>>
>>>>> But this makes no sense on 10Gbit+
>>>>
>>>> It is absolutely necessary, so far as I can see, to segment GSO
>>>> superpackets when overhead compensation is selected - as it very
>>>> often should be, even on pure Ethernet links. Without that, the
>>>> calculation of link occupancy time will be wrong. (The actual
>>>> transmission time of an Ethernet frame is rather more than just 14
>>>> bytes longer than the underlying IP packet.)
>>>
>>> Just fix the overhead compensation computation in the code.
>>>
>>> skb in a qdisc have everything you need.
>>>
>>> qdisc_pkt_len_init() has initialized qdisc_skb_cb(skb)->pkt_len with
>>> the exact bytes on the wire, and you have gso_segs to perform any
>>> adjustement you need to do.
>>
>> The problem is that may not be the right values. For example, in many
>> CPEs there's a built-in switch that strips VLAN tags before the packet
>> actually hits the wire. So we do need to be able to get the actual
>> packet size. Is it possible to get the sizes of the individual segments
>> of a GSO packet? That way we could do the calculation for the whole
>> super-packet...
>
> All segments of GSO packets have the same size, by definition.
>
> Only the last segment might be smaller, and again this can be inferred
> from gso_size and gso_segs
Gotcha. Until we are confident that we've implemented this in a way that
works, will this do?
@@ -88,6 +88,7 @@
#define CAKE_SET_WAYS (8)
#define CAKE_MAX_TINS (8)
#define CAKE_QUEUES (1024)
+#define CAKE_SPLIT_GSO_THRESHOLD (125000000) /* 1Gbps */
@@ -1437,7 +1439,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
* or if we need to know individual packet sizes for framing overhead.
*/
- if (skb_is_gso(skb)) {
+ if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
struct sk_buff *segs, *nskb;
netdev_features_t features = netif_skb_features(skb);
/* signed slen to handle corner case
@@ -2337,6 +2339,12 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_CAKE_MEMORY])
q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
+ if (q->rate_bps && (q->rate_bps <= CAKE_SPLIT_GSO_THRESHOLD ||
+ q->rate_flags & CAKE_FLAG_OVERHEAD))
+ q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
+ else
+ q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
+
if (q->tins) {
sch_tree_lock(sch);
cake_reconfigure(sch);
^ permalink raw reply
* [PATCH net-next 1/8] net: Move PHY statistics code into PHY library helpers
From: Florian Fainelli @ 2018-04-25 19:12 UTC (permalink / raw)
To: netdev
Cc: Florian Fainelli, andrew, vivien.didelot, cphealy, davem,
nikita.yoush
In-Reply-To: <20180425191254.3467-1-f.fainelli@gmail.com>
In order to make it possible for network device drivers that do not
necessarily have a phy_device attached, but still report PHY statistics,
have a preliminary refactoring consisting in creating helper functions
that encapsulate the PHY device driver knowledge within PHYLIB.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
drivers/net/phy/phy.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/phy.h | 20 ++++++++++++++++++++
net/core/ethtool.c | 38 ++++++++------------------------------
3 files changed, 76 insertions(+), 30 deletions(-)
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 05c1e8ef15e6..a98ed12c0009 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1277,3 +1277,51 @@ int phy_ethtool_nway_reset(struct net_device *ndev)
return phy_restart_aneg(phydev);
}
EXPORT_SYMBOL(phy_ethtool_nway_reset);
+
+int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data)
+{
+ if (!phydev->drv)
+ return -EIO;
+
+ mutex_lock(&phydev->lock);
+ phydev->drv->get_strings(phydev, data);
+ mutex_unlock(&phydev->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(phy_ethtool_get_strings);
+
+int phy_ethtool_get_sset_count(struct phy_device *phydev)
+{
+ int ret;
+
+ if (!phydev->drv)
+ return -EIO;
+
+ if (phydev->drv->get_sset_count &&
+ phydev->drv->get_strings &&
+ phydev->drv->get_stats) {
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->get_sset_count(phydev);
+ mutex_unlock(&phydev->lock);
+
+ return ret;
+ }
+
+ return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(phy_ethtool_get_sset_count);
+
+int phy_ethtool_get_stats(struct phy_device *phydev,
+ struct ethtool_stats *stats, u64 *data)
+{
+ if (!phydev->drv)
+ return -EIO;
+
+ mutex_lock(&phydev->lock);
+ phydev->drv->get_stats(phydev, stats, data);
+ mutex_unlock(&phydev->lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(phy_ethtool_get_stats);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index f0b5870a6d40..6ca81395c545 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1066,6 +1066,26 @@ int phy_ethtool_nway_reset(struct net_device *ndev);
#if IS_ENABLED(CONFIG_PHYLIB)
int __init mdio_bus_init(void);
void mdio_bus_exit(void);
+int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data);
+int phy_ethtool_get_sset_count(struct phy_device *phydev);
+int phy_ethtool_get_stats(struct phy_device *phydev,
+ struct ethtool_stats *stats, u64 *data);
+#else
+int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data)
+{
+ return -EOPNOTSUPP;
+}
+
+int phy_ethtool_get_sset_count(struct phy_device *phydev)
+{
+ return -EOPNOTSUPP;
+}
+
+int phy_ethtool_get_stats(struct phy_device *phydev,
+ struct ethtool_stats *stats, u64 *data)
+{
+ return -EOPNOTSUPP;
+}
#endif
extern struct bus_type mdio_bus_type;
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 03416e6dd5d7..f0d42e093c4a 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -210,23 +210,6 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
return ret;
}
-static int phy_get_sset_count(struct phy_device *phydev)
-{
- int ret;
-
- if (phydev->drv->get_sset_count &&
- phydev->drv->get_strings &&
- phydev->drv->get_stats) {
- mutex_lock(&phydev->lock);
- ret = phydev->drv->get_sset_count(phydev);
- mutex_unlock(&phydev->lock);
-
- return ret;
- }
-
- return -EOPNOTSUPP;
-}
-
static int __ethtool_get_sset_count(struct net_device *dev, int sset)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -245,7 +228,7 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_PHY_STATS) {
if (dev->phydev)
- return phy_get_sset_count(dev->phydev);
+ return phy_ethtool_get_sset_count(dev->phydev);
else
return -EOPNOTSUPP;
}
@@ -272,15 +255,10 @@ static void __ethtool_get_strings(struct net_device *dev,
else if (stringset == ETH_SS_PHY_TUNABLES)
memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
else if (stringset == ETH_SS_PHY_STATS) {
- struct phy_device *phydev = dev->phydev;
-
- if (phydev) {
- mutex_lock(&phydev->lock);
- phydev->drv->get_strings(phydev, data);
- mutex_unlock(&phydev->lock);
- } else {
+ if (dev->phydev)
+ phy_ethtool_get_strings(dev->phydev, data);
+ else
return;
- }
} else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
@@ -2001,7 +1979,7 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
if (!phydev)
return -EOPNOTSUPP;
- n_stats = phy_get_sset_count(phydev);
+ n_stats = phy_ethtool_get_sset_count(dev->phydev);
if (n_stats < 0)
return n_stats;
if (n_stats > S32_MAX / sizeof(u64))
@@ -2016,9 +1994,9 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
if (n_stats && !data)
return -ENOMEM;
- mutex_lock(&phydev->lock);
- phydev->drv->get_stats(phydev, &stats, data);
- mutex_unlock(&phydev->lock);
+ ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
+ if (ret < 0)
+ return ret;
ret = -EFAULT;
if (copy_to_user(useraddr, &stats, sizeof(stats)))
--
2.14.1
^ permalink raw reply related
* [PATCH net-next 0/8] net: Extend availability of PHY statistics
From: Florian Fainelli @ 2018-04-25 19:12 UTC (permalink / raw)
To: netdev
Cc: Florian Fainelli, andrew, vivien.didelot, cphealy, davem,
nikita.yoush
Hi all,
This patch series adds support for retrieving PHY statistics with DSA switches
when the CPU port uses a PHY to PHY connection (as opposed to MAC to MAC).
To get there a number of things are done:
- first we move the code dealing with PHY statistics outside of net/core/ethtool.c
and create helper functions since the same code will be reused
- then we allow network device drivers to provide an ethtool_get_phy_stats callback
when the standard PHY library helpers are not suitable
- we update the DSA functions dealing with ethtool operations to get passed a
stringset instead of assuming ETH_SS_STATS like they currently do
- then we provide a set of standard helpers within DSA as a framework and add
the plumbing to allow retrieving the PHY statistics of the CPU port(s)
- finally plug support for retrieving such PHY statistics with the b53 driver
Changes in v3:
- retrict the b53 change to 539x and 531x5 series of switches
- added a change to dsa_loop.c to help test the feature
Changes in v2:
- got actual testing when the DSA master network device has a PHY that
already provides statistics (thanks Nikita!)
- fixed the kbuild error reported when CONFIG_PHYLIB=n
- removed the checking of ops which is redundant and not needed
Florian Fainelli (8):
net: Move PHY statistics code into PHY library helpers
net: Allow network devices to have PHY statistics
net: dsa: Do not check for ethtool_ops validity
net: dsa: Pass stringset to ethtool operations
net: dsa: Add helper function to obtain PHY device of a given port
net: dsa: Allow providing PHY statistics from CPU port
net: dsa: b53: Add support for reading PHY statistics
net: dsa: loop: Hook PHY statistics
drivers/net/dsa/b53/b53_common.c | 81 +++++++++++++++++++++++++++---
drivers/net/dsa/b53/b53_priv.h | 6 ++-
drivers/net/dsa/bcm_sf2.c | 1 +
drivers/net/dsa/dsa_loop.c | 12 ++++-
drivers/net/dsa/lan9303-core.c | 11 ++++-
drivers/net/dsa/microchip/ksz_common.c | 11 ++++-
drivers/net/dsa/mt7530.c | 11 ++++-
drivers/net/dsa/mv88e6xxx/chip.c | 10 +++-
drivers/net/dsa/qca8k.c | 10 +++-
drivers/net/phy/phy.c | 48 ++++++++++++++++++
include/linux/ethtool.h | 5 ++
include/linux/phy.h | 20 ++++++++
include/net/dsa.h | 12 ++++-
net/core/ethtool.c | 61 ++++++++---------------
net/dsa/master.c | 62 +++++++++++++++++++----
net/dsa/port.c | 90 +++++++++++++++++++++++++++++-----
net/dsa/slave.c | 5 +-
17 files changed, 369 insertions(+), 87 deletions(-)
--
2.14.1
^ permalink raw reply
* [PATCH net-next 2/8] net: Allow network devices to have PHY statistics
From: Florian Fainelli @ 2018-04-25 19:12 UTC (permalink / raw)
To: netdev
Cc: Florian Fainelli, andrew, vivien.didelot, cphealy, davem,
nikita.yoush
In-Reply-To: <20180425191254.3467-1-f.fainelli@gmail.com>
Add a new callback: get_ethtool_phy_stats() which allows network device
drivers not making use of the PHY library to return PHY statistics.
Update ethtool_get_phy_stats(), __ethtool_get_sset_count() and
__ethtool_get_strings() accordingly to interogate the network device
about ETH_SS_PHY_STATS.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
include/linux/ethtool.h | 5 +++++
net/core/ethtool.c | 39 +++++++++++++++++++++------------------
2 files changed, 26 insertions(+), 18 deletions(-)
diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h
index b32cd2062f18..f8a2245b70ac 100644
--- a/include/linux/ethtool.h
+++ b/include/linux/ethtool.h
@@ -312,6 +312,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
* by kernel. Returns a negative error code or zero.
* @get_fecparam: Get the network device Forward Error Correction parameters.
* @set_fecparam: Set the network device Forward Error Correction parameters.
+ * @get_ethtool_phy_stats: Return extended statistics about the PHY device.
+ * This is only useful if the device maintains PHY statistics and
+ * cannot use the standard PHY library helpers.
*
* All operations are optional (i.e. the function pointer may be set
* to %NULL) and callers must take this into account. Callers must
@@ -407,5 +410,7 @@ struct ethtool_ops {
struct ethtool_fecparam *);
int (*set_fecparam)(struct net_device *,
struct ethtool_fecparam *);
+ void (*get_ethtool_phy_stats)(struct net_device *,
+ struct ethtool_stats *, u64 *);
};
#endif /* _LINUX_ETHTOOL_H */
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index f0d42e093c4a..4b8992ccf904 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -226,12 +226,9 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_PHY_TUNABLES)
return ARRAY_SIZE(phy_tunable_strings);
- if (sset == ETH_SS_PHY_STATS) {
- if (dev->phydev)
- return phy_ethtool_get_sset_count(dev->phydev);
- else
- return -EOPNOTSUPP;
- }
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ return phy_ethtool_get_sset_count(dev->phydev);
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
@@ -254,12 +251,10 @@ static void __ethtool_get_strings(struct net_device *dev,
memcpy(data, tunable_strings, sizeof(tunable_strings));
else if (stringset == ETH_SS_PHY_TUNABLES)
memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
- else if (stringset == ETH_SS_PHY_STATS) {
- if (dev->phydev)
- phy_ethtool_get_strings(dev->phydev, data);
- else
- return;
- } else
+ else if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ phy_ethtool_get_strings(dev->phydev, data);
+ else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
}
@@ -1971,15 +1966,19 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_stats stats;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
struct phy_device *phydev = dev->phydev;
+ struct ethtool_stats stats;
u64 *data;
int ret, n_stats;
- if (!phydev)
+ if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count))
return -EOPNOTSUPP;
- n_stats = phy_ethtool_get_sset_count(dev->phydev);
+ if (dev->phydev && !ops->get_ethtool_phy_stats)
+ n_stats = phy_ethtool_get_sset_count(dev->phydev);
+ else
+ n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
if (n_stats < 0)
return n_stats;
if (n_stats > S32_MAX / sizeof(u64))
@@ -1994,9 +1993,13 @@ static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
if (n_stats && !data)
return -ENOMEM;
- ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
- if (ret < 0)
- return ret;
+ if (dev->phydev && !ops->get_ethtool_phy_stats) {
+ ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
+ if (ret < 0)
+ return ret;
+ } else {
+ ops->get_ethtool_phy_stats(dev, &stats, data);
+ }
ret = -EFAULT;
if (copy_to_user(useraddr, &stats, sizeof(stats)))
--
2.14.1
^ permalink raw reply related
* [PATCH net-next 3/8] net: dsa: Do not check for ethtool_ops validity
From: Florian Fainelli @ 2018-04-25 19:12 UTC (permalink / raw)
To: netdev
Cc: Florian Fainelli, andrew, vivien.didelot, cphealy, davem,
nikita.yoush
In-Reply-To: <20180425191254.3467-1-f.fainelli@gmail.com>
This is completely redundant with what netdev_set_default_ethtool_ops()
does, we are always guaranteed to have a valid dev->ethtool_ops pointer,
however, within that structure, not all function calls may be populated,
so we still have to check them individually.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
net/dsa/master.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 90e6df0351eb..9ec16b39ed15 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -22,7 +22,7 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev,
int port = cpu_dp->index;
int count = 0;
- if (ops && ops->get_sset_count && ops->get_ethtool_stats) {
+ if (ops->get_sset_count && ops->get_ethtool_stats) {
count = ops->get_sset_count(dev, ETH_SS_STATS);
ops->get_ethtool_stats(dev, stats, data);
}
@@ -38,7 +38,7 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset)
struct dsa_switch *ds = cpu_dp->ds;
int count = 0;
- if (ops && ops->get_sset_count)
+ if (ops->get_sset_count)
count += ops->get_sset_count(dev, sset);
if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
@@ -64,7 +64,7 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
/* We do not want to be NULL-terminated, since this is a prefix */
pfx[sizeof(pfx) - 1] = '_';
- if (ops && ops->get_sset_count && ops->get_strings) {
+ if (ops->get_sset_count && ops->get_strings) {
mcount = ops->get_sset_count(dev, ETH_SS_STATS);
ops->get_strings(dev, stringset, data);
}
--
2.14.1
^ permalink raw reply related
* [PATCH net-next 4/8] net: dsa: Pass stringset to ethtool operations
From: Florian Fainelli @ 2018-04-25 19:12 UTC (permalink / raw)
To: netdev
Cc: Florian Fainelli, andrew, vivien.didelot, cphealy, davem,
nikita.yoush
In-Reply-To: <20180425191254.3467-1-f.fainelli@gmail.com>
Up until now we largely assumed that we were interested in ETH_SS_STATS
type of strings for all ethtool operations, this is about to change with
the introduction of additional string sets, e.g: ETH_SS_PHY_STATS.
Update all functions to take an appropriate stringset argument and act
on it when it is different than ETH_SS_STATS for now.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
drivers/net/dsa/b53/b53_common.c | 11 +++++++++--
drivers/net/dsa/b53/b53_priv.h | 5 +++--
drivers/net/dsa/dsa_loop.c | 11 +++++++++--
drivers/net/dsa/lan9303-core.c | 11 +++++++++--
drivers/net/dsa/microchip/ksz_common.c | 11 +++++++++--
drivers/net/dsa/mt7530.c | 11 +++++++++--
drivers/net/dsa/mv88e6xxx/chip.c | 10 ++++++++--
drivers/net/dsa/qca8k.c | 10 ++++++++--
include/net/dsa.h | 5 +++--
net/dsa/master.c | 21 +++++++++++++--------
net/dsa/slave.c | 5 +++--
11 files changed, 83 insertions(+), 28 deletions(-)
diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 78616787f2a3..726b2d8c6fe9 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -806,13 +806,17 @@ static unsigned int b53_get_mib_size(struct b53_device *dev)
return B53_MIBS_SIZE;
}
-void b53_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+void b53_get_strings(struct dsa_switch *ds, int port, u32 stringset,
+ uint8_t *data)
{
struct b53_device *dev = ds->priv;
const struct b53_mib_desc *mibs = b53_get_mib(dev);
unsigned int mib_size = b53_get_mib_size(dev);
unsigned int i;
+ if (stringset != ETH_SS_STATS)
+ return;
+
for (i = 0; i < mib_size; i++)
strlcpy(data + i * ETH_GSTRING_LEN,
mibs[i].name, ETH_GSTRING_LEN);
@@ -852,10 +856,13 @@ void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data)
}
EXPORT_SYMBOL(b53_get_ethtool_stats);
-int b53_get_sset_count(struct dsa_switch *ds, int port)
+int b53_get_sset_count(struct dsa_switch *ds, int port, int sset)
{
struct b53_device *dev = ds->priv;
+ if (sset != ETH_SS_STATS)
+ return 0;
+
return b53_get_mib_size(dev);
}
EXPORT_SYMBOL(b53_get_sset_count);
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index 1187ebd79287..b933d5cb5c2d 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -286,9 +286,10 @@ static inline int b53_switch_get_reset_gpio(struct b53_device *dev)
/* Exported functions towards other drivers */
void b53_imp_vlan_setup(struct dsa_switch *ds, int cpu_port);
int b53_configure_vlan(struct dsa_switch *ds);
-void b53_get_strings(struct dsa_switch *ds, int port, uint8_t *data);
+void b53_get_strings(struct dsa_switch *ds, int port, u32 stringset,
+ uint8_t *data);
void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data);
-int b53_get_sset_count(struct dsa_switch *ds, int port);
+int b53_get_sset_count(struct dsa_switch *ds, int port, int sset);
int b53_br_join(struct dsa_switch *ds, int port, struct net_device *bridge);
void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *bridge);
void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state);
diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
index f77be9f85cb3..9354cc08d3fd 100644
--- a/drivers/net/dsa/dsa_loop.c
+++ b/drivers/net/dsa/dsa_loop.c
@@ -86,16 +86,23 @@ static int dsa_loop_setup(struct dsa_switch *ds)
return 0;
}
-static int dsa_loop_get_sset_count(struct dsa_switch *ds, int port)
+static int dsa_loop_get_sset_count(struct dsa_switch *ds, int port, int sset)
{
+ if (sset != ETH_SS_STATS)
+ return 0;
+
return __DSA_LOOP_CNT_MAX;
}
-static void dsa_loop_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+static void dsa_loop_get_strings(struct dsa_switch *ds, int port,
+ u32 stringset, uint8_t *data)
{
struct dsa_loop_priv *ps = ds->priv;
unsigned int i;
+ if (stringset != ETH_SS_STATS)
+ return;
+
for (i = 0; i < __DSA_LOOP_CNT_MAX; i++)
memcpy(data + i * ETH_GSTRING_LEN,
ps->ports[port].mib[i].name, ETH_GSTRING_LEN);
diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c
index fefa454f3e56..b4f6e1a67dd9 100644
--- a/drivers/net/dsa/lan9303-core.c
+++ b/drivers/net/dsa/lan9303-core.c
@@ -977,10 +977,14 @@ static const struct lan9303_mib_desc lan9303_mib[] = {
{ .offset = LAN9303_MAC_TX_LATECOL_0, .name = "TxLateCol", },
};
-static void lan9303_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+static void lan9303_get_strings(struct dsa_switch *ds, int port,
+ u32 stringset, uint8_t *data)
{
unsigned int u;
+ if (stringset != ETH_SS_STATS)
+ return;
+
for (u = 0; u < ARRAY_SIZE(lan9303_mib); u++) {
strncpy(data + u * ETH_GSTRING_LEN, lan9303_mib[u].name,
ETH_GSTRING_LEN);
@@ -1007,8 +1011,11 @@ static void lan9303_get_ethtool_stats(struct dsa_switch *ds, int port,
}
}
-static int lan9303_get_sset_count(struct dsa_switch *ds, int port)
+static int lan9303_get_sset_count(struct dsa_switch *ds, int port, int sset)
{
+ if (sset != ETH_SS_STATS)
+ return 0;
+
return ARRAY_SIZE(lan9303_mib);
}
diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c
index bcb3e6c734f2..7210c49b7922 100644
--- a/drivers/net/dsa/microchip/ksz_common.c
+++ b/drivers/net/dsa/microchip/ksz_common.c
@@ -439,15 +439,22 @@ static void ksz_disable_port(struct dsa_switch *ds, int port,
ksz_port_cfg(dev, port, REG_PORT_CTRL_0, PORT_MAC_LOOPBACK, true);
}
-static int ksz_sset_count(struct dsa_switch *ds, int port)
+static int ksz_sset_count(struct dsa_switch *ds, int port, int sset)
{
+ if (sset != ETH_SS_STATS)
+ return 0;
+
return TOTAL_SWITCH_COUNTER_NUM;
}
-static void ksz_get_strings(struct dsa_switch *ds, int port, uint8_t *buf)
+static void ksz_get_strings(struct dsa_switch *ds, int port,
+ u32 stringset, uint8_t *buf)
{
int i;
+ if (stringset != ETH_SS_STATS)
+ return;
+
for (i = 0; i < TOTAL_SWITCH_COUNTER_NUM; i++) {
memcpy(buf + i * ETH_GSTRING_LEN, mib_names[i].string,
ETH_GSTRING_LEN);
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index 80a4dbc3a499..62e486652e62 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -573,10 +573,14 @@ static int mt7530_phy_write(struct dsa_switch *ds, int port, int regnum,
}
static void
-mt7530_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+mt7530_get_strings(struct dsa_switch *ds, int port, u32 stringset,
+ uint8_t *data)
{
int i;
+ if (stringset != ETH_SS_STATS)
+ return;
+
for (i = 0; i < ARRAY_SIZE(mt7530_mib); i++)
strncpy(data + i * ETH_GSTRING_LEN, mt7530_mib[i].name,
ETH_GSTRING_LEN);
@@ -604,8 +608,11 @@ mt7530_get_ethtool_stats(struct dsa_switch *ds, int port,
}
static int
-mt7530_get_sset_count(struct dsa_switch *ds, int port)
+mt7530_get_sset_count(struct dsa_switch *ds, int port, int sset)
{
+ if (sset != ETH_SS_STATS)
+ return 0;
+
return ARRAY_SIZE(mt7530_mib);
}
diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 3d2091099f7f..8f92ccc0dd54 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -742,11 +742,14 @@ static void mv88e6xxx_atu_vtu_get_strings(uint8_t *data)
}
static void mv88e6xxx_get_strings(struct dsa_switch *ds, int port,
- uint8_t *data)
+ u32 stringset, uint8_t *data)
{
struct mv88e6xxx_chip *chip = ds->priv;
int count = 0;
+ if (stringset != ETH_SS_STATS)
+ return;
+
mutex_lock(&chip->reg_lock);
if (chip->info->ops->stats_get_strings)
@@ -789,12 +792,15 @@ static int mv88e6320_stats_get_sset_count(struct mv88e6xxx_chip *chip)
STATS_TYPE_BANK1);
}
-static int mv88e6xxx_get_sset_count(struct dsa_switch *ds, int port)
+static int mv88e6xxx_get_sset_count(struct dsa_switch *ds, int port, int sset)
{
struct mv88e6xxx_chip *chip = ds->priv;
int serdes_count = 0;
int count = 0;
+ if (sset != ETH_SS_STATS)
+ return 0;
+
mutex_lock(&chip->reg_lock);
if (chip->info->ops->stats_get_sset_count)
count = chip->info->ops->stats_get_sset_count(chip);
diff --git a/drivers/net/dsa/qca8k.c b/drivers/net/dsa/qca8k.c
index 600d5ad1fbde..757b6d90ea36 100644
--- a/drivers/net/dsa/qca8k.c
+++ b/drivers/net/dsa/qca8k.c
@@ -600,10 +600,13 @@ qca8k_phy_write(struct dsa_switch *ds, int phy, int regnum, u16 val)
}
static void
-qca8k_get_strings(struct dsa_switch *ds, int port, uint8_t *data)
+qca8k_get_strings(struct dsa_switch *ds, int port, u32 stringset, uint8_t *data)
{
int i;
+ if (stringset != ETH_SS_STATS)
+ return;
+
for (i = 0; i < ARRAY_SIZE(ar8327_mib); i++)
strncpy(data + i * ETH_GSTRING_LEN, ar8327_mib[i].name,
ETH_GSTRING_LEN);
@@ -631,8 +634,11 @@ qca8k_get_ethtool_stats(struct dsa_switch *ds, int port,
}
static int
-qca8k_get_sset_count(struct dsa_switch *ds, int port)
+qca8k_get_sset_count(struct dsa_switch *ds, int port, int sset)
{
+ if (sset != ETH_SS_STATS)
+ return 0;
+
return ARRAY_SIZE(ar8327_mib);
}
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 60fb4ec8ba61..0bc0aad1b02e 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -356,10 +356,11 @@ struct dsa_switch_ops {
/*
* ethtool hardware statistics.
*/
- void (*get_strings)(struct dsa_switch *ds, int port, uint8_t *data);
+ void (*get_strings)(struct dsa_switch *ds, int port,
+ u32 stringset, uint8_t *data);
void (*get_ethtool_stats)(struct dsa_switch *ds,
int port, uint64_t *data);
- int (*get_sset_count)(struct dsa_switch *ds, int port);
+ int (*get_sset_count)(struct dsa_switch *ds, int port, int sset);
/*
* ethtool Wake-on-LAN
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 9ec16b39ed15..8d27687fd0ca 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -38,11 +38,14 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset)
struct dsa_switch *ds = cpu_dp->ds;
int count = 0;
- if (ops->get_sset_count)
- count += ops->get_sset_count(dev, sset);
+ if (ops->get_sset_count) {
+ count = ops->get_sset_count(dev, sset);
+ if (count < 0)
+ count = 0;
+ }
- if (sset == ETH_SS_STATS && ds->ops->get_sset_count)
- count += ds->ops->get_sset_count(ds, cpu_dp->index);
+ if (ds->ops->get_sset_count)
+ count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
return count;
}
@@ -65,18 +68,20 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
pfx[sizeof(pfx) - 1] = '_';
if (ops->get_sset_count && ops->get_strings) {
- mcount = ops->get_sset_count(dev, ETH_SS_STATS);
+ mcount = ops->get_sset_count(dev, stringset);
+ if (mcount < 0)
+ mcount = 0;
ops->get_strings(dev, stringset, data);
}
- if (stringset == ETH_SS_STATS && ds->ops->get_strings) {
+ if (ds->ops->get_strings) {
ndata = data + mcount * len;
/* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
* the output after to prepend our CPU port prefix we
* constructed earlier
*/
- ds->ops->get_strings(ds, port, ndata);
- count = ds->ops->get_sset_count(ds, port);
+ ds->ops->get_strings(ds, port, stringset, ndata);
+ count = ds->ops->get_sset_count(ds, port, stringset);
for (i = 0; i < count; i++) {
memmove(ndata + (i * len + sizeof(pfx)),
ndata + i * len, len - sizeof(pfx));
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 18561af7a8f1..f3fb3a0880b1 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -560,7 +560,8 @@ static void dsa_slave_get_strings(struct net_device *dev,
strncpy(data + 2 * len, "rx_packets", len);
strncpy(data + 3 * len, "rx_bytes", len);
if (ds->ops->get_strings)
- ds->ops->get_strings(ds, dp->index, data + 4 * len);
+ ds->ops->get_strings(ds, dp->index, stringset,
+ data + 4 * len);
}
}
@@ -605,7 +606,7 @@ static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
count = 4;
if (ds->ops->get_sset_count)
- count += ds->ops->get_sset_count(ds, dp->index);
+ count += ds->ops->get_sset_count(ds, dp->index, sset);
return count;
}
--
2.14.1
^ permalink raw reply related
* [PATCH net-next 5/8] net: dsa: Add helper function to obtain PHY device of a given port
From: Florian Fainelli @ 2018-04-25 19:12 UTC (permalink / raw)
To: netdev
Cc: Florian Fainelli, andrew, vivien.didelot, cphealy, davem,
nikita.yoush
In-Reply-To: <20180425191254.3467-1-f.fainelli@gmail.com>
In preparation for having more call sites attempting to obtain a
reference against a PHY device corresponding to a particular port,
introduce a helper function for that purpose.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
net/dsa/port.c | 33 ++++++++++++++++++++++-----------
1 file changed, 22 insertions(+), 11 deletions(-)
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 7acc1169d75e..5e2a88720a9a 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -273,25 +273,38 @@ int dsa_port_vlan_del(struct dsa_port *dp,
return 0;
}
-static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
+static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
{
- struct device_node *port_dn = dp->dn;
struct device_node *phy_dn;
- struct dsa_switch *ds = dp->ds;
struct phy_device *phydev;
- int port = dp->index;
- int err = 0;
- phy_dn = of_parse_phandle(port_dn, "phy-handle", 0);
+ phy_dn = of_parse_phandle(dp->dn, "phy-handle", 0);
if (!phy_dn)
- return 0;
+ return NULL;
phydev = of_phy_find_device(phy_dn);
if (!phydev) {
- err = -EPROBE_DEFER;
- goto err_put_of;
+ of_node_put(phy_dn);
+ return ERR_PTR(-EPROBE_DEFER);
}
+ return phydev;
+}
+
+static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct phy_device *phydev;
+ int port = dp->index;
+ int err = 0;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (!phydev)
+ return 0;
+
+ if (IS_ERR(phydev))
+ return PTR_ERR(phydev);
+
if (enable) {
err = genphy_config_init(phydev);
if (err < 0)
@@ -317,8 +330,6 @@ static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
err_put_dev:
put_device(&phydev->mdio.dev);
-err_put_of:
- of_node_put(phy_dn);
return err;
}
--
2.14.1
^ permalink raw reply related
* [PATCH net-next 6/8] net: dsa: Allow providing PHY statistics from CPU port
From: Florian Fainelli @ 2018-04-25 19:12 UTC (permalink / raw)
To: netdev
Cc: Florian Fainelli, andrew, vivien.didelot, cphealy, davem,
nikita.yoush
In-Reply-To: <20180425191254.3467-1-f.fainelli@gmail.com>
Implement the same type of ethtool diversion that we have for
ETH_SS_STATS and make it work with ETH_SS_PHY_STATS. This allows
providing PHY level statistics for CPU ports that are directly
connecting to a PHY device.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
include/net/dsa.h | 7 +++++++
net/dsa/master.c | 47 ++++++++++++++++++++++++++++++++++++++++-----
net/dsa/port.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 106 insertions(+), 5 deletions(-)
diff --git a/include/net/dsa.h b/include/net/dsa.h
index 0bc0aad1b02e..462e9741b210 100644
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -361,6 +361,8 @@ struct dsa_switch_ops {
void (*get_ethtool_stats)(struct dsa_switch *ds,
int port, uint64_t *data);
int (*get_sset_count)(struct dsa_switch *ds, int port, int sset);
+ void (*get_ethtool_phy_stats)(struct dsa_switch *ds,
+ int port, uint64_t *data);
/*
* ethtool Wake-on-LAN
@@ -589,4 +591,9 @@ static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev,
#define BRCM_TAG_GET_PORT(v) ((v) >> 8)
#define BRCM_TAG_GET_QUEUE(v) ((v) & 0xff)
+
+int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data);
+int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data);
+int dsa_port_get_phy_sset_count(struct dsa_port *dp);
+
#endif
diff --git a/net/dsa/master.c b/net/dsa/master.c
index 8d27687fd0ca..c90ee3227dea 100644
--- a/net/dsa/master.c
+++ b/net/dsa/master.c
@@ -31,6 +31,32 @@ static void dsa_master_get_ethtool_stats(struct net_device *dev,
ds->ops->get_ethtool_stats(ds, port, data + count);
}
+static void dsa_master_get_ethtool_phy_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ uint64_t *data)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ int port = cpu_dp->index;
+ int count = 0;
+
+ if (dev->phydev && !ops->get_ethtool_phy_stats) {
+ count = phy_ethtool_get_sset_count(dev->phydev);
+ if (count >= 0)
+ phy_ethtool_get_stats(dev->phydev, stats, data);
+ } else if (ops->get_sset_count && ops->get_ethtool_phy_stats) {
+ count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
+ ops->get_ethtool_phy_stats(dev, stats, data);
+ }
+
+ if (count < 0)
+ count = 0;
+
+ if (ds->ops->get_ethtool_phy_stats)
+ ds->ops->get_ethtool_phy_stats(ds, port, data + count);
+}
+
static int dsa_master_get_sset_count(struct net_device *dev, int sset)
{
struct dsa_port *cpu_dp = dev->dsa_ptr;
@@ -38,11 +64,14 @@ static int dsa_master_get_sset_count(struct net_device *dev, int sset)
struct dsa_switch *ds = cpu_dp->ds;
int count = 0;
- if (ops->get_sset_count) {
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats)
+ count = phy_ethtool_get_sset_count(dev->phydev);
+ else if (ops->get_sset_count)
count = ops->get_sset_count(dev, sset);
- if (count < 0)
- count = 0;
- }
+
+ if (count < 0)
+ count = 0;
if (ds->ops->get_sset_count)
count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
@@ -67,7 +96,14 @@ static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
/* We do not want to be NULL-terminated, since this is a prefix */
pfx[sizeof(pfx) - 1] = '_';
- if (ops->get_sset_count && ops->get_strings) {
+ if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+ !ops->get_ethtool_phy_stats) {
+ mcount = phy_ethtool_get_sset_count(dev->phydev);
+ if (mcount < 0)
+ mcount = 0;
+ else
+ phy_ethtool_get_strings(dev->phydev, data);
+ } else if (ops->get_sset_count && ops->get_strings) {
mcount = ops->get_sset_count(dev, stringset);
if (mcount < 0)
mcount = 0;
@@ -107,6 +143,7 @@ static int dsa_master_ethtool_setup(struct net_device *dev)
ops->get_sset_count = dsa_master_get_sset_count;
ops->get_ethtool_stats = dsa_master_get_ethtool_stats;
ops->get_strings = dsa_master_get_strings;
+ ops->get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats;
dev->ethtool_ops = ops;
diff --git a/net/dsa/port.c b/net/dsa/port.c
index 5e2a88720a9a..2413beb995be 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -383,3 +383,60 @@ void dsa_port_link_unregister_of(struct dsa_port *dp)
else
dsa_port_setup_phy_of(dp, false);
}
+
+int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data)
+{
+ struct phy_device *phydev;
+ int ret = -EOPNOTSUPP;
+
+ if (of_phy_is_fixed_link(dp->dn))
+ return ret;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (IS_ERR_OR_NULL(phydev))
+ return ret;
+
+ ret = phy_ethtool_get_strings(phydev, data);
+ put_device(&phydev->mdio.dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_phy_strings);
+
+int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data)
+{
+ struct phy_device *phydev;
+ int ret = -EOPNOTSUPP;
+
+ if (of_phy_is_fixed_link(dp->dn))
+ return ret;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (IS_ERR_OR_NULL(phydev))
+ return ret;
+
+ ret = phy_ethtool_get_stats(phydev, NULL, data);
+ put_device(&phydev->mdio.dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_ethtool_phy_stats);
+
+int dsa_port_get_phy_sset_count(struct dsa_port *dp)
+{
+ struct phy_device *phydev;
+ int ret = -EOPNOTSUPP;
+
+ if (of_phy_is_fixed_link(dp->dn))
+ return ret;
+
+ phydev = dsa_port_get_phy_device(dp);
+ if (IS_ERR_OR_NULL(phydev))
+ return ret;
+
+ ret = phy_ethtool_get_sset_count(phydev);
+ put_device(&phydev->mdio.dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_port_get_phy_sset_count);
--
2.14.1
^ permalink raw reply related
* [PATCH net-next 7/8] net: dsa: b53: Add support for reading PHY statistics
From: Florian Fainelli @ 2018-04-25 19:12 UTC (permalink / raw)
To: netdev
Cc: Florian Fainelli, andrew, vivien.didelot, cphealy, davem,
nikita.yoush
In-Reply-To: <20180425191254.3467-1-f.fainelli@gmail.com>
Allow the b53 driver to return PHY statistics when the CPU port used is
different than 5, 7 or 8, because those are typically PHY-less on most
devices. This is useful for debugging link problems between the switch
and an external host when using a non standard CPU port number (e.g: 4).
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
drivers/net/dsa/b53/b53_common.c | 78 ++++++++++++++++++++++++++++++++++------
drivers/net/dsa/b53/b53_priv.h | 1 +
drivers/net/dsa/bcm_sf2.c | 1 +
3 files changed, 70 insertions(+), 10 deletions(-)
diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c
index 726b2d8c6fe9..9f561fe505cb 100644
--- a/drivers/net/dsa/b53/b53_common.c
+++ b/drivers/net/dsa/b53/b53_common.c
@@ -806,20 +806,39 @@ static unsigned int b53_get_mib_size(struct b53_device *dev)
return B53_MIBS_SIZE;
}
+static struct phy_device *b53_get_phy_device(struct dsa_switch *ds, int port)
+{
+ /* These ports typically do not have built-in PHYs */
+ switch (port) {
+ case B53_CPU_PORT_25:
+ case 7:
+ case B53_CPU_PORT:
+ return NULL;
+ }
+
+ return mdiobus_get_phy(ds->slave_mii_bus, port);
+}
+
void b53_get_strings(struct dsa_switch *ds, int port, u32 stringset,
uint8_t *data)
{
struct b53_device *dev = ds->priv;
const struct b53_mib_desc *mibs = b53_get_mib(dev);
unsigned int mib_size = b53_get_mib_size(dev);
+ struct phy_device *phydev;
unsigned int i;
- if (stringset != ETH_SS_STATS)
- return;
+ if (stringset == ETH_SS_STATS) {
+ for (i = 0; i < mib_size; i++)
+ strlcpy(data + i * ETH_GSTRING_LEN,
+ mibs[i].name, ETH_GSTRING_LEN);
+ } else if (stringset == ETH_SS_PHY_STATS) {
+ phydev = b53_get_phy_device(ds, port);
+ if (!phydev)
+ return;
- for (i = 0; i < mib_size; i++)
- strlcpy(data + i * ETH_GSTRING_LEN,
- mibs[i].name, ETH_GSTRING_LEN);
+ phy_ethtool_get_strings(phydev, data);
+ }
}
EXPORT_SYMBOL(b53_get_strings);
@@ -856,14 +875,34 @@ void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data)
}
EXPORT_SYMBOL(b53_get_ethtool_stats);
+void b53_get_ethtool_phy_stats(struct dsa_switch *ds, int port, uint64_t *data)
+{
+ struct phy_device *phydev;
+
+ phydev = b53_get_phy_device(ds, port);
+ if (!phydev)
+ return;
+
+ phy_ethtool_get_stats(phydev, NULL, data);
+}
+EXPORT_SYMBOL(b53_get_ethtool_phy_stats);
+
int b53_get_sset_count(struct dsa_switch *ds, int port, int sset)
{
struct b53_device *dev = ds->priv;
+ struct phy_device *phydev;
- if (sset != ETH_SS_STATS)
- return 0;
+ if (sset == ETH_SS_STATS) {
+ return b53_get_mib_size(dev);
+ } else if (sset == ETH_SS_PHY_STATS) {
+ phydev = b53_get_phy_device(ds, port);
+ if (!phydev)
+ return 0;
+
+ return phy_ethtool_get_sset_count(phydev);
+ }
- return b53_get_mib_size(dev);
+ return 0;
}
EXPORT_SYMBOL(b53_get_sset_count);
@@ -1484,7 +1523,7 @@ void b53_br_fast_age(struct dsa_switch *ds, int port)
}
EXPORT_SYMBOL(b53_br_fast_age);
-static bool b53_can_enable_brcm_tags(struct dsa_switch *ds, int port)
+static bool b53_possible_cpu_port(struct dsa_switch *ds, int port)
{
/* Broadcom switches will accept enabling Broadcom tags on the
* following ports: 5, 7 and 8, any other port is not supported
@@ -1496,10 +1535,19 @@ static bool b53_can_enable_brcm_tags(struct dsa_switch *ds, int port)
return true;
}
- dev_warn(ds->dev, "Port %d is not Broadcom tag capable\n", port);
return false;
}
+static bool b53_can_enable_brcm_tags(struct dsa_switch *ds, int port)
+{
+ bool ret = b53_possible_cpu_port(ds, port);
+
+ if (!ret)
+ dev_warn(ds->dev, "Port %d is not Broadcom tag capable\n",
+ port);
+ return ret;
+}
+
enum dsa_tag_protocol b53_get_tag_protocol(struct dsa_switch *ds, int port)
{
struct b53_device *dev = ds->priv;
@@ -1657,6 +1705,7 @@ static const struct dsa_switch_ops b53_switch_ops = {
.get_strings = b53_get_strings,
.get_ethtool_stats = b53_get_ethtool_stats,
.get_sset_count = b53_get_sset_count,
+ .get_ethtool_phy_stats = b53_get_ethtool_phy_stats,
.phy_read = b53_phy_read16,
.phy_write = b53_phy_write16,
.adjust_link = b53_adjust_link,
@@ -1961,6 +2010,15 @@ static int b53_switch_init(struct b53_device *dev)
dev->num_ports = dev->cpu_port + 1;
dev->enabled_ports |= BIT(dev->cpu_port);
+ /* Include non standard CPU port built-in PHYs to be probed */
+ if (is539x(dev) || is531x5(dev)) {
+ for (i = 0; i < dev->num_ports; i++) {
+ if (!(dev->ds->phys_mii_mask & BIT(i)) &&
+ !b53_possible_cpu_port(dev->ds, i))
+ dev->ds->phys_mii_mask |= BIT(i);
+ }
+ }
+
dev->ports = devm_kzalloc(dev->dev,
sizeof(struct b53_port) * dev->num_ports,
GFP_KERNEL);
diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h
index b933d5cb5c2d..cc284a514de9 100644
--- a/drivers/net/dsa/b53/b53_priv.h
+++ b/drivers/net/dsa/b53/b53_priv.h
@@ -290,6 +290,7 @@ void b53_get_strings(struct dsa_switch *ds, int port, u32 stringset,
uint8_t *data);
void b53_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data);
int b53_get_sset_count(struct dsa_switch *ds, int port, int sset);
+void b53_get_ethtool_phy_stats(struct dsa_switch *ds, int port, uint64_t *data);
int b53_br_join(struct dsa_switch *ds, int port, struct net_device *bridge);
void b53_br_leave(struct dsa_switch *ds, int port, struct net_device *bridge);
void b53_br_set_stp_state(struct dsa_switch *ds, int port, u8 state);
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index 0378eded31f2..97236cfcbae4 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -859,6 +859,7 @@ static const struct dsa_switch_ops bcm_sf2_ops = {
.get_strings = b53_get_strings,
.get_ethtool_stats = b53_get_ethtool_stats,
.get_sset_count = b53_get_sset_count,
+ .get_ethtool_phy_stats = b53_get_ethtool_phy_stats,
.get_phy_flags = bcm_sf2_sw_get_phy_flags,
.adjust_link = bcm_sf2_sw_adjust_link,
.fixed_link_update = bcm_sf2_sw_fixed_link_update,
--
2.14.1
^ permalink raw reply related
* [PATCH net-next 8/8] net: dsa: loop: Hook PHY statistics
From: Florian Fainelli @ 2018-04-25 19:12 UTC (permalink / raw)
To: netdev
Cc: Florian Fainelli, andrew, vivien.didelot, cphealy, davem,
nikita.yoush
In-Reply-To: <20180425191254.3467-1-f.fainelli@gmail.com>
We just return the same statistics through ethtool_get_stats() and
ethtool_get_phy_stats() for simplicity since this is just a mock-up driver.
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
drivers/net/dsa/dsa_loop.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c
index 9354cc08d3fd..58f14af04639 100644
--- a/drivers/net/dsa/dsa_loop.c
+++ b/drivers/net/dsa/dsa_loop.c
@@ -88,7 +88,7 @@ static int dsa_loop_setup(struct dsa_switch *ds)
static int dsa_loop_get_sset_count(struct dsa_switch *ds, int port, int sset)
{
- if (sset != ETH_SS_STATS)
+ if (sset != ETH_SS_STATS && sset != ETH_SS_PHY_STATS)
return 0;
return __DSA_LOOP_CNT_MAX;
@@ -100,7 +100,7 @@ static void dsa_loop_get_strings(struct dsa_switch *ds, int port,
struct dsa_loop_priv *ps = ds->priv;
unsigned int i;
- if (stringset != ETH_SS_STATS)
+ if (stringset != ETH_SS_STATS && stringset != ETH_SS_PHY_STATS)
return;
for (i = 0; i < __DSA_LOOP_CNT_MAX; i++)
@@ -263,6 +263,7 @@ static const struct dsa_switch_ops dsa_loop_driver = {
.get_strings = dsa_loop_get_strings,
.get_ethtool_stats = dsa_loop_get_ethtool_stats,
.get_sset_count = dsa_loop_get_sset_count,
+ .get_ethtool_phy_stats = dsa_loop_get_ethtool_stats,
.phy_read = dsa_loop_phy_read,
.phy_write = dsa_loop_phy_write,
.port_bridge_join = dsa_loop_port_bridge_join,
--
2.14.1
^ permalink raw reply related
* Re: [net-next 00/10][pull request] 10GbE Intel Wired LAN Driver Updates 2018-04-25
From: David Miller @ 2018-04-25 19:18 UTC (permalink / raw)
To: jeffrey.t.kirsher; +Cc: netdev, nhorman, sassmann, jogreene
In-Reply-To: <20180425153305.25878-1-jeffrey.t.kirsher@intel.com>
From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Wed, 25 Apr 2018 08:32:55 -0700
> This series represents yet another phase of the macvlan cleanup Alex has
> been working on.
>
> The main goal of these changes is to make it so that we only support
> offloading what we can actually offload and we don't break any existing
> functionality. So for example we were claiming to advertise source mode
> macvlan and we were doing nothing of the sort, so support for that has been
> dropped.
>
> The biggest change with this set is that broadcast/multicast replication is
> no longer being supported in software. Alex dropped it as it leads to
> scaling issues when a broadcast frame has to be replicated up to 64 times.
>
> Beyond that this set goes through and optimized the time needed to bring up
> and tear down the macvlan interfaces on ixgbe and provides a clean way for
> us to disable the macvlan offload when needed.
Ok, the macvlan changes look good to me.
> The following are changes since commit c749fa181bd5848be78691d23168ec61ce691b95:
> Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
> and are available in the git repository at:
> git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue 10GbE
I'll pull this into net-next, thanks everyone.
^ permalink raw reply
* Re: KMSAN: uninit-value in _copy_to_iter (2)
From: syzbot @ 2018-04-25 19:19 UTC (permalink / raw)
To: avagin, davem, dingtianhong, edumazet, elena.reshetova, jasowang,
kvm, linux-kernel, matthew, mingo, mst, netdev, pabeni,
syzkaller-bugs, viro, virtualization, willemb
In-Reply-To: <000000000000a5b2b1056a86e98c@google.com>
syzbot has found reproducer for the following crash on
https://github.com/google/kmsan.git/master commit
d2d741e5d1898dfde1a75ea3d29a9a3e2edf0617 (Sun Apr 22 15:05:22 2018 +0000)
kmsan: add initialization for shmem pages
syzbot dashboard link:
https://syzkaller.appspot.com/bug?extid=87cfa083e727a224754b
So far this crash happened 3 times on
https://github.com/google/kmsan.git/master.
C reproducer: https://syzkaller.appspot.com/x/repro.c?id=5122017598636032
syzkaller reproducer:
https://syzkaller.appspot.com/x/repro.syz?id=6680049734385664
Raw console output:
https://syzkaller.appspot.com/x/log.txt?id=5920461749747712
Kernel config: https://syzkaller.appspot.com/x/.config?id=328654897048964367
compiler: clang version 7.0.0 (trunk 329391)
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+87cfa083e727a224754b@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed.
==================================================================
BUG: KMSAN: uninit-value in copyout lib/iov_iter.c:140 [inline]
BUG: KMSAN: uninit-value in _copy_to_iter+0x46d/0x28f0 lib/iov_iter.c:571
CPU: 1 PID: 4516 Comm: syz-executor879 Not tainted 4.16.0+ #87
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x185/0x1d0 lib/dump_stack.c:53
kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
kmsan_internal_check_memory+0x135/0x1e0 mm/kmsan/kmsan.c:1157
kmsan_copy_to_user+0x69/0x160 mm/kmsan/kmsan.c:1199
copyout lib/iov_iter.c:140 [inline]
_copy_to_iter+0x46d/0x28f0 lib/iov_iter.c:571
copy_to_iter include/linux/uio.h:106 [inline]
vhost_chr_read_iter+0x7ac/0xc50 drivers/vhost/vhost.c:1104
vhost_net_chr_read_iter+0xf6/0x130 drivers/vhost/net.c:1365
call_read_iter include/linux/fs.h:1776 [inline]
aio_read+0x5c1/0x6f0 fs/aio.c:1517
io_submit_one fs/aio.c:1633 [inline]
do_io_submit+0x1bb4/0x2f60 fs/aio.c:1698
SYSC_io_submit+0x98/0xb0 fs/aio.c:1723
SyS_io_submit+0x56/0x80 fs/aio.c:1720
do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x4457b9
RSP: 002b:00007ff9343e4da8 EFLAGS: 00000293 ORIG_RAX: 00000000000000d1
RAX: ffffffffffffffda RBX: 00000000006dac44 RCX: 00000000004457b9
RDX: 00000000200001c0 RSI: 0000000000000001 RDI: 00007ff93439a000
RBP: 00000000006dac40 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000293 R12: 901aeeff3a98f9ab
R13: 98c94b26f489688e R14: ae1b2dfa3c87200a R15: 0000000000000001
Uninit was created at:
kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188
kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314
__kmalloc+0x23c/0x350 mm/slub.c:3791
kmalloc include/linux/slab.h:517 [inline]
vhost_new_msg drivers/vhost/vhost.c:2340 [inline]
vhost_iotlb_miss drivers/vhost/vhost.c:1124 [inline]
translate_desc+0xbef/0x1120 drivers/vhost/vhost.c:1829
__vhost_get_user_slow drivers/vhost/vhost.c:812 [inline]
__vhost_get_user drivers/vhost/vhost.c:846 [inline]
vhost_update_used_flags+0x469/0x8d0 drivers/vhost/vhost.c:1715
vhost_vq_init_access+0x173/0xa20 drivers/vhost/vhost.c:1763
vhost_net_set_backend drivers/vhost/net.c:1166 [inline]
vhost_net_ioctl+0x22b0/0x3480 drivers/vhost/net.c:1322
vfs_ioctl fs/ioctl.c:46 [inline]
do_vfs_ioctl+0xaf0/0x2440 fs/ioctl.c:686
SYSC_ioctl+0x1d2/0x260 fs/ioctl.c:701
SyS_ioctl+0x54/0x80 fs/ioctl.c:692
do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
Bytes 4-7 of 72 are uninitialized
==================================================================
^ permalink raw reply
* Re: [PATCH v2 net-next] lan78xx: Lan7801 Support for Fixed PHY
From: Florian Fainelli @ 2018-04-25 19:23 UTC (permalink / raw)
To: Raghuram Chary J, davem; +Cc: netdev, unglinuxdriver, woojung.huh
In-Reply-To: <20180425190711.10479-1-raghuramchary.jallipalli@microchip.com>
On 04/25/2018 12:07 PM, Raghuram Chary J wrote:
> Adding Fixed PHY support to the lan78xx driver.
>
> Signed-off-by: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
> ---
> v0->v1:
> * Remove driver version #define
This should be a separate patch of its own, more comment below.
> * Modify netdev_info to netdev_dbg
> * Move lan7801 specific to new routine and add switch case
> * Minor cleanup
> -static int lan78xx_phy_init(struct lan78xx_net *dev)
> +static int lan7801_phy_init(struct phy_device **phydev,
> + struct lan78xx_net *dev)
Passing a **phydev looks really unnecessary here, why don't just return
a struct phy_device * as a return argument here? If you need to
communicate a specific return value, you can use ERR_PTR()/PTR_ERR() for
that purpose.
> {
> + u32 buf;
> int ret;
> - u32 mii_adv;
> - struct phy_device *phydev;
> -
> - phydev = phy_find_first(dev->mdiobus);
> - if (!phydev) {
> - netdev_err(dev->net, "no PHY found\n");
> - return -EIO;
> - }
> -
> - if ((dev->chipid == ID_REV_CHIP_ID_7800_) ||
> - (dev->chipid == ID_REV_CHIP_ID_7850_)) {
> - phydev->is_internal = true;
> - dev->interface = PHY_INTERFACE_MODE_GMII;
> -
> - } else if (dev->chipid == ID_REV_CHIP_ID_7801_) {
> - if (!phydev->drv) {
> + struct fixed_phy_status fphy_status = {
> + .link = 1,
> + .speed = SPEED_1000,
> + .duplex = DUPLEX_FULL,
> + };
> +
> + if (!*phydev) {
> + netdev_dbg(dev->net, "PHY Not Found!! Registering Fixed PHY\n");
> + *phydev = fixed_phy_register(PHY_POLL, &fphy_status, -1,
> + NULL);
> + if (IS_ERR(*phydev)) {
> + netdev_err(dev->net, "No PHY/fixed_PHY found\n");
> + return -ENODEV;
> + }
> + netdev_dbg(dev->net, "Registered FIXED PHY\n");
> + dev->interface = PHY_INTERFACE_MODE_RGMII;
> + ret = lan78xx_write_reg(dev, MAC_RGMII_ID,
> + MAC_RGMII_ID_TXC_DELAY_EN_);
> + ret = lan78xx_write_reg(dev, RGMII_TX_BYP_DLL, 0x3D00);
> + ret = lan78xx_read_reg(dev, HW_CFG, &buf);
> + buf |= HW_CFG_CLK125_EN_;
> + buf |= HW_CFG_REFCLK25_EN_;
> + ret = lan78xx_write_reg(dev, HW_CFG, buf);
> + } else {
> + if (!(*phydev)->drv) {
> netdev_err(dev->net, "no PHY driver found\n");
> return -EIO;
> }
> -
> dev->interface = PHY_INTERFACE_MODE_RGMII;
> -
> /* external PHY fixup for KSZ9031RNX */
> ret = phy_register_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0,
> ksz9031rnx_fixup);
> if (ret < 0) {
> - netdev_err(dev->net, "fail to register fixup\n");
> + netdev_err(dev->net, "fail to register fixup PHY_KSZ9031RNX\n");
Unrelated message change, that should be a separate commit.
> return ret;
> }
> /* external PHY fixup for LAN8835 */
> ret = phy_register_fixup_for_uid(PHY_LAN8835, 0xfffffff0,
> lan8835_fixup);
> if (ret < 0) {
> - netdev_err(dev->net, "fail to register fixup\n");
> + netdev_err(dev->net, "fail to register fixup for PHY_LAN8835\n");
Same here, can be grouped in the same commit as the one right above.
> return ret;
> }
> /* add more external PHY fixup here if needed */
>
> - phydev->is_internal = false;
> - } else {
> - netdev_err(dev->net, "unknown ID found\n");
> - ret = -EIO;
> - goto error;
> + (*phydev)->is_internal = false;
> + }
> + return 0;
> +}
> +
> +static int lan78xx_phy_init(struct lan78xx_net *dev)
> +{
> + int ret;
> + u32 mii_adv;
> + struct phy_device *phydev;
> +
> + phydev = phy_find_first(dev->mdiobus);
> + switch (dev->chipid) {
> + case ID_REV_CHIP_ID_7801_:
> + ret = lan7801_phy_init(&phydev, dev);
> + if (ret < 0) {
> + netdev_err(dev->net, "lan7801: PHY Init Failed");
> + return ret;
> + }
> + break;
> +
> + case ID_REV_CHIP_ID_7800_:
> + case ID_REV_CHIP_ID_7850_:
> + if (!phydev) {
> + netdev_err(dev->net, "no PHY found\n");
> + return -EIO;
> + }
> + phydev->is_internal = true;
Uggh you are not really supposed to set that, the matching PHY driver
should have PHY_IS_INTERNAL in the .flags member to indicate that to the
core PHY library.
> + dev->interface = PHY_INTERFACE_MODE_GMII;
> + break;
> +
> + default:
> + netdev_err(dev->net, "Unknown CHIP ID found\n");
> + return -EIO;
> }
>
> /* if phyirq is not set, use polling mode in phylib */
> @@ -2067,6 +2103,12 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
> if (ret) {
> netdev_err(dev->net, "can't attach PHY to %s\n",
> dev->mdiobus->id);
> + if (dev->chipid == ID_REV_CHIP_ID_7801_) {
> + phy_unregister_fixup_for_uid(PHY_KSZ9031RNX,
> + 0xfffffff0);
> + phy_unregister_fixup_for_uid(PHY_LAN8835,
> + 0xfffffff0);
> + }
> return -EIO;
> }
>
> @@ -2084,12 +2126,6 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
> dev->fc_autoneg = phydev->autoneg;
>
> return 0;
> -
> -error:
> - phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0);
> - phy_unregister_fixup_for_uid(PHY_LAN8835, 0xfffffff0);
> -
> - return ret;
> }
>
> static int lan78xx_set_rx_max_frame_length(struct lan78xx_net *dev, int size)
> @@ -3487,6 +3523,7 @@ static void lan78xx_disconnect(struct usb_interface *intf)
> struct lan78xx_net *dev;
> struct usb_device *udev;
> struct net_device *net;
> + struct phy_device *phydev;
>
> dev = usb_get_intfdata(intf);
> usb_set_intfdata(intf, NULL);
> @@ -3495,12 +3532,16 @@ static void lan78xx_disconnect(struct usb_interface *intf)
>
> udev = interface_to_usbdev(intf);
> net = dev->net;
> + phydev = net->phydev;
>
> phy_unregister_fixup_for_uid(PHY_KSZ9031RNX, 0xfffffff0);
> phy_unregister_fixup_for_uid(PHY_LAN8835, 0xfffffff0);
>
> phy_disconnect(net->phydev);
>
> + if (phy_is_pseudo_fixed_link(phydev))
> + fixed_phy_unregister(phydev);
That part now looks good.
> +
> unregister_netdev(net);
>
> cancel_delayed_work_sync(&dev->wq);
>
--
Florian
^ permalink raw reply
* [PATCH bpf-next v7 02/10] bpf: add bpf_get_stack helper
From: Yonghong Song @ 2018-04-25 19:29 UTC (permalink / raw)
To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180425192910.556352-1-yhs@fb.com>
Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table,
so some stack traces are missing from user perspective.
This patch implements a new helper, bpf_get_stack, will
send stack traces directly to bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
shared map or bpf_perf_event_output.
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
include/linux/bpf.h | 1 +
include/linux/filter.h | 3 ++-
include/uapi/linux/bpf.h | 19 ++++++++++++--
kernel/bpf/core.c | 5 ++++
kernel/bpf/stackmap.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/verifier.c | 19 ++++++++++++++
kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++-
7 files changed, 160 insertions(+), 4 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ee5275e..2c520b4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -690,6 +690,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_get_stack_proto;
extern const struct bpf_func_proto bpf_sock_map_update_proto;
/* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4da8b23..64899c0 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -468,7 +468,8 @@ struct bpf_prog {
dst_needed:1, /* Do we need dst entry? */
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
- kprobe_override:1; /* Do we override a kprobe? */
+ kprobe_override:1, /* Do we override a kprobe? */
+ has_callchain_buf:1; /* callchain buffer allocated? */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index e667939..663900f 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -529,6 +529,17 @@ union bpf_attr {
* other bits - reserved
* Return: >= 0 stackid on success or negative error
*
+ * int bpf_get_stack(ctx, buf, size, flags)
+ * walk user or kernel stack and store the ips in buf
+ * @ctx: struct pt_regs*
+ * @buf: user buffer to fill stack
+ * @size: the buf size
+ * @flags: bits 0-7 - numer of stack frames to skip
+ * bit 8 - collect user stack instead of kernel
+ * bit 11 - get build-id as well if user stack
+ * other bits - reserved
+ * Return: >= 0 size copied on success or negative error
+ *
* s64 bpf_csum_diff(from, from_size, to, to_size, seed)
* calculate csum diff
* @from: raw from buffer
@@ -851,7 +862,8 @@ union bpf_attr {
FN(msg_pull_data), \
FN(bind), \
FN(xdp_adjust_tail), \
- FN(skb_get_xfrm_state),
+ FN(skb_get_xfrm_state), \
+ FN(get_stack),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -885,11 +897,14 @@ enum bpf_func_id {
/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
#define BPF_F_TUNINFO_IPV6 (1ULL << 0)
-/* BPF_FUNC_get_stackid flags. */
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
#define BPF_F_SKIP_FIELD_MASK 0xffULL
#define BPF_F_USER_STACK (1ULL << 8)
+/* flags used by BPF_FUNC_get_stackid only. */
#define BPF_F_FAST_STACK_CMP (1ULL << 9)
#define BPF_F_REUSE_STACKID (1ULL << 10)
+/* flags used by BPF_FUNC_get_stack only. */
+#define BPF_F_USER_BUILD_ID (1ULL << 11)
/* BPF_FUNC_skb_set_tunnel_key flags. */
#define BPF_F_ZERO_CSUM_TX (1ULL << 1)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d315b39..3e0b6b1 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -31,6 +31,7 @@
#include <linux/rbtree_latch.h>
#include <linux/kallsyms.h>
#include <linux/rcupdate.h>
+#include <linux/perf_event.h>
#include <asm/unaligned.h>
@@ -1709,6 +1710,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
aux = container_of(work, struct bpf_prog_aux, work);
if (bpf_prog_is_dev_bound(aux))
bpf_prog_offload_destroy(aux->prog);
+#ifdef CONFIG_PERF_EVENTS
+ if (aux->prog->has_callchain_buf)
+ put_callchain_buffers();
+#endif
for (i = 0; i < aux->func_cnt; i++)
bpf_jit_free(aux->func[i]);
if (aux->func_cnt) {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 04f6ec1..3ba102b 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -402,6 +402,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+ u64, flags)
+{
+ u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+ bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+ u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+ bool user = flags & BPF_F_USER_STACK;
+ struct perf_callchain_entry *trace;
+ bool kernel = !user;
+ int err = -EINVAL;
+ u64 *ips;
+
+ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+ BPF_F_USER_BUILD_ID)))
+ goto clear;
+ if (kernel && user_build_id)
+ goto clear;
+
+ elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
+ : sizeof(u64);
+ if (unlikely(size % elem_size))
+ goto clear;
+
+ num_elem = size / elem_size;
+ if (sysctl_perf_event_max_stack < num_elem)
+ init_nr = 0;
+ else
+ init_nr = sysctl_perf_event_max_stack - num_elem;
+ trace = get_perf_callchain(regs, init_nr, kernel, user,
+ sysctl_perf_event_max_stack, false, false);
+ if (unlikely(!trace))
+ goto err_fault;
+
+ trace_nr = trace->nr - init_nr;
+ if (trace_nr < skip)
+ goto err_fault;
+
+ trace_nr -= skip;
+ trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+ copy_len = trace_nr * elem_size;
+ ips = trace->ip + skip + init_nr;
+ if (user && user_build_id)
+ stack_map_get_build_id_offset(buf, ips, trace_nr, user);
+ else
+ memcpy(buf, ips, copy_len);
+
+ if (size > copy_len)
+ memset(buf + copy_len, 0, size - copy_len);
+ return copy_len;
+
+err_fault:
+ err = -EFAULT;
+clear:
+ memset(buf, 0, size);
+ return err;
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+ .func = bpf_get_stack,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
/* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index eb1a596..253f6bd 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -22,6 +22,7 @@
#include <linux/stringify.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
+#include <linux/perf_event.h>
#include "disasm.h"
@@ -2450,6 +2451,24 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
if (err)
return err;
+ if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) {
+ const char *err_str;
+
+#ifdef CONFIG_PERF_EVENTS
+ err = get_callchain_buffers(sysctl_perf_event_max_stack);
+ err_str = "cannot get callchain buffer for func %s#%d\n";
+#else
+ err = -ENOTSUPP;
+ err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n";
+#endif
+ if (err) {
+ verbose(env, err_str, func_id_name(func_id), func_id);
+ return err;
+ }
+
+ env->prog->has_callchain_buf = true;
+ }
+
if (changes_data)
clear_all_pkt_pointers(env);
return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d88e96d..fe8476f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,7 @@
#include "trace.h"
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
/**
* trace_call_bpf - invoke BPF program
@@ -577,6 +578,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_output_proto;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto;
+ case BPF_FUNC_get_stack:
+ return &bpf_get_stack_proto;
case BPF_FUNC_perf_event_read_value:
return &bpf_perf_event_read_value_proto;
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
@@ -664,6 +667,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
+ u64, flags)
+{
+ struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
+ return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+ (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_tp = {
+ .func = bpf_get_stack_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -672,6 +694,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
+ case BPF_FUNC_get_stack:
+ return &bpf_get_stack_proto_tp;
default:
return tracing_func_proto(func_id, prog);
}
@@ -734,6 +758,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_output_proto_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_tp;
+ case BPF_FUNC_get_stack:
+ return &bpf_get_stack_proto_tp;
case BPF_FUNC_perf_prog_read_value:
return &bpf_perf_prog_read_value_proto;
default:
@@ -744,7 +770,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
/*
* bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
* to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output and/or bpf_get_stack_id
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
*/
static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
@@ -787,6 +813,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
+ void *, buf, u32, size, u64, flags)
+{
+ struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+ perf_fetch_caller_regs(regs);
+ return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+ (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+ .func = bpf_get_stack_raw_tp,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+};
+
static const struct bpf_func_proto *
raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -795,6 +841,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_output_proto_raw_tp;
case BPF_FUNC_get_stackid:
return &bpf_get_stackid_proto_raw_tp;
+ case BPF_FUNC_get_stack:
+ return &bpf_get_stack_proto_raw_tp;
default:
return tracing_func_proto(func_id, prog);
}
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v7 04/10] bpf: remove never-hit branches in verifier adjust_scalar_min_max_vals
From: Yonghong Song @ 2018-04-25 19:29 UTC (permalink / raw)
To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180425192910.556352-1-yhs@fb.com>
In verifier function adjust_scalar_min_max_vals,
when src_known is false and the opcode is BPF_LSH/BPF_RSH,
early return will happen in the function. So remove
the branch in handling BPF_LSH/BPF_RSH when src_known is false.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
kernel/bpf/verifier.c | 11 ++---------
1 file changed, 2 insertions(+), 9 deletions(-)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 988400e..6e3f859 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2940,10 +2940,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
dst_reg->umin_value <<= umin_val;
dst_reg->umax_value <<= umax_val;
}
- if (src_known)
- dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
- else
- dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
/* We may learn something more from the var_off */
__update_reg_bounds(dst_reg);
break;
@@ -2971,11 +2968,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
*/
dst_reg->smin_value = S64_MIN;
dst_reg->smax_value = S64_MAX;
- if (src_known)
- dst_reg->var_off = tnum_rshift(dst_reg->var_off,
- umin_val);
- else
- dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+ dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
dst_reg->umin_value >>= umax_val;
dst_reg->umax_value >>= umin_val;
/* We may learn something more from the var_off */
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v7 03/10] bpf/verifier: refine retval R0 state for bpf_get_stack helper
From: Yonghong Song @ 2018-04-25 19:29 UTC (permalink / raw)
To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180425192910.556352-1-yhs@fb.com>
The special property of return values for helpers bpf_get_stack
and bpf_probe_read_str are captured in verifier.
Both helpers return a negative error code or
a length, which is equal to or smaller than the buffer
size argument. This additional information in the
verifier can avoid the condition such as "retval > bufsize"
in the bpf program. For example, for the code blow,
usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
if (usize < 0 || usize > max_len)
return 0;
The verifier may have the following errors:
52: (85) call bpf_get_stack#65
R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
R9_w=inv800 R10=fp0,call_-1
53: (bf) r8 = r0
54: (bf) r1 = r8
55: (67) r1 <<= 32
56: (bf) r2 = r1
57: (77) r2 >>= 32
58: (25) if r2 > 0x31f goto pc+33
R0=inv(id=0) R1=inv(id=0,smax_value=9223372032559808512,
umax_value=18446744069414584320,
var_off=(0x0; 0xffffffff00000000))
R2=inv(id=0,umax_value=799,var_off=(0x0; 0x3ff))
R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
R8=inv(id=0) R9=inv800 R10=fp0,call_-1
59: (1f) r9 -= r8
60: (c7) r1 s>>= 32
61: (bf) r2 = r7
62: (0f) r2 += r1
math between map_value pointer and register with unbounded
min value is not allowed
The failure is due to llvm compiler optimization where register "r2",
which is a copy of "r1", is tested for condition while later on "r1"
is used for map_ptr operation. The verifier is not able to track such
inst sequence effectively.
Without the "usize > max_len" condition, there is no llvm optimization
and the below generated code passed verifier:
52: (85) call bpf_get_stack#65
R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
R9_w=inv800 R10=fp0,call_-1
53: (b7) r1 = 0
54: (bf) r8 = r0
55: (67) r8 <<= 32
56: (c7) r8 s>>= 32
57: (6d) if r1 s> r8 goto pc+24
R0=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff))
R1=inv0 R6=ctx(id=0,off=0,imm=0)
R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
R8=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R9=inv800
R10=fp0,call_-1
58: (bf) r2 = r7
59: (0f) r2 += r8
60: (1f) r9 -= r8
61: (bf) r1 = r6
Signed-off-by: Yonghong Song <yhs@fb.com>
---
kernel/bpf/verifier.c | 27 +++++++++++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 253f6bd..988400e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -165,6 +165,8 @@ struct bpf_call_arg_meta {
bool pkt_access;
int regno;
int access_size;
+ s64 msize_smax_value;
+ u64 msize_umax_value;
};
static DEFINE_MUTEX(bpf_verifier_lock);
@@ -1985,6 +1987,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
} else if (arg_type_is_mem_size(arg_type)) {
bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
+ /* remember the mem_size which may be used later
+ * to refine return values.
+ */
+ meta->msize_smax_value = reg->smax_value;
+ meta->msize_umax_value = reg->umax_value;
+
/* The register is SCALAR_VALUE; the access check
* happens using its boundaries.
*/
@@ -2324,6 +2332,23 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return 0;
}
+static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
+ int func_id,
+ struct bpf_call_arg_meta *meta)
+{
+ struct bpf_reg_state *ret_reg = ®s[BPF_REG_0];
+
+ if (ret_type != RET_INTEGER ||
+ (func_id != BPF_FUNC_get_stack &&
+ func_id != BPF_FUNC_probe_read_str))
+ return;
+
+ ret_reg->smax_value = meta->msize_smax_value;
+ ret_reg->umax_value = meta->msize_umax_value;
+ __reg_deduce_bounds(ret_reg);
+ __reg_bound_offset(ret_reg);
+}
+
static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
const struct bpf_func_proto *fn = NULL;
@@ -2447,6 +2472,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
return -EINVAL;
}
+ do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+
err = check_map_func_compatibility(env, meta.map_ptr, func_id);
if (err)
return err;
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v7 07/10] samples/bpf: move common-purpose trace functions to selftests
From: Yonghong Song @ 2018-04-25 19:29 UTC (permalink / raw)
To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180425192910.556352-1-yhs@fb.com>
There is no functionality change in this patch. The common-purpose
trace functions, including perf_event polling and ksym lookup,
are moved from trace_output_user.c and bpf_load.c to
selftests/bpf/trace_helpers.c so that these function can
be reused later in selftests.
Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
samples/bpf/Makefile | 11 +-
samples/bpf/bpf_load.c | 63 ----------
samples/bpf/bpf_load.h | 7 --
samples/bpf/offwaketime_user.c | 1 +
samples/bpf/sampleip_user.c | 1 +
samples/bpf/spintest_user.c | 1 +
samples/bpf/trace_event_user.c | 1 +
samples/bpf/trace_output_user.c | 125 +++----------------
tools/testing/selftests/bpf/trace_helpers.c | 186 ++++++++++++++++++++++++++++
tools/testing/selftests/bpf/trace_helpers.h | 24 ++++
10 files changed, 238 insertions(+), 182 deletions(-)
create mode 100644 tools/testing/selftests/bpf/trace_helpers.c
create mode 100644 tools/testing/selftests/bpf/trace_helpers.h
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index aa8c392..d36444c 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -49,6 +49,7 @@ hostprogs-y += xdp_adjust_tail
# Libbpf dependencies
LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
+TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
test_lru_dist-objs := test_lru_dist.o $(LIBBPF)
sock_example-objs := sock_example.o $(LIBBPF)
@@ -65,10 +66,10 @@ tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
-trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
+trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o $(TRACE_HELPERS)
lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
-offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o
-spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o
+offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o $(TRACE_HELPERS)
+spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o $(TRACE_HELPERS)
map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o
test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o
test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o
@@ -82,8 +83,8 @@ xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o
xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o
test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \
test_current_task_under_cgroup_user.o
-trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o
-sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o
+trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o $(TRACE_HELPERS)
+sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o $(TRACE_HELPERS)
tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o
lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o
xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index bebe418..529972e 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -650,66 +650,3 @@ void read_trace_pipe(void)
}
}
}
-
-#define MAX_SYMS 300000
-static struct ksym syms[MAX_SYMS];
-static int sym_cnt;
-
-static int ksym_cmp(const void *p1, const void *p2)
-{
- return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
-}
-
-int load_kallsyms(void)
-{
- FILE *f = fopen("/proc/kallsyms", "r");
- char func[256], buf[256];
- char symbol;
- void *addr;
- int i = 0;
-
- if (!f)
- return -ENOENT;
-
- while (!feof(f)) {
- if (!fgets(buf, sizeof(buf), f))
- break;
- if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
- break;
- if (!addr)
- continue;
- syms[i].addr = (long) addr;
- syms[i].name = strdup(func);
- i++;
- }
- sym_cnt = i;
- qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
- return 0;
-}
-
-struct ksym *ksym_search(long key)
-{
- int start = 0, end = sym_cnt;
- int result;
-
- while (start < end) {
- size_t mid = start + (end - start) / 2;
-
- result = key - syms[mid].addr;
- if (result < 0)
- end = mid;
- else if (result > 0)
- start = mid + 1;
- else
- return &syms[mid];
- }
-
- if (start >= 1 && syms[start - 1].addr < key &&
- key < syms[start].addr)
- /* valid ksym */
- return &syms[start - 1];
-
- /* out of range. return _stext */
- return &syms[0];
-}
-
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 453c200..2c3d0b4 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -54,12 +54,5 @@ int load_bpf_file(char *path);
int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);
void read_trace_pipe(void);
-struct ksym {
- long addr;
- char *name;
-};
-
-int load_kallsyms(void);
-struct ksym *ksym_search(long key);
int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
#endif
diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c
index 512f87a..f06063a 100644
--- a/samples/bpf/offwaketime_user.c
+++ b/samples/bpf/offwaketime_user.c
@@ -17,6 +17,7 @@
#include <sys/resource.h>
#include "libbpf.h"
#include "bpf_load.h"
+#include "trace_helpers.h"
#define PRINT_RAW_ADDR 0
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
index 4ed690b..60c2b73 100644
--- a/samples/bpf/sampleip_user.c
+++ b/samples/bpf/sampleip_user.c
@@ -22,6 +22,7 @@
#include "libbpf.h"
#include "bpf_load.h"
#include "perf-sys.h"
+#include "trace_helpers.h"
#define DEFAULT_FREQ 99
#define DEFAULT_SECS 5
diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c
index 3d73621..8d3e9cf 100644
--- a/samples/bpf/spintest_user.c
+++ b/samples/bpf/spintest_user.c
@@ -7,6 +7,7 @@
#include <sys/resource.h>
#include "libbpf.h"
#include "bpf_load.h"
+#include "trace_helpers.h"
int main(int ac, char **argv)
{
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
index 56f7a25..1fa1bec 100644
--- a/samples/bpf/trace_event_user.c
+++ b/samples/bpf/trace_event_user.c
@@ -21,6 +21,7 @@
#include "libbpf.h"
#include "bpf_load.h"
#include "perf-sys.h"
+#include "trace_helpers.h"
#define SAMPLE_FREQ 50
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index ccca1e3..cc4b383 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -21,100 +21,10 @@
#include "libbpf.h"
#include "bpf_load.h"
#include "perf-sys.h"
+#include "trace_helpers.h"
static int pmu_fd;
-int page_size;
-int page_cnt = 8;
-volatile struct perf_event_mmap_page *header;
-
-typedef void (*print_fn)(void *data, int size);
-
-static int perf_event_mmap(int fd)
-{
- void *base;
- int mmap_size;
-
- page_size = getpagesize();
- mmap_size = page_size * (page_cnt + 1);
-
- base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
- if (base == MAP_FAILED) {
- printf("mmap err\n");
- return -1;
- }
-
- header = base;
- return 0;
-}
-
-static int perf_event_poll(int fd)
-{
- struct pollfd pfd = { .fd = fd, .events = POLLIN };
-
- return poll(&pfd, 1, 1000);
-}
-
-struct perf_event_sample {
- struct perf_event_header header;
- __u32 size;
- char data[];
-};
-
-static void perf_event_read(print_fn fn)
-{
- __u64 data_tail = header->data_tail;
- __u64 data_head = header->data_head;
- __u64 buffer_size = page_cnt * page_size;
- void *base, *begin, *end;
- char buf[256];
-
- asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
- if (data_head == data_tail)
- return;
-
- base = ((char *)header) + page_size;
-
- begin = base + data_tail % buffer_size;
- end = base + data_head % buffer_size;
-
- while (begin != end) {
- struct perf_event_sample *e;
-
- e = begin;
- if (begin + e->header.size > base + buffer_size) {
- long len = base + buffer_size - begin;
-
- assert(len < e->header.size);
- memcpy(buf, begin, len);
- memcpy(buf + len, base, e->header.size - len);
- e = (void *) buf;
- begin = base + e->header.size - len;
- } else if (begin + e->header.size == base + buffer_size) {
- begin = base;
- } else {
- begin += e->header.size;
- }
-
- if (e->header.type == PERF_RECORD_SAMPLE) {
- fn(e->data, e->size);
- } else if (e->header.type == PERF_RECORD_LOST) {
- struct {
- struct perf_event_header header;
- __u64 id;
- __u64 lost;
- } *lost = (void *) e;
- printf("lost %lld events\n", lost->lost);
- } else {
- printf("unknown event type=%d size=%d\n",
- e->header.type, e->header.size);
- }
- }
-
- __sync_synchronize(); /* smp_mb() */
- header->data_tail = data_head;
-}
-
static __u64 time_get_ns(void)
{
struct timespec ts;
@@ -127,7 +37,7 @@ static __u64 start_time;
#define MAX_CNT 100000ll
-static void print_bpf_output(void *data, int size)
+static int print_bpf_output(void *data, int size)
{
static __u64 cnt;
struct {
@@ -138,7 +48,7 @@ static void print_bpf_output(void *data, int size)
if (e->cookie != 0x12345678) {
printf("BUG pid %llx cookie %llx sized %d\n",
e->pid, e->cookie, size);
- kill(0, SIGINT);
+ return PERF_EVENT_ERROR;
}
cnt++;
@@ -146,8 +56,10 @@ static void print_bpf_output(void *data, int size)
if (cnt == MAX_CNT) {
printf("recv %lld events per sec\n",
MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
- kill(0, SIGINT);
+ return PERF_EVENT_DONE;
}
+
+ return PERF_EVENT_CONT;
}
static void test_bpf_perf_event(void)
@@ -166,10 +78,18 @@ static void test_bpf_perf_event(void)
ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
}
+static void exec_action(void)
+{
+ FILE *f;
+
+ f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
+ (void) f;
+}
+
int main(int argc, char **argv)
{
char filename[256];
- FILE *f;
+ int ret;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
@@ -180,17 +100,8 @@ int main(int argc, char **argv)
test_bpf_perf_event();
- if (perf_event_mmap(pmu_fd) < 0)
- return 1;
-
- f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
- (void) f;
-
start_time = time_get_ns();
- for (;;) {
- perf_event_poll(pmu_fd);
- perf_event_read(print_bpf_output);
- }
-
- return 0;
+ ret = perf_event_poller(pmu_fd, exec_action, print_bpf_output);
+ kill(0, SIGINT);
+ return ret;
}
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
new file mode 100644
index 0000000..00954e3
--- /dev/null
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include <poll.h>
+#include <unistd.h>
+#include <linux/perf_event.h>
+#include <sys/mman.h>
+#include "trace_helpers.h"
+
+#define MAX_SYMS 300000
+static struct ksym syms[MAX_SYMS];
+static int sym_cnt;
+
+static int ksym_cmp(const void *p1, const void *p2)
+{
+ return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
+}
+
+int load_kallsyms(void)
+{
+ FILE *f = fopen("/proc/kallsyms", "r");
+ char func[256], buf[256];
+ char symbol;
+ void *addr;
+ int i = 0;
+
+ if (!f)
+ return -ENOENT;
+
+ while (!feof(f)) {
+ if (!fgets(buf, sizeof(buf), f))
+ break;
+ if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
+ break;
+ if (!addr)
+ continue;
+ syms[i].addr = (long) addr;
+ syms[i].name = strdup(func);
+ i++;
+ }
+ sym_cnt = i;
+ qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
+ return 0;
+}
+
+struct ksym *ksym_search(long key)
+{
+ int start = 0, end = sym_cnt;
+ int result;
+
+ while (start < end) {
+ size_t mid = start + (end - start) / 2;
+
+ result = key - syms[mid].addr;
+ if (result < 0)
+ end = mid;
+ else if (result > 0)
+ start = mid + 1;
+ else
+ return &syms[mid];
+ }
+
+ if (start >= 1 && syms[start - 1].addr < key &&
+ key < syms[start].addr)
+ /* valid ksym */
+ return &syms[start - 1];
+
+ /* out of range. return _stext */
+ return &syms[0];
+}
+
+static int page_size;
+static int page_cnt = 8;
+static volatile struct perf_event_mmap_page *header;
+
+static int perf_event_mmap(int fd)
+{
+ void *base;
+ int mmap_size;
+
+ page_size = getpagesize();
+ mmap_size = page_size * (page_cnt + 1);
+
+ base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+ if (base == MAP_FAILED) {
+ printf("mmap err\n");
+ return -1;
+ }
+
+ header = base;
+ return 0;
+}
+
+static int perf_event_poll(int fd)
+{
+ struct pollfd pfd = { .fd = fd, .events = POLLIN };
+
+ return poll(&pfd, 1, 1000);
+}
+
+struct perf_event_sample {
+ struct perf_event_header header;
+ __u32 size;
+ char data[];
+};
+
+static int perf_event_read(perf_event_print_fn fn)
+{
+ __u64 data_tail = header->data_tail;
+ __u64 data_head = header->data_head;
+ __u64 buffer_size = page_cnt * page_size;
+ void *base, *begin, *end;
+ char buf[256];
+ int ret;
+
+ asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+ if (data_head == data_tail)
+ return PERF_EVENT_CONT;
+
+ base = ((char *)header) + page_size;
+
+ begin = base + data_tail % buffer_size;
+ end = base + data_head % buffer_size;
+
+ while (begin != end) {
+ struct perf_event_sample *e;
+
+ e = begin;
+ if (begin + e->header.size > base + buffer_size) {
+ long len = base + buffer_size - begin;
+
+ assert(len < e->header.size);
+ memcpy(buf, begin, len);
+ memcpy(buf + len, base, e->header.size - len);
+ e = (void *) buf;
+ begin = base + e->header.size - len;
+ } else if (begin + e->header.size == base + buffer_size) {
+ begin = base;
+ } else {
+ begin += e->header.size;
+ }
+
+ if (e->header.type == PERF_RECORD_SAMPLE) {
+ ret = fn(e->data, e->size);
+ if (ret != PERF_EVENT_CONT)
+ return ret;
+ } else if (e->header.type == PERF_RECORD_LOST) {
+ struct {
+ struct perf_event_header header;
+ __u64 id;
+ __u64 lost;
+ } *lost = (void *) e;
+ printf("lost %lld events\n", lost->lost);
+ } else {
+ printf("unknown event type=%d size=%d\n",
+ e->header.type, e->header.size);
+ }
+ }
+
+ __sync_synchronize(); /* smp_mb() */
+ header->data_tail = data_head;
+ return PERF_EVENT_CONT;
+}
+
+int perf_event_poller(int fd, perf_event_exec_fn exec_fn,
+ perf_event_print_fn output_fn)
+{
+ int ret;
+
+ if (perf_event_mmap(fd) < 0)
+ return PERF_EVENT_ERROR;
+
+ exec_fn();
+
+ for (;;) {
+ perf_event_poll(fd);
+ ret = perf_event_read(output_fn);
+ if (ret != PERF_EVENT_CONT)
+ return ret;
+ }
+
+ return PERF_EVENT_DONE;
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
new file mode 100644
index 0000000..8750778
--- /dev/null
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TRACE_HELPER_H
+#define __TRACE_HELPER_H
+
+struct ksym {
+ long addr;
+ char *name;
+};
+
+int load_kallsyms(void);
+struct ksym *ksym_search(long key);
+
+typedef void (*perf_event_exec_fn)(void);
+typedef int (*perf_event_print_fn)(void *data, int size);
+
+/* return code for perf_event_print_fn */
+#define PERF_EVENT_DONE 0
+#define PERF_EVENT_ERROR 1
+#define PERF_EVENT_CONT 2
+
+/* return PERF_EVENT_DONE or PERF_EVENT_ERROR */
+int perf_event_poller(int fd, perf_event_exec_fn exec_fn,
+ perf_event_print_fn output_fn);
+#endif
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v7 06/10] tools/bpf: add bpf_get_stack helper to tools headers
From: Yonghong Song @ 2018-04-25 19:29 UTC (permalink / raw)
To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180425192910.556352-1-yhs@fb.com>
The tools header file bpf.h is synced with kernel uapi bpf.h.
The new helper is also added to bpf_helpers.h.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
tools/include/uapi/linux/bpf.h | 19 +++++++++++++++++--
tools/testing/selftests/bpf/bpf_helpers.h | 2 ++
2 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index e667939..663900f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -529,6 +529,17 @@ union bpf_attr {
* other bits - reserved
* Return: >= 0 stackid on success or negative error
*
+ * int bpf_get_stack(ctx, buf, size, flags)
+ * walk user or kernel stack and store the ips in buf
+ * @ctx: struct pt_regs*
+ * @buf: user buffer to fill stack
+ * @size: the buf size
+ * @flags: bits 0-7 - numer of stack frames to skip
+ * bit 8 - collect user stack instead of kernel
+ * bit 11 - get build-id as well if user stack
+ * other bits - reserved
+ * Return: >= 0 size copied on success or negative error
+ *
* s64 bpf_csum_diff(from, from_size, to, to_size, seed)
* calculate csum diff
* @from: raw from buffer
@@ -851,7 +862,8 @@ union bpf_attr {
FN(msg_pull_data), \
FN(bind), \
FN(xdp_adjust_tail), \
- FN(skb_get_xfrm_state),
+ FN(skb_get_xfrm_state), \
+ FN(get_stack),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -885,11 +897,14 @@ enum bpf_func_id {
/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
#define BPF_F_TUNINFO_IPV6 (1ULL << 0)
-/* BPF_FUNC_get_stackid flags. */
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
#define BPF_F_SKIP_FIELD_MASK 0xffULL
#define BPF_F_USER_STACK (1ULL << 8)
+/* flags used by BPF_FUNC_get_stackid only. */
#define BPF_F_FAST_STACK_CMP (1ULL << 9)
#define BPF_F_REUSE_STACKID (1ULL << 10)
+/* flags used by BPF_FUNC_get_stack only. */
+#define BPF_F_USER_BUILD_ID (1ULL << 11)
/* BPF_FUNC_skb_set_tunnel_key flags. */
#define BPF_F_ZERO_CSUM_TX (1ULL << 1)
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 69d7b91..265f8e0 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -101,6 +101,8 @@ static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
static int (*bpf_skb_get_xfrm_state)(void *ctx, int index, void *state,
int size, int flags) =
(void *) BPF_FUNC_skb_get_xfrm_state;
+static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
+ (void *) BPF_FUNC_get_stack;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions
--
2.9.5
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox