* [PATCH RFC net-next 04/11] udp: paged allocation with gso
From: Willem de Bruijn @ 2018-04-17 20:00 UTC (permalink / raw)
To: netdev; +Cc: Willem de Bruijn
In-Reply-To: <20180417200059.30154-1-willemdebruijn.kernel@gmail.com>
From: Willem de Bruijn <willemb@google.com>
When sending large datagrams that are later segmented, store data in
page frags to avoid copying from linear in skb_segment.
This logic will also be used by zerocopy.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
net/ipv4/ip_output.c | 15 +++++++++++----
net/ipv6/ip6_output.c | 19 ++++++++++++++-----
2 files changed, 25 insertions(+), 9 deletions(-)
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7abfb24ec5e5..9ccd6c28e420 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -878,11 +878,13 @@ static int __ip_append_data(struct sock *sk,
struct rtable *rt = (struct rtable *)cork->dst;
unsigned int wmem_alloc_delta = 0;
u32 tskey = 0;
+ bool paged;
skb = skb_peek_tail(queue);
exthdrlen = !skb ? rt->dst.header_len : 0;
mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+ paged = !!cork->gso_size;
if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
@@ -934,6 +936,7 @@ static int __ip_append_data(struct sock *sk,
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
+ unsigned int pagedlen = 0;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
@@ -954,8 +957,12 @@ static int __ip_append_data(struct sock *sk,
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
- else
+ else if (!paged)
alloclen = fraglen;
+ else {
+ alloclen = min_t(int, fraglen, MAX_HEADER);
+ pagedlen = fraglen - alloclen;
+ }
alloclen += exthdrlen;
@@ -999,7 +1006,7 @@ static int __ip_append_data(struct sock *sk,
/*
* Find where to start putting bytes.
*/
- data = skb_put(skb, fraglen + exthdrlen);
+ data = skb_put(skb, fraglen + exthdrlen - pagedlen);
skb_set_network_header(skb, exthdrlen);
skb->transport_header = (skb->network_header +
fragheaderlen);
@@ -1015,7 +1022,7 @@ static int __ip_append_data(struct sock *sk,
pskb_trim_unique(skb_prev, maxfraglen);
}
- copy = datalen - transhdrlen - fraggap;
+ copy = datalen - transhdrlen - fraggap - pagedlen;
if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
@@ -1023,7 +1030,7 @@ static int __ip_append_data(struct sock *sk,
}
offset += copy;
- length -= datalen - fraggap;
+ length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
csummode = CHECKSUM_NONE;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 3ce947c1d173..9fbcec4fb946 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1270,6 +1270,7 @@ static int __ip6_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE;
unsigned int maxnonfragsize, headersize;
unsigned int wmem_alloc_delta = 0;
+ bool paged;
skb = skb_peek_tail(queue);
if (!skb) {
@@ -1277,6 +1278,7 @@ static int __ip6_append_data(struct sock *sk,
dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
}
+ paged = !!cork->gso_size;
mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
orig_mtu = mtu;
@@ -1368,6 +1370,7 @@ static int __ip6_append_data(struct sock *sk,
unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen;
+ unsigned int pagedlen = 0;
alloc_new_skb:
/* There's no room in the current skb */
if (skb)
@@ -1390,11 +1393,17 @@ static int __ip6_append_data(struct sock *sk,
if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
+ fraglen = datalen + fragheaderlen;
+
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
- else
- alloclen = datalen + fragheaderlen;
+ else if (!paged)
+ alloclen = fraglen;
+ else {
+ alloclen = min_t(int, fraglen, MAX_HEADER);
+ pagedlen = fraglen - alloclen;
+ }
alloclen += dst_exthdrlen;
@@ -1416,7 +1425,7 @@ static int __ip6_append_data(struct sock *sk,
*/
alloclen += sizeof(struct frag_hdr);
- copy = datalen - transhdrlen - fraggap;
+ copy = datalen - transhdrlen - fraggap - pagedlen;
if (copy < 0) {
err = -EINVAL;
goto error;
@@ -1455,7 +1464,7 @@ static int __ip6_append_data(struct sock *sk,
/*
* Find where to start putting bytes
*/
- data = skb_put(skb, fraglen);
+ data = skb_put(skb, fraglen - pagedlen);
skb_set_network_header(skb, exthdrlen);
data += fragheaderlen;
skb->transport_header = (skb->network_header +
@@ -1478,7 +1487,7 @@ static int __ip6_append_data(struct sock *sk,
}
offset += copy;
- length -= datalen - fraggap;
+ length -= copy + transhdrlen;
transhdrlen = 0;
exthdrlen = 0;
dst_exthdrlen = 0;
--
2.17.0.484.g0c8726318c-goog
^ permalink raw reply related
* [PATCH RFC net-next 05/11] udp: add gso segment cmsg
From: Willem de Bruijn @ 2018-04-17 20:00 UTC (permalink / raw)
To: netdev; +Cc: Willem de Bruijn
In-Reply-To: <20180417200059.30154-1-willemdebruijn.kernel@gmail.com>
From: Willem de Bruijn <willemb@google.com>
Allow specifying segment size in the send call.
The new control message performs the same function as socket option
UDP_SEGMENT while avoiding the extra system call.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
include/net/udp.h | 1 +
net/ipv4/udp.c | 43 +++++++++++++++++++++++++++++++++++++++++--
net/ipv6/udp.c | 5 ++++-
3 files changed, 46 insertions(+), 3 deletions(-)
diff --git a/include/net/udp.h b/include/net/udp.h
index 741d888d0fdb..05990746810e 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -273,6 +273,7 @@ int udp_abort(struct sock *sk, int err);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
int udp_push_pending_frames(struct sock *sk);
void udp_flush_pending_frames(struct sock *sk);
+int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst);
int udp_rcv(struct sk_buff *skb);
int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 731772a69043..2bad61a05c5e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -851,6 +851,42 @@ int udp_push_pending_frames(struct sock *sk)
}
EXPORT_SYMBOL(udp_push_pending_frames);
+static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
+{
+ switch (cmsg->cmsg_type) {
+ case UDP_SEGMENT:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16)))
+ return -EINVAL;
+ *gso_size = *(__u16 *)CMSG_DATA(cmsg);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
+{
+ struct cmsghdr *cmsg;
+ bool need_ip = false;
+ int err;
+
+ for_each_cmsghdr(cmsg, msg) {
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_UDP) {
+ need_ip = true;
+ continue;
+ }
+
+ err = __udp_cmsg_send(cmsg, gso_size);
+ if (err)
+ return err;
+ }
+
+ return need_ip;
+}
+
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
@@ -939,8 +975,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.gso_size = up->gso_size;
if (msg->msg_controllen) {
- err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6);
- if (unlikely(err)) {
+ err = udp_cmsg_send(sk, msg, &ipc.gso_size);
+ if (err > 0)
+ err = ip_cmsg_send(sk, msg, &ipc,
+ sk->sk_family == AF_INET6);
+ if (unlikely(err < 0)) {
kfree(ipc.opt);
return err;
}
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 16aee23b1e5f..c8b3289bdbd0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1274,7 +1274,10 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
opt->tot_len = sizeof(*opt);
ipc6.opt = opt;
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6, &sockc);
+ err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
+ if (err > 0)
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
+ &ipc6, &sockc);
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
--
2.17.0.484.g0c8726318c-goog
^ permalink raw reply related
* [PATCH RFC net-next 06/11] udp: add gso support to virtual devices
From: Willem de Bruijn @ 2018-04-17 20:00 UTC (permalink / raw)
To: netdev; +Cc: Willem de Bruijn
In-Reply-To: <20180417200059.30154-1-willemdebruijn.kernel@gmail.com>
From: Willem de Bruijn <willemb@google.com>
Virtual devices such as tunnels and bonding can handle large packets.
Only segment packets when reaching a physical or loopback device.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
include/linux/netdev_features.h | 3 +++
1 file changed, 3 insertions(+)
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 35b79f47a13d..1e4883bb02a7 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -80,6 +80,7 @@ enum {
NETIF_F_GRO_HW_BIT, /* Hardware Generic receive offload */
NETIF_F_HW_TLS_RECORD_BIT, /* Offload TLS record */
+ NETIF_F_GSO_UDP_L4_BIT, /* UDP payload GSO (not UFO) */
/*
* Add your fresh new feature above and remember to update
@@ -147,6 +148,7 @@ enum {
#define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM)
#define NETIF_F_RX_UDP_TUNNEL_PORT __NETIF_F(RX_UDP_TUNNEL_PORT)
#define NETIF_F_HW_TLS_RECORD __NETIF_F(HW_TLS_RECORD)
+#define NETIF_F_GSO_UDP_L4 __NETIF_F(GSO_UDP_L4)
#define for_each_netdev_feature(mask_addr, bit) \
for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
@@ -216,6 +218,7 @@ enum {
NETIF_F_GSO_GRE_CSUM | \
NETIF_F_GSO_IPXIP4 | \
NETIF_F_GSO_IPXIP6 | \
+ NETIF_F_GSO_UDP_L4 | \
NETIF_F_GSO_UDP_TUNNEL | \
NETIF_F_GSO_UDP_TUNNEL_CSUM)
--
2.17.0.484.g0c8726318c-goog
^ permalink raw reply related
* [PATCH RFC net-next 08/11] selftests: udp gso
From: Willem de Bruijn @ 2018-04-17 20:00 UTC (permalink / raw)
To: netdev; +Cc: Willem de Bruijn
In-Reply-To: <20180417200059.30154-1-willemdebruijn.kernel@gmail.com>
From: Willem de Bruijn <willemb@google.com>
Validate udp gso, including edge cases (such as min/max gso sizes).
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
tools/testing/selftests/net/.gitignore | 1 +
tools/testing/selftests/net/Makefile | 3 +-
tools/testing/selftests/net/udpgso.c | 486 +++++++++++++++++++++++++
tools/testing/selftests/net/udpgso.sh | 16 +
4 files changed, 505 insertions(+), 1 deletion(-)
create mode 100644 tools/testing/selftests/net/udpgso.c
create mode 100755 tools/testing/selftests/net/udpgso.sh
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index c612d6e38c62..6333b789c8fa 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -7,3 +7,4 @@ reuseport_bpf_cpu
reuseport_bpf_numa
reuseport_dualstack
reuseaddr_conflict
+udpgso
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 23e725f4305e..a05a657489b9 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -5,12 +5,13 @@ CFLAGS = -Wall -Wl,--no-as-needed -O2 -g
CFLAGS += -I../../../../usr/include/
TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
-TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh
+TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh
TEST_GEN_FILES = socket
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
TEST_GEN_FILES += tcp_mmap
TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict
+TEST_GEN_PROGS += udpgso
include ../lib.mk
diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
new file mode 100644
index 000000000000..68a230dfee73
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso.c
@@ -0,0 +1,486 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <stddef.h>
+#include <arpa/inet.h>
+#include <error.h>
+#include <errno.h>
+#include <net/if.h>
+#include <linux/in.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <netinet/if_ether.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/udp.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifndef ETH_MAX_MTU
+#define ETH_MAX_MTU 0xFFFFU
+#endif
+
+#ifndef UDP_SEGMENT
+#define UDP_SEGMENT 103
+#endif
+
+#define CONST_MTU_TEST 1500
+
+#define CONST_HDRLEN_V4 (sizeof(struct iphdr) + sizeof(struct udphdr))
+#define CONST_HDRLEN_V6 (sizeof(struct ip6_hdr) + sizeof(struct udphdr))
+
+#define CONST_MSS_V4 (CONST_MTU_TEST - CONST_HDRLEN_V4)
+#define CONST_MSS_V6 (CONST_MTU_TEST - CONST_HDRLEN_V6)
+
+#define CONST_MAX_SEGS_V4 (ETH_MAX_MTU / CONST_MSS_V4)
+#define CONST_MAX_SEGS_V6 (ETH_MAX_MTU / CONST_MSS_V6)
+
+static bool cfg_do_ipv4;
+static bool cfg_do_ipv6;
+static bool cfg_do_connectionless;
+static bool cfg_do_setsockopt;
+static int cfg_specific_test_id = -1;
+
+static const char cfg_ifname[] = "lo";
+static unsigned short cfg_port = 9000;
+
+static char buf[ETH_MAX_MTU];
+
+struct testcase {
+ int tlen; /* send() buffer size, may exceed mss */
+ bool tfail; /* send() call is expected to fail */
+ int gso_len; /* mss after applying gso */
+ int r_num_mss; /* recv(): number of calls of full mss */
+ int r_len_last; /* recv(): size of last non-mss dgram, if any */
+};
+
+const struct in6_addr addr6 = IN6ADDR_LOOPBACK_INIT;
+const struct in_addr addr4 = { .s_addr = __constant_htonl(INADDR_LOOPBACK + 2) };
+
+struct testcase testcases_v4[] = {
+ {
+ /* no GSO: send a single byte */
+ .tlen = 1,
+ .r_len_last = 1,
+ },
+ {
+ /* no GSO: send a single MSS */
+ .tlen = CONST_MSS_V4,
+ .r_num_mss = 1,
+ },
+ {
+ /* no GSO: send a single MSS + 1B: fail */
+ .tlen = CONST_MSS_V4 + 1,
+ .tfail = true,
+ },
+ {
+ /* send a single MSS: will fail with GSO, because the segment
+ * logic in udp4_ufo_fragment demands a gso skb to be > MTU
+ */
+ .tlen = CONST_MSS_V4,
+ .gso_len = CONST_MSS_V4,
+ .tfail = true,
+ .r_num_mss = 1,
+ },
+ {
+ /* send a single MSS + 1B */
+ .tlen = CONST_MSS_V4 + 1,
+ .gso_len = CONST_MSS_V4,
+ .r_num_mss = 1,
+ .r_len_last = 1,
+ },
+ {
+ /* send exactly 2 MSS */
+ .tlen = CONST_MSS_V4 * 2,
+ .gso_len = CONST_MSS_V4,
+ .r_num_mss = 2,
+ },
+ {
+ /* send 2 MSS + 1B */
+ .tlen = (CONST_MSS_V4 * 2) + 1,
+ .gso_len = CONST_MSS_V4,
+ .r_num_mss = 2,
+ .r_len_last = 1,
+ },
+ {
+ /* send MAX segs */
+ .tlen = (ETH_MAX_MTU / CONST_MSS_V4) * CONST_MSS_V4,
+ .gso_len = CONST_MSS_V4,
+ .r_num_mss = (ETH_MAX_MTU / CONST_MSS_V4),
+ },
+
+ {
+ /* send MAX bytes */
+ .tlen = ETH_MAX_MTU - CONST_HDRLEN_V4,
+ .gso_len = CONST_MSS_V4,
+ .r_num_mss = CONST_MAX_SEGS_V4,
+ .r_len_last = ETH_MAX_MTU - CONST_HDRLEN_V4 -
+ (CONST_MAX_SEGS_V4 * CONST_MSS_V4),
+ },
+ {
+ /* send MAX + 1: fail */
+ .tlen = ETH_MAX_MTU - CONST_HDRLEN_V4 + 1,
+ .gso_len = CONST_MSS_V4,
+ .tfail = true,
+ },
+ {
+ /* EOL */
+ }
+};
+
+#ifndef IP6_MAX_MTU
+#define IP6_MAX_MTU (ETH_MAX_MTU + sizeof(struct ip6_hdr))
+#endif
+
+struct testcase testcases_v6[] = {
+ {
+ /* no GSO: send a single byte */
+ .tlen = 1,
+ .r_len_last = 1,
+ },
+ {
+ /* no GSO: send a single MSS */
+ .tlen = CONST_MSS_V6,
+ .r_num_mss = 1,
+ },
+ {
+ /* no GSO: send a single MSS + 1B: fail */
+ .tlen = CONST_MSS_V6 + 1,
+ .tfail = true,
+ },
+ {
+ /* send a single MSS: will fail with GSO, because the segment
+ * logic in udp4_ufo_fragment demands a gso skb to be > MTU
+ */
+ .tlen = CONST_MSS_V6,
+ .gso_len = CONST_MSS_V6,
+ .tfail = true,
+ .r_num_mss = 1,
+ },
+ {
+ /* send a single MSS + 1B */
+ .tlen = CONST_MSS_V6 + 1,
+ .gso_len = CONST_MSS_V6,
+ .r_num_mss = 1,
+ .r_len_last = 1,
+ },
+ {
+ /* send exactly 2 MSS */
+ .tlen = CONST_MSS_V6 * 2,
+ .gso_len = CONST_MSS_V6,
+ .r_num_mss = 2,
+ },
+ {
+ /* send 2 MSS + 1B */
+ .tlen = (CONST_MSS_V6 * 2) + 1,
+ .gso_len = CONST_MSS_V6,
+ .r_num_mss = 2,
+ .r_len_last = 1,
+ },
+ {
+ /* send MAX segs */
+ .tlen = (IP6_MAX_MTU / CONST_MSS_V6) * CONST_MSS_V6,
+ .gso_len = CONST_MSS_V6,
+ .r_num_mss = (IP6_MAX_MTU / CONST_MSS_V6),
+ },
+
+ {
+ /* send MAX bytes */
+ .tlen = IP6_MAX_MTU - CONST_HDRLEN_V6,
+ .gso_len = CONST_MSS_V6,
+ .r_num_mss = CONST_MAX_SEGS_V6,
+ .r_len_last = IP6_MAX_MTU - CONST_HDRLEN_V6 -
+ (CONST_MAX_SEGS_V6 * CONST_MSS_V6),
+ },
+ {
+ /* send MAX + 1: fail */
+ .tlen = IP6_MAX_MTU - CONST_HDRLEN_V6 + 1,
+ .gso_len = CONST_MSS_V6,
+ .tfail = true,
+ },
+ {
+ /* EOL */
+ }
+};
+
+static unsigned int get_device_mtu(int fd, const char *ifname)
+{
+ struct ifreq ifr;
+
+ memset(&ifr, 0, sizeof(ifr));
+
+ strcpy(ifr.ifr_name, ifname);
+
+ if (ioctl(fd, SIOCGIFMTU, &ifr))
+ error(1, errno, "ioctl get mtu");
+
+ return ifr.ifr_mtu;
+}
+
+static void __set_device_mtu(int fd, const char *ifname, unsigned int mtu)
+{
+ struct ifreq ifr;
+
+ memset(&ifr, 0, sizeof(ifr));
+
+ ifr.ifr_mtu = mtu;
+ strcpy(ifr.ifr_name, ifname);
+
+ if (ioctl(fd, SIOCSIFMTU, &ifr))
+ error(1, errno, "ioctl set mtu");
+}
+
+static void set_device_mtu(int fd, int mtu)
+{
+ int val;
+
+ val = get_device_mtu(fd, cfg_ifname);
+ fprintf(stderr, "device mtu (orig): %u\n", val);
+
+ __set_device_mtu(fd, cfg_ifname, mtu);
+ val = get_device_mtu(fd, cfg_ifname);
+ if (val != mtu)
+ error(1, 0, "unable to set device mtu to %u\n", val);
+
+ fprintf(stderr, "device mtu (test): %u\n", val);
+}
+
+static void set_pmtu_discover(int fd, bool is_ipv4)
+{
+ int level, name, val;
+
+ if (is_ipv4) {
+ level = SOL_IP;
+ name = IP_MTU_DISCOVER;
+ val = IP_PMTUDISC_DO;
+ } else {
+ level = SOL_IPV6;
+ name = IPV6_MTU_DISCOVER;
+ val = IPV6_PMTUDISC_DO;
+ }
+
+ if (setsockopt(fd, level, name, &val, sizeof(val)))
+ error(1, errno, "setsockopt path mtu");
+}
+
+static bool send_one(int fd, int len, int gso_len,
+ struct sockaddr *addr, socklen_t alen)
+{
+ char control[CMSG_SPACE(sizeof(uint16_t))] = {0};
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ struct cmsghdr *cm;
+ int ret;
+
+ iov.iov_base = buf;
+ iov.iov_len = len;
+
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ msg.msg_name = addr;
+ msg.msg_namelen = alen;
+
+ if (gso_len && !cfg_do_setsockopt) {
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+
+ cm = CMSG_FIRSTHDR(&msg);
+ cm->cmsg_level = SOL_UDP;
+ cm->cmsg_type = UDP_SEGMENT;
+ cm->cmsg_len = CMSG_LEN(sizeof(uint16_t));
+ *((uint16_t *) CMSG_DATA(cm)) = gso_len;
+ }
+
+ ret = sendmsg(fd, &msg, 0);
+ if (ret == -1 && (errno == EMSGSIZE || errno == ENOMEM))
+ return false;
+ if (ret == -1)
+ error(1, errno, "sendmsg");
+ if (ret != len)
+ error(1, 0, "sendto: %d != %u", ret, len);
+
+ return true;
+}
+
+static int recv_one(int fd, int flags)
+{
+ int ret;
+
+ ret = recv(fd, buf, sizeof(buf), flags);
+ if (ret == -1 && errno == EAGAIN && (flags & MSG_DONTWAIT))
+ return 0;
+ if (ret == -1)
+ error(1, errno, "recv");
+
+ return ret;
+}
+
+static void run_one(struct testcase *test, int fdt, int fdr,
+ struct sockaddr *addr, socklen_t alen)
+{
+ int i, ret, val, mss;
+ bool sent;
+
+ fprintf(stderr, "ipv%d tx:%d gso:%d %s\n",
+ addr->sa_family == AF_INET ? 4 : 6,
+ test->tlen, test->gso_len,
+ test->tfail ? "(fail)" : "");
+
+ val = test->gso_len;
+ if (cfg_do_setsockopt) {
+ if (setsockopt(fdt, SOL_UDP, UDP_SEGMENT, &val, sizeof(val)))
+ error(1, errno, "setsockopt udp segment");
+ }
+
+ sent = send_one(fdt, test->tlen, test->gso_len, addr, alen);
+ if (sent && test->tfail)
+ error(1, 0, "send succeeded while expecting failure");
+ if (!sent && !test->tfail)
+ error(1, 0, "send failed while expecting success");
+ if (!sent)
+ return;
+
+ mss = addr->sa_family == AF_INET ? CONST_MSS_V4 : CONST_MSS_V6;
+
+ /* Recv all full MSS datagrams */
+ for (i = 0; i < test->r_num_mss; i++) {
+ ret = recv_one(fdr, 0);
+ if (ret != mss)
+ error(1, 0, "recv.%d: %d != %d", i, ret, mss);
+ }
+
+ /* Recv the non-full last datagram, if tlen was not a multiple of mss */
+ if (test->r_len_last) {
+ ret = recv_one(fdr, 0);
+ if (ret != test->r_len_last)
+ error(1, 0, "recv.%d: %d != %d (last)",
+ i, ret, test->r_len_last);
+ }
+
+ /* Verify received all data */
+ ret = recv_one(fdr, MSG_DONTWAIT);
+ if (ret)
+ error(1, 0, "recv: unexpected datagram");
+}
+
+static void run_all(int fdt, int fdr, struct sockaddr *addr, socklen_t alen)
+{
+ struct testcase *tests, *test;
+
+ tests = addr->sa_family == AF_INET ? testcases_v4 : testcases_v6;
+
+ for (test = tests; test->tlen; test++) {
+ /* if a specific test is given, then skip all others */
+ if (cfg_specific_test_id == -1 ||
+ cfg_specific_test_id == test - tests)
+ run_one(test, fdt, fdr, addr, alen);
+ }
+}
+
+static void run_test(struct sockaddr *addr, socklen_t alen)
+{
+ struct timeval tv = { .tv_usec = 100 * 1000 };
+ int fdr, fdt;
+
+ fdr = socket(addr->sa_family, SOCK_DGRAM, 0);
+ if (fdr == -1)
+ error(1, errno, "socket r");
+
+ if (bind(fdr, addr, alen))
+ error(1, errno, "bind");
+
+ /* Have tests fail quickly instead of hang */
+ if (setsockopt(fdr, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)))
+ error(1, errno, "setsockopt rcv timeout");
+
+ fdt = socket(addr->sa_family, SOCK_DGRAM, 0);
+ if (fdt == -1)
+ error(1, errno, "socket t");
+
+ /* Do not fragment these datagrams: only succeed if GSO works */
+ set_pmtu_discover(fdt, addr->sa_family == AF_INET);
+
+ if (cfg_do_connectionless) {
+ set_device_mtu(fdt, CONST_MTU_TEST);
+ run_all(fdt, fdr, addr, alen);
+ }
+
+ if (close(fdt))
+ error(1, errno, "close t");
+ if (close(fdr))
+ error(1, errno, "close r");
+}
+
+static void run_test_v4(void)
+{
+ struct sockaddr_in addr = {0};
+
+ addr.sin_family = AF_INET;
+ addr.sin_port = htons(cfg_port);
+ addr.sin_addr = addr4;
+
+ run_test((void *)&addr, sizeof(addr));
+}
+
+static void run_test_v6(void)
+{
+ struct sockaddr_in6 addr = {0};
+
+ addr.sin6_family = AF_INET6;
+ addr.sin6_port = htons(cfg_port);
+ addr.sin6_addr = addr6;
+
+ run_test((void *)&addr, sizeof(addr));
+}
+
+static void parse_opts(int argc, char **argv)
+{
+ int c;
+
+ while ((c = getopt(argc, argv, "46Cst:")) != -1) {
+ switch (c) {
+ case '4':
+ cfg_do_ipv4 = true;
+ break;
+ case '6':
+ cfg_do_ipv6 = true;
+ break;
+ case 'C':
+ cfg_do_connectionless = true;
+ break;
+ case 's':
+ cfg_do_setsockopt = true;
+ break;
+ case 't':
+ cfg_specific_test_id = strtoul(optarg, NULL, 0);
+ break;
+ default:
+ error(1, 0, "%s: parse error", argv[0]);
+ }
+ }
+}
+
+int main(int argc, char **argv)
+{
+ parse_opts(argc, argv);
+
+ if (cfg_do_ipv4)
+ run_test_v4();
+ if (cfg_do_ipv6)
+ run_test_v6();
+
+ fprintf(stderr, "OK\n");
+ return 0;
+}
+
diff --git a/tools/testing/selftests/net/udpgso.sh b/tools/testing/selftests/net/udpgso.sh
new file mode 100755
index 000000000000..7977b97e060c
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso.sh
@@ -0,0 +1,16 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of udpgso regression tests
+
+echo "ipv4 cmsg"
+./in_netns.sh ./udpgso -4 -C
+
+echo "ipv4 setsockopt"
+./in_netns.sh ./udpgso -4 -C -s
+
+echo "ipv6 cmsg"
+./in_netns.sh ./udpgso -6 -C
+
+echo "ipv6 setsockopt"
+./in_netns.sh ./udpgso -6 -C -s
--
2.17.0.484.g0c8726318c-goog
^ permalink raw reply related
* [PATCH RFC net-next 07/11] udp: zerocopy
From: Willem de Bruijn @ 2018-04-17 20:00 UTC (permalink / raw)
To: netdev; +Cc: Willem de Bruijn
In-Reply-To: <20180417200059.30154-1-willemdebruijn.kernel@gmail.com>
From: Willem de Bruijn <willemb@google.com>
Extend zerocopy to udp sockets. Allow setting sockopt SO_ZEROCOPY and
interpret flag MSG_ZEROCOPY.
This patch was previously part of the zerocopy RFC patchsets. Zerocopy
is not effective at small MTU. With segmentation offload building
larger datagram, the benefit of page flipping outweights the cost of
generating a completion notification.
The datagram implementation has initial refcnt of 0, instead of 1 for
TCP. The tcp_sendmsg_locked function has to hold a reference on
uarg independent those held by the skbs it generates, because the
skbs can be sent and freed in the function main loop. This is not
needed for other sockets.
Benefit depends on the ratio of cycles spent copying. For an initial
benchmark with udp gso that spends 13% of (systemwide) cycles in
copy_user_fast_string, cycle savings of zerocopy are proportionate:
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
udp gso zerocopy
2394 MB/s 40608 msg/s 40608 calls/s
11,205,017,927 cycles
This is likely not the best demonstrator benchmark.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
include/linux/skbuff.h | 1 +
net/core/skbuff.c | 12 +++++++++++-
net/core/sock.c | 5 ++++-
net/ipv4/ip_output.c | 24 ++++++++++++++++++++++--
net/ipv6/ip6_output.c | 23 ++++++++++++++++++++++-
5 files changed, 60 insertions(+), 5 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6850643508c1..756206bc8eca 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -483,6 +483,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg);
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);
+int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
struct ubuf_info *uarg);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 3eed21f64e0b..92df6db2c851 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -959,7 +959,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
- refcount_set(&uarg->refcnt, 1);
+ refcount_set(&uarg->refcnt, sk->sk_type == SOCK_STREAM ? 1 : 0);
sock_hold(sk);
return uarg;
@@ -1099,6 +1099,10 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
atomic_dec(&sk->sk_zckey);
uarg->len--;
+ /* datagram does not hold an extra ref for the syscall itself */
+ if (sk->sk_type != SOCK_STREAM && !refcount_read(&uarg->refcnt))
+ refcount_set(&uarg->refcnt, 1);
+
sock_zerocopy_put(uarg);
}
}
@@ -1107,6 +1111,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
struct iov_iter *from, size_t length);
+int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
+{
+ return __zerocopy_sg_from_iter(skb->sk, skb, &msg->msg_iter, len);
+}
+EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);
+
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
struct ubuf_info *uarg)
diff --git a/net/core/sock.c b/net/core/sock.c
index b2c3db169ca1..1480d7d92294 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1053,7 +1053,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
case SO_ZEROCOPY:
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
- if (sk->sk_protocol != IPPROTO_TCP)
+ if (sk->sk_type == SOCK_RAW)
+ ret = -ENOTSUPP;
+ if (sk->sk_protocol != IPPROTO_TCP &&
+ sk->sk_protocol != IPPROTO_UDP)
ret = -ENOTSUPP;
} else if (sk->sk_family != PF_RDS) {
ret = -ENOTSUPP;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9ccd6c28e420..b5986ff1250e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -864,8 +864,8 @@ static int __ip_append_data(struct sock *sk,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
+ struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
-
struct ip_options *opt = cork->opt;
int hh_len;
int exthdrlen;
@@ -913,6 +913,20 @@ static int __ip_append_data(struct sock *sk,
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
+ if (flags & MSG_ZEROCOPY && length) {
+ uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ if (!uarg)
+ return -ENOBUFS;
+
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ } else {
+ uarg->zerocopy = 0;
+ skb_zcopy_set(skb, uarg);
+ }
+ }
+
cork->length += length;
/* So, what's going on in the loop below?
@@ -1002,6 +1016,7 @@ static int __ip_append_data(struct sock *sk,
cork->tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
+ skb_zcopy_set(skb, uarg);
/*
* Find where to start putting bytes.
@@ -1063,7 +1078,7 @@ static int __ip_append_data(struct sock *sk,
err = -EFAULT;
goto error;
}
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
@@ -1093,6 +1108,10 @@ static int __ip_append_data(struct sock *sk,
skb->data_len += copy;
skb->truesize += copy;
wmem_alloc_delta += copy;
+ } else {
+ err = skb_zerocopy_iter_dgram(skb, from, copy);
+ if (err)
+ goto error;
}
offset += copy;
length -= copy;
@@ -1105,6 +1124,7 @@ static int __ip_append_data(struct sock *sk,
error_efault:
err = -EFAULT;
error:
+ sock_zerocopy_put_abort(uarg);
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 9fbcec4fb946..9725d283091c 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1270,6 +1270,7 @@ static int __ip6_append_data(struct sock *sk,
int csummode = CHECKSUM_NONE;
unsigned int maxnonfragsize, headersize;
unsigned int wmem_alloc_delta = 0;
+ struct ubuf_info *uarg = NULL;
bool paged;
skb = skb_peek_tail(queue);
@@ -1338,6 +1339,20 @@ static int __ip6_append_data(struct sock *sk,
tskey = sk->sk_tskey++;
}
+ if (flags & MSG_ZEROCOPY && length) {
+ uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+ if (!uarg)
+ return -ENOBUFS;
+
+ if ((rt->dst.dev->features & NETIF_F_SG) &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ } else {
+ uarg->zerocopy = 0;
+ skb_zcopy_set(skb, uarg);
+ }
+ }
+
/*
* Let's try using as much space as possible.
* Use MTU if total length of the message fits into the MTU.
@@ -1460,6 +1475,7 @@ static int __ip6_append_data(struct sock *sk,
tx_flags = 0;
skb_shinfo(skb)->tskey = tskey;
tskey = 0;
+ skb_zcopy_set(skb, uarg);
/*
* Find where to start putting bytes
@@ -1520,7 +1536,7 @@ static int __ip6_append_data(struct sock *sk,
err = -EFAULT;
goto error;
}
- } else {
+ } else if (!uarg || !uarg->zerocopy) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
@@ -1550,6 +1566,10 @@ static int __ip6_append_data(struct sock *sk,
skb->data_len += copy;
skb->truesize += copy;
wmem_alloc_delta += copy;
+ } else {
+ err = skb_zerocopy_iter_dgram(skb, from, copy);
+ if (err)
+ goto error;
}
offset += copy;
length -= copy;
@@ -1562,6 +1582,7 @@ static int __ip6_append_data(struct sock *sk,
error_efault:
err = -EFAULT;
error:
+ sock_zerocopy_put_abort(uarg);
cork->length -= length;
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
--
2.17.0.484.g0c8726318c-goog
^ permalink raw reply related
* [PATCH RFC net-next 09/11] selftests: udp gso with connected sockets
From: Willem de Bruijn @ 2018-04-17 20:00 UTC (permalink / raw)
To: netdev; +Cc: Willem de Bruijn
In-Reply-To: <20180417200059.30154-1-willemdebruijn.kernel@gmail.com>
From: Willem de Bruijn <willemb@google.com>
Connected sockets use path mtu instead of device mtu.
Test this path by inserting a route mtu that is lower than the device
mtu. Verify that the path mtu for the connection matches this lower
number, then run the same test as in the connectionless case.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
tools/testing/selftests/net/udpgso.c | 117 +++++++++++++++++++++++++-
tools/testing/selftests/net/udpgso.sh | 7 ++
2 files changed, 122 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
index 68a230dfee73..a47dca025346 100644
--- a/tools/testing/selftests/net/udpgso.c
+++ b/tools/testing/selftests/net/udpgso.c
@@ -47,6 +47,7 @@
static bool cfg_do_ipv4;
static bool cfg_do_ipv6;
+static bool cfg_do_connected;
static bool cfg_do_connectionless;
static bool cfg_do_setsockopt;
static int cfg_specific_test_id = -1;
@@ -273,6 +274,101 @@ static void set_pmtu_discover(int fd, bool is_ipv4)
error(1, errno, "setsockopt path mtu");
}
+static unsigned int get_path_mtu(int fd, bool is_ipv4)
+{
+ socklen_t vallen;
+ unsigned int mtu;
+ int ret;
+
+ vallen = sizeof(mtu);
+ if (is_ipv4)
+ ret = getsockopt(fd, SOL_IP, IP_MTU, &mtu, &vallen);
+ else
+ ret = getsockopt(fd, SOL_IPV6, IPV6_MTU, &mtu, &vallen);
+
+ if (ret)
+ error(1, errno, "getsockopt mtu");
+
+
+ fprintf(stderr, "path mtu (read): %u\n", mtu);
+ return mtu;
+}
+
+/* very wordy version of system("ip route add dev lo mtu 1500 127.0.0.3/32") */
+static void set_route_mtu(int mtu, bool is_ipv4)
+{
+ struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
+ struct nlmsghdr *nh;
+ struct rtattr *rta;
+ struct rtmsg *rt;
+ char data[NLMSG_ALIGN(sizeof(*nh)) +
+ NLMSG_ALIGN(sizeof(*rt)) +
+ NLMSG_ALIGN(RTA_LENGTH(sizeof(addr6))) +
+ NLMSG_ALIGN(RTA_LENGTH(sizeof(int))) +
+ NLMSG_ALIGN(RTA_LENGTH(0) + RTA_LENGTH(sizeof(int)))];
+ int fd, ret, alen, off = 0;
+
+ alen = is_ipv4 ? sizeof(addr4) : sizeof(addr6);
+
+ fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ if (fd == -1)
+ error(1, errno, "socket netlink");
+
+ memset(data, 0, sizeof(data));
+
+ nh = (void *)data;
+ nh->nlmsg_type = RTM_NEWROUTE;
+ nh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE;
+ off += NLMSG_ALIGN(sizeof(*nh));
+
+ rt = (void *)(data + off);
+ rt->rtm_family = is_ipv4 ? AF_INET : AF_INET6;
+ rt->rtm_table = RT_TABLE_MAIN;
+ rt->rtm_dst_len = alen << 3;
+ rt->rtm_protocol = RTPROT_BOOT;
+ rt->rtm_scope = RT_SCOPE_UNIVERSE;
+ rt->rtm_type = RTN_UNICAST;
+ off += NLMSG_ALIGN(sizeof(*rt));
+
+ rta = (void *)(data + off);
+ rta->rta_type = RTA_DST;
+ rta->rta_len = RTA_LENGTH(alen);
+ if (is_ipv4)
+ memcpy(RTA_DATA(rta), &addr4, alen);
+ else
+ memcpy(RTA_DATA(rta), &addr6, alen);
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ rta = (void *)(data + off);
+ rta->rta_type = RTA_OIF;
+ rta->rta_len = RTA_LENGTH(sizeof(int));
+ *((int *)(RTA_DATA(rta))) = 1; //if_nametoindex("lo");
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ /* MTU is a subtype in a metrics type */
+ rta = (void *)(data + off);
+ rta->rta_type = RTA_METRICS;
+ rta->rta_len = RTA_LENGTH(0) + RTA_LENGTH(sizeof(int));
+ off += NLMSG_ALIGN(rta->rta_len);
+
+ /* now fill MTU subtype. Note that it fits within above rta_len */
+ rta = (void *)(((char *) rta) + RTA_LENGTH(0));
+ rta->rta_type = RTAX_MTU;
+ rta->rta_len = RTA_LENGTH(sizeof(int));
+ *((int *)(RTA_DATA(rta))) = mtu;
+
+ nh->nlmsg_len = off;
+
+ ret = sendto(fd, data, off, 0, (void *)&nladdr, sizeof(nladdr));
+ if (ret != off)
+ error(1, errno, "send netlink: %uB != %uB\n", ret, off);
+
+ if (close(fd))
+ error(1, errno, "close netlink");
+
+ fprintf(stderr, "route mtu (test): %u\n", mtu);
+}
+
static bool send_one(int fd, int len, int gso_len,
struct sockaddr *addr, socklen_t alen)
{
@@ -391,7 +487,7 @@ static void run_all(int fdt, int fdr, struct sockaddr *addr, socklen_t alen)
static void run_test(struct sockaddr *addr, socklen_t alen)
{
struct timeval tv = { .tv_usec = 100 * 1000 };
- int fdr, fdt;
+ int fdr, fdt, val;
fdr = socket(addr->sa_family, SOCK_DGRAM, 0);
if (fdr == -1)
@@ -416,6 +512,20 @@ static void run_test(struct sockaddr *addr, socklen_t alen)
run_all(fdt, fdr, addr, alen);
}
+ if (cfg_do_connected) {
+ set_device_mtu(fdt, CONST_MTU_TEST + 100);
+ set_route_mtu(CONST_MTU_TEST, addr->sa_family == AF_INET);
+
+ if (connect(fdt, addr, alen))
+ error(1, errno, "connect");
+
+ val = get_path_mtu(fdt, addr->sa_family == AF_INET);
+ if (val != CONST_MTU_TEST)
+ error(1, 0, "bad path mtu %u\n", val);
+
+ run_all(fdt, fdr, addr, 0 /* use connected addr */);
+ }
+
if (close(fdt))
error(1, errno, "close t");
if (close(fdr))
@@ -448,7 +558,7 @@ static void parse_opts(int argc, char **argv)
{
int c;
- while ((c = getopt(argc, argv, "46Cst:")) != -1) {
+ while ((c = getopt(argc, argv, "46cCst:")) != -1) {
switch (c) {
case '4':
cfg_do_ipv4 = true;
@@ -456,6 +566,9 @@ static void parse_opts(int argc, char **argv)
case '6':
cfg_do_ipv6 = true;
break;
+ case 'c':
+ cfg_do_connected = true;
+ break;
case 'C':
cfg_do_connectionless = true;
break;
diff --git a/tools/testing/selftests/net/udpgso.sh b/tools/testing/selftests/net/udpgso.sh
index 7977b97e060c..7cdf0e7c1dde 100755
--- a/tools/testing/selftests/net/udpgso.sh
+++ b/tools/testing/selftests/net/udpgso.sh
@@ -14,3 +14,10 @@ echo "ipv6 cmsg"
echo "ipv6 setsockopt"
./in_netns.sh ./udpgso -6 -C -s
+
+echo "ipv4 connected"
+./in_netns.sh ./udpgso -4 -c
+
+# blocked on 2nd loopback address
+# echo "ipv6 connected"
+# ./in_netns.sh ./udpgso -6 -c
--
2.17.0.484.g0c8726318c-goog
^ permalink raw reply related
* [PATCH RFC net-next 10/11] selftests: udp gso with corking
From: Willem de Bruijn @ 2018-04-17 20:10 UTC (permalink / raw)
To: netdev; +Cc: Willem de Bruijn
From: Willem de Bruijn <willemb@google.com>
Corked sockets take a different path to construct a udp datagram than
the lockless fast path. Test this alternate path.
Segmentation offload requires checksum offload, which is not supported
with corked sockets. This test fails after commit d749c9cbffd6
("ipv4: no CHECKSUM_PARTIAL on MSG_MORE corked sockets") and commit
682b1a9d3f96 ("ipv6: no CHECKSUM_PARTIAL on MSG_MORE corked sockets").
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
tools/testing/selftests/net/udpgso.c | 42 ++++++++++++++++++++-------
tools/testing/selftests/net/udpgso.sh | 8 +++++
2 files changed, 40 insertions(+), 10 deletions(-)
diff --git a/tools/testing/selftests/net/udpgso.c b/tools/testing/selftests/net/udpgso.c
index a47dca025346..6b1998b9f7df 100644
--- a/tools/testing/selftests/net/udpgso.c
+++ b/tools/testing/selftests/net/udpgso.c
@@ -49,6 +49,7 @@ static bool cfg_do_ipv4;
static bool cfg_do_ipv6;
static bool cfg_do_connected;
static bool cfg_do_connectionless;
+static bool cfg_do_msgmore;
static bool cfg_do_setsockopt;
static int cfg_specific_test_id = -1;
@@ -369,6 +370,23 @@ static void set_route_mtu(int mtu, bool is_ipv4)
fprintf(stderr, "route mtu (test): %u\n", mtu);
}
+static bool __send_one(int fd, struct msghdr *msg, int flags)
+{
+ int ret;
+
+ ret = sendmsg(fd, msg, flags);
+ if (ret == -1 && (errno == EMSGSIZE || errno == ENOMEM))
+ return false;
+ if (ret == -1)
+ error(1, errno, "sendmsg");
+ if (ret != msg->msg_iov->iov_len)
+ error(1, 0, "sendto: %d != %lu", ret, msg->msg_iov->iov_len);
+ if (msg->msg_flags)
+ error(1, 0, "sendmsg: return flags 0x%x\n", msg->msg_flags);
+
+ return true;
+}
+
static bool send_one(int fd, int len, int gso_len,
struct sockaddr *addr, socklen_t alen)
{
@@ -376,7 +394,6 @@ static bool send_one(int fd, int len, int gso_len,
struct msghdr msg = {0};
struct iovec iov = {0};
struct cmsghdr *cm;
- int ret;
iov.iov_base = buf;
iov.iov_len = len;
@@ -398,15 +415,17 @@ static bool send_one(int fd, int len, int gso_len,
*((uint16_t *) CMSG_DATA(cm)) = gso_len;
}
- ret = sendmsg(fd, &msg, 0);
- if (ret == -1 && (errno == EMSGSIZE || errno == ENOMEM))
- return false;
- if (ret == -1)
- error(1, errno, "sendmsg");
- if (ret != len)
- error(1, 0, "sendto: %d != %u", ret, len);
+ /* If MSG_MORE, send 1 byte followed by remainder */
+ if (cfg_do_msgmore && len > 1) {
+ iov.iov_len = 1;
+ if (!__send_one(fd, &msg, MSG_MORE))
+ error(1, 0, "send 1B failed");
- return true;
+ iov.iov_base++;
+ iov.iov_len = len - 1;
+ }
+
+ return __send_one(fd, &msg, 0);
}
static int recv_one(int fd, int flags)
@@ -558,7 +577,7 @@ static void parse_opts(int argc, char **argv)
{
int c;
- while ((c = getopt(argc, argv, "46cCst:")) != -1) {
+ while ((c = getopt(argc, argv, "46cCmst:")) != -1) {
switch (c) {
case '4':
cfg_do_ipv4 = true;
@@ -572,6 +591,9 @@ static void parse_opts(int argc, char **argv)
case 'C':
cfg_do_connectionless = true;
break;
+ case 'm':
+ cfg_do_msgmore = true;
+ break;
case 's':
cfg_do_setsockopt = true;
break;
diff --git a/tools/testing/selftests/net/udpgso.sh b/tools/testing/selftests/net/udpgso.sh
index 7cdf0e7c1dde..47acfbbbae38 100755
--- a/tools/testing/selftests/net/udpgso.sh
+++ b/tools/testing/selftests/net/udpgso.sh
@@ -21,3 +21,11 @@ echo "ipv4 connected"
# blocked on 2nd loopback address
# echo "ipv6 connected"
# ./in_netns.sh ./udpgso -6 -c
+
+# ip forces CHECKSUM_NONE with MSG_MORE
+# echo "ipv4 msg_more"
+# ./in_netns.sh ./udpgso -4 -C -m
+
+# ipv6 forces CHECKSUM_NONE with MSG_MORE
+# echo "ipv6 msg_more"
+# ./in_netns.sh ./udpgso -6 -C -m
--
2.17.0.484.g0c8726318c-goog
^ permalink raw reply related
* [PATCH RFC net-next 11/11] selftests: udp gso benchmark
From: Willem de Bruijn @ 2018-04-17 20:10 UTC (permalink / raw)
To: netdev; +Cc: Willem de Bruijn
In-Reply-To: <20180417201034.32087-1-willemdebruijn.kernel@gmail.com>
From: Willem de Bruijn <willemb@google.com>
Send udp data between a source and sink, optionally with udp gso.
The two processes are expected to be run on separate hosts.
A script is included that runs them together over loopback in a
single namespace.
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
tools/testing/selftests/net/.gitignore | 2 +
tools/testing/selftests/net/Makefile | 4 +-
tools/testing/selftests/net/udpgso_bench.sh | 74 ++++
tools/testing/selftests/net/udpgso_bench_rx.c | 265 ++++++++++++
tools/testing/selftests/net/udpgso_bench_tx.c | 379 ++++++++++++++++++
5 files changed, 722 insertions(+), 2 deletions(-)
create mode 100755 tools/testing/selftests/net/udpgso_bench.sh
create mode 100644 tools/testing/selftests/net/udpgso_bench_rx.c
create mode 100644 tools/testing/selftests/net/udpgso_bench_tx.c
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 6333b789c8fa..bb9329387181 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -8,3 +8,5 @@ reuseport_bpf_numa
reuseport_dualstack
reuseaddr_conflict
udpgso
+udpgso_bench_rx
+udpgso_bench_tx
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index a05a657489b9..750fbe371338 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -5,13 +5,13 @@ CFLAGS = -Wall -Wl,--no-as-needed -O2 -g
CFLAGS += -I../../../../usr/include/
TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
-TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh
+TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh udpgso_bench.sh
TEST_GEN_FILES = socket
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
TEST_GEN_FILES += tcp_mmap
TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict
-TEST_GEN_PROGS += udpgso
+TEST_GEN_PROGS += udpgso udpgso_bench_tx udpgso_bench_rx
include ../lib.mk
diff --git a/tools/testing/selftests/net/udpgso_bench.sh b/tools/testing/selftests/net/udpgso_bench.sh
new file mode 100755
index 000000000000..792fa4d0285e
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso_bench.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Run a series of udpgso benchmarks
+
+wake_children() {
+ local -r jobs="$(jobs -p)"
+
+ if [[ "${jobs}" != "" ]]; then
+ kill -1 ${jobs} 2>/dev/null
+ fi
+}
+trap wake_children EXIT
+
+run_one() {
+ local -r args=$@
+
+ ./udpgso_bench_rx &
+ ./udpgso_bench_rx -t &
+
+ ./udpgso_bench_tx ${args}
+}
+
+run_in_netns() {
+ local -r args=$@
+
+ ./in_netns.sh $0 __subprocess ${args}
+}
+
+run_udp() {
+ local -r args=$@
+
+ echo "udp"
+ run_in_netns ${args}
+
+ echo "udp gso"
+ run_in_netns ${args} -S
+
+ echo "udp gso zerocopy"
+ run_in_netns ${args} -S -z
+}
+
+run_tcp() {
+ local -r args=$@
+
+ echo "tcp"
+ run_in_netns ${args} -t
+
+ echo "tcp zerocopy"
+ run_in_netns ${args} -t -z
+}
+
+run_all() {
+ local -r core_args="-l 4"
+ local -r ipv4_args="${core_args} -4 -D 127.0.0.1"
+ local -r ipv6_args="${core_args} -6 -D ::1"
+
+ echo "ipv4"
+ run_tcp "${ipv4_args}"
+ run_udp "${ipv4_args}"
+
+ echo "ipv6"
+ run_tcp "${ipv4_args}"
+ run_udp "${ipv6_args}"
+}
+
+if [[ $# -eq 0 ]]; then
+ run_all
+elif [[ $1 == "__subprocess" ]]; then
+ shift
+ run_one $@
+else
+ run_in_netns $@
+fi
diff --git a/tools/testing/selftests/net/udpgso_bench_rx.c b/tools/testing/selftests/net/udpgso_bench_rx.c
new file mode 100644
index 000000000000..727cf67a3f75
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso_bench_rx.c
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <error.h>
+#include <errno.h>
+#include <limits.h>
+#include <linux/errqueue.h>
+#include <linux/if_packet.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+static int cfg_port = 8000;
+static bool cfg_tcp;
+static bool cfg_verify;
+
+static bool interrupted;
+static unsigned long packets, bytes;
+
+static void sigint_handler(int signum)
+{
+ if (signum == SIGINT)
+ interrupted = true;
+}
+
+static unsigned long gettimeofday_ms(void)
+{
+ struct timeval tv;
+
+ gettimeofday(&tv, NULL);
+ return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
+}
+
+static void do_poll(int fd)
+{
+ struct pollfd pfd;
+ int ret;
+
+ pfd.events = POLLIN;
+ pfd.revents = 0;
+ pfd.fd = fd;
+
+ do {
+ ret = poll(&pfd, 1, 10);
+ if (ret == -1)
+ error(1, errno, "poll");
+ if (ret == 0)
+ continue;
+ if (pfd.revents != POLLIN)
+ error(1, errno, "poll: 0x%x expected 0x%x\n",
+ pfd.revents, POLLIN);
+ } while (!ret && !interrupted);
+}
+
+static int do_socket(bool do_tcp)
+{
+ struct sockaddr_in6 addr = {0};
+ int fd, val;
+
+ fd = socket(PF_INET6, cfg_tcp ? SOCK_STREAM : SOCK_DGRAM, 0);
+ if (fd == -1)
+ error(1, errno, "socket");
+
+ val = 1 << 21;
+ if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)))
+ error(1, errno, "setsockopt rcvbuf");
+ val = 1;
+ if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &val, sizeof(val)))
+ error(1, errno, "setsockopt reuseport");
+
+ addr.sin6_family = PF_INET6;
+ addr.sin6_port = htons(cfg_port);
+ addr.sin6_addr = in6addr_any;
+ if (bind(fd, (void *) &addr, sizeof(addr)))
+ error(1, errno, "bind");
+
+ if (do_tcp) {
+ int accept_fd = fd;
+
+ if (listen(accept_fd, 1))
+ error(1, errno, "listen");
+
+ do_poll(accept_fd);
+
+ fd = accept(accept_fd, NULL, NULL);
+ if (fd == -1)
+ error(1, errno, "accept");
+ if (close(accept_fd))
+ error(1, errno, "close accept fd");
+ }
+
+ return fd;
+}
+
+/* Flush all outstanding bytes for the tcp receive queue */
+static void do_flush_tcp(int fd)
+{
+ int ret;
+
+ while (true) {
+ /* MSG_TRUNC flushes up to len bytes */
+ ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
+ if (ret == -1 && errno == EAGAIN)
+ return;
+ if (ret == -1)
+ error(1, errno, "flush");
+ if (ret == 0) {
+ /* client detached */
+ exit(0);
+ }
+
+ packets++;
+ bytes += ret;
+ }
+
+}
+
+static char sanitized_char(char val)
+{
+ return (val >= 'a' && val <= 'z') ? val : '.';
+}
+
+static void do_verify_udp(const char *data, int len)
+{
+ char cur = data[0];
+ int i;
+
+ /* verify contents */
+ if (cur < 'a' || cur > 'z')
+ error(1, 0, "data initial byte out of range");
+
+ for (i = 1; i < len; i++) {
+ if (cur == 'z')
+ cur = 'a';
+ else
+ cur++;
+
+ if (data[i] != cur)
+ error(1, 0, "data[%d]: len %d, %c(%hhu) != %c(%hhu)\n",
+ i, len,
+ sanitized_char(data[i]), data[i],
+ sanitized_char(cur), cur);
+ }
+}
+
+/* Flush all outstanding datagrams. Verify first few bytes of each. */
+static void do_flush_udp(int fd)
+{
+ static char rbuf[ETH_DATA_LEN];
+ int ret, len, budget = 256;
+
+ len = cfg_verify ? sizeof(rbuf) : 0;
+ while (budget--) {
+ /* MSG_TRUNC will make return value full datagram length */
+ ret = recv(fd, rbuf, len, MSG_TRUNC | MSG_DONTWAIT);
+ if (ret == -1 && errno == EAGAIN)
+ return;
+ if (ret == -1)
+ error(1, errno, "recv");
+ if (len) {
+ if (ret == 0)
+ error(1, errno, "recv: 0 byte datagram\n");
+
+ do_verify_udp(rbuf, ret);
+ }
+
+ packets++;
+ bytes += ret;
+ }
+}
+
+static void usage(const char *filepath)
+{
+ error(1, 0, "Usage: %s [-tv] [-p port]", filepath);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+ int c;
+
+ while ((c = getopt(argc, argv, "ptv")) != -1) {
+ switch (c) {
+ case 'p':
+ cfg_port = htons(strtoul(optarg, NULL, 0));
+ break;
+ case 't':
+ cfg_tcp = true;
+ break;
+ case 'v':
+ cfg_verify = true;
+ break;
+ }
+ }
+
+ if (optind != argc)
+ usage(argv[0]);
+
+ if (cfg_tcp && cfg_verify)
+ error(1, 0, "TODO: implement verify mode for tcp");
+}
+
+static void do_recv(void)
+{
+ unsigned long tnow, treport;
+ int fd;
+
+ fd = do_socket(cfg_tcp);
+
+ treport = gettimeofday_ms() + 1000;
+ do {
+ do_poll(fd);
+
+ if (cfg_tcp)
+ do_flush_tcp(fd);
+ else
+ do_flush_udp(fd);
+
+ tnow = gettimeofday_ms();
+ if (tnow > treport) {
+ if (packets)
+ fprintf(stderr,
+ "%s rx: %6lu MB/s %8lu calls/s\n",
+ cfg_tcp ? "tcp" : "udp",
+ bytes >> 20, packets);
+ bytes = packets = 0;
+ treport = tnow + 1000;
+ }
+
+ } while (!interrupted);
+
+ if (close(fd))
+ error(1, errno, "close");
+}
+
+int main(int argc, char **argv)
+{
+ parse_opts(argc, argv);
+
+ signal(SIGINT, sigint_handler);
+
+ do_recv();
+
+ return 0;
+}
diff --git a/tools/testing/selftests/net/udpgso_bench_tx.c b/tools/testing/selftests/net/udpgso_bench_tx.c
new file mode 100644
index 000000000000..9588f72c0262
--- /dev/null
+++ b/tools/testing/selftests/net/udpgso_bench_tx.c
@@ -0,0 +1,379 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define _GNU_SOURCE
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <netinet/if_ether.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/udp.h>
+#include <poll.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#ifndef ETH_MAX_MTU
+#define ETH_MAX_MTU 0xFFFFU
+#endif
+
+#ifndef UDP_SEGMENT
+#define UDP_SEGMENT 103
+#endif
+
+#ifndef SO_ZEROCOPY
+#define SO_ZEROCOPY 60
+#endif
+
+#ifndef MSG_ZEROCOPY
+#define MSG_ZEROCOPY 0x4000000
+#endif
+
+#define NUM_PKT 100
+
+static bool cfg_cache_trash;
+static int cfg_cpu = -1;
+static int cfg_connected = true;
+static int cfg_family = PF_UNSPEC;
+static uint16_t cfg_mss;
+static int cfg_payload_len = (1472 * 42);
+static int cfg_port = 8000;
+static int cfg_runtime_ms = -1;
+static bool cfg_segment;
+static bool cfg_tcp;
+static bool cfg_zerocopy;
+
+static socklen_t cfg_alen;
+static struct sockaddr_storage cfg_dst_addr;
+
+static bool interrupted;
+static char buf[NUM_PKT][ETH_MAX_MTU];
+
+static void sigint_handler(int signum)
+{
+ if (signum == SIGINT)
+ interrupted = true;
+}
+
+static unsigned long gettimeofday_ms(void)
+{
+ struct timeval tv;
+
+ gettimeofday(&tv, NULL);
+ return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
+}
+
+static int set_cpu(int cpu)
+{
+ cpu_set_t mask;
+
+ CPU_ZERO(&mask);
+ CPU_SET(cpu, &mask);
+ if (sched_setaffinity(0, sizeof(mask), &mask))
+ error(1, 0, "setaffinity %d", cpu);
+
+ return 0;
+}
+
+static void setup_sockaddr(int domain, const char *str_addr, void *sockaddr)
+{
+ struct sockaddr_in6 *addr6 = (void *) sockaddr;
+ struct sockaddr_in *addr4 = (void *) sockaddr;
+
+ switch (domain) {
+ case PF_INET:
+ addr4->sin_family = AF_INET;
+ addr4->sin_port = htons(cfg_port);
+ if (inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
+ error(1, 0, "ipv4 parse error: %s", str_addr);
+ break;
+ case PF_INET6:
+ addr6->sin6_family = AF_INET6;
+ addr6->sin6_port = htons(cfg_port);
+ if (inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
+ error(1, 0, "ipv6 parse error: %s", str_addr);
+ break;
+ default:
+ error(1, 0, "illegal domain");
+ }
+}
+
+static void flush_zerocopy(int fd)
+{
+ struct msghdr msg = {0}; /* flush */
+ int ret;
+
+ while (1) {
+ ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
+ if (ret == -1 && errno == EAGAIN)
+ break;
+ if (ret == -1)
+ error(1, errno, "errqueue");
+ if (msg.msg_flags != (MSG_ERRQUEUE | MSG_CTRUNC))
+ error(1, 0, "errqueue: flags 0x%x\n", msg.msg_flags);
+ msg.msg_flags = 0;
+ }
+}
+
+static int send_tcp(int fd, char *data)
+{
+ int ret, done = 0, count = 0;
+
+ while (done < cfg_payload_len) {
+ ret = send(fd, data + done, cfg_payload_len - done,
+ cfg_zerocopy ? MSG_ZEROCOPY : 0);
+ if (ret == -1)
+ error(1, errno, "write");
+
+ done += ret;
+ count++;
+ }
+
+ return count;
+}
+
+static int send_udp(int fd, char *data)
+{
+ int ret, total_len, len, count = 0;
+
+ total_len = cfg_payload_len;
+
+ while (total_len) {
+ len = total_len < cfg_mss ? total_len : cfg_mss;
+
+ ret = sendto(fd, data, len, cfg_zerocopy ? MSG_ZEROCOPY : 0,
+ cfg_connected ? NULL : (void *)&cfg_dst_addr,
+ cfg_connected ? 0 : cfg_alen);
+ if (ret == -1)
+ error(1, errno, "write");
+ if (ret != len)
+ error(1, errno, "write: %uB != %uB\n", ret, len);
+
+ total_len -= len;
+ count++;
+ }
+
+ return count;
+}
+
+static void send_udp_segment_cmsg(struct cmsghdr *cm)
+{
+ uint16_t *valp;
+
+ cm->cmsg_level = SOL_UDP;
+ cm->cmsg_type = UDP_SEGMENT;
+ cm->cmsg_len = CMSG_LEN(sizeof(cfg_mss));
+ valp = (void *)CMSG_DATA(cm);
+ *valp = cfg_mss;
+}
+
+static int send_udp_segment(int fd, char *data)
+{
+ char control[CMSG_SPACE(sizeof(cfg_mss))] = {0};
+ struct msghdr msg = {0};
+ struct iovec iov = {0};
+ int ret;
+
+ iov.iov_base = data;
+ iov.iov_len = cfg_payload_len;
+
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ msg.msg_control = control;
+ msg.msg_controllen = sizeof(control);
+ send_udp_segment_cmsg(CMSG_FIRSTHDR(&msg));
+
+ msg.msg_name = (void *)&cfg_dst_addr;
+ msg.msg_namelen = cfg_alen;
+
+ ret = sendmsg(fd, &msg, cfg_zerocopy ? MSG_ZEROCOPY : 0);
+ if (ret == -1)
+ error(1, errno, "sendmsg");
+ if (ret != iov.iov_len)
+ error(1, 0, "sendmsg: %u != %lu\n", ret, iov.iov_len);
+
+ return 1;
+}
+
+static void usage(const char *filepath)
+{
+ error(1, 0, "Usage: %s [-46cStuUz] [-C cpu] [-D dst ip] [-l secs] [-p port] [-s sendsize]",
+ filepath);
+}
+
+static void parse_opts(int argc, char **argv)
+{
+ int max_len, hdrlen;
+ int c;
+
+ while ((c = getopt(argc, argv, "46cC:D:l:p:s:Stuz")) != -1) {
+ switch (c) {
+ case '4':
+ if (cfg_family != PF_UNSPEC)
+ error(1, 0, "Pass one of -4 or -6");
+ cfg_family = PF_INET;
+ cfg_alen = sizeof(struct sockaddr_in);
+ break;
+ case '6':
+ if (cfg_family != PF_UNSPEC)
+ error(1, 0, "Pass one of -4 or -6");
+ cfg_family = PF_INET6;
+ cfg_alen = sizeof(struct sockaddr_in6);
+ break;
+ case 'c':
+ cfg_cache_trash = true;
+ break;
+ case 'C':
+ cfg_cpu = strtol(optarg, NULL, 0);
+ break;
+ case 'D':
+ setup_sockaddr(cfg_family, optarg, &cfg_dst_addr);
+ break;
+ case 'l':
+ cfg_runtime_ms = strtoul(optarg, NULL, 10) * 1000;
+ break;
+ case 'p':
+ cfg_port = strtoul(optarg, NULL, 0);
+ break;
+ case 's':
+ cfg_payload_len = strtoul(optarg, NULL, 0);
+ break;
+ case 'S':
+ cfg_segment = true;
+ break;
+ case 't':
+ cfg_tcp = true;
+ break;
+ case 'u':
+ cfg_connected = false;
+ break;
+ case 'z':
+ cfg_zerocopy = true;
+ break;
+ }
+ }
+
+ if (optind != argc)
+ usage(argv[0]);
+
+ if (cfg_family == PF_UNSPEC)
+ error(1, 0, "must pass one of -4 or -6");
+ if (cfg_tcp && !cfg_connected)
+ error(1, 0, "connectionless tcp makes no sense");
+
+ if (cfg_family == PF_INET)
+ hdrlen = sizeof(struct iphdr) + sizeof(struct udphdr);
+ else
+ hdrlen = sizeof(struct ip6_hdr) + sizeof(struct udphdr);
+
+ cfg_mss = ETH_DATA_LEN - hdrlen;
+ max_len = ETH_MAX_MTU - hdrlen;
+
+ if (cfg_payload_len > max_len)
+ error(1, 0, "payload length %u exceeds max %u",
+ cfg_payload_len, max_len);
+}
+
+static void set_pmtu_discover(int fd, bool is_ipv4)
+{
+ int level, name, val;
+
+ if (is_ipv4) {
+ level = SOL_IP;
+ name = IP_MTU_DISCOVER;
+ val = IP_PMTUDISC_DO;
+ } else {
+ level = SOL_IPV6;
+ name = IPV6_MTU_DISCOVER;
+ val = IPV6_PMTUDISC_DO;
+ }
+
+ if (setsockopt(fd, level, name, &val, sizeof(val)))
+ error(1, errno, "setsockopt path mtu");
+}
+
+int main(int argc, char **argv)
+{
+ unsigned long num_msgs, num_sends;
+ unsigned long tnow, treport, tstop;
+ int fd, i, val;
+
+ parse_opts(argc, argv);
+
+ if (cfg_cpu > 0)
+ set_cpu(cfg_cpu);
+
+ for (i = 0; i < sizeof(buf[0]); i++)
+ buf[0][i] = 'a' + (i % 26);
+ for (i = 1; i < NUM_PKT; i++)
+ memcpy(buf[i], buf[0], sizeof(buf[0]));
+
+ signal(SIGINT, sigint_handler);
+
+ fd = socket(cfg_family, cfg_tcp ? SOCK_STREAM : SOCK_DGRAM, 0);
+ if (fd == -1)
+ error(1, errno, "socket");
+
+ if (cfg_zerocopy) {
+ val = 1;
+ if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &val, sizeof(val)))
+ error(1, errno, "setsockopt zerocopy");
+ }
+
+ if (cfg_connected &&
+ connect(fd, (void *)&cfg_dst_addr, cfg_alen))
+ error(1, errno, "connect");
+
+ if (cfg_segment)
+ set_pmtu_discover(fd, cfg_family == PF_INET);
+
+ num_msgs = num_sends = 0;
+ tnow = gettimeofday_ms();
+ tstop = tnow + cfg_runtime_ms;
+ treport = tnow + 1000;
+
+ i = 0;
+ do {
+ if (cfg_tcp)
+ num_sends += send_tcp(fd, buf[i]);
+ else if (cfg_segment)
+ num_sends += send_udp_segment(fd, buf[i]);
+ else
+ num_sends += send_udp(fd, buf[i]);
+ num_msgs++;
+
+ if (cfg_zerocopy && ((num_msgs & 0xF) == 0))
+ flush_zerocopy(fd);
+
+ tnow = gettimeofday_ms();
+ if (tnow > treport) {
+ fprintf(stderr,
+ "%s tx: %6lu MB/s %8lu calls/s %6lu msg/s\n",
+ cfg_tcp ? "tcp" : "udp",
+ (num_msgs * cfg_payload_len) >> 20,
+ num_sends, num_msgs);
+ num_msgs = num_sends = 0;
+ treport = tnow + 1000;
+ }
+
+ /* cold cache when writing buffer */
+ if (cfg_cache_trash)
+ i = ++i < NUM_PKT ? i : 0;
+
+ } while (!interrupted && (cfg_runtime_ms == -1 || tnow < tstop));
+
+ if (close(fd))
+ error(1, errno, "close");
+
+ return 0;
+}
--
2.17.0.484.g0c8726318c-goog
^ permalink raw reply related
* Re: [PATCH RFC net-next 00/11] udp gso
From: Sowmini Varadhan @ 2018-04-17 20:15 UTC (permalink / raw)
To: Willem de Bruijn; +Cc: netdev, Willem de Bruijn
In-Reply-To: <20180417200059.30154-1-willemdebruijn.kernel@gmail.com>
On (04/17/18 16:00), Willem de Bruijn wrote:
>
> This patchset implements GSO for UDP. A process can concatenate and
> submit multiple datagrams to the same destination in one send call
> by setting socket option SOL_UDP/UDP_SEGMENT with the segment size,
> or passing an analogous cmsg at send time.
>
> The stack will send the entire large (up to network layer max size)
> datagram through the protocol layer. At the GSO layer, it is broken
> up in individual segments. All receive the same network layer header
> and UDP src and dst port. All but the last segment have the same UDP
> header, but the last may differ in length and checksum.
I'll go through the patch-set later today/tomorrow (interesting!),
but a question: what are message boundary semantics in this model? E.g.,
if I do a udp_sendmsg of 2000 bytes, and then then a udp_sendmsg of
512 bytes, with the receiver first recvmsg 2000 bytes and then the
512 bytes?
My understanding of the comment above is that no, it will not-
because (assuming an mtu of 1500) there will be 2 UDP messages on
the wire, the first one with 1500 bytes, and the second with 1012
bytes (with some discounts for the various L2/L3 etc headers)
--Sowmini
^ permalink raw reply
* Re: [PATCH RFC net-next 00/11] udp gso
From: Willem de Bruijn @ 2018-04-17 20:23 UTC (permalink / raw)
To: Sowmini Varadhan; +Cc: Network Development, Willem de Bruijn
In-Reply-To: <20180417201557.GA4080@oracle.com>
On Tue, Apr 17, 2018 at 4:15 PM, Sowmini Varadhan
<sowmini.varadhan@oracle.com> wrote:
> On (04/17/18 16:00), Willem de Bruijn wrote:
>>
>> This patchset implements GSO for UDP. A process can concatenate and
>> submit multiple datagrams to the same destination in one send call
>> by setting socket option SOL_UDP/UDP_SEGMENT with the segment size,
>> or passing an analogous cmsg at send time.
>>
>> The stack will send the entire large (up to network layer max size)
>> datagram through the protocol layer. At the GSO layer, it is broken
>> up in individual segments. All receive the same network layer header
>> and UDP src and dst port. All but the last segment have the same UDP
>> header, but the last may differ in length and checksum.
>
> I'll go through the patch-set later today/tomorrow (interesting!),
Thanks!
> but a question: what are message boundary semantics in this model? E.g.,
> if I do a udp_sendmsg of 2000 bytes, and then then a udp_sendmsg of
> 512 bytes, with the receiver first recvmsg 2000 bytes and then the
> 512 bytes?
>
> My understanding of the comment above is that no, it will not-
> because (assuming an mtu of 1500) there will be 2 UDP messages on
> the wire, the first one with 1500 bytes, and the second with 1012
> bytes (with some discounts for the various L2/L3 etc headers)
Assuming IPv4 with an MTU of 1500 and the maximum segment
size of 1472, the receiver will see three datagrams with MSS of
1472B, 528B and 512B.
The feature does not combine payloads of subsequent datagrams.
It only combines payloads of subsequent send calls if they are
building a single datagram using corking. But due to lack of checksum
offload with corking, this is actually disabled.
^ permalink raw reply
* Re: [PATCH v2 3/8] net: ax88796: Do not free IRQ in ax_remove() (already freed in ax_close()).
From: Michael Schmitz @ 2018-04-17 20:32 UTC (permalink / raw)
To: John Paul Adrian Glaubitz
Cc: netdev, Andrew Lunn, Linux/m68k, Michael Karcher, Michael Karcher
In-Reply-To: <dcbec8cb-8fc9-ffe6-a005-b04bbd30cd59@physik.fu-berlin.de>
Hi Adrian,
On Tue, Apr 17, 2018 at 11:40 PM, John Paul Adrian Glaubitz
<glaubitz@physik.fu-berlin.de> wrote:
> On 04/17/2018 04:08 AM, Michael Schmitz wrote:
>>
>> From: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
>
> This should be:
>
> From: Michael Karcher <debian@mkarcher.dialup.fu-berlin.de>
I haven't found a way to change that in my tree yet, sorry. Unless
someone has a simple way to fix patch authorship after a merge, I may
have to reimport from scratch.
Cheers,
Michael
^ permalink raw reply
* Re: [PATCH net-next 0/5] virtio-net: Add SCTP checksum offload support
From: Vlad Yasevich @ 2018-04-17 20:35 UTC (permalink / raw)
To: Marcelo Ricardo Leitner, Vladislav Yasevich
Cc: netdev, linux-sctp, virtualization, mst, jasowang, nhorman
In-Reply-To: <20180402144730.GA6001@localhost.localdomain>
On 04/02/2018 10:47 AM, Marcelo Ricardo Leitner wrote:
> On Mon, Apr 02, 2018 at 09:40:01AM -0400, Vladislav Yasevich wrote:
>> Now that we have SCTP offload capabilities in the kernel, we can add
>> them to virtio as well. First step is SCTP checksum.
>
> Thanks.
>
>> As for GSO, the way sctp GSO is currently implemented buys us nothing
>> in added support to virtio. To add true GSO, would require a lot of
>> re-work inside of SCTP and would require extensions to the virtio
>> net header to carry extra sctp data.
>
> Can you please elaborate more on this? Is this because SCTP GSO relies
> on the gso skb format for knowing how to segment it instead of having
> a list of sizes?
>
it's mainly because all the true segmentation, placing data into chunks,
has already happened. All that GSO does is allow for higher bundling
rate between VMs. If that is all SCTP GSO ever going to do, that fine,
but the goal is to do real GSO eventually and potentially reduce the
amount of memory copying we are doing.
If we do that, any current attempt at GSO in virtio would have to be
depricated and we'd need GSO2 or something like that.
This is why, after doing the GSO support, I decided not to include it.
-vlad
> Marcelo
>
^ permalink raw reply
* Re: [PATCH v2 3/8] net: ax88796: Do not free IRQ in ax_remove() (already freed in ax_close()).
From: Michael Schmitz @ 2018-04-17 20:36 UTC (permalink / raw)
To: David Miller
Cc: Geert Uytterhoeven, John Paul Adrian Glaubitz, netdev,
Andrew Lunn, Linux/m68k, Michael Karcher
In-Reply-To: <20180417.095115.87630263190451622.davem@davemloft.net>
Thanks Geert, I'll fix that.
I see my v2 series shows up as new series on patchwork - do I need to
do something different when tagging the next version, Dave?
Cheers,
Michael
On Wed, Apr 18, 2018 at 1:51 AM, David Miller <davem@davemloft.net> wrote:
> From: Geert Uytterhoeven <geert@linux-m68k.org>
> Date: Tue, 17 Apr 2018 10:20:25 +0200
>
>> BTW, I have a git alias for that:
>>
>> $ git help fixes
>> `git fixes' is aliased to `show --format='Fixes: %h ("%s")' -s'
>> $ git fixes 82533ad9a1c
>> Fixes: 82533ad9a1c ("net: ethernet: ax88796: don't call free_irq
>> without request_irq first")
>
> Thanks for sharing :)
^ permalink raw reply
* Fw: [Bug 199427] New: Wired Network interface goes repeatedly up and down with NetworkManager
From: Stephen Hemminger @ 2018-04-17 20:36 UTC (permalink / raw)
To: jeffrey.t.kirsher; +Cc: netdev, intel-wired-lan
Begin forwarded message:
Date: Tue, 17 Apr 2018 18:57:20 +0000
From: bugzilla-daemon@bugzilla.kernel.org
To: stephen@networkplumber.org
Subject: [Bug 199427] New: Wired Network interface goes repeatedly up and down with NetworkManager
https://bugzilla.kernel.org/show_bug.cgi?id=199427
Bug ID: 199427
Summary: Wired Network interface goes repeatedly up and down
with NetworkManager
Product: Networking
Version: 2.5
Kernel Version: 4.13.0-18
Hardware: All
OS: Linux
Tree: Mainline
Status: NEW
Severity: blocking
Priority: P1
Component: IPV4
Assignee: stephen@networkplumber.org
Reporter: kuczek@kuczek.pl
Regression: No
Hardware: Laptop Dell Precision 3510 Core i7 and another one Corei5 Desktop
Computer
Common at both: Network Card Intel Pro 1000 e1000e
System: Lubuntu 17.10 x64
After update from kernel 4.13.0-17 to 4.13.0-18 network interface goes
repeatedly up and down. System booted with old 4.13.0-17 works as expected.
It seems to be this issue:
https://bugzilla.kernel.org/show_bug.cgi?id=198047
Syslog says over and over again:
Apr 17 09:42:30 ronald-Precision-3510 dhclient[3610]: DHCPREQUEST of
10.0.10.155 on enp0s31f6 to 255.255.255.255 port 67 (xid=0x7ccfb91c)
Apr 17 09:42:30 ronald-Precision-3510 dhclient[3610]: DHCPACK of 10.0.10.155
from 10.0.10.2
Apr 17 09:42:30 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950950.7146] dhcp4 (enp0s31f6): address 10.0.10.155
Apr 17 09:42:30 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950950.7146] dhcp4 (enp0s31f6): plen 24 (255.255.255.0)
Apr 17 09:42:30 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950950.7146] dhcp4 (enp0s31f6): gateway 10.0.10.2
Apr 17 09:42:30 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950950.7146] dhcp4 (enp0s31f6): lease time 600000
Apr 17 09:42:30 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950950.7146] dhcp4 (enp0s31f6): nameserver '10.0.10.2'
Apr 17 09:42:30 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950950.7146] dhcp4 (enp0s31f6): domain name 'morawski.pl'
Apr 17 09:42:30 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950950.7146] dhcp4 (enp0s31f6): state changed unknown -> bound
Apr 17 09:42:30 ronald-Precision-3510 dhclient[3610]: bound to 10.0.10.155 --
renewal in 240494 seconds.
Apr 17 09:42:30 ronald-Precision-3510 kernel: [ 878.926052] e1000e
0000:00:1f.6 enp0s31f6: changing MTU from 1500 to 1300
Apr 17 09:42:30 ronald-Precision-3510 avahi-daemon[770]: Joining mDNS multicast
group on interface enp0s31f6.IPv4 with address 10.0.10.155.
Apr 17 09:42:30 ronald-Precision-3510 avahi-daemon[770]: New relevant interface
enp0s31f6.IPv4 for mDNS.
Apr 17 09:42:30 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950950.8059] device (enp0s31f6): state change: ip-config -> ip-check
(reason 'none', internal state 'managed')
Apr 17 09:42:30 ronald-Precision-3510 avahi-daemon[770]: Registering new
address record for 10.0.10.155 on enp0s31f6.IPv4.
Apr 17 09:42:30 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950950.8075] device (enp0s31f6): state change: ip-check -> secondaries
(reason 'none', internal state 'managed')
Apr 17 09:42:30 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950950.8080] device (enp0s31f6): state change: secondaries -> activated
(reason 'none', internal state 'managed')
Apr 17 09:42:30 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950950.8108] policy: set 'Dom' (enp0s31f6) as default for IPv4 routing and
DNS
Apr 17 09:42:30 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950950.8114] device (enp0s31f6): Activation: successful, device activated.
Apr 17 09:42:30 ronald-Precision-3510 dbus[792]: [system] Activating via
systemd: service name='org.freedesktop.nm_dispatcher'
unit='dbus-org.freedesktop.nm-dispatcher.service'
Apr 17 09:42:30 ronald-Precision-3510 systemd[1]: Starting Network Manager
Script Dispatcher Service...
Apr 17 09:42:30 ronald-Precision-3510 dbus[792]: [system] Successfully
activated service 'org.freedesktop.nm_dispatcher'
Apr 17 09:42:30 ronald-Precision-3510 systemd[1]: Started Network Manager
Script Dispatcher Service.
Apr 17 09:42:30 ronald-Precision-3510 nm-dispatcher: req:1 'up' [enp0s31f6]:
new request (1 scripts)
Apr 17 09:42:30 ronald-Precision-3510 nm-dispatcher: req:1 'up' [enp0s31f6]:
start running ordered scripts...
Apr 17 09:42:31 ronald-Precision-3510 avahi-daemon[770]: Joining mDNS multicast
group on interface enp0s31f6.IPv6 with address fe80::d681:d7ff:feab:a4ca.
Apr 17 09:42:31 ronald-Precision-3510 avahi-daemon[770]: New relevant interface
enp0s31f6.IPv6 for mDNS.
Apr 17 09:42:31 ronald-Precision-3510 avahi-daemon[770]: Registering new
address record for fe80::d681:d7ff:feab:a4ca on enp0s31f6.*.
Apr 17 09:42:34 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950954.8703] device (enp0s31f6): state change: activated -> unavailable
(reason 'carrier-changed', internal state 'managed')
Apr 17 09:42:34 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950954.9029] dhcp4 (enp0s31f6): canceled DHCP transaction, DHCP client pid
3610
Apr 17 09:42:34 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950954.9030] dhcp4 (enp0s31f6): state changed bound -> done
Apr 17 09:42:34 ronald-Precision-3510 avahi-daemon[770]: Withdrawing address
record for fe80::d681:d7ff:feab:a4ca on enp0s31f6.
Apr 17 09:42:34 ronald-Precision-3510 avahi-daemon[770]: Leaving mDNS multicast
group on interface enp0s31f6.IPv6 with address fe80::d681:d7ff:feab:a4ca.
Apr 17 09:42:34 ronald-Precision-3510 kernel: [ 883.115719] e1000e
0000:00:1f.6 enp0s31f6: changing MTU from 1300 to 1500
Apr 17 09:42:34 ronald-Precision-3510 avahi-daemon[770]: Interface
enp0s31f6.IPv6 no longer relevant for mDNS.
Apr 17 09:42:34 ronald-Precision-3510 avahi-daemon[770]: Withdrawing address
record for 10.0.10.155 on enp0s31f6.
Apr 17 09:42:34 ronald-Precision-3510 avahi-daemon[770]: Leaving mDNS multicast
group on interface enp0s31f6.IPv4 with address 10.0.10.155.
Apr 17 09:42:34 ronald-Precision-3510 avahi-daemon[770]: Interface
enp0s31f6.IPv4 no longer relevant for mDNS.
Apr 17 09:42:35 ronald-Precision-3510 NetworkManager[808]: <info>
Apr 17 09:42:35 ronald-Precision-3510 nm-dispatcher: req:2 'down' [enp0s31f6]:
new request (1 scripts)
Apr 17 09:42:40 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950960.6599] device (enp0s31f6): link connected
Apr 17 09:42:40 ronald-Precision-3510 NetworkManager[808]: <info>
[1523950960.6608] device (enp0s31f6): state change: unavailable -> disconnected
(reason 'carrier-changed', internal state 'managed')
Kind regards
Ronald Kuczek
--
You are receiving this mail because:
You are the assignee for the bug.
^ permalink raw reply
* Re: [PATCH net,stable] tun: fix vlan packet truncation
From: Bjørn Mork @ 2018-04-17 20:41 UTC (permalink / raw)
To: David Miller; +Cc: netdev, jasowang
In-Reply-To: <20180417.134727.1681639113353628332.davem@davemloft.net>
David Miller <davem@davemloft.net> writes:
> The VLAN business might be bogus, and needs to be removed.
>
> However, the pskb_trim() has to stay in some form. I think this is what
> Jason is trying to say.
>
> The semantics of running a BPF program is that the program returns the
> desired packet length. We must truncate the packet to the length
> returned by the BPF program, therefore.
Right. That's obvious when I look closer at it. Sorry for being so
slow... New version coming up.
Bjørn
^ permalink raw reply
* Re: [PATCH v2 3/8] net: ax88796: Do not free IRQ in ax_remove() (already freed in ax_close()).
From: John Paul Adrian Glaubitz @ 2018-04-17 20:42 UTC (permalink / raw)
To: Michael Schmitz
Cc: netdev, Andrew Lunn, Linux/m68k, Michael Karcher, Michael Karcher
In-Reply-To: <CAOmrzk+uE_ouXCmmd78j0zG4QrqT=TvXNhMOQxtt5KWg8KS_cQ@mail.gmail.com>
On 04/17/2018 10:32 PM, Michael Schmitz wrote:
>> From: Michael Karcher <debian@mkarcher.dialup.fu-berlin.de>
>
> I haven't found a way to change that in my tree yet, sorry. Unless
> someone has a simple way to fix patch authorship after a merge, I may
> have to reimport from scratch.
I guess you'll have to re-import then, sorry :(.
The correct mail address is:
Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>
which Michael uses for all his kernel work.
Adrian
--
.''`. John Paul Adrian Glaubitz
: :' : Debian Developer - glaubitz@debian.org
`. `' Freie Universitaet Berlin - glaubitz@physik.fu-berlin.de
`- GPG: 62FF 8A75 84E0 2956 9546 0006 7426 3B37 F5B5 F913
^ permalink raw reply
* [PATCH v2 net,stable] tun: fix vlan packet truncation
From: Bjørn Mork @ 2018-04-17 20:42 UTC (permalink / raw)
To: netdev; +Cc: Bjørn Mork, Jason Wang
Bogus trimming in tun_net_xmit() causes truncated vlan packets.
skb->len is correct whether or not skb_vlan_tag_present() is true. There
is no more reason to adjust the skb length on xmit in this driver than
any other driver. tun_put_user() adds 4 bytes to the total for tagged
packets because it transmits the tag inline to userspace. This is
similar to a nic transmitting the tag inline on the wire.
Reproducing the bug by sending any tagged packet through back-to-back
connected tap interfaces:
socat TUN,tun-type=tap,iff-up,tun-name=in TUN,tun-type=tap,iff-up,tun-name=out &
ip link add link in name in.20 type vlan id 20
ip addr add 10.9.9.9/24 dev in.20
ip link set in.20 up
tshark -nxxi in -f arp -c1 2>/dev/null &
tshark -nxxi out -f arp -c1 2>/dev/null &
ping -c 1 10.9.9.5 >/dev/null 2>&1
The output from the 'in' and 'out' interfaces are different when the
bug is present:
Capturing on 'in'
0000 ff ff ff ff ff ff 76 cf 76 37 d5 0a 81 00 00 14 ......v.v7......
0010 08 06 00 01 08 00 06 04 00 01 76 cf 76 37 d5 0a ..........v.v7..
0020 0a 09 09 09 00 00 00 00 00 00 0a 09 09 05 ..............
Capturing on 'out'
0000 ff ff ff ff ff ff 76 cf 76 37 d5 0a 81 00 00 14 ......v.v7......
0010 08 06 00 01 08 00 06 04 00 01 76 cf 76 37 d5 0a ..........v.v7..
0020 0a 09 09 09 00 00 00 00 00 00 ..........
Fixes: aff3d70a07ff ("tun: allow to attach ebpf socket filter")
Cc: Jason Wang <jasowang@redhat.com>
Signed-off-by: Bjørn Mork <bjorn@mork.no>
---
v2:
- Must still call pskb_trim() after running the filter, as pointed out by
Jason and David. But no need to check if len < 0 anymore, since
run_ebpf_filter() returns insigned ints.
drivers/net/tun.c | 5 -----
1 file changed, 5 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 28583aa0c17d..82c378652555 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1102,11 +1102,6 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
goto drop;
len = run_ebpf_filter(tun, skb, len);
-
- /* Trim extra bytes since we may insert vlan proto & TCI
- * in tun_put_user().
- */
- len -= skb_vlan_tag_present(skb) ? sizeof(struct veth) : 0;
if (len <= 0 || pskb_trim(skb, len))
goto drop;
--
2.11.0
^ permalink raw reply related
* [PATCH bpf-next v4 00/10] BTF: BPF Type Format
From: Martin KaFai Lau @ 2018-04-17 20:42 UTC (permalink / raw)
To: netdev
Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team,
Arnaldo Carvalho de Melo
This patch introduces BPF Type Format (BTF).
BTF (BPF Type Format) is the meta data format which describes
the data types of BPF program/map. Hence, it basically focus
on the C programming language which the modern BPF is primary
using. The first use case is to provide a generic pretty print
capability for a BPF map.
A modified pahole that can convert dwarf to BTF is here:
https://github.com/iamkafai/pahole/tree/btf
(no change since Apr 3rd)
Please see individual patch for details.
v4:
- Fix warning (remove unneeded semicolon)
- Remove a redundant variable (nr_bytes) from btf_int_check_meta() in
patch 1. Caught by W=1.
v3:
- Rebase to bpf-next
- Fix sparse warning (by adding static)
- Add BTF header logging: btf_verifier_log_hdr()
- Fix the alignment test on btf->type_off
- Add tests for the BTF header
- Lower the max BTF size to 16MB. It should be enough
for some time. We could raise it later if it would
be needed.
v2:
- Use kvfree where needed in patch 1 and 2
- Also consider BTF_INT_OFFSET() in the btf_int_check_meta()
in patch 1
- Fix an incorrect goto target in map_create() during
the btf-error-path in patch 7
- re-org some local vars to keep the rev xmas tree in btf.c
Martin KaFai Lau (10):
bpf: btf: Introduce BPF Type Format (BTF)
bpf: btf: Validate type reference
bpf: btf: Check members of struct/union
bpf: btf: Add pretty print capability for data with BTF type info
bpf: btf: Add BPF_BTF_LOAD command
bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd
bpf: btf: Add pretty print support to the basic arraymap
bpf: btf: Sync bpf.h and btf.h to tools/
bpf: btf: Add BTF support to libbpf
bpf: btf: Add BTF tests
include/linux/bpf.h | 20 +-
include/linux/btf.h | 48 +
include/uapi/linux/bpf.h | 12 +
include/uapi/linux/btf.h | 132 ++
kernel/bpf/Makefile | 1 +
kernel/bpf/arraymap.c | 50 +
kernel/bpf/btf.c | 2091 ++++++++++++++++++++++++++
kernel/bpf/inode.c | 146 +-
kernel/bpf/syscall.c | 51 +-
tools/include/uapi/linux/bpf.h | 13 +
tools/include/uapi/linux/btf.h | 132 ++
tools/lib/bpf/Build | 2 +-
tools/lib/bpf/bpf.c | 92 +-
tools/lib/bpf/bpf.h | 16 +
tools/lib/bpf/btf.c | 377 +++++
tools/lib/bpf/btf.h | 22 +
tools/lib/bpf/libbpf.c | 148 +-
tools/lib/bpf/libbpf.h | 3 +
tools/testing/selftests/bpf/Makefile | 26 +-
tools/testing/selftests/bpf/test_btf.c | 1669 ++++++++++++++++++++
tools/testing/selftests/bpf/test_btf_haskv.c | 48 +
tools/testing/selftests/bpf/test_btf_nokv.c | 43 +
22 files changed, 5101 insertions(+), 41 deletions(-)
create mode 100644 include/linux/btf.h
create mode 100644 include/uapi/linux/btf.h
create mode 100644 kernel/bpf/btf.c
create mode 100644 tools/include/uapi/linux/btf.h
create mode 100644 tools/lib/bpf/btf.c
create mode 100644 tools/lib/bpf/btf.h
create mode 100644 tools/testing/selftests/bpf/test_btf.c
create mode 100644 tools/testing/selftests/bpf/test_btf_haskv.c
create mode 100644 tools/testing/selftests/bpf/test_btf_nokv.c
--
2.9.5
^ permalink raw reply
* [PATCH bpf-next v4 06/10] bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd
From: Martin KaFai Lau @ 2018-04-17 20:42 UTC (permalink / raw)
To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180417204243.4028831-1-kafai@fb.com>
This patch adds BPF_OBJ_GET_INFO_BY_FD support to BTF fd.
The original BTF data, which was used to create the BTF fd during
the earlier BPF_BTF_LOAD call, will be returned.
The userspace is expected to allocate buffer
to info.info and the buffer size is set to info.info_len before
calling BPF_OBJ_GET_INFO_BY_FD.
The original BTF data is copied to the userspace buffer (info.info).
Only upto the user's specified info.info_len will be copied.
The original BTF data size is set to info.info_len. The userspace
needs to check if it is bigger than its allocated buffer size.
If it is, the userspace should realloc with the kernel-returned
info.info_len and call the BPF_OBJ_GET_INFO_BY_FD again.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
include/linux/btf.h | 5 +++++
kernel/bpf/btf.c | 17 ++++++++++++++++-
kernel/bpf/syscall.c | 2 ++
3 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/include/linux/btf.h b/include/linux/btf.h
index a7c7072535ea..a966dc6d61ee 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -10,9 +10,14 @@ struct btf;
struct btf_type;
union bpf_attr;
+extern const struct file_operations btf_fops;
+
void btf_put(struct btf *btf);
int btf_new_fd(const union bpf_attr *attr);
struct btf *btf_get_by_fd(int fd);
+int btf_get_info_by_fd(const struct btf *btf,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr);
/* Figure out the size of a type_id. If type_id is a modifier
* (e.g. const), it will be resolved to find out the type with size.
*
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index ba088f02ed17..b09470d2df91 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -2029,7 +2029,7 @@ static int btf_release(struct inode *inode, struct file *filp)
return 0;
}
-static const struct file_operations btf_fops = {
+const struct file_operations btf_fops = {
.release = btf_release,
};
@@ -2074,3 +2074,18 @@ struct btf *btf_get_by_fd(int fd)
return btf;
}
+
+int btf_get_info_by_fd(const struct btf *btf,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ void __user *udata = u64_to_user_ptr(attr->info.info);
+ u32 copy_len = min_t(u32, btf->data_size,
+ attr->info.info_len);
+
+ if (copy_to_user(udata, btf->data, copy_len) ||
+ put_user(btf->data_size, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cd8ebadc66eb..0a4924a0a8da 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2017,6 +2017,8 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
else if (f.file->f_op == &bpf_map_fops)
err = bpf_map_get_info_by_fd(f.file->private_data, attr,
uattr);
+ else if (f.file->f_op == &btf_fops)
+ err = btf_get_info_by_fd(f.file->private_data, attr, uattr);
else
err = -EINVAL;
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v4 09/10] bpf: btf: Add BTF support to libbpf
From: Martin KaFai Lau @ 2018-04-17 20:42 UTC (permalink / raw)
To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180417204243.4028831-1-kafai@fb.com>
If the ".BTF" elf section exists, libbpf will try to create
a btf_fd (through BPF_BTF_LOAD). If that fails, it will still
continue loading the bpf prog/map without the BTF.
If the bpf_object has a BTF loaded, it will create a map with the btf_fd.
libbpf will try to figure out the btf_key_id and btf_value_id of a map by
finding the BTF type with name "<map_name>_key" and "<map_name>_value".
If they cannot be found, it will continue without using the BTF.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
tools/lib/bpf/Build | 2 +-
tools/lib/bpf/bpf.c | 92 +++++++++---
tools/lib/bpf/bpf.h | 16 +++
tools/lib/bpf/btf.c | 377 +++++++++++++++++++++++++++++++++++++++++++++++++
tools/lib/bpf/btf.h | 22 +++
tools/lib/bpf/libbpf.c | 148 +++++++++++++++++--
tools/lib/bpf/libbpf.h | 3 +
7 files changed, 629 insertions(+), 31 deletions(-)
create mode 100644 tools/lib/bpf/btf.c
create mode 100644 tools/lib/bpf/btf.h
diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index 64c679d67109..6070e655042d 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1 +1 @@
-libbpf-y := libbpf.o bpf.o nlattr.o
+libbpf-y := libbpf.o bpf.o nlattr.o btf.o
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index acbb3f8b3bec..76b36cc16e7f 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -73,43 +73,76 @@ static inline int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr,
return syscall(__NR_bpf, cmd, attr, size);
}
-int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
- int key_size, int value_size, int max_entries,
- __u32 map_flags, int node)
+int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr)
{
- __u32 name_len = name ? strlen(name) : 0;
+ __u32 name_len = create_attr->name ? strlen(create_attr->name) : 0;
union bpf_attr attr;
memset(&attr, '\0', sizeof(attr));
- attr.map_type = map_type;
- attr.key_size = key_size;
- attr.value_size = value_size;
- attr.max_entries = max_entries;
- attr.map_flags = map_flags;
- memcpy(attr.map_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1));
+ attr.map_type = create_attr->map_type;
+ attr.key_size = create_attr->key_size;
+ attr.value_size = create_attr->value_size;
+ attr.max_entries = create_attr->max_entries;
+ attr.map_flags = create_attr->map_flags;
+ memcpy(attr.map_name, create_attr->name,
+ min(name_len, BPF_OBJ_NAME_LEN - 1));
+ attr.numa_node = create_attr->numa_node;
+ attr.btf_fd = create_attr->btf_fd;
+ attr.btf_key_id = create_attr->btf_key_id;
+ attr.btf_value_id = create_attr->btf_value_id;
+
+ return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+}
+int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
+ int key_size, int value_size, int max_entries,
+ __u32 map_flags, int node)
+{
+ struct bpf_create_map_attr map_attr = {};
+
+ map_attr.name = name;
+ map_attr.map_type = map_type;
+ map_attr.map_flags = map_flags;
+ map_attr.key_size = key_size;
+ map_attr.value_size = value_size;
+ map_attr.max_entries = max_entries;
if (node >= 0) {
- attr.map_flags |= BPF_F_NUMA_NODE;
- attr.numa_node = node;
+ map_attr.numa_node = node;
+ map_attr.map_flags |= BPF_F_NUMA_NODE;
}
- return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+ return bpf_create_map_xattr(&map_attr);
}
int bpf_create_map(enum bpf_map_type map_type, int key_size,
int value_size, int max_entries, __u32 map_flags)
{
- return bpf_create_map_node(map_type, NULL, key_size, value_size,
- max_entries, map_flags, -1);
+ struct bpf_create_map_attr map_attr = {};
+
+ map_attr.map_type = map_type;
+ map_attr.map_flags = map_flags;
+ map_attr.key_size = key_size;
+ map_attr.value_size = value_size;
+ map_attr.max_entries = max_entries;
+
+ return bpf_create_map_xattr(&map_attr);
}
int bpf_create_map_name(enum bpf_map_type map_type, const char *name,
int key_size, int value_size, int max_entries,
__u32 map_flags)
{
- return bpf_create_map_node(map_type, name, key_size, value_size,
- max_entries, map_flags, -1);
+ struct bpf_create_map_attr map_attr = {};
+
+ map_attr.name = name;
+ map_attr.map_type = map_type;
+ map_attr.map_flags = map_flags;
+ map_attr.key_size = key_size;
+ map_attr.value_size = value_size;
+ map_attr.max_entries = max_entries;
+
+ return bpf_create_map_xattr(&map_attr);
}
int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name,
@@ -573,3 +606,28 @@ int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags)
close(sock);
return ret;
}
+
+int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
+ bool do_log)
+{
+ union bpf_attr attr = {};
+ int fd;
+
+ attr.btf = ptr_to_u64(btf);
+ attr.btf_size = btf_size;
+
+retry:
+ if (do_log && log_buf && log_buf_size) {
+ attr.btf_log_level = 1;
+ attr.btf_log_size = log_buf_size;
+ attr.btf_log_buf = ptr_to_u64(log_buf);
+ }
+
+ fd = sys_bpf(BPF_BTF_LOAD, &attr, sizeof(attr));
+ if (fd == -1 && !do_log && log_buf && log_buf_size) {
+ do_log = true;
+ goto retry;
+ }
+
+ return fd;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 39f6a0d64a3b..01bda076310f 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -26,6 +26,20 @@
#include <linux/bpf.h>
#include <stddef.h>
+struct bpf_create_map_attr {
+ const char *name;
+ enum bpf_map_type map_type;
+ __u32 map_flags;
+ __u32 key_size;
+ __u32 value_size;
+ __u32 max_entries;
+ __u32 numa_node;
+ __u32 btf_fd;
+ __u32 btf_key_id;
+ __u32 btf_value_id;
+};
+
+int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr);
int bpf_create_map_node(enum bpf_map_type map_type, const char *name,
int key_size, int value_size, int max_entries,
__u32 map_flags, int node);
@@ -87,4 +101,6 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len);
int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
__u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt);
int bpf_raw_tracepoint_open(const char *name, int prog_fd);
+int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
+ bool do_log);
#endif
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
new file mode 100644
index 000000000000..b16c8f43e98b
--- /dev/null
+++ b/tools/lib/bpf/btf.c
@@ -0,0 +1,377 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/err.h>
+#include <linux/btf.h>
+#include "btf.h"
+#include "bpf.h"
+
+#define elog(fmt, ...) { if (err_log) err_log(fmt, ##__VA_ARGS__); }
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#define min(a, b) ((a) < (b) ? (a) : (b))
+
+#define BTF_MAX_NR_TYPES 65535
+
+static struct btf_type btf_void;
+
+struct btf {
+ union {
+ struct btf_header *hdr;
+ void *data;
+ };
+ struct btf_type **types;
+ const char *strings;
+ void *nohdr_data;
+ uint32_t nr_types;
+ uint32_t types_size;
+ uint32_t data_size;
+ int fd;
+};
+
+static const char *btf_name_by_offset(const struct btf *btf, uint32_t offset)
+{
+ if (!BTF_STR_TBL_ELF_ID(offset) &&
+ BTF_STR_OFFSET(offset) < btf->hdr->str_len)
+ return &btf->strings[BTF_STR_OFFSET(offset)];
+ else
+ return NULL;
+}
+
+static int btf_add_type(struct btf *btf, struct btf_type *t)
+{
+ if (btf->types_size - btf->nr_types < 2) {
+ struct btf_type **new_types;
+ u32 expand_by, new_size;
+
+ if (btf->types_size == BTF_MAX_NR_TYPES)
+ return -E2BIG;
+
+ expand_by = max(btf->types_size >> 2, 16);
+ new_size = min(BTF_MAX_NR_TYPES, btf->types_size + expand_by);
+
+ new_types = realloc(btf->types, sizeof(*new_types) * new_size);
+ if (!new_types)
+ return -ENOMEM;
+
+ if (btf->nr_types == 0)
+ new_types[0] = &btf_void;
+
+ btf->types = new_types;
+ btf->types_size = new_size;
+ }
+
+ btf->types[++(btf->nr_types)] = t;
+
+ return 0;
+}
+
+static int btf_parse_hdr(struct btf *btf, btf_print_fn_t err_log)
+{
+ const struct btf_header *hdr = btf->hdr;
+ u32 meta_left;
+
+ if (btf->data_size < sizeof(struct btf_header)) {
+ elog("BTF header not found\n");
+ return -EINVAL;
+ }
+
+ if (hdr->magic != BTF_MAGIC) {
+ elog("Invalid BTF magic:%x\n", hdr->magic);
+ return -EINVAL;
+ }
+
+ if (hdr->version != BTF_VERSION) {
+ elog("Unsupported BTF version:%u\n", hdr->version);
+ return -ENOTSUP;
+ }
+
+ if (hdr->flags) {
+ elog("Unsupported BTF flags:%x\n", hdr->flags);
+ return -ENOTSUP;
+ }
+
+ meta_left = btf->data_size - sizeof(*hdr);
+ if (!meta_left) {
+ elog("BTF has no data\n");
+ return -EINVAL;
+ }
+
+ if (meta_left < hdr->type_off) {
+ elog("Invalid BTF type section offset:%u\n", hdr->type_off);
+ return -EINVAL;
+ }
+
+ if (meta_left < hdr->str_off) {
+ elog("Invalid BTF string section offset:%u\n", hdr->str_off);
+ return -EINVAL;
+ }
+
+ if (hdr->type_off >= hdr->str_off) {
+ elog("BTF type section offset >= string section offset. No type?\n");
+ return -EINVAL;
+ }
+
+ if (hdr->type_off & 0x02) {
+ elog("BTF type section is not aligned to 4 bytes\n");
+ return -EINVAL;
+ }
+
+ btf->nohdr_data = btf->hdr + 1;
+
+ return 0;
+}
+
+static int btf_parse_str_sec(struct btf *btf, btf_print_fn_t err_log)
+{
+ const struct btf_header *hdr = btf->hdr;
+ const char *start = btf->nohdr_data + hdr->str_off;
+ const char *end = start + btf->hdr->str_len;
+
+ if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
+ start[0] || end[-1]) {
+ elog("Invalid BTF string section\n");
+ return -EINVAL;
+ }
+
+ btf->strings = start;
+
+ return 0;
+}
+
+static int btf_parse_type_sec(struct btf *btf, btf_print_fn_t err_log)
+{
+ struct btf_header *hdr = btf->hdr;
+ void *nohdr_data = btf->nohdr_data;
+ void *next_type = nohdr_data + hdr->type_off;
+ void *end_type = nohdr_data + hdr->str_off;
+
+ while (next_type < end_type) {
+ struct btf_type *t = next_type;
+ uint16_t vlen = BTF_INFO_VLEN(t->info);
+ int err;
+
+ next_type += sizeof(*t);
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_INT:
+ next_type += sizeof(int);
+ break;
+ case BTF_KIND_ARRAY:
+ next_type += sizeof(struct btf_array);
+ break;
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ next_type += vlen * sizeof(struct btf_member);
+ break;
+ case BTF_KIND_ENUM:
+ next_type += vlen * sizeof(struct btf_enum);
+ break;
+ case BTF_KIND_TYPEDEF:
+ case BTF_KIND_PTR:
+ case BTF_KIND_FWD:
+ case BTF_KIND_VOLATILE:
+ case BTF_KIND_CONST:
+ case BTF_KIND_RESTRICT:
+ break;
+ case BTF_KIND_FLOAT:
+ case BTF_KIND_FUNC:
+ default:
+ elog("Unsupported BTF_KIND:%u\n",
+ BTF_INFO_KIND(t->info));
+ return -EINVAL;
+ }
+
+ err = btf_add_type(btf, t);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static const struct btf_type *btf_type_by_id(const struct btf *btf,
+ uint32_t type_id)
+{
+ if (type_id > btf->nr_types)
+ return NULL;
+
+ return btf->types[type_id];
+}
+
+static bool btf_type_is_void(const struct btf_type *t)
+{
+ return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_is_void_or_null(const struct btf_type *t)
+{
+ return !t || btf_type_is_void(t);
+}
+
+static int64_t btf_type_size(const struct btf_type *t)
+{
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_INT:
+ case BTF_KIND_FLOAT:
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ case BTF_KIND_ENUM:
+ return t->size;
+ case BTF_KIND_PTR:
+ return sizeof(void *);
+ default:
+ return -EINVAL;
+ }
+}
+
+#define MAX_RESOLVE_DEPTH 32
+
+int64_t btf__resolve_size(const struct btf *btf, uint32_t type_id)
+{
+ const struct btf_array *array;
+ const struct btf_type *t;
+ uint32_t nelems = 1;
+ int64_t size = -1;
+ int i;
+
+ t = btf_type_by_id(btf, type_id);
+ for (i = 0; i < MAX_RESOLVE_DEPTH && !btf_type_is_void_or_null(t);
+ i++) {
+ size = btf_type_size(t);
+ if (size >= 0)
+ break;
+
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_TYPEDEF:
+ case BTF_KIND_VOLATILE:
+ case BTF_KIND_CONST:
+ case BTF_KIND_RESTRICT:
+ type_id = t->type;
+ break;
+ case BTF_KIND_ARRAY:
+ array = (const struct btf_array *)(t + 1);
+ if (nelems && array->nelems > UINT32_MAX / nelems)
+ return -E2BIG;
+ nelems *= array->nelems;
+ type_id = array->type;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ t = btf_type_by_id(btf, type_id);
+ }
+
+ if (size < 0)
+ return -EINVAL;
+
+ if (nelems && size > UINT32_MAX / nelems)
+ return -E2BIG;
+
+ return nelems * size;
+}
+
+int32_t btf__find_by_name(const struct btf *btf, const char *type_name)
+{
+ uint32_t i;
+
+ if (!strcmp(type_name, "void"))
+ return 0;
+
+ for (i = 1; i <= btf->nr_types; i++) {
+ const struct btf_type *t = btf->types[i];
+ const char *name = btf_name_by_offset(btf, t->name);
+
+ if (name && !strcmp(type_name, name))
+ return i;
+ }
+
+ return -ENOENT;
+}
+
+void btf__free(struct btf *btf)
+{
+ if (!btf)
+ return;
+
+ if (btf->fd != -1)
+ close(btf->fd);
+
+ free(btf->data);
+ free(btf->types);
+ free(btf);
+}
+
+struct btf *btf__new(uint8_t *data, uint32_t size,
+ btf_print_fn_t err_log)
+{
+ uint32_t log_buf_size = 0;
+ char *log_buf = NULL;
+ struct btf *btf;
+ int err;
+
+ btf = calloc(1, sizeof(struct btf));
+ if (!btf)
+ return ERR_PTR(-ENOMEM);
+
+ btf->fd = -1;
+
+ if (err_log) {
+ log_buf = malloc(BPF_LOG_BUF_SIZE);
+ if (!log_buf) {
+ err = -ENOMEM;
+ goto done;
+ }
+ *log_buf = 0;
+ log_buf_size = BPF_LOG_BUF_SIZE;
+ }
+
+ btf->data = malloc(size);
+ if (!btf->data) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ memcpy(btf->data, data, size);
+ btf->data_size = size;
+
+ btf->fd = bpf_load_btf(btf->data, btf->data_size,
+ log_buf, log_buf_size, false);
+
+ if (btf->fd == -1) {
+ err = -errno;
+ elog("Error loading BTF: %s(%d)\n", strerror(errno), errno);
+ if (log_buf && *log_buf)
+ elog("%s\n", log_buf);
+ goto done;
+ }
+
+ err = btf_parse_hdr(btf, err_log);
+ if (err)
+ goto done;
+
+ err = btf_parse_str_sec(btf, err_log);
+ if (err)
+ goto done;
+
+ err = btf_parse_type_sec(btf, err_log);
+
+done:
+ free(log_buf);
+
+ if (err) {
+ btf__free(btf);
+ return ERR_PTR(err);
+ }
+
+ return btf;
+}
+
+int btf__fd(const struct btf *btf)
+{
+ return btf->fd;
+}
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
new file mode 100644
index 000000000000..74bb344035bb
--- /dev/null
+++ b/tools/lib/bpf/btf.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#ifndef __BPF_BTF_H
+#define __BPF_BTF_H
+
+#include <stdint.h>
+
+#define BTF_ELF_SEC ".BTF"
+
+struct btf;
+
+typedef int (*btf_print_fn_t)(const char *, ...)
+ __attribute__((format(printf, 1, 2)));
+
+void btf__free(struct btf *btf);
+struct btf *btf__new(uint8_t *data, uint32_t size, btf_print_fn_t err_log);
+int32_t btf__find_by_name(const struct btf *btf, const char *type_name);
+int64_t btf__resolve_size(const struct btf *btf, uint32_t type_id);
+int btf__fd(const struct btf *btf);
+
+#endif
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 5922443063f0..59340bf4fc58 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -45,6 +45,7 @@
#include "libbpf.h"
#include "bpf.h"
+#include "btf.h"
#ifndef EM_BPF
#define EM_BPF 247
@@ -212,6 +213,8 @@ struct bpf_map {
char *name;
size_t offset;
struct bpf_map_def def;
+ uint32_t btf_key_id;
+ uint32_t btf_value_id;
void *priv;
bpf_map_clear_priv_t clear_priv;
};
@@ -256,6 +259,8 @@ struct bpf_object {
*/
struct list_head list;
+ struct btf *btf;
+
void *priv;
bpf_object_clear_priv_t clear_priv;
@@ -819,7 +824,15 @@ static int bpf_object__elf_collect(struct bpf_object *obj)
data->d_size);
else if (strcmp(name, "maps") == 0)
obj->efile.maps_shndx = idx;
- else if (sh.sh_type == SHT_SYMTAB) {
+ else if (strcmp(name, BTF_ELF_SEC) == 0) {
+ obj->btf = btf__new(data->d_buf, data->d_size,
+ __pr_debug);
+ if (IS_ERR(obj->btf)) {
+ pr_warning("Error loading ELF section %s: %ld. Ignored and continue.\n",
+ BTF_ELF_SEC, PTR_ERR(obj->btf));
+ obj->btf = NULL;
+ }
+ } else if (sh.sh_type == SHT_SYMTAB) {
if (obj->efile.symbols) {
pr_warning("bpf: multiple SYMTAB in %s\n",
obj->path);
@@ -996,33 +1009,126 @@ bpf_program__collect_reloc(struct bpf_program *prog, GElf_Shdr *shdr,
return 0;
}
+static int bpf_map_find_btf_info(struct bpf_map *map, const struct btf *btf)
+{
+ struct bpf_map_def *def = &map->def;
+ const size_t max_name = 256;
+ int64_t key_size, value_size;
+ int32_t key_id, value_id;
+ char name[max_name];
+
+ /* Find key type by name from BTF */
+ if (snprintf(name, max_name, "%s_key", map->name) == max_name) {
+ pr_warning("map:%s length of BTF key_type:%s_key is too long\n",
+ map->name, map->name);
+ return -EINVAL;
+ }
+
+ key_id = btf__find_by_name(btf, name);
+ if (key_id < 0) {
+ pr_debug("map:%s key_type:%s cannot be found in BTF\n",
+ map->name, name);
+ return key_id;
+ }
+
+ key_size = btf__resolve_size(btf, key_id);
+ if (key_size < 0) {
+ pr_warning("map:%s key_type:%s cannot get the BTF type_size\n",
+ map->name, name);
+ return key_size;
+ }
+
+ if (def->key_size != key_size) {
+ pr_warning("map:%s key_type:%s has BTF type_size:%ld != key_size:%u\n",
+ map->name, name, key_size, def->key_size);
+ return -EINVAL;
+ }
+
+ /* Find value type from BTF */
+ if (snprintf(name, max_name, "%s_value", map->name) == max_name) {
+ pr_warning("map:%s length of BTF value_type:%s_value is too long\n",
+ map->name, map->name);
+ return -EINVAL;
+ }
+
+ value_id = btf__find_by_name(btf, name);
+ if (value_id < 0) {
+ pr_debug("map:%s value_type:%s cannot be found in BTF\n",
+ map->name, name);
+ return value_id;
+ }
+
+ value_size = btf__resolve_size(btf, value_id);
+ if (value_size < 0) {
+ pr_warning("map:%s value_type:%s cannot get the BTF type_size\n",
+ map->name, name);
+ return value_size;
+ }
+
+ if (def->value_size != value_size) {
+ pr_warning("map:%s value_type:%s has BTF type_size:%ld != value_size:%u\n",
+ map->name, name, value_size, def->value_size);
+ return -EINVAL;
+ }
+
+ map->btf_key_id = key_id;
+ map->btf_value_id = value_id;
+
+ return 0;
+}
+
static int
bpf_object__create_maps(struct bpf_object *obj)
{
+ struct bpf_create_map_attr create_attr = {};
unsigned int i;
+ int err;
for (i = 0; i < obj->nr_maps; i++) {
- struct bpf_map_def *def = &obj->maps[i].def;
- int *pfd = &obj->maps[i].fd;
-
- *pfd = bpf_create_map_name(def->type,
- obj->maps[i].name,
- def->key_size,
- def->value_size,
- def->max_entries,
- def->map_flags);
+ struct bpf_map *map = &obj->maps[i];
+ struct bpf_map_def *def = &map->def;
+ int *pfd = &map->fd;
+
+ create_attr.name = map->name;
+ create_attr.map_type = def->type;
+ create_attr.map_flags = def->map_flags;
+ create_attr.key_size = def->key_size;
+ create_attr.value_size = def->value_size;
+ create_attr.max_entries = def->max_entries;
+ create_attr.btf_fd = 0;
+ create_attr.btf_key_id = 0;
+ create_attr.btf_value_id = 0;
+
+ if (obj->btf && !bpf_map_find_btf_info(map, obj->btf)) {
+ create_attr.btf_fd = btf__fd(obj->btf);
+ create_attr.btf_key_id = map->btf_key_id;
+ create_attr.btf_value_id = map->btf_value_id;
+ }
+
+ *pfd = bpf_create_map_xattr(&create_attr);
+ if (*pfd < 0 && create_attr.btf_key_id) {
+ pr_warning("Error in bpf_create_map_xattr(%s):%s(%d). Retrying without BTF.\n",
+ map->name, strerror(errno), errno);
+ create_attr.btf_fd = 0;
+ create_attr.btf_key_id = 0;
+ create_attr.btf_value_id = 0;
+ map->btf_key_id = 0;
+ map->btf_value_id = 0;
+ *pfd = bpf_create_map_xattr(&create_attr);
+ }
+
if (*pfd < 0) {
size_t j;
- int err = *pfd;
+ err = *pfd;
pr_warning("failed to create map (name: '%s'): %s\n",
- obj->maps[i].name,
+ map->name,
strerror(errno));
for (j = 0; j < i; j++)
zclose(obj->maps[j].fd);
return err;
}
- pr_debug("create map %s: fd=%d\n", obj->maps[i].name, *pfd);
+ pr_debug("create map %s: fd=%d\n", map->name, *pfd);
}
return 0;
@@ -1641,6 +1747,7 @@ void bpf_object__close(struct bpf_object *obj)
bpf_object__elf_finish(obj);
bpf_object__unload(obj);
+ btf__free(obj->btf);
for (i = 0; i < obj->nr_maps; i++) {
zfree(&obj->maps[i].name);
@@ -1692,6 +1799,11 @@ unsigned int bpf_object__kversion(struct bpf_object *obj)
return obj ? obj->kern_version : 0;
}
+int bpf_object__btf_fd(const struct bpf_object *obj)
+{
+ return obj->btf ? btf__fd(obj->btf) : -1;
+}
+
int bpf_object__set_priv(struct bpf_object *obj, void *priv,
bpf_object_clear_priv_t clear_priv)
{
@@ -1929,6 +2041,16 @@ const char *bpf_map__name(struct bpf_map *map)
return map ? map->name : NULL;
}
+uint32_t bpf_map__btf_key_id(const struct bpf_map *map)
+{
+ return map ? map->btf_key_id : 0;
+}
+
+uint32_t bpf_map__btf_value_id(const struct bpf_map *map)
+{
+ return map ? map->btf_value_id : 0;
+}
+
int bpf_map__set_priv(struct bpf_map *map, void *priv,
bpf_map_clear_priv_t clear_priv)
{
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index a3a62a583f27..bd53c2f50048 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -78,6 +78,7 @@ int bpf_object__load(struct bpf_object *obj);
int bpf_object__unload(struct bpf_object *obj);
const char *bpf_object__name(struct bpf_object *obj);
unsigned int bpf_object__kversion(struct bpf_object *obj);
+int bpf_object__btf_fd(const struct bpf_object *obj);
struct bpf_object *bpf_object__next(struct bpf_object *prev);
#define bpf_object__for_each_safe(pos, tmp) \
@@ -239,6 +240,8 @@ bpf_map__next(struct bpf_map *map, struct bpf_object *obj);
int bpf_map__fd(struct bpf_map *map);
const struct bpf_map_def *bpf_map__def(struct bpf_map *map);
const char *bpf_map__name(struct bpf_map *map);
+uint32_t bpf_map__btf_key_id(const struct bpf_map *map);
+uint32_t bpf_map__btf_value_id(const struct bpf_map *map);
typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *);
int bpf_map__set_priv(struct bpf_map *map, void *priv,
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v4 01/10] bpf: btf: Introduce BPF Type Format (BTF)
From: Martin KaFai Lau @ 2018-04-17 20:42 UTC (permalink / raw)
To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180417204243.4028831-1-kafai@fb.com>
This patch introduces BPF type Format (BTF).
BTF (BPF Type Format) is the meta data format which describes
the data types of BPF program/map. Hence, it basically focus
on the C programming language which the modern BPF is primary
using. The first use case is to provide a generic pretty print
capability for a BPF map.
BTF has its root from CTF (Compact C-Type format). To simplify
the handling of BTF data, BTF removes the differences between
small and big type/struct-member. Hence, BTF consistently uses u32
instead of supporting both "one u16" and "two u32 (+padding)" in
describing type and struct-member.
It also raises the number of types (and functions) limit
from 0x7fff to 0x7fffffff.
Due to the above changes, the format is not compatible to CTF.
Hence, BTF starts with a new BTF_MAGIC and version number.
This patch does the first verification pass to the BTF. The first
pass checks:
1. meta-data size (e.g. It does not go beyond the total btf's size)
2. name_offset is valid
3. Each BTF_KIND (e.g. int, enum, struct....) does its
own check of its meta-data.
Some other checks, like checking a struct's member is referring
to a valid type, can only be done in the second pass. The second
verification pass will be implemented in the next patch.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
include/uapi/linux/btf.h | 132 +++++++
kernel/bpf/Makefile | 1 +
kernel/bpf/btf.c | 938 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 1071 insertions(+)
create mode 100644 include/uapi/linux/btf.h
create mode 100644 kernel/bpf/btf.c
diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h
new file mode 100644
index 000000000000..30f23ad78eff
--- /dev/null
+++ b/include/uapi/linux/btf.h
@@ -0,0 +1,132 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/* Copyright (c) 2018 Facebook */
+#ifndef _UAPI__LINUX_BTF_H__
+#define _UAPI__LINUX_BTF_H__
+
+#include <linux/types.h>
+
+#define BTF_MAGIC 0xeB9F
+#define BTF_MAGIC_SWAP 0x9FeB
+#define BTF_VERSION 1
+#define BTF_FLAGS_COMPR 0x01
+
+struct btf_header {
+ __u16 magic;
+ __u8 version;
+ __u8 flags;
+
+ __u32 parent_label;
+ __u32 parent_name;
+
+ /* All offsets are in bytes relative to the end of this header */
+ __u32 label_off; /* offset of label section */
+ __u32 object_off; /* offset of data object section*/
+ __u32 func_off; /* offset of function section */
+ __u32 type_off; /* offset of type section */
+ __u32 str_off; /* offset of string section */
+ __u32 str_len; /* length of string section */
+};
+
+/* Max # of type identifier */
+#define BTF_MAX_TYPE 0x7fffffff
+/* Max offset into the string section */
+#define BTF_MAX_NAME_OFFSET 0x7fffffff
+/* Max # of struct/union/enum members or func args */
+#define BTF_MAX_VLEN 0xffff
+
+/* The type id is referring to a parent BTF */
+#define BTF_TYPE_PARENT(id) (((id) >> 31) & 0x1)
+#define BTF_TYPE_ID(id) ((id) & BTF_MAX_TYPE)
+
+/* String is in the ELF string section */
+#define BTF_STR_TBL_ELF_ID(ref) (((ref) >> 31) & 0x1)
+#define BTF_STR_OFFSET(ref) ((ref) & BTF_MAX_NAME_OFFSET)
+
+struct btf_type {
+ __u32 name;
+ /* "info" bits arrangement
+ * bits 0-15: vlen (e.g. # of struct's members)
+ * bits 16-23: unused
+ * bits 24-28: kind (e.g. int, ptr, array...etc)
+ * bits 29-30: unused
+ * bits 31: root
+ */
+ __u32 info;
+ /* "size" is used by INT, ENUM, STRUCT and UNION.
+ * "size" tells the size of the type it is describing.
+ *
+ * "type" is used by PTR, TYPEDEF, VOLATILE, CONST and RESTRICT.
+ * "type" is a type_id referring to another type.
+ */
+ union {
+ __u32 size;
+ __u32 type;
+ };
+};
+
+#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f)
+#define BTF_INFO_ISROOT(info) (!!(((info) >> 24) & 0x80))
+#define BTF_INFO_VLEN(info) ((info) & 0xffff)
+
+#define BTF_KIND_UNKN 0 /* Unknown */
+#define BTF_KIND_INT 1 /* Integer */
+#define BTF_KIND_FLOAT 2 /* Float */
+#define BTF_KIND_PTR 3 /* Pointer */
+#define BTF_KIND_ARRAY 4 /* Array */
+#define BTF_KIND_FUNC 5 /* Function */
+#define BTF_KIND_STRUCT 6 /* Struct */
+#define BTF_KIND_UNION 7 /* Union */
+#define BTF_KIND_ENUM 8 /* Enumeration */
+#define BTF_KIND_FWD 9 /* Forward */
+#define BTF_KIND_TYPEDEF 10 /* Typedef */
+#define BTF_KIND_VOLATILE 11 /* Volatile */
+#define BTF_KIND_CONST 12 /* Const */
+#define BTF_KIND_RESTRICT 13 /* Restrict */
+#define BTF_KIND_MAX 13
+#define NR_BTF_KINDS 14
+
+/* For some specific BTF_KIND, "struct btf_type" is immediately
+ * followed by extra data.
+ */
+
+/* BTF_KIND_INT is followed by a u32 and the following
+ * is the 32 bits arrangement:
+ */
+#define BTF_INT_ENCODING(VAL) (((VAL) & 0xff000000) >> 24)
+#define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16)
+#define BTF_INT_BITS(VAL) ((VAL) & 0x0000ffff)
+
+/* Attributes stored in the BTF_INT_ENCODING */
+#define BTF_INT_SIGNED 0x1
+#define BTF_INT_CHAR 0x2
+#define BTF_INT_BOOL 0x4
+#define BTF_INT_VARARGS 0x8
+
+/* BTF_KIND_ENUM is followed by multiple "struct btf_enum".
+ * The exact number of btf_enum is stored in the vlen (of the
+ * info in "struct btf_type").
+ */
+struct btf_enum {
+ __u32 name;
+ __s32 val;
+};
+
+/* BTF_KIND_ARRAY is followed by one "struct btf_array" */
+struct btf_array {
+ __u32 type;
+ __u32 index_type;
+ __u32 nelems;
+};
+
+/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed
+ * by multiple "struct btf_member". The exact number
+ * of btf_member is stored in the vlen (of the info in
+ * "struct btf_type").
+ */
+struct btf_member {
+ __u32 name;
+ __u32 type;
+ __u32 offset; /* offset in bits */
+};
+
+#endif /* _UAPI__LINUX_BTF_H__ */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index a713fd23ec88..35c485fa9ea3 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -4,6 +4,7 @@ obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o
+obj-$(CONFIG_BPF_SYSCALL) += btf.o
ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_BPF_SYSCALL) += devmap.o
obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
new file mode 100644
index 000000000000..3a6fd0109f58
--- /dev/null
+++ b/kernel/bpf/btf.c
@@ -0,0 +1,938 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#include <uapi/linux/btf.h>
+#include <uapi/linux/types.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/kernel.h>
+#include <linux/bpf_verifier.h>
+#include <linux/btf.h>
+
+/* BTF (BPF Type Format) is the meta data format which describes
+ * the data types of BPF program/map. Hence, it basically focus
+ * on the C programming language which the modern BPF is primary
+ * using.
+ *
+ * ELF Section:
+ * ~~~~~~~~~~~
+ * The BTF data is stored under the ".BTF" ELF section
+ *
+ * struct btf_type:
+ * ~~~~~~~~~~~~~~~
+ * Each 'struct btf_type' object describes a C data type.
+ * Depending on the type it is describing, a 'struct btf_type'
+ * object may be followed by more data. F.e.
+ * To describe an array, 'struct btf_type' is followed by
+ * 'struct btf_array'.
+ *
+ * 'struct btf_type' and any extra data following it are
+ * 4 bytes aligned.
+ *
+ * Type section:
+ * ~~~~~~~~~~~~~
+ * The BTF type section contains a list of 'struct btf_type' objects.
+ * Each one describes a C type. Recall from the above section
+ * that a 'struct btf_type' object could be immediately followed by extra
+ * data in order to desribe some particular C types.
+ *
+ * type_id:
+ * ~~~~~~~
+ * Each btf_type object is identified by a type_id. The type_id
+ * is implicitly implied by the location of the btf_type object in
+ * the BTF type section. The first one has type_id 1. The second
+ * one has type_id 2...etc. Hence, an earlier btf_type has
+ * a smaller type_id.
+ *
+ * A btf_type object may refer to another btf_type object by using
+ * type_id (i.e. the "type" in the "struct btf_type").
+ *
+ * NOTE that we cannot assume any reference-order.
+ * A btf_type object can refer to an earlier btf_type object
+ * but it can also refer to a later btf_type object.
+ *
+ * For example, to describe "const void *". A btf_type
+ * object describing "const" may refer to another btf_type
+ * object describing "void *". This type-reference is done
+ * by specifying type_id:
+ *
+ * [1] CONST (anon) type_id=2
+ * [2] PTR (anon) type_id=0
+ *
+ * The above is the btf_verifier debug log:
+ * - Each line started with "[?]" is a btf_type object
+ * - [?] is the type_id of the btf_type object.
+ * - CONST/PTR is the BTF_KIND_XXX
+ * - "(anon)" is the name of the type. It just
+ * happens that CONST and PTR has no name.
+ * - type_id=XXX is the 'u32 type' in btf_type
+ *
+ * NOTE: "void" has type_id 0
+ *
+ * String section:
+ * ~~~~~~~~~~~~~~
+ * The BTF string section contains the names used by the type section.
+ * Each string is referred by an "offset" from the beginning of the
+ * string section.
+ *
+ * Each string is '\0' terminated.
+ *
+ * The first character in the string section must be '\0'
+ * which is used to mean 'anonymous'. Some btf_type may not
+ * have a name.
+ */
+
+/* BTF verification:
+ *
+ * To verify BTF data, two passes are needed.
+ *
+ * Pass #1
+ * ~~~~~~~
+ * The first pass is to collect all btf_type objects to
+ * an array: "btf->types".
+ *
+ * Depending on the C type that a btf_type is describing,
+ * a btf_type may be followed by extra data. We don't know
+ * how many btf_type is there, and more importantly we don't
+ * know where each btf_type is located in the type section.
+ *
+ * Without knowing the location of each type_id, most verifications
+ * cannot be done. e.g. an earlier btf_type may refer to a later
+ * btf_type (recall the "const void *" above), so we cannot
+ * check this type-reference in the first pass.
+ *
+ * In the first pass, it still does some verifications (e.g.
+ * checking the name is a valid offset to the string section).
+ */
+
+#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
+#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1)
+#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK)
+#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3)
+#define BITS_ROUNDUP_BYTES(bits) \
+ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits))
+
+/* 16MB for 64k structs and each has 16 members and
+ * a few MB spaces for the string section.
+ * The hard limit is S32_MAX.
+ */
+#define BTF_MAX_SIZE (16 * 1024 * 1024)
+/* 64k. We can raise it later. The hard limit is S32_MAX. */
+#define BTF_MAX_NR_TYPES 65535
+
+#define for_each_member(i, struct_type, member) \
+ for (i = 0, member = btf_type_member(struct_type); \
+ i < btf_type_vlen(struct_type); \
+ i++, member++)
+
+struct btf {
+ union {
+ struct btf_header *hdr;
+ void *data;
+ };
+ struct btf_type **types;
+ const char *strings;
+ void *nohdr_data;
+ u32 nr_types;
+ u32 types_size;
+ u32 data_size;
+};
+
+struct btf_verifier_env {
+ struct btf *btf;
+ struct bpf_verifier_log log;
+ u32 log_type_id;
+};
+
+static const char * const btf_kind_str[NR_BTF_KINDS] = {
+ [BTF_KIND_UNKN] = "UNKNOWN",
+ [BTF_KIND_INT] = "INT",
+ [BTF_KIND_FLOAT] = "FLOAT",
+ [BTF_KIND_PTR] = "PTR",
+ [BTF_KIND_ARRAY] = "ARRAY",
+ [BTF_KIND_FUNC] = "FUNC",
+ [BTF_KIND_STRUCT] = "STRUCT",
+ [BTF_KIND_UNION] = "UNION",
+ [BTF_KIND_ENUM] = "ENUM",
+ [BTF_KIND_FWD] = "FWD",
+ [BTF_KIND_TYPEDEF] = "TYPEDEF",
+ [BTF_KIND_VOLATILE] = "VOLATILE",
+ [BTF_KIND_CONST] = "CONST",
+ [BTF_KIND_RESTRICT] = "RESTRICT",
+};
+
+struct btf_kind_operations {
+ s32 (*check_meta)(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left);
+ void (*log_details)(struct btf_verifier_env *env,
+ const struct btf_type *t);
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
+static struct btf_type btf_void;
+
+static const char *btf_int_encoding_str(u8 encoding)
+{
+ if (encoding == 0)
+ return "(none)";
+ else if (encoding == BTF_INT_SIGNED)
+ return "SIGNED";
+ else if (encoding == BTF_INT_CHAR)
+ return "CHAR";
+ else if (encoding == BTF_INT_BOOL)
+ return "BOOL";
+ else if (encoding == BTF_INT_VARARGS)
+ return "VARARGS";
+ else
+ return "UNKN";
+}
+
+static u16 btf_type_vlen(const struct btf_type *t)
+{
+ return BTF_INFO_VLEN(t->info);
+}
+
+static u32 btf_type_int(const struct btf_type *t)
+{
+ return *(u32 *)(t + 1);
+}
+
+static const struct btf_array *btf_type_array(const struct btf_type *t)
+{
+ return (const struct btf_array *)(t + 1);
+}
+
+static const struct btf_member *btf_type_member(const struct btf_type *t)
+{
+ return (const struct btf_member *)(t + 1);
+}
+
+static const struct btf_enum *btf_type_enum(const struct btf_type *t)
+{
+ return (const struct btf_enum *)(t + 1);
+}
+
+static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
+{
+ return kind_ops[BTF_INFO_KIND(t->info)];
+}
+
+static bool btf_name_offset_valid(const struct btf *btf, u32 offset)
+{
+ return !BTF_STR_TBL_ELF_ID(offset) &&
+ BTF_STR_OFFSET(offset) < btf->hdr->str_len;
+}
+
+static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
+{
+ if (!BTF_STR_OFFSET(offset))
+ return "(anon)";
+ else if (BTF_STR_OFFSET(offset) < btf->hdr->str_len)
+ return &btf->strings[BTF_STR_OFFSET(offset)];
+ else
+ return "(invalid-name-offset)";
+}
+
+__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
+ const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+
+__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+}
+
+__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ bool log_details,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ u8 kind = BTF_INFO_KIND(t->info);
+ struct btf *btf = env->btf;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ __btf_verifier_log(log, "[%u] %s %s%s",
+ env->log_type_id,
+ btf_kind_str[kind],
+ btf_name_by_offset(btf, t->name),
+ log_details ? " " : "");
+
+ if (log_details)
+ btf_type_ops(t)->log_details(env, t);
+
+ if (fmt && *fmt) {
+ __btf_verifier_log(log, " ");
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+ }
+
+ __btf_verifier_log(log, "\n");
+}
+
+#define btf_verifier_log_type(env, t, ...) \
+ __btf_verifier_log_type((env), (t), true, __VA_ARGS__)
+#define btf_verifier_log_basic(env, t, ...) \
+ __btf_verifier_log_type((env), (t), false, __VA_ARGS__)
+
+__printf(4, 5)
+static void btf_verifier_log_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const char *fmt, ...)
+{
+ struct bpf_verifier_log *log = &env->log;
+ struct btf *btf = env->btf;
+ va_list args;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
+ btf_name_by_offset(btf, member->name),
+ member->type, member->offset);
+
+ if (fmt && *fmt) {
+ __btf_verifier_log(log, " ");
+ va_start(args, fmt);
+ bpf_verifier_vlog(log, fmt, args);
+ va_end(args);
+ }
+
+ __btf_verifier_log(log, "\n");
+}
+
+static void btf_verifier_log_hdr(struct btf_verifier_env *env)
+{
+ struct bpf_verifier_log *log = &env->log;
+ const struct btf *btf = env->btf;
+ const struct btf_header *hdr;
+
+ if (!bpf_verifier_log_needed(log))
+ return;
+
+ hdr = btf->hdr;
+ __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic);
+ __btf_verifier_log(log, "version: %u\n", hdr->version);
+ __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags);
+ __btf_verifier_log(log, "parent_label: %u\n", hdr->parent_label);
+ __btf_verifier_log(log, "parent_name: %u\n", hdr->parent_name);
+ __btf_verifier_log(log, "label_off: %u\n", hdr->label_off);
+ __btf_verifier_log(log, "object_off: %u\n", hdr->object_off);
+ __btf_verifier_log(log, "func_off: %u\n", hdr->func_off);
+ __btf_verifier_log(log, "type_off: %u\n", hdr->type_off);
+ __btf_verifier_log(log, "str_off: %u\n", hdr->str_off);
+ __btf_verifier_log(log, "str_len: %u\n", hdr->str_len);
+ __btf_verifier_log(log, "btf_total_size: %u\n", btf->data_size);
+}
+
+static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
+{
+ struct btf *btf = env->btf;
+
+ /* < 2 because +1 for btf_void which is always in btf->types[0].
+ * btf_void is not accounted in btf->nr_types because btf_void
+ * does not come from the BTF file.
+ */
+ if (btf->types_size - btf->nr_types < 2) {
+ /* Expand 'types' array */
+
+ struct btf_type **new_types;
+ u32 expand_by, new_size;
+
+ if (btf->types_size == BTF_MAX_NR_TYPES) {
+ btf_verifier_log(env, "Exceeded max num of types");
+ return -E2BIG;
+ }
+
+ expand_by = max_t(u32, btf->types_size >> 2, 16);
+ new_size = min_t(u32, BTF_MAX_NR_TYPES,
+ btf->types_size + expand_by);
+
+ new_types = kvzalloc(new_size * sizeof(*new_types),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!new_types)
+ return -ENOMEM;
+
+ if (btf->nr_types == 0)
+ new_types[0] = &btf_void;
+ else
+ memcpy(new_types, btf->types,
+ sizeof(*btf->types) * (btf->nr_types + 1));
+
+ kvfree(btf->types);
+ btf->types = new_types;
+ btf->types_size = new_size;
+ }
+
+ btf->types[++(btf->nr_types)] = t;
+
+ return 0;
+}
+
+static void btf_free(struct btf *btf)
+{
+ kvfree(btf->types);
+ kvfree(btf->data);
+ kfree(btf);
+}
+
+static void btf_verifier_env_free(struct btf_verifier_env *env)
+{
+ kfree(env);
+}
+
+static int btf_df_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ btf_verifier_log_basic(env, t, "Unsupported check_meta");
+ return -ENOTSUPP;
+}
+
+static void btf_df_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "Unsupported log_details");
+}
+
+static struct btf_kind_operations df_ops = {
+ .check_meta = btf_df_check_meta,
+ .log_details = btf_df_log,
+};
+
+static s32 btf_int_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ u32 int_data, nr_bits, meta_needed = sizeof(int_data);
+ u16 encoding;
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ int_data = btf_type_int(t);
+ nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data);
+
+ if (nr_bits > BITS_PER_U64) {
+ btf_verifier_log_type(env, t, "nr_bits exceeds %zu",
+ BITS_PER_U64);
+ return -EINVAL;
+ }
+
+ if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) {
+ btf_verifier_log_type(env, t, "nr_bits exceeds type_size");
+ return -EINVAL;
+ }
+
+ encoding = BTF_INT_ENCODING(int_data);
+ if (encoding &&
+ encoding != BTF_INT_SIGNED &&
+ encoding != BTF_INT_CHAR &&
+ encoding != BTF_INT_BOOL &&
+ encoding != BTF_INT_VARARGS) {
+ btf_verifier_log_type(env, t, "Unsupported encoding");
+ return -ENOTSUPP;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static void btf_int_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ int int_data = btf_type_int(t);
+
+ btf_verifier_log(env,
+ "size=%u bits_offset=%u nr_bits=%u encoding=%s",
+ t->size, BTF_INT_OFFSET(int_data),
+ BTF_INT_BITS(int_data),
+ btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
+}
+
+static const struct btf_kind_operations int_ops = {
+ .check_meta = btf_int_check_meta,
+ .log_details = btf_int_log,
+};
+
+static int btf_ref_type_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ if (BTF_TYPE_PARENT(t->type)) {
+ btf_verifier_log_type(env, t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return 0;
+}
+
+static void btf_ref_type_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "type_id=%u", t->type);
+}
+
+static struct btf_kind_operations modifier_ops = {
+ .check_meta = btf_ref_type_check_meta,
+ .log_details = btf_ref_type_log,
+};
+
+static struct btf_kind_operations ptr_ops = {
+ .check_meta = btf_ref_type_check_meta,
+ .log_details = btf_ref_type_log,
+};
+
+static struct btf_kind_operations fwd_ops = {
+ .check_meta = btf_ref_type_check_meta,
+ .log_details = btf_ref_type_log,
+};
+
+static s32 btf_array_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_array *array = btf_type_array(t);
+ u32 meta_needed = sizeof(*array);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (btf_type_vlen(t)) {
+ btf_verifier_log_type(env, t, "vlen != 0");
+ return -EINVAL;
+ }
+
+ /* We are a little forgiving on array->index_type since
+ * the kernel is not using it.
+ */
+ /* Array elem cannot be in type void,
+ * so !array->type is not allowed.
+ */
+ if (!array->type || BTF_TYPE_PARENT(array->type)) {
+ btf_verifier_log_type(env, t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ return meta_needed;
+}
+
+static void btf_array_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ const struct btf_array *array = btf_type_array(t);
+
+ btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u",
+ array->type, array->index_type, array->nelems);
+}
+
+static struct btf_kind_operations array_ops = {
+ .check_meta = btf_array_check_meta,
+ .log_details = btf_array_log,
+};
+
+static s32 btf_struct_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION;
+ const struct btf_member *member;
+ struct btf *btf = env->btf;
+ u32 struct_size = t->size;
+ u32 meta_needed;
+ u16 i;
+
+ meta_needed = btf_type_vlen(t) * sizeof(*member);
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for_each_member(i, t, member) {
+ if (!btf_name_offset_valid(btf, member->name)) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid member name_offset:%u",
+ member->name);
+ return -EINVAL;
+ }
+
+ /* A member cannot be in type void */
+ if (!member->type || BTF_TYPE_PARENT(member->type)) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid type_id");
+ return -EINVAL;
+ }
+
+ if (is_union && member->offset) {
+ btf_verifier_log_member(env, t, member,
+ "Invalid member bits_offset");
+ return -EINVAL;
+ }
+
+ if (BITS_ROUNDUP_BYTES(member->offset) > struct_size) {
+ btf_verifier_log_member(env, t, member,
+ "Memmber bits_offset exceeds its struct size");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_member(env, t, member, NULL);
+ }
+
+ return meta_needed;
+}
+
+static void btf_struct_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static struct btf_kind_operations struct_ops = {
+ .check_meta = btf_struct_check_meta,
+ .log_details = btf_struct_log,
+};
+
+static s32 btf_enum_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_enum *enums = btf_type_enum(t);
+ struct btf *btf = env->btf;
+ u16 i, nr_enums;
+ u32 meta_needed;
+
+ nr_enums = btf_type_vlen(t);
+ meta_needed = nr_enums * sizeof(*enums);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->size != sizeof(int)) {
+ btf_verifier_log_type(env, t, "Expected size:%zu",
+ sizeof(int));
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for (i = 0; i < nr_enums; i++) {
+ if (!btf_name_offset_valid(btf, enums[i].name)) {
+ btf_verifier_log(env, "\tInvalid name_offset:%u",
+ enums[i].name);
+ return -EINVAL;
+ }
+
+ btf_verifier_log(env, "\t%s val=%d\n",
+ btf_name_by_offset(btf, enums[i].name),
+ enums[i].val);
+ }
+
+ return meta_needed;
+}
+
+static void btf_enum_log(struct btf_verifier_env *env,
+ const struct btf_type *t)
+{
+ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
+}
+
+static struct btf_kind_operations enum_ops = {
+ .check_meta = btf_enum_check_meta,
+ .log_details = btf_enum_log,
+};
+
+static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
+ [BTF_KIND_INT] = &int_ops,
+ [BTF_KIND_FLOAT] = &df_ops,
+ [BTF_KIND_PTR] = &ptr_ops,
+ [BTF_KIND_ARRAY] = &array_ops,
+ [BTF_KIND_FUNC] = &df_ops,
+ [BTF_KIND_STRUCT] = &struct_ops,
+ [BTF_KIND_UNION] = &struct_ops,
+ [BTF_KIND_ENUM] = &enum_ops,
+ [BTF_KIND_FWD] = &fwd_ops,
+ [BTF_KIND_TYPEDEF] = &modifier_ops,
+ [BTF_KIND_VOLATILE] = &modifier_ops,
+ [BTF_KIND_CONST] = &modifier_ops,
+ [BTF_KIND_RESTRICT] = &modifier_ops,
+};
+
+static s32 btf_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ u32 saved_meta_left = meta_left;
+ s32 var_meta_size;
+
+ if (meta_left < sizeof(*t)) {
+ btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu",
+ env->log_type_id, meta_left, sizeof(*t));
+ return -EINVAL;
+ }
+ meta_left -= sizeof(*t);
+
+ if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX ||
+ BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) {
+ btf_verifier_log(env, "[%u] Invalid kind:%u",
+ env->log_type_id, BTF_INFO_KIND(t->info));
+ return -EINVAL;
+ }
+
+ if (!btf_name_offset_valid(env->btf, t->name)) {
+ btf_verifier_log(env, "[%u] Invalid name_offset:%u",
+ env->log_type_id, t->name);
+ return -EINVAL;
+ }
+
+ var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left);
+ if (var_meta_size < 0)
+ return var_meta_size;
+
+ meta_left -= var_meta_size;
+
+ return saved_meta_left - meta_left;
+}
+
+static int btf_check_all_metas(struct btf_verifier_env *env)
+{
+ struct btf *btf = env->btf;
+ struct btf_header *hdr;
+ void *cur, *end;
+
+ hdr = btf->hdr;
+ cur = btf->nohdr_data + hdr->type_off;
+ end = btf->nohdr_data + hdr->str_off;
+
+ env->log_type_id = 1;
+ while (cur < end) {
+ struct btf_type *t = cur;
+ s32 meta_size;
+
+ meta_size = btf_check_meta(env, t, end - cur);
+ if (meta_size < 0)
+ return meta_size;
+
+ btf_add_type(env, t);
+ cur += meta_size;
+ env->log_type_id++;
+ }
+
+ return 0;
+}
+
+static int btf_parse_type_sec(struct btf_verifier_env *env)
+{
+ return btf_check_all_metas(env);
+}
+
+static int btf_parse_str_sec(struct btf_verifier_env *env)
+{
+ const struct btf_header *hdr;
+ struct btf *btf = env->btf;
+ const char *start, *end;
+
+ hdr = btf->hdr;
+ start = btf->nohdr_data + hdr->str_off;
+ end = start + hdr->str_len;
+
+ if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET ||
+ start[0] || end[-1]) {
+ btf_verifier_log(env, "Invalid string section");
+ return -EINVAL;
+ }
+
+ btf->strings = start;
+
+ return 0;
+}
+
+static int btf_parse_hdr(struct btf_verifier_env *env)
+{
+ const struct btf_header *hdr;
+ struct btf *btf = env->btf;
+ u32 meta_left;
+
+ if (btf->data_size < sizeof(*hdr)) {
+ btf_verifier_log(env, "btf_header not found");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_hdr(env);
+
+ hdr = btf->hdr;
+ if (hdr->magic != BTF_MAGIC) {
+ btf_verifier_log(env, "Invalid magic");
+ return -EINVAL;
+ }
+
+ if (hdr->version != BTF_VERSION) {
+ btf_verifier_log(env, "Unsupported version");
+ return -ENOTSUPP;
+ }
+
+ if (hdr->flags) {
+ btf_verifier_log(env, "Unsupported flags");
+ return -ENOTSUPP;
+ }
+
+ meta_left = btf->data_size - sizeof(*hdr);
+ if (!meta_left) {
+ btf_verifier_log(env, "No data");
+ return -EINVAL;
+ }
+
+ if (meta_left < hdr->type_off || hdr->str_off <= hdr->type_off ||
+ /* Type section must align to 4 bytes */
+ hdr->type_off & (sizeof(u32) - 1)) {
+ btf_verifier_log(env, "Invalid type_off");
+ return -EINVAL;
+ }
+
+ if (meta_left < hdr->str_off ||
+ meta_left - hdr->str_off < hdr->str_len) {
+ btf_verifier_log(env, "Invalid str_off or str_len");
+ return -EINVAL;
+ }
+
+ btf->nohdr_data = btf->hdr + 1;
+
+ return 0;
+}
+
+static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
+ u32 log_level, char __user *log_ubuf, u32 log_size)
+{
+ struct btf_verifier_env *env = NULL;
+ struct bpf_verifier_log *log;
+ struct btf *btf = NULL;
+ u8 *data;
+ int err;
+
+ if (btf_data_size > BTF_MAX_SIZE)
+ return ERR_PTR(-E2BIG);
+
+ env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN);
+ if (!env)
+ return ERR_PTR(-ENOMEM);
+
+ log = &env->log;
+ if (log_level || log_ubuf || log_size) {
+ /* user requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ log->level = log_level;
+ log->ubuf = log_ubuf;
+ log->len_total = log_size;
+
+ /* log attributes have to be sane */
+ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+ !log->level || !log->ubuf) {
+ err = -EINVAL;
+ goto errout;
+ }
+ }
+
+ btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN);
+ if (!btf) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ data = kvmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN);
+ if (!data) {
+ err = -ENOMEM;
+ goto errout;
+ }
+
+ btf->data = data;
+ btf->data_size = btf_data_size;
+
+ if (copy_from_user(data, btf_data, btf_data_size)) {
+ err = -EFAULT;
+ goto errout;
+ }
+
+ env->btf = btf;
+
+ err = btf_parse_hdr(env);
+ if (err)
+ goto errout;
+
+ err = btf_parse_str_sec(env);
+ if (err)
+ goto errout;
+
+ err = btf_parse_type_sec(env);
+ if (err)
+ goto errout;
+
+ if (!err && log->level && bpf_verifier_log_full(log)) {
+ err = -ENOSPC;
+ goto errout;
+ }
+
+ if (!err) {
+ btf_verifier_env_free(env);
+ return btf;
+ }
+
+errout:
+ btf_verifier_env_free(env);
+ if (btf)
+ btf_free(btf);
+ return ERR_PTR(err);
+}
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v4 02/10] bpf: btf: Validate type reference
From: Martin KaFai Lau @ 2018-04-17 20:42 UTC (permalink / raw)
To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180417204243.4028831-1-kafai@fb.com>
After collecting all btf_type in the first pass in an earlier patch,
the second pass (in this patch) can validate the reference types
(e.g. the referring type does exist and it does not refer to itself).
While checking the reference type, it also gathers other information (e.g.
the size of an array). This info will be useful in checking the
struct's members in a later patch. They will also be useful in doing
pretty print later.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
include/linux/btf.h | 37 +++
kernel/bpf/btf.c | 668 +++++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 704 insertions(+), 1 deletion(-)
create mode 100644 include/linux/btf.h
diff --git a/include/linux/btf.h b/include/linux/btf.h
new file mode 100644
index 000000000000..f14b60368753
--- /dev/null
+++ b/include/linux/btf.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018 Facebook */
+
+#ifndef _LINUX_BTF_H
+#define _LINUX_BTF_H 1
+
+#include <linux/types.h>
+
+struct btf;
+struct btf_type;
+
+/* Figure out the size of a type_id. If type_id is a modifier
+ * (e.g. const), it will be resolved to find out the type with size.
+ *
+ * For example:
+ * In describing "const void *", type_id is "const" and "const"
+ * refers to "void *". The return type will be "void *".
+ *
+ * If type_id is a simple "int", then return type will be "int".
+ *
+ * @btf: struct btf object
+ * @type_id: Find out the size of type_id. The type_id of the return
+ * type is set to *type_id.
+ * @ret_size: It can be NULL. If not NULL, the size of the return
+ * type is set to *ret_size.
+ * Return: The btf_type (resolved to another type with size info if needed).
+ * NULL is returned if type_id itself does not have size info
+ * (e.g. void) or it cannot be resolved to another type that
+ * has size info.
+ * *type_id and *ret_size will not be changed in the
+ * NULL return case.
+ */
+const struct btf_type *btf_type_id_size(const struct btf *btf,
+ u32 *type_id,
+ u32 *ret_size);
+
+#endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 3a6fd0109f58..e3467846a579 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -105,6 +105,50 @@
*
* In the first pass, it still does some verifications (e.g.
* checking the name is a valid offset to the string section).
+ *
+ * Pass #2
+ * ~~~~~~~
+ * The main focus is to resolve a btf_type that is referring
+ * to another type.
+ *
+ * We have to ensure the referring type:
+ * 1) does exist in the BTF (i.e. in btf->types[])
+ * 2) does not cause a loop:
+ * struct A {
+ * struct B b;
+ * };
+ *
+ * struct B {
+ * struct A a;
+ * };
+ *
+ * btf_type_needs_resolve() decides if a btf_type needs
+ * to be resolved.
+ *
+ * The needs_resolve type implements the "resolve()" ops which
+ * essentially does a DFS and detects backedge.
+ *
+ * During resolve (or DFS), different C types have different
+ * "RESOLVED" conditions.
+ *
+ * When resolving a BTF_KIND_STRUCT, we need to resolve all its
+ * members because a member is always referring to another
+ * type. A struct's member can be treated as "RESOLVED" if
+ * it is referring to a BTF_KIND_PTR. Otherwise, the
+ * following valid C struct would be rejected:
+ *
+ * struct A {
+ * int m;
+ * struct A *a;
+ * };
+ *
+ * When resolving a BTF_KIND_PTR, it needs to keep resolving if
+ * it is referring to another BTF_KIND_PTR. Otherwise, we cannot
+ * detect a pointer loop, e.g.:
+ * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR +
+ * ^ |
+ * +-----------------------------------------+
+ *
*/
#define BITS_PER_U64 (sizeof(u64) * BITS_PER_BYTE)
@@ -127,12 +171,19 @@
i < btf_type_vlen(struct_type); \
i++, member++)
+#define for_each_member_from(i, from, struct_type, member) \
+ for (i = from, member = btf_type_member(struct_type) + from; \
+ i < btf_type_vlen(struct_type); \
+ i++, member++)
+
struct btf {
union {
struct btf_header *hdr;
void *data;
};
struct btf_type **types;
+ u32 *resolved_ids;
+ u32 *resolved_sizes;
const char *strings;
void *nohdr_data;
u32 nr_types;
@@ -140,10 +191,42 @@ struct btf {
u32 data_size;
};
+enum verifier_phase {
+ CHECK_META,
+ CHECK_TYPE,
+};
+
+struct resolve_vertex {
+ const struct btf_type *t;
+ u32 type_id;
+ u16 next_member;
+};
+
+enum visit_state {
+ NOT_VISITED,
+ VISITED,
+ RESOLVED,
+};
+
+enum resolve_mode {
+ RESOLVE_TBD, /* To Be Determined */
+ RESOLVE_PTR, /* Resolving for Pointer */
+ RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union
+ * or array
+ */
+};
+
+#define MAX_RESOLVE_DEPTH 32
+
struct btf_verifier_env {
struct btf *btf;
+ u8 *visit_states;
+ struct resolve_vertex stack[MAX_RESOLVE_DEPTH];
struct bpf_verifier_log log;
u32 log_type_id;
+ u32 top_stack;
+ enum verifier_phase phase;
+ enum resolve_mode resolve_mode;
};
static const char * const btf_kind_str[NR_BTF_KINDS] = {
@@ -167,6 +250,8 @@ struct btf_kind_operations {
s32 (*check_meta)(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left);
+ int (*resolve)(struct btf_verifier_env *env,
+ const struct resolve_vertex *v);
void (*log_details)(struct btf_verifier_env *env,
const struct btf_type *t);
};
@@ -174,6 +259,102 @@ struct btf_kind_operations {
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
static struct btf_type btf_void;
+static bool btf_type_is_modifier(const struct btf_type *t)
+{
+ /* Some of them is not strictly a C modifier
+ * but they are grouped into the same bucket
+ * for BTF concern:
+ * A type (t) that refers to another
+ * type through t->type AND its size cannot
+ * be determined without following the t->type.
+ *
+ * ptr does not fall into this bucket
+ * because its size is always sizeof(void *).
+ */
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_TYPEDEF:
+ case BTF_KIND_VOLATILE:
+ case BTF_KIND_CONST:
+ case BTF_KIND_RESTRICT:
+ return true;
+ }
+
+ return false;
+}
+
+static bool btf_type_is_void(const struct btf_type *t)
+{
+ /* void => no type and size info.
+ * Hence, FWD is also treated as void.
+ */
+ return t == &btf_void || BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
+}
+
+static bool btf_type_is_void_or_null(const struct btf_type *t)
+{
+ return !t || btf_type_is_void(t);
+}
+
+/* union is only a special case of struct:
+ * all its offsetof(member) == 0
+ */
+static bool btf_type_is_struct(const struct btf_type *t)
+{
+ u8 kind = BTF_INFO_KIND(t->info);
+
+ return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
+}
+
+static bool btf_type_is_array(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
+}
+
+static bool btf_type_is_ptr(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_PTR;
+}
+
+static bool btf_type_is_int(const struct btf_type *t)
+{
+ return BTF_INFO_KIND(t->info) == BTF_KIND_INT;
+}
+
+/* What types need to be resolved?
+ *
+ * btf_type_is_modifier() is an obvious one.
+ *
+ * btf_type_is_struct() because its member refers to
+ * another type (through member->type).
+
+ * btf_type_is_array() because its element (array->type)
+ * refers to another type. Array can be thought of a
+ * special case of struct while array just has the same
+ * member-type repeated by array->nelems of times.
+ */
+static bool btf_type_needs_resolve(const struct btf_type *t)
+{
+ return btf_type_is_modifier(t) ||
+ btf_type_is_ptr(t) ||
+ btf_type_is_struct(t) ||
+ btf_type_is_array(t);
+}
+
+/* t->size can be used */
+static bool btf_type_has_size(const struct btf_type *t)
+{
+ switch (BTF_INFO_KIND(t->info)) {
+ case BTF_KIND_INT:
+ case BTF_KIND_FLOAT:
+ case BTF_KIND_STRUCT:
+ case BTF_KIND_UNION:
+ case BTF_KIND_ENUM:
+ return true;
+ }
+
+ return false;
+}
+
static const char *btf_int_encoding_str(u8 encoding)
{
if (encoding == 0)
@@ -236,6 +417,14 @@ static const char *btf_name_by_offset(const struct btf *btf, u32 offset)
return "(invalid-name-offset)";
}
+static const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id)
+{
+ if (type_id > btf->nr_types)
+ return NULL;
+
+ return btf->types[type_id];
+}
+
__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log,
const char *fmt, ...)
{
@@ -310,6 +499,15 @@ static void btf_verifier_log_member(struct btf_verifier_env *env,
if (!bpf_verifier_log_needed(log))
return;
+ /* The CHECK_META phase already did a btf dump.
+ *
+ * If member is logged again, it must hit an error in
+ * parsing this member. It is useful to print out which
+ * struct this member belongs to.
+ */
+ if (env->phase != CHECK_META)
+ btf_verifier_log_type(env, struct_type, NULL);
+
__btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u",
btf_name_by_offset(btf, member->name),
member->type, member->offset);
@@ -395,15 +593,176 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t)
static void btf_free(struct btf *btf)
{
kvfree(btf->types);
+ kvfree(btf->resolved_sizes);
+ kvfree(btf->resolved_ids);
kvfree(btf->data);
kfree(btf);
}
+static int env_resolve_init(struct btf_verifier_env *env)
+{
+ struct btf *btf = env->btf;
+ u32 nr_types = btf->nr_types;
+ u32 *resolved_sizes = NULL;
+ u32 *resolved_ids = NULL;
+ u8 *visit_states = NULL;
+
+ /* +1 for btf_void */
+ resolved_sizes = kvzalloc((nr_types + 1) * sizeof(*resolved_sizes),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!resolved_sizes)
+ goto nomem;
+
+ resolved_ids = kvzalloc((nr_types + 1) * sizeof(*resolved_ids),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!resolved_ids)
+ goto nomem;
+
+ visit_states = kvzalloc((nr_types + 1) * sizeof(*visit_states),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!visit_states)
+ goto nomem;
+
+ btf->resolved_sizes = resolved_sizes;
+ btf->resolved_ids = resolved_ids;
+ env->visit_states = visit_states;
+
+ return 0;
+
+nomem:
+ kvfree(resolved_sizes);
+ kvfree(resolved_ids);
+ kvfree(visit_states);
+ return -ENOMEM;
+}
+
static void btf_verifier_env_free(struct btf_verifier_env *env)
{
+ kvfree(env->visit_states);
kfree(env);
}
+static bool env_type_is_resolve_sink(const struct btf_verifier_env *env,
+ const struct btf_type *next_type)
+{
+ switch (env->resolve_mode) {
+ case RESOLVE_TBD:
+ /* int, enum or void is a sink */
+ return !btf_type_needs_resolve(next_type);
+ case RESOLVE_PTR:
+ /* int, enum, void, struct or array is a sink for ptr */
+ return !btf_type_is_modifier(next_type) &&
+ !btf_type_is_ptr(next_type);
+ case RESOLVE_STRUCT_OR_ARRAY:
+ /* int, enum, void or ptr is a sink for struct and array */
+ return !btf_type_is_modifier(next_type) &&
+ !btf_type_is_array(next_type) &&
+ !btf_type_is_struct(next_type);
+ default:
+ BUG_ON(1);
+ }
+}
+
+static bool env_type_is_resolved(const struct btf_verifier_env *env,
+ u32 type_id)
+{
+ return env->visit_states[type_id] == RESOLVED;
+}
+
+static int env_stack_push(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id)
+{
+ struct resolve_vertex *v;
+
+ if (env->top_stack == MAX_RESOLVE_DEPTH)
+ return -E2BIG;
+
+ if (env->visit_states[type_id] != NOT_VISITED)
+ return -EEXIST;
+
+ env->visit_states[type_id] = VISITED;
+
+ v = &env->stack[env->top_stack++];
+ v->t = t;
+ v->type_id = type_id;
+ v->next_member = 0;
+
+ if (env->resolve_mode == RESOLVE_TBD) {
+ if (btf_type_is_ptr(t))
+ env->resolve_mode = RESOLVE_PTR;
+ else if (btf_type_is_struct(t) || btf_type_is_array(t))
+ env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY;
+ }
+
+ return 0;
+}
+
+static void env_stack_set_next_member(struct btf_verifier_env *env,
+ u16 next_member)
+{
+ env->stack[env->top_stack - 1].next_member = next_member;
+}
+
+static void env_stack_pop_resolved(struct btf_verifier_env *env,
+ u32 resolved_type_id,
+ u32 resolved_size)
+{
+ u32 type_id = env->stack[--(env->top_stack)].type_id;
+ struct btf *btf = env->btf;
+
+ btf->resolved_sizes[type_id] = resolved_size;
+ btf->resolved_ids[type_id] = resolved_type_id;
+ env->visit_states[type_id] = RESOLVED;
+}
+
+static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env)
+{
+ return env->top_stack ? &env->stack[env->top_stack - 1] : NULL;
+}
+
+/* The input param "type_id" must point to a needs_resolve type */
+static const struct btf_type *btf_type_id_resolve(const struct btf *btf,
+ u32 *type_id)
+{
+ *type_id = btf->resolved_ids[*type_id];
+ return btf_type_by_id(btf, *type_id);
+}
+
+const struct btf_type *btf_type_id_size(const struct btf *btf,
+ u32 *type_id, u32 *ret_size)
+{
+ const struct btf_type *size_type;
+ u32 size_type_id = *type_id;
+ u32 size = 0;
+
+ size_type = btf_type_by_id(btf, size_type_id);
+ if (btf_type_is_void_or_null(size_type))
+ return NULL;
+
+ if (btf_type_has_size(size_type)) {
+ size = size_type->size;
+ } else if (btf_type_is_array(size_type)) {
+ size = btf->resolved_sizes[size_type_id];
+ } else if (btf_type_is_ptr(size_type)) {
+ size = sizeof(void *);
+ } else {
+ if (WARN_ON_ONCE(!btf_type_is_modifier(size_type)))
+ return NULL;
+
+ size = btf->resolved_sizes[size_type_id];
+ size_type_id = btf->resolved_ids[size_type_id];
+ size_type = btf_type_by_id(btf, size_type_id);
+ if (btf_type_is_void(size_type))
+ return NULL;
+ }
+
+ *type_id = size_type_id;
+ if (ret_size)
+ *ret_size = size;
+
+ return size_type;
+}
+
static int btf_df_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -412,6 +771,13 @@ static int btf_df_check_meta(struct btf_verifier_env *env,
return -ENOTSUPP;
}
+static int btf_df_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ btf_verifier_log_basic(env, v->t, "Unsupported resolve");
+ return -EINVAL;
+}
+
static void btf_df_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
@@ -420,6 +786,7 @@ static void btf_df_log(struct btf_verifier_env *env,
static struct btf_kind_operations df_ops = {
.check_meta = btf_df_check_meta,
+ .resolve = btf_df_resolve,
.log_details = btf_df_log,
};
@@ -485,6 +852,7 @@ static void btf_int_log(struct btf_verifier_env *env,
static const struct btf_kind_operations int_ops = {
.check_meta = btf_int_check_meta,
+ .resolve = btf_df_resolve,
.log_details = btf_int_log,
};
@@ -507,6 +875,104 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return 0;
}
+static int btf_modifier_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *t = v->t;
+ const struct btf_type *next_type;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ u32 next_type_size = 0;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ /* "typedef void new_void", "const void"...etc */
+ if (btf_type_is_void(next_type))
+ goto resolved;
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ /* Figure out the resolved next_type_id with size.
+ * They will be stored in the current modifier's
+ * resolved_ids and resolved_sizes such that it can
+ * save us a few type-following when we use it later (e.g. in
+ * pretty print).
+ */
+ if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+ !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+resolved:
+ env_stack_pop_resolved(env, next_type_id, next_type_size);
+
+ return 0;
+}
+
+static int btf_ptr_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_type *next_type;
+ const struct btf_type *t = v->t;
+ u32 next_type_id = t->type;
+ struct btf *btf = env->btf;
+ u32 next_type_size = 0;
+
+ next_type = btf_type_by_id(btf, next_type_id);
+ if (!next_type) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+ /* "void *" */
+ if (btf_type_is_void(next_type))
+ goto resolved;
+
+ if (!env_type_is_resolve_sink(env, next_type) &&
+ !env_type_is_resolved(env, next_type_id))
+ return env_stack_push(env, next_type, next_type_id);
+
+ /* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY,
+ * the modifier may have stopped resolving when it was resolved
+ * to a ptr (last-resolved-ptr).
+ *
+ * We now need to continue from the last-resolved-ptr to
+ * ensure the last-resolved-ptr will not referring back to
+ * the currenct ptr (t).
+ */
+ if (btf_type_is_modifier(next_type)) {
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id;
+
+ resolved_type_id = next_type_id;
+ resolved_type = btf_type_id_resolve(btf, &resolved_type_id);
+
+ if (btf_type_is_ptr(resolved_type) &&
+ !env_type_is_resolve_sink(env, resolved_type) &&
+ !env_type_is_resolved(env, resolved_type_id))
+ return env_stack_push(env, resolved_type,
+ resolved_type_id);
+ }
+
+ if (!btf_type_id_size(btf, &next_type_id, &next_type_size) &&
+ !btf_type_is_void(btf_type_id_resolve(btf, &next_type_id))) {
+ btf_verifier_log_type(env, v->t, "Invalid type_id");
+ return -EINVAL;
+ }
+
+resolved:
+ env_stack_pop_resolved(env, next_type_id, 0);
+
+ return 0;
+}
+
static void btf_ref_type_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
@@ -515,16 +981,19 @@ static void btf_ref_type_log(struct btf_verifier_env *env,
static struct btf_kind_operations modifier_ops = {
.check_meta = btf_ref_type_check_meta,
+ .resolve = btf_modifier_resolve,
.log_details = btf_ref_type_log,
};
static struct btf_kind_operations ptr_ops = {
.check_meta = btf_ref_type_check_meta,
+ .resolve = btf_ptr_resolve,
.log_details = btf_ref_type_log,
};
static struct btf_kind_operations fwd_ops = {
.check_meta = btf_ref_type_check_meta,
+ .resolve = btf_df_resolve,
.log_details = btf_ref_type_log,
};
@@ -563,6 +1032,61 @@ static s32 btf_array_check_meta(struct btf_verifier_env *env,
return meta_needed;
}
+static int btf_array_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_array *array = btf_type_array(v->t);
+ const struct btf_type *elem_type;
+ u32 elem_type_id = array->type;
+ struct btf *btf = env->btf;
+ u32 elem_size;
+
+ elem_type = btf_type_by_id(btf, elem_type_id);
+ if (btf_type_is_void_or_null(elem_type)) {
+ btf_verifier_log_type(env, v->t,
+ "Invalid elem");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, elem_type) &&
+ !env_type_is_resolved(env, elem_type_id))
+ return env_stack_push(env, elem_type, elem_type_id);
+
+ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ if (!elem_type) {
+ btf_verifier_log_type(env, v->t, "Invalid elem");
+ return -EINVAL;
+ }
+
+ if (btf_type_is_int(elem_type)) {
+ int int_type_data = btf_type_int(elem_type);
+ u16 nr_bits = BTF_INT_BITS(int_type_data);
+ u16 nr_bytes = BITS_ROUNDUP_BYTES(nr_bits);
+
+ /* Put more restriction on array of int. The int cannot
+ * be a bit field and it must be either u8/u16/u32/u64.
+ */
+ if (BITS_PER_BYTE_MASKED(nr_bits) ||
+ BTF_INT_OFFSET(int_type_data) ||
+ (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) &&
+ nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64))) {
+ btf_verifier_log_type(env, v->t,
+ "Invalid array of int");
+ return -EINVAL;
+ }
+ }
+
+ if (array->nelems && elem_size > U32_MAX / array->nelems) {
+ btf_verifier_log_type(env, v->t,
+ "Array size overflows U32_MAX");
+ return -EINVAL;
+ }
+
+ env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems);
+
+ return 0;
+}
+
static void btf_array_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
@@ -574,6 +1098,7 @@ static void btf_array_log(struct btf_verifier_env *env,
static struct btf_kind_operations array_ops = {
.check_meta = btf_array_check_meta,
+ .resolve = btf_array_resolve,
.log_details = btf_array_log,
};
@@ -631,6 +1156,50 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return meta_needed;
}
+static int btf_struct_resolve(struct btf_verifier_env *env,
+ const struct resolve_vertex *v)
+{
+ const struct btf_member *member;
+ u16 i;
+
+ /* Before continue resolving the next_member,
+ * ensure the last member is indeed resolved to a
+ * type with size info.
+ */
+ if (v->next_member) {
+ const struct btf_member *last_member;
+ u16 last_member_type_id;
+
+ last_member = btf_type_member(v->t) + v->next_member - 1;
+ last_member_type_id = last_member->type;
+ if (WARN_ON_ONCE(!env_type_is_resolved(env,
+ last_member_type_id)))
+ return -EINVAL;
+ }
+
+ for_each_member_from(i, v->next_member, v->t, member) {
+ u32 member_type_id = member->type;
+ const struct btf_type *member_type = btf_type_by_id(env->btf,
+ member_type_id);
+
+ if (btf_type_is_void_or_null(member_type)) {
+ btf_verifier_log_member(env, v->t, member,
+ "Invalid member");
+ return -EINVAL;
+ }
+
+ if (!env_type_is_resolve_sink(env, member_type) &&
+ !env_type_is_resolved(env, member_type_id)) {
+ env_stack_set_next_member(env, i + 1);
+ return env_stack_push(env, member_type, member_type_id);
+ }
+ }
+
+ env_stack_pop_resolved(env, 0, 0);
+
+ return 0;
+}
+
static void btf_struct_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
@@ -639,6 +1208,7 @@ static void btf_struct_log(struct btf_verifier_env *env,
static struct btf_kind_operations struct_ops = {
.check_meta = btf_struct_check_meta,
+ .resolve = btf_struct_resolve,
.log_details = btf_struct_log,
};
@@ -692,6 +1262,7 @@ static void btf_enum_log(struct btf_verifier_env *env,
static struct btf_kind_operations enum_ops = {
.check_meta = btf_enum_check_meta,
+ .resolve = btf_df_resolve,
.log_details = btf_enum_log,
};
@@ -774,9 +1345,104 @@ static int btf_check_all_metas(struct btf_verifier_env *env)
return 0;
}
+static int btf_resolve(struct btf_verifier_env *env,
+ const struct btf_type *t, u32 type_id)
+{
+ const struct resolve_vertex *v;
+ int err = 0;
+
+ env->resolve_mode = RESOLVE_TBD;
+ env_stack_push(env, t, type_id);
+ while (!err && (v = env_stack_peak(env))) {
+ env->log_type_id = v->type_id;
+ err = btf_type_ops(v->t)->resolve(env, v);
+ }
+
+ env->log_type_id = type_id;
+ if (err == -E2BIG)
+ btf_verifier_log_type(env, t,
+ "Exceeded max resolving depth:%u",
+ MAX_RESOLVE_DEPTH);
+ else if (err == -EEXIST)
+ btf_verifier_log_type(env, t, "Loop detected");
+
+ return err;
+}
+
+static bool btf_resolve_valid(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 type_id)
+{
+ struct btf *btf = env->btf;
+
+ if (!env_type_is_resolved(env, type_id))
+ return false;
+
+ if (btf_type_is_struct(t))
+ return !btf->resolved_ids[type_id] &&
+ !btf->resolved_sizes[type_id];
+
+ if (btf_type_is_modifier(t) || btf_type_is_ptr(t)) {
+ t = btf_type_id_resolve(btf, &type_id);
+ return t && !btf_type_is_modifier(t);
+ }
+
+ if (btf_type_is_array(t)) {
+ const struct btf_array *array = btf_type_array(t);
+ const struct btf_type *elem_type;
+ u32 elem_type_id = array->type;
+ u32 elem_size;
+
+ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ return elem_type && !btf_type_is_modifier(elem_type) &&
+ (array->nelems * elem_size ==
+ btf->resolved_sizes[type_id]);
+ }
+
+ return false;
+}
+
+static int btf_check_all_types(struct btf_verifier_env *env)
+{
+ struct btf *btf = env->btf;
+ u32 type_id;
+ int err;
+
+ err = env_resolve_init(env);
+ if (err)
+ return err;
+
+ env->phase++;
+ for (type_id = 1; type_id <= btf->nr_types; type_id++) {
+ const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+ env->log_type_id = type_id;
+ if (btf_type_needs_resolve(t) &&
+ !env_type_is_resolved(env, type_id)) {
+ err = btf_resolve(env, t, type_id);
+ if (err)
+ return err;
+ }
+
+ if (btf_type_needs_resolve(t) &&
+ !btf_resolve_valid(env, t, type_id)) {
+ btf_verifier_log_type(env, t, "Invalid resolve state");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int btf_parse_type_sec(struct btf_verifier_env *env)
{
- return btf_check_all_metas(env);
+ int err;
+
+ err = btf_check_all_metas(env);
+ if (err)
+ return err;
+
+ return btf_check_all_types(env);
}
static int btf_parse_str_sec(struct btf_verifier_env *env)
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v4 05/10] bpf: btf: Add BPF_BTF_LOAD command
From: Martin KaFai Lau @ 2018-04-17 20:42 UTC (permalink / raw)
To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180417204243.4028831-1-kafai@fb.com>
This patch adds a BPF_BTF_LOAD command which
1) loads and verifies the BTF (implemented in earlier patches)
2) returns a BTF fd to userspace. In the next patch, the
BTF fd can be specified during BPF_MAP_CREATE.
It currently limits to CAP_SYS_ADMIN.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
include/linux/btf.h | 4 +++
include/uapi/linux/bpf.h | 9 +++++++
kernel/bpf/btf.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 17 ++++++++++++
4 files changed, 97 insertions(+)
diff --git a/include/linux/btf.h b/include/linux/btf.h
index d8bdab0280ba..a7c7072535ea 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -8,7 +8,11 @@
struct btf;
struct btf_type;
+union bpf_attr;
+void btf_put(struct btf *btf);
+int btf_new_fd(const union bpf_attr *attr);
+struct btf *btf_get_by_fd(int fd);
/* Figure out the size of a type_id. If type_id is a modifier
* (e.g. const), it will be resolved to find out the type with size.
*
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c5ec89732a8d..c7d75f18521b 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -95,6 +95,7 @@ enum bpf_cmd {
BPF_OBJ_GET_INFO_BY_FD,
BPF_PROG_QUERY,
BPF_RAW_TRACEPOINT_OPEN,
+ BPF_BTF_LOAD,
};
enum bpf_map_type {
@@ -363,6 +364,14 @@ union bpf_attr {
__u64 name;
__u32 prog_fd;
} raw_tracepoint;
+
+ struct { /* anonymous struct for BPF_BTF_LOAD */
+ __aligned_u64 btf;
+ __aligned_u64 btf_log_buf;
+ __u32 btf_size;
+ __u32 btf_log_size;
+ __u32 btf_log_level;
+ };
} __attribute__((aligned(8)));
/* BPF helper function descriptions:
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4703b26e34a9..ba088f02ed17 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -7,6 +7,8 @@
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/bpf_verifier.h>
@@ -190,6 +192,7 @@ struct btf {
u32 nr_types;
u32 types_size;
u32 data_size;
+ refcount_t refcnt;
};
enum verifier_phase {
@@ -607,6 +610,17 @@ static void btf_free(struct btf *btf)
kfree(btf);
}
+static void btf_get(struct btf *btf)
+{
+ refcount_inc(&btf->refcnt);
+}
+
+void btf_put(struct btf *btf)
+{
+ if (btf && refcount_dec_and_test(&btf->refcnt))
+ btf_free(btf);
+}
+
static int env_resolve_init(struct btf_verifier_env *env)
{
struct btf *btf = env->btf;
@@ -1990,6 +2004,7 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
if (!err) {
btf_verifier_env_free(env);
+ btf_get(btf);
return btf;
}
@@ -2007,3 +2022,55 @@ void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
}
+
+static int btf_release(struct inode *inode, struct file *filp)
+{
+ btf_put(filp->private_data);
+ return 0;
+}
+
+static const struct file_operations btf_fops = {
+ .release = btf_release,
+};
+
+int btf_new_fd(const union bpf_attr *attr)
+{
+ struct btf *btf;
+ int fd;
+
+ btf = btf_parse(u64_to_user_ptr(attr->btf),
+ attr->btf_size, attr->btf_log_level,
+ u64_to_user_ptr(attr->btf_log_buf),
+ attr->btf_log_size);
+ if (IS_ERR(btf))
+ return PTR_ERR(btf);
+
+ fd = anon_inode_getfd("btf", &btf_fops, btf,
+ O_RDONLY | O_CLOEXEC);
+ if (fd < 0)
+ btf_put(btf);
+
+ return fd;
+}
+
+struct btf *btf_get_by_fd(int fd)
+{
+ struct btf *btf;
+ struct fd f;
+
+ f = fdget(fd);
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &btf_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ btf = f.file->private_data;
+ btf_get(btf);
+ fdput(f);
+
+ return btf;
+}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ca46df19c9a..cd8ebadc66eb 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -11,6 +11,7 @@
*/
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
+#include <linux/btf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
@@ -2023,6 +2024,19 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
return err;
}
+#define BPF_BTF_LOAD_LAST_FIELD btf_log_level
+
+static int bpf_btf_load(const union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_BTF_LOAD))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return btf_new_fd(attr);
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -2103,6 +2117,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_RAW_TRACEPOINT_OPEN:
err = bpf_raw_tracepoint_open(&attr);
break;
+ case BPF_BTF_LOAD:
+ err = bpf_btf_load(&attr);
+ break;
default:
err = -EINVAL;
break;
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v4 04/10] bpf: btf: Add pretty print capability for data with BTF type info
From: Martin KaFai Lau @ 2018-04-17 20:42 UTC (permalink / raw)
To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180417204243.4028831-1-kafai@fb.com>
This patch adds pretty print capability for data with BTF type info.
The current usage is to allow pretty print for a BPF map.
The next few patches will allow a read() on a pinned map with BTF
type info for its key and value.
This patch uses the seq_printf() infra.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
include/linux/btf.h | 2 +
kernel/bpf/btf.c | 199 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 201 insertions(+)
diff --git a/include/linux/btf.h b/include/linux/btf.h
index f14b60368753..d8bdab0280ba 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -33,5 +33,7 @@ struct btf_type;
const struct btf_type *btf_type_id_size(const struct btf *btf,
u32 *type_id,
u32 *ret_size);
+void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
+ struct seq_file *m);
#endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index bb220a9899e2..4703b26e34a9 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -3,6 +3,7 @@
#include <uapi/linux/btf.h>
#include <uapi/linux/types.h>
+#include <linux/seq_file.h>
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/slab.h>
@@ -258,6 +259,9 @@ struct btf_kind_operations {
const struct btf_type *member_type);
void (*log_details)(struct btf_verifier_env *env,
const struct btf_type *t);
+ void (*seq_show)(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offsets,
+ struct seq_file *m);
};
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS];
@@ -798,11 +802,19 @@ static void btf_df_log(struct btf_verifier_env *env,
btf_verifier_log(env, "Unsupported log_details");
}
+static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offsets,
+ struct seq_file *m)
+{
+ seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info));
+}
+
static struct btf_kind_operations df_ops = {
.check_meta = btf_df_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_df_check_member,
.log_details = btf_df_log,
+ .seq_show = btf_df_seq_show,
};
static int btf_int_check_member(struct btf_verifier_env *env,
@@ -903,11 +915,96 @@ static void btf_int_log(struct btf_verifier_env *env,
btf_int_encoding_str(BTF_INT_ENCODING(int_data)));
}
+static void btf_int_bits_seq_show(const struct btf *btf,
+ const struct btf_type *t,
+ void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ u32 int_data = btf_type_int(t);
+ u16 nr_bits = BTF_INT_BITS(int_data);
+ u16 total_bits_offset;
+ u16 nr_copy_bytes;
+ u16 nr_copy_bits;
+ u8 nr_upper_bits;
+ union {
+ u64 u64_num;
+ u8 u8_nums[8];
+ } print_num;
+
+ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data);
+ data += BITS_ROUNDDOWN_BYTES(total_bits_offset);
+ bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset);
+ nr_copy_bits = nr_bits + bits_offset;
+ nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits);
+
+ print_num.u64_num = 0;
+ memcpy(&print_num.u64_num, data, nr_copy_bytes);
+
+ /* Ditch the higher order bits */
+ nr_upper_bits = BITS_PER_BYTE_MASKED(nr_copy_bits);
+ if (nr_upper_bits) {
+ /* We need to mask out some bits of the upper byte. */
+ u8 mask = (1 << nr_upper_bits) - 1;
+
+ print_num.u8_nums[nr_copy_bytes - 1] &= mask;
+ }
+
+ print_num.u64_num >>= bits_offset;
+
+ seq_printf(m, "0x%llx", print_num.u64_num);
+}
+
+static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ u32 int_data = btf_type_int(t);
+ u8 encoding = BTF_INT_ENCODING(int_data);
+ bool sign = encoding & BTF_INT_SIGNED;
+ u32 nr_bits = BTF_INT_BITS(int_data);
+
+ if (bits_offset || BTF_INT_OFFSET(int_data) ||
+ BITS_PER_BYTE_MASKED(nr_bits)) {
+ btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+ return;
+ }
+
+ switch (nr_bits) {
+ case 64:
+ if (sign)
+ seq_printf(m, "%lld", *(s64 *)data);
+ else
+ seq_printf(m, "%llu", *(u64 *)data);
+ break;
+ case 32:
+ if (sign)
+ seq_printf(m, "%d", *(s32 *)data);
+ else
+ seq_printf(m, "%u", *(u32 *)data);
+ break;
+ case 16:
+ if (sign)
+ seq_printf(m, "%d", *(s16 *)data);
+ else
+ seq_printf(m, "%u", *(u16 *)data);
+ break;
+ case 8:
+ if (sign)
+ seq_printf(m, "%d", *(s8 *)data);
+ else
+ seq_printf(m, "%u", *(u8 *)data);
+ break;
+ default:
+ btf_int_bits_seq_show(btf, t, data, bits_offset, m);
+ }
+}
+
static const struct btf_kind_operations int_ops = {
.check_meta = btf_int_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_int_check_member,
.log_details = btf_int_log,
+ .seq_show = btf_int_seq_show,
};
static int btf_modifier_check_member(struct btf_verifier_env *env,
@@ -1078,6 +1175,24 @@ static int btf_ptr_resolve(struct btf_verifier_env *env,
return 0;
}
+static void btf_modifier_seq_show(const struct btf *btf,
+ const struct btf_type *t,
+ u32 type_id, void *data,
+ u8 bits_offset, struct seq_file *m)
+{
+ t = btf_type_id_resolve(btf, &type_id);
+
+ btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m);
+}
+
+static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ /* It is a hashed value */
+ seq_printf(m, "%p", *(void **)data);
+}
+
static void btf_ref_type_log(struct btf_verifier_env *env,
const struct btf_type *t)
{
@@ -1089,6 +1204,7 @@ static struct btf_kind_operations modifier_ops = {
.resolve = btf_modifier_resolve,
.check_member = btf_modifier_check_member,
.log_details = btf_ref_type_log,
+ .seq_show = btf_modifier_seq_show,
};
static struct btf_kind_operations ptr_ops = {
@@ -1096,6 +1212,7 @@ static struct btf_kind_operations ptr_ops = {
.resolve = btf_ptr_resolve,
.check_member = btf_ptr_check_member,
.log_details = btf_ref_type_log,
+ .seq_show = btf_ptr_seq_show,
};
static struct btf_kind_operations fwd_ops = {
@@ -1103,6 +1220,7 @@ static struct btf_kind_operations fwd_ops = {
.resolve = btf_df_resolve,
.check_member = btf_df_check_member,
.log_details = btf_ref_type_log,
+ .seq_show = btf_df_seq_show,
};
static int btf_array_check_member(struct btf_verifier_env *env,
@@ -1233,11 +1351,36 @@ static void btf_array_log(struct btf_verifier_env *env,
array->type, array->index_type, array->nelems);
}
+static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ const struct btf_array *array = btf_type_array(t);
+ const struct btf_kind_operations *elem_ops;
+ const struct btf_type *elem_type;
+ u32 i, elem_size, elem_type_id;
+
+ elem_type_id = array->type;
+ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size);
+ elem_ops = btf_type_ops(elem_type);
+ seq_puts(m, "[");
+ for (i = 0; i < array->nelems; i++) {
+ if (i)
+ seq_puts(m, ",");
+
+ elem_ops->seq_show(btf, elem_type, elem_type_id, data,
+ bits_offset, m);
+ data += elem_size;
+ }
+ seq_puts(m, "]");
+}
+
static struct btf_kind_operations array_ops = {
.check_meta = btf_array_check_meta,
.resolve = btf_array_resolve,
.check_member = btf_array_check_member,
.log_details = btf_array_log,
+ .seq_show = btf_array_seq_show,
};
static int btf_struct_check_member(struct btf_verifier_env *env,
@@ -1385,11 +1528,39 @@ static void btf_struct_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
+static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? "|" : ",";
+ const struct btf_member *member;
+ u32 i;
+
+ seq_puts(m, "{");
+ for_each_member(i, t, member) {
+ const struct btf_type *member_type = btf_type_by_id(btf,
+ member->type);
+ u32 member_offset = member->offset;
+ u32 bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
+ u8 bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
+ const struct btf_kind_operations *ops;
+
+ if (i)
+ seq_puts(m, seq);
+
+ ops = btf_type_ops(member_type);
+ ops->seq_show(btf, member_type, member->type,
+ data + bytes_offset, bits8_offset, m);
+ }
+ seq_puts(m, "}");
+}
+
static struct btf_kind_operations struct_ops = {
.check_meta = btf_struct_check_meta,
.resolve = btf_struct_resolve,
.check_member = btf_struct_check_member,
.log_details = btf_struct_log,
+ .seq_show = btf_struct_seq_show,
};
static int btf_enum_check_member(struct btf_verifier_env *env,
@@ -1465,11 +1636,31 @@ static void btf_enum_log(struct btf_verifier_env *env,
btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
}
+static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct seq_file *m)
+{
+ const struct btf_enum *enums = btf_type_enum(t);
+ u32 i, nr_enums = btf_type_vlen(t);
+ int v = *(int *)data;
+
+ for (i = 0; i < nr_enums; i++) {
+ if (v == enums[i].val) {
+ seq_printf(m, "%s",
+ btf_name_by_offset(btf, enums[i].name));
+ return;
+ }
+ }
+
+ seq_printf(m, "%d", v);
+}
+
static struct btf_kind_operations enum_ops = {
.check_meta = btf_enum_check_meta,
.resolve = btf_df_resolve,
.check_member = btf_enum_check_member,
.log_details = btf_enum_log,
+ .seq_show = btf_enum_seq_show,
};
static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
@@ -1808,3 +1999,11 @@ static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size,
btf_free(btf);
return ERR_PTR(err);
}
+
+void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj,
+ struct seq_file *m)
+{
+ const struct btf_type *t = btf_type_by_id(btf, type_id);
+
+ btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m);
+}
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v4 03/10] bpf: btf: Check members of struct/union
From: Martin KaFai Lau @ 2018-04-17 20:42 UTC (permalink / raw)
To: netdev; +Cc: Alexei Starovoitov, Daniel Borkmann, kernel-team
In-Reply-To: <20180417204243.4028831-1-kafai@fb.com>
This patch checks a few things of struct's members:
1) It has a valid size (e.g. a "const void" is invalid)
2) A member's size (+ its member's offset) does not exceed
the containing struct's size.
3) The member's offset satisfies the alignment requirement
The above can only be done after the needs_resolve member's type
is resolved. Hence, the above is done together in
btf_struct_resolve().
Each possible member's type (e.g. int, enum, modifier...) implements
the check_member() ops which will be called from btf_struct_resolve().
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@fb.com>
---
kernel/bpf/btf.c | 206 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 206 insertions(+)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e3467846a579..bb220a9899e2 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -252,6 +252,10 @@ struct btf_kind_operations {
u32 meta_left);
int (*resolve)(struct btf_verifier_env *env,
const struct resolve_vertex *v);
+ int (*check_member)(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type);
void (*log_details)(struct btf_verifier_env *env,
const struct btf_type *t);
};
@@ -771,6 +775,16 @@ static int btf_df_check_meta(struct btf_verifier_env *env,
return -ENOTSUPP;
}
+static int btf_df_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ btf_verifier_log_basic(env, struct_type,
+ "Unsupported check_member");
+ return -EINVAL;
+}
+
static int btf_df_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
@@ -787,9 +801,48 @@ static void btf_df_log(struct btf_verifier_env *env,
static struct btf_kind_operations df_ops = {
.check_meta = btf_df_check_meta,
.resolve = btf_df_resolve,
+ .check_member = btf_df_check_member,
.log_details = btf_df_log,
};
+static int btf_int_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 int_data = btf_type_int(member_type);
+ u32 struct_bits_off = member->offset;
+ u32 struct_size = struct_type->size;
+ u32 nr_copy_bits;
+ u32 bytes_offset;
+
+ if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "bits_offset exceeds U32_MAX");
+ return -EINVAL;
+ }
+
+ struct_bits_off += BTF_INT_OFFSET(int_data);
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ nr_copy_bits = BTF_INT_BITS(int_data) +
+ BITS_PER_BYTE_MASKED(struct_bits_off);
+
+ if (nr_copy_bits > BITS_PER_U64) {
+ btf_verifier_log_member(env, struct_type, member,
+ "nr_copy_bits exceeds 64");
+ return -EINVAL;
+ }
+
+ if (struct_size < bytes_offset ||
+ struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static s32 btf_int_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -853,9 +906,61 @@ static void btf_int_log(struct btf_verifier_env *env,
static const struct btf_kind_operations int_ops = {
.check_meta = btf_int_check_meta,
.resolve = btf_df_resolve,
+ .check_member = btf_int_check_member,
.log_details = btf_int_log,
};
+static int btf_modifier_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ const struct btf_type *resolved_type;
+ u32 resolved_type_id = member->type;
+ struct btf_member resolved_member;
+ struct btf *btf = env->btf;
+
+ resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL);
+ if (!resolved_type) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Invalid member");
+ return -EINVAL;
+ }
+
+ resolved_member = *member;
+ resolved_member.type = resolved_type_id;
+
+ return btf_type_ops(resolved_type)->check_member(env, struct_type,
+ &resolved_member,
+ resolved_type);
+}
+
+static int btf_ptr_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_size, struct_bits_off, bytes_offset;
+
+ struct_size = struct_type->size;
+ struct_bits_off = member->offset;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ if (struct_size - bytes_offset < sizeof(void *)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int btf_ref_type_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -982,21 +1087,53 @@ static void btf_ref_type_log(struct btf_verifier_env *env,
static struct btf_kind_operations modifier_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_modifier_resolve,
+ .check_member = btf_modifier_check_member,
.log_details = btf_ref_type_log,
};
static struct btf_kind_operations ptr_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_ptr_resolve,
+ .check_member = btf_ptr_check_member,
.log_details = btf_ref_type_log,
};
static struct btf_kind_operations fwd_ops = {
.check_meta = btf_ref_type_check_meta,
.resolve = btf_df_resolve,
+ .check_member = btf_df_check_member,
.log_details = btf_ref_type_log,
};
+static int btf_array_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off = member->offset;
+ u32 struct_size, bytes_offset;
+ u32 array_type_id, array_size;
+ struct btf *btf = env->btf;
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ array_type_id = member->type;
+ btf_type_id_size(btf, &array_type_id, &array_size);
+ struct_size = struct_type->size;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ if (struct_size - bytes_offset < array_size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static s32 btf_array_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -1099,9 +1236,35 @@ static void btf_array_log(struct btf_verifier_env *env,
static struct btf_kind_operations array_ops = {
.check_meta = btf_array_check_meta,
.resolve = btf_array_resolve,
+ .check_member = btf_array_check_member,
.log_details = btf_array_log,
};
+static int btf_struct_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off = member->offset;
+ u32 struct_size, bytes_offset;
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ struct_size = struct_type->size;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ if (struct_size - bytes_offset < member_type->size) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static s32 btf_struct_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -1160,6 +1323,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
const struct btf_member *member;
+ int err;
u16 i;
/* Before continue resolving the next_member,
@@ -1167,6 +1331,7 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
* type with size info.
*/
if (v->next_member) {
+ const struct btf_type *last_member_type;
const struct btf_member *last_member;
u16 last_member_type_id;
@@ -1175,6 +1340,14 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
if (WARN_ON_ONCE(!env_type_is_resolved(env,
last_member_type_id)))
return -EINVAL;
+
+ last_member_type = btf_type_by_id(env->btf,
+ last_member_type_id);
+ err = btf_type_ops(last_member_type)->check_member(env, v->t,
+ last_member,
+ last_member_type);
+ if (err)
+ return err;
}
for_each_member_from(i, v->next_member, v->t, member) {
@@ -1193,6 +1366,12 @@ static int btf_struct_resolve(struct btf_verifier_env *env,
env_stack_set_next_member(env, i + 1);
return env_stack_push(env, member_type, member_type_id);
}
+
+ err = btf_type_ops(member_type)->check_member(env, v->t,
+ member,
+ member_type);
+ if (err)
+ return err;
}
env_stack_pop_resolved(env, 0, 0);
@@ -1209,9 +1388,35 @@ static void btf_struct_log(struct btf_verifier_env *env,
static struct btf_kind_operations struct_ops = {
.check_meta = btf_struct_check_meta,
.resolve = btf_struct_resolve,
+ .check_member = btf_struct_check_member,
.log_details = btf_struct_log,
};
+static int btf_enum_check_member(struct btf_verifier_env *env,
+ const struct btf_type *struct_type,
+ const struct btf_member *member,
+ const struct btf_type *member_type)
+{
+ u32 struct_bits_off = member->offset;
+ u32 struct_size, bytes_offset;
+
+ if (BITS_PER_BYTE_MASKED(struct_bits_off)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member is not byte aligned");
+ return -EINVAL;
+ }
+
+ struct_size = struct_type->size;
+ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off);
+ if (struct_size - bytes_offset < sizeof(int)) {
+ btf_verifier_log_member(env, struct_type, member,
+ "Member exceeds struct_size");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static s32 btf_enum_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -1263,6 +1468,7 @@ static void btf_enum_log(struct btf_verifier_env *env,
static struct btf_kind_operations enum_ops = {
.check_meta = btf_enum_check_meta,
.resolve = btf_df_resolve,
+ .check_member = btf_enum_check_member,
.log_details = btf_enum_log,
};
--
2.9.5
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox