* [iproute PATCH v2 11/18] ss: Make some variables function-local
From: Phil Sutter @ 2016-12-02 10:39 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20161202104002.17310-1-phil@nwl.cc>
addrp_width and screen_width are used in main() only, so no need to have
them globally available.
Signed-off-by: Phil Sutter <phil@nwl.cc>
---
misc/ss.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index 44386c82c7578..3662f5f4861c7 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -105,10 +105,8 @@ int sctp_ino;
int netid_width;
int state_width;
-int addrp_width;
int addr_width;
int serv_width;
-int screen_width;
static const char *TCP_PROTO = "tcp";
static const char *SCTP_PROTO = "sctp";
@@ -3975,6 +3973,7 @@ int main(int argc, char *argv[])
FILE *filter_fp = NULL;
int ch;
int state_filter = 0;
+ int addrp_width, screen_width = 80;
while ((ch = getopt_long(argc, argv,
"dhaletuwxnro460spbEf:miA:D:F:vVzZN:KHS",
@@ -4264,7 +4263,6 @@ int main(int argc, char *argv[])
if (current_filter.states&(current_filter.states-1))
state_width = 10;
- screen_width = 80;
if (isatty(STDOUT_FILENO)) {
struct winsize w;
--
2.10.0
^ permalink raw reply related
* [iproute PATCH v2 13/18] ss: Get rid of useless goto in handle_follow_request()
From: Phil Sutter @ 2016-12-02 10:39 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20161202104002.17310-1-phil@nwl.cc>
Signed-off-by: Phil Sutter <phil@nwl.cc>
---
misc/ss.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index c498478421190..ec71c21ce6a4a 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -3632,7 +3632,7 @@ static int generic_show_sock(const struct sockaddr_nl *addr,
static int handle_follow_request(struct filter *f)
{
- int ret = -1;
+ int ret = 0;
int groups = 0;
struct rtnl_handle rth;
@@ -3655,10 +3655,8 @@ static int handle_follow_request(struct filter *f)
rth.local.nl_pid = 0;
if (rtnl_dump_filter(&rth, generic_show_sock, f))
- goto Exit;
+ ret = -1;
- ret = 0;
-Exit:
rtnl_close(&rth);
return ret;
}
--
2.10.0
^ permalink raw reply related
* [iproute PATCH v2 08/18] ss: Turn generic_proc_open() wrappers into macros
From: Phil Sutter @ 2016-12-02 10:39 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20161202104002.17310-1-phil@nwl.cc>
Signed-off-by: Phil Sutter <phil@nwl.cc>
---
misc/ss.c | 89 ++++++++++++++-------------------------------------------------
1 file changed, 19 insertions(+), 70 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index ad38eb97b0055..71040a82ca6b1 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -327,76 +327,25 @@ static FILE *generic_proc_open(const char *env, const char *name)
return fopen(p, "r");
}
-
-static FILE *net_tcp_open(void)
-{
- return generic_proc_open("PROC_NET_TCP", "net/tcp");
-}
-
-static FILE *net_tcp6_open(void)
-{
- return generic_proc_open("PROC_NET_TCP6", "net/tcp6");
-}
-
-static FILE *net_udp_open(void)
-{
- return generic_proc_open("PROC_NET_UDP", "net/udp");
-}
-
-static FILE *net_udp6_open(void)
-{
- return generic_proc_open("PROC_NET_UDP6", "net/udp6");
-}
-
-static FILE *net_raw_open(void)
-{
- return generic_proc_open("PROC_NET_RAW", "net/raw");
-}
-
-static FILE *net_raw6_open(void)
-{
- return generic_proc_open("PROC_NET_RAW6", "net/raw6");
-}
-
-static FILE *net_unix_open(void)
-{
- return generic_proc_open("PROC_NET_UNIX", "net/unix");
-}
-
-static FILE *net_packet_open(void)
-{
- return generic_proc_open("PROC_NET_PACKET", "net/packet");
-}
-
-static FILE *net_netlink_open(void)
-{
- return generic_proc_open("PROC_NET_NETLINK", "net/netlink");
-}
-
-static FILE *slabinfo_open(void)
-{
- return generic_proc_open("PROC_SLABINFO", "slabinfo");
-}
-
-static FILE *net_sockstat_open(void)
-{
- return generic_proc_open("PROC_NET_SOCKSTAT", "net/sockstat");
-}
-
-static FILE *net_sockstat6_open(void)
-{
- return generic_proc_open("PROC_NET_SOCKSTAT6", "net/sockstat6");
-}
-
-static FILE *net_snmp_open(void)
-{
- return generic_proc_open("PROC_NET_SNMP", "net/snmp");
-}
-
-static FILE *ephemeral_ports_open(void)
-{
- return generic_proc_open("PROC_IP_LOCAL_PORT_RANGE", "sys/net/ipv4/ip_local_port_range");
-}
+#define net_tcp_open() generic_proc_open("PROC_NET_TCP", "net/tcp")
+#define net_tcp6_open() generic_proc_open("PROC_NET_TCP6", "net/tcp6")
+#define net_udp_open() generic_proc_open("PROC_NET_UDP", "net/udp")
+#define net_udp6_open() generic_proc_open("PROC_NET_UDP6", "net/udp6")
+#define net_raw_open() generic_proc_open("PROC_NET_RAW", "net/raw")
+#define net_raw6_open() generic_proc_open("PROC_NET_RAW6", "net/raw6")
+#define net_unix_open() generic_proc_open("PROC_NET_UNIX", "net/unix")
+#define net_packet_open() generic_proc_open("PROC_NET_PACKET", \
+ "net/packet")
+#define net_netlink_open() generic_proc_open("PROC_NET_NETLINK", \
+ "net/netlink")
+#define slabinfo_open() generic_proc_open("PROC_SLABINFO", "slabinfo")
+#define net_sockstat_open() generic_proc_open("PROC_NET_SOCKSTAT", \
+ "net/sockstat")
+#define net_sockstat6_open() generic_proc_open("PROC_NET_SOCKSTAT6", \
+ "net/sockstat6")
+#define net_snmp_open() generic_proc_open("PROC_NET_SNMP", "net/snmp")
+#define ephemeral_ports_open() generic_proc_open("PROC_IP_LOCAL_PORT_RANGE", \
+ "sys/net/ipv4/ip_local_port_range")
struct user_ent {
struct user_ent *next;
--
2.10.0
^ permalink raw reply related
* [iproute PATCH v2 05/18] ss: introduce proc_ctx_print()
From: Phil Sutter @ 2016-12-02 10:39 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20161202104002.17310-1-phil@nwl.cc>
This consolidates identical code in three places. While the function
name is not quite perfect as there is different proc_ctx printing code
in netlink_show_one() as well, I sadly didn't find a more suitable one.
Signed-off-by: Phil Sutter <phil@nwl.cc>
---
misc/ss.c | 49 ++++++++++++++-----------------------------------
1 file changed, 14 insertions(+), 35 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index a953d4b022aed..fcbaecbe25a2f 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -1773,14 +1773,9 @@ void *parse_markmask(const char *markmask)
return res;
}
-static void inet_stats_print(struct sockstat *s)
+static void proc_ctx_print(struct sockstat *s)
{
- char *buf = NULL;
-
- sock_state_print(s);
-
- inet_addr_print(&s->local, s->lport, s->iface);
- inet_addr_print(&s->remote, s->rport, 0);
+ char *buf;
if (show_proc_ctx || show_sock_ctx) {
if (find_entry(s->ino, &buf,
@@ -1797,6 +1792,16 @@ static void inet_stats_print(struct sockstat *s)
}
}
+static void inet_stats_print(struct sockstat *s)
+{
+ sock_state_print(s);
+
+ inet_addr_print(&s->local, s->lport, s->iface);
+ inet_addr_print(&s->remote, s->rport, 0);
+
+ proc_ctx_print(s);
+}
+
static int proc_parse_inet_addr(char *loc, char *rem, int family, struct
sockstat * s)
{
@@ -3001,7 +3006,6 @@ static void unix_stats_print(struct sockstat *list, struct filter *f)
{
struct sockstat *s;
char *peer;
- char *ctx_buf = NULL;
bool use_proc = unix_use_proc();
char port_name[30] = {};
@@ -3050,19 +3054,7 @@ static void unix_stats_print(struct sockstat *list, struct filter *f)
sock_addr_print(peer, " ", int_to_str(s->rport, port_name),
NULL);
- if (show_proc_ctx || show_sock_ctx) {
- if (find_entry(s->ino, &ctx_buf,
- (show_proc_ctx & show_sock_ctx) ?
- PROC_SOCK_CTX : PROC_CTX) > 0) {
- printf(" users:(%s)", ctx_buf);
- free(ctx_buf);
- }
- } else if (show_users) {
- if (find_entry(s->ino, &ctx_buf, USERS) > 0) {
- printf(" users:(%s)", ctx_buf);
- free(ctx_buf);
- }
- }
+ proc_ctx_print(s);
printf("\n");
}
}
@@ -3260,7 +3252,6 @@ static int unix_show(struct filter *f)
static int packet_stats_print(struct sockstat *s, const struct filter *f)
{
- char *buf = NULL;
const char *addr, *port;
char ll_name[16];
@@ -3287,19 +3278,7 @@ static int packet_stats_print(struct sockstat *s, const struct filter *f)
sock_addr_print(addr, ":", port, NULL);
sock_addr_print("", "*", "", NULL);
- if (show_proc_ctx || show_sock_ctx) {
- if (find_entry(s->ino, &buf,
- (show_proc_ctx & show_sock_ctx) ?
- PROC_SOCK_CTX : PROC_CTX) > 0) {
- printf(" users:(%s)", buf);
- free(buf);
- }
- } else if (show_users) {
- if (find_entry(s->ino, &buf, USERS) > 0) {
- printf(" users:(%s)", buf);
- free(buf);
- }
- }
+ proc_ctx_print(s);
if (show_details)
sock_details_print(s);
--
2.10.0
^ permalink raw reply related
* [iproute PATCH v2 10/18] ss: Make user_ent_hash_build_init local to user_ent_hash_build()
From: Phil Sutter @ 2016-12-02 10:39 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20161202104002.17310-1-phil@nwl.cc>
By having it statically defined, there is no need for it to be global.
Signed-off-by: Phil Sutter <phil@nwl.cc>
---
misc/ss.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index 97fcfd4a85548..44386c82c7578 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -100,8 +100,6 @@ int show_bpf;
int show_proc_ctx;
int show_sock_ctx;
int show_header = 1;
-/* If show_users & show_proc_ctx only do user_ent_hash_build() once */
-int user_ent_hash_build_init;
int follow_events;
int sctp_ino;
@@ -421,6 +419,7 @@ static void user_ent_hash_build(void)
char *pid_context;
char *sock_context;
const char *no_ctx = "unavailable";
+ static int user_ent_hash_build_init;
/* If show_users & show_proc_ctx set only do this once */
if (user_ent_hash_build_init != 0)
--
2.10.0
^ permalink raw reply related
* [iproute PATCH v2 16/18] ss: Make sstate_name local to sock_state_print()
From: Phil Sutter @ 2016-12-02 10:40 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20161202104002.17310-1-phil@nwl.cc>
Signed-off-by: Phil Sutter <phil@nwl.cc>
---
misc/ss.c | 29 ++++++++++++++---------------
1 file changed, 14 insertions(+), 15 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index e82c416b5fa72..8439f473d7f7b 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -655,21 +655,6 @@ static unsigned long long cookie_sk_get(const uint32_t *cookie)
return (((unsigned long long)cookie[1] << 31) << 1) | cookie[0];
}
-static const char *sstate_name[] = {
- "UNKNOWN",
- [SS_ESTABLISHED] = "ESTAB",
- [SS_SYN_SENT] = "SYN-SENT",
- [SS_SYN_RECV] = "SYN-RECV",
- [SS_FIN_WAIT1] = "FIN-WAIT-1",
- [SS_FIN_WAIT2] = "FIN-WAIT-2",
- [SS_TIME_WAIT] = "TIME-WAIT",
- [SS_CLOSE] = "UNCONN",
- [SS_CLOSE_WAIT] = "CLOSE-WAIT",
- [SS_LAST_ACK] = "LAST-ACK",
- [SS_LISTEN] = "LISTEN",
- [SS_CLOSING] = "CLOSING",
-};
-
static const char *sctp_sstate_name[] = {
[SCTP_STATE_CLOSED] = "CLOSED",
[SCTP_STATE_COOKIE_WAIT] = "COOKIE_WAIT",
@@ -815,6 +800,20 @@ static const char *proto_name(int protocol)
static void sock_state_print(struct sockstat *s)
{
const char *sock_name;
+ static const char * const sstate_name[] = {
+ "UNKNOWN",
+ [SS_ESTABLISHED] = "ESTAB",
+ [SS_SYN_SENT] = "SYN-SENT",
+ [SS_SYN_RECV] = "SYN-RECV",
+ [SS_FIN_WAIT1] = "FIN-WAIT-1",
+ [SS_FIN_WAIT2] = "FIN-WAIT-2",
+ [SS_TIME_WAIT] = "TIME-WAIT",
+ [SS_CLOSE] = "UNCONN",
+ [SS_CLOSE_WAIT] = "CLOSE-WAIT",
+ [SS_LAST_ACK] = "LAST-ACK",
+ [SS_LISTEN] = "LISTEN",
+ [SS_CLOSING] = "CLOSING",
+ };
switch (s->local.family) {
case AF_UNIX:
--
2.10.0
^ permalink raw reply related
* [iproute PATCH v2 17/18] ss: Make sstate_namel local to scan_state()
From: Phil Sutter @ 2016-12-02 10:40 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20161202104002.17310-1-phil@nwl.cc>
Signed-off-by: Phil Sutter <phil@nwl.cc>
---
misc/ss.c | 29 ++++++++++++++---------------
1 file changed, 14 insertions(+), 15 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index 8439f473d7f7b..c72aba7e65ad3 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -666,21 +666,6 @@ static const char *sctp_sstate_name[] = {
[SCTP_STATE_SHUTDOWN_ACK_SENT] = "ACK_SENT",
};
-static const char *sstate_namel[] = {
- "UNKNOWN",
- [SS_ESTABLISHED] = "established",
- [SS_SYN_SENT] = "syn-sent",
- [SS_SYN_RECV] = "syn-recv",
- [SS_FIN_WAIT1] = "fin-wait-1",
- [SS_FIN_WAIT2] = "fin-wait-2",
- [SS_TIME_WAIT] = "time-wait",
- [SS_CLOSE] = "unconnected",
- [SS_CLOSE_WAIT] = "close-wait",
- [SS_LAST_ACK] = "last-ack",
- [SS_LISTEN] = "listening",
- [SS_CLOSING] = "closing",
-};
-
struct sockstat {
struct sockstat *next;
unsigned int type;
@@ -3888,6 +3873,20 @@ static void usage(void)
static int scan_state(const char *state)
{
+ static const char * const sstate_namel[] = {
+ "UNKNOWN",
+ [SS_ESTABLISHED] = "established",
+ [SS_SYN_SENT] = "syn-sent",
+ [SS_SYN_RECV] = "syn-recv",
+ [SS_FIN_WAIT1] = "fin-wait-1",
+ [SS_FIN_WAIT2] = "fin-wait-2",
+ [SS_TIME_WAIT] = "time-wait",
+ [SS_CLOSE] = "unconnected",
+ [SS_CLOSE_WAIT] = "close-wait",
+ [SS_LAST_ACK] = "last-ack",
+ [SS_LISTEN] = "listening",
+ [SS_CLOSING] = "closing",
+ };
int i;
if (strcasecmp(state, "close") == 0 ||
--
2.10.0
^ permalink raw reply related
* [iproute PATCH v2 00/18] ss: Minor code review
From: Phil Sutter @ 2016-12-02 10:39 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
This is a series of misc changes to ss code which happened as fall-out
when working on a unified output formatter (still unfinished).
Changes since v1:
- Rebased onto current upstream, resolved conflicts in patch 4 generated
by previously added SCTP socket support.
Phil Sutter (18):
ss: Mark fall through in arg parsing switch()
ss: Drop empty lines in UDP output
ss: Add missing tab when printing UNIX details
ss: Use sockstat->type in all socket types
ss: introduce proc_ctx_print()
ss: Drop list traversal from unix_stats_print()
ss: Eliminate unix_use_proc()
ss: Turn generic_proc_open() wrappers into macros
ss: Make tmr_name local to tcp_timer_print()
ss: Make user_ent_hash_build_init local to user_ent_hash_build()
ss: Make some variables function-local
ss: Make slabstat_ids local to get_slabstat()
ss: Get rid of useless goto in handle_follow_request()
ss: Get rid of single-fielded struct snmpstat
ss: Make unix_state_map local to unix_show()
ss: Make sstate_name local to sock_state_print()
ss: Make sstate_namel local to scan_state()
ss: unix_show: No need to initialize members of calloc'ed structs
misc/ss.c | 532 ++++++++++++++++++++++++++------------------------------------
1 file changed, 224 insertions(+), 308 deletions(-)
--
2.10.0
^ permalink raw reply
* [PATCH net-next 0/2] samples, bpf: Refactor; Add automated tests for cgroups
From: Sargun Dhillon @ 2016-12-02 10:42 UTC (permalink / raw)
To: netdev; +Cc: daniel, ast
These two patches factor some old, reusable code out of the existing
test_current_task_under_cgroup_user test, and add a new, automated
test.
There is some generic cgroupsv2 setup & cleanup code, given that most
environments still don't have it set up by default. With this code, we're able
to pretty easily add an automated test for future cgroupsv2 functionality.
Sargun Dhillon (2):
samples, bpf: Refactor test_current_task_under_cgroup - separate out
helpers
samples, bpf: Add automated test for cgroup filter attachments
samples/bpf/Makefile | 4 +-
samples/bpf/cgroup_helpers.c | 177 ++++++++++++++++++++++
samples/bpf/cgroup_helpers.h | 16 ++
samples/bpf/test_cgrp2_attach2.c | 132 ++++++++++++++++
samples/bpf/test_current_task_under_cgroup_user.c | 108 +++----------
5 files changed, 352 insertions(+), 85 deletions(-)
create mode 100644 samples/bpf/cgroup_helpers.c
create mode 100644 samples/bpf/cgroup_helpers.h
create mode 100644 samples/bpf/test_cgrp2_attach2.c
--
2.7.4
^ permalink raw reply
* [iproute PATCH v2 14/18] ss: Get rid of single-fielded struct snmpstat
From: Phil Sutter @ 2016-12-02 10:39 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev
In-Reply-To: <20161202104002.17310-1-phil@nwl.cc>
A struct with only a single field does not make much sense. Besides
that, it was used by print_summary() only.
Signed-off-by: Phil Sutter <phil@nwl.cc>
---
misc/ss.c | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index ec71c21ce6a4a..c7818eadf9e75 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -3661,10 +3661,6 @@ static int handle_follow_request(struct filter *f)
return ret;
}
-struct snmpstat {
- int tcp_estab;
-};
-
static int get_snmp_int(char *proto, char *key, int *result)
{
char buf[1024];
@@ -3785,11 +3781,11 @@ static int get_sockstat(struct ssummary *s)
static int print_summary(void)
{
struct ssummary s;
- struct snmpstat sn;
+ int tcp_estab;
if (get_sockstat(&s) < 0)
perror("ss: get_sockstat");
- if (get_snmp_int("Tcp:", "CurrEstab", &sn.tcp_estab) < 0)
+ if (get_snmp_int("Tcp:", "CurrEstab", &tcp_estab) < 0)
perror("ss: get_snmpstat");
get_slabstat(&slabstat);
@@ -3798,7 +3794,7 @@ static int print_summary(void)
printf("TCP: %d (estab %d, closed %d, orphaned %d, synrecv %d, timewait %d/%d), ports %d\n",
s.tcp_total + slabstat.tcp_syns + s.tcp_tws,
- sn.tcp_estab,
+ tcp_estab,
s.tcp_total - (s.tcp4_hashed+s.tcp6_hashed-s.tcp_tws),
s.tcp_orphans,
slabstat.tcp_syns,
--
2.10.0
^ permalink raw reply related
* [PATCH net-next 1/2] samples, bpf: Refactor test_current_task_under_cgroup - separate out helpers
From: Sargun Dhillon @ 2016-12-02 10:42 UTC (permalink / raw)
To: netdev; +Cc: daniel, ast
This patch modifies test_current_task_under_cgroup_user. The test has
several helpers for creating a temporary environment for cgroup
testing, and for moving the current task between cgroups. These helpers
are moved into cgroup_helpers.c so that they can be used in other tests.
Signed-off-by: Sargun Dhillon <sargun@sargun.me>
---
samples/bpf/Makefile | 2 +-
samples/bpf/cgroup_helpers.c | 177 ++++++++++++++++++++++
samples/bpf/cgroup_helpers.h | 16 ++
samples/bpf/test_current_task_under_cgroup_user.c | 108 +++----------
4 files changed, 218 insertions(+), 85 deletions(-)
create mode 100644 samples/bpf/cgroup_helpers.c
create mode 100644 samples/bpf/cgroup_helpers.h
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 22b6407e..3c805af 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -54,7 +54,7 @@ test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o
xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
# reuse xdp1 source intentionally
xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
-test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \
+test_current_task_under_cgroup-objs := bpf_load.o libbpf.o cgroup_helpers.o \
test_current_task_under_cgroup_user.o
trace_event-objs := bpf_load.o libbpf.o trace_event_user.o
sampleip-objs := bpf_load.o libbpf.o sampleip_user.o
diff --git a/samples/bpf/cgroup_helpers.c b/samples/bpf/cgroup_helpers.c
new file mode 100644
index 0000000..9d1be94
--- /dev/null
+++ b/samples/bpf/cgroup_helpers.c
@@ -0,0 +1,177 @@
+#define _GNU_SOURCE
+#include <sched.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <linux/limits.h>
+#include <stdio.h>
+#include <linux/sched.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <ftw.h>
+
+
+#include "cgroup_helpers.h"
+
+/*
+ * To avoid relying on the system setup, when setup_cgroup_environment is
+ * called we create a new mount namespace, and cgroup namespace. The cgroup2
+ * root is mounted at CGROUP_MOUNT_PATH
+ *
+ * Unfortunately, most people don't have cgroupv2 enabled at this point in time.
+ * It's easier to create our own mount namespace and manage it ourselves.
+ *
+ * We assume /mnt exists.
+ */
+
+#define WALK_FD_LIMIT 16
+#define CGROUP_MOUNT_PATH "/mnt"
+#define CGROUP_WORK_DIR "/cgroup-test-work-dir"
+#define format_cgroup_path(buf, path) \
+ snprintf(buf, sizeof(buf), "%s%s%s", CGROUP_MOUNT_PATH, \
+ CGROUP_WORK_DIR, path)
+
+/**
+ * setup_cgroup_environment() - Setup the cgroup environment
+ *
+ * After calling this function, cleanup_cgroup_environment should be called
+ * once testing is complete.
+ *
+ * This function will print an error to stderr and return 1 if it is unable
+ * to set up the cgroup environment. If setup is successful, 0 is returned.
+ */
+int setup_cgroup_environment(void)
+{
+ char cgroup_workdir[PATH_MAX + 1];
+
+ format_cgroup_path(cgroup_workdir, "");
+
+ if (unshare(CLONE_NEWNS)) {
+ log_err("unshare");
+ return 1;
+ }
+
+ if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) {
+ log_err("mount fakeroot");
+ return 1;
+ }
+
+ if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL)) {
+ log_err("mount cgroup2");
+ return 1;
+ }
+
+ /* Cleanup existing failed runs, now that the environment is setup */
+ cleanup_cgroup_environment();
+
+ if (mkdir(cgroup_workdir, 0777) && errno != EEXIST) {
+ log_err("mkdir cgroup work dir");
+ return 1;
+ }
+
+ return 0;
+}
+
+static int nftwfunc(const char *filename, const struct stat *statptr,
+ int fileflags, struct FTW *pfwt)
+{
+ if ((fileflags & FTW_D) && rmdir(filename))
+ log_err("Removing cgroup: %s", filename);
+ return 0;
+}
+
+
+static int join_cgroup_from_top(char *cgroup_path)
+{
+ char cgroup_procs_path[PATH_MAX + 1];
+ pid_t pid = getpid();
+ int fd, rc = 0;
+
+ snprintf(cgroup_procs_path, sizeof(cgroup_procs_path),
+ "%s/cgroup.procs", cgroup_path);
+
+ fd = open(cgroup_procs_path, O_WRONLY);
+ if (fd < 0) {
+ log_err("Opening Cgroup Procs: %s", cgroup_procs_path);
+ return 1;
+ }
+
+ if (dprintf(fd, "%d\n", pid) < 0) {
+ log_err("Joining Cgroup");
+ rc = 1;
+ }
+
+ close(fd);
+ return rc;
+}
+
+/**
+ * join_cgroup() - Join a cgroup
+ * @path: The cgroup path, relative to the workdir, to join
+ *
+ * This function expects a cgroup to already be created, relative to the cgroup
+ * work dir, and it joins it. For example, passing "/my-cgroup" as the path
+ * would actually put the calling process into the cgroup
+ * "/cgroup-test-work-dir/my-cgroup"
+ *
+ * On success, it returns 0, otherwise on failure it returns 1.
+ */
+int join_cgroup(char *path)
+{
+ char cgroup_path[PATH_MAX + 1];
+
+ format_cgroup_path(cgroup_path, path);
+ return join_cgroup_from_top(cgroup_path);
+}
+
+/**
+ * cleanup_cgroup_environment() - Cleanup Cgroup Testing Environment
+ *
+ * This is an idempotent function to delete all temporary cgroups that
+ * have been created during the test, including the cgroup testing work
+ * directory.
+ *
+ * At call time, it moves the calling process to the root cgroup, and then
+ * runs the deletion process. It is idempotent, and should not fail, unless
+ * a process is lingering.
+ *
+ * On failure, it will print an error to stderr, and try to continue.
+ */
+void cleanup_cgroup_environment(void)
+{
+ char cgroup_workdir[PATH_MAX + 1];
+
+ format_cgroup_path(cgroup_workdir, "");
+ join_cgroup_from_top(CGROUP_MOUNT_PATH);
+ nftw(cgroup_workdir, nftwfunc, WALK_FD_LIMIT, FTW_DEPTH | FTW_MOUNT);
+}
+
+/**
+ * create_and_get_cgroup() - Create a cgroup, relative to workdir, and get the FD
+ * @path: The cgroup path, relative to the workdir, to create
+ *
+ * This function creates a cgroup under the top level workdir and returns the
+ * file descriptor. It is idempotent.
+ *
+ * On success, it returns the file descriptor. On failure it returns 0.
+ * If there is a failure, it prints the error to stderr.
+ */
+int create_and_get_cgroup(char *path)
+{
+ char cgroup_path[PATH_MAX + 1];
+ int fd;
+
+ format_cgroup_path(cgroup_path, path);
+ if (mkdir(cgroup_path, 0777) && errno != EEXIST) {
+ log_err("mkdiring cgroup");
+ return 0;
+ }
+
+ fd = open(cgroup_path, O_RDONLY);
+ if (fd < 0) {
+ log_err("Opening Cgroup");
+ return 0;
+ }
+
+ return fd;
+}
diff --git a/samples/bpf/cgroup_helpers.h b/samples/bpf/cgroup_helpers.h
new file mode 100644
index 0000000..78c5520
--- /dev/null
+++ b/samples/bpf/cgroup_helpers.h
@@ -0,0 +1,16 @@
+#ifndef __CGROUP_HELPERS_H
+#define __CGROUP_HELPERS_H
+#include <errno.h>
+#include <string.h>
+
+#define clean_errno() (errno == 0 ? "None" : strerror(errno))
+#define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
+ __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__)
+
+
+int create_and_get_cgroup(char *path);
+int join_cgroup(char *path);
+int setup_cgroup_environment(void);
+void cleanup_cgroup_environment(void);
+
+#endif
diff --git a/samples/bpf/test_current_task_under_cgroup_user.c b/samples/bpf/test_current_task_under_cgroup_user.c
index 30b0bce..95aaaa8 100644
--- a/samples/bpf/test_current_task_under_cgroup_user.c
+++ b/samples/bpf/test_current_task_under_cgroup_user.c
@@ -11,50 +11,16 @@
#include <unistd.h>
#include "libbpf.h"
#include "bpf_load.h"
-#include <string.h>
-#include <fcntl.h>
-#include <errno.h>
#include <linux/bpf.h>
-#include <sched.h>
-#include <sys/mount.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <linux/limits.h>
+#include "cgroup_helpers.h"
-#define CGROUP_MOUNT_PATH "/mnt"
-#define CGROUP_PATH "/mnt/my-cgroup"
-
-#define clean_errno() (errno == 0 ? "None" : strerror(errno))
-#define log_err(MSG, ...) fprintf(stderr, "(%s:%d: errno: %s) " MSG "\n", \
- __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__)
-
-static int join_cgroup(char *path)
-{
- int fd, rc = 0;
- pid_t pid = getpid();
- char cgroup_path[PATH_MAX + 1];
-
- snprintf(cgroup_path, sizeof(cgroup_path), "%s/cgroup.procs", path);
-
- fd = open(cgroup_path, O_WRONLY);
- if (fd < 0) {
- log_err("Opening Cgroup");
- return 1;
- }
-
- if (dprintf(fd, "%d\n", pid) < 0) {
- log_err("Joining Cgroup");
- rc = 1;
- }
- close(fd);
- return rc;
-}
+#define CGROUP_PATH "/my-cgroup"
int main(int argc, char **argv)
{
- char filename[256];
- int cg2, idx = 0;
pid_t remote_pid, local_pid = getpid();
+ int cg2, idx = 0, rc = 0;
+ char filename[256];
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
if (load_bpf_file(filename)) {
@@ -62,47 +28,22 @@ int main(int argc, char **argv)
return 1;
}
- /*
- * This is to avoid interfering with existing cgroups. Unfortunately,
- * most people don't have cgroupv2 enabled at this point in time.
- * It's easier to create our own mount namespace and manage it
- * ourselves.
- */
- if (unshare(CLONE_NEWNS)) {
- log_err("unshare");
- return 1;
- }
-
- if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) {
- log_err("mount fakeroot");
- return 1;
- }
-
- if (mount("none", CGROUP_MOUNT_PATH, "cgroup2", 0, NULL)) {
- log_err("mount cgroup2");
- return 1;
- }
+ if (setup_cgroup_environment())
+ goto err;
- if (mkdir(CGROUP_PATH, 0777) && errno != EEXIST) {
- log_err("mkdir cgroup");
- return 1;
- }
+ cg2 = create_and_get_cgroup(CGROUP_PATH);
- cg2 = open(CGROUP_PATH, O_RDONLY);
- if (cg2 < 0) {
- log_err("opening target cgroup");
- goto cleanup_cgroup_err;
- }
+ if (!cg2)
+ goto err;
if (bpf_update_elem(map_fd[0], &idx, &cg2, BPF_ANY)) {
log_err("Adding target cgroup to map");
- goto cleanup_cgroup_err;
- }
- if (join_cgroup("/mnt/my-cgroup")) {
- log_err("Leaving target cgroup");
- goto cleanup_cgroup_err;
+ goto err;
}
+ if (join_cgroup(CGROUP_PATH))
+ goto err;
+
/*
* The installed helper program catched the sync call, and should
* write it to the map.
@@ -115,12 +56,12 @@ int main(int argc, char **argv)
fprintf(stderr,
"BPF Helper didn't write correct PID to map, but: %d\n",
remote_pid);
- goto leave_cgroup_err;
+ goto err;
}
/* Verify the negative scenario; leave the cgroup */
- if (join_cgroup(CGROUP_MOUNT_PATH))
- goto leave_cgroup_err;
+ if (join_cgroup("/"))
+ goto err;
remote_pid = 0;
bpf_update_elem(map_fd[1], &idx, &remote_pid, BPF_ANY);
@@ -130,16 +71,15 @@ int main(int argc, char **argv)
if (local_pid == remote_pid) {
fprintf(stderr, "BPF cgroup negative test did not work\n");
- goto cleanup_cgroup_err;
+ goto err;
}
- rmdir(CGROUP_PATH);
- return 0;
+ goto out;
+err:
+ rc = 1;
- /* Error condition, cleanup */
-leave_cgroup_err:
- join_cgroup(CGROUP_MOUNT_PATH);
-cleanup_cgroup_err:
- rmdir(CGROUP_PATH);
- return 1;
+out:
+ close(cg2);
+ cleanup_cgroup_environment();
+ return rc;
}
--
2.7.4
^ permalink raw reply related
* [PATCH net-next 2/2] samples, bpf: Add automated test for cgroup filter attachments
From: Sargun Dhillon @ 2016-12-02 10:42 UTC (permalink / raw)
To: netdev; +Cc: daniel, ast
This patch adds the sample program test_cgrp2_attach2. This program is
similar to test_cgrp2_attach, but it performs automated testing of the
cgroupv2 BPF attached filters. It runs the following checks:
* Simple filter attachment
* Application of filters to child cgroups
* Overriding filters on child cgroups
* Checking that this still works when the parent filter is removed
The filters that are used here are simply allow all / deny all filters, so
it isn't checking the actual functionality of the filters, but rather
the behaviour around detachment / attachment. If net_cls is enabled,
this test will fail.
Signed-off-by: Sargun Dhillon <sargun@sargun.me>
---
samples/bpf/Makefile | 2 +
samples/bpf/test_cgrp2_attach2.c | 132 +++++++++++++++++++++++++++++++++++++++
2 files changed, 134 insertions(+)
create mode 100644 samples/bpf/test_cgrp2_attach2.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 3c805af..8892d7c 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -23,6 +23,7 @@ hostprogs-y += map_perf_test
hostprogs-y += test_overhead
hostprogs-y += test_cgrp2_array_pin
hostprogs-y += test_cgrp2_attach
+hostprogs-y += test_cgrp2_attach2
hostprogs-y += xdp1
hostprogs-y += xdp2
hostprogs-y += test_current_task_under_cgroup
@@ -51,6 +52,7 @@ map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
test_cgrp2_array_pin-objs := libbpf.o test_cgrp2_array_pin.o
test_cgrp2_attach-objs := libbpf.o test_cgrp2_attach.o
+test_cgrp2_attach2-objs := libbpf.o test_cgrp2_attach2.o cgroup_helpers.o
xdp1-objs := bpf_load.o libbpf.o xdp1_user.o
# reuse xdp1 source intentionally
xdp2-objs := bpf_load.o libbpf.o xdp1_user.o
diff --git a/samples/bpf/test_cgrp2_attach2.c b/samples/bpf/test_cgrp2_attach2.c
new file mode 100644
index 0000000..ddfac42
--- /dev/null
+++ b/samples/bpf/test_cgrp2_attach2.c
@@ -0,0 +1,132 @@
+/* eBPF example program:
+ *
+ * - Creates arraymap in kernel with 4 bytes keys and 8 byte values
+ *
+ * - Loads eBPF program
+ *
+ * The eBPF program accesses the map passed in to store two pieces of
+ * information. The number of invocations of the program, which maps
+ * to the number of packets received, is stored to key 0. Key 1 is
+ * incremented on each iteration by the number of bytes stored in
+ * the skb.
+ *
+ * - Attaches the new program to a cgroup using BPF_PROG_ATTACH
+ *
+ * - Every second, reads map[0] and map[1] to see how many bytes and
+ * packets were seen on any socket of tasks in the given cgroup.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <unistd.h>
+
+#include <linux/bpf.h>
+
+#include "libbpf.h"
+#include "cgroup_helpers.h"
+
+#define FOO "/foo"
+#define BAR "/foo/bar/"
+#define PING_CMD "ping -c1 -w1 127.0.0.1"
+
+static int prog_load(int verdict)
+{
+ int ret;
+ struct bpf_insn prog[] = {
+ BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */
+ BPF_EXIT_INSN(),
+ };
+
+ ret = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SKB,
+ prog, sizeof(prog), "GPL", 0);
+
+ if (ret < 0) {
+ log_err("Loading program");
+ printf("Output from verifier:\n%s\n-------\n", bpf_log_buf);
+ return 0;
+ }
+ return ret;
+}
+
+
+int main(int argc, char **argv)
+{
+ int drop_prog, allow_prog, foo = 0, bar = 0, rc = 0;
+
+ allow_prog = prog_load(1);
+ if (!allow_prog)
+ goto err;
+
+ drop_prog = prog_load(0);
+ if (!drop_prog)
+ goto err;
+
+ if (setup_cgroup_environment())
+ goto err;
+
+ /* Create cgroup /foo, get fd, and join it */
+ foo = create_and_get_cgroup(FOO);
+ if (!foo)
+ goto err;
+
+ if (join_cgroup(FOO))
+ goto err;
+
+ if (bpf_prog_attach(drop_prog, foo, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Attaching prog to /foo");
+ goto err;
+ }
+
+ assert(system(PING_CMD) != 0);
+
+ /* Create cgroup /foo/bar, get fd, and join it */
+ bar = create_and_get_cgroup(BAR);
+ if (!bar)
+ goto err;
+
+ if (join_cgroup(BAR))
+ goto err;
+
+ assert(system(PING_CMD) != 0);
+
+ if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Attaching prog to /foo/bar");
+ goto err;
+ }
+
+ assert(system(PING_CMD) == 0);
+
+
+ if (bpf_prog_detach(bar, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching program from /foo/bar");
+ goto err;
+ }
+
+ assert(system(PING_CMD) != 0);
+
+ if (bpf_prog_attach(allow_prog, bar, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Attaching prog to /foo/bar");
+ goto err;
+ }
+
+ if (bpf_prog_detach(foo, BPF_CGROUP_INET_EGRESS)) {
+ log_err("Detaching program from /foo");
+ goto err;
+ }
+
+ assert(system(PING_CMD) == 0);
+
+ goto out;
+
+err:
+ rc = 1;
+
+out:
+ close(foo);
+ close(bar);
+ cleanup_cgroup_environment();
+ return rc;
+}
--
2.7.4
^ permalink raw reply related
* Re: stmmac: turn coalescing / NAPI off in stmmac
From: Pavel Machek @ 2016-12-02 10:42 UTC (permalink / raw)
To: Giuseppe CAVALLARO; +Cc: David Miller, alexandre.torgue, netdev, linux-kernel
In-Reply-To: <2ceae6dc-3a48-3212-c634-cc6f1f0b363f@st.com>
[-- Attachment #1: Type: text/plain, Size: 2372 bytes --]
Hi!
> >Anyway... since you asked. I believe I have a way to disable NAPI / tx
> >coalescing in the driver. Unfortunately, locking is missing on the rx
> >path, and needs to be extended to _irqsave variant on tx path.
>
> I have just replied to a previous thread about that...
Yeah, please reply to David's mail where he describes why it can't
work.
> >So patch currently looks like this (hand edited, can't be
> >applied, got it working few hours ago). Does it look acceptable?
> >
> >I'd prefer this to go after the patch that pulls common code to single
> >place, so that single place needs to be patched. Plus I guess I should
> >add ifdefs, so that more advanced NAPI / tx coalescing code can be
> >reactivated when it is fixed. Trivial fixes can go on top. Does that
> >sound like a plan?
>
> Hmm, what I find strange is that this very code has been running for a
> long time on several platforms and chip versions. No race conditions
> or lock protection problems have been found (also with lock proving
> mechanisms enabled).
Well, it works better for me when I disable CONFIG_SMP. It is normal
that locking problems are hard to reproduce :-(.
> Pavel, I ask you sorry if I missed some problems so, if you can
> (as D. Miller asked) to send us a cover letter + all patches
> I will try to reply soon. I can do also some tests if you ask
> me that! I could run on 3.x and 4.x but I cannot promise you
> benchmarks.
Actually... I have questions here. David normally pulls from you (can
I have a address of your git tree?).
Could you apply these to your git?
[PATCH] stmmac ethernet: unify locking
[PATCH] stmmac: simplify flag assignment
[PATCH] stmmac: cleanup documenation, make it match reality
They are rather trivial and independent; I'm not sure what a cover
letter would say, besides "simple fixes".
Then I can re-do the rest on top of that...
> >Which tree do you want patches against?
> >
> >https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git/ ?
>
> I think that bug fixing should be on top of net.git but I let Miller
> to decide.
Hmm. It is "only" a performance problem (40msec delays).. I guess
-next is better target.
Best regards,
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]
^ permalink raw reply
* [PATCH net v2] tcp: warn on bogus MSS and try to amend it
From: Marcelo Ricardo Leitner @ 2016-12-02 10:55 UTC (permalink / raw)
To: netdev
Cc: Jon Maxwell, Alex Sidorenko, Alexey Kuznetsov, James Morris,
Hideaki YOSHIFUJI, Patrick McHardy, tlfalcon, Brian King,
Eric Dumazet, davem, marcelo.leitner
There have been some reports lately about TCP connection stalls caused
by NIC drivers that aren't setting gso_size on aggregated packets on rx
path. This causes TCP to assume that the MSS is actually the size of the
aggregated packet, which is invalid.
Although the proper fix is to be done at each driver, it's often hard
and cumbersome for one to debug, come to such root cause and report/fix
it.
This patch amends this situation in two ways. First, it adds a warning
when this situation occurs, so it gives a hint to those trying to
debug this. It also limits the maximum probed MSS to the advertised MSS,
as it should never be any higher than that.
The result is that the connection may not have the best performance ever
but it shouldn't stall, and the admin will have a hint on what to look
for.
Tested with virtio by forcing gso_size to 0.
Cc: Jonathan Maxwell <jmaxwell37@gmail.com>
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
v2: Updated msg as suggested by David.
net/ipv4/tcp_input.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a27b9c0e27c08b4e4aeaff3d0bfdf3ae561ba4d8..fd619eb93749b6de56a41669248b337c051d9fe2 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -144,7 +144,10 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
*/
len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
- icsk->icsk_ack.rcv_mss = len;
+ icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
+ tcp_sk(sk)->advmss);
+ if (icsk->icsk_ack.rcv_mss != len)
+ pr_warn_once("Driver has suspect GRO implementation, TCP performance may be compromised.\n");
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
--
2.9.3
^ permalink raw reply related
* Re: [PATCH 2/7] net: ethernet: ti: cpdma: fix desc re-queuing
From: Ivan Khoronzhuk @ 2016-12-02 11:03 UTC (permalink / raw)
To: Grygorii Strashko
Cc: David S. Miller, netdev, Mugunthan V N, Sekhar Nori, linux-kernel,
linux-omap
In-Reply-To: <20161201233432.6182-3-grygorii.strashko@ti.com>
On Thu, Dec 01, 2016 at 05:34:27PM -0600, Grygorii Strashko wrote:
> The currently processing cpdma descriptor with EOQ flag set may
> contain two values in Next Descriptor Pointer field:
> - valid pointer: means CPDMA missed addition of new desc in queue;
It shouldn't happen in normal circumstances, right?
So why does it happen only for egress channels? And does that mean
there is some resynchronization between the submit and process functions,
or is this a h/w issue?
> - null: no more descriptors in queue.
> In the latter case, it's not required to write to HDP register, but now
> CPDMA does it.
>
> Hence, add additional check for Next Descriptor Pointer != null in
> cpdma_chan_process() function before writing in HDP register.
>
> Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
> ---
> drivers/net/ethernet/ti/davinci_cpdma.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c b/drivers/net/ethernet/ti/davinci_cpdma.c
> index 0924014..379314f 100644
> --- a/drivers/net/ethernet/ti/davinci_cpdma.c
> +++ b/drivers/net/ethernet/ti/davinci_cpdma.c
> @@ -1152,7 +1152,7 @@ static int __cpdma_chan_process(struct cpdma_chan *chan)
> chan->count--;
> chan->stats.good_dequeue++;
>
> - if (status & CPDMA_DESC_EOQ) {
> + if ((status & CPDMA_DESC_EOQ) && chan->head) {
> chan->stats.requeue++;
> chan_write(chan, hdp, desc_phys(pool, chan->head));
> }
> --
> 2.10.1
>
^ permalink raw reply
* [PATCH iproute2 V5 0/3] tc: Support for ip tunnel metadata set/unset/classify
From: Amir Vadai @ 2016-12-02 11:25 UTC (permalink / raw)
To: Stephen Hemminger
Cc: netdev, David S. Miller, Or Gerlitz, Hadar Har-Zion, Roi Dayan,
Amir Vadai
Hi,
This short series adds support for matching and setting metadata for ip tunnel
shared device using the TC system, introduced in kernel 4.9 [1].
Applied and tested on top of commit b6c7fc61faab ("ss: print new tcp_info
fields: busy, rwnd-limited, sndbuf-limited times")
Example usage:
$ tc filter add dev vxlan0 protocol ip parent ffff: \
flower \
enc_src_ip 11.11.0.2 \
enc_dst_ip 11.11.0.1 \
enc_key_id 11 \
dst_ip 11.11.11.1 \
action mirred egress redirect dev vnet0
$ tc filter add dev net0 protocol ip parent ffff: \
flower \
ip_proto 1 \
dst_ip 11.11.11.2 \
action tunnel_key set \
src_ip 11.11.0.1 \
dst_ip 11.11.0.2 \
id 11 \
action mirred egress redirect dev vxlan0
[1] - d1ba24feb466 ("Merge branch 'act_tunnel_key'")
Thanks,
Amir
Changes from V4:
- Fix rebase conflicts for net-next
Changes from V3:
- Fix bad wording in the man page about the use of the 'unset' operation
Changes from V2:
- Use const where needed
- Don't lose return value
- Introduce rta_getattr_be16() and rta_getattr_be32()
Changes from V1:
- Updated Patch 2/2 ("tc/act_tunnel: Introduce ip tunnel action") commit log
and the man page tc-tunnel_key to reflect the fact that 'unset' operation is
not mandatory.
And describe when it might be needed.
- Rename the 'release' operation to 'unset'
Amir Vadai (3):
libnetlink: Introduce rta_getattr_be*()
tc/cls_flower: Classify packet in ip tunnels
tc/act_tunnel: Introduce ip tunnel action
Amir Vadai (3):
libnetlink: Introduce rta_getattr_be*()
tc/cls_flower: Classify packet in ip tunnels
tc/act_tunnel: Introduce ip tunnel action
bridge/fdb.c | 4 +-
include/libnetlink.h | 9 ++
include/linux/tc_act/tc_tunnel_key.h | 42 ++++++
ip/iplink_geneve.c | 2 +-
ip/iplink_vxlan.c | 2 +-
man/man8/tc-flower.8 | 17 ++-
man/man8/tc-tunnel_key.8 | 112 +++++++++++++++
tc/Makefile | 1 +
tc/f_flower.c | 84 +++++++++++-
tc/m_tunnel_key.c | 258 +++++++++++++++++++++++++++++++++++
10 files changed, 522 insertions(+), 9 deletions(-)
create mode 100644 include/linux/tc_act/tc_tunnel_key.h
create mode 100644 man/man8/tc-tunnel_key.8
create mode 100644 tc/m_tunnel_key.c
--
2.10.2
^ permalink raw reply
* [PATCH iproute2 V5 1/3] libnetlink: Introduce rta_getattr_be*()
From: Amir Vadai @ 2016-12-02 11:25 UTC (permalink / raw)
To: Stephen Hemminger
Cc: netdev, David S. Miller, Or Gerlitz, Hadar Har-Zion, Roi Dayan,
Amir Vadai
In-Reply-To: <20161202112515.11705-1-amir@vadai.me>
Add the utility functions rta_getattr_be16() and rta_getattr_be32(), and
change existing code to use it.
Signed-off-by: Amir Vadai <amir@vadai.me>
---
bridge/fdb.c | 4 ++--
include/libnetlink.h | 9 +++++++++
ip/iplink_geneve.c | 2 +-
ip/iplink_vxlan.c | 2 +-
tc/f_flower.c | 2 +-
5 files changed, 14 insertions(+), 5 deletions(-)
diff --git a/bridge/fdb.c b/bridge/fdb.c
index 90f4b154c5dc..a91521776e99 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -168,10 +168,10 @@ int print_fdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
if (tb[NDA_PORT]) {
if (jw_global)
jsonw_uint_field(jw_global, "port",
- ntohs(rta_getattr_u16(tb[NDA_PORT])));
+ rta_getattr_be16(tb[NDA_PORT]));
else
fprintf(fp, "port %d ",
- ntohs(rta_getattr_u16(tb[NDA_PORT])));
+ rta_getattr_be16(tb[NDA_PORT]));
}
if (tb[NDA_VNI]) {
diff --git a/include/libnetlink.h b/include/libnetlink.h
index 483509ca9635..751ebf186dd4 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -10,6 +10,7 @@
#include <linux/if_addr.h>
#include <linux/neighbour.h>
#include <linux/netconf.h>
+#include <arpa/inet.h>
struct rtnl_handle {
int fd;
@@ -140,10 +141,18 @@ static inline __u16 rta_getattr_u16(const struct rtattr *rta)
{
return *(__u16 *)RTA_DATA(rta);
}
+static inline __be16 rta_getattr_be16(const struct rtattr *rta)
+{
+ return ntohs(rta_getattr_u16(rta));
+}
static inline __u32 rta_getattr_u32(const struct rtattr *rta)
{
return *(__u32 *)RTA_DATA(rta);
}
+static inline __be32 rta_getattr_be32(const struct rtattr *rta)
+{
+ return ntohl(rta_getattr_u32(rta));
+}
static inline __u64 rta_getattr_u64(const struct rtattr *rta)
{
__u64 tmp;
diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c
index 3bfba91c644c..1e6669d07d60 100644
--- a/ip/iplink_geneve.c
+++ b/ip/iplink_geneve.c
@@ -234,7 +234,7 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
if (tb[IFLA_GENEVE_PORT])
fprintf(f, "dstport %u ",
- ntohs(rta_getattr_u16(tb[IFLA_GENEVE_PORT])));
+ rta_getattr_be16(tb[IFLA_GENEVE_PORT]));
if (tb[IFLA_GENEVE_COLLECT_METADATA])
fputs("external ", f);
diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index 93af979a1e97..6d02bb47b2f0 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -413,7 +413,7 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
if (tb[IFLA_VXLAN_PORT])
fprintf(f, "dstport %u ",
- ntohs(rta_getattr_u16(tb[IFLA_VXLAN_PORT])));
+ rta_getattr_be16(tb[IFLA_VXLAN_PORT]));
if (tb[IFLA_VXLAN_LEARNING] &&
!rta_getattr_u8(tb[IFLA_VXLAN_LEARNING]))
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 1555764b9996..e132974e0d1d 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -511,7 +511,7 @@ static void flower_print_ip_addr(FILE *f, char *name, __be16 eth_type,
static void flower_print_port(FILE *f, char *name, struct rtattr *attr)
{
- fprintf(f, "\n %s %d", name, ntohs(rta_getattr_u16(attr)));
+ fprintf(f, "\n %s %d", name, rta_getattr_be16(attr));
}
static int flower_print_opt(struct filter_util *qu, FILE *f,
--
2.10.2
^ permalink raw reply related
* [PATCH iproute2 V5 2/3] tc/cls_flower: Classify packet in ip tunnels
From: Amir Vadai @ 2016-12-02 11:25 UTC (permalink / raw)
To: Stephen Hemminger
Cc: netdev, David S. Miller, Or Gerlitz, Hadar Har-Zion, Roi Dayan,
Amir Vadai
In-Reply-To: <20161202112515.11705-1-amir@vadai.me>
Introduce classifying by metadata extracted by the tunnel device.
Outer header fields - source/dest ip and tunnel id, are extracted from
the metadata when classifying.
For example, the following will add a filter on the ingress Qdisc of a shared
vxlan device named 'vxlan0', to forward packets with outer src ip
11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. The packets will be
forwarded to tap device 'vnet0':
$ tc filter add dev vxlan0 protocol ip parent ffff: \
flower \
enc_src_ip 11.11.0.2 \
enc_dst_ip 11.11.0.1 \
enc_key_id 11 \
dst_ip 11.11.11.1 \
action mirred egress redirect dev vnet0
Signed-off-by: Amir Vadai <amir@vadai.me>
---
man/man8/tc-flower.8 | 17 ++++++++++-
tc/f_flower.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++--
2 files changed, 95 insertions(+), 4 deletions(-)
diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 16ef261797ab..dd3564917dcc 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -34,7 +34,11 @@ flower \- flow based traffic control filter
.BR dst_ip " | " src_ip " } { "
.IR ipv4_address " | " ipv6_address " } | { "
.BR dst_port " | " src_port " } "
-.IR port_number " }"
+.IR port_number " } | "
+.B enc_key_id
+.IR KEY-ID " | {"
+.BR enc_dst_ip " | " enc_src_ip " } { "
+.IR ipv4_address " | " ipv6_address " } | "
.SH DESCRIPTION
The
.B flower
@@ -112,6 +116,17 @@ which has to be specified in beforehand.
Match on layer 4 protocol source or destination port number. Only available for
.BR ip_proto " values " udp " and " tcp ,
which has to be specified in beforehand.
+.TP
+.BI enc_key_id " NUMBER"
+.TQ
+.BI enc_dst_ip " ADDRESS"
+.TQ
+.BI enc_src_ip " ADDRESS"
+Match on IP tunnel metadata. Key id
+.I NUMBER
+is a 32 bit tunnel key id (e.g. VNI for VXLAN tunnel).
+.I ADDRESS
+must be a valid IPv4 or IPv6 address.
.SH NOTES
As stated above where applicable, matches of a certain layer implicitly depend
on the matches of the next lower layer. Precisely, layer one and two matches (
diff --git a/tc/f_flower.c b/tc/f_flower.c
index e132974e0d1d..7e7f4c92a947 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -41,7 +41,10 @@ static void explain(void)
" dst_ip [ IPV4-ADDR | IPV6-ADDR ] |\n"
" src_ip [ IPV4-ADDR | IPV6-ADDR ] |\n"
" dst_port PORT-NUMBER |\n"
- " src_port PORT-NUMBER }\n"
+ " src_port PORT-NUMBER |\n"
+ " enc_dst_ip [ IPV4-ADDR | IPV6-ADDR ] |\n"
+ " enc_src_ip [ IPV4-ADDR | IPV6-ADDR ] |\n"
+ " enc_key_id [ KEY-ID ] }\n"
" FILTERID := X:Y:Z\n"
" ACTION-SPEC := ... look at individual actions\n"
"\n"
@@ -125,8 +128,9 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
family = AF_INET;
} else if (eth_type == htons(ETH_P_IPV6)) {
family = AF_INET6;
+ } else if (!eth_type) {
+ family = AF_UNSPEC;
} else {
- fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
}
@@ -134,8 +138,10 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
if (ret)
return -1;
- if (addr.family != family)
+ if (family && (addr.family != family)) {
+ fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
return -1;
+ }
addattr_l(n, MAX_MSG, addr.family == AF_INET ? addr4_type : addr6_type,
addr.data, addr.bytelen);
@@ -197,6 +203,18 @@ static int flower_parse_port(char *str, __u8 ip_port, bool is_src,
return 0;
}
+static int flower_parse_key_id(const char *str, int type, struct nlmsghdr *n)
+{
+ int ret;
+ __be32 key_id;
+
+ ret = get_be32(&key_id, str, 10);
+ if (!ret)
+ addattr32(n, MAX_MSG, type, key_id);
+
+ return ret;
+}
+
static int flower_parse_opt(struct filter_util *qu, char *handle,
int argc, char **argv, struct nlmsghdr *n)
{
@@ -354,6 +372,38 @@ static int flower_parse_opt(struct filter_util *qu, char *handle,
fprintf(stderr, "Illegal \"src_port\"\n");
return -1;
}
+ } else if (matches(*argv, "enc_dst_ip") == 0) {
+ NEXT_ARG();
+ ret = flower_parse_ip_addr(*argv, 0,
+ TCA_FLOWER_KEY_ENC_IPV4_DST,
+ TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+ TCA_FLOWER_KEY_ENC_IPV6_DST,
+ TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
+ n);
+ if (ret < 0) {
+ fprintf(stderr, "Illegal \"enc_dst_ip\"\n");
+ return -1;
+ }
+ } else if (matches(*argv, "enc_src_ip") == 0) {
+ NEXT_ARG();
+ ret = flower_parse_ip_addr(*argv, 0,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC,
+ TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC,
+ TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
+ n);
+ if (ret < 0) {
+ fprintf(stderr, "Illegal \"enc_src_ip\"\n");
+ return -1;
+ }
+ } else if (matches(*argv, "enc_key_id") == 0) {
+ NEXT_ARG();
+ ret = flower_parse_key_id(*argv,
+ TCA_FLOWER_KEY_ENC_KEY_ID, n);
+ if (ret < 0) {
+ fprintf(stderr, "Illegal \"enc_key_id\"\n");
+ return -1;
+ }
} else if (matches(*argv, "action") == 0) {
NEXT_ARG();
ret = parse_action(&argc, &argv, TCA_FLOWER_ACT, n);
@@ -514,6 +564,13 @@ static void flower_print_port(FILE *f, char *name, struct rtattr *attr)
fprintf(f, "\n %s %d", name, rta_getattr_be16(attr));
}
+static void flower_print_key_id(FILE *f, const char *name,
+ struct rtattr *attr)
+{
+ if (attr)
+ fprintf(f, "\n %s %d", name, rta_getattr_be32(attr));
+}
+
static int flower_print_opt(struct filter_util *qu, FILE *f,
struct rtattr *opt, __u32 handle)
{
@@ -579,6 +636,25 @@ static int flower_print_opt(struct filter_util *qu, FILE *f,
flower_print_port(f, "src_port",
tb[flower_port_attr_type(ip_proto, true)]);
+ flower_print_ip_addr(f, "enc_dst_ip",
+ tb[TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] ?
+ htons(ETH_P_IP) : htons(ETH_P_IPV6),
+ tb[TCA_FLOWER_KEY_ENC_IPV4_DST],
+ tb[TCA_FLOWER_KEY_ENC_IPV4_DST_MASK],
+ tb[TCA_FLOWER_KEY_ENC_IPV6_DST],
+ tb[TCA_FLOWER_KEY_ENC_IPV6_DST_MASK]);
+
+ flower_print_ip_addr(f, "enc_src_ip",
+ tb[TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] ?
+ htons(ETH_P_IP) : htons(ETH_P_IPV6),
+ tb[TCA_FLOWER_KEY_ENC_IPV4_SRC],
+ tb[TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK],
+ tb[TCA_FLOWER_KEY_ENC_IPV6_SRC],
+ tb[TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK]);
+
+ flower_print_key_id(f, "enc_key_id",
+ tb[TCA_FLOWER_KEY_ENC_KEY_ID]);
+
if (tb[TCA_FLOWER_FLAGS]) {
__u32 flags = rta_getattr_u32(tb[TCA_FLOWER_FLAGS]);
--
2.10.2
^ permalink raw reply related
* [PATCH iproute2 V5 3/3] tc/act_tunnel: Introduce ip tunnel action
From: Amir Vadai @ 2016-12-02 11:25 UTC (permalink / raw)
To: Stephen Hemminger
Cc: netdev, David S. Miller, Or Gerlitz, Hadar Har-Zion, Roi Dayan,
Amir Vadai
In-Reply-To: <20161202112515.11705-1-amir@vadai.me>
This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from such a device.
The 'unset' action is optional. It is used to explicitly unset the
metadata created by the tunnel device during decap. If not used, the
metadata will be released automatically by the kernel.
The 'set' operation, will set the metadata with the specified values for
the encap.
For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, a metadata for the vxlan tunnel is created using the
tunnel_key action and its arguments:
$ tc filter add dev net0 protocol ip parent ffff: \
flower \
ip_proto 1 \
dst_ip 11.11.11.2 \
action tunnel_key set \
src_ip 11.11.0.1 \
dst_ip 11.11.0.2 \
id 11 \
action mirred egress redirect dev vxlan0
Signed-off-by: Amir Vadai <amir@vadai.me>
---
include/linux/tc_act/tc_tunnel_key.h | 42 ++++++
man/man8/tc-tunnel_key.8 | 112 +++++++++++++++
tc/Makefile | 1 +
tc/m_tunnel_key.c | 258 +++++++++++++++++++++++++++++++++++
4 files changed, 413 insertions(+)
create mode 100644 include/linux/tc_act/tc_tunnel_key.h
create mode 100644 man/man8/tc-tunnel_key.8
create mode 100644 tc/m_tunnel_key.c
diff --git a/include/linux/tc_act/tc_tunnel_key.h b/include/linux/tc_act/tc_tunnel_key.h
new file mode 100644
index 000000000000..f9ddf5369a45
--- /dev/null
+++ b/include/linux/tc_act/tc_tunnel_key.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_TUNNEL_KEY_H
+#define __LINUX_TC_TUNNEL_KEY_H
+
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_TUNNEL_KEY 17
+
+#define TCA_TUNNEL_KEY_ACT_SET 1
+#define TCA_TUNNEL_KEY_ACT_RELEASE 2
+
+struct tc_tunnel_key {
+ tc_gen;
+ int t_action;
+};
+
+enum {
+ TCA_TUNNEL_KEY_UNSPEC,
+ TCA_TUNNEL_KEY_TM,
+ TCA_TUNNEL_KEY_PARMS,
+ TCA_TUNNEL_KEY_ENC_IPV4_SRC, /* be32 */
+ TCA_TUNNEL_KEY_ENC_IPV4_DST, /* be32 */
+ TCA_TUNNEL_KEY_ENC_IPV6_SRC, /* struct in6_addr */
+ TCA_TUNNEL_KEY_ENC_IPV6_DST, /* struct in6_addr */
+ TCA_TUNNEL_KEY_ENC_KEY_ID, /* be64 */
+ TCA_TUNNEL_KEY_PAD,
+ __TCA_TUNNEL_KEY_MAX,
+};
+
+#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1)
+
+#endif
+
diff --git a/man/man8/tc-tunnel_key.8 b/man/man8/tc-tunnel_key.8
new file mode 100644
index 000000000000..17b15b9b34b9
--- /dev/null
+++ b/man/man8/tc-tunnel_key.8
@@ -0,0 +1,112 @@
+.TH "Tunnel metadata manipulation action in tc" 8 "10 Nov 2016" "iproute2" "Linux"
+
+.SH NAME
+tunnel_key - Tunnel metadata manipulation
+.SH SYNOPSIS
+.in +8
+.ti -8
+.BR tc " ... " "action tunnel_key" " { " unset " | "
+.IR SET " }"
+
+.ti -8
+.IR SET " := "
+.BR set " " src_ip
+.IR ADDRESS
+.BR dst_ip
+.IR ADDRESS
+.BI id " KEY_ID"
+
+.SH DESCRIPTION
+The
+.B tunnel_key
+action combined with a shared IP tunnel device, allows to perform IP tunnel en-
+or decapsulation on a packet, reflected by
+the operation modes
+.IR UNSET " and " SET .
+The
+.I UNSET
+mode is optional - even without using it, the metadata information will be
+released automatically when packet processing will be finished.
+.IR UNSET
+function could be used in cases when traffic is forwarded between two tunnels,
+where the metadata from the first tunnel will be used for encapsulation done by
+the second tunnel.
+.IR SET
+mode requires the source and destination ip
+.I ADDRESS
+and the tunnel key id
+.I KEY_ID
+which will be used by the ip tunnel shared device to create the tunnel header. The
+.B tunnel_key
+action is useful only in combination with a
+.B mirred redirect
+action to a shared IP tunnel device which will use the metadata (for
+.I SET
+) and unset the metadata created by it (for
+.I UNSET
+).
+
+.SH OPTIONS
+.TP
+.B unset
+Unset the tunnel metadata created by the IP tunnel device. This function is
+not mandatory and might be used only in some specific use cases (as explained
+above).
+.TP
+.B set
+Set tunnel metadata to be used by the IP tunnel device. Requires
+.B id
+,
+.B src_ip
+and
+.B dst_ip
+options.
+.RS
+.TP
+.B id
+Tunnel ID (for example VNI in VXLAN tunnel)
+.TP
+.B src_ip
+Outer header source IP address (IPv4 or IPv6)
+.TP
+.B dst_ip
+Outer header destination IP address (IPv4 or IPv6)
+.RE
+.SH EXAMPLES
+The following example encapsulates incoming ICMP packets on eth0 into a vxlan
+tunnel, by setting metadata to VNI 11, source IP 11.11.0.1 and destination IP
+11.11.0.2, and by redirecting the packet with the metadata to device vxlan0,
+which will do the actual encapsulation using the metadata:
+
+.RS
+.EX
+#tc qdisc add dev eth0 handle ffff: ingress
+#tc filter add dev eth0 protocol ip parent ffff: \\
+ flower \\
+ ip_proto icmp \\
+ action tunnel_key set \\
+ src_ip 11.11.0.1 \\
+ dst_ip 11.11.0.2 \\
+ id 11 \\
+ action mirred egress redirect dev vxlan0
+.EE
+.RE
+
+Here is an example of the
+.B unset
+function: Incoming VXLAN traffic with outer IP's and VNI 11 is decapsulated by
+vxlan0 and metadata is unset before redirecting to tunl1 device:
+
+.RS
+.EX
+#tc qdisc add dev eth0 handle ffff: ingress
+#tc filter add dev vxlan0 protocol ip parent ffff: \\
+ flower \\
+ enc_src_ip 11.11.0.2 enc_dst_ip 11.11.0.1 enc_key_id 11 \\
+ action tunnel_key unset \\
+ action mirred egress redirect dev tunl1
+.EE
+.RE
+
+.SH SEE ALSO
+.BR tc (8)
diff --git a/tc/Makefile b/tc/Makefile
index f986fcb9e9fd..bb9011432ea1 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -50,6 +50,7 @@ TCMODULES += m_simple.o
TCMODULES += m_vlan.o
TCMODULES += m_connmark.o
TCMODULES += m_bpf.o
+TCMODULES += m_tunnel_key.o
TCMODULES += p_ip.o
TCMODULES += p_icmp.o
TCMODULES += p_tcp.o
diff --git a/tc/m_tunnel_key.c b/tc/m_tunnel_key.c
new file mode 100644
index 000000000000..f4a20e24e0bf
--- /dev/null
+++ b/tc/m_tunnel_key.c
@@ -0,0 +1,258 @@
+/*
+ * m_tunnel_key.c ip tunnel manipulation module
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Amir Vadai <amir@vadai.me>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <linux/if_ether.h>
+#include "utils.h"
+#include "rt_names.h"
+#include "tc_util.h"
+#include <linux/tc_act/tc_tunnel_key.h>
+
+static void explain(void)
+{
+ fprintf(stderr, "Usage: tunnel_key unset\n");
+ fprintf(stderr, " tunnel_key set id TUNNELID src_ip IP dst_ip IP\n");
+}
+
+static void usage(void)
+{
+ explain();
+ exit(-1);
+}
+
+static int tunnel_key_parse_ip_addr(const char *str, int addr4_type,
+ int addr6_type, struct nlmsghdr *n)
+{
+ inet_prefix addr;
+ int ret;
+
+ ret = get_addr(&addr, str, AF_UNSPEC);
+ if (ret)
+ return ret;
+
+ addattr_l(n, MAX_MSG, addr.family == AF_INET ? addr4_type : addr6_type,
+ addr.data, addr.bytelen);
+
+ return 0;
+}
+
+static int tunnel_key_parse_key_id(const char *str, int type,
+ struct nlmsghdr *n)
+{
+ __be32 key_id;
+ int ret;
+
+ ret = get_be32(&key_id, str, 10);
+ if (!ret)
+ addattr32(n, MAX_MSG, type, key_id);
+
+ return ret;
+}
+
+static int parse_tunnel_key(struct action_util *a, int *argc_p, char ***argv_p,
+ int tca_id, struct nlmsghdr *n)
+{
+ struct tc_tunnel_key parm = { .action = TC_ACT_PIPE };
+ char **argv = *argv_p;
+ int argc = *argc_p;
+ struct rtattr *tail;
+ int action = 0;
+ int ret;
+ int has_src_ip = 0;
+ int has_dst_ip = 0;
+ int has_key_id = 0;
+
+ if (matches(*argv, "tunnel_key") != 0)
+ return -1;
+
+ tail = NLMSG_TAIL(n);
+ addattr_l(n, MAX_MSG, tca_id, NULL, 0);
+
+ NEXT_ARG();
+
+ while (argc > 0) {
+ if (matches(*argv, "unset") == 0) {
+ if (action) {
+ fprintf(stderr, "unexpected \"%s\" - action already specified\n",
+ *argv);
+ explain();
+ return -1;
+ }
+ action = TCA_TUNNEL_KEY_ACT_RELEASE;
+ } else if (matches(*argv, "set") == 0) {
+ if (action) {
+ fprintf(stderr, "unexpected \"%s\" - action already specified\n",
+ *argv);
+ explain();
+ return -1;
+ }
+ action = TCA_TUNNEL_KEY_ACT_SET;
+ } else if (matches(*argv, "src_ip") == 0) {
+ NEXT_ARG();
+ ret = tunnel_key_parse_ip_addr(*argv,
+ TCA_TUNNEL_KEY_ENC_IPV4_SRC,
+ TCA_TUNNEL_KEY_ENC_IPV6_SRC,
+ n);
+ if (ret < 0) {
+ fprintf(stderr, "Illegal \"src_ip\"\n");
+ return -1;
+ }
+ has_src_ip = 1;
+ } else if (matches(*argv, "dst_ip") == 0) {
+ NEXT_ARG();
+ ret = tunnel_key_parse_ip_addr(*argv,
+ TCA_TUNNEL_KEY_ENC_IPV4_DST,
+ TCA_TUNNEL_KEY_ENC_IPV6_DST,
+ n);
+ if (ret < 0) {
+ fprintf(stderr, "Illegal \"dst_ip\"\n");
+ return -1;
+ }
+ has_dst_ip = 1;
+ } else if (matches(*argv, "id") == 0) {
+ NEXT_ARG();
+ ret = tunnel_key_parse_key_id(*argv, TCA_TUNNEL_KEY_ENC_KEY_ID, n);
+ if (ret < 0) {
+ fprintf(stderr, "Illegal \"id\"\n");
+ return -1;
+ }
+ has_key_id = 1;
+ } else if (matches(*argv, "help") == 0) {
+ usage();
+ } else {
+ break;
+ }
+ NEXT_ARG_FWD();
+ }
+
+ if (argc && !action_a2n(*argv, &parm.action, false))
+ NEXT_ARG_FWD();
+
+ if (argc) {
+ if (matches(*argv, "index") == 0) {
+ NEXT_ARG();
+ if (get_u32(&parm.index, *argv, 10)) {
+ fprintf(stderr, "tunnel_key: Illegal \"index\"\n");
+ return -1;
+ }
+
+ NEXT_ARG_FWD();
+ }
+ }
+
+ if (action == TCA_TUNNEL_KEY_ACT_SET &&
+ (!has_src_ip || !has_dst_ip || !has_key_id)) {
+ fprintf(stderr, "set needs tunnel_key parameters\n");
+ explain();
+ return -1;
+ }
+
+ parm.t_action = action;
+ addattr_l(n, MAX_MSG, TCA_TUNNEL_KEY_PARMS, &parm, sizeof(parm));
+ tail->rta_len = (char *)NLMSG_TAIL(n) - (char *)tail;
+
+ *argc_p = argc;
+ *argv_p = argv;
+
+ return 0;
+}
+
+static void tunnel_key_print_ip_addr(FILE *f, const char *name,
+ struct rtattr *attr)
+{
+ int family;
+ size_t len;
+
+ if (!attr)
+ return;
+
+ len = RTA_PAYLOAD(attr);
+
+ if (len == 4)
+ family = AF_INET;
+ else if (len == 16)
+ family = AF_INET6;
+ else
+ return;
+
+ fprintf(f, "\n\t%s %s", name, rt_addr_n2a_rta(family, attr));
+}
+
+static void tunnel_key_print_key_id(FILE *f, const char *name,
+ struct rtattr *attr)
+{
+ if (!attr)
+ return;
+ fprintf(f, "\n\t%s %d", name, rta_getattr_be32(attr));
+}
+
+static int print_tunnel_key(struct action_util *au, FILE *f, struct rtattr *arg)
+{
+ struct rtattr *tb[TCA_TUNNEL_KEY_MAX + 1];
+ struct tc_tunnel_key *parm;
+
+ if (!arg)
+ return -1;
+
+ parse_rtattr_nested(tb, TCA_TUNNEL_KEY_MAX, arg);
+
+ if (!tb[TCA_TUNNEL_KEY_PARMS]) {
+ fprintf(f, "[NULL tunnel_key parameters]");
+ return -1;
+ }
+ parm = RTA_DATA(tb[TCA_TUNNEL_KEY_PARMS]);
+
+ fprintf(f, "tunnel_key");
+
+ switch (parm->t_action) {
+ case TCA_TUNNEL_KEY_ACT_RELEASE:
+ fprintf(f, " unset");
+ break;
+ case TCA_TUNNEL_KEY_ACT_SET:
+ fprintf(f, " set");
+ tunnel_key_print_ip_addr(f, "src_ip",
+ tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
+ tunnel_key_print_ip_addr(f, "dst_ip",
+ tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
+ tunnel_key_print_ip_addr(f, "src_ip",
+ tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
+ tunnel_key_print_ip_addr(f, "dst_ip",
+ tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
+ tunnel_key_print_key_id(f, "key_id",
+ tb[TCA_TUNNEL_KEY_ENC_KEY_ID]);
+ break;
+ }
+ fprintf(f, " %s", action_n2a(parm->action));
+
+ fprintf(f, "\n\tindex %d ref %d bind %d", parm->index, parm->refcnt,
+ parm->bindcnt);
+
+ if (show_stats) {
+ if (tb[TCA_TUNNEL_KEY_TM]) {
+ struct tcf_t *tm = RTA_DATA(tb[TCA_TUNNEL_KEY_TM]);
+
+ print_tm(f, tm);
+ }
+ }
+
+ fprintf(f, "\n ");
+
+ return 0;
+}
+
+struct action_util tunnel_key_action_util = {
+ .id = "tunnel_key",
+ .parse_aopt = parse_tunnel_key,
+ .print_aopt = print_tunnel_key,
+};
--
2.10.2
^ permalink raw reply related
* Re: [PATCH 5/7] Documentation: DT: net: cpsw: allow to specify descriptors pool size
From: Ivan Khoronzhuk @ 2016-12-02 11:28 UTC (permalink / raw)
To: Grygorii Strashko
Cc: David S. Miller, netdev, Mugunthan V N, Sekhar Nori, linux-kernel,
linux-omap
In-Reply-To: <20161201233432.6182-6-grygorii.strashko@ti.com>
On Thu, Dec 01, 2016 at 05:34:30PM -0600, Grygorii Strashko wrote:
> Add optional property "descs_pool_size" to specify buffer descriptor's
> pool size. The "descs_pool_size" should define total number of CPDMA
> CPPI descriptors to be used for both ingress/egress packets
> processing. If not specified - the default value 256 will be used
> which will allow to place descriptor's pool into the internal CPPI
> RAM on most of TI SoC.
>
> Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
> ---
> Documentation/devicetree/bindings/net/cpsw.txt | 5 +++++
> 1 file changed, 5 insertions(+)
>
> diff --git a/Documentation/devicetree/bindings/net/cpsw.txt b/Documentation/devicetree/bindings/net/cpsw.txt
> index 5ad439f..b99d196 100644
> --- a/Documentation/devicetree/bindings/net/cpsw.txt
> +++ b/Documentation/devicetree/bindings/net/cpsw.txt
> @@ -35,6 +35,11 @@ Optional properties:
> For example in dra72x-evm, pcf gpio has to be
> driven low so that cpsw slave 0 and phy data
> lines are connected via mux.
> +- descs_pool_size : total number of CPDMA CPPI descriptors to be used for
> + both ingress/egress packets processing. if not
> + specified the default value 256 will be used which
> + will allow to place descriptors pool into the
> + internal CPPI RAM.
Does it describe h/w? Why not a module parameter? Or even something like ethtool num
ring entries?
>
> Slave Properties:
> --
> 2.10.1
>
^ permalink raw reply
* Re: [flamebait] xdp, well meaning but pointless
From: Hannes Frederic Sowa @ 2016-12-02 11:54 UTC (permalink / raw)
To: Jesper Dangaard Brouer, Tom Herbert
Cc: Thomas Graf, Florian Westphal, Linux Kernel Network Developers
In-Reply-To: <20161202112450.1720d33d@redhat.com>
On 02.12.2016 11:24, Jesper Dangaard Brouer wrote:
> On Thu, 1 Dec 2016 13:51:32 -0800
> Tom Herbert <tom@herbertland.com> wrote:
>
>>>> The technical plenary at the last IETF in Seoul a couple of weeks ago was
>>>> exclusively focussed on DDOS in light of the recent attack against
>>>> Dyn. There were speakers from Cloudflare and Dyn. The Cloudflare
>>>> presentation by Nick Sullivan
>>>> (https://www.ietf.org/proceedings/97/slides/slides-97-ietf-sessb-how-to-stay-online-harsh-realities-of-operating-in-a-hostile-network-nick-sullivan-01.pdf)
>>>> alluded to some implementation of DDOS mitigation. In particular, on
>>>> slide 6 Nick gave some numbers for drop rates in DDOS. The "kernel"
>
> slide 14
>
>>>> numbers he gave were based on iptables+BPF and that was a whole
>>>> 1.2Mpps-- somehow that seems ridiculously low to me (I said so at the mic
>>>> and that's also when I introduced XDP to the whole IETF :-) ). If that's
>>>> the best we can do the Internet is in a world of hurt. DDOS mitigation
>>>> alone is probably a sufficient motivation to look at XDP. We need
>>>> something that drops bad packets as quickly as possible when under
>>>> attack, we need this to be integrated into the stack, we need it to be
>>>> programmable to deal with the increasing savvy of attackers, and we
>>>> don't want to be forced to be dependent on HW solutions. This is why
>>>> we created XDP!
>
> The 1.2Mpps number is a bit low, but we are unfortunately in that
> ballpark.
>
>>> I totally understand that. But in my reply to David in this thread I
>>> mentioned DNS apex processing as being problematic which is actually
>>> being referred in your linked slide deck on page 9 ("What do floods look
>>> like") and the problematic of parsing DNS packets in XDP due to string
>>> processing and looping inside eBPF.
>
> That is a weak argument. You do realize CloudFlare actually uses eBPF to
> do this exact filtering, and (so far) eBPF for parsing DNS has been
> sufficient for them.
You are talking about this code on the following slides (I actually
transcribed it for you here and disassembled):
l0: ld #0x14
l1: ldxb 4*([0]&0xf)
l2: add x
l3: tax
l4: ld [x+0]
l5: jeq #0x7657861, l6, l13
l6: ld [x+4]
l7: jeq #0x6d706c65, l8, l13
l8: ld [x+8]
l9: jeq #0x3636f6d, l10, l13
l10: ldb [x+12]
l11: jeq #0, l12, l13
l12: ret #0x1
l13: ret #0
You can offload this to u32 in hardware if that is what you want.
The reason this works is because of netfilter, which allows them to
dynamically generate BPF programs and insert and delete them from
chains, do intersection or unions of them.
If you have a freestanding program like in XDP the complexity space is a
different one and not comparable to this at all.
Bye,
Hannes
^ permalink raw reply
* [PATCH iproute2/net-next] ss: initialise variables outside of for loop
From: Simon Horman @ 2016-12-02 11:56 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: netdev, Simon Horman
Declare for loop variables outside of the for loop statement. GCC flags
in-loop declarations as being out of spec unless C99 or C11 mode is used.
With this change the entire tree appears to compile cleanly with -Wall.
$ gcc --version
gcc (Debian 4.9.2-10) 4.9.2
...
$ make
...
ss.c: In function ‘unix_show_sock’:
ss.c:3128:4: error: ‘for’ loop initial declarations are only allowed in C99 or C11 mode
...
Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
misc/ss.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/misc/ss.c b/misc/ss.c
index 839781ee29bc..ce0b9d3d993d 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -3124,10 +3124,12 @@ static int unix_show_sock(const struct sockaddr_nl *addr, struct nlmsghdr *nlh,
memcpy(name, RTA_DATA(tb[UNIX_DIAG_NAME]), len);
name[len] = '\0';
- if (name[0] == '\0')
- for (int i = 0; i < len; i++)
+ if (name[0] == '\0') {
+ int i;
+ for (i = 0; i < len; i++)
if (name[i] == '\0')
name[i] = '@';
+ }
stat.name = &name[0];
memcpy(stat.local.data, &stat.name, sizeof(stat.name));
}
--
2.7.0.rc3.207.g0ac5344
^ permalink raw reply related
* Re: [PATCH net v3] tipc: check minimum bearer MTU
From: Ying Xue @ 2016-12-02 12:07 UTC (permalink / raw)
To: Michal Kubecek, Jon Maloy
Cc: Qian, netdev, Zhang, linux-kernel, tipc-discussion, Ben Hutchings,
David S. Miller
In-Reply-To: <20161202083341.BB955A0F33@unicorn.suse.cz>
On 12/02/2016 04:33 PM, Michal Kubecek wrote:
> Qian Zhang (张谦) reported a potential socket buffer overflow in
> tipc_msg_build() which is also known as CVE-2016-8632: due to
> insufficient checks, a buffer overflow can occur if MTU is too short for
> even tipc headers. As anyone can set device MTU in a user/net namespace,
> this issue can be abused by a regular user.
>
> As agreed in the discussion on Ben Hutchings' original patch, we should
> check the MTU at the moment a bearer is attached rather than for each
> processed packet. We also need to repeat the check when bearer MTU is
> adjusted to new device MTU. UDP case also needs a check to avoid
> overflow when calculating bearer MTU.
>
> Fixes: b97bf3fd8f6a ("[TIPC] Initial merge")
> Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
> Reported-by: Qian Zhang (张谦) <zhangqian-c@360.cn>
> ---
Thanks, it looks nice to me.
Acked-by: Ying Xue <ying.xue@windriver.com>
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
tipc-discussion mailing list
tipc-discussion@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/tipc-discussion
^ permalink raw reply
* Re: Initial thoughts on TXDP
From: Jesper Dangaard Brouer @ 2016-12-02 12:13 UTC (permalink / raw)
To: Tom Herbert
Cc: brouer, Florian Westphal, Linux Kernel Network Developers,
linux-mm
In-Reply-To: <CALx6S36ywu3ruY7AFKYk=N4Ekr5zjY33ivx92EgNNT36XoXhFA@mail.gmail.com>
On Thu, 1 Dec 2016 11:51:42 -0800 Tom Herbert <tom@herbertland.com> wrote:
> On Wed, Nov 30, 2016 at 6:44 PM, Florian Westphal <fw@strlen.de> wrote:
> > Tom Herbert <tom@herbertland.com> wrote:
[...]
> >> - Call into TCP/IP stack with page data directly from driver-- no
> >> skbuff allocation or interface. This is essentially provided by the
> >> XDP API although we would need to generalize the interface to call
> >> stack functions (I previously posted patches for that). We will also
> >> need a new action, XDP_HELD?, that indicates the XDP function held the
> >> packet (put on a socket for instance).
> >
> > Seems this will not work at all with the planned page pool thing when
> > pages start to be held indefinitely.
It is quite the opposite: the page pool supports pages being held
for longer times than drivers do today. The current driver page recycle
tricks cannot, as they depend on the page refcnt being decremented quickly
(while pages are still mapped in their recycle queue).
> > You can also never get even close to userspace offload stacks once you
> > need/do this; allocations in hotpath are too expensive.
Yes. It is important to understand that once the number of outstanding
pages gets large, the driver recycle stops working. Meaning the page
allocations start to go through the page allocator. I've documented[1]
that the bare alloc+free cost[2] (231 cycles order-0/4K) is higher than
the 10G wirespeed budget (201 cycles).
Thus, the driver recycle tricks are nice for benchmarking, as they hide
the page allocator overhead. But this optimization might disappear for
Tom's and Eric's more real-world use-cases e.g. like 10.000 sockets.
The page pool doesn't have these issues.
[1] http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf
[2] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/mm/bench/page_bench01.c
--
Best regards,
Jesper Dangaard Brouer
MSc.CS, Principal Kernel Engineer at Red Hat
LinkedIn: http://www.linkedin.com/in/brouer
^ permalink raw reply
* Re: [PATCH v2] sh_eth: remove unchecked interrupts
From: Geert Uytterhoeven @ 2016-12-02 12:18 UTC (permalink / raw)
To: Chris Brandt
Cc: Sergei Shtylyov, David Miller, Simon Horman, Geert Uytterhoeven,
netdev@vger.kernel.org, Linux-Renesas
In-Reply-To: <SG2PR06MB1165C7D7B7DFBAC4B5BAB09A8A8F0@SG2PR06MB1165.apcprd06.prod.outlook.com>
Hi Chris,
On Thu, Dec 1, 2016 at 7:53 PM, Chris Brandt <Chris.Brandt@renesas.com> wrote:
> On 12/1/2016, Sergei Shtylyov wrote:
>>
>> On 12/01/2016 05:42 PM, Geert Uytterhoeven wrote:
>>
>> >> --- a/drivers/net/ethernet/renesas/sh_eth.c
>> >> +++ b/drivers/net/ethernet/renesas/sh_eth.c
>> >> @@ -518,7 +518,7 @@ static struct sh_eth_cpu_data r7s72100_data = {
>> >>
>> >> .ecsr_value = ECSR_ICD,
>> >> .ecsipr_value = ECSIPR_ICDIP,
>> >> - .eesipr_value = 0xff7f009f,
>> >> + .eesipr_value = 0xe77f009f,
>> >
>> > Comment not directly related to the merits of this patch: the EESIPR
>> > bit definitions seem to be identical to those for EESR, so we can get
>> > rid of the hardcoded values here?
>>
>> Do you mean using the #define's? We have EESIPR bits also declared,
>> see enum DMAC_IM_BIT,
Yes, that's what I meant.
Unfortunately the DMAC_IM_BIT enum doesn't cover all bits.
> Is your idea to get rid of .eesipr_value for devices that have values
> that are the same as .eesr_err_check?
>
>
> For example in sh_eth_dev_init():
>
> sh_eth_modify(ndev, EESR, 0, 0);
> mdp->irq_enabled = true;
> - sh_eth_write(ndev, mdp->cd->eesipr_value, EESIPR);
> + if (mdp->cd->eesipr_value)
> + sh_eth_write(ndev, mdp->cd->eesipr_value, EESIPR);
> + else
> + sh_eth_write(ndev, mdp->cd->eesr_err_check, EESIPR);
No, my intention was to just get rid of the hardcoded values when
initializing .eesipr_value.
Gr{oetje,eeting}s,
Geert
--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox