Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH iproute2 V5 0/3] tc: Support for ip tunnel metadata set/unset/classify
From: Amir Vadai @ 2016-12-02 11:25 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, David S. Miller, Or Gerlitz, Hadar Har-Zion, Roi Dayan,
	Amir Vadai

Hi,

This short series adds support for matching and setting metadata for ip tunnel
shared device using the TC system, introduced in kernel 4.9 [1].

Applied and tested on top of commit b6c7fc61faab ("ss: print new tcp_info
fields: busy, rwnd-limited, sndbuf-limited times")


Example usage:

$ tc filter add dev vxlan0 protocol ip parent ffff: \
    flower \
      enc_src_ip 11.11.0.2 \
      enc_dst_ip 11.11.0.1 \
      enc_key_id 11 \
      dst_ip 11.11.11.1 \
    action mirred egress redirect dev vnet0

$ tc filter add dev net0 protocol ip parent ffff: \
    flower \
      ip_proto 1 \
      dst_ip 11.11.11.2 \
    action tunnel_key set \
      src_ip 11.11.0.1 \
      dst_ip 11.11.0.2 \
      id 11 \
    action mirred egress redirect dev vxlan0

[1] - d1ba24feb466 ("Merge branch 'act_tunnel_key'")

Thanks,
Amir

Changes from V4:
- Fix rebase conflicts for net-next

Changes from V3:
- Fix bad wording in the man page about the use of the 'unset' operation

Changes from V2:
- Use const where needed
- Don't lose return value
- Introduce rta_getattr_be16() and rta_getattr_be32()

Changes from V1:
- Updated Patch 2/2 ("tc/act_tunnel: Introduce ip tunnel action") commit log
	and the man page tc-tunnel_key to reflect the fact that the 'unset'
	operation is not mandatory, and to describe when it might be needed.
- Rename the 'release' operation to 'unset'

Amir Vadai (3):
  libnetlink: Introduce rta_getattr_be*()
  tc/cls_flower: Classify packet in ip tunnels
  tc/act_tunnel: Introduce ip tunnel action

Amir Vadai (3):
  libnetlink: Introduce rta_getattr_be*()
  tc/cls_flower: Classify packet in ip tunnels
  tc/act_tunnel: Introduce ip tunnel action

 bridge/fdb.c                         |   4 +-
 include/libnetlink.h                 |   9 ++
 include/linux/tc_act/tc_tunnel_key.h |  42 ++++++
 ip/iplink_geneve.c                   |   2 +-
 ip/iplink_vxlan.c                    |   2 +-
 man/man8/tc-flower.8                 |  17 ++-
 man/man8/tc-tunnel_key.8             | 112 +++++++++++++++
 tc/Makefile                          |   1 +
 tc/f_flower.c                        |  84 +++++++++++-
 tc/m_tunnel_key.c                    | 258 +++++++++++++++++++++++++++++++++++
 10 files changed, 522 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

-- 
2.10.2

^ permalink raw reply

* [PATCH iproute2 V5 1/3] libnetlink: Introduce rta_getattr_be*()
From: Amir Vadai @ 2016-12-02 11:25 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, David S. Miller, Or Gerlitz, Hadar Har-Zion, Roi Dayan,
	Amir Vadai
In-Reply-To: <20161202112515.11705-1-amir@vadai.me>

Add the utility functions rta_getattr_be16() and rta_getattr_be32(), and
change existing code to use them.

Signed-off-by: Amir Vadai <amir@vadai.me>
---
 bridge/fdb.c         | 4 ++--
 include/libnetlink.h | 9 +++++++++
 ip/iplink_geneve.c   | 2 +-
 ip/iplink_vxlan.c    | 2 +-
 tc/f_flower.c        | 2 +-
 5 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index 90f4b154c5dc..a91521776e99 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -168,10 +168,10 @@ int print_fdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 	if (tb[NDA_PORT]) {
 		if (jw_global)
 			jsonw_uint_field(jw_global, "port",
-					 ntohs(rta_getattr_u16(tb[NDA_PORT])));
+					 rta_getattr_be16(tb[NDA_PORT]));
 		else
 			fprintf(fp, "port %d ",
-				ntohs(rta_getattr_u16(tb[NDA_PORT])));
+				rta_getattr_be16(tb[NDA_PORT]));
 	}
 
 	if (tb[NDA_VNI]) {
diff --git a/include/libnetlink.h b/include/libnetlink.h
index 483509ca9635..751ebf186dd4 100644
--- a/include/libnetlink.h
+++ b/include/libnetlink.h
@@ -10,6 +10,7 @@
 #include <linux/if_addr.h>
 #include <linux/neighbour.h>
 #include <linux/netconf.h>
+#include <arpa/inet.h>
 
 struct rtnl_handle {
 	int			fd;
@@ -140,10 +141,18 @@ static inline __u16 rta_getattr_u16(const struct rtattr *rta)
 {
 	return *(__u16 *)RTA_DATA(rta);
 }
+static inline __be16 rta_getattr_be16(const struct rtattr *rta)
+{
+	return ntohs(rta_getattr_u16(rta));
+}
 static inline __u32 rta_getattr_u32(const struct rtattr *rta)
 {
 	return *(__u32 *)RTA_DATA(rta);
 }
+static inline __be32 rta_getattr_be32(const struct rtattr *rta)
+{
+	return ntohl(rta_getattr_u32(rta));
+}
 static inline __u64 rta_getattr_u64(const struct rtattr *rta)
 {
 	__u64 tmp;
diff --git a/ip/iplink_geneve.c b/ip/iplink_geneve.c
index 3bfba91c644c..1e6669d07d60 100644
--- a/ip/iplink_geneve.c
+++ b/ip/iplink_geneve.c
@@ -234,7 +234,7 @@ static void geneve_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
 
 	if (tb[IFLA_GENEVE_PORT])
 		fprintf(f, "dstport %u ",
-			ntohs(rta_getattr_u16(tb[IFLA_GENEVE_PORT])));
+			rta_getattr_be16(tb[IFLA_GENEVE_PORT]));
 
 	if (tb[IFLA_GENEVE_COLLECT_METADATA])
 		fputs("external ", f);
diff --git a/ip/iplink_vxlan.c b/ip/iplink_vxlan.c
index 93af979a1e97..6d02bb47b2f0 100644
--- a/ip/iplink_vxlan.c
+++ b/ip/iplink_vxlan.c
@@ -413,7 +413,7 @@ static void vxlan_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
 
 	if (tb[IFLA_VXLAN_PORT])
 		fprintf(f, "dstport %u ",
-			ntohs(rta_getattr_u16(tb[IFLA_VXLAN_PORT])));
+			rta_getattr_be16(tb[IFLA_VXLAN_PORT]));
 
 	if (tb[IFLA_VXLAN_LEARNING] &&
 	    !rta_getattr_u8(tb[IFLA_VXLAN_LEARNING]))
diff --git a/tc/f_flower.c b/tc/f_flower.c
index 1555764b9996..e132974e0d1d 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -511,7 +511,7 @@ static void flower_print_ip_addr(FILE *f, char *name, __be16 eth_type,
 
 static void flower_print_port(FILE *f, char *name, struct rtattr *attr)
 {
-	fprintf(f, "\n  %s %d", name, ntohs(rta_getattr_u16(attr)));
+	fprintf(f, "\n  %s %d", name, rta_getattr_be16(attr));
 }
 
 static int flower_print_opt(struct filter_util *qu, FILE *f,
-- 
2.10.2

^ permalink raw reply related

* [PATCH iproute2 V5 2/3] tc/cls_flower: Classify packet in ip tunnels
From: Amir Vadai @ 2016-12-02 11:25 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, David S. Miller, Or Gerlitz, Hadar Har-Zion, Roi Dayan,
	Amir Vadai
In-Reply-To: <20161202112515.11705-1-amir@vadai.me>

Introduce classifying by metadata extracted by the tunnel device.
Outer header fields - source/dest ip and tunnel id, are extracted from
the metadata when classifying.

For example, the following will add a filter on the ingress Qdisc of a shared
vxlan device named 'vxlan0', matching packets with outer src ip
11.11.0.2, dst ip 11.11.0.1 and tunnel id 11. The packets will be
forwarded to tap device 'vnet0':

$ tc filter add dev vxlan0 protocol ip parent ffff: \
    flower \
      enc_src_ip 11.11.0.2 \
      enc_dst_ip 11.11.0.1 \
      enc_key_id 11 \
      dst_ip 11.11.11.1 \
    action mirred egress redirect dev vnet0

Signed-off-by: Amir Vadai <amir@vadai.me>
---
 man/man8/tc-flower.8 | 17 ++++++++++-
 tc/f_flower.c        | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 95 insertions(+), 4 deletions(-)

diff --git a/man/man8/tc-flower.8 b/man/man8/tc-flower.8
index 16ef261797ab..dd3564917dcc 100644
--- a/man/man8/tc-flower.8
+++ b/man/man8/tc-flower.8
@@ -34,7 +34,11 @@ flower \- flow based traffic control filter
 .BR dst_ip " | " src_ip " } { "
 .IR ipv4_address " | " ipv6_address " } | { "
 .BR dst_port " | " src_port " } "
-.IR port_number " }"
+.IR port_number " } | "
+.B enc_key_id
+.IR KEY-ID " | {"
+.BR enc_dst_ip " | " enc_src_ip " } { "
+.IR ipv4_address " | " ipv6_address " } | "
 .SH DESCRIPTION
 The
 .B flower
@@ -112,6 +116,17 @@ which has to be specified in beforehand.
 Match on layer 4 protocol source or destination port number. Only available for
 .BR ip_proto " values " udp " and " tcp ,
 which has to be specified in beforehand.
+.TP
+.BI enc_key_id " NUMBER"
+.TQ
+.BI enc_dst_ip " ADDRESS"
+.TQ
+.BI enc_src_ip " ADDRESS"
+Match on IP tunnel metadata. Key id
+.I NUMBER
+is a 32 bit tunnel key id (e.g. VNI for VXLAN tunnel).
+.I ADDRESS
+must be a valid IPv4 or IPv6 address.
 .SH NOTES
 As stated above where applicable, matches of a certain layer implicitly depend
 on the matches of the next lower layer. Precisely, layer one and two matches (
diff --git a/tc/f_flower.c b/tc/f_flower.c
index e132974e0d1d..7e7f4c92a947 100644
--- a/tc/f_flower.c
+++ b/tc/f_flower.c
@@ -41,7 +41,10 @@ static void explain(void)
 		"                       dst_ip [ IPV4-ADDR | IPV6-ADDR ] |\n"
 		"                       src_ip [ IPV4-ADDR | IPV6-ADDR ] |\n"
 		"                       dst_port PORT-NUMBER |\n"
-		"                       src_port PORT-NUMBER }\n"
+		"                       src_port PORT-NUMBER |\n"
+		"                       enc_dst_ip [ IPV4-ADDR | IPV6-ADDR ] |\n"
+		"                       enc_src_ip [ IPV4-ADDR | IPV6-ADDR ] |\n"
+		"                       enc_key_id [ KEY-ID ] }\n"
 		"       FILTERID := X:Y:Z\n"
 		"       ACTION-SPEC := ... look at individual actions\n"
 		"\n"
@@ -125,8 +128,9 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
 		family = AF_INET;
 	} else if (eth_type == htons(ETH_P_IPV6)) {
 		family = AF_INET6;
+	} else if (!eth_type) {
+		family = AF_UNSPEC;
 	} else {
-		fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
 		return -1;
 	}
 
@@ -134,8 +138,10 @@ static int flower_parse_ip_addr(char *str, __be16 eth_type,
 	if (ret)
 		return -1;
 
-	if (addr.family != family)
+	if (family && (addr.family != family)) {
+		fprintf(stderr, "Illegal \"eth_type\" for ip address\n");
 		return -1;
+	}
 
 	addattr_l(n, MAX_MSG, addr.family == AF_INET ? addr4_type : addr6_type,
 		  addr.data, addr.bytelen);
@@ -197,6 +203,18 @@ static int flower_parse_port(char *str, __u8 ip_port, bool is_src,
 	return 0;
 }
 
+static int flower_parse_key_id(const char *str, int type, struct nlmsghdr *n)
+{
+	int ret;
+	__be32 key_id;
+
+	ret = get_be32(&key_id, str, 10);
+	if (!ret)
+		addattr32(n, MAX_MSG, type, key_id);
+
+	return ret;
+}
+
 static int flower_parse_opt(struct filter_util *qu, char *handle,
 			    int argc, char **argv, struct nlmsghdr *n)
 {
@@ -354,6 +372,38 @@ static int flower_parse_opt(struct filter_util *qu, char *handle,
 				fprintf(stderr, "Illegal \"src_port\"\n");
 				return -1;
 			}
+		} else if (matches(*argv, "enc_dst_ip") == 0) {
+			NEXT_ARG();
+			ret = flower_parse_ip_addr(*argv, 0,
+						   TCA_FLOWER_KEY_ENC_IPV4_DST,
+						   TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,
+						   TCA_FLOWER_KEY_ENC_IPV6_DST,
+						   TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,
+						   n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"enc_dst_ip\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "enc_src_ip") == 0) {
+			NEXT_ARG();
+			ret = flower_parse_ip_addr(*argv, 0,
+						   TCA_FLOWER_KEY_ENC_IPV4_SRC,
+						   TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,
+						   TCA_FLOWER_KEY_ENC_IPV6_SRC,
+						   TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,
+						   n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"enc_src_ip\"\n");
+				return -1;
+			}
+		} else if (matches(*argv, "enc_key_id") == 0) {
+			NEXT_ARG();
+			ret = flower_parse_key_id(*argv,
+						  TCA_FLOWER_KEY_ENC_KEY_ID, n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"enc_key_id\"\n");
+				return -1;
+			}
 		} else if (matches(*argv, "action") == 0) {
 			NEXT_ARG();
 			ret = parse_action(&argc, &argv, TCA_FLOWER_ACT, n);
@@ -514,6 +564,13 @@ static void flower_print_port(FILE *f, char *name, struct rtattr *attr)
 	fprintf(f, "\n  %s %d", name, rta_getattr_be16(attr));
 }
 
+static void flower_print_key_id(FILE *f, const char *name,
+				struct rtattr *attr)
+{
+	if (attr)
+		fprintf(f, "\n  %s %d", name, rta_getattr_be32(attr));
+}
+
 static int flower_print_opt(struct filter_util *qu, FILE *f,
 			    struct rtattr *opt, __u32 handle)
 {
@@ -579,6 +636,25 @@ static int flower_print_opt(struct filter_util *qu, FILE *f,
 	flower_print_port(f, "src_port",
 			  tb[flower_port_attr_type(ip_proto, true)]);
 
+	flower_print_ip_addr(f, "enc_dst_ip",
+			     tb[TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] ?
+			     htons(ETH_P_IP) : htons(ETH_P_IPV6),
+			     tb[TCA_FLOWER_KEY_ENC_IPV4_DST],
+			     tb[TCA_FLOWER_KEY_ENC_IPV4_DST_MASK],
+			     tb[TCA_FLOWER_KEY_ENC_IPV6_DST],
+			     tb[TCA_FLOWER_KEY_ENC_IPV6_DST_MASK]);
+
+	flower_print_ip_addr(f, "enc_src_ip",
+			     tb[TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] ?
+			     htons(ETH_P_IP) : htons(ETH_P_IPV6),
+			     tb[TCA_FLOWER_KEY_ENC_IPV4_SRC],
+			     tb[TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK],
+			     tb[TCA_FLOWER_KEY_ENC_IPV6_SRC],
+			     tb[TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK]);
+
+	flower_print_key_id(f, "enc_key_id",
+			    tb[TCA_FLOWER_KEY_ENC_KEY_ID]);
+
 	if (tb[TCA_FLOWER_FLAGS]) {
 		__u32 flags = rta_getattr_u32(tb[TCA_FLOWER_FLAGS]);
 
-- 
2.10.2

^ permalink raw reply related

* [PATCH iproute2 V5 3/3] tc/act_tunnel: Introduce ip tunnel action
From: Amir Vadai @ 2016-12-02 11:25 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, David S. Miller, Or Gerlitz, Hadar Har-Zion, Roi Dayan,
	Amir Vadai
In-Reply-To: <20161202112515.11705-1-amir@vadai.me>

This action could be used before redirecting packets to a shared tunnel
device, or when redirecting packets arriving from such a device.

The 'unset' action is optional. It is used to explicitly unset the
metadata created by the tunnel device during decap. If not used, the
metadata will be released automatically by the kernel.
The 'set' operation will set the metadata with the specified values for
the encap.

For example, the following flower filter will forward all ICMP packets
destined to 11.11.11.2 through the shared vxlan device 'vxlan0'. Before
redirecting, metadata for the vxlan tunnel is created using the
tunnel_key action and its arguments:

$ tc filter add dev net0 protocol ip parent ffff: \
    flower \
      ip_proto 1 \
      dst_ip 11.11.11.2 \
    action tunnel_key set \
      src_ip 11.11.0.1 \
      dst_ip 11.11.0.2 \
      id 11 \
    action mirred egress redirect dev vxlan0

Signed-off-by: Amir Vadai <amir@vadai.me>
---
 include/linux/tc_act/tc_tunnel_key.h |  42 ++++++
 man/man8/tc-tunnel_key.8             | 112 +++++++++++++++
 tc/Makefile                          |   1 +
 tc/m_tunnel_key.c                    | 258 +++++++++++++++++++++++++++++++++++
 4 files changed, 413 insertions(+)
 create mode 100644 include/linux/tc_act/tc_tunnel_key.h
 create mode 100644 man/man8/tc-tunnel_key.8
 create mode 100644 tc/m_tunnel_key.c

diff --git a/include/linux/tc_act/tc_tunnel_key.h b/include/linux/tc_act/tc_tunnel_key.h
new file mode 100644
index 000000000000..f9ddf5369a45
--- /dev/null
+++ b/include/linux/tc_act/tc_tunnel_key.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2016, Amir Vadai <amir@vadai.me>
+ * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_TC_TUNNEL_KEY_H
+#define __LINUX_TC_TUNNEL_KEY_H
+
+#include <linux/pkt_cls.h>
+
+#define TCA_ACT_TUNNEL_KEY 17
+
+#define TCA_TUNNEL_KEY_ACT_SET	    1
+#define TCA_TUNNEL_KEY_ACT_RELEASE  2
+
+struct tc_tunnel_key {
+	tc_gen;
+	int t_action;
+};
+
+enum {
+	TCA_TUNNEL_KEY_UNSPEC,
+	TCA_TUNNEL_KEY_TM,
+	TCA_TUNNEL_KEY_PARMS,
+	TCA_TUNNEL_KEY_ENC_IPV4_SRC,	/* be32 */
+	TCA_TUNNEL_KEY_ENC_IPV4_DST,	/* be32 */
+	TCA_TUNNEL_KEY_ENC_IPV6_SRC,	/* struct in6_addr */
+	TCA_TUNNEL_KEY_ENC_IPV6_DST,	/* struct in6_addr */
+	TCA_TUNNEL_KEY_ENC_KEY_ID,	/* be64 */
+	TCA_TUNNEL_KEY_PAD,
+	__TCA_TUNNEL_KEY_MAX,
+};
+
+#define TCA_TUNNEL_KEY_MAX (__TCA_TUNNEL_KEY_MAX - 1)
+
+#endif
+
diff --git a/man/man8/tc-tunnel_key.8 b/man/man8/tc-tunnel_key.8
new file mode 100644
index 000000000000..17b15b9b34b9
--- /dev/null
+++ b/man/man8/tc-tunnel_key.8
@@ -0,0 +1,112 @@
+.TH "Tunnel metadata manipulation action in tc" 8 "10 Nov 2016" "iproute2" "Linux"
+
+.SH NAME
+tunnel_key - Tunnel metadata manipulation
+.SH SYNOPSIS
+.in +8
+.ti -8
+.BR tc " ... " "action tunnel_key" " { " unset " | "
+.IR SET " }"
+
+.ti -8
+.IR SET " := "
+.BR set " " src_ip
+.IR ADDRESS
+.BR dst_ip
+.IR ADDRESS
+.BI id " KEY_ID"
+
+.SH DESCRIPTION
+The
+.B tunnel_key
+action combined with a shared IP tunnel device, allows to perform IP tunnel en-
+or decapsulation on a packet, reflected by
+the operation modes
+.IR UNSET " and " SET .
+The
+.I UNSET
+mode is optional - even without using it, the metadata information will be
+released automatically when packet processing will be finished.
+.IR UNSET
+function could be used in cases when traffic is forwarded between two tunnels,
+where the metadata from the first tunnel will be used for encapsulation done by
+the second tunnel.
+.IR SET
+mode requires the source and destination ip
+.I ADDRESS
+and the tunnel key id
+.I KEY_ID
+which will be used by the ip tunnel shared device to create the tunnel header. The
+.B tunnel_key
+action is useful only in combination with a
+.B mirred redirect
+action to a shared IP tunnel device which will use the metadata (for
+.I SET
+) and unset the metadata created by it (for
+.I UNSET
+).
+
+.SH OPTIONS
+.TP
+.B unset
+Unset the tunnel metadata created by the IP tunnel device.  This function is
+not mandatory and might be used only in some specific use cases (as explained
+above).
+.TP
+.B set
+Set tunnel metadata to be used by the IP tunnel device. Requires
+.B id
+,
+.B src_ip
+and
+.B dst_ip
+options.
+.RS
+.TP
+.B id
+Tunnel ID (for example VNI in VXLAN tunnel)
+.TP
+.B src_ip
+Outer header source IP address (IPv4 or IPv6)
+.TP
+.B dst_ip
+Outer header destination IP address (IPv4 or IPv6)
+.RE
+.SH EXAMPLES
+The following example encapsulates incoming ICMP packets on eth0 into a vxlan
+tunnel, by setting metadata to VNI 11, source IP 11.11.0.1 and destination IP
+11.11.0.2, and by redirecting the packet with the metadata to device vxlan0,
+which will do the actual encapsulation using the metadata:
+
+.RS
+.EX
+#tc qdisc add dev eth0 handle ffff: ingress
+#tc filter add dev eth0 protocol ip parent ffff: \\
+  flower \\
+    ip_proto icmp \\
+  action tunnel_key set \\
+    src_ip 11.11.0.1 \\
+    dst_ip 11.11.0.2 \\
+    id 11 \\
+  action mirred egress redirect dev vxlan0
+.EE
+.RE
+
+Here is an example of the
+.B unset
+function: Incoming VXLAN traffic with outer IP's and VNI 11 is decapsulated by
+vxlan0 and metadata is unset before redirecting to tunl1 device:
+
+.RS
+.EX
+#tc qdisc add dev eth0 handle ffff: ingress
+#tc filter add dev vxlan0 protocol ip parent ffff: \\
+  flower \\
+	  enc_src_ip 11.11.0.2 enc_dst_ip 11.11.0.1 enc_key_id 11 \\
+	action tunnel_key unset \\
+	action mirred egress redirect dev tunl1
+.EE
+.RE
+
+.SH SEE ALSO
+.BR tc (8)
diff --git a/tc/Makefile b/tc/Makefile
index f986fcb9e9fd..bb9011432ea1 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -50,6 +50,7 @@ TCMODULES += m_simple.o
 TCMODULES += m_vlan.o
 TCMODULES += m_connmark.o
 TCMODULES += m_bpf.o
+TCMODULES += m_tunnel_key.o
 TCMODULES += p_ip.o
 TCMODULES += p_icmp.o
 TCMODULES += p_tcp.o
diff --git a/tc/m_tunnel_key.c b/tc/m_tunnel_key.c
new file mode 100644
index 000000000000..f4a20e24e0bf
--- /dev/null
+++ b/tc/m_tunnel_key.c
@@ -0,0 +1,258 @@
+/*
+ * m_tunnel_key.c	ip tunnel manipulation module
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Authors:     Amir Vadai <amir@vadai.me>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <linux/if_ether.h>
+#include "utils.h"
+#include "rt_names.h"
+#include "tc_util.h"
+#include <linux/tc_act/tc_tunnel_key.h>
+
+static void explain(void)
+{
+	fprintf(stderr, "Usage: tunnel_key unset\n");
+	fprintf(stderr, "       tunnel_key set id TUNNELID src_ip IP dst_ip IP\n");
+}
+
+static void usage(void)
+{
+	explain();
+	exit(-1);
+}
+
+static int tunnel_key_parse_ip_addr(const char *str, int addr4_type,
+				    int addr6_type, struct nlmsghdr *n)
+{
+	inet_prefix addr;
+	int ret;
+
+	ret = get_addr(&addr, str, AF_UNSPEC);
+	if (ret)
+		return ret;
+
+	addattr_l(n, MAX_MSG, addr.family == AF_INET ? addr4_type : addr6_type,
+		  addr.data, addr.bytelen);
+
+	return 0;
+}
+
+static int tunnel_key_parse_key_id(const char *str, int type,
+				   struct nlmsghdr *n)
+{
+	__be32 key_id;
+	int ret;
+
+	ret = get_be32(&key_id, str, 10);
+	if (!ret)
+		addattr32(n, MAX_MSG, type, key_id);
+
+	return ret;
+}
+
+static int parse_tunnel_key(struct action_util *a, int *argc_p, char ***argv_p,
+			    int tca_id, struct nlmsghdr *n)
+{
+	struct tc_tunnel_key parm = { .action = TC_ACT_PIPE };
+	char **argv = *argv_p;
+	int argc = *argc_p;
+	struct rtattr *tail;
+	int action = 0;
+	int ret;
+	int has_src_ip = 0;
+	int has_dst_ip = 0;
+	int has_key_id = 0;
+
+	if (matches(*argv, "tunnel_key") != 0)
+		return -1;
+
+	tail = NLMSG_TAIL(n);
+	addattr_l(n, MAX_MSG, tca_id, NULL, 0);
+
+	NEXT_ARG();
+
+	while (argc > 0) {
+		if (matches(*argv, "unset") == 0) {
+			if (action) {
+				fprintf(stderr, "unexpected \"%s\" - action already specified\n",
+					*argv);
+				explain();
+				return -1;
+			}
+			action = TCA_TUNNEL_KEY_ACT_RELEASE;
+		} else if (matches(*argv, "set") == 0) {
+			if (action) {
+				fprintf(stderr, "unexpected \"%s\" - action already specified\n",
+					*argv);
+				explain();
+				return -1;
+			}
+			action = TCA_TUNNEL_KEY_ACT_SET;
+		} else if (matches(*argv, "src_ip") == 0) {
+			NEXT_ARG();
+			ret = tunnel_key_parse_ip_addr(*argv,
+						       TCA_TUNNEL_KEY_ENC_IPV4_SRC,
+						       TCA_TUNNEL_KEY_ENC_IPV6_SRC,
+						       n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"src_ip\"\n");
+				return -1;
+			}
+			has_src_ip = 1;
+		} else if (matches(*argv, "dst_ip") == 0) {
+			NEXT_ARG();
+			ret = tunnel_key_parse_ip_addr(*argv,
+						       TCA_TUNNEL_KEY_ENC_IPV4_DST,
+						       TCA_TUNNEL_KEY_ENC_IPV6_DST,
+						       n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"dst_ip\"\n");
+				return -1;
+			}
+			has_dst_ip = 1;
+		} else if (matches(*argv, "id") == 0) {
+			NEXT_ARG();
+			ret = tunnel_key_parse_key_id(*argv, TCA_TUNNEL_KEY_ENC_KEY_ID, n);
+			if (ret < 0) {
+				fprintf(stderr, "Illegal \"id\"\n");
+				return -1;
+			}
+			has_key_id = 1;
+		} else if (matches(*argv, "help") == 0) {
+			usage();
+		} else {
+			break;
+		}
+		NEXT_ARG_FWD();
+	}
+
+	if (argc && !action_a2n(*argv, &parm.action, false))
+		NEXT_ARG_FWD();
+
+	if (argc) {
+		if (matches(*argv, "index") == 0) {
+			NEXT_ARG();
+			if (get_u32(&parm.index, *argv, 10)) {
+				fprintf(stderr, "tunnel_key: Illegal \"index\"\n");
+				return -1;
+			}
+
+			NEXT_ARG_FWD();
+		}
+	}
+
+	if (action == TCA_TUNNEL_KEY_ACT_SET &&
+	    (!has_src_ip || !has_dst_ip || !has_key_id)) {
+		fprintf(stderr, "set needs tunnel_key parameters\n");
+		explain();
+		return -1;
+	}
+
+	parm.t_action = action;
+	addattr_l(n, MAX_MSG, TCA_TUNNEL_KEY_PARMS, &parm, sizeof(parm));
+	tail->rta_len = (char *)NLMSG_TAIL(n) - (char *)tail;
+
+	*argc_p = argc;
+	*argv_p = argv;
+
+	return 0;
+}
+
+static void tunnel_key_print_ip_addr(FILE *f, const char *name,
+				     struct rtattr *attr)
+{
+	int family;
+	size_t len;
+
+	if (!attr)
+		return;
+
+	len = RTA_PAYLOAD(attr);
+
+	if (len == 4)
+		family = AF_INET;
+	else if (len == 16)
+		family = AF_INET6;
+	else
+		return;
+
+	fprintf(f, "\n\t%s %s", name, rt_addr_n2a_rta(family, attr));
+}
+
+static void tunnel_key_print_key_id(FILE *f, const char *name,
+				    struct rtattr *attr)
+{
+	if (!attr)
+		return;
+	fprintf(f, "\n\t%s %d", name, rta_getattr_be32(attr));
+}
+
+static int print_tunnel_key(struct action_util *au, FILE *f, struct rtattr *arg)
+{
+	struct rtattr *tb[TCA_TUNNEL_KEY_MAX + 1];
+	struct tc_tunnel_key *parm;
+
+	if (!arg)
+		return -1;
+
+	parse_rtattr_nested(tb, TCA_TUNNEL_KEY_MAX, arg);
+
+	if (!tb[TCA_TUNNEL_KEY_PARMS]) {
+		fprintf(f, "[NULL tunnel_key parameters]");
+		return -1;
+	}
+	parm = RTA_DATA(tb[TCA_TUNNEL_KEY_PARMS]);
+
+	fprintf(f, "tunnel_key");
+
+	switch (parm->t_action) {
+	case TCA_TUNNEL_KEY_ACT_RELEASE:
+		fprintf(f, " unset");
+		break;
+	case TCA_TUNNEL_KEY_ACT_SET:
+		fprintf(f, " set");
+		tunnel_key_print_ip_addr(f, "src_ip",
+					 tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
+		tunnel_key_print_ip_addr(f, "dst_ip",
+					 tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
+		tunnel_key_print_ip_addr(f, "src_ip",
+					 tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
+		tunnel_key_print_ip_addr(f, "dst_ip",
+					 tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
+		tunnel_key_print_key_id(f, "key_id",
+					tb[TCA_TUNNEL_KEY_ENC_KEY_ID]);
+		break;
+	}
+	fprintf(f, " %s", action_n2a(parm->action));
+
+	fprintf(f, "\n\tindex %d ref %d bind %d", parm->index, parm->refcnt,
+		parm->bindcnt);
+
+	if (show_stats) {
+		if (tb[TCA_TUNNEL_KEY_TM]) {
+			struct tcf_t *tm = RTA_DATA(tb[TCA_TUNNEL_KEY_TM]);
+
+			print_tm(f, tm);
+		}
+	}
+
+	fprintf(f, "\n ");
+
+	return 0;
+}
+
+struct action_util tunnel_key_action_util = {
+	.id = "tunnel_key",
+	.parse_aopt = parse_tunnel_key,
+	.print_aopt = print_tunnel_key,
+};
-- 
2.10.2

^ permalink raw reply related

* Re: [PATCH 5/7] Documentation: DT: net: cpsw: allow to specify descriptors pool size
From: Ivan Khoronzhuk @ 2016-12-02 11:28 UTC (permalink / raw)
  To: Grygorii Strashko
  Cc: David S. Miller, netdev, Mugunthan V N, Sekhar Nori, linux-kernel,
	linux-omap
In-Reply-To: <20161201233432.6182-6-grygorii.strashko@ti.com>

On Thu, Dec 01, 2016 at 05:34:30PM -0600, Grygorii Strashko wrote:
> Add optional property "descs_pool_size" to specify buffer descriptor's
> pool size. The "descs_pool_size" should define total number of CPDMA
> CPPI descriptors to be used for both ingress/egress packets
> processing. If not specified - the default value 256 will be used
> which will allow to place descriptor's pool into the internal CPPI
> RAM on most of TI SoC.
> 
> Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
> ---
>  Documentation/devicetree/bindings/net/cpsw.txt | 5 +++++
>  1 file changed, 5 insertions(+)
> 
> diff --git a/Documentation/devicetree/bindings/net/cpsw.txt b/Documentation/devicetree/bindings/net/cpsw.txt
> index 5ad439f..b99d196 100644
> --- a/Documentation/devicetree/bindings/net/cpsw.txt
> +++ b/Documentation/devicetree/bindings/net/cpsw.txt
> @@ -35,6 +35,11 @@ Optional properties:
>  			  For example in dra72x-evm, pcf gpio has to be
>  			  driven low so that cpsw slave 0 and phy data
>  			  lines are connected via mux.
> +- descs_pool_size	: total number of CPDMA CPPI descriptors to be used for
> +			  both ingress/egress packets processing. if not
> +			  specified the default value 256 will be used which
> +			  will allow to place descriptors pool into the
> +			  internal CPPI RAM.
Does it describe h/w? Why not a module parameter, or even something like the
ethtool number of ring entries?

>  
>  Slave Properties:
> -- 
> 2.10.1
> 

^ permalink raw reply

* Re: [flamebait] xdp, well meaning but pointless
From: Hannes Frederic Sowa @ 2016-12-02 11:54 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, Tom Herbert
  Cc: Thomas Graf, Florian Westphal, Linux Kernel Network Developers
In-Reply-To: <20161202112450.1720d33d@redhat.com>

On 02.12.2016 11:24, Jesper Dangaard Brouer wrote:
> On Thu, 1 Dec 2016 13:51:32 -0800
> Tom Herbert <tom@herbertland.com> wrote:
> 
>>>> The technical plenary at the last IETF in Seoul a couple of weeks ago was
>>>> exclusively focussed on DDOS in light of the recent attack against
>>>> Dyn. There were speakers from Cloudflare and Dyn. The Cloudflare
>>>> presentation by Nick Sullivan
>>>> (https://www.ietf.org/proceedings/97/slides/slides-97-ietf-sessb-how-to-stay-online-harsh-realities-of-operating-in-a-hostile-network-nick-sullivan-01.pdf)
>>>> alluded to some implementation of DDOS mitigation. In particular, on
>>>> slide 6 Nick gave some numbers for drop rates in DDOS. The "kernel"
> 
> slide 14
> 
>>>> numbers he gave were based on iptables+BPF and that was a whole
>>>> 1.2Mpps-- somehow that seems ridiculous to me (I said so at the mic
>>>> and that's also when I introduced XDP to the whole IETF :-) ). If that's
>>>> the best we can do the Internet is in a world of hurt. DDOS mitigation
>>>> alone is probably a sufficient motivation to look at XDP. We need
>>>> something that drops bad packets as quickly as possible when under
>>>> attack, we need this to be integrated into the stack, we need it to be
>>>> programmable to deal with the increasing savvy of attackers, and we
>>>> don't want to be forced to be dependent on HW solutions. This is why
>>>> we created XDP!  
> 
> The 1.2Mpps number is a bit low, but we are unfortunately in that
> ballpark.
> 
>>> I totally understand that. But in my reply to David in this thread I
>>> mentioned DNS apex processing as being problematic which is actually
>>> being referred in your linked slide deck on page 9 ("What do floods look
>>> like") and the problematic of parsing DNS packets in XDP due to string
>>> processing and looping inside eBPF.
> 
> That is a weak argument. You do realize CloudFlare actually use eBPF to
> do this exact filtering, and (so-far) eBPF for parsing DNS have been
> sufficient for them.

You are talking about this code on the following slides (I actually
transcribed it for you here and disassembled):

l0:	ld #0x14
l1:	ldxb 4*([0]&0xf)
l2:	add x
l3:	tax
l4:	ld [x+0]
l5:	jeq #0x7657861, l6, l13
l6:	ld [x+4]
l7:	jeq #0x6d706c65, l8, l13
l8:	ld [x+8]
l9:	jeq #0x3636f6d, l10, l13
l10:	ldb [x+12]
l11:	jeq #0, l12, l13
l12:	ret #0x1
l13:	ret #0

You can offload this to u32 in hardware if that is what you want.

The reason this works is because of netfilter, which allows them to
dynamically generate BPF programs and insert and delete them from
chains, do intersection or unions of them.

If you have a freestanding program like in XDP the complexity space is a
different one and not comparable to this at all.

Bye,
Hannes

^ permalink raw reply

* [PATCH iproute2/net-next] ss: initialise variables outside of for loop
From: Simon Horman @ 2016-12-02 11:56 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, Simon Horman

Declare for-loop variables outside of the for loop. GCC flags declarations in
the for-loop initialiser as being out of spec unless C99 or C11 mode is used.

With this change the entire tree appears to compile cleanly with -Wall.

$ gcc --version
gcc (Debian 4.9.2-10) 4.9.2
...
$ make
...
ss.c: In function ‘unix_show_sock’:
ss.c:3128:4: error: ‘for’ loop initial declarations are only allowed in C99 or C11 mode
...

Signed-off-by: Simon Horman <simon.horman@netronome.com>
---
 misc/ss.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/misc/ss.c b/misc/ss.c
index 839781ee29bc..ce0b9d3d993d 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -3124,10 +3124,12 @@ static int unix_show_sock(const struct sockaddr_nl *addr, struct nlmsghdr *nlh,
 
 		memcpy(name, RTA_DATA(tb[UNIX_DIAG_NAME]), len);
 		name[len] = '\0';
-		if (name[0] == '\0')
-			for (int i = 0; i < len; i++)
+		if (name[0] == '\0') {
+			int i;
+			for (i = 0; i < len; i++)
 				if (name[i] == '\0')
 					name[i] = '@';
+		}
 		stat.name = &name[0];
 		memcpy(stat.local.data, &stat.name, sizeof(stat.name));
 	}
-- 
2.7.0.rc3.207.g0ac5344

^ permalink raw reply related

* Re: [PATCH net v3] tipc: check minimum bearer MTU
From: Ying Xue @ 2016-12-02 12:07 UTC (permalink / raw)
  To: Michal Kubecek, Jon Maloy
  Cc: Qian, netdev, Zhang, linux-kernel, tipc-discussion, Ben Hutchings,
	David S. Miller
In-Reply-To: <20161202083341.BB955A0F33@unicorn.suse.cz>

On 12/02/2016 04:33 PM, Michal Kubecek wrote:
> Qian Zhang (张谦) reported a potential socket buffer overflow in
> tipc_msg_build() which is also known as CVE-2016-8632: due to
> insufficient checks, a buffer overflow can occur if MTU is too short for
> even tipc headers. As anyone can set device MTU in a user/net namespace,
> this issue can be abused by a regular user.
>
> As agreed in the discussion on Ben Hutchings' original patch, we should
> check the MTU at the moment a bearer is attached rather than for each
> processed packet. We also need to repeat the check when bearer MTU is
> adjusted to new device MTU. UDP case also needs a check to avoid
> overflow when calculating bearer MTU.
>
> Fixes: b97bf3fd8f6a ("[TIPC] Initial merge")
> Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
> Reported-by: Qian Zhang (张谦) <zhangqian-c@360.cn>
> ---

Thanks, it looks nice to me.

Acked-by: Ying Xue <ying.xue@windriver.com>


------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most 
engaging tech sites, SlashDot.org! http://sdm.link/slashdot
_______________________________________________
tipc-discussion mailing list
tipc-discussion@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/tipc-discussion

^ permalink raw reply

* Re: Initial thoughts on TXDP
From: Jesper Dangaard Brouer @ 2016-12-02 12:13 UTC (permalink / raw)
  To: Tom Herbert
  Cc: brouer, Florian Westphal, Linux Kernel Network Developers,
	linux-mm
In-Reply-To: <CALx6S36ywu3ruY7AFKYk=N4Ekr5zjY33ivx92EgNNT36XoXhFA@mail.gmail.com>

On Thu, 1 Dec 2016 11:51:42 -0800 Tom Herbert <tom@herbertland.com> wrote:
> On Wed, Nov 30, 2016 at 6:44 PM, Florian Westphal <fw@strlen.de> wrote:
> > Tom Herbert <tom@herbertland.com> wrote:  
[...]
> >>   - Call into TCP/IP stack with page data directly from driver-- no
> >> skbuff allocation or interface. This is essentially provided by the
> >> XDP API although we would need to generalize the interface to call
> >> stack functions (I previously posted patches for that). We will also
> >> need a new action, XDP_HELD?, that indicates the XDP function held the
> >> packet (put on a socket for instance).  
> >
> > Seems this will not work at all with the planned page pool thing when
> > pages start to be held indefinitely.

It is quite the opposite: the page pool supports pages being held
for longer times than drivers do today.  The current driver page recycle
tricks cannot, as they depend on the page refcnt being decremented quickly
(while pages are still mapped in their recycle queue).

> > You can also never get even close to userspace offload stacks once you
> > need/do this; allocations in hotpath are too expensive.

Yes. It is important to understand that once the number of outstanding
pages gets large, the driver recycle stops working, meaning the page
allocations start to go through the page allocator.  I've documented[1]
that the bare alloc+free cost[2] (231 cycles order-0/4K) is higher than
the 10G wirespeed budget (201 cycles).

Thus, the driver recycle tricks are nice for benchmarking, as it hides
the page allocator overhead. But this optimization might disappear for
Tom's and Eric's more real-world use-cases e.g. like 10.000 sockets.
The page pool doesn't have these issues.

[1] http://people.netfilter.org/hawk/presentations/MM-summit2016/generic_page_pool_mm_summit2016.pdf
[2] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/mm/bench/page_bench01.c

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: [PATCH v2] sh_eth: remove unchecked interrupts
From: Geert Uytterhoeven @ 2016-12-02 12:18 UTC (permalink / raw)
  To: Chris Brandt
  Cc: Sergei Shtylyov, David Miller, Simon Horman, Geert Uytterhoeven,
	netdev@vger.kernel.org, Linux-Renesas
In-Reply-To: <SG2PR06MB1165C7D7B7DFBAC4B5BAB09A8A8F0@SG2PR06MB1165.apcprd06.prod.outlook.com>

Hi Chris,

On Thu, Dec 1, 2016 at 7:53 PM, Chris Brandt <Chris.Brandt@renesas.com> wrote:
> On 12/1/2016, Sergei Shtylyov wrote:
>>
>> On 12/01/2016 05:42 PM, Geert Uytterhoeven wrote:
>>
>> >> --- a/drivers/net/ethernet/renesas/sh_eth.c
>> >> +++ b/drivers/net/ethernet/renesas/sh_eth.c
>> >> @@ -518,7 +518,7 @@ static struct sh_eth_cpu_data r7s72100_data = {
>> >>
>> >>         .ecsr_value     = ECSR_ICD,
>> >>         .ecsipr_value   = ECSIPR_ICDIP,
>> >> -       .eesipr_value   = 0xff7f009f,
>> >> +       .eesipr_value   = 0xe77f009f,
>> >
>> > Comment not directly related to the merits of this patch: the EESIPR
>> > bit definitions seem to be identical to those for EESR, so we can get
>> > rid of the hardcoded values here?
>>
>>     Do you mean using the @define's? We have EESIPR bits also declared,
>> see enum DMAC_IM_BIT,

Yes, that's what I meant.
Unfortunately the DMAC_IM_BIT enum doesn't cover all bits.

> Is your idea to get rid of .eesipr_value for devices that have values
> that are the same as .eesr_err_check?
>
>
> For example in sh_eth_dev_init():
>
>         sh_eth_modify(ndev, EESR, 0, 0);
>         mdp->irq_enabled = true;
> -       sh_eth_write(ndev, mdp->cd->eesipr_value, EESIPR);
> +       if (mdp->cd->eesipr_value)
> +               sh_eth_write(ndev, mdp->cd->eesipr_value, EESIPR);
> +       else
> +               sh_eth_write(ndev, mdp->cd->eesr_err_check, EESIPR);

No, my intention was to just get rid of the hardcoded values when
initializing .eesipr_value.

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: stmmac ethernet in kernel 4.9-rc6: coalescing related pauses.
From: Pavel Machek @ 2016-12-02 12:32 UTC (permalink / raw)
  To: Giuseppe CAVALLARO; +Cc: alexandre.torgue, David Miller, netdev, linux-kernel
In-Reply-To: <3192a4b6-1e97-048f-a0dd-bfc0f3d96ed8@st.com>

[-- Attachment #1: Type: text/plain, Size: 3728 bytes --]

Hi!

> >Well, if you have a workload that sends and receive packets, it tends
> >to work ok, as you do tx_clean() in stmmac_poll(). My workload is not
> >like that -- it is "sending packets at 3MB/sec, receiving none". So
> >the stmmac_tx_timer() is rescheduled and rescheduled and rescheduled,
> >and then we run out of transmit descriptors, and then 40msec passes,
> >and then we clean them. Bad.
> >
> >And that's why low-res timers do not cut it.
> 
> in that case, I expect that the tuning of the driver could help you.
> I mean, by using ethtool, it could be enough to set the IC bit on all
> the descriptors. You should touch the tx_coal_frames.
> 
> Then you can use ethtool -S to monitor the status.

Yes, I did something similar. Unfortunately that meant a crash within
minutes, at least with 4.4 kernel. (If you know what was fixed between
4.4 and 4.9, that would be helpful).

> We had experimented this tuning on STB IP where just datagrams
> had to send externally. To be honest, although we had seen
> better results w/o any timer, we kept this approach enabled
> because the timer was fast enough to cover our tests on SH4 boxes.

Please reply to David, and explain how it is supposed to
work... because right now it does not. 40 msec delays are not
acceptable in default configuration.

> >>In the ring, some descriptors can raise the irq (according to a
> >>threshold) and set the IC bit. In this path, the NAPI  poll will be
> >>scheduled.
> >
> >Not NAPI poll but stmmac_tx_timer(), right?
> 
> in the xmit according the the threshold the timer is started or the
> interrupt is set inside the descriptor.
> Then stmmac_tx_clean will be always called and, if you see the flow,
> no irqlock protection is needed!

Agreed that no irqlock protection is needed if we rely on napi and timers.

> >>Concerning the lock protection, we had reviewed long time ago and
> >>IIRC, no raise condition should be present. Open to review it,
> >>again!
...
> >There's nothing that protect stmmac_poll() from running concurently
> >with stmmac_dma_interrupt(), right?
> 
> This is not necessary.

dma_interrupt accesses shared priv->xstats; variables are of type
unsigned long (not atomic_t), yet they are accessed from interrupt
context and from stmmac_ethtool without any locking. That can result
in broken statistics AFAICT.

Please take another look. As far as I can tell, you can have two cpus
at #1 and #2 in the code, at the same time. It looks like napi_... has
some atomic operations inside so that looks safe at the first look. But
I'm not sure if they also include enough memory barriers to make it
safe...?


static void stmmac_dma_interrupt(struct stmmac_priv *priv)
{
...
        status = priv->hw->dma->dma_interrupt(priv->ioaddr, &priv->xstats);
        if (likely((status & handle_rx)) || (status & handle_tx)) {
                if (likely(napi_schedule_prep(&priv->napi))) {
#1
                        stmmac_disable_dma_irq(priv);
                        __napi_schedule(&priv->napi);
                }
        }


static int stmmac_poll(struct napi_struct *napi, int budget)
{
        struct stmmac_priv *priv = container_of(napi, struct stmmac_priv, napi);
        int work_done = 0;

        priv->xstats.napi_poll++;
        stmmac_tx_clean(priv);

        work_done = stmmac_rx(priv, budget);
	if (work_done < budget) {
                napi_complete(napi);
#2
	        stmmac_enable_dma_irq(priv);
        }
        return work_done;
}


Best regards,
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* net/can: warning in raw_setsockopt/__alloc_pages_slowpath
From: Andrey Konovalov @ 2016-12-02 12:43 UTC (permalink / raw)
  To: Oliver Hartkopp, Marc Kleine-Budde, David S. Miller, linux-can,
	netdev, LKML
  Cc: Dmitry Vyukov, Kostya Serebryany, syzkaller

[-- Attachment #1: Type: text/plain, Size: 1851 bytes --]

Hi!

I've got the following error report while running the syzkaller fuzzer.

A reproducer is attached.

On commit d8e435f3ab6fea2ea324dce72b51dd7761747523 (Nov 26).

------------[ cut here ]------------
WARNING: CPU: 0 PID: 4009 at mm/page_alloc.c:3511
__alloc_pages_slowpath+0x3d4/0x1bf0
Modules linked in:
CPU: 0 PID: 4009 Comm: a.out Not tainted 4.9.0-rc6+ #54
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
 ffff88006832f8a8 ffffffff81c73b14 0000000000000000 0000000000000000
 ffffffff842c6320 0000000000000000 ffff88006832f8f0 ffffffff8123dc57
 ffff880067d86000 ffffffff00000db7 ffffffff842c6320 0000000000000db7
Call Trace:
 [<     inline     >] __dump_stack lib/dump_stack.c:15
 [<ffffffff81c73b14>] dump_stack+0xb3/0x10f lib/dump_stack.c:51
 [<ffffffff8123dc57>] __warn+0x1a7/0x1f0 kernel/panic.c:550
 [<ffffffff8123de6c>] warn_slowpath_null+0x2c/0x40 kernel/panic.c:585
 [<ffffffff81559b04>] __alloc_pages_slowpath+0x3d4/0x1bf0 mm/page_alloc.c:3511
 [<ffffffff8155b8e2>] __alloc_pages_nodemask+0x5c2/0x710 mm/page_alloc.c:3781
 [<ffffffff816236a4>] alloc_pages_current+0xf4/0x400 mm/mempolicy.c:2072
 [<     inline     >] alloc_pages ./include/linux/gfp.h:469
 [<ffffffff815a834f>] kmalloc_order+0x1f/0x70 mm/slab_common.c:1015
 [<ffffffff815a83bf>] kmalloc_order_trace+0x1f/0x160 mm/slab_common.c:1026
 [<     inline     >] kmalloc_large ./include/linux/slab.h:422
 [<ffffffff81635e67>] __kmalloc_track_caller+0x227/0x2a0 mm/slub.c:4233
 [<ffffffff8159932c>] memdup_user+0x2c/0xa0 mm/util.c:137
 [<ffffffff8369e0de>] raw_setsockopt+0x1be/0x9f0 net/can/raw.c:506
 [<     inline     >] SYSC_setsockopt net/socket.c:1757
 [<ffffffff82ca10c4>] SyS_setsockopt+0x154/0x240 net/socket.c:1736
 [<ffffffff840f2901>] entry_SYSCALL_64_fastpath+0x1f/0xc2
arch/x86/entry/entry_64.S:209
---[ end trace bc80556cca970089 ]---

[-- Attachment #2: setsockopt-kmalloc-warning-poc.c --]
[-- Type: text/x-csrc, Size: 4559 bytes --]

// autogenerated by syzkaller (http://github.com/google/syzkaller)

#ifndef __NR_setsockopt
#define __NR_setsockopt 54
#endif
#ifndef __NR_socket
#define __NR_socket 41
#endif

#define _GNU_SOURCE

#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>

#include <linux/capability.h>
#include <linux/if.h>
#include <linux/if_tun.h>
#include <linux/sched.h>
#include <net/if_arp.h>

#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <pthread.h>
#include <setjmp.h>
#include <signal.h>
#include <stdarg.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Process exit codes used by the reproducer's error helpers. */
const int kFailStatus = 67;  /* passed to exit() by fail() */
const int kErrorStatus = 68; /* not referenced anywhere in this reproducer */
const int kRetryStatus = 69; /* passed to exit() by exitf() */

/*
 * Print a printf-style message to stderr, append the errno value that was
 * current on entry, and terminate the process with kFailStatus.
 * Never returns.
 */
__attribute__((noreturn)) void fail(const char* msg, ...)
{
  /* Capture errno before any stdio call gets a chance to overwrite it. */
  const int saved_errno = errno;
  va_list ap;

  /* Flush pending stdout output before writing the error to stderr. */
  fflush(stdout);

  va_start(ap, msg);
  vfprintf(stderr, msg, ap);
  va_end(ap);

  fprintf(stderr, " (errno %d)\n", saved_errno);
  exit(kFailStatus);
}

/*
 * Same reporting as fail() -- printf-style message on stderr followed by the
 * errno value seen on entry -- but the process exits with kRetryStatus.
 * Never returns.
 */
__attribute__((noreturn)) void exitf(const char* msg, ...)
{
  /* Capture errno before any stdio call gets a chance to overwrite it. */
  const int saved_errno = errno;
  va_list ap;

  /* Flush pending stdout output before writing the error to stderr. */
  fflush(stdout);

  va_start(ap, msg);
  vfprintf(stderr, msg, ap);
  va_end(ap);

  fprintf(stderr, " (errno %d)\n", saved_errno);
  exit(kRetryStatus);
}

/* Non-zero enables debug() output; nothing in this file sets it. */
static int flag_debug;

/*
 * printf-style trace helper: writes the formatted message to stdout and
 * flushes it, but only while flag_debug is non-zero. Otherwise it is a no-op.
 */
void debug(const char* msg, ...)
{
  if (flag_debug) {
    va_list ap;

    va_start(ap, msg);
    vfprintf(stdout, msg, ap);
    va_end(ap);
    fflush(stdout);
  }
}

/* Per-thread state shared between NONFAILING() and segv_handler(). */
__thread int skip_segv;    /* incremented on entry to NONFAILING(), decremented on exit */
__thread jmp_buf segv_env; /* jump target saved by _setjmp() inside NONFAILING() */

/*
 * Signal handler registered by install_segv_handler().
 *
 * If the current thread has a non-zero skip_segv, control is transferred back
 * to the context saved in segv_env; otherwise the process exits with the
 * signal number as its exit status. The info and uctx arguments are unused.
 */
static void segv_handler(int sig, siginfo_t* info, void* uctx)
{
  const int recoverable = __atomic_load_n(&skip_segv, __ATOMIC_RELAXED);

  if (!recoverable)
    exit(sig);
  _longjmp(segv_env, 1);
}

/*
 * Register segv_handler() for SIGSEGV and SIGBUS.
 *
 * SA_SIGINFO selects the three-argument handler form; SA_NODEFER leaves the
 * signal unblocked while the handler runs. Return values of sigaction() are
 * not checked.
 */
static void install_segv_handler()
{
  static const int kSignals[] = {SIGSEGV, SIGBUS};
  struct sigaction action;
  size_t i;

  memset(&action, 0, sizeof(action));
  action.sa_flags = SA_NODEFER | SA_SIGINFO;
  action.sa_sigaction = segv_handler;

  for (i = 0; i < sizeof(kSignals) / sizeof(kSignals[0]); i++)
    sigaction(kSignals[i], &action, NULL);
}

/*
 * NONFAILING(stmts): run stmts with the fault-recovery path armed.
 *
 * skip_segv is incremented, a jump target is saved in segv_env with
 * _setjmp(), and stmts are executed. If segv_handler() runs meanwhile it
 * sees the non-zero skip_segv and _longjmp()s back, so _setjmp() returns
 * non-zero and the rest of stmts is abandoned. skip_segv is decremented
 * again on both paths.
 *
 * NOTE(review): this expands to a bare { } block rather than
 * do { } while (0), so take care when using it as the body of an
 * unbraced if/else. It is not used elsewhere in this reproducer.
 */
#define NONFAILING(...)                                                \
  {                                                                    \
    __atomic_fetch_add(&skip_segv, 1, __ATOMIC_SEQ_CST);               \
    if (_setjmp(segv_env) == 0) {                                      \
      __VA_ARGS__;                                                     \
    }                                                                  \
    __atomic_fetch_sub(&skip_segv, 1, __ATOMIC_SEQ_CST);               \
  }

/*
 * Issue raw system call number nr through syscall(), forwarding a0..a5.
 *
 * a6, a7 and a8 are accepted so that every call site can pass a fixed
 * nine-argument list, but they are not forwarded. The value returned by
 * syscall() is passed back converted to uintptr_t.
 */
static uintptr_t execute_syscall(int nr, uintptr_t a0, uintptr_t a1,
                                 uintptr_t a2, uintptr_t a3,
                                 uintptr_t a4, uintptr_t a5,
                                 uintptr_t a6, uintptr_t a7,
                                 uintptr_t a8)
{
  const long result = syscall(nr, a0, a1, a2, a3, a4, a5);

  return result;
}

/*
 * One-time setup for the parent process:
 *   - set signals 0x20 and 0x21 to SIG_IGN via the raw rt_sigaction syscall
 *     (presumably the two signals glibc reserves for internal use, which is
 *     why the libc wrapper is bypassed -- TODO confirm);
 *   - install the SIGSEGV/SIGBUS recovery handler;
 *   - create a fresh "./syzkaller.XXXXXX" directory, make it mode 0777 and
 *     chdir into it, calling fail() if any of those steps fails.
 *
 * The pid argument is unused.
 */
static void setup_main_process(uint64_t pid)
{
  struct sigaction ignore;
  memset(&ignore, 0, sizeof(ignore));
  ignore.sa_handler = SIG_IGN;
  for (int signo = 0x20; signo <= 0x21; signo++)
    syscall(SYS_rt_sigaction, signo, &ignore, NULL, 8);
  install_segv_handler();

  char scratch_template[] = "./syzkaller.XXXXXX";
  char* scratch = mkdtemp(scratch_template);
  if (scratch == NULL)
    fail("failed to mkdtemp");
  if (chmod(scratch, 0777) != 0)
    fail("failed to chmod");
  if (chdir(scratch) != 0)
    fail("failed to chdir");
}

/* Forward declaration: do_sandbox_none() calls loop() before its definition. */
static void loop();

/*
 * Confine the calling (child) process before the test syscalls run:
 *   - request SIGKILL when the parent dies, then start a new process
 *     group and session;
 *   - cap address space at 128 MiB, file size and stack at 1 MiB each,
 *     and core dumps at 0;
 *   - unshare the mount namespace, the IPC namespace and the I/O context.
 *
 * All calls are best-effort: no return value is checked.
 */
static void sandbox_common()
{
  static const struct {
    int resource;
    rlim_t limit;
  } kLimits[] = {
      {RLIMIT_AS, 128 << 20},
      {RLIMIT_FSIZE, 1 << 20},
      {RLIMIT_STACK, 1 << 20},
      {RLIMIT_CORE, 0},
  };
  static const int kUnshareFlags[] = {CLONE_NEWNS, CLONE_NEWIPC, CLONE_IO};
  size_t i;

  prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0);
  setpgrp();
  setsid();

  for (i = 0; i < sizeof(kLimits) / sizeof(kLimits[0]); i++) {
    struct rlimit rlim;

    /* Soft and hard limit are always set to the same value. */
    rlim.rlim_cur = rlim.rlim_max = kLimits[i].limit;
    setrlimit(kLimits[i].resource, &rlim);
  }

  for (i = 0; i < sizeof(kUnshareFlags) / sizeof(kUnshareFlags[0]); i++)
    unshare(kUnshareFlags[i]);
}

/*
 * Fork a child that applies sandbox_common(), runs loop() and then exits
 * with status 1. In the parent, returns whatever fork() returned (the child
 * pid, or -1 if the fork failed).
 */
static int do_sandbox_none()
{
  const int pid = fork();

  if (pid == 0) {
    /* Child: never returns to the caller. */
    sandbox_common();
    loop();
    exit(1);
  }
  return pid;
}

/* Results of the two system calls issued by loop(); preset to -1. */
long r[2];

/*
 * The actual reproducer: open a socket and issue one setsockopt() on it with
 * a very large option length (0x18000000 bytes = 384 MiB).
 *
 * NOTE(review): the numeric arguments look like AF_CAN / SOCK_RAW / CAN_RAW
 * for socket() and SOL_CAN_RAW / CAN_RAW_FILTER for setsockopt(), which
 * matches the raw_setsockopt() frame in the reported trace -- confirm
 * against the kernel headers.
 */
void loop()
{
  const unsigned long sock_domain = 0x1dul;
  const unsigned long sock_type = 0x3ul;
  const unsigned long sock_protocol = 0x1ul;
  const unsigned long opt_level = 0x65ul;
  const unsigned long opt_name = 0x1ul;
  const unsigned long opt_value_addr = 0x20000000ul;
  const unsigned long opt_length = 0x18000000ul;

  r[0] = -1;
  r[1] = -1;

  r[0] = execute_syscall(__NR_socket, sock_domain, sock_type, sock_protocol,
                         0, 0, 0, 0, 0, 0);
  r[1] = execute_syscall(__NR_setsockopt, r[0], opt_level, opt_name,
                         opt_value_addr, opt_length, 0, 0, 0, 0);
}
/*
 * Entry point: prepare the parent process, fork the sandboxed child that
 * runs the reproducer, and keep calling waitpid() until it returns the value
 * do_sandbox_none() handed back. Always exits with status 0.
 */
int main()
{
  int status = 0;

  setup_main_process(0);
  const int child = do_sandbox_none();

  for (;;) {
    if (waitpid(child, &status, __WALL) == child)
      break;
  }
  return 0;
}



^ permalink raw reply

* arp_filter and IPv6 ND
From: Saku Ytti @ 2016-12-02 12:51 UTC (permalink / raw)
  To: netdev

Hey,

net.ipv4.conf.all.arp_filter appears not to have an IPv6 counterpart.
Or am I missing something? That is, Linux does answer ND queries for
unrelated interfaces by default, and I can't seem to find a way to turn
that off.

Is it proper maintainership to accept changes to single protocol,
without mandating the support for other protocol having same
behavioural characteristics?

It is good that some parts for ARP and ND have common code in linux
(neighbour.c) unlike in BSD where everything seems to be
self-contained.
I wish even more of ARP/ND were common, because there is
still a lot of common behavioural code in the ARP/ND code itself, which
requires double maintenance and is implemented by different people at
different times, which leads to different sets of bugs and behaviour for
the same intended behaviour.

For example this feature should be protocol agnostic, developer should
only need to develop it once for the higher level behavioural code,
without minding which IP AFI it is for. Obviously that does not
exclude ability to sysctl configure it on/off per AFI.

Thanks!
-- 
  ++ytti

^ permalink raw reply

* Re: [PATCH net v2] igb: re-assign hw address pointer on reset after PCI error
From: Guilherme G. Piccoli @ 2016-12-02 12:55 UTC (permalink / raw)
  To: intel-wired-lan, jeffrey.t.kirsher, alexander.duyck; +Cc: netdev, todd.fujinaka
In-Reply-To: <1478803603-30306-1-git-send-email-gpiccoli@linux.vnet.ibm.com>

On 11/10/2016 04:46 PM, Guilherme G. Piccoli wrote:
> Whenever the igb driver detects the result of a read operation returns
> a value composed only by F's (like 0xFFFFFFFF), it will detach the
> net_device, clear the hw_addr pointer and warn to the user that adapter's
> link is lost - those steps happen on igb_rd32().
> 
> In case a PCI error happens on Power architecture, there's a recovery
> mechanism called EEH, that will reset the PCI slot and call driver's
> handlers to reset the adapter and network functionality as well.
> 
> We observed that once hw_addr is NULL after the error is detected on
> igb_rd32(), it's never assigned back, so in the process of resetting
> the network functionality we got a NULL pointer dereference in both
> igb_configure_tx_ring() and igb_configure_rx_ring(). In order to avoid
> such bug, this patch re-assigns the hw_addr value in the slot_reset
> handler.
> 
> Reported-by: Anthony H. Thai <ahthai@us.ibm.com>
> Reported-by: Harsha Thyagaraja <hathyaga@in.ibm.com>
> Signed-off-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
> ---
>  drivers/net/ethernet/intel/igb/igb_main.c | 5 +++++
>  1 file changed, 5 insertions(+)
> 
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
> index edc9a6a..136ee9e 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -7878,6 +7878,11 @@ static pci_ers_result_t igb_io_slot_reset(struct pci_dev *pdev)
>  		pci_enable_wake(pdev, PCI_D3hot, 0);
>  		pci_enable_wake(pdev, PCI_D3cold, 0);
> 
> +		/* In case of PCI error, adapter lose its HW address
> +		 * so we should re-assign it here.
> +		 */
> +		hw->hw_addr = adapter->io_addr;
> +
>  		igb_reset(adapter);
>  		wr32(E1000_WUS, ~0);
>  		result = PCI_ERS_RESULT_RECOVERED;
> 

Ping?

Sorry to annoy, any news on this?
Thanks in advance!

Cheers,


Guilherme

^ permalink raw reply

* Re: Initial thoughts on TXDP
From: Jesper Dangaard Brouer @ 2016-12-02 13:01 UTC (permalink / raw)
  To: Hannes Frederic Sowa
  Cc: brouer, Tom Herbert, Florian Westphal,
	Linux Kernel Network Developers, Alexander Duyck, John Fastabend,
	linux-mm
In-Reply-To: <859a0c99-f427-1db8-d260-1297777792fb@stressinduktion.org>

On Thu, 1 Dec 2016 23:47:44 +0100
Hannes Frederic Sowa <hannes@stressinduktion.org> wrote:

> Side note:
> 
> On 01.12.2016 20:51, Tom Herbert wrote:
> >> > E.g. "mini-skb": Even if we assume that this provides a speedup
> >> > (where does that come from? should make no difference if a 32 or
> >> >  320 byte buffer gets allocated).

Yes, the size of the allocation from the SLUB allocator does not change
base performance/cost much (at least for small objects, if < 1024).

Do notice the base SLUB alloc+free cost is fairly high (compared to a
201 cycles budget). Especially for networking as the free-side is very
likely to hit a slow path.  SLUB fast-path 53 cycles, and slow-path
around 100 cycles (data from [1]).  I've tried to address this with the
kmem_cache bulk APIs.  Which reduce the cost to approx 30 cycles.
(Something we have not fully reaped the benefit from yet!)

[1] https://git.kernel.org/torvalds/c/ca257195511

> >> >  
> > It's the zero'ing of three cache lines. I believe we talked about that
> > as netdev.

Actually 4 cache-lines, but with some cleanup I believe we can get down
to clearing 192 bytes (3 cache-lines).

> 
> Jesper and me played with that again very recently:
> 
> https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/time_bench_memset.c#L590
> 
> In micro-benchmarks we saw a pretty good speed up not using the rep
> stosb generated by gcc builtin but plain movq's. Probably the cost model
> for __builtin_memset in gcc is wrong?

Yes, I believe so.

> When Jesper is free we wanted to benchmark this and maybe come up with a
> arch specific way of cleaning if it turns out to really improve throughput.
> 
> SIMD instructions seem even faster but the kernel_fpu_begin/end() kill
> all the benefits.

One strange thing was, that on my skylake CPU (i7-6700K @4.00GHz),
Hannes's hand-optimized MOVQ ASM-code didn't go past 8 bytes per cycle,
or 32 cycles for 256 bytes.

Talking to Alex and John during netdev, and reading up on the Intel arch,
I thought that this CPU should be able to perform 16 bytes per cycle.
The CPU can do it, as rep-stos shows once the size gets large
enough.

On this CPU the memset rep stos starts to win around 512 bytes:

 192/35 =  5.5 bytes/cycle
 256/36 =  7.1 bytes/cycle
 512/40 = 12.8 bytes/cycle
 768/46 = 16.7 bytes/cycle
1024/52 = 19.7 bytes/cycle
2048/84 = 24.4 bytes/cycle
4096/148= 27.7 bytes/cycle

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: net/can: warning in raw_setsockopt/__alloc_pages_slowpath
From: Marc Kleine-Budde @ 2016-12-02 13:24 UTC (permalink / raw)
  To: Andrey Konovalov, Oliver Hartkopp, David S. Miller, linux-can,
	netdev, LKML
  Cc: Dmitry Vyukov, Kostya Serebryany, syzkaller
In-Reply-To: <CAAeHK+x3vqjzzXPtp-xCshWnvTcVrZqGmJR7rbooRFxBJanv8g@mail.gmail.com>


[-- Attachment #1.1: Type: text/plain, Size: 3166 bytes --]

On 12/02/2016 01:43 PM, Andrey Konovalov wrote:
> Hi!
> 
> I've got the following error report while running the syzkaller fuzzer.
> 
> A reproducer is attached.
> 
> On commit d8e435f3ab6fea2ea324dce72b51dd7761747523 (Nov 26).
> 
> ------------[ cut here ]------------
> WARNING: CPU: 0 PID: 4009 at mm/page_alloc.c:3511
> __alloc_pages_slowpath+0x3d4/0x1bf0
> Modules linked in:
> CPU: 0 PID: 4009 Comm: a.out Not tainted 4.9.0-rc6+ #54
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
>  ffff88006832f8a8 ffffffff81c73b14 0000000000000000 0000000000000000
>  ffffffff842c6320 0000000000000000 ffff88006832f8f0 ffffffff8123dc57
>  ffff880067d86000 ffffffff00000db7 ffffffff842c6320 0000000000000db7
> Call Trace:
>  [<     inline     >] __dump_stack lib/dump_stack.c:15
>  [<ffffffff81c73b14>] dump_stack+0xb3/0x10f lib/dump_stack.c:51
>  [<ffffffff8123dc57>] __warn+0x1a7/0x1f0 kernel/panic.c:550
>  [<ffffffff8123de6c>] warn_slowpath_null+0x2c/0x40 kernel/panic.c:585
>  [<ffffffff81559b04>] __alloc_pages_slowpath+0x3d4/0x1bf0 mm/page_alloc.c:3511
>  [<ffffffff8155b8e2>] __alloc_pages_nodemask+0x5c2/0x710 mm/page_alloc.c:3781
>  [<ffffffff816236a4>] alloc_pages_current+0xf4/0x400 mm/mempolicy.c:2072
>  [<     inline     >] alloc_pages ./include/linux/gfp.h:469
>  [<ffffffff815a834f>] kmalloc_order+0x1f/0x70 mm/slab_common.c:1015
>  [<ffffffff815a83bf>] kmalloc_order_trace+0x1f/0x160 mm/slab_common.c:1026
>  [<     inline     >] kmalloc_large ./include/linux/slab.h:422
>  [<ffffffff81635e67>] __kmalloc_track_caller+0x227/0x2a0 mm/slub.c:4233
>  [<ffffffff8159932c>] memdup_user+0x2c/0xa0 mm/util.c:137
>  [<ffffffff8369e0de>] raw_setsockopt+0x1be/0x9f0 net/can/raw.c:506

We should add a check for a sensible optlen....

> static int raw_setsockopt(struct socket *sock, int level, int optname,
> 			  char __user *optval, unsigned int optlen)
> {
> 	struct sock *sk = sock->sk;
> 	struct raw_sock *ro = raw_sk(sk);
> 	struct can_filter *filter = NULL;  /* dyn. alloc'ed filters */
> 	struct can_filter sfilter;         /* single filter */
> 	struct net_device *dev = NULL;
> 	can_err_mask_t err_mask = 0;
> 	int count = 0;
> 	int err = 0;
> 
> 	if (level != SOL_CAN_RAW)
> 		return -EINVAL;
> 
> 	switch (optname) {
> 
> 	case CAN_RAW_FILTER:
> 		if (optlen % sizeof(struct can_filter) != 0)
> 			return -EINVAL;

here...

		if (optlen > 64 * sizeof(struct can_filter))
			return -EINVAL;

> 
> 		count = optlen / sizeof(struct can_filter);
> 
> 		if (count > 1) {
> 			/* filter does not fit into dfilter => alloc space */
> 			filter = memdup_user(optval, optlen);
> 			if (IS_ERR(filter))
> 				return PTR_ERR(filter);
> 		} else if (count == 1) {
> 			if (copy_from_user(&sfilter, optval, sizeof(sfilter)))
> 				return -EFAULT;
> 		}
> 
> 		lock_sock(sk);

Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: stmmac ethernet in kernel 4.9-rc6: coalescing related pauses.
From: Giuseppe CAVALLARO @ 2016-12-02 13:51 UTC (permalink / raw)
  To: Pavel Machek
  Cc: alexandre.torgue, David Miller, netdev, linux-kernel,
	Alexandre TORGUE
In-Reply-To: <20161202123241.GA5869@amd>

On 12/2/2016 1:32 PM, Pavel Machek wrote:
> Hi!
>
>>> Well, if you have a workload that sends and receive packets, it tends
>>> to work ok, as you do tx_clean() in stmmac_poll(). My workload is not
>>> like that -- it is "sending packets at 3MB/sec, receiving none". So
>>> the stmmac_tx_timer() is rescheduled and rescheduled and rescheduled,
>>> and then we run out of transmit descriptors, and then 40msec passes,
>>> and then we clean them. Bad.
>>>
>>> And that's why low-res timers do not cut it.
>>
>> in that case, I expect that the tuning of the driver could help you.
>> I mean, by using ethtool, it could be enough to set the IC bit on all
>> the descriptors. You should touch the tx_coal_frames.
>>
>> Then you can use ethtool -S to monitor the status.
>
> Yes, I did something similar. Unfortnunately that meant crash within
> minutes, at least with 4.4 kernel. (If you know what was fixed between
> 4.4 and 4.9, that would be helpful).

4.4 has no GMAC4 support.
Alex, do you remember any patches to fix that?

>> We had experimented this tuning on STB IP where just datagrams
>> had to send externally. To be honest, although we had seen
>> better results w/o any timer, we kept this approach enabled
>> because the timer was fast enough to cover our tests on SH4 boxes.
>
> Please reply to David, and explain how it is supposed to
> work... because right now it does not. 40 msec delays are not
> acceptable in default configuration.

I mean, that on UP and SMP system this schema helped
to improve the performance saving CPU on my side and this has been
tested since a long time (~4 years).
I tested something similar to yours where unidirectional traffic
with limited throughput was needed and I can confirm you that
tuning/removing coalesce parameters this helped. The tuning I decided
to keep in the driver was suitable in several user cases and if now
you have problems or you want to review it I can just confirm that
there are no problems on my side. If you want to simplify the logic
around the tx process and remove the timer in the official driver I can accept
that. I will just ask you to double check that the throughput and
CPU usage when requesting max throughput (better if on a Gigabit setup) have
no regressions.
Otherwise we could start thinking about adaptive schema if feasible.

>>>> In the ring, some descriptors can raise the irq (according to a
>>>> threshold) and set the IC bit. In this path, the NAPI  poll will be
>>>> scheduled.
>>>
>>> Not NAPI poll but stmmac_tx_timer(), right?
>>
>> in the xmit according the the threshold the timer is started or the
>> interrupt is set inside the descriptor.
>> Then stmmac_tx_clean will be always called and, if you see the flow,
>> no irqlock protection is needed!
>
> Agreed that no irqlock protection is needed if we rely on napi and timers.

ok

>
>>>> Concerning the lock protection, we had reviewed long time ago and
>>>> IIRC, no raise condition should be present. Open to review it,
>>>> again!
> ...
>>> There's nothing that protect stmmac_poll() from running concurently
>>> with stmmac_dma_interrupt(), right?
>>
>> This is not necessary.
>
> dma_interrupt accesses shared priv->xstats; variables are of type
> unsigned long (not atomic_t), yet they are accesssed from interrupt
> context and from stmmac_ethtool without any locking. That can result
> in broken statistics AFAICT.

ok we can check this and welcome patches and I'd prefer to
remove xstats from critical part of the code like ISR (that
comes from old story of the driver).

>
> Please take another look. As far as I can tell, you can have two cpus
> at #1 and #2 in the code, at the same time. It looks like napi_... has
> some atomic opertions inside so that looks safe at the first look. But
> I'm not sure if they also include enough memory barriers to make it
> safe...?

I have never reproduced issues on SMP platforms related to the
reordering of memory operations but, as said above, I welcome a
review of this, especially if you are seeing problems when managing NAPI.

FYI, the only memory barrier you will see in the driver are about the
OWN_BIT setting till now.

> static void stmmac_dma_interrupt(struct stmmac_priv *priv)
> {
> ...
>         status = priv->hw->dma->dma_interrupt(priv->ioaddr, &priv->xstats);
>         if (likely((status & handle_rx)) || (status & handle_tx)) {
>                 if (likely(napi_schedule_prep(&priv->napi))) {
> #1
>                         stmmac_disable_dma_irq(priv);
>                         __napi_schedule(&priv->napi);
>                 }
>         }
>
>
> static int stmmac_poll(struct napi_struct *napi, int budget)
> {
>         struct stmmac_priv *priv = container_of(napi, struct stmmac_priv, napi);
>         int work_done = 0;
>
>         priv->xstats.napi_poll++;
>         stmmac_tx_clean(priv);
>
>         work_done = stmmac_rx(priv, budget);
> 	if (work_done < budget) {
>                 napi_complete(napi);
> #2
> 	        stmmac_enable_dma_irq(priv);
>         }

hmm, I have to check (and refresh my memory) but the driver
uses the napi_schedule_prep.

Regards

Peppe

>         return work_done;
> }
>
>
> Best regards,
> 									Pavel
>

^ permalink raw reply

* Re: [PATCH net] fib_trie: Avoid expensive update of suffix length when not required
From: Robert Shearman @ 2016-12-02 13:54 UTC (permalink / raw)
  To: Alexander Duyck; +Cc: David Miller, Netdev, Alexander Duyck
In-Reply-To: <CAKgT0Uep40bYFQtToVBdmez5sPoyzapXNTUU10m9x6pyTj=Etg@mail.gmail.com>

On 01/12/16 18:29, Alexander Duyck wrote:
> On Wed, Nov 30, 2016 at 4:27 PM, Robert Shearman <rshearma@brocade.com> wrote:
>> On 29/11/16 23:14, Alexander Duyck wrote:
>>> On Tue, Nov 29, 2016 at 9:46 AM, Robert Shearman <rshearma@brocade.com>
>>>>
>>>> Fixes: 5405afd1a306 ("fib_trie: Add tracking value for suffix length")
>>>> Signed-off-by: Robert Shearman <rshearma@brocade.com>
>>>
>>>
>>> So I am not entirely convinced this is a regression, I was wondering
>>> what your numbers looked like before the patch you are saying this
>>> fixes?  I recall I fixed a number of issues leading up to this so I am
>>> just curious.
>>
>>
>> At commit 21d1f11db0e2f20a549ad8322879507850544670 ("fib_trie: Remove checks
>> for index >= tnode_child_length from tnode_get_child") which is the parent
>> of 5405afd1a306:
>>
>> $ time sudo ip route restore < ~/allroutes
>> RTNETLINK answers: File exists
>> RTNETLINK answers: File exists
>> RTNETLINK answers: File exists
>> RTNETLINK answers: File exists
>>
>> real    0m3.451s
>> user    0m0.184s
>> sys     0m2.004s
>>
>>
>> At commit 5405afd1a30620de8601436bae739c67c0bc7324 ("fib_trie: Add tracking
>> value for suffix length"):
>>
>> $ time sudo ip route restore < ~/allroutes
>> RTNETLINK answers: File exists
>> RTNETLINK answers: File exists
>> RTNETLINK answers: File exists
>> RTNETLINK answers: File exists
>>
>> real    0m48.624s
>> user    0m0.360s
>> sys     0m46.960s
>>
>> So does that warrant a fixes tag for this patch?
>
> Yes, please include it in the patch description next time.
>
> I've been giving it thought and the fact is the patch series has
> merit.  I just think it was being a bit too aggressive in terms of
> some of the changes.  So placing any changes in put_child seem to be
> the one piece in this that make this a bit ugly.

Does that imply the current updating of the parent's slen value should 
be removed from put_child then?

I started out without doing the changes in put_child, but that meant the 
changes to push the slen up through the trie were spread all over the 
place. This seemed like the cleaner solution.

>>>> +       }
>>>> +}
>>>> +
>>>>  /* Add a child at position i overwriting the old value.
>>>>   * Update the value of full_children and empty_children.
>>>> + *
>>>> + * The suffix length of the parent node and the rest of the tree is
>>>> + * updated (if required) when adding/replacing a node, but is caller's
>>>> + * responsibility on removal.
>>>>   */
>>>>  static void put_child(struct key_vector *tn, unsigned long i,
>>>>                       struct key_vector *n)
>>>> @@ -447,8 +461,8 @@ static void put_child(struct key_vector *tn, unsigned
>>>> long i,
>>>>         else if (!wasfull && isfull)
>>>>                 tn_info(tn)->full_children++;
>>>>
>>>> -       if (n && (tn->slen < n->slen))
>>>> -               tn->slen = n->slen;
>>>> +       if (n)
>>>> +               node_push_suffix(tn, n);
>>>
>>>
>>> This is just creating redundant work if we have to perform a resize.
>>
>>
>> The only real redundant work is the assignment of slen in tn, since the
>> propagation up the trie has to happen regardless and if a subsequent resize
>> results in changes to the trie then the propagation will stop at tn's parent
>> since the suffix length of the tn's parent will not be less than tn's suffix
>> length.
>
>
> This isn't going to work.  We want to avoid trying to push the suffix
> when we place a child.  There are scenarios where we are placing
> children in nodes that don't have parents yet because we are doing
> rebalances and such.  I suspect this could be pretty expensive on a
> resize call.

It's not expensive because the new parents that are being built up are 
orphaned from the rest of the trie, so the push up won't proceed into 
the rest of the trie until the new node is inserted into the trie.

> We want to pull the suffix pushing out of the resize completely.  The
> problem is this is going to cause us to start pushing suffixes for
> every node moved as a part of resize.

This will mean that nodes will have to be visited multiple times, which 
could well be more expensive than doing it during the resize.

>
>>>
>>>>         rcu_assign_pointer(tn->tnode[i], n);
>>>>  }
>>
>> ...
>>
>>>> -static void leaf_pull_suffix(struct key_vector *tp, struct key_vector
>>>> *l)
>>>> +static void node_set_suffix(struct key_vector *tp, unsigned char slen)
>>>>  {
>>>> -       while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
>>>> -               if (update_suffix(tp) > l->slen)
>>>> +       if (slen > tp->slen)
>>>> +               tp->slen = slen;
>>>> +}
>>>> +
>>>> +static void node_pull_suffix(struct key_vector *tn)
>>>> +{
>>>> +       struct key_vector *tp;
>>>> +       unsigned char slen;
>>>> +
>>>> +       slen = update_suffix(tn);
>>>> +       tp = node_parent(tn);
>>>> +       while ((tp->slen > tp->pos) && (tp->slen > slen)) {
>>>> +               if (update_suffix(tp) > slen)
>>>>                         break;
>>>>                 tp = node_parent(tp);
>>>>         }
>>>>  }
>>>>
>>>
>
> Actually I realized that there is a bug here.  The check for
> update_suffix(tp) > slen can cause us to stop prematurely if I am not
> mistaken.  What we should probably be doing is pulling out tp->slen
> and if the result of update_suffix is the same value then we can break
> the loop.  Otherwise we can stop early if a "second runner up" shows
> up that is supposed to be pushed up the trie.

Excellent point. This is also a problem in the existing code when removing 
a fib alias with other aliases still on the leaf. I'll send a separate 
patch for this and base my changes on top of it.

> I actually started around with patches to do much the same as what you
> are doing and I will probably submit them as an RFC so if you need you
> can pick pieces out as needed, or I could submit them if they work for
> you.

I'd be happy to test out any patches you send me.

>>> I really hate all the renaming.  Can't you just leave these as
>>> leaf_pull_suffix and leaf_push_suffix.  I'm fine with us dropping the
>>> leaf of the suffix but the renaming just adds a bunch of noise.
>>> Really it might work better if you broke the replacement of the
>>> key_vector as a leaf with the suffix length into a separate patch,
>>> then you could do the rename as a part of that.
>>
>>
>> Ok, I'll leave the naming of leaf_push_suffix alone. Note that
>> leaf_pull_suffix hasn't been renamed, the below in the diff is just an
>> artifact of how diff decided to present the changes along with the moving of
>> leaf_push_suffix.
>
> So I have been looking this over for the last couple days and actually
> I have started to be okay with the renaming.
>
> However I would ask to be consistent.  If you are going to have
> node_pull_suffix and node_push_suffix then just pass a node and a
> suffix length.  You can drop the leaf key vector from both functions
> instead of just one.

Ok, I can do that.

>
>>>
>>>> -static void leaf_push_suffix(struct key_vector *tn, struct key_vector
>>>> *l)
>>>> +static void leaf_pull_suffix(struct key_vector *tp, struct key_vector
>>>> *l)
>>>>  {
>>>> -       /* if this is a new leaf then tn will be NULL and we can sort
>>>> -        * out parent suffix lengths as a part of trie_rebalance
>>>> -        */
>>>> -       while (tn->slen < l->slen) {
>>>> -               tn->slen = l->slen;
>>>> -               tn = node_parent(tn);
>>>> +       while ((tp->slen > tp->pos) && (tp->slen > l->slen)) {
>>>> +               if (update_suffix(tp) > l->slen)
>>>> +                       break;
>>>> +               tp = node_parent(tp);
>>>>         }
>>>>  }
>>>
>>>
>>> If possible it would work better if you could avoid moving functions
>>> around as a result of your changes.
>>
>>
>> Ok, I can add a forward declaration instead.
>
> It shouldn't be necessary.  I think you are doing way too much with
> this patch.  We just need this to be a fix, you are doing a bit too
> much like adding this new node_set_suffix function which isn't really
> needed.

As per your comment below the node_set_suffix function isn't necessary. 
However, I'm not sure exactly what you're suggesting with this patch 
doing too much. Are you saying that you'd like to see a series of 
patches starting with some of the restructuring/renaming of the 
push/pull functions, or are you saying that the sum total is too much?

>>>>
>>>> @@ -1783,6 +1801,16 @@ void fib_table_flush_external(struct fib_table
>>>> *tb)
>>>>                         if (IS_TRIE(pn))
>>>>                                 break;
>>>>
>>>> +                       /* push the suffix length to the parent node,
>>>> +                        * since a previous leaf removal may have
>>>> +                        * caused it to change
>>>> +                        */
>>>> +                       if (pn->slen > pn->pos) {
>>>> +                               unsigned char slen = update_suffix(pn);
>>>> +
>>>> +                               node_set_suffix(node_parent(pn), slen);
>>>> +                       }
>>>> +
>>>
>>>
>>> Why bother with the local variable?  You could just pass
>>> update_suffix(pn) as the parameter to your node_set_suffix function.
>>
>>
>> To avoid a long line on the node_set_suffix call or splitting it across
>> lines, but I'll remove the local variable as you suggest and the same below.
>
> Actually I think there is a bug here.  You shouldn't be setting the
> suffix for the parent based on the child.  You can just call
> update_suffix(pn) and that should be enough.

Yes, you're right. BTW, this logic is transferred from the existing 
resize function and I think what saves us both before and after my 
changes is that the logic is repeated for the parent node which fixes 
the value and so on all the way up the trie.

Thanks,
Rob

^ permalink raw reply

* Aw: Re: stmmac ethernet in kernel 4.9-rc6: coalescing related pauses.
From: Lino Sanfilippo @ 2016-12-02 14:05 UTC (permalink / raw)
  To: Pavel Machek
  Cc: Giuseppe CAVALLARO, alexandre.torgue, David Miller, netdev,
	linux-kernel
In-Reply-To: <20161202084511.GA32294@amd>

Hi,

> 
> There's nothing that protects stmmac_poll() from running concurrently
> with stmmac_dma_interrupt(), right?
> 

could it be that there is also another issue concerning locking?
The tx completion handler takes the xmit_lock in case that the
netif_queue is stopped. This is AFAICS unnecessary, since both
xmit and completion handler are already synchronized by the private
tx lock. But it is IMHO also dangerous:

In the xmit handler we have the locking order
1. xmit_lock
2. private tx lock

while in the completion handler it's the reverse:

1. private tx lock
2. xmit lock.

Do I miss something?

Regards,
Lino

^ permalink raw reply

* Re: [PATCH] stmmac: reduce code duplication getting basic descriptors
From: Alexandre Torgue @ 2016-12-02 14:09 UTC (permalink / raw)
  To: Pavel Machek, David Miller, Andrew Morton
  Cc: peppe.cavallaro, netdev, linux-kernel
In-Reply-To: <20161128121736.GD15034@amd>

Hi Pavel,

On 11/28/2016 01:17 PM, Pavel Machek wrote:
>
> Remove code duplication getting basic descriptors.

I agree with your patch, it will make the code easier to understand.
After fixing the kbuild issue you can add my Acked-by.

Regards
Alex

>
> Signed-off-by: Pavel Machek <pavel@denx.de>
>
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index f7133d0..ed20668 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -929,6 +929,17 @@ static int stmmac_set_bfsize(int mtu, int bufsize)
>  	return ret;
>  }
>
> +static inline struct dma_desc *stmmac_tx_desc(struct stmmac_priv *priv, int i)
> +{
> +	struct dma_desc *p;
> +	if (priv->extend_desc)
> +		p = &((priv->dma_etx + i)->basic);
> +	else
> +		p = priv->dma_tx + i;
> +	return p;
> +}
> +
> +
>  /**
>   * stmmac_clear_descriptors - clear descriptors
>   * @priv: driver private structure
> @@ -1078,11 +1089,7 @@ static int init_dma_desc_rings(struct net_device *dev, gfp_t flags)
>
>  	/* TX INITIALIZATION */
>  	for (i = 0; i < DMA_TX_SIZE; i++) {
> -		struct dma_desc *p;
> -		if (priv->extend_desc)
> -			p = &((priv->dma_etx + i)->basic);
> -		else
> -			p = priv->dma_tx + i;
> +		struct dma_desc *p = stmmac_tx_desc(priv, i);
>
>  		if (priv->synopsys_id >= DWMAC_CORE_4_00) {
>  			p->des0 = 0;
> @@ -1129,12 +1136,7 @@ static void dma_free_tx_skbufs(struct stmmac_priv *priv)
>  	int i;
>
>  	for (i = 0; i < DMA_TX_SIZE; i++) {
> -		struct dma_desc *p;
> -
> -		if (priv->extend_desc)
> -			p = &((priv->dma_etx + i)->basic);
> -		else
> -			p = priv->dma_tx + i;
> +		struct dma_desc *p = stmmac_tx_desc(priv, i);
>
>  		if (priv->tx_skbuff_dma[i].buf) {
>  			if (priv->tx_skbuff_dma[i].map_as_page)
> @@ -1314,14 +1316,9 @@ static void __stmmac_tx_clean(struct stmmac_priv *priv)
>
>  	while (entry != priv->cur_tx) {
>  		struct sk_buff *skb = priv->tx_skbuff[entry];
> -		struct dma_desc *p;
> +		struct dma_desc *p = stmmac_tx_desc(priv, entry);
>  		int status;
>
> -		if (priv->extend_desc)
> -			p = (struct dma_desc *)(priv->dma_etx + entry);
> -		else
> -			p = priv->dma_tx + entry;
> -
>  		status = priv->hw->desc->tx_status(&priv->dev->stats,
>  						      &priv->xstats, p,
>  						      priv->ioaddr);
> @@ -2227,11 +2224,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
>
>  	csum_insertion = (skb->ip_summed == CHECKSUM_PARTIAL);
>
> -	if (likely(priv->extend_desc))
> -		desc = (struct dma_desc *)(priv->dma_etx + entry);
> -	else
> -		desc = priv->dma_tx + entry;
> -
> +	desc = stmmac_tx_desc(priv, entry);
>  	first = desc;
>
>  	priv->tx_skbuff[first_entry] = skb;
> @@ -2254,10 +2247,7 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
>
>  		entry = STMMAC_GET_ENTRY(entry, DMA_TX_SIZE);
>
> -		if (likely(priv->extend_desc))
> -			desc = (struct dma_desc *)(priv->dma_etx + entry);
> -		else
> -			desc = priv->dma_tx + entry;
> +		desc = stmmac_tx_desc(priv, entry);
>
>  		des = skb_frag_dma_map(priv->device, frag, 0, len,
>  				       DMA_TO_DEVICE);
>

^ permalink raw reply

* Re: arp_filter and IPv6 ND
From: Hannes Frederic Sowa @ 2016-12-02 14:08 UTC (permalink / raw)
  To: Saku Ytti, netdev
In-Reply-To: <CAAeewD82Lfo7c=vLHR7p3oxJ8d0OgWrijKmNn8EPFKq7fN7AMg@mail.gmail.com>

On 02.12.2016 13:51, Saku Ytti wrote:
> net.ipv4.conf.all.arp_filter appears not to have IPv6 counter part.
> Or am I missing something? That is Linux does answer to ND queries for
> unrelated interfaces by default, and I can't seem to find way to turn
> that off.

May I ask why you want to turn it off?

In IPv6 this depends on the scope. In IPv4 this concept doesn't really
exist.

Please notice that in IPv4 arp_filter does not necessarily mean that the
system is operating in strong end system mode but you end up in a
hybrid clone where arp is acting strong but routing not and thus you
also have to add fib rules to simulate that.

> Is it proper maintainership to accept changes to single protocol,
> without mandating the support for other protocol having same
> behavioural characteristics?
> 
> It is good that some parts for ARP and ND have common code in linux
> (neighbour.c) unlike in BSD where everything seems to be
> self-contained.
>
> I'd wish that even more of ARP/ND would be common, because there are
> still lot of common behavioural code in ARP/ND code itself, which
> requires double maintenance and are implemented by different people at
> different times, so leads to different set of bugs and behaviour for
> same intended behaviour.
> 
> For example this feature should be protocol agnostic, developer should
> only need to develop it once for the higher level behavioural code,
> without minding which IP AFI it is for. Obviously that does not
> exclude ability to sysctl configure it on/off per AFI.

^ permalink raw reply

* [PATCH 1/6] net: stmmac: return error if no DMA configuration is found
From: Niklas Cassel @ 2016-12-02 14:11 UTC (permalink / raw)
  To: Giuseppe Cavallaro, Alexandre Torgue; +Cc: Niklas Cassel, netdev, linux-kernel

From: Niklas Cassel <niklas.cassel@axis.com>

All drivers except the pci glue layer call stmmac_probe_config_dt.
stmmac_probe_config_dt does a kzalloc of dma_cfg.

The pci glue layer does a kzalloc of dma_cfg explicitly, so all current
drivers do a kzalloc of dma_cfg.

Return an error if no DMA configuration is found, that way
we can assume that the DMA configuration always exists.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 1f9ec02fa7f8..04f88df7da49 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1577,16 +1577,12 @@ static void stmmac_check_ether_addr(struct stmmac_priv *priv)
  */
 static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 {
-	int pbl = DEFAULT_DMA_PBL, fixed_burst = 0, aal = 0;
-	int mixed_burst = 0;
 	int atds = 0;
 	int ret = 0;
 
-	if (priv->plat->dma_cfg) {
-		pbl = priv->plat->dma_cfg->pbl;
-		fixed_burst = priv->plat->dma_cfg->fixed_burst;
-		mixed_burst = priv->plat->dma_cfg->mixed_burst;
-		aal = priv->plat->dma_cfg->aal;
+	if (!priv->plat->dma_cfg) {
+		dev_err(priv->device, "DMA configuration not found\n");
+		return -EINVAL;
 	}
 
 	if (priv->extend_desc && (priv->mode == STMMAC_RING_MODE))
@@ -1598,8 +1594,12 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 		return ret;
 	}
 
-	priv->hw->dma->init(priv->ioaddr, pbl, fixed_burst, mixed_burst,
-			    aal, priv->dma_tx_phy, priv->dma_rx_phy, atds);
+	priv->hw->dma->init(priv->ioaddr,
+			    priv->plat->dma_cfg->pbl,
+			    priv->plat->dma_cfg->fixed_burst,
+			    priv->plat->dma_cfg->mixed_burst,
+			    priv->plat->dma_cfg->aal,
+			    priv->dma_tx_phy, priv->dma_rx_phy, atds);
 
 	if (priv->synopsys_id >= DWMAC_CORE_4_00) {
 		priv->rx_tail_addr = priv->dma_rx_phy +
-- 
2.1.4

^ permalink raw reply related

* [PATCH 2/6] net: stmmac: simplify the common DMA init API
From: Niklas Cassel @ 2016-12-02 14:11 UTC (permalink / raw)
  To: Giuseppe Cavallaro, Alexandre Torgue; +Cc: Niklas Cassel, netdev, linux-kernel
In-Reply-To: <1480687910-9690-1-git-send-email-niklass@axis.com>

From: Niklas Cassel <niklas.cassel@axis.com>

Use struct stmmac_dma_cfg *dma_cfg as an argument rather
than using all the struct members as individual arguments.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
---
 drivers/net/ethernet/stmicro/stmmac/common.h        |  4 ++--
 drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c | 13 +++++++------
 drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c  |  7 ++++---
 drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c    | 14 ++++++++------
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c   |  6 +-----
 5 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h
index 6d2de4e01f6d..3023ec7ae83e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -411,8 +411,8 @@ extern const struct stmmac_desc_ops ndesc_ops;
 struct stmmac_dma_ops {
 	/* DMA core initialization */
 	int (*reset)(void __iomem *ioaddr);
-	void (*init)(void __iomem *ioaddr, int pbl, int fb, int mb,
-		     int aal, u32 dma_tx, u32 dma_rx, int atds);
+	void (*init)(void __iomem *ioaddr, struct stmmac_dma_cfg *dma_cfg,
+		     u32 dma_tx, u32 dma_rx, int atds);
 	/* Configure the AXI Bus Mode Register */
 	void (*axi)(void __iomem *ioaddr, struct stmmac_axi *axi);
 	/* Dump DMA registers */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
index 990746955216..01d0d0f315e5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_dma.c
@@ -82,8 +82,9 @@ static void dwmac1000_dma_axi(void __iomem *ioaddr, struct stmmac_axi *axi)
 	writel(value, ioaddr + DMA_AXI_BUS_MODE);
 }
 
-static void dwmac1000_dma_init(void __iomem *ioaddr, int pbl, int fb, int mb,
-			       int aal, u32 dma_tx, u32 dma_rx, int atds)
+static void dwmac1000_dma_init(void __iomem *ioaddr,
+			       struct stmmac_dma_cfg *dma_cfg,
+			       u32 dma_tx, u32 dma_rx, int atds)
 {
 	u32 value = readl(ioaddr + DMA_BUS_MODE);
 
@@ -99,20 +100,20 @@ static void dwmac1000_dma_init(void __iomem *ioaddr, int pbl, int fb, int mb,
 	 */
 	value |= DMA_BUS_MODE_MAXPBL;
 	value &= ~DMA_BUS_MODE_PBL_MASK;
-	value |= (pbl << DMA_BUS_MODE_PBL_SHIFT);
+	value |= (dma_cfg->pbl << DMA_BUS_MODE_PBL_SHIFT);
 
 	/* Set the Fixed burst mode */
-	if (fb)
+	if (dma_cfg->fixed_burst)
 		value |= DMA_BUS_MODE_FB;
 
 	/* Mixed Burst has no effect when fb is set */
-	if (mb)
+	if (dma_cfg->mixed_burst)
 		value |= DMA_BUS_MODE_MB;
 
 	if (atds)
 		value |= DMA_BUS_MODE_ATDS;
 
-	if (aal)
+	if (dma_cfg->aal)
 		value |= DMA_BUS_MODE_AAL;
 
 	writel(value, ioaddr + DMA_BUS_MODE);
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
index 61f54c99a7de..e5664da382f3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac100_dma.c
@@ -32,11 +32,12 @@
 #include "dwmac100.h"
 #include "dwmac_dma.h"
 
-static void dwmac100_dma_init(void __iomem *ioaddr, int pbl, int fb, int mb,
-			      int aal, u32 dma_tx, u32 dma_rx, int atds)
+static void dwmac100_dma_init(void __iomem *ioaddr,
+			      struct stmmac_dma_cfg *dma_cfg,
+			      u32 dma_tx, u32 dma_rx, int atds)
 {
 	/* Enable Application Access by writing to DMA CSR0 */
-	writel(DMA_BUS_MODE_DEFAULT | (pbl << DMA_BUS_MODE_PBL_SHIFT),
+	writel(DMA_BUS_MODE_DEFAULT | (dma_cfg->pbl << DMA_BUS_MODE_PBL_SHIFT),
 	       ioaddr + DMA_BUS_MODE);
 
 	/* Mask interrupts by writing to CSR7 */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
index 577316de6ba8..0946546d6dcd 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c
@@ -97,27 +97,29 @@ static void dwmac4_dma_init_channel(void __iomem *ioaddr, int pbl,
 	writel(dma_rx_phy, ioaddr + DMA_CHAN_RX_BASE_ADDR(channel));
 }
 
-static void dwmac4_dma_init(void __iomem *ioaddr, int pbl, int fb, int mb,
-			    int aal, u32 dma_tx, u32 dma_rx, int atds)
+static void dwmac4_dma_init(void __iomem *ioaddr,
+			    struct stmmac_dma_cfg *dma_cfg,
+			    u32 dma_tx, u32 dma_rx, int atds)
 {
 	u32 value = readl(ioaddr + DMA_SYS_BUS_MODE);
 	int i;
 
 	/* Set the Fixed burst mode */
-	if (fb)
+	if (dma_cfg->fixed_burst)
 		value |= DMA_SYS_BUS_FB;
 
 	/* Mixed Burst has no effect when fb is set */
-	if (mb)
+	if (dma_cfg->mixed_burst)
 		value |= DMA_SYS_BUS_MB;
 
-	if (aal)
+	if (dma_cfg->aal)
 		value |= DMA_SYS_BUS_AAL;
 
 	writel(value, ioaddr + DMA_SYS_BUS_MODE);
 
 	for (i = 0; i < DMA_CHANNEL_NB_MAX; i++)
-		dwmac4_dma_init_channel(ioaddr, pbl, dma_tx, dma_rx, i);
+		dwmac4_dma_init_channel(ioaddr, dma_cfg->pbl,
+					dma_tx, dma_rx, i);
 }
 
 static void _dwmac4_dump_dma_regs(void __iomem *ioaddr, u32 channel)
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 04f88df7da49..27a03f7ee095 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1594,11 +1594,7 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 		return ret;
 	}
 
-	priv->hw->dma->init(priv->ioaddr,
-			    priv->plat->dma_cfg->pbl,
-			    priv->plat->dma_cfg->fixed_burst,
-			    priv->plat->dma_cfg->mixed_burst,
-			    priv->plat->dma_cfg->aal,
+	priv->hw->dma->init(priv->ioaddr, priv->plat->dma_cfg,
 			    priv->dma_tx_phy, priv->dma_rx_phy, atds);
 
 	if (priv->synopsys_id >= DWMAC_CORE_4_00) {
-- 
2.1.4

^ permalink raw reply related

* [PATCH 4/6] net: stmmac: dwmac1000: fix define DMA_BUS_MODE_RPBL_MASK
From: Niklas Cassel @ 2016-12-02 14:11 UTC (permalink / raw)
  To: Giuseppe Cavallaro, Alexandre Torgue; +Cc: Niklas Cassel, netdev, linux-kernel
In-Reply-To: <1480687910-9690-1-git-send-email-niklass@axis.com>

From: Niklas Cassel <niklas.cassel@axis.com>

DMA_BUS_MODE_RPBL_MASK is really 6 bits,
just like DMA_BUS_MODE_PBL_MASK.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
---
 drivers/net/ethernet/stmicro/stmmac/dwmac1000.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
index ff3e5ab39bd0..52b9407a8a39 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
@@ -225,7 +225,7 @@ enum rx_tx_priority_ratio {
 
 #define DMA_BUS_MODE_FB		0x00010000	/* Fixed burst */
 #define DMA_BUS_MODE_MB		0x04000000	/* Mixed burst */
-#define DMA_BUS_MODE_RPBL_MASK	0x003e0000	/* Rx-Programmable Burst Len */
+#define DMA_BUS_MODE_RPBL_MASK	0x007e0000	/* Rx-Programmable Burst Len */
 #define DMA_BUS_MODE_RPBL_SHIFT	17
 #define DMA_BUS_MODE_USP	0x00800000
 #define DMA_BUS_MODE_MAXPBL	0x01000000
-- 
2.1.4

^ permalink raw reply related

* [PATCH 3/6] net: stmmac: stmmac_platform: fix parsing of DT binding
From: Niklas Cassel @ 2016-12-02 14:11 UTC (permalink / raw)
  To: Giuseppe Cavallaro, Alexandre Torgue; +Cc: Niklas Cassel, netdev, linux-kernel
In-Reply-To: <1480687910-9690-1-git-send-email-niklass@axis.com>

From: Niklas Cassel <niklas.cassel@axis.com>

commit 64c3b252e9fc ("net: stmmac: fixed the pbl setting with DT")
changed the parsing of the DT binding.

Before 64c3b252e9fc, snps,fixed-burst and snps,mixed-burst were parsed
regardless if the property snps,pbl existed or not.
After the commit, fixed burst and mixed burst are only parsed if
snps,pbl exists. Now when snps,aal has been added, it too is only
parsed if snps,pbl exists.

Since the DT binding does not specify that fixed burst, mixed burst
or aal depend on snps,pbl being specified, undo changes introduced
by 64c3b252e9fc.

The issue commit 64c3b252e9fc ("net: stmmac: fixed the pbl setting with
DT") tries to address is solved in another way:
The databook specifies that all values other than
1, 2, 4, 8, 16, or 32 result in undefined behavior,
so snps,pbl = <0> is invalid.

If pbl is 0 after parsing, set pbl to DEFAULT_DMA_PBL.
This handles the case where the property is omitted, and also handles
the case where the property is specified without any data.

Signed-off-by: Niklas Cassel <niklas.cassel@axis.com>
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  3 +++
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  | 27 +++++++++++-----------
 2 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 27a03f7ee095..9ba2eb4c22fe 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1585,6 +1585,9 @@ static int stmmac_init_dma_engine(struct stmmac_priv *priv)
 		return -EINVAL;
 	}
 
+	if (!priv->plat->dma_cfg->pbl)
+		priv->plat->dma_cfg->pbl = DEFAULT_DMA_PBL;
+
 	if (priv->extend_desc && (priv->mode == STMMAC_RING_MODE))
 		atds = 1;
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
index b46c892c7079..05b8c33effd5 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
@@ -303,21 +303,20 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
 		plat->force_sf_dma_mode = 1;
 	}
 
-	if (of_find_property(np, "snps,pbl", NULL)) {
-		dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg),
-				       GFP_KERNEL);
-		if (!dma_cfg) {
-			of_node_put(plat->phy_node);
-			return ERR_PTR(-ENOMEM);
-		}
-		plat->dma_cfg = dma_cfg;
-		of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl);
-		dma_cfg->aal = of_property_read_bool(np, "snps,aal");
-		dma_cfg->fixed_burst =
-			of_property_read_bool(np, "snps,fixed-burst");
-		dma_cfg->mixed_burst =
-			of_property_read_bool(np, "snps,mixed-burst");
+	dma_cfg = devm_kzalloc(&pdev->dev, sizeof(*dma_cfg),
+			       GFP_KERNEL);
+	if (!dma_cfg) {
+		of_node_put(plat->phy_node);
+		return ERR_PTR(-ENOMEM);
 	}
+	plat->dma_cfg = dma_cfg;
+
+	of_property_read_u32(np, "snps,pbl", &dma_cfg->pbl);
+
+	dma_cfg->aal = of_property_read_bool(np, "snps,aal");
+	dma_cfg->fixed_burst = of_property_read_bool(np, "snps,fixed-burst");
+	dma_cfg->mixed_burst = of_property_read_bool(np, "snps,mixed-burst");
+
 	plat->force_thresh_dma_mode = of_property_read_bool(np, "snps,force_thresh_dma_mode");
 	if (plat->force_thresh_dma_mode) {
 		plat->force_sf_dma_mode = 0;
-- 
2.1.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox