Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFC PATCH net-next 1/5] tcp: Add TCP TRACE_EVENTs
From: Martin KaFai Lau @ 2014-12-15  1:56 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Hannes Frederic Sowa, Steven Rostedt,
	Lawrence Brakmo, Josef Bacik, Kernel Team
In-Reply-To: <1418608606-1569264-1-git-send-email-kafai@fb.com>

Add TRACE_EVENT when:
1. connection established
2. segs received
3. segs sending out
4. connection close

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/trace/events/tcp.h | 175 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/net-traces.c      |   1 +
 net/ipv4/tcp.c             |   6 +-
 net/ipv4/tcp_input.c       |   3 +
 net/ipv4/tcp_output.c      |   3 +
 5 files changed, 187 insertions(+), 1 deletion(-)
 create mode 100644 include/trace/events/tcp.h

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
new file mode 100644
index 0000000..81b40ef
--- /dev/null
+++ b/include/trace/events/tcp.h
@@ -0,0 +1,175 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM tcp
+
+#if !defined(_TRACE_TCP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TCP_H
+
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/tcp.h>
+#include <linux/skbuff.h>
+#include <linux/ipv6.h>
+#include <linux/tracepoint.h>
+#include <uapi/linux/in6.h>
+
+#define TCP_TRACE_ASSIGN_SA(e, sk)	do {				\
+	(e)->lport = inet_sk((sk))->inet_sport;				\
+	(e)->rport = inet_sk((sk))->inet_dport;				\
+	if ((sk)->sk_family == AF_INET) {				\
+		(e)->ipv6 = 0;						\
+		memset((e)->laddr, 0, sizeof((e)->laddr));		\
+		memset((e)->raddr, 0, sizeof((e)->raddr));		\
+		memcpy((e)->laddr, &inet_sk((sk))->inet_saddr,		\
+		       sizeof(inet_sk((sk))->inet_saddr));		\
+		memcpy((e)->raddr, &inet_sk((sk))->inet_daddr,		\
+		       sizeof(inet_sk((sk))->inet_daddr));		\
+	} else {							\
+		(e)->ipv6 = 1;						\
+		memcpy((e)->laddr, inet6_sk((sk))->saddr.s6_addr,	\
+		       sizeof((e)->laddr));				\
+		memcpy((e)->raddr, (sk)->sk_v6_daddr.s6_addr,		\
+		       sizeof((e)->raddr));				\
+	}								\
+} while (0)
+
+DECLARE_EVENT_CLASS(tcp,
+	TP_PROTO(struct sock *sk),
+	TP_ARGS(sk),
+	TP_STRUCT__entry(
+		__field(u8, ipv6)
+		__array(u8, laddr, 16)
+		__array(u8, raddr, 16)
+		__field(u16, lport)
+		__field(u16, rport)
+		__field(u32, snd_cwnd)
+		__field(u32, mss_cache)
+		__field(u32, ssthresh)
+		__field(u64, srtt_us)
+		__field(u32, rto_ms)
+	),
+	TP_fast_assign(
+		TCP_TRACE_ASSIGN_SA(__entry, sk);
+		__entry->snd_cwnd = tcp_sk(sk)->snd_cwnd;
+		__entry->mss_cache = tcp_sk(sk)->mss_cache;
+		__entry->ssthresh = tcp_current_ssthresh(sk);
+		__entry->srtt_us = tcp_sk(sk)->srtt_us >> 3;
+		__entry->rto_ms = jiffies_to_msecs(inet_csk(sk)->icsk_rto);
+	),
+	TP_printk("local=%s:%d remote=%s:%d snd_cwnd=%u mss_cache=%u "
+		  "ssthresh=%u srtt_us=%llu rto_ms=%u",
+		  __print_hex(__entry->laddr, 16),
+		  __entry->lport,
+		  __print_hex(__entry->raddr, 16),
+		  __entry->rport,
+		  __entry->snd_cwnd, __entry->mss_cache,
+		  __entry->ssthresh, __entry->srtt_us, __entry->rto_ms)
+);
+
+DEFINE_EVENT(tcp,
+	     tcp_established,
+	     TP_PROTO(struct sock *sk),
+	     TP_ARGS(sk)
+);
+
+DEFINE_EVENT(tcp,
+	     tcp_close,
+	     TP_PROTO(struct sock *sk),
+	     TP_ARGS(sk)
+);
+
+TRACE_EVENT(tcp_transmit_skb,
+	TP_PROTO(struct sock *sk, struct sk_buff *skb),
+	TP_ARGS(sk, skb),
+	TP_STRUCT__entry(
+		__field(u8, ipv6)
+		__array(u8, laddr, 16)
+		__array(u8, raddr, 16)
+		__field(u16, lport)
+		__field(u16, rport)
+		__field(u32, seq)
+		__field(u32, end_seq)
+		__field(u32, pcount)
+		__field(u8, ca_state)
+		__field(u32, snd_nxt)
+		__field(u32, snd_una)
+		__field(u32, snd_wnd)
+		__field(u32, snd_cwnd)
+		__field(u32, mss_cache)
+		__field(u32, ssthresh)
+		__field(u64, srtt_us)
+		__field(u32, rto_ms)
+	),
+	TP_fast_assign(
+		TCP_TRACE_ASSIGN_SA(__entry, sk);
+		__entry->seq = TCP_SKB_CB(skb)->seq;
+		__entry->end_seq = TCP_SKB_CB(skb)->end_seq;
+		__entry->pcount = tcp_skb_pcount(skb);
+		__entry->ca_state = inet_csk(sk)->icsk_ca_state;
+		__entry->snd_nxt = tcp_sk(sk)->snd_nxt;
+		__entry->snd_una = tcp_sk(sk)->snd_una;
+		__entry->snd_wnd = tcp_sk(sk)->snd_wnd;
+		__entry->snd_cwnd = tcp_sk(sk)->snd_cwnd;
+		__entry->mss_cache = tcp_sk(sk)->mss_cache;
+		__entry->ssthresh = tcp_current_ssthresh(sk);
+		__entry->srtt_us = tcp_sk(sk)->srtt_us >> 3;
+		__entry->rto_ms = jiffies_to_msecs(inet_csk(sk)->icsk_rto);
+	),
+	TP_printk("local=%s:%d remote=%s:%d "
+		  "skb_seq=%u skb_end_seq=%u pcount=%u ca_state=%x "
+		  "snd_nxt=%u snd_una=%u snd_wnd=%u snd_cwnd=%u mss_cache=%u "
+		  "ssthresh=%u srtt_us=%llu rto_ms=%u",
+		  __print_hex(__entry->laddr, 16), __entry->lport,
+		  __print_hex(__entry->raddr, 16), __entry->rport,
+
+		  __entry->seq, __entry->end_seq, __entry->pcount,
+		  __entry->ca_state,
+
+		  __entry->snd_nxt, __entry->snd_una, __entry->snd_wnd,
+		  __entry->snd_cwnd, __entry->mss_cache,
+
+		  __entry->ssthresh, __entry->srtt_us, __entry->rto_ms)
+);
+
+TRACE_EVENT(tcp_rcv_established,
+	    TP_PROTO(struct sock *sk, struct sk_buff *skb),
+	    TP_ARGS(sk, skb),
+	TP_STRUCT__entry(
+		__field(u8, ipv6)
+		__array(u8, laddr, 16)
+		__array(u8, raddr, 16)
+		__field(u16, lport)
+		__field(u16, rport)
+		__field(u32, seq)
+		__field(u32, end_seq)
+		__field(u32, ack_seq)
+		__field(u32, snd_una)
+		__field(u32, rcv_nxt)
+		__field(u32, rcv_wnd)
+	),
+	TP_fast_assign(
+		TCP_TRACE_ASSIGN_SA(__entry, sk);
+		__entry->seq = TCP_SKB_CB(skb)->seq;
+		__entry->end_seq = TCP_SKB_CB(skb)->end_seq;
+		__entry->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+		__entry->snd_una = tcp_sk(sk)->snd_una;
+		__entry->rcv_nxt = tcp_sk(sk)->rcv_nxt;
+		__entry->rcv_wnd = tcp_sk(sk)->rcv_wnd;
+	),
+	TP_printk("local=%s:%d remote=%s:%d "
+		  "skb_seq=%u skb_end_seq=%u skb_ack_seq=%u snd_una=%u "
+		  "rcv_nxt=%u, rcv_wnd=%u",
+		  __print_hex(__entry->laddr, 16), __entry->lport,
+		  __print_hex(__entry->raddr, 16), __entry->rport,
+
+		  __entry->seq, __entry->end_seq, __entry->ack_seq,
+		  __entry->snd_una,
+
+		  __entry->rcv_nxt, __entry->rcv_wnd)
+);
+
+#undef TCP_TRACE_ASSIGN_SA
+
+#endif /* _TRACE_TCP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index ba3c012..63f966b 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -31,6 +31,7 @@
 #include <trace/events/napi.h>
 #include <trace/events/sock.h>
 #include <trace/events/udp.h>
+#include <trace/events/tcp.h>
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3075723..3b887fa 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -275,6 +275,7 @@
 #include <net/xfrm.h>
 #include <net/ip.h>
 #include <net/sock.h>
+#include <trace/events/tcp.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -1901,8 +1902,10 @@ void tcp_set_state(struct sock *sk, int state)
 
 	switch (state) {
 	case TCP_ESTABLISHED:
-		if (oldstate != TCP_ESTABLISHED)
+		if (oldstate != TCP_ESTABLISHED) {
 			TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+			trace_tcp_established(sk);
+		}
 		break;
 
 	case TCP_CLOSE:
@@ -1913,6 +1916,7 @@ void tcp_set_state(struct sock *sk, int state)
 		if (inet_csk(sk)->icsk_bind_hash &&
 		    !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
 			inet_put_port(sk);
+		trace_tcp_close(sk);
 		/* fall through */
 	default:
 		if (oldstate == TCP_ESTABLISHED)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 075ab4d..808fad7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -75,6 +75,7 @@
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
 #include <linux/errqueue.h>
+#include <trace/events/tcp.h>
 
 int sysctl_tcp_timestamps __read_mostly = 1;
 int sysctl_tcp_window_scaling __read_mostly = 1;
@@ -5076,6 +5077,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	trace_tcp_rcv_established(sk, skb);
+
 	if (unlikely(sk->sk_rx_dst == NULL))
 		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
 	/*
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7f18262..9832512 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -41,6 +41,7 @@
 #include <linux/compiler.h>
 #include <linux/gfp.h>
 #include <linux/module.h>
+#include <trace/events/tcp.h>
 
 /* People can turn this off for buggy TCP's found in printers etc. */
 int sysctl_tcp_retrans_collapse __read_mostly = 1;
@@ -1014,6 +1015,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	/* Our usage of tstamp should remain private */
 	skb->tstamp.tv64 = 0;
 
+	trace_tcp_transmit_skb(sk, skb);
+
 	/* Cleanup our debris for IP stacks */
 	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
 			       sizeof(struct inet6_skb_parm)));
-- 
1.8.1

^ permalink raw reply related

* [RFC PATCH net-next 2/5] tcp: A perf script for TCP tracepoints
From: Martin KaFai Lau @ 2014-12-15  1:56 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Hannes Frederic Sowa, Steven Rostedt,
	Lawrence Brakmo, Josef Bacik, Kernel Team
In-Reply-To: <1418608606-1569264-1-git-send-email-kafai@fb.com>

A sample perf script.  It provides simple IP/port filtering and a summary output.

Here is a test with netem delay 100ms loss 0.1% and comparing the tcp-summary
output between reno and cubic.  It was run in a kvm environment:

[root@qemu1-centos65 perf]# sysctl -w net.ipv4.tcp_congestion_control=reno
net.ipv4.tcp_congestion_control = reno
[root@qemu1-centos65 perf]# ./perf record -a -e 'tcp:*' netperf -c -C -H 192.168.168.254 -l 60 -p 8888 -- -s 1M -S 1M -m 64K -P ,8889
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.168.254 () port 8889 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

2097152 425984  65536    60.78         2.91   0.00     0.86     0.000   771.397
[ perf record: Woken up 13 times to write data ]
[ perf record: Captured and wrote 3.231 MB perf.data (~141185 samples) ]
[root@qemu1 perf]# PYTHONPATH="$PYTHONPATH:/root/devhostshare/fb-kernel/linux/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace" ./perf script -s scripts/python/tcp-summary.py  -- --rport 8889
snd_cwnd(avg/min/max): 27/8/47
segs out: 15290
data segs out: 15275
octets out: 22116754
loss rxmits: 0
other rxmits: 13
rxmits%: 0.085
dup_acks: 402
established: 1
close: 1

[root@qemu1 perf]# sysctl -w net.ipv4.tcp_congestion_control=cubic
net.ipv4.tcp_congestion_control = cubic
[root@qemu1 perf]# ./perf record -a -e 'tcp:*' netperf -c -C -H 192.168.168.254 -l 60 -p 8888 -- -s 1M -S 1M -m 64K -P ,8889
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.168.254 () port 8889 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

2097152 425984  65536    60.25         4.97   0.02     0.86     1.096   454.051
[ perf record: Woken up 21 times to write data ]
[ perf record: Captured and wrote 5.525 MB perf.data (~241393 samples) ]
[root@qemu1 perf]# PYTHONPATH="$PYTHONPATH:/root/devhostshare/fb-kernel/linux/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace" ./perf script -s scripts/python/tcp-summary.py  -- --rport 8889
snd_cwnd(avg/min/max): 47/10/78
segs out: 25869
data segs out: 25848
octets out: 37426458
loss rxmits: 0
other rxmits: 19
rxmits%: 0.074
dup_acks: 957
established: 1
close: 1

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 tools/perf/scripts/python/tcp-summary.py | 262 +++++++++++++++++++++++++++++++
 1 file changed, 262 insertions(+)
 create mode 100644 tools/perf/scripts/python/tcp-summary.py

diff --git a/tools/perf/scripts/python/tcp-summary.py b/tools/perf/scripts/python/tcp-summary.py
new file mode 100644
index 0000000..fe85a43
--- /dev/null
+++ b/tools/perf/scripts/python/tcp-summary.py
@@ -0,0 +1,262 @@
+import os
+import sys
+import argparse
+import struct
+import socket
+
+sys.path.append(os.environ['PERF_EXEC_PATH'] + \
+	'/scripts/python/Perf-Trace-Util/lib/Perf/Trace')
+
+from perf_trace_context import *
+from Core import *
+
+def ntohip(nip):
+	hip = 0L
+	for i in nip:
+		hip <<= 8
+		hip |= i
+	return hip
+
+def unpack_nip(nip_in_c):
+	fmt = "!" + "B" * len(nip_in_c)
+	return struct.unpack(fmt, nip_in_c)
+
+def unpack_and_hostify_sa(laddr, lport, raddr, rport):
+	return (ntohip(unpack_nip(laddr)), socket.ntohs(lport),
+		ntohip(unpack_nip(raddr)), socket.ntohs(rport))
+
+class SimpleSubnet(object):
+	def _parse_ip4(self):
+		hip = 0L
+		dec_u8 = self.ip_str.split(".")
+		for i in dec_u8:
+			hip = hip << 8
+			hip |= long(i)
+		self.hip = hip
+
+	def _parse_ip6(self):
+		hip = 0L;
+		hex_u16	 = self.ip_str.split(":")
+		if self.ip_str.endswith("::"):
+			if hex_u16.count("") > 2:
+				raise Exception("Error in parsing IPv6")
+			cpz_zeros = 8 - len(hex_u16) + 2
+			hex_u16 = hex_u16[:-1]
+		elif "" in hex_u16:
+			if hex_u16.count("") > 1:
+				raise Exception("Error in parsing IPv6")
+			cpz_zeros = 8 - len(hex_u16) + 1
+		for i in hex_u16:
+			if len(i) == 0:
+				for j in range(cpz_zeros):
+					hip = hip << 16
+			else:
+				hip = hip << 16;
+				hip |= long(i, 16)
+		self.hip = hip
+
+	def _parse_ip(self, ip_str):
+		slash_start = ip_str.find("/")
+		if slash_start == -1:
+			self.ip_str = ip_str
+			self.plen = 0
+		else:
+			self.ip_str = ip_str[0:slash_start]
+			self.plen = int(ip_str[slash_start+1:])
+
+		if ':' in self.ip_str:
+			self._parse_ip6()
+			self.is_ip6 = True
+			if self.plen == 0:
+				self.plen = 128
+			self.netmask = 0xffffffffffffffffffffffffffffffffL >> (128 - self.plen) << (128 - self.plen)
+		else:
+			self._parse_ip4()
+			self.is_ip6 = False
+			if self.plen == 0:
+				self.plen = 32
+			self.netmask = 0xffffffffL >> (32 - self.plen) << (32 - self.plen)
+
+	def _min_ip(self):
+		return self.hip & self.netmask
+
+	def _max_ip(self):
+		if self.is_ip6:
+			return self._min_ip() | (0xffffffffffffffffffffffffffffffffL - self.netmask)
+		else:
+			return self._min_ip() | (0xffffffffL - self.netmask)
+
+	def __contains__(self, hip):
+		return self._min_ip() <= hip <= self._max_ip()
+
+	def __init__(self, ip_str):
+		self._parse_ip(ip_str)
+
+class SimpleFilter(object):
+	def __init__(self, lsn, lhport, rsn, rhport):
+		self.lsn = lsn
+		self.rsn = rsn
+		self.lhport = lhport
+		self.rhport = rhport
+
+	def match(self, lhip, lhport, rhip, rhport):
+		if self.lsn and lhip not in self.lsn:
+			return False
+		if self.rsn and rhip not in self.rsn:
+			return False
+		if self.lhport and self.lhport != lhport:
+			return False
+		if self.rhport and self.rhport != rhport:
+			return False
+
+		return True
+
+class TcpSummary(object):
+	def __init__(self):
+		self.snd_cwnd_sum = 0
+		self.snd_cwnd_count = 0
+		self.max_snd_cwnd = 0
+		self.min_snd_cwnd = 0xffffffff
+		self.segs_out = 0
+		self.data_segs_out = 0
+		self.data_octets_out = 0
+		self.loss_rxmits = 0
+		self.other_rxmits = 0
+		self.dup_acks = 0
+		self.established = 0
+		self.close = 0
+
+	def add_loss_rxmits(self, n):
+		self.loss_rxmits += n
+
+	def add_other_rxmits(self, n):
+		self.other_rxmits += n
+
+	def add_segs_out(self, n):
+		self.segs_out += n
+
+	def add_data_segs_out(self, n):
+		self.data_segs_out += n
+
+	def add_data_octets_out(self, n):
+		self.data_octets_out += n
+
+	def add_snd_cwnd_sample(self, n):
+		self.snd_cwnd_sum += n
+		self.snd_cwnd_count += 1
+		self.min_snd_cwnd = min(self.min_snd_cwnd, n)
+		self.max_snd_cwnd = max(self.max_snd_cwnd, n)
+
+	def inc_dup_acks(self):
+		self.dup_acks += 1
+
+	def inc_established(self):
+		self.established += 1
+
+	def inc_close(self):
+		self.close += 1
+
+	def report(self):
+		if self.snd_cwnd_count == 0:
+			avg_cwnd = 0
+			min_snd_cwnd = 0
+		else:
+			avg_cwnd = self.snd_cwnd_sum / self.snd_cwnd_count
+			min_snd_cwnd = self.min_snd_cwnd
+		rxmit_rate = (self.loss_rxmits + self.other_rxmits) * 100.0 / \
+			     self.data_segs_out if self.data_segs_out else 0
+		print "snd_cwnd(avg/min/max): %u/%u/%u" % (avg_cwnd,
+							   min_snd_cwnd,
+							   self.max_snd_cwnd)
+		print "segs out: %u" % self.segs_out
+		print "data segs out: %u" % self.data_segs_out
+		print "octets out: %u" % self.data_octets_out
+		print "loss rxmits: %u" % self.loss_rxmits
+		print "other rxmits: %u" % self.other_rxmits
+		print "rxmits%%: %.3f" % rxmit_rate
+		print "dup_acks: %u" % self.dup_acks
+		print "established: %u" % self.established
+		print "close: %u" % self.close
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--laddr", help="Local address in IP[/prefix-len]")
+parser.add_argument("--raddr", help="Remote address in IP[/prefix-len]")
+parser.add_argument("--lport", type=int, default=0, help="Local port")
+parser.add_argument("--rport", type=int, default=0, help="Remote port")
+args = parser.parse_args()
+lsn = SimpleSubnet(args.laddr) if args.laddr else None
+rsn = SimpleSubnet(args.raddr) if args.raddr else None
+addr_filter = SimpleFilter(lsn, args.lport, rsn, args.rport)
+tcp_summary = TcpSummary()
+
+def trace_begin():
+	pass
+
+def trace_end():
+	tcp_summary.report();
+
+def tcp__tcp_transmit_skb(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	common_callchain, ipv6, laddr, raddr, lport,
+	rport, seq, end_seq, pcount, ca_state,
+	snd_nxt, snd_una, snd_wnd, snd_cwnd, mss_cache,
+	ssthresh, srtt_us, rto_ms):
+
+	if not addr_filter.match(*unpack_and_hostify_sa(laddr, lport,
+							raddr, rport)):
+		return
+
+	tcp_summary.add_segs_out(pcount)
+	if end_seq > seq:
+		tcp_summary.add_snd_cwnd_sample(snd_cwnd)
+		if seq < snd_nxt:
+			if ca_state == 4:
+				tcp_summary.add_loss_rxmits(pcount)
+			else:
+				tcp_summary.add_other_rxmits(pcount)
+		else:
+			tcp_summary.add_data_segs_out(pcount)
+			tcp_summary.add_data_octets_out(end_seq - seq)
+
+def tcp__tcp_rcv_established(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	common_callchain, ipv6, laddr, raddr, lport,
+	rport, seq, end_seq, ack_seq, snd_una,
+	rcv_nxt, rcv_wnd):
+	if not addr_filter.match(*unpack_and_hostify_sa(laddr, lport,
+							raddr, rport)):
+		return
+
+	if seq == end_seq and ack_seq == snd_una:
+		tcp_summary.inc_dup_acks()
+
+def tcp__tcp_established(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	common_callchain, ipv6, laddr, raddr, lport,
+	rport, snd_cwnd, mss_cache, ssthresh, srtt_us,
+	rto_ms):
+
+	if not addr_filter.match(*unpack_and_hostify_sa(laddr, lport,
+							raddr, rport)):
+		return
+
+	tcp_summary.inc_established()
+
+def tcp__tcp_close(event_name, context, common_cpu,
+	common_secs, common_nsecs, common_pid, common_comm,
+	common_callchain, ipv6, laddr, raddr, lport,
+	rport, snd_cwnd, mss_cache, ssthresh, srtt_us,
+	rto_ms):
+
+	if not addr_filter.match(*unpack_and_hostify_sa(laddr, lport,
+							raddr, rport)):
+		return
+
+	tcp_summary.inc_close()
+
+def trace_unhandled(event_name, context, event_fields_dict):
+	print ' '.join(['%s=%s'%(k,str(v))for k,v in sorted(event_fields_dict.items())])
+
+def print_header(event_name, cpu, secs, nsecs, pid, comm):
+	print "%-20s %5u %05u.%09u %8u %-20s " % \
+	(event_name, cpu, secs, nsecs, pid, comm),
-- 
1.8.1

^ permalink raw reply related

* [RFC PATCH net-next 3/5] tcp: Add a few more tracepoints for tcp tracer
From: Martin KaFai Lau @ 2014-12-15  1:56 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Hannes Frederic Sowa, Steven Rostedt,
	Lawrence Brakmo, Josef Bacik, Kernel Team
In-Reply-To: <1418608606-1569264-1-git-send-email-kafai@fb.com>

The tcp tracer, which will be added in a later patch of this series, depends
on these tracepoints to collect statistics.

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/trace/events/tcp.h | 15 +++++++++++++++
 net/ipv4/tcp_input.c       |  6 ++++++
 2 files changed, 21 insertions(+)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 81b40ef..440a26b 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -167,6 +167,21 @@ TRACE_EVENT(tcp_rcv_established,
 		  __entry->rcv_nxt, __entry->rcv_wnd)
 );
 
+DECLARE_TRACE(tcp_ooo_rcv,
+	      TP_PROTO(struct sock *sk),
+	      TP_ARGS(sk)
+);
+
+DECLARE_TRACE(tcp_sacks_rcv,
+	     TP_PROTO(struct sock *sk, int num_sacks),
+	     TP_ARGS(sk, num_sacks)
+);
+
+DECLARE_TRACE(tcp_rtt_sample,
+	      TP_PROTO(struct sock *sk, long rtt_sample_us),
+	      TP_ARGS(sk, rtt_sample_us)
+);
+
 #undef TCP_TRACE_ASSIGN_SA
 
 #endif /* _TRACE_TCP_H */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 808fad7..1f82987 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1650,6 +1650,8 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
 	int i, j;
 	int first_sack_index;
 
+	trace_tcp_sacks_rcv(sk, num_sacks);
+
 	state.flag = 0;
 	state.reord = tp->packets_out;
 	state.rtt_us = -1L;
@@ -2932,6 +2934,9 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
 
 	/* RFC6298: only reset backoff on valid RTT measurement. */
 	inet_csk(sk)->icsk_backoff = 0;
+
+	trace_tcp_rtt_sample(sk, seq_rtt_us);
+
 	return true;
 }
 
@@ -4232,6 +4237,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
 	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
 		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+	trace_tcp_ooo_rcv(sk);
 
 	skb1 = skb_peek_tail(&tp->out_of_order_queue);
 	if (!skb1) {
-- 
1.8.1

^ permalink raw reply related

* Potential bugs found in 8139too
From: Jia-Ju Bai @ 2014-12-15  2:35 UTC (permalink / raw)
  To: netdev

Recently I tested the Linux 3.17.2 device drivers and found some potential bugs.

The target file is drivers/net/ethernet/realtek/8139too.c, which is used to
build 8139too.ko. I hope you can help me check my findings:
[1] In the normal process of 8139too, netif_napi_add is called in
rtl8139_init_one, but netif_napi_del is not called in rtl8139_remove_one.
However, many other ethernet card drivers call them in pairs, even in the
error handling paths, such as r8169 and igb.
[2] In the normal process of 8139too, pci_enable_device and
pci_disable_device are called in pairs in rtl8139_init_board (in
rtl8139_init_one) and rtl8139_remove_one. However, when pci_enable_device
has been called and pci_request_regions then fails in rtl8139_init_board,
the "err_out" segment in rtl8139_init_board is executed immediately to exit,
but pci_disable_device is not called because "disable_dev_on_err = 0".

Could you help me check these findings? Thank you very much, and I'm looking
forward to your reply.

--
Jia-Ju Bai

^ permalink raw reply

* Potential bugs found in via-rhine
From: Jia-Ju Bai @ 2014-12-15  2:46 UTC (permalink / raw)
  To: netdev

Recently I tested the Linux 3.17.2 device drivers and found some potential bugs.
The target file is drivers/net/ethernet/via/via-rhine.c, which is used to
build via-rhine.ko. I hope you can help me check my findings:

[1] In the normal process, netif_napi_add is called in
rhine_init_one_common, but netif_napi_del is not called in
rhine_remove_one_pci. However, many other ethernet card drivers call them in
pairs, even in the error handling paths, such as r8169 and igb.
[2] In my running test, if dma_mapping_error in alloc_rbufs reports a mapping
failure, a system crash occurs when executing netif_receive_skb in
rhine_napipoll, but I cannot find the specific reason.

Could you help me check these findings? Thank you very much, and I'm looking
forward to your reply.

--
Jia-Ju Bai

^ permalink raw reply

* Potential bugs found in igb
From: Jia-Ju Bai @ 2014-12-15  3:22 UTC (permalink / raw)
  To: netdev
In-Reply-To: <005801d0180e$6a672de0$3f3589a0$@163.com>

Recently I tested the device drivers in Linux 3.17.2 and found some
potential bugs.

igb driver:
[1] In the normal process of igb, pci_enable_pcie_error_reporting and
pci_disable_pcie_error_reporting are called in pairs in igb_probe and
igb_remove. However, when pci_enable_pcie_error_reporting has been called
and alloc_etherdev_mqs in igb_probe fails, the "err_alloc_etherdev" segment
in igb_probe is executed immediately to exit, but
pci_disable_pcie_error_reporting is not called.
[2] The same situation with [1] happens when pci_iomap in igb_probe is
failed.
[3] The same situation with [1] happens when igb_sw_init in igb_probe is
failed.
[4] The same situation with [1] happens when register_netdev in igb_probe is
failed.
[5] The same situation with [1] happens when igb_init_i2c in igb_probe is
failed.
[6] The function kcalloc is called by igb_sw_init when initializing the
ethernet card driver, but kfree is not called when register_netdev in
igb_probe is failed, which may cause memory leak.
[7] The same situation with [6] happens when igb_init_i2c in igb_probe is
failed.
[8] The same situation with [6] happens when kzalloc in igb_alloc_q_vector
is failed.
[9] The same situation with [6] happens when igb_alloc_q_vector in
igb_alloc_q_vectors is failed.
[10] When igb_init_i2c in igb_probe is failed, igb_enable_sriov is called in
igb_probe_vfs, but igb_disable_sriov is not called.
[11] The same situation with [10] happens when register_netdev in igb_probe
is failed.

Could you help me check these findings? Thank you very much, and I'm looking
forward to your reply.

--
Jia-Ju Bai

^ permalink raw reply

* Potential bugs found in e1000/e1000e
From: Jia-Ju Bai @ 2014-12-15  3:23 UTC (permalink / raw)
  To: netdev
In-Reply-To: <001301d0180a$f7cb74b0$e7625e10$@163.com>

Recently I tested the device drivers in Linux 3.17.2 and found some
potential bugs.

e1000 driver:
The target file is drivers/net/ethernet/intel/e1000/e1000_main.c, which is
used to build e1000.ko. I hope you can help me check my findings:
[1] In the normal process, netif_napi_add is called in e1000_probe, but
netif_napi_del is not called in e1000_remove. However, many other ethernet
card drivers call them in pairs, even in the error handling paths, such as
r8169 and igb.

e1000e driver:
The target file is drivers/net/ethernet/intel/e1000e/netdev.c, which is used
to build e1000e.ko. I hope you can help me check my findings:
[1] In the normal process, netif_napi_add is called in e1000_probe, but
netif_napi_del is not called in e1000_remove. However, many other ethernet
card drivers call them in pairs, even in the error handling paths, such as
r8169 and igb.
[2] The function vzalloc is called by e1000e_setup_rx_resources (in
e1000_open) when initializing the ethernet card driver. But when vzalloc
fails, the "err" segment in e1000e_setup_rx_resources is executed to return,
and then e1000e_free_tx_resources in the "err_setup_rx" segment in e1000_open
is executed to halt. However, the "writel(0, tx_ring->head)" statement in
e1000_clean_tx_ring in e1000e_free_tx_resources will cause a system crash,
because "tx_ring->head" has not been assigned a value yet. In the code,
"tx_ring->head" is initialized in e1000_configure_tx in e1000_configure,
which runs after e1000e_setup_rx_resources.
[3] The same system crash as in [2] happens when kcalloc in
e1000e_setup_rx_resources fails (returns NULL).
[4] The same system crash as in [2] happens when e1000_alloc_ring_dma in
e1000e_setup_rx_resources fails (returns an error code).
[5] In the normal process of e1000e, pci_enable_pcie_error_reporting and
pci_disable_pcie_error_reporting is called in pairs in e1000_probe and
e1000_remove. However, when pci_enable_pcie_error_reporting has been called
and pci_save_state in e1000_probe is failed, "err_alloc_etherdev" segment in
e1000_probe is executed immediately to exit, but
pci_disable_pcie_error_reporting is not called.
[6] The same situation with [5] happens when alloc_etherdev_mqs in
e1000_probe is failed.
[7] The same situation with [5] happens when ioremap in e1000_probe is
failed.
[8] The same situation with [5] happens when e1000_sw_init in e1000_probe is
failed.
[9] The same situation with [5] happens when register_netdev in e1000_probe
is failed.
[10] When request_irq in e1000_request_irq is failed, pm_qos_add_request in
e1000_open is called, but pm_qos_remove_request is not called.

Could you help me check these findings? Thank you very much, and I'm looking
forward to your reply.

--
Jia-Ju Bai

^ permalink raw reply

* Potential bugs found in e100
From: Jia-Ju Bai @ 2014-12-15  3:24 UTC (permalink / raw)
  To: netdev
In-Reply-To: <001101d01809$b58ccbe0$20a663a0$@163.com>

Recently I tested the device drivers in Linux 3.17.2 and found some
potential bugs.

e100 driver:
The target file is drivers/net/ethernet/intel/e100.c, which is used to build
e100.ko. I hope you can help me check my findings:
[1] The function pci_pool_create is called by e100_probe when initializing
the ethernet card driver. But when pci_pool_create fails, which means
that it returns NULL to nic->cbs_pool, a system crash will happen, because
pci_pool_alloc (in e100_alloc_cbs in e100_up in e100_open) needs to use
nic->cbs_pool to allocate the resource, but it is NULL. I suggest adding a
check to the code to detect whether pci_pool_create returns
NULL.
[2] In the normal process, netif_napi_add is called in e100_probe, but
netif_napi_del is not called in e100_remove. However, many other ethernet
card drivers call them in pairs, even in the error handling paths, such as
r8169 and igb.

Could you help me check these findings? Thank you very much, and I'm looking
forward to your reply.

--
Jia-Ju Bai

^ permalink raw reply

* Re: [RFC PATCH net-next 0/5] tcp: TCP tracer
From: Alexei Starovoitov @ 2014-12-15  6:55 UTC (permalink / raw)
  To: Martin KaFai Lau
  Cc: netdev@vger.kernel.org, David S. Miller, Hannes Frederic Sowa,
	Steven Rostedt, Lawrence Brakmo, Josef Bacik, Kernel Team

On Sun, Dec 14, 2014 at 5:56 PM, Martin KaFai Lau <kafai@fb.com> wrote:
> Hi,
>
> We have been using the kernel ftrace infra to collect TCP per-flow statistics.
> The following patch set is a first slim-down version of our
> existing implementation. We would like to get some early feedback
> and make it useful for others.
>
> [RFC PATCH net-next 1/5] tcp: Add TCP TRACE_EVENTs:
> Defines some basic tracepoints (by TRACE_EVENT).
>
> [RFC PATCH net-next 2/5] tcp: A perf script for TCP tracepoints:
> A sample perf script with simple ip/port filtering and summary output.
>
> [RFC PATCH net-next 3/5] tcp: Add a few more tracepoints for tcp tracer:
> Declares a few more tracepoints (by DECLARE_TRACE) which are
> used by the tcp_tracer.  The tcp_tracer is in the patch 5/5.
>
> [RFC PATCH net-next 4/5] tcp: Introduce tcp_sk_trace and related structs:
> Defines a few tcp_trace structs which are used to collect statistics
> on each tcp_sock.
>
> [RFC PATCH net-next 5/5] tcp: Add TCP tracer:
> It introduces a tcp_tracer which hooks onto the tracepoints defined in the
> patch 1/5 and 3/5.  It collects data defined in patch 4/5. We currently
> use this tracer to collect per-flow statistics.  The commit log has
> some more details.

I think patches 1 and 3 are good additions, since they establish
a few permanent points of instrumentation in the tcp stack.
Patches 4-5 look more like use cases of tracepoints established
before. They may feel like simple additions and, no doubt,
they are useful, but since they expose things via tracing
infra they become part of api and cannot be changed later,
when more stats would be needed.
I think systemtap like scripting on top of patches 1 and 3
should solve your use case ?
Also, have you looked at recent eBPF work?
Though it's not completely ready yet, soon it should
be able to do the same stats collection as you have
in 4/5 without adding permanent pieces to the kernel.

^ permalink raw reply

* [PATCH net-next] fib_trie.txt: fix typo
From: Duan Jiong @ 2014-12-15  6:58 UTC (permalink / raw)
  To: David Miller; +Cc: netdev


Fix the typo: "Tt" should be "It".
Also fix the whitespace errors detected by checkpatch.pl.

Signed-off-by: Duan Jiong <duanj.fnst@cn.fujitsu.com>
---
 Documentation/networking/fib_trie.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/fib_trie.txt b/Documentation/networking/fib_trie.txt
index 0723db7..fe71938 100644
--- a/Documentation/networking/fib_trie.txt
+++ b/Documentation/networking/fib_trie.txt
@@ -73,8 +73,8 @@ trie_leaf_remove()
 
 trie_rebalance()
 	The key function for the dynamic trie after any change in the trie
-	it is run to optimize and reorganize. Tt will walk the trie upwards 
-	towards the root from a given tnode, doing a resize() at each step 
+	it is run to optimize and reorganize. It will walk the trie upwards
+	towards the root from a given tnode, doing a resize() at each step
 	to implement level compression.
 
 resize()
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH net, v2] gre: fix the inner mac header in nbma tunnel xmit path
From: Timo Teräs @ 2014-12-15  7:24 UTC (permalink / raw)
  To: David Miller, netdev; +Cc: Timo Teräs, Tom Herbert, Alexander Duyck
In-Reply-To: <20141211.150706.1736198745550279282.davem@davemloft.net>

The NBMA GRE tunnels temporarily push a GRE header that contains the
per-packet NBMA destination on the skb via header ops early in the xmit
path. It is then later pulled before the real GRE header is constructed.

The inner mac was thus set differently in nbma case: the GRE header
has been pushed by neighbor layer, and mac header points to beginning
of the temporary gre header (set by dev_queue_xmit).

Now that the offloads expect mac header to point to the gre payload,
fix the xmit path to:
 - pull first the temporary gre header away
 - and reset mac header to point to gre payload

This fixes tso to work again with nbma tunnels.

Fixes: 14051f0452a2 ("gre: Use inner mac length when computing tunnel length")
Signed-off-by: Timo Teräs <timo.teras@iki.fi>
Cc: Tom Herbert <therbert@google.com>
Cc: Alexander Duyck <alexander.h.duyck@redhat.com>
---
Fixed the stupid typo of skb_reset_mac_header()'s argument. And yes,
I did recompile, retest and verify that this works.

Comments from v1 apply:

Though, normally mac header does point to the beginning of the hardware header.
E.g. in ethernet case it's pointing to the ethernet header. But now in gre
case it's instead pointing to the payload which seems counter-intuitive to me.
But I guess tunnels are a bit of special case, and there's valid reasons to
have it point to tunnel payload too.

Applying this patch on top of the Tom's previous fix of 14051f0452a2 seems to
now make my dmvpn scenario work again. So this should go to -stable too
(at least 3.14).

 net/ipv4/ip_gre.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 94213c8..b40b90d 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -250,10 +250,6 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	const struct iphdr *tnl_params;

-	skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
-	if (IS_ERR(skb))
-		goto out;
-
 	if (dev->header_ops) {
 		/* Need space for new headers */
 		if (skb_cow_head(skb, dev->needed_headroom -
@@ -266,6 +262,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 		 * to gre header.
 		 */
 		skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
+		skb_reset_mac_header(skb);
 	} else {
 		if (skb_cow_head(skb, dev->needed_headroom))
 			goto free_skb;
@@ -273,6 +270,10 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 		tnl_params = &tunnel->parms.iph;
 	}

+	skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
+	if (IS_ERR(skb))
+		goto out;
+
 	__gre_xmit(skb, dev, tnl_params, skb->protocol);

 	return NETDEV_TX_OK;
-- 
2.2.0

^ permalink raw reply related

* Re: Multicast packets being lost (3.10 stable)
From: Simon Horman @ 2014-12-15  8:04 UTC (permalink / raw)
  To: David Miller
  Cc: linus.luessing, shemming, netdev, bridge, gregkh, openwrt-devel
In-Reply-To: <20141213.153741.1636406298389279104.davem@davemloft.net>

On Sat, Dec 13, 2014 at 03:37:41PM -0500, David Miller wrote:
> From: Linus Lüssing <linus.luessing@c0d3.blue>
> Date: Wed, 10 Dec 2014 20:16:33 +0100
> 
> > did you have a chance to look into backporting these fixes for
> > stable yet?
> 
> I am not submitting -stable fixes back to 3.10 any longer, at most
> I am doing 4 -stable releases and right now that is 3.18, 3.17,
> v3.14, and v3.12

Hi Dave,

is there a method for people to get networking -stable fixes into older
(longterm) stable releases? Would it be best if people submitted them to
-stable themselves?

^ permalink raw reply

* [PATCH 0/5] e1000e: fix nic not boot after rebooting
From: Zhu Yanjun @ 2014-12-15  8:39 UTC (permalink / raw)
  To: netdev, w, zyjzyj2000; +Cc: Zhu Yanjun

With kernel 2.6.x, e1000e with 82577/8/9 sometimes will not boot
after rebooting. 

If a kernel 2.6.x board with 82577/8/9 e1000e nic is rebooted for 
100 times, there are 7~8 times that 82577/8/9 e1000e nic will not boot 
normally.

Zhu Yanjun (5):
  e1000e: reset MAC-PHY interconnect on 82577/82578
  e1000e: workaround EEPROM configuration change on 82579 on kernel
    2.6.x
  e1000e: do not toggle LANPHYPC value bit when PHY reset is blocked
  e1000e: update workaround for 82579 intermittently disabled during
    S0->Sx
  e1000e: cleanup use of check_reset_block function pointer

 drivers/net/e1000e/defines.h |  2 ++
 drivers/net/e1000e/hw.h      |  1 +
 drivers/net/e1000e/ich8lan.c | 37 +++++++++++++++++++++++++++++++++++++
 3 files changed, 40 insertions(+)

-- 
1.9.1

^ permalink raw reply

* [PATCH 1/5] e1000e: reset MAC-PHY interconnect on 82577/82578
From: Zhu Yanjun @ 2014-12-15  8:39 UTC (permalink / raw)
  To: netdev, w, zyjzyj2000
  Cc: Zhu Yanjun, Bruce Allan, Jeff Kirsher, David S. Miller
In-Reply-To: <1418632754-16698-1-git-send-email-Yanjun.Zhu@windriver.com>

2.6.x kernels require a similar logic change as commit 901b2b95
[e1000e: reset MAC-PHY interconnect on 82577/82578] introduces
for newer kernels.

During Sx->S0 transitions, the interconnect between the MAC and PHY on
82577/82578 can remain in SMBus mode instead of transitioning to the
PCIe-like mode required during normal operation.  Toggling the LANPHYPC
Value bit essentially resets the interconnect forcing it to the correct
mode.

after review of all intel drivers, found several instances where
drivers had the incorrect pattern of:
memory mapped write();
delay();

which should always be:
memory mapped write();
write flush(); /* aka memory mapped read */
delay();

explanation:
The reason for including the flush is that writes can be held
(posted) in PCI/PCIe bridges, but the read always has to complete
synchronously and therefore has to flush all pending writes to a
device.  If a write is held and followed by a delay, the delay
means nothing because the write may not have reached hardware
(maybe even not until the next read)

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Zhu Yanjun <Yanjun.Zhu@windriver.com>
---
 drivers/net/e1000e/defines.h |  2 ++
 drivers/net/e1000e/ich8lan.c | 20 ++++++++++++++++++++
 2 files changed, 22 insertions(+)

diff --git a/drivers/net/e1000e/defines.h b/drivers/net/e1000e/defines.h
index 1190167..52283a6 100644
--- a/drivers/net/e1000e/defines.h
+++ b/drivers/net/e1000e/defines.h
@@ -214,6 +214,8 @@
 #define E1000_CTRL_SPD_1000 0x00000200  /* Force 1Gb */
 #define E1000_CTRL_FRCSPD   0x00000800  /* Force Speed */
 #define E1000_CTRL_FRCDPX   0x00001000  /* Force Duplex */
+#define E1000_CTRL_LANPHYPC_OVERRIDE 0x00010000 /* SW control of LANPHYPC */
+#define E1000_CTRL_LANPHYPC_VALUE    0x00020000 /* SW value of LANPHYPC */
 #define E1000_CTRL_SWDPIN0  0x00040000  /* SWDPIN 0 value */
 #define E1000_CTRL_SWDPIN1  0x00080000  /* SWDPIN 1 value */
 #define E1000_CTRL_SWDPIO0  0x00400000  /* SWDPIN 0 Input or output */
diff --git a/drivers/net/e1000e/ich8lan.c b/drivers/net/e1000e/ich8lan.c
index de39f9a..020657c 100644
--- a/drivers/net/e1000e/ich8lan.c
+++ b/drivers/net/e1000e/ich8lan.c
@@ -88,6 +88,8 @@
 
 
 #define E1000_ICH_FWSM_RSPCIPHY	0x00000040 /* Reset PHY on PCI Reset */
+/* FW established a valid mode */ 
+#define E1000_ICH_FWSM_FW_VALID                0x00008000
 
 #define E1000_ICH_MNG_IAMT_MODE		0x2
 
@@ -260,6 +262,7 @@ static inline void __ew32flash(struct e1000_hw *hw, unsigned long reg, u32 val)
 static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw)
 {
 	struct e1000_phy_info *phy = &hw->phy;
+	u32 ctrl;
 	s32 ret_val = 0;
 
 	phy->addr                     = 1;
@@ -274,6 +277,23 @@ static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw)
 	phy->ops.write_phy_reg_locked = e1000_write_phy_reg_hv_locked;
 	phy->autoneg_mask             = AUTONEG_ADVERTISE_SPEED_DEFAULT;
 
+	if (!(er32(FWSM) & E1000_ICH_FWSM_FW_VALID)) {
+		/*
+		 * The MAC-PHY interconnect may still be in SMBus mode
+		 * after Sx->S0.  Toggle the LANPHYPC Value bit to force
+		 * the interconnect to PCIe mode, but only if there is no
+		 * firmware present otherwise firmware will have done it.
+		*/
+		ctrl = er32(CTRL);
+		ctrl |=  E1000_CTRL_LANPHYPC_OVERRIDE;
+		ctrl &= ~E1000_CTRL_LANPHYPC_VALUE;
+		ew32(CTRL, ctrl);
+		e1e_flush();
+		udelay(10);
+		ctrl &= ~E1000_CTRL_LANPHYPC_OVERRIDE;
+		ew32(CTRL, ctrl);
+		msleep(50);
+	}
 	/*
 	 * Reset the PHY before any acccess to it.  Doing so, ensures that
 	 * the PHY is in a known good state before we read/write PHY registers.
-- 
1.9.1

^ permalink raw reply related

* [PATCH 2/5] e1000e: workaround EEPROM configuration change on 82579 on kernel 2.6.x
From: Zhu Yanjun @ 2014-12-15  8:39 UTC (permalink / raw)
  To: netdev, w, zyjzyj2000; +Cc: Zhu Yanjun, Bruce Allan, Jeff Kirsher
In-Reply-To: <1418632754-16698-1-git-send-email-Yanjun.Zhu@windriver.com>

2.6.x kernels require a similar logic change as commit 62f1d8d1
[e1000e: workaround EEPROM configuration change on 82579] introduces
for newer kernels.

An update to the EEPROM on 82579 will extend a delay in hardware to fix an
issue with WoL not working after a G3->S5 transition which is unrelated to
the driver.  However, this extended delay conflicts with nominal operation
of the device when it is initialized by the driver and after every reset
of the hardware (i.e. the driver starts configuring the device before the
hardware is done with its own configuration work).  The workaround for
when the driver is in control of the device is to tell the hardware after
every reset the configuration delay should be the original shorter one.

Some pre-existing variables are renamed generically to be re-used with
new register accesses.

[e1000_toggle_lanphypc_value_ich8lan does not exist. Its implementations
exist in e1000_init_phy_params_pchlan. Renamed variables remain unchanged]

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Zhu Yanjun <Yanjun.Zhu@windriver.com>
---
 drivers/net/e1000e/hw.h      |  1 +
 drivers/net/e1000e/ich8lan.c | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/drivers/net/e1000e/hw.h b/drivers/net/e1000e/hw.h
index 11f3b7c..b055d78 100644
--- a/drivers/net/e1000e/hw.h
+++ b/drivers/net/e1000e/hw.h
@@ -60,6 +60,7 @@ enum e1e_registers {
 	E1000_FEXTNVM  = 0x00028, /* Future Extended NVM - RW */
 	E1000_FCT      = 0x00030, /* Flow Control Type - RW */
 	E1000_VET      = 0x00038, /* VLAN Ether Type - RW */
+	E1000_FEXTNVM3 = 0x0003C, /* Future Extended NVM 3 - RW */
 	E1000_ICR      = 0x000C0, /* Interrupt Cause Read - R/clr */
 	E1000_ITR      = 0x000C4, /* Interrupt Throttling Rate - RW */
 	E1000_ICS      = 0x000C8, /* Interrupt Cause Set - WO */
diff --git a/drivers/net/e1000e/ich8lan.c b/drivers/net/e1000e/ich8lan.c
index 020657c..c4b2d15 100644
--- a/drivers/net/e1000e/ich8lan.c
+++ b/drivers/net/e1000e/ich8lan.c
@@ -108,6 +108,9 @@
 #define E1000_FEXTNVM_SW_CONFIG		1
 #define E1000_FEXTNVM_SW_CONFIG_ICH8M (1 << 27) /* Bit redefined for ICH8M :/ */
 
+#define E1000_FEXTNVM3_PHY_CFG_COUNTER_MASK    0x0C000000
+#define E1000_FEXTNVM3_PHY_CFG_COUNTER_50MSEC  0x08000000
+
 #define PCIE_ICH8_SNOOP_ALL		PCIE_NO_SNOOP_ALL
 
 #define E1000_ICH_RAR_ENTRIES		7
@@ -278,6 +281,12 @@ static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw)
 	phy->autoneg_mask             = AUTONEG_ADVERTISE_SPEED_DEFAULT;
 
 	if (!(er32(FWSM) & E1000_ICH_FWSM_FW_VALID)) {
+		/*Set Phy Config Counter to 50msec */
+		ctrl = er32(FEXTNVM3);
+		ctrl &= ~E1000_FEXTNVM3_PHY_CFG_COUNTER_MASK;
+		ctrl |= E1000_FEXTNVM3_PHY_CFG_COUNTER_50MSEC;
+		ew32(FEXTNVM3, ctrl);
+
 		/*
 		 * The MAC-PHY interconnect may still be in SMBus mode
 		 * after Sx->S0.  Toggle the LANPHYPC Value bit to force
@@ -2685,6 +2694,14 @@ static s32 e1000_reset_hw_ich8lan(struct e1000_hw *hw)
 	ew32(CTRL, (ctrl | E1000_CTRL_RST));
 	msleep(20);
 
+	/* Set Phy Config Counter to 50msec */
+	if (hw->mac.type == e1000_pch2lan) {
+		u32 phycc_reg = er32(FEXTNVM3);
+		phycc_reg &= ~E1000_FEXTNVM3_PHY_CFG_COUNTER_MASK;
+		phycc_reg |= E1000_FEXTNVM3_PHY_CFG_COUNTER_50MSEC;
+		ew32(FEXTNVM3, phycc_reg);
+	}
+
 	if (!ret_val)
 		e1000_release_swflag_ich8lan(hw);
 
-- 
1.9.1

^ permalink raw reply related

* [PATCH 3/5] e1000e: do not toggle LANPHYPC value bit when PHY reset is blocked
From: Zhu Yanjun @ 2014-12-15  8:39 UTC (permalink / raw)
  To: netdev, w, zyjzyj2000; +Cc: Zhu Yanjun, Bruce Allan, Jeff Kirsher
In-Reply-To: <1418632754-16698-1-git-send-email-Yanjun.Zhu@windriver.com>

2.6.x kernels require a similar logic change as commit b7d6e335
[e1000e: do not toggle LANPHYPC value bit when PHY reset is blocked]
introduces for newer kernels.

When PHY reset is intentionally blocked on 82577/8/9, do not toggle the
LANPHYPC value bit (essentially performing a hard power reset of the
device) otherwise the PHY can be put into an unknown state.

Cleanup whitespace in the same function.

[yanjun.zhu: whitespace remains unchanged]

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Zhu Yanjun <Yanjun.Zhu@windriver.com>
---
 drivers/net/e1000e/ich8lan.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/e1000e/ich8lan.c b/drivers/net/e1000e/ich8lan.c
index c4b2d15..8c7e4aa 100644
--- a/drivers/net/e1000e/ich8lan.c
+++ b/drivers/net/e1000e/ich8lan.c
@@ -280,7 +280,8 @@ static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw)
 	phy->ops.write_phy_reg_locked = e1000_write_phy_reg_hv_locked;
 	phy->autoneg_mask             = AUTONEG_ADVERTISE_SPEED_DEFAULT;
 
-	if (!(er32(FWSM) & E1000_ICH_FWSM_FW_VALID)) {
+	if (!(er32(FWSM) & E1000_ICH_FWSM_FW_VALID) && 
+		!e1000_check_reset_block(hw)) {
 		/*Set Phy Config Counter to 50msec */
 		ctrl = er32(FEXTNVM3);
 		ctrl &= ~E1000_FEXTNVM3_PHY_CFG_COUNTER_MASK;
-- 
1.9.1

^ permalink raw reply related

* [PATCH 4/5] e1000e: update workaround for 82579 intermittently disabled during S0->Sx
From: Zhu Yanjun @ 2014-12-15  8:39 UTC (permalink / raw)
  To: netdev, w, zyjzyj2000; +Cc: Zhu Yanjun, Bruce Allan, Jeff Kirsher
In-Reply-To: <1418632754-16698-1-git-send-email-Yanjun.Zhu@windriver.com>

2.6.x kernels require a similar logic change as commit c6f8b74f
[e1000e: update workaround for 82579 intermittently disabled during S0->Sx]
introduces for newer kernels.

The workaround which toggles the LANPHYPC (LAN PHY Power Control) value bit
to force the MAC-Phy interconnect into PCIe mode from SMBus mode during
driver load and resume should always be done except if PHY resets are
blocked by the Manageability Engine (ME).  Previously, the toggle was done
only if PHY resets are blocked and the ME was disabled.

The rest of the patch is just indentation changes as a consequence of the
updated workaround.

[yanjun.zhu: indentation changes are removed.
function e1000_init_phy_workarounds_pchlan does not exist]

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Zhu Yanjun <Yanjun.Zhu@windriver.com>
---
 drivers/net/e1000e/ich8lan.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/e1000e/ich8lan.c b/drivers/net/e1000e/ich8lan.c
index 8c7e4aa..0da2c2c 100644
--- a/drivers/net/e1000e/ich8lan.c
+++ b/drivers/net/e1000e/ich8lan.c
@@ -280,8 +280,7 @@ static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw)
 	phy->ops.write_phy_reg_locked = e1000_write_phy_reg_hv_locked;
 	phy->autoneg_mask             = AUTONEG_ADVERTISE_SPEED_DEFAULT;

-	if (!(er32(FWSM) & E1000_ICH_FWSM_FW_VALID) && 
-		!e1000_check_reset_block(hw)) {
+	if (!e1000_check_reset_block(hw)) {
 		/*Set Phy Config Counter to 50msec */
 		ctrl = er32(FEXTNVM3);
 		ctrl &= ~E1000_FEXTNVM3_PHY_CFG_COUNTER_MASK;
-- 
1.9.1

^ permalink raw reply related

* [PATCH 5/5] e1000e: cleanup use of check_reset_block function pointer
From: Zhu Yanjun @ 2014-12-15  8:39 UTC (permalink / raw)
  To: netdev, w, zyjzyj2000; +Cc: Zhu Yanjun, Bruce Allan, Jeff Kirsher
In-Reply-To: <1418632754-16698-1-git-send-email-Yanjun.Zhu@windriver.com>

2.6.x kernels require a similar logic change as commit 13ca85e0
[e1000e: cleanup use of check_reset_block function pointer] introduces
for newer kernels.

Replace e1000_check_reset_block() inline function with calls to the PHY ops
check_reset_block function pointer.

[yanjun.zhu: only modifications in function e1000_init_phy_params_pchlan will
be backported. Others remain unchanged]

Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Signed-off-by: Zhu Yanjun <Yanjun.Zhu@windriver.com>
---
 drivers/net/e1000e/ich8lan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/e1000e/ich8lan.c b/drivers/net/e1000e/ich8lan.c
index 0da2c2c..732cd48 100644
--- a/drivers/net/e1000e/ich8lan.c
+++ b/drivers/net/e1000e/ich8lan.c
@@ -280,7 +280,7 @@ static s32 e1000_init_phy_params_pchlan(struct e1000_hw *hw)
 	phy->ops.write_phy_reg_locked = e1000_write_phy_reg_hv_locked;
 	phy->autoneg_mask             = AUTONEG_ADVERTISE_SPEED_DEFAULT;
 
-	if (!e1000_check_reset_block(hw)) {
+	if (!hw->phy.ops.check_reset_block(hw)) {
 		/*Set Phy Config Counter to 50msec */
 		ctrl = er32(FEXTNVM3);
 		ctrl &= ~E1000_FEXTNVM3_PHY_CFG_COUNTER_MASK;
-- 
1.9.1

^ permalink raw reply related

* Re: [PATCH 0/5] e1000e: fix nic not boot after rebooting
From: Willy Tarreau @ 2014-12-15  8:48 UTC (permalink / raw)
  To: Zhu Yanjun, Bruce Allan, Jeff Kirsher; +Cc: netdev, Zhu Yanjun
In-Reply-To: <1418632754-16698-1-git-send-email-Yanjun.Zhu@windriver.com>

Hello,

On Mon, Dec 15, 2014 at 04:39:09PM +0800, Zhu Yanjun wrote:
> With kernel 2.6.x, e1000e with 82577/8/9 sometimes will not boot
> after rebooting. 
> 
> If a kernel 2.6.x board with 82577/8/9 e1000e nic is rebooted for 
> 100 times, there are 7~8 times that 82577/8/9 e1000e nic will not boot 
> normally.
> 
> Zhu Yanjun (5):
>   e1000e: reset MAC-PHY interconnect on 82577/82578
>   e1000e: workaround EEPROM configuration change on 82579 on kernel
>     2.6.x
>   e1000e: do not toggle LANPHYPC value bit when PHY reset is blocked
>   e1000e: update workaround for 82579 intermittently disabled during
>     S0->Sx
>   e1000e: cleanup use of check_reset_block function pointer

Great, thanks! Jeff/Bruce, do you have any objection against me applying
these fixes to 2.6.32 ?

Thanks,
Willy

^ permalink raw reply

* Re: [PATCH net v2] net: smc91x: Fix build without gpiolib
From: Tobias Klauser @ 2014-12-15  8:56 UTC (permalink / raw)
  To: David Miller; +Cc: tony, nico, netdev
In-Reply-To: <20141212.115838.1155760587618419145.davem@davemloft.net>

On 2014-12-12 at 17:58:38 +0100, David Miller <davem@davemloft.net> wrote:
> From: Tobias Klauser <tklauser@distanz.ch>
> Date: Fri, 12 Dec 2014 17:45:29 +0100
> 
> > On 2014-12-12 at 17:30:20 +0100, David Miller <davem@davemloft.net> wrote:
> >> In my opinion, if the code blocks enabling the configurations that
> >> need this are enabled, so should GPIO be depended upon.
> >> 
> >> I think, at a minimum, when CONFIG_OF is enabled smsc91x should
> >> require GPIO.
> > 
> > Agreed. This is the better solution, causing less surprises for the
> > user. Should this be a "select GPIOLIB if OF" then?
> 
> If GPIO is a child node in the Kconfig hierarchy (ie. has no
> dependencies of it's own), then yes.  Otherwise, a depends
> will need to be used, because select does not recursively
> trigger a select'd nodes dependencies.

Thank you for the explanation. Since GPIOLIB depends on
ARCH_WANT_OPTIONAL_GPIOLIB || ARCH_REQUIRE_GPIOLIB, I think it's
appropriate to let SMC91X depend on (!OF || GPIOLIB). I'll send an
updated patch.

^ permalink raw reply

* [PATCH net v3] net: smc91x: Fix build without gpiolib
From: Tobias Klauser @ 2014-12-15  9:02 UTC (permalink / raw)
  To: Nicolas Pitre; +Cc: David S. Miller, Tony Lindgren, netdev

If GPIOLIB=n the following build errors occur:

drivers/net/ethernet/smsc/smc91x.c: In function 'try_toggle_control_gpio':
drivers/net/ethernet/smsc/smc91x.c:2204:2: error: implicit declaration of function 'devm_gpiod_get_index' [-Werror=implicit-function-declaration]
drivers/net/ethernet/smsc/smc91x.c:2204:7: warning: assignment makes pointer from integer without a cast [enabled by default]
drivers/net/ethernet/smsc/smc91x.c:2213:2: error: implicit declaration of function 'gpiod_direction_output' [-Werror=implicit-function-declaration]
drivers/net/ethernet/smsc/smc91x.c:2216:3: error: implicit declaration of function 'devm_gpiod_put' [-Werror=implicit-function-declaration]
drivers/net/ethernet/smsc/smc91x.c:2222:2: error: implicit declaration of function 'gpiod_set_value_cansleep' [-Werror=implicit-function-declaration]

Fix this by letting the driver depend on GPIOLIB if OF is selected.

Fixes: 7d2911c4381 ("net: smc91x: Fix gpios for device tree based booting")
Cc: Tony Lindgren <tony@atomide.com>
Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
---
v3:
 - Let the driver depend on GPIOLIB if OF is selected instead of including GPIO
   function stubs.

 drivers/net/ethernet/smsc/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/smsc/Kconfig b/drivers/net/ethernet/smsc/Kconfig
index 6279268..9468e64 100644
--- a/drivers/net/ethernet/smsc/Kconfig
+++ b/drivers/net/ethernet/smsc/Kconfig
@@ -39,7 +39,7 @@ config SMC91X
 	select CRC32
 	select MII
 	depends on (ARM || M32R || SUPERH || MIPS || BLACKFIN || \
-		    MN10300 || COLDFIRE || ARM64 || XTENSA || NIOS2)
+		    MN10300 || COLDFIRE || ARM64 || XTENSA || NIOS2) && (!OF || GPIOLIB)
 	---help---
 	  This is a driver for SMC's 91x series of Ethernet chipsets,
 	  including the SMC91C94 and the SMC91C111. Say Y if you want it
-- 
2.2.0

^ permalink raw reply related

* RE: [RFC PATCH net-next 1/1] net: Support for switch port configuration
From: Varlese, Marco @ 2014-12-15  9:39 UTC (permalink / raw)
  To: Roopa Prabhu
  Cc: Jiri Pirko, John Fastabend, netdev@vger.kernel.org,
	stephen@networkplumber.org, Fastabend, John R, sfeldma@gmail.com,
	linux-kernel@vger.kernel.org
In-Reply-To: <548BE569.5040803@cumulusnetworks.com>

> -----Original Message-----
> From: Roopa Prabhu [mailto:roopa@cumulusnetworks.com]
> Sent: Saturday, December 13, 2014 7:06 AM
> To: Varlese, Marco
> Cc: Jiri Pirko; John Fastabend; netdev@vger.kernel.org;
> stephen@networkplumber.org; Fastabend, John R; sfeldma@gmail.com;
> linux-kernel@vger.kernel.org
> Subject: Re: [RFC PATCH net-next 1/1] net: Support for switch port
> configuration
> 
> On 12/12/14, 1:19 AM, Varlese, Marco wrote:
> >> -----Original Message-----
> >> From: Roopa Prabhu [mailto:roopa@cumulusnetworks.com]
> >> Sent: Thursday, December 11, 2014 5:41 PM
> >> To: Jiri Pirko
> >> Cc: Varlese, Marco; John Fastabend; netdev@vger.kernel.org;
> >> stephen@networkplumber.org; Fastabend, John R; sfeldma@gmail.com;
> >> linux-kernel@vger.kernel.org
> >> Subject: Re: [RFC PATCH net-next 1/1] net: Support for switch port
> >> configuration
> >>
> >> On 12/11/14, 8:56 AM, Jiri Pirko wrote:
> >>> Thu, Dec 11, 2014 at 05:37:46PM CET, roopa@cumulusnetworks.com
> wrote:
> >>>> On 12/11/14, 3:01 AM, Jiri Pirko wrote:
> >>>>> Thu, Dec 11, 2014 at 10:59:42AM CET, marco.varlese@intel.com wrote:
> >>>>>>> -----Original Message-----
> >>>>>>> From: John Fastabend [mailto:john.fastabend@gmail.com]
> >>>>>>> Sent: Wednesday, December 10, 2014 5:04 PM
> >>>>>>> To: Jiri Pirko
> >>>>>>> Cc: Varlese, Marco; netdev@vger.kernel.org;
> >>>>>>> stephen@networkplumber.org; Fastabend, John R;
> >>>>>>> roopa@cumulusnetworks.com; sfeldma@gmail.com; linux-
> >>>>>>> kernel@vger.kernel.org
> >>>>>>> Subject: Re: [RFC PATCH net-next 1/1] net: Support for switch
> >>>>>>> port configuration
> >>>>>>>
> >>>>>>> On 12/10/2014 08:50 AM, Jiri Pirko wrote:
> >>>>>>>> Wed, Dec 10, 2014 at 05:23:40PM CET, marco.varlese@intel.com
> >> wrote:
> >>>>>>>>> From: Marco Varlese <marco.varlese@intel.com>
> >>>>>>>>>
> >>>>>>>>> Switch hardware offers a list of attributes that are
> >>>>>>>>> configurable on a per port basis.
> >>>>>>>>> This patch provides a mechanism to configure switch ports by
> >>>>>>>>> adding an NDO for setting specific values to specific attributes.
> >>>>>>>>> There will be a separate patch that extends iproute2 to call
> >>>>>>>>> the new NDO.
> >>>>>>>> What are these attributes? Can you give some examples. I'm
> >>>>>>>> asking because there is a plan to pass generic attributes to
> >>>>>>>> switch ports replacing current specific
> >>>>>>>> ndo_switch_port_stp_update. In this case, bridge is setting that
> attribute.
> >>>>>>>>
> >>>>>>>> Is there need to set something directly from userspace or does
> >>>>>>>> it make rather sense to use involved bridge/ovs/bond ? I think
> >>>>>>>> that both will be needed.
> >>>>>>> +1
> >>>>>>>
> >>>>>>> I think for many attributes it would be best to have both. The
> >>>>>>> in kernel callers and netlink userspace can use the same driver
> ndo_ops.
> >>>>>>>
> >>>>>>> But then we don't _require_ any specific bridge/ovs/etc module.
> >>>>>>> And we may have some attributes that are not specific to any
> >>>>>>> existing software module. I'm guessing Marco has some examples
> >>>>>>> of
> >> these.
> >>>>>>> [...]
> >>>>>>>
> >>>>>>>
> >>>>>>> --
> >>>>>>> John Fastabend         Intel Corporation
> >>>>>> We do have a need to configure the attributes directly from
> >>>>>> user-space
> >> and I have identified the tool to do that in iproute2.
> >>>>>> An example of attributes are:
> >>>>>> * enabling/disabling of learning of source addresses on a given
> >>>>>> port (you can imagine the attribute called LEARNING for example);
> >>>>>> * internal loopback control (i.e. LOOPBACK) which will control
> >>>>>> how the flow of traffic behaves from the switch fabric towards an
> >>>>>> egress port;
> >>>>>> * flooding for broadcast/multicast/unicast type of packets (i.e.
> >>>>>> BFLOODING, MFLOODING, UFLOODING);
> >>>>>>
> >>>>>> Some attributes would be of the type enabled/disabled while other
> >>>>>> will
> >> allow specific values to allow the user to configure different
> >> behaviours of that feature on that particular port on that platform.
> >>>>>> One thing to mention - as John stated as well - there might be
> >>>>>> some
> >> attributes that are not specific to any software module but rather
> >> have to do with the actual hardware/platform to configure.
> >>>>>> I hope this clarifies some points.
> >>>>> It does. Makes sense. We need to expose this attr set/get for both
> >>>>> in-kernel and userspace use cases.
> >>>>>
> >>>>> Please adjust you patch for this. Also, as a second patch, it
> >>>>> would be great if you can convert ndo_switch_port_stp_update to
> >>>>> this new
> >> ndo.
> >>>> Why are we exposing generic switch attribute get/set from userspace
> >>>> ?. We already have specific attributes for learning/flooding which
> >>>> can be extended further.
> >>> Yes, but that is for PF_BRIDGE and bridge specific attributes. There
> >>> might be another generic attrs, no?
> >> I cant think of any. And plus, the whole point of switchdev l2
> >> offloads was to map these to existing bridge attributes. And we
> >> already have a match for some of the attributes that marco wants.
> >>
> >> If there is a need for such attributes, i don't see why it is needed
> >> for switch devices only.
> >> It is needed for any hw (nics etc). And, a precedence to this is to
> >> do it via ethtool.
> >>
> >> Having said that, am sure we will find a need for this in the future.
> >> And having a netlink attribute always helps.
> >>
> >> Today, it seems like these can be mapped to existing attributes that
> >> are settable via ndo_bridge_setlink/getlink.
> >>
> >>>> And for in kernel api....i had a sample patch in my RFC series
> >>>> (Which i was going to resubmit, until it was decided that we will
> >>>> use existing api around
> >>>> ndo_bridge_setlink/ndo_bridge_getlink):
> >>>> http://www.spinics.net/lists/netdev/msg305473.html
> >>> Yes, this might become handy for other generic non-bridge attrs.
> >>>
> >>>> Thanks,
> >>>> Roopa
> >>>>
> >>>>
> >>>>
> > The list I provided is only a subset of the attributes we will need to be
> exposed. I do have more and I'm sure that more will come in the future. As I
> mentioned in few posts earlier, some attributes are generic and some are
> not.
> >
> > I did not consider ethtool for few reasons but the main one is that I was
> under the impression that netlink was preferred in many circumstances over
> the ethotool_ops.
> That is correct. I don't think anybody hinted that you should extend ethtool.
> >   Plus, all the cases I have identified so far are going to nicely fit into the
> setlink set of operations.
> >
> 
> Would be better if you submitted your iproute2 patch with this patch.
> 
> I do plan to resubmit my generic ndo patch soon.
> 
> Thanks,
> Roopa

I honestly do not understand what extra "help" the iproute2 patch would have brought to this RFC: that patch simply adds a new section to the iproute2 help and a new args parser for the input. From an infrastructure perspective, it leverages the netlink messages that use RTM_SETLINK, hence eventually hooking into do_setlink(). Sure, it obviously contains all the attributes I have in mind, but from an infrastructure patch perspective I don't think that you would have gained much from seeing it.

Anyway, good to know you're reworking your generic patch. I'll keep an eye out for your new NDO.


Thanks,
Marco

^ permalink raw reply

* Re: [PATCH net] net/mlx4_en: correct the endianness of doorbell_qpn on big endian platform
From: Amir Vadai @ 2014-12-15  9:59 UTC (permalink / raw)
  To: Wei Yang, David Miller, David.Laight@ACULAB.COM
  Cc: eric.dumazet@gmail.com, netdev@vger.kernel.org, Gideon Naim,
	edumazet@google.com
In-Reply-To: <20141215013249.GA7341@richard>

On 12/15/2014 3:32 AM, Wei Yang wrote:
[...]

Hi Davids,

I need to do a native endianness write to the NIC register (the write is
on the fast path and the register value could be calculated once).

iowrite32be() is calling cpu_to_be32() on the value.
iowrite32() is calling cpu_to_le32().

I thought about using __raw_writel() but as David Miller said, it lacks
necessary io barriers on some archs.

Is the only solution to this to add some sort of
iowrite32be_native(__be32 val,...) function to all the archs?

Thanks,
Amir

^ permalink raw reply

* RE: [PATCH net] net/mlx4_en: correct the endianness of doorbell_qpn on big endian platform
From: David Laight @ 2014-12-15 10:00 UTC (permalink / raw)
  To: 'Wei Yang', David Miller
  Cc: eric.dumazet@gmail.com, netdev@vger.kernel.org,
	gideonn@mellanox.com, edumazet@google.com, amirv@mellanox.com
In-Reply-To: <20141215013249.GA7341@richard>

From: Wei Yang
> On Sat, Dec 13, 2014 at 11:43:20PM -0500, David Miller wrote:
> >From: Wei Yang <weiyang@linux.vnet.ibm.com>
> >Date: Sat, 13 Dec 2014 11:13:38 +0800
> >
> >> On Mon, Dec 08, 2014 at 10:42:37PM +0800, Wei Yang wrote:
> >> If you prefer this way, I would like to send a new version for this.
> >> Is it ok for you?
> >
> >I'm not so sure.  There are implications when using the __raw_*()
> >routines.
> >
> >In particular, using __raw_{read,write}l() also means that the usual
> >necessary I/O memory barriers are not being performed.
> >
> >There are therefore no ordering guarantees between __raw_*() and other
> >I/O or memory accesses whatsoever.
> 
> Thanks David.
> 
> Actually, the last mail is asking David Laight. I am trying to understanding
> his comment and Amir told me he was suggesting to use __raw_*() version.

I don't know (off hand) which of the access functions contain byteswaps and
memory barriers; from trying to get some drivers running on ppc I know the
whole thing is a complete mess (beats me why all the on-chip devices are of
opposite endianness to the cpu itself).

In this case you seem to have an earlier change to store the value that
doesn't need a byteswap on write (for efficiency), then are using an
access function that does a byteswapping write to memory (efficient, but
the wrong value), and are fixing it by adding another byteswap (inefficient)
prior to the byteswapping write.

So you should either be using a non-byteswapping write, or just save
the byteswapped value so that the written value is correct.

	David

> Hmm... this is really a problem found in the v3.18-rc1 and the root cause is
> the endianess. I am ok to use any method to fix this problem, even revert it.
> Could the maintainer from Mellanox gives me a word?
> 
> --
> Richard Yang
> Help you, Help me

^ permalink raw reply

* RE: [RFC PATCH net-next 1/1] net: Support for switch port configuration
From: Arad, Ronen @ 2014-12-15 10:58 UTC (permalink / raw)
  To: Varlese, Marco, Roopa Prabhu, netdev@vger.kernel.org
  Cc: Jiri Pirko, John Fastabend, stephen@networkplumber.org,
	Fastabend, John R, sfeldma@gmail.com,
	linux-kernel@vger.kernel.org
In-Reply-To: <C4896FB061E7DE4AAC93031BDCA044B104AC533A@IRSMSX108.ger.corp.intel.com>


The proposed patch introduces a way for supporting device specific switch port attributes.
Can we expect user-space tools such as iproute2 to be aware of such attributes from every device?
A generic tool like iproute2 can't be aware of all the specific attributes of all the devices that will use the newly proposed ndo.
Do we need a generic mechanism for a device to expose to user-space the set of device specific attributes it supports?
Exported information should include:
- Attribute keyword - will be used by iproute2 to parse user input and display in device specific help
- Attribute type - the numeric value for the 'attr' argument of ndo_switch_port_set_cfg().
- Attribute value range - range of supported values for the attribute
- Attribute help
  Note: A generic ndo patch as suggested by Roopa requires going beyond a simple range to make it usable by a generic user-space tool like iproute2.

With such a mechanism in place, iproute2 could provide an end-user-friendly experience in a generic way.

-Ronen

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of Varlese, Marco
> Sent: Monday, December 15, 2014 1:40 AM
> To: Roopa Prabhu
> Cc: Jiri Pirko; John Fastabend; netdev@vger.kernel.org;
> stephen@networkplumber.org; Fastabend, John R; sfeldma@gmail.com;
> linux-kernel@vger.kernel.org
> Subject: RE: [RFC PATCH net-next 1/1] net: Support for switch port
> configuration
> 
> > -----Original Message-----
> > From: Roopa Prabhu [mailto:roopa@cumulusnetworks.com]
> > Sent: Saturday, December 13, 2014 7:06 AM
> > To: Varlese, Marco
> > Cc: Jiri Pirko; John Fastabend; netdev@vger.kernel.org;
> > stephen@networkplumber.org; Fastabend, John R; sfeldma@gmail.com;
> > linux-kernel@vger.kernel.org
> > Subject: Re: [RFC PATCH net-next 1/1] net: Support for switch port
> > configuration
> >
> > On 12/12/14, 1:19 AM, Varlese, Marco wrote:
> > >> -----Original Message-----
> > >> From: Roopa Prabhu [mailto:roopa@cumulusnetworks.com]
> > >> Sent: Thursday, December 11, 2014 5:41 PM
> > >> To: Jiri Pirko
> > >> Cc: Varlese, Marco; John Fastabend; netdev@vger.kernel.org;
> > >> stephen@networkplumber.org; Fastabend, John R;
> sfeldma@gmail.com;
> > >> linux-kernel@vger.kernel.org
> > >> Subject: Re: [RFC PATCH net-next 1/1] net: Support for switch port
> > >> configuration
> > >>
> > >> On 12/11/14, 8:56 AM, Jiri Pirko wrote:
> > >>> Thu, Dec 11, 2014 at 05:37:46PM CET, roopa@cumulusnetworks.com
> > wrote:
> > >>>> On 12/11/14, 3:01 AM, Jiri Pirko wrote:
> > >>>>> Thu, Dec 11, 2014 at 10:59:42AM CET, marco.varlese@intel.com
> wrote:
> > >>>>>>> -----Original Message-----
> > >>>>>>> From: John Fastabend [mailto:john.fastabend@gmail.com]
> > >>>>>>> Sent: Wednesday, December 10, 2014 5:04 PM
> > >>>>>>> To: Jiri Pirko
> > >>>>>>> Cc: Varlese, Marco; netdev@vger.kernel.org;
> > >>>>>>> stephen@networkplumber.org; Fastabend, John R;
> > >>>>>>> roopa@cumulusnetworks.com; sfeldma@gmail.com; linux-
> > >>>>>>> kernel@vger.kernel.org
> > >>>>>>> Subject: Re: [RFC PATCH net-next 1/1] net: Support for switch
> > >>>>>>> port configuration
> > >>>>>>>
> > >>>>>>> On 12/10/2014 08:50 AM, Jiri Pirko wrote:
> > >>>>>>>> Wed, Dec 10, 2014 at 05:23:40PM CET, marco.varlese@intel.com
> > >> wrote:
> > >>>>>>>>> From: Marco Varlese <marco.varlese@intel.com>
> > >>>>>>>>>
> > >>>>>>>>> Switch hardware offers a list of attributes that are
> > >>>>>>>>> configurable on a per port basis.
> > >>>>>>>>> This patch provides a mechanism to configure switch ports by
> > >>>>>>>>> adding an NDO for setting specific values to specific attributes.
> > >>>>>>>>> There will be a separate patch that extends iproute2 to call
> > >>>>>>>>> the new NDO.
> > >>>>>>>> What are these attributes? Can you give some examples. I'm
> > >>>>>>>> asking because there is a plan to pass generic attributes to
> > >>>>>>>> switch ports replacing current specific
> > >>>>>>>> ndo_switch_port_stp_update. In this case, bridge is setting
> > >>>>>>>> that
> > attribute.
> > >>>>>>>>
> > >>>>>>>> Is there need to set something directly from userspace or
> > >>>>>>>> does it make rather sense to use involved bridge/ovs/bond ? I
> > >>>>>>>> think that both will be needed.
> > >>>>>>> +1
> > >>>>>>>
> > >>>>>>> I think for many attributes it would be best to have both. The
> > >>>>>>> in kernel callers and netlink userspace can use the same
> > >>>>>>> driver
> > ndo_ops.
> > >>>>>>>
> > >>>>>>> But then we don't _require_ any specific bridge/ovs/etc module.
> > >>>>>>> And we may have some attributes that are not specific to any
> > >>>>>>> existing software module. I'm guessing Marco has some examples
> > >>>>>>> of
> > >> these.
> > >>>>>>> [...]
> > >>>>>>>
> > >>>>>>>
> > >>>>>>> --
> > >>>>>>> John Fastabend         Intel Corporation
> > >>>>>> We do have a need to configure the attributes directly from
> > >>>>>> user-space
> > >> and I have identified the tool to do that in iproute2.
> > >>>>>> An example of attributes are:
> > >>>>>> * enabling/disabling of learning of source addresses on a given
> > >>>>>> port (you can imagine the attribute called LEARNING for
> > >>>>>> example);
> > >>>>>> * internal loopback control (i.e. LOOPBACK) which will control
> > >>>>>> how the flow of traffic behaves from the switch fabric towards
> > >>>>>> an egress port;
> > >>>>>> * flooding for broadcast/multicast/unicast type of packets (i.e.
> > >>>>>> BFLOODING, MFLOODING, UFLOODING);
> > >>>>>>
> > >>>>>> Some attributes would be of the type enabled/disabled while
> > >>>>>> other will
> > >> allow specific values to allow the user to configure different
> > >> behaviours of that feature on that particular port on that platform.
> > >>>>>> One thing to mention - as John stated as well - there might be
> > >>>>>> some
> > >> attributes that are not specific to any software module but rather
> > >> have to do with the actual hardware/platform to configure.
> > >>>>>> I hope this clarifies some points.
> > >>>>> It does. Makes sense. We need to expose this attr set/get for
> > >>>>> both in-kernel and userspace use cases.
> > >>>>>
> > >>>>> Please adjust you patch for this. Also, as a second patch, it
> > >>>>> would be great if you can convert ndo_switch_port_stp_update to
> > >>>>> this new
> > >> ndo.
> > >>>> Why are we exposing generic switch attribute get/set from
> > >>>> userspace ?. We already have specific attributes for
> > >>>> learning/flooding which can be extended further.
> > >>> Yes, but that is for PF_BRIDGE and bridge specific attributes.
> > >>> There might be another generic attrs, no?
> > >> I cant think of any. And plus, the whole point of switchdev l2
> > >> offloads was to map these to existing bridge attributes. And we
> > >> already have a match for some of the attributes that marco wants.
> > >>
> > >> If there is a need for such attributes, i don't see why it is
> > >> needed for switch devices only.
> > >> It is needed for any hw (nics etc). And, a precedence to this is to
> > >> do it via ethtool.
> > >>
> > >> Having said that, am sure we will find a need for this in the future.
> > >> And having a netlink attribute always helps.
> > >>
> > >> Today, it seems like these can be mapped to existing attributes
> > >> that are settable via ndo_bridge_setlink/getlink.
> > >>
> > >>>> And for in kernel api....i had a sample patch in my RFC series
> > >>>> (Which i was going to resubmit, until it was decided that we will
> > >>>> use existing api around
> > >>>> ndo_bridge_setlink/ndo_bridge_getlink):
> > >>>> http://www.spinics.net/lists/netdev/msg305473.html
> > >>> Yes, this might become handy for other generic non-bridge attrs.
> > >>>
> > >>>> Thanks,
> > >>>> Roopa
> > >>>>
> > >>>>
> > >>>>
> > > The list I provided is only a subset of the attributes we will need
> > > to be
> > exposed. I do have more and I'm sure that more will come in the
> > future. As I mentioned in few posts earlier, some attributes are
> > generic and some are not.
> > >
> > > I did not consider ethtool for few reasons but the main one is that
> > > I was
> > under the impression that netlink was preferred in many circumstances
> > over the ethotool_ops.
> > That is correct. I don't think anybody hinted that you should extend
> ethtool.
> > >   Plus, all the cases I have identified so far are going to nicely
> > > fit into the
> > setlink set of operations.
> > >
> >
> > Would be better if you submitted your iproute2 patch with this patch.
> >
> > I do plan to resubmit my generic ndo patch soon.
> >
> > Thanks,
> > Roopa
> 
> I honestly do not understand what extra "help" the iproute2 would have
> brought to this RFC: that patch simply adds a new section for the iproute2
> help and a new args parser for the input. From an infrastructure perspective
> is leveraging what netlink messages are using RTM_SETLINK hence hooking
> up eventually in the do_setlink(). Sure, obviously contains all the attributes I
> have in mind but from an infrastructure patch perspective I don't think that
> you would have gained much in seeing it.
> 
> Anyway, good to know you're reworking you generic patch. I'll keep an eye
> out for your new NDO.
> 
> 
> Thanks,
> Marco
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in the body
> of a message to majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox