Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v4 0/6] net: tcp: sctp: dccp: Replace jprobe usage with trace events
From: Masami Hiramatsu @ 2017-12-20  4:14 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat

Hi,

This series is v4 of the replacement of jprobe usage with trace
events. This version is rebased on net-next, fixes a build warning,
and moves a temporary variable definition into a block.

The previous version is here:
https://lkml.org/lkml/2017/12/19/153

Changes from v3:
  All: Rebased on net-next
  [3/6]: fixes a build warning for i386 by casting the pointer to
         unsigned long instead of __u64, and moves a temporary
         variable definition into a block.

Thank you,

---

Masami Hiramatsu (6):
      net: tcp: Add trace events for TCP congestion window tracing
      net: tcp: Remove TCP probe module
      net: sctp: Add SCTP ACK tracking trace event
      net: sctp: Remove debug SCTP probe module
      net: dccp: Add DCCP sendmsg trace event
      net: dccp: Remove dccpprobe module


 include/trace/events/sctp.h |   99 ++++++++++++++
 include/trace/events/tcp.h  |   80 +++++++++++
 net/Kconfig                 |   17 --
 net/dccp/Kconfig            |   17 --
 net/dccp/Makefile           |    2 
 net/dccp/probe.c            |  203 -----------------------------
 net/dccp/proto.c            |    5 +
 net/dccp/trace.h            |  105 +++++++++++++++
 net/ipv4/Makefile           |    1 
 net/ipv4/tcp_input.c        |    3 
 net/ipv4/tcp_probe.c        |  301 -------------------------------------------
 net/sctp/Kconfig            |   12 --
 net/sctp/Makefile           |    3 
 net/sctp/probe.c            |  244 -----------------------------------
 net/sctp/sm_statefuns.c     |    5 +
 15 files changed, 297 insertions(+), 800 deletions(-)
 create mode 100644 include/trace/events/sctp.h
 delete mode 100644 net/dccp/probe.c
 create mode 100644 net/dccp/trace.h
 delete mode 100644 net/ipv4/tcp_probe.c
 delete mode 100644 net/sctp/probe.c

--
Masami Hiramatsu (Linaro) <mhiramat@kernel.org>

^ permalink raw reply

* [PATCH net-next v4 1/6] net: tcp: Add trace events for TCP congestion window tracing
From: Masami Hiramatsu @ 2017-12-20  4:14 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat
In-Reply-To: <151374325126.2497.6934744693865165386.stgit@devbox>

This adds an event to trace TCP state variables with a
slightly intrusive trace event. It uses the ftrace/perf
event log buffer to record that state, so there is no need
to prepare a dedicated ring buffer or custom user apps.

Users can use ftrace to trace this event as below:

  # cd /sys/kernel/debug/tracing
  # echo 1 > events/tcp/tcp_probe/enable
  (run workloads)
  # cat trace

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
 Changes in v3:
  - Fix build errors caused by including events/tcp.h twice.
  - Sort out the including headers.
---
 include/trace/events/tcp.h |   80 ++++++++++++++++++++++++++++++++++++++++++++
 net/ipv4/tcp_input.c       |    3 ++
 2 files changed, 83 insertions(+)

diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h
index 07cccca6cbf1..14ad60b468fb 100644
--- a/include/trace/events/tcp.h
+++ b/include/trace/events/tcp.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 */
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM tcp
 
@@ -8,6 +9,7 @@
 #include <linux/tcp.h>
 #include <linux/tracepoint.h>
 #include <net/ipv6.h>
+#include <net/tcp.h>
 
 #define tcp_state_name(state)	{ state, #state }
 #define show_tcp_state_name(val)			\
@@ -293,6 +295,84 @@ TRACE_EVENT(tcp_retransmit_synack,
 		  __entry->saddr_v6, __entry->daddr_v6)
 );
 
+TRACE_EVENT(tcp_probe,
+
+	TP_PROTO(struct sock *sk, struct sk_buff *skb),
+
+	TP_ARGS(sk, skb),
+
+	TP_STRUCT__entry(
+		/* sockaddr_in6 is always bigger than sockaddr_in */
+		__array(__u8, saddr, sizeof(struct sockaddr_in6))
+		__array(__u8, daddr, sizeof(struct sockaddr_in6))
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u32, mark)
+		__field(__u16, length)
+		__field(__u32, snd_nxt)
+		__field(__u32, snd_una)
+		__field(__u32, snd_cwnd)
+		__field(__u32, ssthresh)
+		__field(__u32, snd_wnd)
+		__field(__u32, srtt)
+		__field(__u32, rcv_wnd)
+	),
+
+	TP_fast_assign(
+		const struct tcp_sock *tp = tcp_sk(sk);
+		const struct inet_sock *inet = inet_sk(sk);
+
+		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+		if (sk->sk_family == AF_INET) {
+			struct sockaddr_in *v4 = (void *)__entry->saddr;
+
+			v4->sin_family = AF_INET;
+			v4->sin_port = inet->inet_sport;
+			v4->sin_addr.s_addr = inet->inet_saddr;
+			v4 = (void *)__entry->daddr;
+			v4->sin_family = AF_INET;
+			v4->sin_port = inet->inet_dport;
+			v4->sin_addr.s_addr = inet->inet_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+		} else if (sk->sk_family == AF_INET6) {
+			struct sockaddr_in6 *v6 = (void *)__entry->saddr;
+
+			v6->sin6_family = AF_INET6;
+			v6->sin6_port = inet->inet_sport;
+			v6->sin6_addr = inet6_sk(sk)->saddr;
+			v6 = (void *)__entry->daddr;
+			v6->sin6_family = AF_INET6;
+			v6->sin6_port = inet->inet_dport;
+			v6->sin6_addr = sk->sk_v6_daddr;
+#endif
+		}
+
+		/* For filtering use */
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+		__entry->mark = skb->mark;
+
+		__entry->length = skb->len;
+		__entry->snd_nxt = tp->snd_nxt;
+		__entry->snd_una = tp->snd_una;
+		__entry->snd_cwnd = tp->snd_cwnd;
+		__entry->snd_wnd = tp->snd_wnd;
+		__entry->rcv_wnd = tp->rcv_wnd;
+		__entry->ssthresh = tcp_current_ssthresh(sk);
+		__entry->srtt = tp->srtt_us >> 3;
+	),
+
+	TP_printk("src=%pISpc dest=%pISpc mark=%#x length=%d snd_nxt=%#x "
+		  "snd_una=%#x snd_cwnd=%u ssthresh=%u snd_wnd=%u srtt=%u "
+		  "rcv_wnd=%u",
+		  __entry->saddr, __entry->daddr, __entry->mark,
+		  __entry->length, __entry->snd_nxt, __entry->snd_una,
+		  __entry->snd_cwnd, __entry->ssthresh, __entry->snd_wnd,
+		  __entry->srtt, __entry->rcv_wnd)
+);
+
 #endif /* _TRACE_TCP_H */
 
 /* This part must be outside protection */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4d55c4b338ee..ff71b18d9682 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5299,6 +5299,9 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	unsigned int len = skb->len;
 	struct tcp_sock *tp = tcp_sk(sk);
 
+	/* TCP congestion window tracking */
+	trace_tcp_probe(sk, skb);
+
 	tcp_mstamp_refresh(tp);
 	if (unlikely(!sk->sk_rx_dst))
 		inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);

^ permalink raw reply related

* [PATCH net-next v4 2/6] net: tcp: Remove TCP probe module
From: Masami Hiramatsu @ 2017-12-20  4:15 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat
In-Reply-To: <151374325126.2497.6934744693865165386.stgit@devbox>

Remove the TCP probe module since jprobe has been deprecated.
Its functionality is now provided by the tcp/tcp_probe trace event.
You can use it via ftrace or perftools.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
 net/Kconfig          |   17 ---
 net/ipv4/Makefile    |    1 
 net/ipv4/tcp_probe.c |  301 --------------------------------------------------
 3 files changed, 319 deletions(-)
 delete mode 100644 net/ipv4/tcp_probe.c

diff --git a/net/Kconfig b/net/Kconfig
index 9dba2715919d..efe930db3c08 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -336,23 +336,6 @@ config NET_PKTGEN
 	  To compile this code as a module, choose M here: the
 	  module will be called pktgen.
 
-config NET_TCPPROBE
-	tristate "TCP connection probing"
-	depends on INET && PROC_FS && KPROBES
-	---help---
-	This module allows for capturing the changes to TCP connection
-	state in response to incoming packets. It is used for debugging
-	TCP congestion avoidance modules. If you don't understand
-	what was just said, you don't need it: say N.
-
-	Documentation on how to use TCP connection probing can be found
-	at:
-	
-	  http://www.linuxfoundation.org/collaborate/workgroups/networking/tcpprobe
-
-	To compile this code as a module, choose M here: the
-	module will be called tcp_probe.
-
 config NET_DROP_MONITOR
 	tristate "Network packet drop alerting service"
 	depends on INET && TRACEPOINTS
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index c6c8ad1d4b6d..47a0a6649a9d 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,7 +43,6 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
-obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
deleted file mode 100644
index 697f4c67b2e3..000000000000
--- a/net/ipv4/tcp_probe.c
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * tcpprobe - Observe the TCP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/tcp.h>
-#include <linux/slab.h>
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/ktime.h>
-#include <linux/time.h>
-#include <net/net_namespace.h>
-
-#include <net/tcp.h>
-
-MODULE_AUTHOR("Stephen Hemminger <shemminger@linux-foundation.org>");
-MODULE_DESCRIPTION("TCP cwnd snooper");
-MODULE_LICENSE("GPL");
-MODULE_VERSION("1.1");
-
-static int port __read_mostly;
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-static unsigned int bufsize __read_mostly = 4096;
-MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
-module_param(bufsize, uint, 0);
-
-static unsigned int fwmark __read_mostly;
-MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
-module_param(fwmark, uint, 0);
-
-static int full __read_mostly;
-MODULE_PARM_DESC(full, "Full log (1=every ack packet received,  0=only cwnd changes)");
-module_param(full, int, 0);
-
-static const char procname[] = "tcpprobe";
-
-struct tcp_log {
-	ktime_t tstamp;
-	union {
-		struct sockaddr		raw;
-		struct sockaddr_in	v4;
-		struct sockaddr_in6	v6;
-	}	src, dst;
-	u16	length;
-	u32	snd_nxt;
-	u32	snd_una;
-	u32	snd_wnd;
-	u32	rcv_wnd;
-	u32	snd_cwnd;
-	u32	ssthresh;
-	u32	srtt;
-};
-
-static struct {
-	spinlock_t	lock;
-	wait_queue_head_t wait;
-	ktime_t		start;
-	u32		lastcwnd;
-
-	unsigned long	head, tail;
-	struct tcp_log	*log;
-} tcp_probe;
-
-static inline int tcp_probe_used(void)
-{
-	return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);
-}
-
-static inline int tcp_probe_avail(void)
-{
-	return bufsize - tcp_probe_used() - 1;
-}
-
-#define tcp_probe_copy_fl_to_si4(inet, si4, mem)		\
-	do {							\
-		si4.sin_family = AF_INET;			\
-		si4.sin_port = inet->inet_##mem##port;		\
-		si4.sin_addr.s_addr = inet->inet_##mem##addr;	\
-	} while (0)						\
-
-/*
- * Hook inserted to be called before each receive packet.
- * Note: arguments must match tcp_rcv_established()!
- */
-static void jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
-				 const struct tcphdr *th)
-{
-	unsigned int len = skb->len;
-	const struct tcp_sock *tp = tcp_sk(sk);
-	const struct inet_sock *inet = inet_sk(sk);
-
-	/* Only update if port or skb mark matches */
-	if (((port == 0 && fwmark == 0) ||
-	     ntohs(inet->inet_dport) == port ||
-	     ntohs(inet->inet_sport) == port ||
-	     (fwmark > 0 && skb->mark == fwmark)) &&
-	    (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
-
-		spin_lock(&tcp_probe.lock);
-		/* If log fills, just silently drop */
-		if (tcp_probe_avail() > 1) {
-			struct tcp_log *p = tcp_probe.log + tcp_probe.head;
-
-			p->tstamp = ktime_get();
-			switch (sk->sk_family) {
-			case AF_INET:
-				tcp_probe_copy_fl_to_si4(inet, p->src.v4, s);
-				tcp_probe_copy_fl_to_si4(inet, p->dst.v4, d);
-				break;
-			case AF_INET6:
-				memset(&p->src.v6, 0, sizeof(p->src.v6));
-				memset(&p->dst.v6, 0, sizeof(p->dst.v6));
-#if IS_ENABLED(CONFIG_IPV6)
-				p->src.v6.sin6_family = AF_INET6;
-				p->src.v6.sin6_port = inet->inet_sport;
-				p->src.v6.sin6_addr = inet6_sk(sk)->saddr;
-
-				p->dst.v6.sin6_family = AF_INET6;
-				p->dst.v6.sin6_port = inet->inet_dport;
-				p->dst.v6.sin6_addr = sk->sk_v6_daddr;
-#endif
-				break;
-			default:
-				BUG();
-			}
-
-			p->length = len;
-			p->snd_nxt = tp->snd_nxt;
-			p->snd_una = tp->snd_una;
-			p->snd_cwnd = tp->snd_cwnd;
-			p->snd_wnd = tp->snd_wnd;
-			p->rcv_wnd = tp->rcv_wnd;
-			p->ssthresh = tcp_current_ssthresh(sk);
-			p->srtt = tp->srtt_us >> 3;
-
-			tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
-		}
-		tcp_probe.lastcwnd = tp->snd_cwnd;
-		spin_unlock(&tcp_probe.lock);
-
-		wake_up(&tcp_probe.wait);
-	}
-
-	jprobe_return();
-}
-
-static struct jprobe tcp_jprobe = {
-	.kp = {
-		.symbol_name	= "tcp_rcv_established",
-	},
-	.entry	= jtcp_rcv_established,
-};
-
-static int tcpprobe_open(struct inode *inode, struct file *file)
-{
-	/* Reset (empty) log */
-	spin_lock_bh(&tcp_probe.lock);
-	tcp_probe.head = tcp_probe.tail = 0;
-	tcp_probe.start = ktime_get();
-	spin_unlock_bh(&tcp_probe.lock);
-
-	return 0;
-}
-
-static int tcpprobe_sprint(char *tbuf, int n)
-{
-	const struct tcp_log *p
-		= tcp_probe.log + tcp_probe.tail;
-	struct timespec64 ts
-		= ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start));
-
-	return scnprintf(tbuf, n,
-			"%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
-			(unsigned long)ts.tv_sec,
-			(unsigned long)ts.tv_nsec,
-			&p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
-			p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
-}
-
-static ssize_t tcpprobe_read(struct file *file, char __user *buf,
-			     size_t len, loff_t *ppos)
-{
-	int error = 0;
-	size_t cnt = 0;
-
-	if (!buf)
-		return -EINVAL;
-
-	while (cnt < len) {
-		char tbuf[256];
-		int width;
-
-		/* Wait for data in buffer */
-		error = wait_event_interruptible(tcp_probe.wait,
-						 tcp_probe_used() > 0);
-		if (error)
-			break;
-
-		spin_lock_bh(&tcp_probe.lock);
-		if (tcp_probe.head == tcp_probe.tail) {
-			/* multiple readers race? */
-			spin_unlock_bh(&tcp_probe.lock);
-			continue;
-		}
-
-		width = tcpprobe_sprint(tbuf, sizeof(tbuf));
-
-		if (cnt + width < len)
-			tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1);
-
-		spin_unlock_bh(&tcp_probe.lock);
-
-		/* if record greater than space available
-		   return partial buffer (so far) */
-		if (cnt + width >= len)
-			break;
-
-		if (copy_to_user(buf + cnt, tbuf, width))
-			return -EFAULT;
-		cnt += width;
-	}
-
-	return cnt == 0 ? error : cnt;
-}
-
-static const struct file_operations tcpprobe_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = tcpprobe_open,
-	.read    = tcpprobe_read,
-	.llseek  = noop_llseek,
-};
-
-static __init int tcpprobe_init(void)
-{
-	int ret = -ENOMEM;
-
-	/* Warning: if the function signature of tcp_rcv_established,
-	 * has been changed, you also have to change the signature of
-	 * jtcp_rcv_established, otherwise you end up right here!
-	 */
-	BUILD_BUG_ON(__same_type(tcp_rcv_established,
-				 jtcp_rcv_established) == 0);
-
-	init_waitqueue_head(&tcp_probe.wait);
-	spin_lock_init(&tcp_probe.lock);
-
-	if (bufsize == 0)
-		return -EINVAL;
-
-	bufsize = roundup_pow_of_two(bufsize);
-	tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL);
-	if (!tcp_probe.log)
-		goto err0;
-
-	if (!proc_create(procname, S_IRUSR, init_net.proc_net, &tcpprobe_fops))
-		goto err0;
-
-	ret = register_jprobe(&tcp_jprobe);
-	if (ret)
-		goto err1;
-
-	pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
-		port, fwmark, bufsize);
-	return 0;
- err1:
-	remove_proc_entry(procname, init_net.proc_net);
- err0:
-	kfree(tcp_probe.log);
-	return ret;
-}
-module_init(tcpprobe_init);
-
-static __exit void tcpprobe_exit(void)
-{
-	remove_proc_entry(procname, init_net.proc_net);
-	unregister_jprobe(&tcp_jprobe);
-	kfree(tcp_probe.log);
-}
-module_exit(tcpprobe_exit);

^ permalink raw reply related

* [PATCH net-next v4 3/6] net: sctp: Add SCTP ACK tracking trace event
From: Masami Hiramatsu @ 2017-12-20  4:15 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat
In-Reply-To: <151374325126.2497.6934744693865165386.stgit@devbox>

Add SCTP ACK tracking trace event to trace the changes of SCTP
association state in response to incoming packets.
It is used for debugging SCTP congestion control algorithms,
and will replace sctp_probe module.

Note that this event is a bit tricky. Since it consists of 2
events (sctp_probe and sctp_probe_path), you have to enable
both events as below.

  # cd /sys/kernel/debug/tracing
  # echo 1 > events/sctp/sctp_probe/enable
  # echo 1 > events/sctp/sctp_probe_path/enable

Or, you can enable all the events under sctp.

  # echo 1 > events/sctp/enable

Since the sctp_probe_path event is always invoked from the
sctp_probe event, you cannot see any output if you only enable
sctp_probe_path.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
  Changes in v3:
   - Add checking whether sctp_probe_path event is enabled
     before iterating sctp paths to record. Thanks Steven.
  Changes in v4:
   - Move a temporary variable definition into the block.
   - Cast the pointer to unsigned long instead of __u64
     for 32-bit environments.
---
 include/trace/events/sctp.h |   99 +++++++++++++++++++++++++++++++++++++++++++
 net/sctp/sm_statefuns.c     |    5 ++
 2 files changed, 104 insertions(+)
 create mode 100644 include/trace/events/sctp.h

diff --git a/include/trace/events/sctp.h b/include/trace/events/sctp.h
new file mode 100644
index 000000000000..7475c7be165a
--- /dev/null
+++ b/include/trace/events/sctp.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sctp
+
+#if !defined(_TRACE_SCTP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SCTP_H
+
+#include <net/sctp/structs.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(sctp_probe_path,
+
+	TP_PROTO(struct sctp_transport *sp,
+		 const struct sctp_association *asoc),
+
+	TP_ARGS(sp, asoc),
+
+	TP_STRUCT__entry(
+		__field(__u64, asoc)
+		__field(__u32, primary)
+		__array(__u8, ipaddr, sizeof(union sctp_addr))
+		__field(__u32, state)
+		__field(__u32, cwnd)
+		__field(__u32, ssthresh)
+		__field(__u32, flight_size)
+		__field(__u32, partial_bytes_acked)
+		__field(__u32, pathmtu)
+	),
+
+	TP_fast_assign(
+		__entry->asoc = (unsigned long)asoc;
+		__entry->primary = (sp == asoc->peer.primary_path);
+		memcpy(__entry->ipaddr, &sp->ipaddr, sizeof(union sctp_addr));
+		__entry->state = sp->state;
+		__entry->cwnd = sp->cwnd;
+		__entry->ssthresh = sp->ssthresh;
+		__entry->flight_size = sp->flight_size;
+		__entry->partial_bytes_acked = sp->partial_bytes_acked;
+		__entry->pathmtu = sp->pathmtu;
+	),
+
+	TP_printk("asoc=%#llx%s ipaddr=%pISpc state=%u cwnd=%u ssthresh=%u "
+		  "flight_size=%u partial_bytes_acked=%u pathmtu=%u",
+		  __entry->asoc, __entry->primary ? "(*)" : "",
+		  __entry->ipaddr, __entry->state, __entry->cwnd,
+		  __entry->ssthresh, __entry->flight_size,
+		  __entry->partial_bytes_acked, __entry->pathmtu)
+);
+
+TRACE_EVENT(sctp_probe,
+
+	TP_PROTO(const struct sctp_endpoint *ep,
+		 const struct sctp_association *asoc,
+		 struct sctp_chunk *chunk),
+
+	TP_ARGS(ep, asoc, chunk),
+
+	TP_STRUCT__entry(
+		__field(__u64, asoc)
+		__field(__u32, mark)
+		__field(__u16, bind_port)
+		__field(__u16, peer_port)
+		__field(__u32, pathmtu)
+		__field(__u32, rwnd)
+		__field(__u16, unack_data)
+	),
+
+	TP_fast_assign(
+		struct sk_buff *skb = chunk->skb;
+
+		__entry->asoc = (unsigned long)asoc;
+		__entry->mark = skb->mark;
+		__entry->bind_port = ep->base.bind_addr.port;
+		__entry->peer_port = asoc->peer.port;
+		__entry->pathmtu = asoc->pathmtu;
+		__entry->rwnd = asoc->peer.rwnd;
+		__entry->unack_data = asoc->unack_data;
+
+		if (trace_sctp_probe_path_enabled()) {
+			struct sctp_transport *sp;
+
+			list_for_each_entry(sp, &asoc->peer.transport_addr_list,
+					    transports) {
+				trace_sctp_probe_path(sp, asoc);
+			}
+		}
+	),
+
+	TP_printk("asoc=%#llx mark=%#x bind_port=%d peer_port=%d pathmtu=%d "
+		  "rwnd=%u unack_data=%d",
+		  __entry->asoc, __entry->mark, __entry->bind_port,
+		  __entry->peer_port, __entry->pathmtu, __entry->rwnd,
+		  __entry->unack_data)
+);
+
+#endif /* _TRACE_SCTP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 541f34735346..eb7905ffe5f2 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -59,6 +59,9 @@
 #include <net/sctp/sm.h>
 #include <net/sctp/structs.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sctp.h>
+
 static struct sctp_packet *sctp_abort_pkt_new(
 					struct net *net,
 					const struct sctp_endpoint *ep,
@@ -3219,6 +3222,8 @@ enum sctp_disposition sctp_sf_eat_sack_6_2(struct net *net,
 	struct sctp_sackhdr *sackh;
 	__u32 ctsn;
 
+	trace_sctp_probe(ep, asoc, chunk);
+
 	if (!sctp_vtag_verify(chunk, asoc))
 		return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
 

^ permalink raw reply related

* [PATCH net-next v4 4/6] net: sctp: Remove debug SCTP probe module
From: Masami Hiramatsu @ 2017-12-20  4:16 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat
In-Reply-To: <151374325126.2497.6934744693865165386.stgit@devbox>

Remove the SCTP probe module since jprobe has been deprecated.
Its functionality is now provided by the sctp/sctp_probe and
sctp/sctp_probe_path trace events.
You can use them via ftrace or perftools.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
 net/sctp/Kconfig  |   12 ---
 net/sctp/Makefile |    3 -
 net/sctp/probe.c  |  244 -----------------------------------------------------
 3 files changed, 259 deletions(-)
 delete mode 100644 net/sctp/probe.c

diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index d9c04dc1b3f3..c740b189d4ba 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -37,18 +37,6 @@ menuconfig IP_SCTP
 
 if IP_SCTP
 
-config NET_SCTPPROBE
-	tristate "SCTP: Association probing"
-        depends on PROC_FS && KPROBES
-        ---help---
-        This module allows for capturing the changes to SCTP association
-        state in response to incoming packets. It is used for debugging
-        SCTP congestion control algorithms. If you don't understand
-        what was just said, you don't need it: say N.
-
-        To compile this code as a module, choose M here: the
-        module will be called sctp_probe.
-
 config SCTP_DBG_OBJCNT
 	bool "SCTP: Debug object counts"
 	depends on PROC_FS
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index 54bd9c1a8aa1..6776582ec449 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -4,7 +4,6 @@
 #
 
 obj-$(CONFIG_IP_SCTP) += sctp.o
-obj-$(CONFIG_NET_SCTPPROBE) += sctp_probe.o
 obj-$(CONFIG_INET_SCTP_DIAG) += sctp_diag.o
 
 sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
@@ -16,8 +15,6 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  offload.o stream_sched.o stream_sched_prio.o \
 	  stream_sched_rr.o stream_interleave.o
 
-sctp_probe-y := probe.o
-
 sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o
 sctp-$(CONFIG_PROC_FS) += proc.o
 sctp-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
deleted file mode 100644
index 1280f85a598d..000000000000
--- a/net/sctp/probe.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * sctp_probe - Observe the SCTP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * Modified for SCTP from Stephen Hemminger's code
- * Copyright (C) 2010, Wei Yongjun <yjwei@cn.fujitsu.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/sctp.h>
-#include <linux/proc_fs.h>
-#include <linux/vmalloc.h>
-#include <linux/module.h>
-#include <linux/kfifo.h>
-#include <linux/time.h>
-#include <net/net_namespace.h>
-
-#include <net/sctp/sctp.h>
-#include <net/sctp/sm.h>
-
-MODULE_SOFTDEP("pre: sctp");
-MODULE_AUTHOR("Wei Yongjun <yjwei@cn.fujitsu.com>");
-MODULE_DESCRIPTION("SCTP snooper");
-MODULE_LICENSE("GPL");
-
-static int port __read_mostly = 0;
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-static unsigned int fwmark __read_mostly = 0;
-MODULE_PARM_DESC(fwmark, "skb mark to match (0=no mark)");
-module_param(fwmark, uint, 0);
-
-static int bufsize __read_mostly = 64 * 1024;
-MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
-module_param(bufsize, int, 0);
-
-static int full __read_mostly = 1;
-MODULE_PARM_DESC(full, "Full log (1=every ack packet received,  0=only cwnd changes)");
-module_param(full, int, 0);
-
-static const char procname[] = "sctpprobe";
-
-static struct {
-	struct kfifo	  fifo;
-	spinlock_t	  lock;
-	wait_queue_head_t wait;
-	struct timespec64 tstart;
-} sctpw;
-
-static __printf(1, 2) void printl(const char *fmt, ...)
-{
-	va_list args;
-	int len;
-	char tbuf[256];
-
-	va_start(args, fmt);
-	len = vscnprintf(tbuf, sizeof(tbuf), fmt, args);
-	va_end(args);
-
-	kfifo_in_locked(&sctpw.fifo, tbuf, len, &sctpw.lock);
-	wake_up(&sctpw.wait);
-}
-
-static int sctpprobe_open(struct inode *inode, struct file *file)
-{
-	kfifo_reset(&sctpw.fifo);
-	ktime_get_ts64(&sctpw.tstart);
-
-	return 0;
-}
-
-static ssize_t sctpprobe_read(struct file *file, char __user *buf,
-			      size_t len, loff_t *ppos)
-{
-	int error = 0, cnt = 0;
-	unsigned char *tbuf;
-
-	if (!buf)
-		return -EINVAL;
-
-	if (len == 0)
-		return 0;
-
-	tbuf = vmalloc(len);
-	if (!tbuf)
-		return -ENOMEM;
-
-	error = wait_event_interruptible(sctpw.wait,
-					 kfifo_len(&sctpw.fifo) != 0);
-	if (error)
-		goto out_free;
-
-	cnt = kfifo_out_locked(&sctpw.fifo, tbuf, len, &sctpw.lock);
-	error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
-
-out_free:
-	vfree(tbuf);
-
-	return error ? error : cnt;
-}
-
-static const struct file_operations sctpprobe_fops = {
-	.owner	= THIS_MODULE,
-	.open	= sctpprobe_open,
-	.read	= sctpprobe_read,
-	.llseek = noop_llseek,
-};
-
-static enum sctp_disposition jsctp_sf_eat_sack(
-					struct net *net,
-					const struct sctp_endpoint *ep,
-					const struct sctp_association *asoc,
-					const union sctp_subtype type,
-					void *arg,
-					struct sctp_cmd_seq *commands)
-{
-	struct sctp_chunk *chunk = arg;
-	struct sk_buff *skb = chunk->skb;
-	struct sctp_transport *sp;
-	static __u32 lcwnd = 0;
-	struct timespec64 now;
-
-	sp = asoc->peer.primary_path;
-
-	if (((port == 0 && fwmark == 0) ||
-	     asoc->peer.port == port ||
-	     ep->base.bind_addr.port == port ||
-	     (fwmark > 0 && skb->mark == fwmark)) &&
-	    (full || sp->cwnd != lcwnd)) {
-		lcwnd = sp->cwnd;
-
-		ktime_get_ts64(&now);
-		now = timespec64_sub(now, sctpw.tstart);
-
-		printl("%lu.%06lu ", (unsigned long) now.tv_sec,
-		       (unsigned long) now.tv_nsec / NSEC_PER_USEC);
-
-		printl("%p %5d %5d %5d %8d %5d ", asoc,
-		       ep->base.bind_addr.port, asoc->peer.port,
-		       asoc->pathmtu, asoc->peer.rwnd, asoc->unack_data);
-
-		list_for_each_entry(sp, &asoc->peer.transport_addr_list,
-					transports) {
-			if (sp == asoc->peer.primary_path)
-				printl("*");
-
-			printl("%pISc %2u %8u %8u %8u %8u %8u ",
-			       &sp->ipaddr, sp->state, sp->cwnd, sp->ssthresh,
-			       sp->flight_size, sp->partial_bytes_acked,
-			       sp->pathmtu);
-		}
-		printl("\n");
-	}
-
-	jprobe_return();
-	return 0;
-}
-
-static struct jprobe sctp_recv_probe = {
-	.kp	= {
-		.symbol_name = "sctp_sf_eat_sack_6_2",
-	},
-	.entry	= jsctp_sf_eat_sack,
-};
-
-static __init int sctp_setup_jprobe(void)
-{
-	int ret = register_jprobe(&sctp_recv_probe);
-
-	if (ret) {
-		if (request_module("sctp"))
-			goto out;
-		ret = register_jprobe(&sctp_recv_probe);
-	}
-
-out:
-	return ret;
-}
-
-static __init int sctpprobe_init(void)
-{
-	int ret = -ENOMEM;
-
-	/* Warning: if the function signature of sctp_sf_eat_sack_6_2,
-	 * has been changed, you also have to change the signature of
-	 * jsctp_sf_eat_sack, otherwise you end up right here!
-	 */
-	BUILD_BUG_ON(__same_type(sctp_sf_eat_sack_6_2,
-				 jsctp_sf_eat_sack) == 0);
-
-	init_waitqueue_head(&sctpw.wait);
-	spin_lock_init(&sctpw.lock);
-	if (kfifo_alloc(&sctpw.fifo, bufsize, GFP_KERNEL))
-		return ret;
-
-	if (!proc_create(procname, S_IRUSR, init_net.proc_net,
-			 &sctpprobe_fops))
-		goto free_kfifo;
-
-	ret = sctp_setup_jprobe();
-	if (ret)
-		goto remove_proc;
-
-	pr_info("probe registered (port=%d/fwmark=%u) bufsize=%u\n",
-		port, fwmark, bufsize);
-	return 0;
-
-remove_proc:
-	remove_proc_entry(procname, init_net.proc_net);
-free_kfifo:
-	kfifo_free(&sctpw.fifo);
-	return ret;
-}
-
-static __exit void sctpprobe_exit(void)
-{
-	kfifo_free(&sctpw.fifo);
-	remove_proc_entry(procname, init_net.proc_net);
-	unregister_jprobe(&sctp_recv_probe);
-}
-
-module_init(sctpprobe_init);
-module_exit(sctpprobe_exit);

^ permalink raw reply related

* [PATCH net-next v4 5/6] net: dccp: Add DCCP sendmsg trace event
From: Masami Hiramatsu @ 2017-12-20  4:16 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat
In-Reply-To: <151374325126.2497.6934744693865165386.stgit@devbox>

Add a DCCP sendmsg trace event (dccp/dccp_probe) to
replace dccpprobe. Users can trace this event via
ftrace or perftools.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
 net/dccp/proto.c |    5 +++
 net/dccp/trace.h |  105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 110 insertions(+)
 create mode 100644 net/dccp/trace.h

diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 9d43c1f40274..e57b5db495cd 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -38,6 +38,9 @@
 #include "dccp.h"
 #include "feat.h"
 
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
 DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
 
 EXPORT_SYMBOL_GPL(dccp_statistics);
@@ -761,6 +764,8 @@ int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	int rc, size;
 	long timeo;
 
+	trace_dccp_probe(sk, len);
+
 	if (len > dp->dccps_mss_cache)
 		return -EMSGSIZE;
 
diff --git a/net/dccp/trace.h b/net/dccp/trace.h
new file mode 100644
index 000000000000..aa01321a6c37
--- /dev/null
+++ b/net/dccp/trace.h
@@ -0,0 +1,105 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM dccp
+
+#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_DCCP_H
+
+#include <net/sock.h>
+#include "dccp.h"
+#include "ccids/ccid3.h"
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(dccp_probe,
+
+	TP_PROTO(struct sock *sk, size_t size),
+
+	TP_ARGS(sk, size),
+
+	TP_STRUCT__entry(
+		/* sockaddr_in6 is always bigger than sockaddr_in */
+		__array(__u8, saddr, sizeof(struct sockaddr_in6))
+		__array(__u8, daddr, sizeof(struct sockaddr_in6))
+		__field(__u16, sport)
+		__field(__u16, dport)
+		__field(__u16, size)
+		__field(__u16, tx_s)
+		__field(__u32, tx_rtt)
+		__field(__u32, tx_p)
+		__field(__u32, tx_x_calc)
+		__field(__u64, tx_x_recv)
+		__field(__u64, tx_x)
+		__field(__u32, tx_t_ipi)
+	),
+
+	TP_fast_assign(
+		const struct inet_sock *inet = inet_sk(sk);
+		struct ccid3_hc_tx_sock *hc = NULL;
+
+		if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
+			hc = ccid3_hc_tx_sk(sk);
+
+		memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
+		memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
+
+		if (sk->sk_family == AF_INET) {
+			struct sockaddr_in *v4 = (void *)__entry->saddr;
+
+			v4->sin_family = AF_INET;
+			v4->sin_port = inet->inet_sport;
+			v4->sin_addr.s_addr = inet->inet_saddr;
+			v4 = (void *)__entry->daddr;
+			v4->sin_family = AF_INET;
+			v4->sin_port = inet->inet_dport;
+			v4->sin_addr.s_addr = inet->inet_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+		} else if (sk->sk_family == AF_INET6) {
+			struct sockaddr_in6 *v6 = (void *)__entry->saddr;
+
+			v6->sin6_family = AF_INET6;
+			v6->sin6_port = inet->inet_sport;
+			v6->sin6_addr = inet6_sk(sk)->saddr;
+			v6 = (void *)__entry->daddr;
+			v6->sin6_family = AF_INET6;
+			v6->sin6_port = inet->inet_dport;
+			v6->sin6_addr = sk->sk_v6_daddr;
+#endif
+		}
+
+		/* For filtering use */
+		__entry->sport = ntohs(inet->inet_sport);
+		__entry->dport = ntohs(inet->inet_dport);
+
+		__entry->size = size;
+		if (hc) {
+			__entry->tx_s = hc->tx_s;
+			__entry->tx_rtt = hc->tx_rtt;
+			__entry->tx_p = hc->tx_p;
+			__entry->tx_x_calc = hc->tx_x_calc;
+			__entry->tx_x_recv = hc->tx_x_recv >> 6;
+			__entry->tx_x = hc->tx_x >> 6;
+			__entry->tx_t_ipi = hc->tx_t_ipi;
+		} else {
+			__entry->tx_s = 0;
+			memset(&__entry->tx_rtt, 0, (void *)&__entry->tx_t_ipi -
+			       (void *)&__entry->tx_rtt +
+			       sizeof(__entry->tx_t_ipi));
+		}
+	),
+
+	TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d "
+		  "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d",
+		  __entry->saddr, __entry->daddr, __entry->size,
+		  __entry->tx_s, __entry->tx_rtt, __entry->tx_p,
+		  __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x,
+		  __entry->tx_t_ipi)
+);
+
+#endif /* _TRACE_DCCP_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>

^ permalink raw reply related

* [PATCH net-next v4 6/6] net: dccp: Remove dccpprobe module
From: Masami Hiramatsu @ 2017-12-20  4:17 UTC (permalink / raw)
  To: Ingo Molnar, David S . Miller, Ian McDonald, Vlad Yasevich,
	Stephen Hemminger, Steven Rostedt
  Cc: Peter Zijlstra, Thomas Gleixner, LKML, H . Peter Anvin,
	Gerrit Renker, Neil Horman, dccp, netdev, linux-sctp,
	Stephen Rothwell, mhiramat
In-Reply-To: <151374325126.2497.6934744693865165386.stgit@devbox>

Remove the DCCP probe module since jprobes have been deprecated.
Its functionality is now provided by the dccp/dccp_probe trace event,
which you can use via ftrace or perf tools.

Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
---
 net/dccp/Kconfig  |   17 ----
 net/dccp/Makefile |    2 -
 net/dccp/probe.c  |  203 -----------------------------------------------------
 3 files changed, 222 deletions(-)
 delete mode 100644 net/dccp/probe.c

diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index 8c0ef71bed2f..b270e84d9c13 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -39,23 +39,6 @@ config IP_DCCP_DEBUG
 
 	  Just say N.
 
-config NET_DCCPPROBE
-	tristate "DCCP connection probing"
-	depends on PROC_FS && KPROBES
-	---help---
-	This module allows for capturing the changes to DCCP connection
-	state in response to incoming packets. It is used for debugging
-	DCCP congestion avoidance modules. If you don't understand
-	what was just said, you don't need it: say N.
-
-	Documentation on how to use DCCP connection probing can be found
-	at:
-	
-	  http://www.linuxfoundation.org/collaborate/workgroups/networking/dccpprobe
-
-	To compile this code as a module, choose M here: the
-	module will be called dccp_probe.
-
 
 endmenu
 
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 2e7b56097bc4..9d0383d2f277 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -21,9 +21,7 @@ obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
 dccp_ipv6-y := ipv6.o
 
 obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
-obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
 
 dccp-$(CONFIG_SYSCTL) += sysctl.o
 
 dccp_diag-y := diag.o
-dccp_probe-y := probe.o
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
deleted file mode 100644
index 3d3fda05b32d..000000000000
--- a/net/dccp/probe.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * dccp_probe - Observe the DCCP flow with kprobes.
- *
- * The idea for this came from Werner Almesberger's umlsim
- * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
- *
- * Modified for DCCP from Stephen Hemminger's code
- * Copyright (C) 2006, Ian McDonald <ian.mcdonald@jandi.co.nz>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/kernel.h>
-#include <linux/kprobes.h>
-#include <linux/socket.h>
-#include <linux/dccp.h>
-#include <linux/proc_fs.h>
-#include <linux/module.h>
-#include <linux/kfifo.h>
-#include <linux/vmalloc.h>
-#include <linux/time64.h>
-#include <linux/gfp.h>
-#include <net/net_namespace.h>
-
-#include "dccp.h"
-#include "ccid.h"
-#include "ccids/ccid3.h"
-
-static int port;
-
-static int bufsize = 64 * 1024;
-
-static const char procname[] = "dccpprobe";
-
-static struct {
-	struct kfifo	  fifo;
-	spinlock_t	  lock;
-	wait_queue_head_t wait;
-	struct timespec64 tstart;
-} dccpw;
-
-static void printl(const char *fmt, ...)
-{
-	va_list args;
-	int len;
-	struct timespec64 now;
-	char tbuf[256];
-
-	va_start(args, fmt);
-	getnstimeofday64(&now);
-
-	now = timespec64_sub(now, dccpw.tstart);
-
-	len = sprintf(tbuf, "%lu.%06lu ",
-		      (unsigned long) now.tv_sec,
-		      (unsigned long) now.tv_nsec / NSEC_PER_USEC);
-	len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
-	va_end(args);
-
-	kfifo_in_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
-	wake_up(&dccpw.wait);
-}
-
-static int jdccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
-{
-	const struct inet_sock *inet = inet_sk(sk);
-	struct ccid3_hc_tx_sock *hc = NULL;
-
-	if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
-		hc = ccid3_hc_tx_sk(sk);
-
-	if (port == 0 || ntohs(inet->inet_dport) == port ||
-	    ntohs(inet->inet_sport) == port) {
-		if (hc)
-			printl("%pI4:%u %pI4:%u %d %d %d %d %u %llu %llu %d\n",
-			       &inet->inet_saddr, ntohs(inet->inet_sport),
-			       &inet->inet_daddr, ntohs(inet->inet_dport), size,
-			       hc->tx_s, hc->tx_rtt, hc->tx_p,
-			       hc->tx_x_calc, hc->tx_x_recv >> 6,
-			       hc->tx_x >> 6, hc->tx_t_ipi);
-		else
-			printl("%pI4:%u %pI4:%u %d\n",
-			       &inet->inet_saddr, ntohs(inet->inet_sport),
-			       &inet->inet_daddr, ntohs(inet->inet_dport),
-			       size);
-	}
-
-	jprobe_return();
-	return 0;
-}
-
-static struct jprobe dccp_send_probe = {
-	.kp	= {
-		.symbol_name = "dccp_sendmsg",
-	},
-	.entry	= jdccp_sendmsg,
-};
-
-static int dccpprobe_open(struct inode *inode, struct file *file)
-{
-	kfifo_reset(&dccpw.fifo);
-	getnstimeofday64(&dccpw.tstart);
-	return 0;
-}
-
-static ssize_t dccpprobe_read(struct file *file, char __user *buf,
-			      size_t len, loff_t *ppos)
-{
-	int error = 0, cnt = 0;
-	unsigned char *tbuf;
-
-	if (!buf)
-		return -EINVAL;
-
-	if (len == 0)
-		return 0;
-
-	tbuf = vmalloc(len);
-	if (!tbuf)
-		return -ENOMEM;
-
-	error = wait_event_interruptible(dccpw.wait,
-					 kfifo_len(&dccpw.fifo) != 0);
-	if (error)
-		goto out_free;
-
-	cnt = kfifo_out_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
-	error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
-
-out_free:
-	vfree(tbuf);
-
-	return error ? error : cnt;
-}
-
-static const struct file_operations dccpprobe_fops = {
-	.owner	 = THIS_MODULE,
-	.open	 = dccpprobe_open,
-	.read    = dccpprobe_read,
-	.llseek  = noop_llseek,
-};
-
-static __init int dccpprobe_init(void)
-{
-	int ret = -ENOMEM;
-
-	init_waitqueue_head(&dccpw.wait);
-	spin_lock_init(&dccpw.lock);
-	if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL))
-		return ret;
-	if (!proc_create(procname, S_IRUSR, init_net.proc_net, &dccpprobe_fops))
-		goto err0;
-
-	ret = register_jprobe(&dccp_send_probe);
-	if (ret) {
-		ret = request_module("dccp");
-		if (!ret)
-			ret = register_jprobe(&dccp_send_probe);
-	}
-
-	if (ret)
-		goto err1;
-
-	pr_info("DCCP watch registered (port=%d)\n", port);
-	return 0;
-err1:
-	remove_proc_entry(procname, init_net.proc_net);
-err0:
-	kfifo_free(&dccpw.fifo);
-	return ret;
-}
-module_init(dccpprobe_init);
-
-static __exit void dccpprobe_exit(void)
-{
-	kfifo_free(&dccpw.fifo);
-	remove_proc_entry(procname, init_net.proc_net);
-	unregister_jprobe(&dccp_send_probe);
-
-}
-module_exit(dccpprobe_exit);
-
-MODULE_PARM_DESC(port, "Port to match (0=all)");
-module_param(port, int, 0);
-
-MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
-module_param(bufsize, int, 0);
-
-MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>");
-MODULE_DESCRIPTION("DCCP snooper");
-MODULE_LICENSE("GPL");

^ permalink raw reply related

* [PATCH net-next] virtio_net: Add ethtool stats
From: Toshiaki Makita @ 2017-12-20  4:40 UTC (permalink / raw)
  To: David S . Miller, Michael S . Tsirkin, Jason Wang
  Cc: Toshiaki Makita, netdev, virtualization

The main purpose of this patch is to add a way to check per-queue stats.
This is useful for debugging performance problems in multiqueue environments.

$ ethtool -S ens10
NIC statistics:
     rx_packets: 4172939
     tx_packets: 5855538
     rx_bytes: 6317757408
     tx_bytes: 8865151846
     rx_dropped: 0
     rx_length_errors: 0
     rx_frame_errors: 0
     tx_dropped: 0
     tx_fifo_errors: 0
     rx_queue_0_packets: 2090408
     rx_queue_0_bytes: 3164825094
     rx_queue_1_packets: 2082531
     rx_queue_1_bytes: 3152932314
     tx_queue_0_packets: 2770841
     tx_queue_0_bytes: 4194955474
     tx_queue_1_packets: 3084697
     tx_queue_1_bytes: 4670196372

Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
---
 drivers/net/virtio_net.c | 187 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 136 insertions(+), 51 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6fb7b65..a0a7bf5 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -65,14 +65,31 @@
 	VIRTIO_NET_F_GUEST_UFO
 };
 
-struct virtnet_stats {
-	struct u64_stats_sync tx_syncp;
-	struct u64_stats_sync rx_syncp;
-	u64 tx_bytes;
-	u64 tx_packets;
-
-	u64 rx_bytes;
-	u64 rx_packets;
+struct virtnet_gstats {
+	char stat_string[ETH_GSTRING_LEN];
+	int stat_offset;
+};
+
+#define VIRTNET_NETDEV_STAT(m)	offsetof(struct rtnl_link_stats64, m)
+
+static const struct virtnet_gstats virtnet_gstrings_stats[] = {
+	{ "rx_packets",		VIRTNET_NETDEV_STAT(rx_packets) },
+	{ "tx_packets",		VIRTNET_NETDEV_STAT(tx_packets) },
+	{ "rx_bytes",		VIRTNET_NETDEV_STAT(rx_bytes) },
+	{ "tx_bytes",		VIRTNET_NETDEV_STAT(tx_bytes) },
+	{ "rx_dropped",		VIRTNET_NETDEV_STAT(rx_dropped) },
+	{ "rx_length_errors",	VIRTNET_NETDEV_STAT(rx_length_errors) },
+	{ "rx_frame_errors",	VIRTNET_NETDEV_STAT(rx_frame_errors) },
+	{ "tx_dropped",		VIRTNET_NETDEV_STAT(tx_dropped) },
+	{ "tx_fifo_errors",	VIRTNET_NETDEV_STAT(tx_fifo_errors) },
+};
+
+# define VIRTNET_GSTATS_LEN	ARRAY_SIZE(virtnet_gstrings_stats)
+
+struct virtnet_queue_stats {
+	struct u64_stats_sync syncp;
+	u64 bytes;
+	u64 packets;
 };
 
 /* Internal representation of a send virtqueue */
@@ -86,6 +103,8 @@ struct send_queue {
 	/* Name of the send queue: output.$index */
 	char name[40];
 
+	struct virtnet_queue_stats stats;
+
 	struct napi_struct napi;
 };
 
@@ -98,6 +117,8 @@ struct receive_queue {
 
 	struct bpf_prog __rcu *xdp_prog;
 
+	struct virtnet_queue_stats stats;
+
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
@@ -149,9 +170,6 @@ struct virtnet_info {
 	/* Packet virtio header size */
 	u8 hdr_len;
 
-	/* Active statistics */
-	struct virtnet_stats __percpu *stats;
-
 	/* Work struct for refilling if we run low on memory. */
 	struct delayed_work refill;
 
@@ -1121,7 +1139,6 @@ static int virtnet_receive(struct receive_queue *rq, int budget, bool *xdp_xmit)
 	struct virtnet_info *vi = rq->vq->vdev->priv;
 	unsigned int len, received = 0, bytes = 0;
 	void *buf;
-	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
 
 	if (!vi->big_packets || vi->mergeable_rx_bufs) {
 		void *ctx;
@@ -1144,10 +1161,10 @@ static int virtnet_receive(struct receive_queue *rq, int budget, bool *xdp_xmit)
 			schedule_delayed_work(&vi->refill, 0);
 	}
 
-	u64_stats_update_begin(&stats->rx_syncp);
-	stats->rx_bytes += bytes;
-	stats->rx_packets += received;
-	u64_stats_update_end(&stats->rx_syncp);
+	u64_stats_update_begin(&rq->stats.syncp);
+	rq->stats.bytes += bytes;
+	rq->stats.packets += received;
+	u64_stats_update_end(&rq->stats.syncp);
 
 	return received;
 }
@@ -1156,8 +1173,6 @@ static void free_old_xmit_skbs(struct send_queue *sq)
 {
 	struct sk_buff *skb;
 	unsigned int len;
-	struct virtnet_info *vi = sq->vq->vdev->priv;
-	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
 	unsigned int packets = 0;
 	unsigned int bytes = 0;
 
@@ -1176,10 +1191,10 @@ static void free_old_xmit_skbs(struct send_queue *sq)
 	if (!packets)
 		return;
 
-	u64_stats_update_begin(&stats->tx_syncp);
-	stats->tx_bytes += bytes;
-	stats->tx_packets += packets;
-	u64_stats_update_end(&stats->tx_syncp);
+	u64_stats_update_begin(&sq->stats.syncp);
+	sq->stats.bytes += bytes;
+	sq->stats.packets += packets;
+	u64_stats_update_end(&sq->stats.syncp);
 }
 
 static void virtnet_poll_cleantx(struct receive_queue *rq)
@@ -1463,24 +1478,24 @@ static void virtnet_stats(struct net_device *dev,
 			  struct rtnl_link_stats64 *tot)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
-	int cpu;
 	unsigned int start;
+	int i;
 
-	for_each_possible_cpu(cpu) {
-		struct virtnet_stats *stats = per_cpu_ptr(vi->stats, cpu);
+	for (i = 0; i < vi->max_queue_pairs; i++) {
 		u64 tpackets, tbytes, rpackets, rbytes;
+		struct receive_queue *rq = &vi->rq[i];
+		struct send_queue *sq = &vi->sq[i];
 
 		do {
-			start = u64_stats_fetch_begin_irq(&stats->tx_syncp);
-			tpackets = stats->tx_packets;
-			tbytes   = stats->tx_bytes;
-		} while (u64_stats_fetch_retry_irq(&stats->tx_syncp, start));
-
+			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
+			tpackets = sq->stats.packets;
+			tbytes   = sq->stats.bytes;
+		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
 		do {
-			start = u64_stats_fetch_begin_irq(&stats->rx_syncp);
-			rpackets = stats->rx_packets;
-			rbytes   = stats->rx_bytes;
-		} while (u64_stats_fetch_retry_irq(&stats->rx_syncp, start));
+			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
+			rpackets = rq->stats.packets;
+			rbytes   = rq->stats.bytes;
+		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
 
 		tot->rx_packets += rpackets;
 		tot->tx_packets += tpackets;
@@ -1817,6 +1832,84 @@ static int virtnet_set_channels(struct net_device *dev,
 	return err;
 }
 
+static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	char *p = (char *)data;
+	unsigned int i;
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		for (i = 0; i < VIRTNET_GSTATS_LEN; i++) {
+			memcpy(p, virtnet_gstrings_stats[i].stat_string,
+			       ETH_GSTRING_LEN);
+			p += ETH_GSTRING_LEN;
+		}
+		for (i = 0; i < vi->curr_queue_pairs; i++) {
+			sprintf(p, "rx_queue_%u_packets", i);
+			p += ETH_GSTRING_LEN;
+			sprintf(p, "rx_queue_%u_bytes", i);
+			p += ETH_GSTRING_LEN;
+		}
+		for (i = 0; i < vi->curr_queue_pairs; i++) {
+			sprintf(p, "tx_queue_%u_packets", i);
+			p += ETH_GSTRING_LEN;
+			sprintf(p, "tx_queue_%u_bytes", i);
+			p += ETH_GSTRING_LEN;
+		}
+		break;
+	}
+}
+
+static int virtnet_get_sset_count(struct net_device *dev, int sset)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+
+	switch (sset) {
+	case ETH_SS_STATS:
+		return VIRTNET_GSTATS_LEN + vi->curr_queue_pairs * 4;
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static void virtnet_get_ethtool_stats(struct net_device *dev,
+				      struct ethtool_stats *stats, u64 *data)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	struct rtnl_link_stats64 storage;
+	unsigned int idx = 0, start, i;
+	const u8 *stats_base;
+
+	stats_base = (u8 *)dev_get_stats(dev, &storage);
+	for (i = 0; i < VIRTNET_GSTATS_LEN; i++) {
+		data[idx++] = *(u64 *)(stats_base +
+				       virtnet_gstrings_stats[i].stat_offset);
+	}
+
+	for (i = 0; i < vi->curr_queue_pairs; i++) {
+		struct receive_queue *rq = &vi->rq[i];
+
+		do {
+			start = u64_stats_fetch_begin_irq(&rq->stats.syncp);
+			data[idx] = rq->stats.packets;
+			data[idx + 1] = rq->stats.bytes;
+		} while (u64_stats_fetch_retry_irq(&rq->stats.syncp, start));
+		idx += 2;
+	}
+
+	for (i = 0; i < vi->curr_queue_pairs; i++) {
+		struct send_queue *sq = &vi->sq[i];
+
+		do {
+			start = u64_stats_fetch_begin_irq(&sq->stats.syncp);
+			data[idx] = sq->stats.packets;
+			data[idx + 1] = sq->stats.bytes;
+		} while (u64_stats_fetch_retry_irq(&sq->stats.syncp, start));
+		idx += 2;
+	}
+}
+
 static void virtnet_get_channels(struct net_device *dev,
 				 struct ethtool_channels *channels)
 {
@@ -1898,6 +1991,9 @@ static void virtnet_init_settings(struct net_device *dev)
 	.get_drvinfo = virtnet_get_drvinfo,
 	.get_link = ethtool_op_get_link,
 	.get_ringparam = virtnet_get_ringparam,
+	.get_strings = virtnet_get_strings,
+	.get_sset_count = virtnet_get_sset_count,
+	.get_ethtool_stats = virtnet_get_ethtool_stats,
 	.set_channels = virtnet_set_channels,
 	.get_channels = virtnet_get_channels,
 	.get_ts_info = ethtool_op_get_ts_info,
@@ -2389,6 +2485,9 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
 		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
 		ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
 		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
+
+		u64_stats_init(&vi->rq[i].stats.syncp);
+		u64_stats_init(&vi->sq[i].stats.syncp);
 	}
 
 	return 0;
@@ -2513,7 +2612,7 @@ static int virtnet_validate(struct virtio_device *vdev)
 
 static int virtnet_probe(struct virtio_device *vdev)
 {
-	int i, err;
+	int i, err = -ENOMEM;
 	struct net_device *dev;
 	struct virtnet_info *vi;
 	u16 max_queue_pairs;
@@ -2590,17 +2689,6 @@ static int virtnet_probe(struct virtio_device *vdev)
 	vi->dev = dev;
 	vi->vdev = vdev;
 	vdev->priv = vi;
-	vi->stats = alloc_percpu(struct virtnet_stats);
-	err = -ENOMEM;
-	if (vi->stats == NULL)
-		goto free;
-
-	for_each_possible_cpu(i) {
-		struct virtnet_stats *virtnet_stats;
-		virtnet_stats = per_cpu_ptr(vi->stats, i);
-		u64_stats_init(&virtnet_stats->tx_syncp);
-		u64_stats_init(&virtnet_stats->rx_syncp);
-	}
 
 	INIT_WORK(&vi->config_work, virtnet_config_changed_work);
 
@@ -2637,7 +2725,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 			 */
 			dev_err(&vdev->dev, "device MTU appears to have changed "
 				"it is now %d < %d", mtu, dev->min_mtu);
-			goto free_stats;
+			goto free;
 		}
 
 		dev->mtu = mtu;
@@ -2661,7 +2749,7 @@ static int virtnet_probe(struct virtio_device *vdev)
 	/* Allocate/initialize the rx/tx queues, and invoke find_vqs */
 	err = init_vqs(vi);
 	if (err)
-		goto free_stats;
+		goto free;
 
 #ifdef CONFIG_SYSFS
 	if (vi->mergeable_rx_bufs)
@@ -2715,8 +2803,6 @@ static int virtnet_probe(struct virtio_device *vdev)
 	cancel_delayed_work_sync(&vi->refill);
 	free_receive_page_frags(vi);
 	virtnet_del_vqs(vi);
-free_stats:
-	free_percpu(vi->stats);
 free:
 	free_netdev(dev);
 	return err;
@@ -2749,7 +2835,6 @@ static void virtnet_remove(struct virtio_device *vdev)
 
 	remove_vq_common(vi);
 
-	free_percpu(vi->stats);
 	free_netdev(vi->dev);
 }
 
-- 
1.8.3.1

^ permalink raw reply related

* Re: [PATCH net 0/3] Few mvneta fixes
From: Willy Tarreau @ 2017-12-20  5:19 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Thomas Petazzoni, Andrew Lunn, Jason Cooper, Networking,
	Antoine Tenart, Linux Kernel Mailing List, Dmitri Epshtein,
	Nadav Haklai, Lior Amsalem, Miquèl Raynal, Gregory CLEMENT,
	Marcin Wojtas, David S. Miller, Linux ARM, Sebastian Hesselbarth
In-Reply-To: <CAK8P3a18h_JB_P4DOFmd+v+f5KM1X9h513qUke_7nxoSJtiOUw@mail.gmail.com>

Hi Arnd,

On Tue, Dec 19, 2017 at 09:18:35PM +0100, Arnd Bergmann wrote:
> On Tue, Dec 19, 2017 at 5:59 PM, Gregory CLEMENT
> <gregory.clement@free-electrons.com> wrote:
> > Hello,
> >
> > here it is a small series of fixes found on the mvneta driver. They
> > had been already used in the vendor kernel and are now ported to
> > mainline.
> 
> Does one of the patches look like it addresses the rare Oops we discussed on
> #kernelci this morning?
> 
> https://storage.kernelci.org/stable/linux-4.9.y/v4.9.70/arm/mvebu_v7_defconfig/lab-free-electrons/boot-armada-375-db.html

I could be wrong but for me the 375 uses mvpp2, not mvneta, so this
should have no effect there.

Willy

^ permalink raw reply

* Re: r8169 regression: UDP packets dropped intermittently
From: Jonathan Woithe @ 2017-12-20  5:20 UTC (permalink / raw)
  To: Michal Kubecek; +Cc: Holger Hoffstätte, netdev, linux-kernel
In-Reply-To: <20171219122523.lhavmoxo3ippftyn@unicorn.suse.cz>

On Tue, Dec 19, 2017 at 01:25:23PM +0100, Michal Kubecek wrote:
> On Tue, Dec 19, 2017 at 04:15:32PM +1030, Jonathan Woithe wrote:
> > This clearly indicates that not every card using the r8169 driver is
> > vulnerable to the problem.  It also explains why Holger was unable to
> > reproduce the result on his system: the PCIe cards do not appear to suffer
> > from the problem.  Most likely the PCI RTL-8169 chip is affected, but newer
> > PCIe variations do not.  However, obviously more testing will be required
> > with a wider variety of cards if this inference is to hold up.
> 
> The r8169 driver supports many slightly different variants of the chip.
> To identify your variant more precisely, look for a line like
> 
>   r8169 0000:02:00.0 eth0: RTL8168evl/8111evl at 0xffffc90003135000, d4:3d:7e:2a:30:08, XID 0c900800 IRQ 38
> 
> in kernel log.

The PCIe card (the one which works correctly with the current driver) shows
this:

  r8169 0000:02:00.0 eth0: RTL8168e/8111e at 0xf862e000, 80:1f:02:45:25:a4, 
    XID 0c200000 IRQ 30
  r8169 0000:02:00.0 eth0: jumbo features [frames: 9200 bytes, 
    tx checksumming: ko]

The PCI card (Netgear GA311) which is affected by the problem shows this:

  r8169 0000:05:01.0 eth1: RTL8110s at 0xf8706800, e0:91:f5:1b:5f:c6, 
    XID 04000000 IRQ 22
  r8169 0000:05:01.0 eth1: jumbo features [frames: 7152 bytes, 
    tx checksumming: ok]

The system which has shown the regressed behaviour is running a 32-bit
kernel; for various reasons we can't move to a 64-bit kernel at present. 
However, I was able to boot this system using Slackware 14.2 install discs,
and therefore test using both 32-bit and 64-bit 4.4.14 kernels.  In both
cases the fault was observed within 30 minutes of starting the tests when
the GA311 card was in use.  The fault is therefore not specific to 32-bit
environments.

Regards
  jonathan

^ permalink raw reply

* Re: [PATCH net 1/2] cls_bpf: fix offload assumptions after callback conversion
From: Jiri Pirko @ 2017-12-20  6:05 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, daniel, oss-drivers
In-Reply-To: <20171219213214.1084-2-jakub.kicinski@netronome.com>

Tue, Dec 19, 2017 at 10:32:13PM CET, jakub.kicinski@netronome.com wrote:
>cls_bpf used to take care of tracking what offload state a filter
>is in, i.e. it would track if offload request succeeded or not.
>This information would then be used to issue correct requests to
>the driver, e.g. requests for statistics only on offloaded filters,
>removing only filters which were offloaded, using add instead of
>replace if previous filter was not added etc.
>
>This tracking of offload state no longer functions with the new
>callback infrastructure.  There could be multiple entities trying
>to offload the same filter.
>
>Throw out all the tracking and corresponding commands and simply
>pass to the drivers both old and new bpf program.  Drivers will
>have to deal with offload state tracking by themselves.
>
>Fixes: 3f7889c4c79b ("net: sched: cls_bpf: call block callbacks for offload")
>Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>

Acked-by: Jiri Pirko <jiri@mellanox.com>

Thanks Jakub!

^ permalink raw reply

* Re: RCU callback crashes
From: Jiri Pirko @ 2017-12-20  6:11 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev@vger.kernel.org, Cong Wang
In-Reply-To: <20171219175921.7db9b0e1@cakuba.netronome.com>

Wed, Dec 20, 2017 at 02:59:21AM CET, kubakici@wp.pl wrote:
>Hi!
>
>If I run the netdevsim test long enough on a kernel with no debugging 

Just running tools/testing/selftests/bpf/test_offload.py?

>I get this:

Could you try to run it with kasan on?

>
>[ 1400.450124] BUG: unable to handle kernel paging request at 000000046474e552
>[ 1400.458005] IP: 0x46474e552
>[ 1400.461231] PGD 0 P4D 0 
>[ 1400.464150] Oops: 0010 [#1] PREEMPT SMP
>[ 1400.468525] Modules linked in: cls_bpf sch_ingress algif_hash af_alg netdevsim rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace f3
>[ 1400.516951] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 4.15.0-rc3-perf-00918-g129c9981a55f #918
>[ 1400.526678] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.3.4 11/08/2016
>[ 1400.535150] RIP: 0010:0x46474e552
>[ 1400.538941] RSP: 0018:ffff9f736f083f08 EFLAGS: 00010216
>[ 1400.544870] RAX: ffff9f736b4771b8 RBX: ffff9f736f09b880 RCX: ffff9f736b4771b8
>[ 1400.552935] RDX: 000000046474e552 RSI: ffff9f736f083f18 RDI: ffff9f736b4771b8
>[ 1400.561001] RBP: ffffffff8bc4a740 R08: ffff9f736b4771b8 R09: 0000000000000000
>[ 1400.569066] R10: ffff9f736f083d90 R11: 0000000000000000 R12: ffff9f736f09b8b8
>[ 1400.577132] R13: 000000000000000a R14: 7fffffffffffffff R15: 0000000000000202
>[ 1400.585197] FS:  0000000000000000(0000) GS:ffff9f736f080000(0000) knlGS:0000000000000000
>[ 1400.594349] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>[ 1400.600859] CR2: 000000046474e552 CR3: 0000000839c09001 CR4: 00000000003606e0
>[ 1400.608917] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>[ 1400.616982] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>[ 1400.625048] Call Trace:
>[ 1400.627868]  <IRQ>
>[ 1400.630207]  ? rcu_process_callbacks+0x1a0/0x4d0
>[ 1400.635458]  ? __do_softirq+0xd1/0x30a
>[ 1400.639739]  ? irq_exit+0xae/0xb0
>[ 1400.643532]  ? smp_apic_timer_interrupt+0x60/0x140
>[ 1400.648977]  ? apic_timer_interrupt+0x8c/0xa0
>[ 1400.653934]  </IRQ>
>[ 1400.656370]  ? cpuidle_enter_state+0xb0/0x2f0
>[ 1400.661328]  ? cpuidle_enter_state+0x8d/0x2f0
>[ 1400.666287]  ? do_idle+0x17b/0x1d0
>[ 1400.670167]  ? cpu_startup_entry+0x5f/0x70
>[ 1400.674836]  ? start_secondary+0x169/0x190
>[ 1400.679504]  ? secondary_startup_64+0xa5/0xb0
>[ 1400.684466] Code:  Bad RIP value.
>[ 1400.688259] RIP: 0x46474e552 RSP: ffff9f736f083f08
>[ 1400.693703] CR2: 000000046474e552
>[ 1400.697501] ---[ end trace fab2c0fb826644df ]---
>[ 1400.708442] Kernel panic - not syncing: Fatal exception in interrupt
>[ 1400.715693] Kernel Offset: 0xa000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
>[ 1400.732994] ---[ end Kernel panic - not syncing: Fatal exception in interrupt
>
>Unfortunately reproducing the crash on an instrumented kernel seems to
>be difficult..
>
>I managed to gather this:
>
>[   26.157415] ------------[ cut here ]------------
>[   26.162670] ODEBUG: free active (active state 1) object type: rcu_head hint:           (null)
>[   26.172361] WARNING: CPU: 19 PID: 1352 at ../lib/debugobjects.c:291 debug_print_object+0x64/0x80
>[   26.182288] Modules linked in: cls_bpf sch_ingress algif_hash af_alg netdevsim rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace f3
>[   26.230728] CPU: 19 PID: 1352 Comm: tc Not tainted 4.15.0-rc3-perf-00918-g129c9981a55f #4
>[   26.239977] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.3.4 11/08/2016
>[   26.248453] RIP: 0010:debug_print_object+0x64/0x80
>[   26.253896] RSP: 0018:ffffb7340410fa00 EFLAGS: 00010086
>[   26.259825] RAX: 0000000000000051 RBX: ffff8f1f6b7cc5a0 RCX: 0000000000000006
>[   26.267892] RDX: 0000000000000007 RSI: 0000000000000082 RDI: ffff8f1f6f48cdd0
>[   26.275959] RBP: ffffffffb3c48600 R08: 0000000000000000 R09: 00000000000005f2
>[   26.284042] R10: 000000000000001e R11: ffffffffb41c35ad R12: ffffffffb3a1d101
>[   26.292125] R13: ffff8f1f6b7cc5a0 R14: ffffffffb423a8b8 R15: 0000000000000001
>[   26.300194] FS:  00007f64d4956700(0000) GS:ffff8f1f6f480000(0000) knlGS:0000000000000000
>[   26.309346] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>[   26.315859] CR2: 0000000001cbc498 CR3: 000000086a8a2004 CR4: 00000000003606e0
>[   26.323925] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
>[   26.331994] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
>[   26.331994] Call Trace:
>[   26.331998]  debug_check_no_obj_freed+0x1e6/0x220
>[   26.332020]  ? qdisc_graft+0x14f/0x450
>[   26.332025]  kfree+0x14d/0x1b0
>[   26.332027]  qdisc_graft+0x14f/0x450
>[   26.332029]  tc_get_qdisc+0x12f/0x200
>[   26.332035]  rtnetlink_rcv_msg+0x122/0x310
>[   26.332039]  ? __skb_try_recv_datagram+0xef/0x150
>[   26.332040]  ? __kmalloc_node_track_caller+0x205/0x2b0
>[   26.332042]  ? rtnl_calcit.isra.12+0x100/0x100
>[   26.332044]  netlink_rcv_skb+0x8d/0x130
>[   26.332046]  netlink_unicast+0x16a/0x210
>[   26.332048]  netlink_sendmsg+0x32a/0x370
>[   26.332054]  sock_sendmsg+0x2d/0x40
>[   26.332056]  ___sys_sendmsg+0x298/0x2e0
>[   26.332061]  ? mem_cgroup_commit_charge+0x7a/0x540
>[   26.332062]  ? mem_cgroup_try_charge+0x8e/0x1d0
>[   26.332066]  ? __handle_mm_fault+0x3a1/0x1190
>[   26.332068]  ? __sys_sendmsg+0x41/0x70
>[   26.332069]  __sys_sendmsg+0x41/0x70
>[   26.332074]  entry_SYSCALL_64_fastpath+0x1e/0x81
>[   26.332076] RIP: 0033:0x7f64d3b53450
>[   26.332076] RSP: 002b:00007fffb5ea4388 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
>[   26.332077] RAX: ffffffffffffffda RBX: 00007f64d3e0fb20 RCX: 00007f64d3b53450
>[   26.332078] RDX: 0000000000000000 RSI: 00007fffb5ea43e0 RDI: 0000000000000003
>[   26.332078] RBP: 0000000000000a11 R08: 0000000000000000 R09: 000000000000000f
>[   26.332079] R10: 00000000000005e7 R11: 0000000000000246 R12: 00007f64d3e0fb78
>[   26.332079] R13: 00007f64d3e0fb78 R14: 000000000000270f R15: 00007f64d3e0fb78
>[   26.332081] Code: c1 83 c2 01 8b 4b 14 4c 8b 45 00 89 15 f6 d0 e5 00 8b 53 10 4c 89 e6 48 c7 c7 38 7c a3 b3 48 8b 14 d5 80 3d 85 b 
>[   26.332097] ---[ end trace bd33b199ae76ad43 ]---

^ permalink raw reply

* Re: [PATCH V2 net-next 01/17] net: hns3: add support to query tqps number
From: lipeng (Y) @ 2017-12-20  6:14 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel, linuxarm, salil.mehta
In-Reply-To: <20171219.141644.1574828912135603880.davem@davemloft.net>



On 2017/12/20 3:16, David Miller wrote:
> From: Lipeng <lipeng321@huawei.com>
> Date: Tue, 19 Dec 2017 12:02:23 +0800
>
>> @@ -5002,6 +5002,26 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev)
>>   	ae_dev->priv = NULL;
>>   }
>>   
>> +static u32 hclge_get_max_channels(struct hnae3_handle *handle)
>> +{
>> +	struct hclge_vport *vport = hclge_get_vport(handle);
>> +	struct hnae3_knic_private_info *kinfo = &handle->kinfo;
>> +	struct hclge_dev *hdev = vport->back;
>> +
> Please order local variables from longest to shortest line.
>
> Please audit your entire submission for this problem.
>
> .
Will check the whole patch set for this problem. Thanks.

^ permalink raw reply

* Re: [PATCH V2 net-next 02/17] net: hns3: add support to modify tqps number
From: lipeng (Y) @ 2017-12-20  6:15 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-kernel, linuxarm, salil.mehta
In-Reply-To: <20171219.141840.1031328928935244349.davem@davemloft.net>



On 2017/12/20 3:18, David Miller wrote:
> From: Lipeng <lipeng321@huawei.com>
> Date: Tue, 19 Dec 2017 12:02:24 +0800
>
>> @@ -2651,6 +2651,19 @@ static int hns3_get_ring_config(struct hns3_nic_priv *priv)
>>   	return ret;
>>   }
>>   
>> +static void hns3_put_ring_config(struct hns3_nic_priv *priv)
>> +{
>> +	struct hnae3_handle *h = priv->ae_handle;
>> +	u16 i;
>> +
>> +	for (i = 0; i < h->kinfo.num_tqps; i++) {
> Please use a plain "int" for index iteration loops like this since
> that is the canonical type to use.
Will check and fix this, thanks.
>> +static void hclge_release_tqp(struct hclge_vport *vport)
>> +{
>> +	struct hnae3_knic_private_info *kinfo = &vport->nic.kinfo;
>> +	struct hclge_dev *hdev = vport->back;
>> +	u16 i;
>> +
>> +	for (i = 0; i < kinfo->num_tqps; i++) {
> Likewise.
>
> .
>

^ permalink raw reply

* Re: RCU callback crashes
From: Jakub Kicinski @ 2017-12-20  6:22 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev@vger.kernel.org, Cong Wang
In-Reply-To: <20171220061118.GB1916@nanopsycho>

On Wed, 20 Dec 2017 07:11:18 +0100, Jiri Pirko wrote:
> Wed, Dec 20, 2017 at 02:59:21AM CET, kubakici@wp.pl wrote:
> >Hi!
> >
> >If I run the netdevsim test long enough on a kernel with no debugging   
> 
> Just running tools/testing/selftests/bpf/test_offload.py?

Yes, like this:

while ./linux/tools/testing/selftests/bpf/test_offload.py --log /tmp/log; do echo; done

It usually crashes after ~10 minutes on my machine.

> >I get this:  
> 
> Could you try to run it with kasan on?

I didn't manage to reproduce it with KASAN on so far :(  Even enabling
object debugging to get the second splat in my email (which is more
useful) actually makes the crash go away, I only see the warning...

^ permalink raw reply

* Re: [Intel-wired-lan] v4.15-rc2 on thinkpad x60: ethernet stopped working
From: Neftin, Sasha @ 2017-12-20  6:24 UTC (permalink / raw)
  To: Pavel Machek, jacob.e.keller
  Cc: bpoirier, nix.or.die, netdev, linux-kernel, intel-wired-lan,
	lsorense, David Miller
In-Reply-To: <077087f2-551a-c045-6b07-b1b661e53dad@intel.com>

On 12/18/2017 17:50, Neftin, Sasha wrote:
> On 12/18/2017 13:58, Pavel Machek wrote:
>> On Mon 2017-12-18 13:24:40, Neftin, Sasha wrote:
>>> On 12/18/2017 12:26, Pavel Machek wrote:
>>>> Hi!
>>>>
>>>>>>>> In v4.15-rc2+, network manager can not see my ethernet card, and
>>>>>>>> manual attempts to ifconfig it up did not really help, either.
>>>>>>>>
>>>>>>>> Card is:
>>>>>>>>
>>>>>>>> 02:00.0 Ethernet controller: Intel Corporation 82573L Gigabit 
>>>>>>>> Ethernet
>>>>>>>> Controller
>>>>>> ....
>>>>>>>> Any ideas ?
>>>>>>> Yes , 19110cfbb34d4af0cdfe14cd243f3b09dc95b013 broke it.
>>>>>>>
>>>>>>> See:
>>>>>>> https://bugzilla.kernel.org/show_bug.cgi?id=198047
>>>>>>>
>>>>>>> Fix there :
>>>>>>> https://marc.info/?l=linux-kernel&m=151272209903675&w=2
>>>>>> I don't see the patch in latest mainline. Not having ethernet
>>>>>> is... somehow annoying. What is going on there?
>>>>> Generally speaking, e1000 maintainence has been handled very 
>>>>> poorly over
>>>>> the past few years, I have to say.
>>>>>
>>>>> Fixes take forever to propagate even when someone other than the
>>>>> maintainer provides a working and tested fix, just like this case.
>>>>>
>>>>> Jeff, please take e1000 maintainence seriously and get these critical
>>>>> bug fixes propagated.
>>>> No response AFAICT. I guess I should test reverting
>>>> 19110cfbb34d4af0cdfe14cd243f3b09dc95b013, then ask you for revert?
>>> Hello Pavel,
>>>
>>> Before ask for reverting 19110cfbb..., please, check if follow patch of
>>> Benjamin work for you http://patchwork.ozlabs.org/patch/846825/
>> Jacob said, in another email:
>>
>> # Digging into this, the problem is complicated. The original bug
>> # assumed behavior of the .check_for_link call, which is universally not
>> # implemented.
>> #
>> # I think the correct fix is to revert 19110cfbb34d ("e1000e: Separate
>> # signaling for link check/link up", 2017-10-10) and find a more 
>> proper solution.
>>
>> ...which makes me think that revert is preffered?
>>
>>                                     Pavel
>>
> Pavel, before ask for revert - let's check Benjamin's patch following 
> to his previous patch. Previous patch was not competed and latest one 
> come to complete changes.
>
> _______________________________________________
> Intel-wired-lan mailing list
> Intel-wired-lan@osuosl.org
> https://lists.osuosl.org/mailman/listinfo/intel-wired-lan

Pavel, any update? Has Benjamin's last patch solved your network problem?

^ permalink raw reply

* Re: RCU callback crashes
From: Jakub Kicinski @ 2017-12-20  6:34 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev@vger.kernel.org, Cong Wang
In-Reply-To: <20171219222227.402e684a@cakuba.netronome.com>

On Tue, 19 Dec 2017 22:22:27 -0800, Jakub Kicinski wrote:
> > >I get this:    
> > 
> > Could you try to run it with kasan on?  
> 
> I didn't manage to reproduce it with KASAN on so far :(  Even enabling
> object debugging to get the second splat in my email (which is more
> useful) actually makes the crash go away, I only see the warning...

Ah, no object debug but KASAN on produces this:

[   39.268209] BUG: KASAN: use-after-free in cpu_needs_another_gp+0x246/0x2b0
[   39.275965] Read of size 8 at addr ffff8803aa64f138 by task swapper/13/0
[   39.283524] 
[   39.285256] CPU: 13 PID: 0 Comm: swapper/13 Not tainted 4.15.0-rc3-perf-00955-g1d0b01347dd5-dirty #8
[   39.295535] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.3.4 11/08/2016
[   39.303969] Call Trace:
[   39.306769]  <IRQ>
[   39.309088]  dump_stack+0xa6/0x118
[   39.312957]  ? _atomic_dec_and_lock+0xe8/0xe8
[   39.317895]  ? cpu_needs_another_gp+0x246/0x2b0
[   39.323030]  print_address_description+0x6a/0x270
[   39.328380]  ? cpu_needs_another_gp+0x246/0x2b0
[   39.333510]  kasan_report+0x23f/0x350
[   39.337672]  cpu_needs_another_gp+0x246/0x2b0
...
[   39.383026]  rcu_process_callbacks+0x1a0/0x620
...
[   39.426713]  __do_softirq+0x17f/0x4de
...
[   39.463841]  irq_exit+0xe1/0xf0
[   39.467437]  smp_apic_timer_interrupt+0xd9/0x290
[   39.472685]  ? smp_call_function_single_interrupt+0x230/0x230
[   39.479195]  ? smp_reschedule_interrupt+0x240/0x240
[   39.484736]  apic_timer_interrupt+0x8c/0xa0
[   39.489497]  </IRQ>
[   39.491929] RIP: 0010:cpuidle_enter_state+0x12a/0x510
[   39.497660] RSP: 0018:ffff88086bf9fd08 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff11
[   39.506228] RAX: 0000000000000000 RBX: ffffe8ffffb060e0 RCX: ffffffff921329f5
[   39.514291] RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: ffff88086f3246e8
[   39.522354] RBP: 1ffff1010d7f3fa6 R08: fffffbfff2742768 R09: fffffbfff2742768
[   39.530418] R10: ffff88086bf9fcc8 R11: fffffbfff2742767 R12: 0000000924148b4b
[   39.538480] R13: 0000000000000004 R14: 0000000000000004 R15: ffffffff9383eb80
[   39.546545]  ? sched_idle_set_state+0x25/0x30
[   39.551502]  ? cpuidle_enter_state+0x106/0x510
[   39.556556]  ? cpuidle_enter_s2idle+0x130/0x130
[   39.561706]  ? rcu_eqs_enter_common.constprop.62+0xd1/0x1e0
[   39.568037]  ? rcu_gp_init+0xf70/0xf70
[   39.572331]  ? sched_set_stop_task+0x160/0x160
[   39.577384]  do_idle+0x1af/0x200
[   39.581076]  cpu_startup_entry+0xd2/0xe0
[   39.585545]  ? cpu_in_idle+0x20/0x20
[   39.589626]  ? _raw_spin_trylock+0xe0/0xe0
[   39.594292]  ? memcpy+0x34/0x50
[   39.597890]  start_secondary+0x271/0x2b0
[   39.602361]  ? set_cpu_sibling_map+0x840/0x840
[   39.607416]  secondary_startup_64+0xa5/0xb0
[   39.612180] 
[   39.613929] Allocated by task 1358:
[   39.617914]  __kmalloc_node+0x183/0x2c0
[   39.622290]  qdisc_alloc+0xbd/0x3f0
[   39.626274]  qdisc_create+0xd8/0x720
[   39.630355]  tc_modify_qdisc+0x657/0x910
[   39.634826]  rtnetlink_rcv_msg+0x37c/0x7e0
[   39.639491]  netlink_rcv_skb+0x122/0x230
[   39.643960]  netlink_unicast+0x2ae/0x360
[   39.648443]  netlink_sendmsg+0x5d5/0x620
[   39.652915]  sock_sendmsg+0x64/0x80
[   39.656900]  ___sys_sendmsg+0x4a8/0x500
[   39.661272]  __sys_sendmsg+0xa9/0x140
[   39.665450]  entry_SYSCALL_64_fastpath+0x1e/0x81
[   39.670695] 
[   39.672441] Freed by task 1370:
[   39.676052]  kfree+0x8d/0x1c0
[   39.679454]  qdisc_graft+0x208/0x670
[   39.683535]  tc_get_qdisc+0x229/0x350
[   39.687713]  rtnetlink_rcv_msg+0x37c/0x7e0
[   39.692411]  netlink_rcv_skb+0x122/0x230
[   39.696881]  netlink_unicast+0x2ae/0x360
[   39.701350]  netlink_sendmsg+0x5d5/0x620
[   39.705819]  sock_sendmsg+0x64/0x80
[   39.709801]  ___sys_sendmsg+0x4a8/0x500
[   39.714172]  __sys_sendmsg+0xa9/0x140
[   39.718351]  entry_SYSCALL_64_fastpath+0x1e/0x81
[   39.723597] 
[   39.725347] The buggy address belongs to the object at ffff8803aa64ef80
[   39.725347]  which belongs to the cache kmalloc-512 of size 512
[   39.739453] The buggy address is located 440 bytes inside of
[   39.739453]  512-byte region [ffff8803aa64ef80, ffff8803aa64f180)
[   39.752684] The buggy address belongs to the page:
[   39.758127] page:0000000042b3124b count:1 mapcount:0 mapping:          (null) index:0x0 compound_mapcount: 0
[   39.769222] flags: 0x2ffff0000008100(slab|head)
[   39.774365] raw: 02ffff0000008100 0000000000000000 0000000000000000 0000000180190019
[   39.783129] raw: dead000000000100 dead000000000200 ffff8803afc0ed80 0000000000000000
[   39.791986] page dumped because: kasan: bad access detected
[   39.798300] 
[   39.800063] Memory state around the buggy address:
[   39.805503]  ffff8803aa64f000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   39.813684]  ffff8803aa64f080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   39.821866] >ffff8803aa64f100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   39.830045]                                         ^
[   39.835778]  ffff8803aa64f180: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[   39.843958]  ffff8803aa64f200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb

^ permalink raw reply

* Re: [RFC] hv_netvsc: automatically name slave VF network device
From: Jiri Pirko @ 2017-12-20  6:41 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Stephen Hemminger, Samudrala, Sridhar, netdev, Stephen Hemminger
In-Reply-To: <20171219155353.4aa0fe9b@cakuba.netronome.com>

Wed, Dec 20, 2017 at 12:53:53AM CET, jakub.kicinski@netronome.com wrote:
>On Tue, 19 Dec 2017 15:44:05 -0800, Stephen Hemminger wrote:
>> On Tue, 19 Dec 2017 15:20:57 -0800
>> Jakub Kicinski <jakub.kicinski@netronome.com> wrote:
>> 
>> > On Tue, 19 Dec 2017 14:50:17 -0800, Stephen Hemminger wrote:  
>> > > On Tue, 19 Dec 2017 14:44:37 -0800
>> > > "Samudrala, Sridhar" <sridhar.samudrala@intel.com> wrote:
>> > >     
>> > > >  -static void __netvsc_vf_setup(struct net_device *ndev,      
>> > > > > -			      struct net_device *vf_netdev)
>> > > > > -{
>> > > > > -	int ret;
>> > > > > +	/* set the name of VF device based on upper device name */
>> > > > > +	snprintf(vf_name, IFNAMSIZ, "%s_vf", ndev->name);
>> > > > > +	ret = dev_change_name(vf_netdev, vf_name);
>> > > > > +	if (ret != 0)
>> > > > > +		netdev_warn(vf_netdev,
>> > > > > +			    "can not rename device: (%d)\n", ret);        
>> > > > 
>> > > > It is possible that upper device name can change after this call.  I 
>> > > > noticed this
>> > > > when i tried this approach with virtio_net.
>> > > > 
>> > > > Also, what should happen if the upper device is unloaded? Should we rename
>> > > > the VF name?      
>> > > 
>> > > Yes upper device can change name. So sure, netdevice could trap that
>> > > in callback (it already has notifier) and rename VF. Will add that in V2.
>> > > 
>> > > If upper device is unloaded then it is already decoupled from the VF.
>> > > There is no good value to change it back to. The orignal name probably
>> > > has been reused by then.    
>> > 
>> > Both of those issues would be solved by just exposing phys_port_name
>> > from the VF driver, and letting systemd do its thing independent of 
>> > magic bonds.
>> > 
>> > Reluctance to do driver work aside :/  
>> 
>> The port name for Mellanox driver is already set in the driver as a numeric value.
>> It indicates which port is used.
>> This won't work.
>
>You must be looking at representor ndos.  I don't think MLX NIC drivers
>are implementing phys_port_name for normal netdevs at all today.

You are right Jakub. I plan to generate phys_port_name according to the
netdev flavours (vf,pf,qp,vfrep,pfrep,qprep,etc). That would resolve
this.

^ permalink raw reply

* Re: [PATCH v10 1/5] add infrastructure for tagging functions as error injectable
From: Masami Hiramatsu @ 2017-12-20  7:13 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Josef Bacik, rostedt, mingo, davem, netdev, linux-kernel, ast,
	kernel-team, daniel, linux-btrfs, darrick.wong, Josef Bacik
In-Reply-To: <7b73cbfa-959f-2efe-8a7c-f9d0b0c2ccaa@fb.com>

On Tue, 19 Dec 2017 18:14:17 -0800
Alexei Starovoitov <ast@fb.com> wrote:

> On 12/18/17 10:29 PM, Masami Hiramatsu wrote:
> >>
> >> +#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
> >> +#ifdef CONFIG_BPF_KPROBE_OVERRIDE
> >
> > BTW, CONFIG_BPF_KPROBE_OVERRIDE is also confusable name.
> > Since this feature override a function to just return with
> > some return value (as far as I understand, or would you
> > also plan to modify execution path inside a function?),
> > I think it should be better CONFIG_BPF_FUNCTION_OVERRIDE or
> > CONFIG_BPF_EXECUTION_OVERRIDE.
> 
> I don't think such renaming makes sense.
> The feature is overriding kprobe by changing how kprobe returns.
> It doesn't override BPF_FUNCTION or BPF_EXECUTION.

No, I meant this is a BPF feature which overrides a FUNCTION, so
BPF is a kind of namespace. (Is that only for a function entry
because it cannot tweak the stack frame at this moment?)

> The kernel enters and exists bpf program as normal.

Yeah, but that bpf program modifies instruction pointer, am I correct?

> 
> > Indeed, BPF is based on kprobes, but it seems you are limiting it
> > with ftrace (function-call trace) (I'm not sure the reason why),
> > so using "kprobes" for this feature seems strange for me.
> 
> do you have an idea how kprobe override can happen when kprobe
> placed in the middle of the function?

For example, if you know a basic block in the function, maybe
you can skip a block or something like that. But nowadays
it is somewhat hard because the optimizer mixes it up.

> 
> Please make your suggestion as patches based on top of bpf-next.

bpf-next seems to have already picked up this series. Do you mean I should
revert it and write a new patch?

Thank you,

> 
> Thanks
> 


-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply

* Re: [PATCH v1 2/4] lib/net_utils: Introduce mac_pton_from_user()
From: Greg Kroah-Hartman @ 2017-12-20  7:13 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: David S. Miller, netdev, Larry Finger, Florian Schilhabel, devel
In-Reply-To: <20171219191412.14880-2-andriy.shevchenko@linux.intel.com>

On Tue, Dec 19, 2017 at 09:14:10PM +0200, Andy Shevchenko wrote:
> Some drivers are getting MAC from user space. Make a helper for them.
> 
> Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
> ---
>  include/linux/kernel.h |  1 +
>  lib/net_utils.c        | 12 ++++++++++++
>  2 files changed, 13 insertions(+)

Don't do this just for some horrid staging drivers.  They can just drop
that functionality entirely and use the "normal" way of doing this if
they really want it.

thanks,

greg k-h

^ permalink raw reply

* [PATCH] selftests: net: Adding config fragment CONFIG_NUMA=y
From: Naresh Kamboju @ 2017-12-20  7:20 UTC (permalink / raw)
  To: netdev; +Cc: davem, guro, bamvor.zhangjian, shuahkh

Kernel config fragment CONFIG_NUMA=y is needed for reuseport_bpf_numa.

Signed-off-by: Naresh Kamboju <naresh.kamboju@linaro.org>
---
 tools/testing/selftests/net/config | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
index e57b4ac..7177bea 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -1,3 +1,4 @@
 CONFIG_USER_NS=y
 CONFIG_BPF_SYSCALL=y
 CONFIG_TEST_BPF=m
+CONFIG_NUMA=y
-- 
2.7.4

^ permalink raw reply related

* [PATCH iproute2 0/3] Forbid "type" for peer, update ifname and make it array in ll_cache
From: Serhey Popovych @ 2017-12-20  7:37 UTC (permalink / raw)
  To: netdev

In this series I present the following improvements and fixes:

  1) Forbid "type" parameter when parsing command line
     for peer in iplink_vxcan.c and link_veth.c using
     iplink_parse(): we already know it.

  2) In ll_remember_index() update ifname, not only rehash
     it. It might have changed for the same ifindex since the last
     run (e.g. "eth0" in the cache, "ppp0" during the dump).

  3) Make ifname fixed size array of chars in @struct ll_cache:
     names never exceed IFNAMSIZ (16 bytes). Replace
     strcmp()/strcpy() with memcmp()/memcpy() to possibly
     benefit from compiler call inlining.

See individual patch description message for details.

Thanks,
Serhii

Serhey Popovych (3):
  vxcan,veth: Forbid "type" for peer device
  utils: ll_map: Update name and type for existing entry
  utils: ll_map: Make network device name fixed size array of char

 ip/iplink_vxcan.c |    3 +++
 ip/link_veth.c    |    3 +++
 lib/ll_map.c      |   47 +++++++++++++++++++++++++++--------------------
 3 files changed, 33 insertions(+), 20 deletions(-)

-- 
1.7.10.4

^ permalink raw reply

* [PATCH iproute2 1/3] vxcan,veth: Forbid "type" for peer device
From: Serhey Popovych @ 2017-12-20  7:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1513755451-9800-1-git-send-email-serhe.popovych@gmail.com>

It is already given for the original device we configure this
peer for.

Results from following command before/after change applied
are shown below:

  $ ip link add dev veth1a type veth peer name veth1b \
                           type veth peer name veth1c

Before:
-------

<no output, no netdevs created>

After:
------

Error: argument "type" is wrong: not supported for peer

Signed-off-by: Serhey Popovych <serhe.popovych@gmail.com>
---
 ip/iplink_vxcan.c |    3 +++
 ip/link_veth.c    |    3 +++
 2 files changed, 6 insertions(+)

diff --git a/ip/iplink_vxcan.c b/ip/iplink_vxcan.c
index c13224c..13f2577 100644
--- a/ip/iplink_vxcan.c
+++ b/ip/iplink_vxcan.c
@@ -65,6 +65,9 @@ static int vxcan_parse_opt(struct link_util *lu, int argc, char **argv,
 	if (err < 0)
 		return err;
 
+	if (type)
+		invarg("not supported for peer", "type");
+
 	if (name) {
 		addattr_l(hdr, 1024,
 			  IFLA_IFNAME, name, strlen(name) + 1);
diff --git a/ip/link_veth.c b/ip/link_veth.c
index fcfd1ef..cc43198 100644
--- a/ip/link_veth.c
+++ b/ip/link_veth.c
@@ -63,6 +63,9 @@ static int veth_parse_opt(struct link_util *lu, int argc, char **argv,
 	if (err < 0)
 		return err;
 
+	if (type)
+		invarg("not supported for peer", "type");
+
 	if (name) {
 		addattr_l(hdr, 1024,
 			  IFLA_IFNAME, name, strlen(name) + 1);
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH iproute2 2/3] utils: ll_map: Update name and type for existing entry
From: Serhey Popovych @ 2017-12-20  7:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1513755451-9800-1-git-send-email-serhe.popovych@gmail.com>

When we update an existing entry we need not only to rehash it
but also to update the name in the existing entry.

We need to update the device type too, since the cached interface
might have been deleted and a new one with the same index but a
different type added (e.g. eth0 and ppp0).

Reuse the new-entry initialization path to avoid duplication.

Signed-off-by: Serhey Popovych <serhe.popovych@gmail.com>
---
 lib/ll_map.c |   33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/lib/ll_map.c b/lib/ll_map.c
index f65614f..abe7bdc 100644
--- a/lib/ll_map.c
+++ b/lib/ll_map.c
@@ -10,6 +10,7 @@
  *
  */
 
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -85,6 +86,7 @@ int ll_remember_index(const struct sockaddr_nl *who,
 	struct ifinfomsg *ifi = NLMSG_DATA(n);
 	struct ll_cache *im;
 	struct rtattr *tb[IFLA_MAX+1];
+	bool rehash;
 
 	if (n->nlmsg_type != RTM_NEWLINK && n->nlmsg_type != RTM_DELLINK)
 		return 0;
@@ -109,29 +111,30 @@ int ll_remember_index(const struct sockaddr_nl *who,
 
 	if (im) {
 		/* change to existing entry */
-		if (strcmp(im->name, ifname) != 0) {
+		rehash = strcmp(im->name, ifname);
+		if (rehash)
 			hlist_del(&im->name_hash);
-			h = namehash(ifname) & (IDXMAP_SIZE - 1);
-			hlist_add_head(&im->name_hash, &name_head[h]);
-		}
+	} else {
+		im = malloc(sizeof(*im) + strlen(ifname) + 1);
+		if (!im)
+			return 0;
+		im->index = ifi->ifi_index;
 
-		im->flags = ifi->ifi_flags;
-		return 0;
+		h = ifi->ifi_index & (IDXMAP_SIZE - 1);
+		hlist_add_head(&im->idx_hash, &idx_head[h]);
+
+		rehash = true;
 	}
 
-	im = malloc(sizeof(*im) + strlen(ifname) + 1);
-	if (im == NULL)
-		return 0;
-	im->index = ifi->ifi_index;
-	strcpy(im->name, ifname);
 	im->type = ifi->ifi_type;
 	im->flags = ifi->ifi_flags;
 
-	h = ifi->ifi_index & (IDXMAP_SIZE - 1);
-	hlist_add_head(&im->idx_hash, &idx_head[h]);
+	if (rehash) {
+		h = namehash(ifname) & (IDXMAP_SIZE - 1);
+		hlist_add_head(&im->name_hash, &name_head[h]);
 
-	h = namehash(ifname) & (IDXMAP_SIZE - 1);
-	hlist_add_head(&im->name_hash, &name_head[h]);
+		strcpy(im->name, ifname);
+	}
 
 	return 0;
 }
-- 
1.7.10.4

^ permalink raw reply related

* [PATCH iproute2 3/3] utils: ll_map: Make network device name fixed size array of char
From: Serhey Popovych @ 2017-12-20  7:37 UTC (permalink / raw)
  To: netdev
In-Reply-To: <1513755451-9800-1-git-send-email-serhe.popovych@gmail.com>

Network device names are fixed in size and never exceed
IFNAMSIZ (16 bytes).

Make name a fixed-size array so that we always malloc() a chunk of
the same size, and use memcpy()/memcmp() with constant IFNAMSIZ
to benefit from possible compiler optimizations that replace
the function call with two/four load/store instructions
on 64/32 bit systems.

Check that the IFLA_IFNAME attribute is present in the netlink message
(it always should be) and use strncpy() to pad the name with zeros.

Signed-off-by: Serhey Popovych <serhe.popovych@gmail.com>
---
 lib/ll_map.c |   20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/lib/ll_map.c b/lib/ll_map.c
index abe7bdc..fcbf0fb 100644
--- a/lib/ll_map.c
+++ b/lib/ll_map.c
@@ -30,7 +30,7 @@ struct ll_cache {
 	unsigned	flags;
 	unsigned 	index;
 	unsigned short	type;
-	char		name[];
+	char		name[IFNAMSIZ];
 };
 
 #define IDXMAP_SIZE	1024
@@ -71,7 +71,7 @@ static struct ll_cache *ll_get_by_name(const char *name)
 		struct ll_cache *im
 			= container_of(n, struct ll_cache, name_hash);
 
-		if (strncmp(im->name, name, IFNAMSIZ) == 0)
+		if (!strcmp(im->name, name))
 			return im;
 	}
 
@@ -82,7 +82,7 @@ int ll_remember_index(const struct sockaddr_nl *who,
 		      struct nlmsghdr *n, void *arg)
 {
 	unsigned int h;
-	const char *ifname;
+	char ifname[IFNAMSIZ];
 	struct ifinfomsg *ifi = NLMSG_DATA(n);
 	struct ll_cache *im;
 	struct rtattr *tb[IFLA_MAX+1];
@@ -105,17 +105,21 @@ int ll_remember_index(const struct sockaddr_nl *who,
 	}
 
 	parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), IFLA_PAYLOAD(n));
-	ifname = rta_getattr_str(tb[IFLA_IFNAME]);
-	if (ifname == NULL)
+
+	if (!tb[IFLA_IFNAME])
+		return 0;
+	strncpy(ifname, rta_getattr_str(tb[IFLA_IFNAME]), IFNAMSIZ);
+	if (!ifname[0])
 		return 0;
+	ifname[IFNAMSIZ - 1] = '\0';
 
 	if (im) {
 		/* change to existing entry */
-		rehash = strcmp(im->name, ifname);
+		rehash = memcmp(im->name, ifname, IFNAMSIZ);
 		if (rehash)
 			hlist_del(&im->name_hash);
 	} else {
-		im = malloc(sizeof(*im) + strlen(ifname) + 1);
+		im = malloc(sizeof(*im));
 		if (!im)
 			return 0;
 		im->index = ifi->ifi_index;
@@ -133,7 +137,7 @@ int ll_remember_index(const struct sockaddr_nl *who,
 		h = namehash(ifname) & (IDXMAP_SIZE - 1);
 		hlist_add_head(&im->name_hash, &name_head[h]);
 
-		strcpy(im->name, ifname);
+		memcpy(im->name, ifname, IFNAMSIZ);
 	}
 
 	return 0;
-- 
1.7.10.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox