Netdev List
 help / color / mirror / Atom feed
* [PATCH bpf-next 03/10] [bpf]: add bpf_xdp_adjust_tail sample prog
From: Nikita V. Shirokov @ 2018-04-17  6:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann; +Cc: netdev, Nikita V. Shirokov
In-Reply-To: <20180417065131.3632-1-tehnerd@tehnerd.com>

adding bpf's sample program which is using bpf_xdp_adjust_tail helper
by generating ICMPv4 "packet to big" message if ingress packet's size is
bigger then 600 bytes

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
---
 samples/bpf/Makefile                      |   4 +
 samples/bpf/xdp_adjust_tail_kern.c        | 151 ++++++++++++++++++++++++++++++
 samples/bpf/xdp_adjust_tail_user.c        | 141 ++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/bpf_helpers.h |   2 +
 4 files changed, 298 insertions(+)
 create mode 100644 samples/bpf/xdp_adjust_tail_kern.c
 create mode 100644 samples/bpf/xdp_adjust_tail_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4d6a6edd4bf6..aa8c392e2e52 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -44,6 +44,7 @@ hostprogs-y += xdp_monitor
 hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
 hostprogs-y += cpustat
+hostprogs-y += xdp_adjust_tail
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -95,6 +96,7 @@ xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
 xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
+xdp_adjust_tail-objs := bpf_load.o $(LIBBPF) xdp_adjust_tail_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -148,6 +150,7 @@ always += xdp_rxq_info_kern.o
 always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
 always += cpustat_kern.o
+always += xdp_adjust_tail_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -193,6 +196,7 @@ HOSTLOADLIBES_xdp_monitor += -lelf
 HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
 HOSTLOADLIBES_cpustat += -lelf
+HOSTLOADLIBES_xdp_adjust_tail += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/xdp_adjust_tail_kern.c b/samples/bpf/xdp_adjust_tail_kern.c
new file mode 100644
index 000000000000..17570559fd08
--- /dev/null
+++ b/samples/bpf/xdp_adjust_tail_kern.c
@@ -0,0 +1,151 @@
+/* Copyright (c) 2018 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program shows how to use bpf_xdp_adjust_tail() by
+ * generating ICMPv4 "packet to big" (unreachable/ df bit set frag needed
+ * to be more preice in case of v4)" where receiving packets bigger then
+ * 600 bytes.
+ */
+#define KBUILD_MODNAME "foo"
+#include <uapi/linux/bpf.h>
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include "bpf_helpers.h"
+
+#define DEFAULT_TTL 64
+#define MAX_PCKT_SIZE 600
+#define ICMP_TOOBIG_SIZE 98
+#define ICMP_TOOBIG_PAYLOAD_SIZE 92
+
+struct bpf_map_def SEC("maps") icmpcnt = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u64),
+	.max_entries = 1,
+};
+
+static __always_inline void count_icmp(void)
+{
+	u64 key = 0;
+	u64 *icmp_count;
+
+	icmp_count = bpf_map_lookup_elem(&icmpcnt, &key);
+	if (icmp_count)
+		*icmp_count += 1;
+}
+
+static __always_inline void swap_mac(void *data, struct ethhdr *orig_eth)
+{
+	struct ethhdr *eth;
+
+	eth = data;
+	memcpy(eth->h_source, orig_eth->h_dest, ETH_ALEN);
+	memcpy(eth->h_dest, orig_eth->h_source, ETH_ALEN);
+	eth->h_proto = orig_eth->h_proto;
+}
+
+static __always_inline __u16 csum_fold_helper(__u32 csum)
+{
+	return ~((csum & 0xffff) + (csum >> 16));
+}
+
+static __always_inline void ipv4_csum(void *data_start, int data_size,
+				      __u32 *csum)
+{
+	*csum = bpf_csum_diff(0, 0, data_start, data_size, *csum);
+	*csum = csum_fold_helper(*csum);
+}
+
+static __always_inline int send_icmp4_too_big(struct xdp_md *xdp)
+{
+	int headroom = (int)sizeof(struct iphdr) + (int)sizeof(struct icmphdr);
+
+	if (bpf_xdp_adjust_head(xdp, 0 - headroom))
+		return XDP_DROP;
+	void *data = (void *)(long)xdp->data;
+	void *data_end = (void *)(long)xdp->data_end;
+
+	if (data + (ICMP_TOOBIG_SIZE + headroom) > data_end)
+		return XDP_DROP;
+
+	struct iphdr *iph, *orig_iph;
+	struct icmphdr *icmp_hdr;
+	struct ethhdr *orig_eth;
+	__u32 csum = 0;
+	__u64 off = 0;
+
+	orig_eth = data + headroom;
+	swap_mac(data, orig_eth);
+	off += sizeof(struct ethhdr);
+	iph = data + off;
+	off += sizeof(struct iphdr);
+	icmp_hdr = data + off;
+	off += sizeof(struct icmphdr);
+	orig_iph = data + off;
+	icmp_hdr->type = ICMP_DEST_UNREACH;
+	icmp_hdr->code = ICMP_FRAG_NEEDED;
+	icmp_hdr->un.frag.mtu = htons(MAX_PCKT_SIZE-sizeof(struct ethhdr));
+	icmp_hdr->checksum = 0;
+	ipv4_csum(icmp_hdr, ICMP_TOOBIG_PAYLOAD_SIZE, &csum);
+	icmp_hdr->checksum = csum;
+	iph->ttl = DEFAULT_TTL;
+	iph->daddr = orig_iph->saddr;
+	iph->saddr = orig_iph->daddr;
+	iph->version = 4;
+	iph->ihl = 5;
+	iph->protocol = IPPROTO_ICMP;
+	iph->tos = 0;
+	iph->tot_len = htons(
+		ICMP_TOOBIG_SIZE + headroom - sizeof(struct ethhdr));
+	iph->check = 0;
+	csum = 0;
+	ipv4_csum(iph, sizeof(struct iphdr), &csum);
+	iph->check = csum;
+	count_icmp();
+	return XDP_TX;
+}
+
+
+static __always_inline int handle_ipv4(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	int pckt_size = data_end - data;
+	int offset;
+
+	if (pckt_size > MAX_PCKT_SIZE) {
+		offset = pckt_size - ICMP_TOOBIG_SIZE;
+		if (bpf_xdp_adjust_tail(xdp, 0 - offset))
+			return XDP_PASS;
+		return send_icmp4_too_big(xdp);
+	}
+	return XDP_PASS;
+}
+
+SEC("xdp_icmp")
+int _xdp_icmp(struct xdp_md *xdp)
+{
+	void *data_end = (void *)(long)xdp->data_end;
+	void *data = (void *)(long)xdp->data;
+	struct ethhdr *eth = data;
+	__u16 h_proto;
+
+	if (eth + 1 > data_end)
+		return XDP_DROP;
+
+	h_proto = eth->h_proto;
+
+	if (h_proto == htons(ETH_P_IP))
+		return handle_ipv4(xdp);
+	else
+		return XDP_PASS;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/xdp_adjust_tail_user.c b/samples/bpf/xdp_adjust_tail_user.c
new file mode 100644
index 000000000000..8a2b8879ac66
--- /dev/null
+++ b/samples/bpf/xdp_adjust_tail_user.c
@@ -0,0 +1,141 @@
+/* Copyright (c) 2018 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/bpf.h>
+#include <linux/if_link.h>
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <arpa/inet.h>
+#include <netinet/ether.h>
+#include <unistd.h>
+#include <time.h>
+#include "bpf_load.h"
+#include "libbpf.h"
+#include "bpf_util.h"
+
+#define STATS_INTERVAL_S 2U
+
+static int ifindex = -1;
+static __u32 xdp_flags;
+
+static void int_exit(int sig)
+{
+	if (ifindex > -1)
+		bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+	exit(0);
+}
+
+/* simple "icmp packet too big sent" counter
+ */
+static void poll_stats(unsigned int kill_after_s)
+{
+	time_t started_at = time(NULL);
+	__u64 value = 0;
+	int key = 0;
+
+
+	while (!kill_after_s || time(NULL) - started_at <= kill_after_s) {
+		sleep(STATS_INTERVAL_S);
+
+		assert(bpf_map_lookup_elem(map_fd[0], &key, &value) == 0);
+
+		printf("icmp \"packet too big\" sent: %10llu pkts\n", value);
+	}
+}
+
+static void usage(const char *cmd)
+{
+	printf("Start a XDP prog which send ICMP \"packet too big\" \n"
+		"messages if ingress packet is bigger then MAX_SIZE bytes\n");
+	printf("Usage: %s [...]\n", cmd);
+	printf("    -i <ifindex> Interface Index\n");
+	printf("    -T <stop-after-X-seconds> Default: 0 (forever)\n");
+	printf("    -S use skb-mode\n");
+	printf("    -N enforce native mode\n");
+	printf("    -h Display this help\n");
+}
+
+int main(int argc, char **argv)
+{
+	unsigned char opt_flags[256] = {};
+	unsigned int kill_after_s = 0;
+	const char *optstr = "i:T:SNh";
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	char filename[256];
+	int opt;
+	int i;
+
+
+	for (i = 0; i < strlen(optstr); i++)
+		if (optstr[i] != 'h' && 'a' <= optstr[i] && optstr[i] <= 'z')
+			opt_flags[(unsigned char)optstr[i]] = 1;
+
+	while ((opt = getopt(argc, argv, optstr)) != -1) {
+
+		switch (opt) {
+		case 'i':
+			ifindex = atoi(optarg);
+			break;
+		case 'T':
+			kill_after_s = atoi(optarg);
+			break;
+		case 'S':
+			xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'N':
+			xdp_flags |= XDP_FLAGS_DRV_MODE;
+			break;
+		default:
+			usage(argv[0]);
+			return 1;
+		}
+		opt_flags[opt] = 0;
+	}
+
+	for (i = 0; i < strlen(optstr); i++) {
+		if (opt_flags[(unsigned int)optstr[i]]) {
+			fprintf(stderr, "Missing argument -%c\n", optstr[i]);
+			usage(argv[0]);
+			return 1;
+		}
+	}
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)");
+		return 1;
+	}
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	if (!prog_fd[0]) {
+		printf("load_bpf_file: %s\n", strerror(errno));
+		return 1;
+	}
+
+	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
+
+	if (bpf_set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
+		printf("link set xdp fd failed\n");
+		return 1;
+	}
+
+	poll_stats(kill_after_s);
+
+	bpf_set_link_xdp_fd(ifindex, -1, xdp_flags);
+
+	return 0;
+}
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 50c607014b22..9271576bdc8f 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -132,6 +132,8 @@ static int (*bpf_l3_csum_replace)(void *ctx, int off, int from, int to, int flag
 	(void *) BPF_FUNC_l3_csum_replace;
 static int (*bpf_l4_csum_replace)(void *ctx, int off, int from, int to, int flags) =
 	(void *) BPF_FUNC_l4_csum_replace;
+static int (*bpf_csum_diff)(void *from, int from_size, void *to, int to_size, int seed) =
+	(void *) BPF_FUNC_csum_diff;
 static int (*bpf_skb_under_cgroup)(void *ctx, void *map, int index) =
 	(void *) BPF_FUNC_skb_under_cgroup;
 static int (*bpf_skb_change_head)(void *, int len, int flags) =
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next 04/10] [bpf]: make generic xdp compatible w/ bpf_xdp_adjust_tail
From: Nikita V. Shirokov @ 2018-04-17  6:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, David S. Miller 
  Cc: netdev, Nikita V. Shirokov
In-Reply-To: <20180417065131.3632-1-tehnerd@tehnerd.com>

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for generic XDP we need to reflect this packet's length change by
adjusting skb's tail pointer

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
---
 net/core/dev.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 969462ebb296..11c789231a03 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3996,9 +3996,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 				     struct bpf_prog *xdp_prog)
 {
 	struct netdev_rx_queue *rxqueue;
+	void *orig_data, *orig_data_end;
 	u32 metalen, act = XDP_DROP;
 	struct xdp_buff xdp;
-	void *orig_data;
 	int hlen, off;
 	u32 mac_len;
 
@@ -4037,6 +4037,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 	xdp.data_meta = xdp.data;
 	xdp.data_end = xdp.data + hlen;
 	xdp.data_hard_start = skb->data - skb_headroom(skb);
+	orig_data_end = xdp.data_end;
 	orig_data = xdp.data;
 
 	rxqueue = netif_get_rxqueue(skb);
@@ -4051,6 +4052,13 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
 		__skb_push(skb, -off);
 	skb->mac_header += off;
 
+	/* check if bpf_xdp_adjust_tail was used. it can only "shrink"
+	 * pckt.
+	 */
+	off = orig_data_end - xdp.data_end;
+	if (off != 0)
+		skb_set_tail_pointer(skb, xdp.data_end - xdp.data);
+
 	switch (act) {
 	case XDP_REDIRECT:
 	case XDP_TX:
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next 05/10] [bpf]: make mlx4 compatible w/ bpf_xdp_adjust_tail
From: Nikita V. Shirokov @ 2018-04-17  6:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Tariq Toukan
  Cc: netdev, Nikita V. Shirokov
In-Reply-To: <20180417065131.3632-1-tehnerd@tehnerd.com>

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for mlx4 driver we will just calculate packet's length unconditionally
(the same way as it's already being done in mlx5)

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 5c613c6663da..efc55feddc5c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -775,8 +775,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 
 			act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
+			length = xdp.data_end - xdp.data;
 			if (xdp.data != orig_data) {
-				length = xdp.data_end - xdp.data;
 				frags[0].page_offset = xdp.data -
 					xdp.data_hard_start;
 				va = xdp.data;
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next 06/10] [bpf]: make bnxt compatible w/ bpf_xdp_adjust_tail
From: Nikita V. Shirokov @ 2018-04-17  6:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Michael Chan
  Cc: netdev, Nikita V. Shirokov
In-Reply-To: <20180417065131.3632-1-tehnerd@tehnerd.com>

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for bnxt driver we will just calculate packet's length unconditionally

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
---
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
index 1389ab5e05df..1f0e872d0667 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c
@@ -113,10 +113,10 @@ bool bnxt_rx_xdp(struct bnxt *bp, struct bnxt_rx_ring_info *rxr, u16 cons,
 	if (tx_avail != bp->tx_ring_size)
 		*event &= ~BNXT_RX_EVENT;
 
+	*len = xdp.data_end - xdp.data;
 	if (orig_data != xdp.data) {
 		offset = xdp.data - xdp.data_hard_start;
 		*data_ptr = xdp.data_hard_start + offset;
-		*len = xdp.data_end - xdp.data;
 	}
 	switch (act) {
 	case XDP_PASS:
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next 07/10] [bpf]: make cavium thunder compatible w/ bpf_xdp_adjust_tail
From: Nikita V. Shirokov @ 2018-04-17  6:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Robert Richter,
	Sunil Goutham
  Cc: netdev, Nikita V. Shirokov
In-Reply-To: <20180417065131.3632-1-tehnerd@tehnerd.com>

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for cavium's thunder driver we will just calculate packet's length
unconditionally

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
---
 drivers/net/ethernet/cavium/thunder/nicvf_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 707db3304396..7135db45927e 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -538,9 +538,9 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog,
 	action = bpf_prog_run_xdp(prog, &xdp);
 	rcu_read_unlock();
 
+	len = xdp.data_end - xdp.data;
 	/* Check if XDP program has changed headers */
 	if (orig_data != xdp.data) {
-		len = xdp.data_end - xdp.data;
 		offset = orig_data - xdp.data;
 		dma_addr -= offset;
 	}
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next 08/10] [bpf]: make netronome nfp compatible w/ bpf_xdp_adjust_tail
From: Nikita V. Shirokov @ 2018-04-17  6:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Jakub Kicinski
  Cc: netdev, Nikita V. Shirokov
In-Reply-To: <20180417065131.3632-1-tehnerd@tehnerd.com>

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for nfp driver we will just calculate packet's length unconditionally

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
index 1eb6549f2a54..d9111c077699 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c
@@ -1722,7 +1722,7 @@ static int nfp_net_rx(struct nfp_net_rx_ring *rx_ring, int budget)
 
 			act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
-			pkt_len -= xdp.data - orig_data;
+			pkt_len = xdp.data_end - xdp.data;
 			pkt_off += xdp.data - orig_data;
 
 			switch (act) {
-- 
2.15.1

^ permalink raw reply related

* Re: tcp hang when socket fills up ?
From: Michal Kubecek @ 2018-04-17  6:52 UTC (permalink / raw)
  To: netdev; +Cc: Florian Westphal, Marcelo Ricardo Leitner, Eric Dumazet
In-Reply-To: <19b55265-c010-3981-d503-ab8a5a89ed5e@gmail.com>

On Mon, Apr 16, 2018 at 10:28:11PM -0700, Eric Dumazet wrote:
> > I turned pr_debug on in tcp_in_window() for another try and it's a bit
> > mangled because the information on multiple lines and the function is
> > called in parallel but it looks like I do have some seq > maxend +1
> > 
> > Although it's weird, the maxend was set WAY earlier apparently?
> > Apr 17 11:13:14 res=1 sender end=1913287798 maxend=1913316998 maxwin=29312 receiver end=505004284 maxend=505033596 maxwin=29200
> > then window decreased drastically e.g. previous ack just before refusal:
> > Apr 17 11:13:53 seq=1913292311 ack=505007789+(0) sack=505007789+(0) win=284 end=1913292311
> > Apr 17 11:13:53 sender end=1913292311 maxend=1913331607 maxwin=284 scale=0 receiver end=505020155 maxend=505033596 maxwin=39296 scale=7
> 
> scale=0 is suspect.
> 
> Really if conntrack does not see SYN SYNACK packets, it should not
> make any window check, since windows can be scaled up to 14 :/

Or maybe set the scaling to

  - TCP_MAX_WSCALE (14) by default
  - 0 when SYN or SYNACK without window scale option is seen
  - value of window scale option when SYN or SYNACK with it is seen

Michal Kubecek

^ permalink raw reply

* [PATCH bpf-next 09/10] [bpf]: make tun compatible w/ bpf_xdp_adjust_tail
From: Nikita V. Shirokov @ 2018-04-17  6:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, Jason Wang, mst
  Cc: netdev, Nikita V. Shirokov
In-Reply-To: <20180417065131.3632-1-tehnerd@tehnerd.com>

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for tun driver we need to adjust XDP_PASS handling by recalculating
length of the packet if it was passed to the TCP/IP stack
(in case if after xdp's prog run data_end pointer was adjusted)

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
---
 drivers/net/tun.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 28583aa0c17d..0b488a958076 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1688,6 +1688,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 			return NULL;
 		case XDP_PASS:
 			delta = orig_data - xdp.data;
+			len = xdp.data_end - xdp.data;
 			break;
 		default:
 			bpf_warn_invalid_xdp_action(act);
@@ -1708,7 +1709,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 	}
 
 	skb_reserve(skb, pad - delta);
-	skb_put(skb, len + delta);
+	skb_put(skb, len);
 	get_page(alloc_frag->page);
 	alloc_frag->offset += buflen;
 
-- 
2.15.1

^ permalink raw reply related

* [PATCH bpf-next 10/10] [bpf]: make virtio compatible w/ bpf_xdp_adjust_tail
From: Nikita V. Shirokov @ 2018-04-17  6:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann,
	Michael S. Tsirkin mst @ redhat . com , Jason Wang 
  Cc: netdev, Nikita V. Shirokov
In-Reply-To: <20180417065131.3632-1-tehnerd@tehnerd.com>

w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
well (only "decrease" of pointer's location is going to be supported).
changing of this pointer will change packet's size.
for virtio driver we need to adjust XDP_PASS handling by recalculating
length of the packet if it was passed to the TCP/IP stack

Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>
---
 drivers/net/virtio_net.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7b187ec7411e..115d85f7360a 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -604,6 +604,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 		case XDP_PASS:
 			/* Recalculate length in case bpf program changed it */
 			delta = orig_data - xdp.data;
+			len = xdp.data_end - xdp.data;
 			break;
 		case XDP_TX:
 			sent = __virtnet_xdp_xmit(vi, &xdp);
@@ -637,7 +638,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 		goto err;
 	}
 	skb_reserve(skb, headroom - delta);
-	skb_put(skb, len + delta);
+	skb_put(skb, len);
 	if (!delta) {
 		buf += header_offset;
 		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
@@ -752,6 +753,10 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			offset = xdp.data -
 					page_address(xdp_page) - vi->hdr_len;
 
+			/* recalculate len if xdp.data or xdp.data_end were
+			 * adjusted
+			 */
+			len = xdp.data_end - xdp.data;
 			/* We can only create skb based on xdp_page. */
 			if (unlikely(xdp_page != page)) {
 				rcu_read_unlock();
-- 
2.15.1

^ permalink raw reply related

* Re: [PATCH 1/1] net/mlx4_core: avoid resetting HCA when accessing an offline device
From: Tariq Toukan @ 2018-04-17  7:05 UTC (permalink / raw)
  To: David Miller, yanjun.zhu; +Cc: tariqt, netdev, linux-rdma, haakon.bugge
In-Reply-To: <20180416.125110.1875435797136179428.davem@davemloft.net>



On 16/04/2018 7:51 PM, David Miller wrote:
> From: Zhu Yanjun <yanjun.zhu@oracle.com>
> Date: Sun, 15 Apr 2018 21:02:07 -0400
> 
>> While a faulty cable is used or HCA firmware error, HCA device will
>> be offline. When the driver is accessing this offline device, the
>> following call trace will pop out.
>   ...
>> In the above call trace, the function mlx4_cmd_poll calls the function
>> mlx4_cmd_post to access the HCA while HCA is offline. Then mlx4_cmd_post
>> returns an error -EIO. Per -EIO, the function mlx4_cmd_poll calls
>> mlx4_cmd_reset_flow to reset HCA. And the above call trace pops out.
>>
>> This is not reasonable. Since HCA device is offline when it is being
>> accessed, it should not be reset again.
>>
>> In this patch, since HCA is offline, the function mlx4_cmd_post returns
>> an error -EINVAL. Per -EINVAL, the function mlx4_cmd_poll directly returns
>> instead of resetting HCA.
>>
>> CC: Srinivas Eeda <srinivas.eeda@oracle.com>
>> CC: Junxiao Bi <junxiao.bi@oracle.com>
>> Suggested-by: Håkon Bugge <haakon.bugge@oracle.com>
>> Signed-off-by: Zhu Yanjun <yanjun.zhu@oracle.com>
> 
> Tariq, I'm assuming you'll take this in and send it to me later.
> 
> Thanks.

Yes, I will review and send if all is OK.

Thanks,
Tariq

> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH][next] iwlwifi: mvm: remove division by size of sizeof(struct ieee80211_wmm_rule)
From: Luca Coelho @ 2018-04-17  7:06 UTC (permalink / raw)
  To: Colin King, Johannes Berg, Emmanuel Grumbach,
	Intel Linux Wireless, Kalle Valo,
	linux-wireless-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA
  Cc: kernel-janitors-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20180411130533.11827-1-colin.king-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org>

On Wed, 2018-04-11 at 14:05 +0100, Colin King wrote:
> From: Colin Ian King <colin.king-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org>
> 
> The subtraction of two struct ieee80211_wmm_rule pointers leaves a
> result
> that is automatically scaled down by the size of the size of pointed-
> to
> type, hence the division by sizeof(struct ieee80211_wmm_rule) is
> bogus and should be removed.
> 
> Detected by CoverityScan, CID#1467777 ("Extra sizeof expression")
> 
> Fixes: 77e30e10ee28 ("iwlwifi: mvm: query regdb for wmm rule if
> needed")
> Signed-off-by: Colin Ian King <colin.king-Z7WLFzj8eWMS+FvcfC7Uqw@public.gmane.org>
> ---

Thanks, Colin! I've pushed this to our internal tree for review and if
everything goes fine it will land upstream following our normal
upstreaming process.

--
Cheers,
Luca.

^ permalink raw reply

* Re: XDP performance regression due to CONFIG_RETPOLINE Spectre V2
From: Jesper Dangaard Brouer @ 2018-04-17  7:07 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: xdp-newbies@vger.kernel.org, netdev@vger.kernel.org,
	Christoph Hellwig, David Woodhouse, William Tu,
	Björn Töpel, Karlsson, Magnus, Alexander Duyck,
	Arnaldo Carvalho de Melo, brouer
In-Reply-To: <20180417061550.GA21067@infradead.org>

On Mon, 16 Apr 2018 23:15:50 -0700
Christoph Hellwig <hch@infradead.org> wrote:

> On Mon, Apr 16, 2018 at 11:07:04PM +0200, Jesper Dangaard Brouer wrote:
> > On X86 swiotlb fallback (via get_dma_ops -> get_arch_dma_ops) to use
> > x86_swiotlb_dma_ops, instead of swiotlb_dma_ops.  I also included that
> > in below fix patch.  
> 
> x86_swiotlb_dma_ops should not exist any mor, and x86 now uses
> dma_direct_ops.  Looks like you are applying it to an old kernel :)
> 
> > Performance improved to 8.9 Mpps from approx 6.5Mpps.
> > 
> > (This was without my bulking for net_device->ndo_xdp_xmit, so that
> > number should improve more).  
> 
> What is the number for the otherwise comparable setup without repolines?

Approx 12 Mpps.

You forgot to handle the dma_direct_mapping_error() case, which still
used the retpoline in the above 8.9 Mpps measurement, I fixed it up and
performance increased to 9.6 Mpps.

Notice, in this test there are still two retpoline/indirect-calls
left.  The net_device->ndo_xdp_xmit and the invocation of the XDP BPF
prog.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* Re: XDP performance regression due to CONFIG_RETPOLINE Spectre V2
From: Christoph Hellwig @ 2018-04-17  7:13 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Christoph Hellwig, xdp-newbies@vger.kernel.org,
	netdev@vger.kernel.org, Christoph Hellwig, David Woodhouse,
	William Tu, Björn Töpel, Karlsson, Magnus,
	Alexander Duyck, Arnaldo Carvalho de Melo
In-Reply-To: <20180417090701.6b25f456@redhat.com>

On Tue, Apr 17, 2018 at 09:07:01AM +0200, Jesper Dangaard Brouer wrote:
> > > number should improve more).  
> > 
> > What is the number for the otherwise comparable setup without repolines?
> 
> Approx 12 Mpps.
> 
> You forgot to handle the dma_direct_mapping_error() case, which still
> used the retpoline in the above 8.9 Mpps measurement, I fixed it up and
> performance increased to 9.6 Mpps.
> 
> Notice, in this test there are still two retpoline/indirect-calls
> left.  The net_device->ndo_xdp_xmit and the invocation of the XDP BPF
> prog.

But that seems like a pretty clear indicator that we want the fast path
direct mapping.  I'll try to find some time over the next weeks to
do a cleaner version of it.

^ permalink raw reply

* [PATCH] net: change the comment of dev_mc_init
From: sunlianwen @ 2018-04-17  7:27 UTC (permalink / raw)
  To: netdev

the comment of dev_mc_init() is wrong. which use dev_mc_flush
instead of dev_mc_init.


Signed-off-by:Lianwen Sun <sunlw.fnst@cn.fujitsu.com>
---
  net/core/dev_addr_lists.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index e3e6a3e2ca22..d884d8f5f0e5 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -839,7 +839,7 @@ void dev_mc_flush(struct net_device *dev)
  EXPORT_SYMBOL(dev_mc_flush);

  /**
- *     dev_mc_flush - Init multicast address list
+ *     dev_mc_init - Init multicast address list
   *     @dev: device
   *
   *     Init multicast address list.
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next v4 0/3] kernel: add support to collect hardware logs in crash recovery kernel
From: Rahul Lakkireddy @ 2018-04-17  7:44 UTC (permalink / raw)
  To: netdev, kexec, linux-fsdevel, linux-kernel
  Cc: davem, viro, ebiederm, stephen, akpm, torvalds, ganeshgr,
	nirranjan, indranil, Rahul Lakkireddy

On production servers running variety of workloads over time, kernel
panic can happen sporadically after days or even months. It is
important to collect as much debug logs as possible to root cause
and fix the problem, that may not be easy to reproduce. Snapshot of
underlying hardware/firmware state (like register dump, firmware
logs, adapter memory, etc.), at the time of kernel panic will be very
helpful while debugging the culprit device driver.

This series of patches add new generic framework that enable device
drivers to collect device specific snapshot of the hardware/firmware
state of the underlying device in the crash recovery kernel. In crash
recovery kernel, the collected logs are added as elf notes to
/proc/vmcore, which is copied by user space scripts for post-analysis.

The sequence of actions done by device drivers to append their device
specific hardware/firmware logs to /proc/vmcore are as follows:

1. During probe (before hardware is initialized), device drivers
register to the vmcore module (via vmcore_add_device_dump()), with
callback function, along with buffer size and log name needed for
firmware/hardware log collection.

2. vmcore module allocates the buffer with requested size. It adds
an elf note and invokes the device driver's registered callback
function.

3. Device driver collects all hardware/firmware logs into the buffer
and returns control back to vmcore module.

The device specific hardware/firmware logs can be seen as elf notes:

# readelf -n /proc/vmcore

Displaying notes found at file offset 0x00001000 with length 0x04003288:
  Owner                 Data size	Description
  VMCOREDD_cxgb4_0000:02:00.4 0x02000fd8	Unknown note type: (0x00000700)
  VMCOREDD_cxgb4_0000:04:00.4 0x02000fd8	Unknown note type: (0x00000700)
  CORE                 0x00000150	NT_PRSTATUS (prstatus structure)
  CORE                 0x00000150	NT_PRSTATUS (prstatus structure)
  CORE                 0x00000150	NT_PRSTATUS (prstatus structure)
  CORE                 0x00000150	NT_PRSTATUS (prstatus structure)
  CORE                 0x00000150	NT_PRSTATUS (prstatus structure)
  CORE                 0x00000150	NT_PRSTATUS (prstatus structure)
  CORE                 0x00000150	NT_PRSTATUS (prstatus structure)
  CORE                 0x00000150	NT_PRSTATUS (prstatus structure)
  VMCOREINFO           0x0000074f	Unknown note type: (0x00000000)

Patch 1 adds API to vmcore module to allow drivers to register callback
to collect the device specific hardware/firmware logs.  The logs will
be added to /proc/vmcore as elf notes.

Patch 2 updates read and mmap logic to append device specific hardware/
firmware logs as elf notes.

Patch 3 shows a cxgb4 driver example using the API to collect
hardware/firmware logs in crash recovery kernel, before hardware is
initialized.

Thanks,
Rahul

RFC v1: https://lkml.org/lkml/2018/3/2/542
RFC v2: https://lkml.org/lkml/2018/3/16/326

---
v4:
- Made __vmcore_add_device_dump() static.
- Moved compile check to define vmcore_add_device_dump() to
  crash_dump.h to fix compilation when vmcore.c is not compiled in.
- Convert ---help--- to help in Kconfig as indicated by checkpatch.
- Rebased to tip.

v3:
- Dropped sysfs crashdd module.
- Exported dumps as elf notes. Suggested by Eric Biederman
  <ebiederm@xmission.com>.  Added as patch 2 in this version.
- Added CONFIG_PROC_VMCORE_DEVICE_DUMP to allow configuring device
  dump support.
- Moved logic related to adding dumps from crashdd to vmcore module.
- Rename all crashdd* to vmcoredd*.
- Updated comments.

v2:
- Added ABI Documentation for crashdd.
- Directly use octal permission instead of macro.

Changes since rfc v2:
- Moved exporting crashdd from procfs to sysfs. Suggested by
  Stephen Hemminger <stephen@networkplumber.org>
- Moved code from fs/proc/crashdd.c to fs/crashdd/ directory.
- Replaced all proc API with sysfs API and updated comments.
- Calling driver callback before creating the binary file under
  crashdd sysfs.
- Changed binary dump file permission from S_IRUSR to S_IRUGO.
- Changed module name from CRASH_DRIVER_DUMP to CRASH_DEVICE_DUMP.

rfc v2:
- Collecting logs in 2nd kernel instead of during kernel panic.
  Suggested by Eric Biederman <ebiederm@xmission.com>.
- Added new crashdd module that exports /proc/crashdd/ containing
  driver's registered hardware/firmware logs in patch 1.
- Replaced the API to allow drivers to register their hardware/firmware
  log collect routine in crash recovery kernel in patch 1.
- Updated patch 2 to use the new API in patch 1.

Rahul Lakkireddy (3):
  vmcore: add API to collect hardware dump in second kernel
  vmcore: append device dumps to vmcore as elf notes
  cxgb4: collect hardware dump in second kernel

 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h       |   4 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c |  25 ++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h |   3 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c  |  10 +
 fs/proc/Kconfig                                  |  10 +
 fs/proc/vmcore.c                                 | 399 ++++++++++++++++++++++-
 include/linux/crash_core.h                       |   4 +
 include/linux/crash_dump.h                       |  17 +
 include/linux/kcore.h                            |   6 +
 include/uapi/linux/elf.h                         |   1 +
 10 files changed, 466 insertions(+), 13 deletions(-)

-- 
2.14.1

^ permalink raw reply

* [PATCH net-next v4 1/3] vmcore: add API to collect hardware dump in second kernel
From: Rahul Lakkireddy @ 2018-04-17  7:44 UTC (permalink / raw)
  To: netdev, kexec, linux-fsdevel, linux-kernel
  Cc: davem, viro, ebiederm, stephen, akpm, torvalds, ganeshgr,
	nirranjan, indranil, Rahul Lakkireddy
In-Reply-To: <cover.1523950321.git.rahul.lakkireddy@chelsio.com>

The sequence of actions done by device drivers to append their device
specific hardware/firmware logs to /proc/vmcore are as follows:

1. During probe (before hardware is initialized), device drivers
register to the vmcore module (via vmcore_add_device_dump()), with
callback function, along with buffer size and log name needed for
firmware/hardware log collection.

2. vmcore module allocates the buffer with requested size. It adds
an Elf note and invokes the device driver's registered callback
function.

3. Device driver collects all hardware/firmware logs into the buffer
and returns control back to vmcore module.

Ensure that the device dump buffer size is always aligned to page size
so that it can be mmaped.

Also, rename alloc_elfnotes_buf() to vmcore_alloc_buf() to make it more
generic and reserve NT_VMCOREDD note type to indicate vmcore device
dump.

Suggested-by: Eric Biederman <ebiederm@xmission.com>.
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
v4:
- Made __vmcore_add_device_dump() static.
- Moved compile check to define vmcore_add_device_dump() to
  crash_dump.h to fix compilation when vmcore.c is not compiled in.
- Convert ---help--- to help in Kconfig as indicated by checkpatch.
- Rebased to tip.

v3:
- Dropped sysfs crashdd module.
- Added CONFIG_PROC_VMCORE_DEVICE_DUMP to allow configuring device
  dump support.
- Moved logic related to adding dumps from crashdd to vmcore module.
- Rename all crashdd* to vmcoredd*.

v2:
- Added ABI Documentation for crashdd.
- Directly use octal permission instead of macro.

Changes since rfc v2:
- Moved exporting crashdd from procfs to sysfs.  Suggested by
  Stephen Hemminger <stephen@networkplumber.org>
- Moved code from fs/proc/crashdd.c to fs/crashdd/ directory.
- Replaced all proc API with sysfs API and updated comments.
- Calling driver callback before creating the binary file under
  crashdd sysfs.
- Changed binary dump file permission from S_IRUSR to S_IRUGO.
- Changed module name from CRASH_DRIVER_DUMP to CRASH_DEVICE_DUMP.

rfc v2:
- Collecting logs in 2nd kernel instead of during kernel panic.
  Suggested by Eric Biederman <ebiederm@xmission.com>.
- Patch added in this series.

 fs/proc/Kconfig            |  10 +++
 fs/proc/vmcore.c           | 152 ++++++++++++++++++++++++++++++++++++++++++---
 include/linux/crash_core.h |   4 ++
 include/linux/crash_dump.h |  17 +++++
 include/linux/kcore.h      |   6 ++
 include/uapi/linux/elf.h   |   1 +
 6 files changed, 181 insertions(+), 9 deletions(-)

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 1ade1206bb89..504c77a444bd 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -43,6 +43,16 @@ config PROC_VMCORE
         help
         Exports the dump image of crashed kernel in ELF format.
 
+config PROC_VMCORE_DEVICE_DUMP
+	bool "Device Hardware/Firmware Log Collection"
+	depends on PROC_VMCORE
+	default y
+	help
+	  Device drivers can collect the device specific snapshot of
+	  their hardware or firmware before they are initialized in
+	  crash recovery kernel. If you say Y here, the device dumps
+	  will be added as ELF notes to /proc/vmcore
+
 config PROC_SYSCTL
 	bool "Sysctl support (/proc/sys)" if EXPERT
 	depends on PROC_FS
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a45f0af22a60..7395462d2f86 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -20,6 +20,7 @@
 #include <linux/init.h>
 #include <linux/crash_dump.h>
 #include <linux/list.h>
+#include <linux/mutex.h>
 #include <linux/vmalloc.h>
 #include <linux/pagemap.h>
 #include <linux/uaccess.h>
@@ -44,6 +45,12 @@ static u64 vmcore_size;
 
 static struct proc_dir_entry *proc_vmcore;
 
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/* Device Dump list and mutex to synchronize access to list */
+static LIST_HEAD(vmcoredd_list);
+static DEFINE_MUTEX(vmcoredd_mutex);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
 /*
  * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
  * The called function has to take care of module refcounting.
@@ -302,10 +309,8 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
 };
 
 /**
- * alloc_elfnotes_buf - allocate buffer for ELF note segment in
- *                      vmalloc memory
- *
- * @notes_sz: size of buffer
+ * vmcore_alloc_buf - allocate buffer in vmalloc memory
+ * @sizez: size of buffer
  *
  * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
  * the buffer to user-space by means of remap_vmalloc_range().
@@ -313,12 +318,12 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
  * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
  * disabled and there's no need to allow users to mmap the buffer.
  */
-static inline char *alloc_elfnotes_buf(size_t notes_sz)
+static inline char *vmcore_alloc_buf(size_t size)
 {
 #ifdef CONFIG_MMU
-	return vmalloc_user(notes_sz);
+	return vmalloc_user(size);
 #else
-	return vzalloc(notes_sz);
+	return vzalloc(size);
 #endif
 }
 
@@ -665,7 +670,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 		return rc;
 
 	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
-	*notes_buf = alloc_elfnotes_buf(*notes_sz);
+	*notes_buf = vmcore_alloc_buf(*notes_sz);
 	if (!*notes_buf)
 		return -ENOMEM;
 
@@ -851,7 +856,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 		return rc;
 
 	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
-	*notes_buf = alloc_elfnotes_buf(*notes_sz);
+	*notes_buf = vmcore_alloc_buf(*notes_sz);
 	if (!*notes_buf)
 		return -ENOMEM;
 
@@ -1145,6 +1150,132 @@ static int __init parse_crash_elf_headers(void)
 	return 0;
 }
 
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+/**
+ * vmcoredd_get_note_size - Get size of the note that will be inserted at
+ * beginning of the dump's buffer.
+ * @name: Note's name
+ *
+ * Gets the overall size of the note that will be inserted at the beginning
+ * of the dump's buffer.  It also adds padding, if necessary to meet
+ * alignment requirements.
+ */
+static inline size_t vmcoredd_get_note_size(const char *name)
+{
+	return CRASH_CORE_NOTE_HEAD_BYTES +
+	       ALIGN(VMCOREDD_NOTE_NAME_BYTES + strlen(name), sizeof(Elf_Word));
+}
+
+/**
+ * vmcoredd_write_note - Write note at the beginning of the dump's buffer
+ * @name: Dump's name
+ * @buf: Output buffer where the note is written
+ * @size: Size of the dump
+ *
+ * Fills beginning of the dump's data with elf note.
+ */
+static void vmcoredd_write_note(const char *name, void *buf, size_t size)
+{
+	struct elf_note *note = (struct elf_note *)buf;
+	Elf_Word *word = (Elf_Word *)note;
+
+	note->n_namesz = ALIGN(VMCOREDD_NOTE_NAME_BYTES + strlen(name),
+			       sizeof(Elf_Word));
+	note->n_descsz = size;
+	note->n_type = NT_VMCOREDD;
+	word += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word));
+	snprintf((char *)word, note->n_namesz, "%s_%s", VMCOREDD_NOTE_NAME,
+		 name);
+}
+
+/**
+ * vmcore_add_device_dump - Add a buffer containing device dump to vmcore
+ * @data: dump info.
+ *
+ * Allocate a buffer and invoke the calling driver's dump collect routine.
+ * Write Elf note at the beginning of the buffer to indicate vmcore device
+ * dump and add the dump to global list.
+ */
+static int __vmcore_add_device_dump(struct vmcoredd_data *data)
+{
+	size_t note_size, data_size;
+	struct vmcoredd_node *dump;
+	void *buf = NULL;
+	int ret;
+
+	if (!data || !strlen(data->name) ||
+	    !data->vmcoredd_callback || !data->size)
+		return -EINVAL;
+
+	dump = vzalloc(sizeof(*dump));
+	if (!dump) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	note_size = vmcoredd_get_note_size(data->name);
+	/* Keep size of the buffer page aligned so that it can be mmaped */
+	data_size = roundup(note_size + data->size, PAGE_SIZE);
+
+	/* Allocate buffer for driver's to write their dumps */
+	buf = vmcore_alloc_buf(data_size);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	vmcoredd_write_note(data->name, buf, data_size - note_size);
+
+	/* Invoke the driver's dump collection routing */
+	ret = data->vmcoredd_callback(data, buf + note_size);
+	if (ret)
+		goto out_err;
+
+	dump->buf = buf;
+	dump->size = data_size;
+
+	/* Add the dump to driver sysfs list */
+	mutex_lock(&vmcoredd_mutex);
+	list_add_tail(&dump->list, &vmcoredd_list);
+	mutex_unlock(&vmcoredd_mutex);
+
+	return 0;
+
+out_err:
+	if (buf)
+		vfree(buf);
+
+	if (dump)
+		vfree(dump);
+
+	return ret;
+}
+
+int vmcore_add_device_dump(struct vmcoredd_data *data)
+{
+	return __vmcore_add_device_dump(data);
+}
+EXPORT_SYMBOL(vmcore_add_device_dump);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+/* Free all dumps in vmcore device dump list */
+static void vmcore_free_device_dumps(void)
+{
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+	mutex_lock(&vmcoredd_mutex);
+	while (!list_empty(&vmcoredd_list)) {
+		struct vmcoredd_node *dump;
+
+		dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node,
+					list);
+		list_del(&dump->list);
+		vfree(dump->buf);
+		vfree(dump);
+	}
+	mutex_unlock(&vmcoredd_mutex);
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+}
+
 /* Init function for vmcore module. */
 static int __init vmcore_init(void)
 {
@@ -1192,4 +1323,7 @@ void vmcore_cleanup(void)
 		kfree(m);
 	}
 	free_elfcorebuf();
+
+	/* clear vmcore device dump list */
+	vmcore_free_device_dumps();
 }
diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h
index b511f6d24b42..3b6b041d84c8 100644
--- a/include/linux/crash_core.h
+++ b/include/linux/crash_core.h
@@ -27,6 +27,10 @@
 				     VMCOREINFO_NOTE_NAME_BYTES +	\
 				     VMCOREINFO_BYTES)
 
+#define VMCOREDD_MAX_NAME_BYTES  32
+#define VMCOREDD_NOTE_NAME	 "VMCOREDD"
+#define VMCOREDD_NOTE_NAME_BYTES sizeof(VMCOREDD_NOTE_NAME)
+
 typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4];
 
 void crash_update_vmcoreinfo_safecopy(void *ptr);
diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
index f7ac2aa93269..658d508d1ec5 100644
--- a/include/linux/crash_dump.h
+++ b/include/linux/crash_dump.h
@@ -93,4 +93,21 @@ static inline bool is_kdump_kernel(void) { return 0; }
 #endif /* CONFIG_CRASH_DUMP */
 
 extern unsigned long saved_max_pfn;
+
+/* Device Dump information to be filled by drivers */
+struct vmcoredd_data {
+	char name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */
+	unsigned long size;                 /* Size of the dump */
+	/* Driver's registered callback to be invoked to collect dump */
+	int (*vmcoredd_callback)(struct vmcoredd_data *data, void *buf);
+};
+
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+int vmcore_add_device_dump(struct vmcoredd_data *data);
+#else
+static inline int vmcore_add_device_dump(struct vmcoredd_data *data)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 #endif /* LINUX_CRASHDUMP_H */
diff --git a/include/linux/kcore.h b/include/linux/kcore.h
index 80db19d3a505..aa26e7199060 100644
--- a/include/linux/kcore.h
+++ b/include/linux/kcore.h
@@ -28,6 +28,12 @@ struct vmcore {
 	loff_t offset;
 };
 
+struct vmcoredd_node {
+	struct list_head list;	/* List of dumps */
+	void *buf;		/* Buffer containing device's dump */
+	unsigned long size;	/* Size of the buffer */
+};
+
 #ifdef CONFIG_PROC_KCORE
 extern void kclist_add(struct kcore_list *, void *, size_t, int type);
 #else
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index e2535d6dcec7..4e12c423b9fe 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -421,6 +421,7 @@ typedef struct elf64_shdr {
 #define NT_ARM_SYSTEM_CALL	0x404	/* ARM system call number */
 #define NT_ARM_SVE	0x405		/* ARM Scalable Vector Extension registers */
 #define NT_ARC_V2	0x600		/* ARCv2 accumulator/extra registers */
+#define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
 
 /* Note header in a PT_NOTE section */
 typedef struct elf32_note {
-- 
2.14.1

^ permalink raw reply related

* [PATCH net-next v4 2/3] vmcore: append device dumps to vmcore as elf notes
From: Rahul Lakkireddy @ 2018-04-17  7:44 UTC (permalink / raw)
  To: netdev-u79uwXL29TY76Z2rM5mHXA,
	kexec-IAPFreCvJWM7uuMidbF8XUB+6BGkLq7r,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: indranil-ut6Up61K2wZBDgjK7y7TUQ, nirranjan-ut6Up61K2wZBDgjK7y7TUQ,
	stephen-OTpzqLSitTUnbdJkjeBofR2eb7JE58TQ,
	ganeshgr-ut6Up61K2wZBDgjK7y7TUQ, Rahul Lakkireddy,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	torvalds-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q,
	viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn
In-Reply-To: <cover.1523950321.git.rahul.lakkireddy-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>

Update read and mmap logic to append device dumps as additional notes
before the other elf notes. We add device dumps before other elf notes
because the other elf notes may not fill the elf notes buffer
completely and we will end up with zero-filled data between the elf
notes and the device dumps. Tools will then try to decode this
zero-filled data as valid notes and we don't want that. Hence, adding
device dumps before the other elf notes ensure that zero-filled data
can be avoided. This also ensures that the device dumps and the
other elf notes can be properly mmaped at page aligned address.

Incorporate device dump size into the total vmcore size. Also update
offsets for other program headers after the device dumps are added.

Suggested-by: Eric Biederman <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>.
Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
Signed-off-by: Ganesh Goudar <ganeshgr-ut6Up61K2wZBDgjK7y7TUQ@public.gmane.org>
---
v4:
- No changes.

v3:
- Patch added in this version.
- Exported dumps as elf notes. Suggested by Eric Biederman
  <ebiederm-aS9lmoZGLiVWk0Htik3J/w@public.gmane.org>.

 fs/proc/vmcore.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 243 insertions(+), 4 deletions(-)

diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 7395462d2f86..ed1ebd85e14e 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -39,6 +39,8 @@ static size_t elfcorebuf_sz_orig;
 
 static char *elfnotes_buf;
 static size_t elfnotes_sz;
+/* Size of all notes minus the device dump notes */
+static size_t elfnotes_orig_sz;
 
 /* Total size of vmcore file. */
 static u64 vmcore_size;
@@ -51,6 +53,9 @@ static LIST_HEAD(vmcoredd_list);
 static DEFINE_MUTEX(vmcoredd_mutex);
 #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
 
+/* Device Dump Size */
+static size_t vmcoredd_orig_sz;
+
 /*
  * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
  * The called function has to take care of module refcounting.
@@ -185,6 +190,77 @@ static int copy_to(void *target, void *src, size_t size, int userbuf)
 	return 0;
 }
 
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+static int vmcoredd_copy_dumps(void *dst, u64 start, size_t size, int userbuf)
+{
+	struct vmcoredd_node *dump;
+	u64 offset = 0;
+	int ret = 0;
+	size_t tsz;
+	char *buf;
+
+	mutex_lock(&vmcoredd_mutex);
+	list_for_each_entry(dump, &vmcoredd_list, list) {
+		if (start < offset + dump->size) {
+			tsz = min(offset + (u64)dump->size - start, (u64)size);
+			buf = dump->buf + start - offset;
+			if (copy_to(dst, buf, tsz, userbuf)) {
+				ret = -EFAULT;
+				goto out_unlock;
+			}
+
+			size -= tsz;
+			start += tsz;
+			dst += tsz;
+
+			/* Leave now if buffer filled already */
+			if (!size)
+				goto out_unlock;
+		}
+		offset += dump->size;
+	}
+
+out_unlock:
+	mutex_unlock(&vmcoredd_mutex);
+	return ret;
+}
+
+static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
+			       u64 start, size_t size)
+{
+	struct vmcoredd_node *dump;
+	u64 offset = 0;
+	int ret = 0;
+	size_t tsz;
+	char *buf;
+
+	mutex_lock(&vmcoredd_mutex);
+	list_for_each_entry(dump, &vmcoredd_list, list) {
+		if (start < offset + dump->size) {
+			tsz = min(offset + (u64)dump->size - start, (u64)size);
+			buf = dump->buf + start - offset;
+			if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) {
+				ret = -EFAULT;
+				goto out_unlock;
+			}
+
+			size -= tsz;
+			start += tsz;
+			dst += tsz;
+
+			/* Leave now if buffer filled already */
+			if (!size)
+				goto out_unlock;
+		}
+		offset += dump->size;
+	}
+
+out_unlock:
+	mutex_unlock(&vmcoredd_mutex);
+	return ret;
+}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
 /* Read from the ELF header and then the crash dump. On error, negative value is
  * returned otherwise number of bytes read are returned.
  */
@@ -222,10 +298,41 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
 	if (*fpos < elfcorebuf_sz + elfnotes_sz) {
 		void *kaddr;
 
+		/* We add device dumps before other elf notes because the
+		 * other elf notes may not fill the elf notes buffer
+		 * completely and we will end up with zero-filled data
+		 * between the elf notes and the device dumps. Tools will
+		 * then try to decode this zero-filled data as valid notes
+		 * and we don't want that. Hence, adding device dumps before
+		 * the other elf notes ensure that zero-filled data can be
+		 * avoided.
+		 */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+		/* Read device dumps */
+		if (*fpos < elfcorebuf_sz + vmcoredd_orig_sz) {
+			tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
+				  (size_t)*fpos, buflen);
+			start = *fpos - elfcorebuf_sz;
+			if (vmcoredd_copy_dumps(buffer, start, tsz, userbuf))
+				return -EFAULT;
+
+			buflen -= tsz;
+			*fpos += tsz;
+			buffer += tsz;
+			acc += tsz;
+
+			/* leave now if filled buffer already */
+			if (!buflen)
+				return acc;
+		}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+		/* Read remaining elf notes */
 		tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen);
-		kaddr = elfnotes_buf + *fpos - elfcorebuf_sz;
+		kaddr = elfnotes_buf + *fpos - elfcorebuf_sz - vmcoredd_orig_sz;
 		if (copy_to(buffer, kaddr, tsz, userbuf))
 			return -EFAULT;
+
 		buflen -= tsz;
 		*fpos += tsz;
 		buffer += tsz;
@@ -451,11 +558,46 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
 	if (start < elfcorebuf_sz + elfnotes_sz) {
 		void *kaddr;
 
+		/* We add device dumps before other elf notes because the
+		 * other elf notes may not fill the elf notes buffer
+		 * completely and we will end up with zero-filled data
+		 * between the elf notes and the device dumps. Tools will
+		 * then try to decode this zero-filled data as valid notes
+		 * and we don't want that. Hence, adding device dumps before
+		 * the other elf notes ensure that zero-filled data can be
+		 * avoided. This also ensures that the device dumps and
+		 * other elf notes can be properly mmaped at page aligned
+		 * address.
+		 */
+#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
+		/* Read device dumps */
+		if (start < elfcorebuf_sz + vmcoredd_orig_sz) {
+			u64 start_off;
+
+			tsz = min(elfcorebuf_sz + vmcoredd_orig_sz -
+				  (size_t)start, size);
+			start_off = start - elfcorebuf_sz;
+			if (vmcoredd_mmap_dumps(vma, vma->vm_start + len,
+						start_off, tsz))
+				goto fail;
+
+			size -= tsz;
+			start += tsz;
+			len += tsz;
+
+			/* leave now if filled buffer already */
+			if (!size)
+				return 0;
+		}
+#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
+
+		/* Read remaining elf notes */
 		tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
-		kaddr = elfnotes_buf + start - elfcorebuf_sz;
+		kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz;
 		if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
 						kaddr, tsz))
 			goto fail;
+
 		size -= tsz;
 		start += tsz;
 		len += tsz;
@@ -703,6 +845,11 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
 	/* Modify e_phnum to reflect merged headers. */
 	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
 
+	/* Store the size of all notes.  We need this to update the note
+	 * header when the device dumps will be added.
+	 */
+	elfnotes_orig_sz = phdr.p_memsz;
+
 	return 0;
 }
 
@@ -889,6 +1036,11 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
 	/* Modify e_phnum to reflect merged headers. */
 	ehdr_ptr->e_phnum = ehdr_ptr->e_phnum - nr_ptnote + 1;
 
+	/* Store the size of all notes.  We need this to update the note
+	 * header when the device dumps will be added.
+	 */
+	elfnotes_orig_sz = phdr.p_memsz;
+
 	return 0;
 }
 
@@ -981,8 +1133,8 @@ static int __init process_ptload_program_headers_elf32(char *elfptr,
 }
 
 /* Sets offset fields of vmcore elements. */
-static void __init set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
-					   struct list_head *vc_list)
+static void set_vmcore_list_offsets(size_t elfsz, size_t elfnotes_sz,
+				    struct list_head *vc_list)
 {
 	loff_t vmcore_off;
 	struct vmcore *m;
@@ -1188,6 +1340,92 @@ static void vmcoredd_write_note(const char *name, void *buf, size_t size)
 		 name);
 }
 
+/**
+ * vmcoredd_update_program_headers - Update all Elf program headers
+ * @elfptr: Pointer to elf header
+ * @elfnotesz: Size of elf notes aligned to page size
+ * @vmcoreddsz: Size of device dumps to be added to elf note header
+ *
+ * Determine type of Elf header (Elf64 or Elf32) and update the elf note size.
+ * Also update the offsets of all the program headers after the elf note header.
+ */
+static void vmcoredd_update_program_headers(char *elfptr, size_t elfnotesz,
+					    size_t vmcoreddsz)
+{
+	unsigned char *e_ident = (unsigned char *)elfptr;
+	u64 start, end, size;
+	loff_t vmcore_off;
+	u32 i;
+
+	vmcore_off = elfcorebuf_sz + elfnotesz;
+
+	if (e_ident[EI_CLASS] == ELFCLASS64) {
+		Elf64_Ehdr *ehdr = (Elf64_Ehdr *)elfptr;
+		Elf64_Phdr *phdr = (Elf64_Phdr *)(elfptr + sizeof(Elf64_Ehdr));
+
+		/* Update all program headers */
+		for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+			if (phdr->p_type == PT_NOTE) {
+				/* Update note size */
+				phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz;
+				phdr->p_filesz = phdr->p_memsz;
+				continue;
+			}
+
+			start = rounddown(phdr->p_offset, PAGE_SIZE);
+			end = roundup(phdr->p_offset + phdr->p_memsz,
+				      PAGE_SIZE);
+			size = end - start;
+			phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+			vmcore_off += size;
+		}
+	} else {
+		Elf32_Ehdr *ehdr = (Elf32_Ehdr *)elfptr;
+		Elf32_Phdr *phdr = (Elf32_Phdr *)(elfptr + sizeof(Elf32_Ehdr));
+
+		/* Update all program headers */
+		for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+			if (phdr->p_type == PT_NOTE) {
+				/* Update note size */
+				phdr->p_memsz = elfnotes_orig_sz + vmcoreddsz;
+				phdr->p_filesz = phdr->p_memsz;
+				continue;
+			}
+
+			start = rounddown(phdr->p_offset, PAGE_SIZE);
+			end = roundup(phdr->p_offset + phdr->p_memsz,
+				      PAGE_SIZE);
+			size = end - start;
+			phdr->p_offset = vmcore_off + (phdr->p_offset - start);
+			vmcore_off += size;
+		}
+	}
+}
+
+/**
+ * vmcoredd_update_size - Update the total size of the device dumps and update
+ * Elf header
+ * @dump_size: Size of the current device dump to be added to total size
+ *
+ * Update the total size of all the device dumps and update the Elf program
+ * headers. Calculate the new offsets for the vmcore list and update the
+ * total vmcore size.
+ */
+static void vmcoredd_update_size(size_t dump_size)
+{
+	vmcoredd_orig_sz += dump_size;
+	elfnotes_sz = roundup(elfnotes_orig_sz, PAGE_SIZE) + vmcoredd_orig_sz;
+	vmcoredd_update_program_headers(elfcorebuf, elfnotes_sz,
+					vmcoredd_orig_sz);
+
+	/* Update vmcore list offsets */
+	set_vmcore_list_offsets(elfcorebuf_sz, elfnotes_sz, &vmcore_list);
+
+	vmcore_size = get_vmcore_size(elfcorebuf_sz, elfnotes_sz,
+				      &vmcore_list);
+	proc_vmcore->size = vmcore_size;
+}
+
 /**
  * vmcore_add_device_dump - Add a buffer containing device dump to vmcore
  * @data: dump info.
@@ -1239,6 +1477,7 @@ static int __vmcore_add_device_dump(struct vmcoredd_data *data)
 	list_add_tail(&dump->list, &vmcoredd_list);
 	mutex_unlock(&vmcoredd_mutex);
 
+	vmcoredd_update_size(data_size);
 	return 0;
 
 out_err:
-- 
2.14.1

^ permalink raw reply related

* [PATCH net-next v4 3/3] cxgb4: collect hardware dump in second kernel
From: Rahul Lakkireddy @ 2018-04-17  7:44 UTC (permalink / raw)
  To: netdev, kexec, linux-fsdevel, linux-kernel
  Cc: davem, viro, ebiederm, stephen, akpm, torvalds, ganeshgr,
	nirranjan, indranil, Rahul Lakkireddy
In-Reply-To: <cover.1523950321.git.rahul.lakkireddy@chelsio.com>

Register callback to collect hardware/firmware dumps in second kernel
before hardware/firmware is initialized. The dumps for each device
will be available as elf notes in /proc/vmcore in second kernel.

Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
v4:
- No changes.

v3:
- Replaced all crashdd* with vmcoredd*.
- Replaced crashdd_add_dump() with vmcore_add_device_dump().
- Updated comments and commit message.

v2:
- No Changes.

Changes since rfc v2:
- Update comments and commit message for sysfs change.

rfc v2:
- Updated dump registration to the new API in patch 1.

 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h       |  4 ++++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c | 25 ++++++++++++++++++++++++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h |  3 +++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c  | 10 ++++++++++
 4 files changed, 42 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 688f95440af2..01e7aad4ce5b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -50,6 +50,7 @@
 #include <linux/net_tstamp.h>
 #include <linux/ptp_clock_kernel.h>
 #include <linux/ptp_classify.h>
+#include <linux/crash_dump.h>
 #include <asm/io.h>
 #include "t4_chip_type.h"
 #include "cxgb4_uld.h"
@@ -964,6 +965,9 @@ struct adapter {
 	struct hma_data hma;
 
 	struct srq_data *srq;
+
+	/* Dump buffer for collecting logs in kdump kernel */
+	struct vmcoredd_data vmcoredd;
 };
 
 /* Support for "sched-class" command to allow a TX Scheduling Class to be
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
index 143686c60234..76433d4fe483 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
@@ -488,3 +488,28 @@ void cxgb4_init_ethtool_dump(struct adapter *adapter)
 	adapter->eth_dump.version = adapter->params.fw_vers;
 	adapter->eth_dump.len = 0;
 }
+
+static int cxgb4_cudbg_vmcoredd_collect(struct vmcoredd_data *data, void *buf)
+{
+	struct adapter *adap = container_of(data, struct adapter, vmcoredd);
+	u32 len = data->size;
+
+	return cxgb4_cudbg_collect(adap, buf, &len, CXGB4_ETH_DUMP_ALL);
+}
+
+int cxgb4_cudbg_vmcore_add_dump(struct adapter *adap)
+{
+	struct vmcoredd_data *data = &adap->vmcoredd;
+	u32 len;
+
+	len = sizeof(struct cudbg_hdr) +
+	      sizeof(struct cudbg_entity_hdr) * CUDBG_MAX_ENTITY;
+	len += CUDBG_DUMP_BUFF_SIZE;
+
+	data->size = len;
+	snprintf(data->name, sizeof(data->name), "%s_%s", cxgb4_driver_name,
+		 adap->name);
+	data->vmcoredd_callback = cxgb4_cudbg_vmcoredd_collect;
+
+	return vmcore_add_device_dump(data);
+}
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
index ce1ac9a1c878..ef59ba1ed968 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h
@@ -41,8 +41,11 @@ enum CXGB4_ETHTOOL_DUMP_FLAGS {
 	CXGB4_ETH_DUMP_HW = (1 << 1), /* various FW and HW dumps */
 };
 
+#define CXGB4_ETH_DUMP_ALL (CXGB4_ETH_DUMP_MEM | CXGB4_ETH_DUMP_HW)
+
 u32 cxgb4_get_dump_length(struct adapter *adap, u32 flag);
 int cxgb4_cudbg_collect(struct adapter *adap, void *buf, u32 *buf_size,
 			u32 flag);
 void cxgb4_init_ethtool_dump(struct adapter *adapter);
+int cxgb4_cudbg_vmcore_add_dump(struct adapter *adap);
 #endif /* __CXGB4_CUDBG_H__ */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 24d2865b8806..32cad0acf76c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -5544,6 +5544,16 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (err)
 		goto out_free_adapter;
 
+	if (is_kdump_kernel()) {
+		/* Collect hardware state and append to /proc/vmcore */
+		err = cxgb4_cudbg_vmcore_add_dump(adapter);
+		if (err) {
+			dev_warn(adapter->pdev_dev,
+				 "Fail collecting vmcore device dump, err: %d. Continuing\n",
+				 err);
+			err = 0;
+		}
+	}
 
 	if (!is_t4(adapter->params.chip)) {
 		s_qpp = (QUEUESPERPAGEPF0_S +
-- 
2.14.1

^ permalink raw reply related

* Re: [PATCH net-next 3/5] ipv4: support sport, dport and ip protocol in RTM_GETROUTE
From: Ido Schimmel @ 2018-04-17  8:10 UTC (permalink / raw)
  To: Roopa Prabhu; +Cc: davem, netdev, dsa
In-Reply-To: <1523911298-8965-4-git-send-email-roopa@cumulusnetworks.com>

On Mon, Apr 16, 2018 at 01:41:36PM -0700, Roopa Prabhu wrote:
> @@ -2757,6 +2796,12 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
>  	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
>  	fl4.flowi4_mark = mark;
>  	fl4.flowi4_uid = uid;
> +	if (sport)
> +		fl4.fl4_sport = sport;
> +	if (dport)
> +		fl4.fl4_dport = dport;
> +	if (ip_proto)
> +		fl4.flowi4_proto = ip_proto;

Hi Roopa,

This info isn't set in the synthesized skb, but only in the flow info
and therefore not used for input routes. I see you added a test case,
but it's only for output routes. I believe an input route test case will
fail.

Also, note that the skb as synthesized now is invalid - iph->ihl is 0
for example - so the flow dissector will spit it out. It effectively
means that route get is broken when L4 hashing is used. It also affects
output routes because since commit 3765d35ed8b9 ("net: ipv4: Convert
inet_rtm_getroute to rcu versions of route lookup") the skb is used to
calculate the multipath hash.

^ permalink raw reply

* Re: [PATCH v2 3/8] net: ax88796: Do not free IRQ in ax_remove() (already freed in ax_close()).
From: Geert Uytterhoeven @ 2018-04-17  8:20 UTC (permalink / raw)
  To: Michael Schmitz, John Paul Adrian Glaubitz
  Cc: netdev, Andrew Lunn, Linux/m68k, Michael.Karcher, Michael Karcher
In-Reply-To: <1523930895-6973-4-git-send-email-schmitzmic@gmail.com>

Hi Michael, Adrian,

Thanks for your patch!

On Tue, Apr 17, 2018 at 4:08 AM, Michael Schmitz <schmitzmic@gmail.com> wrote:
> From: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
>
> This complements the fix in 82533ad9a1c that removed the free_irq

Please quote the commit's subject, too, like

... fix in commit 82533ad9a1 ("net: ethernet: ax88796: don't call free_irq
without request_irq first")

BTW, I have a git alias for that:

$ git help fixes
`git fixes' is aliased to `show --format='Fixes: %h ("%s")' -s'
$ git fixes 82533ad9a1c
Fixes: 82533ad9a1c ("net: ethernet: ax88796: don't call free_irq
without request_irq first")

> call in the error path of probe, to also not call free_irq when
> remove is called to revert the effects of probe.
>
> Signed-off-by: Michael Karcher <kernel@mkarcher.dialup.fu-berlin.de>

The patch is authored by Adrian, but his SoB is missing?

Michael (Schmitz): as you took the patch, you should add your SoB, too.

For the actual patch contents:
Reviewed-by: Geert Uytterhoeven <geert@linux-m68k.org>

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCH] VSOCK: make af_vsock.ko removable again
From: Jorgen S. Hansen @ 2018-04-17  8:30 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: netdev@vger.kernel.org, Cong Wang
In-Reply-To: <20180417062558.18018-1-stefanha@redhat.com>


> On Apr 17, 2018, at 8:25 AM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> 
> Commit c1eef220c1760762753b602c382127bfccee226d ("vsock: always call
> vsock_init_tables()") introduced a module_init() function without a
> corresponding module_exit() function.
> 
> Modules with an init function can only be removed if they also have an
> exit function.  Therefore the vsock module was considered "permanent"
> and could not be removed.
> 
> This patch adds an empty module_exit() function so that "rmmod vsock"
> works.  No explicit cleanup is required because:
> 
> 1. Transports call vsock_core_exit() upon exit and cannot be removed
>   while sockets are still alive.
> 2. vsock_diag.ko does not perform any action that requires cleanup by
>   vsock.ko.
> 
> Reported-by: Xiumei Mu <xmu@redhat.com>
> Cc: Cong Wang <xiyou.wangcong@gmail.com>
> Cc: Jorgen Hansen <jhansen@vmware.com>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
> net/vmw_vsock/af_vsock.c | 6 ++++++
> 1 file changed, 6 insertions(+)
> 
> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> index aac9b8f6552e..c1076c19b858 100644
> --- a/net/vmw_vsock/af_vsock.c
> +++ b/net/vmw_vsock/af_vsock.c
> @@ -2018,7 +2018,13 @@ const struct vsock_transport *vsock_core_get_transport(void)
> }
> EXPORT_SYMBOL_GPL(vsock_core_get_transport);
> 
> +static void __exit vsock_exit(void)
> +{
> +	/* Do nothing.  This function makes this module removable. */
> +}
> +
> module_init(vsock_init_tables);
> +module_exit(vsock_exit);
> 
> MODULE_AUTHOR("VMware, Inc.");
> MODULE_DESCRIPTION("VMware Virtual Socket Family");
> -- 
> 2.14.3
> 

Looks good to me.

Reviewed-by: Jorgen Hansen <jhansen@vmware.com>

^ permalink raw reply

* net: 4.9-stable regression in drivers/net/phy/micrel.c on 4.9.94
From: Lars Persson @ 2018-04-17  8:31 UTC (permalink / raw)
  To: netdev

Hi

We run into a NULL pointer dereference crash when booting 4.9.94 on our
Artpec-6 board with stmmac ethernet and Micrel KSZ9031 phy.

I traced this to the patch d7ba3c00047d ("net: phy: micrel: Restore
led_mode and clk_sel on resume") that was added in 4.9.94. This patch
makes kszphy_resume() depend on the kszphy_priv object having been
created and this happens only for those Micrel PHYs that have a .probe
callback assigned. This is not the case for KSZ9031.

This is already fixed in later kernels by bfe72442578b ("net: phy:
micrel: fix crash when statistic requested for KSZ9031 phy") thas assigns
a probe function for all Micrel PHYs that depend on the kszphy_priv existing.

Please consider applying this to the 4.9 stable tree.

Crash dump splat:
   Unable to handle kernel NULL pointer dereference at virtual address 00000008
   pgd = bd8bc000
   [00000008] *pgd=3d98e831, *pte=00000000, *ppte=00000000
   Internal error: Oops: 17 [#1] PREEMPT SMP ARM
   Modules linked in: e1000e nvmem_artpec6_efuse nvmem_core artpec6_trng(O) artpec6_lcpu(O)
   CPU: 0 PID: 216 Comm: netd Tainted: G           O    4.9.94-axis5-devel #1
   Hardware name: Axis ARTPEC-6 Platform
   task: bf344620 task.stack: bd10c000
   PC is at kszphy_config_reset+0x14/0x148
   LR is at kszphy_resume+0x1c/0x5c
   pc : [<804ad358>]    lr : [<804ad608>]    psr: 600c0113
   sp : bd10dd00  ip : ffff8dc7  fp : bf393200
   r10: 00000000  r9 : 00000002  r8 : 00000000
   r7 : bf3ad000  r6 : 00000000  r5 : bf086000  r4 : bf3ad400
   r3 : 00000001  r2 : 00000000  r1 : 00040003  r0 : bf3ad400
   Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
   Control: 10c5387d  Table: 3d8bc04a  DAC: 00000051
   Process netd (pid: 216, stack limit = 0xbd10c210)
   Stack: (0xbd10dd00 to 0xbd10e000)
   dd00: bf3ad400 bf086000 00000000 bf3ad000 00000000 804ad608 bf3ad400 bf086000
   dd20: 00000000 804ab404 bf3ad400 bf086000 804b345c 00000000 7ee94b94 00000000
   dd40: 00000000 804ab5a4 bf3ad400 bf086000 804b345c 80509278 bf086000 bf086000
   dd60: 00000002 ffffffff 7ee94b94 804ae4a4 00000002 00000001 8014bc78 801647dc
   dd80: 00000000 beb25cc0 beb634e0 00000000 be97c87c 000000c3 800f0093 80682d1c
   dda0: 800f0093 beb634e0 00000000 80682d1c beb25cc0 802a6388 beb88444 00000000
   ddc0: beb88450 000000c3 00000001 00000001 00000001 801647dc 00000001 beb88440
   dde0: 00000001 801414fc 00000001 805de9f4 bd10dea0 00000001 beae9b8c bf086000
   de00: 00000001 80743d58 bf086030 804b25b0 8064fd7c 801414fc fffffff2 bd10de64
   de20: 0000000d 801414fc bf0864c0 bd10de64 0000000d 801414fc bf086000 bf086000
   de40: 00000001 80743d58 bf086030 7ee94b94 00000000 00000000 bf393200 80567b00
   de60: bf086188 bf086000 80567da4 bf086000 00000001 00001003 00001002 80567dcc
   de80: bf086000 00001002 00000000 bf086148 7ee94b94 80567e9c 00000000 bd10dec8
   dea0: 00000000 bf39320c 7ee94b94 805de4b0 00000000 bd10df00 00008914 bf086000
   dec0: 00000014 bf39320c 30687465 00000000 00000000 00000000 00001003 00000000
   dee0: 00000000 00000000 00000000 00008914 be643360 7ee94b94 be643340 7ee94b94
   df00: 00000008 00000000 00000000 80548420 7ee94b94 be643360 beae7ee0 00000008
   df20: 7ee94b94 80271884 00000000 00000000 00000000 beb0a700 00000000 be4f3360
   df40: 00000002 00000023 beb0a708 00000000 76f216c4 8025ec18 00000000 8027db08
   df60: beae7ee0 8027db08 00000000 beae7ee1 beae7ee0 00008914 7ee94b94 00000008
   df80: 00000000 802721c4 01f1bcb0 76fadcf0 00000001 00000036 80108984 bd10c000
   dfa0: 00000000 801087c0 01f1bcb0 76fadcf0 00000008 00008914 7ee94b94 01f1be48
   dfc0: 01f1bcb0 76fadcf0 00000001 00000036 7ee94b94 00000008 0004cd2c 00000000
   dfe0: 00063d60 7ee94b74 00027344 76b10b2c 600f0010 00000008 00000000 7ee727f4
   [<804ad358>] (kszphy_config_reset) from [<804ad608>] (kszphy_resume+0x1c/0x5c)
   [<804ad608>] (kszphy_resume) from [<804ab404>] (phy_attach_direct+0xbc/0x1c4)
   [<804ab404>] (phy_attach_direct) from [<804ab5a4>] (phy_connect_direct+0x1c/0x54)
   [<804ab5a4>] (phy_connect_direct) from [<80509278>] (of_phy_connect+0x40/0x68)
   [<80509278>] (of_phy_connect) from [<804ae4a4>] (stmmac_init_phy+0x50/0x1ec)
   [<804ae4a4>] (stmmac_init_phy) from [<804b25b0>] (stmmac_open+0x70/0xc90)
   [<804b25b0>] (stmmac_open) from [<80567b00>] (__dev_open+0xc4/0x140)
   [<80567b00>] (__dev_open) from [<80567dcc>] (__dev_change_flags+0x9c/0x14c)
   [<80567dcc>] (__dev_change_flags) from [<80567e9c>] (dev_change_flags+0x20/0x50)
   [<80567e9c>] (dev_change_flags) from [<805de4b0>] (devinet_ioctl+0x6d4/0x798)
   [<805de4b0>] (devinet_ioctl) from [<80548420>] (sock_ioctl+0x158/0x2e4)
   [<80548420>] (sock_ioctl) from [<80271884>] (do_vfs_ioctl+0xa8/0x974)
   [<80271884>] (do_vfs_ioctl) from [<802721c4>] (SyS_ioctl+0x74/0x84)
   [<802721c4>] (SyS_ioctl) from [<801087c0>] (ret_fast_syscall+0x0/0x48)
   Code: e52de004 e8bd4000 e1a04000 e59061d0 (e5d63008)

^ permalink raw reply

* [PATCH/RFC net-next 0/5] ravb: updates
From: Simon Horman @ 2018-04-17  8:50 UTC (permalink / raw)
  To: Sergei Shtylyov
  Cc: Magnus Damm, netdev, linux-renesas-soc, Wolfram Sang,
	Simon Horman

Hi Sergei,

this series is composed of otherwise unrelated RAVB patches from the R-Car
BSP v3.6.2 which at a first pass seem worth considering for upstream.

I would value your feedback on these patches so they can either proceed
into net-next or remain local to the BSP.

Thanks!

Kazuya Mizuguchi (4):
  ravb: correct ptp does failure after suspend and resume
  ravb: do not write 1 to reserved bits
  ravb: remove undocumented processing
  ravb: remove tx buffer addr 4byte alilgnment restriction for R-Car
    Gen3

Masaru Nagai (1):
  ravb: fix inconsistent lock state at enabling tx timestamp

 drivers/net/ethernet/renesas/ravb.h      |  23 +++-
 drivers/net/ethernet/renesas/ravb_main.c | 192 ++++++++++++++++---------------
 drivers/net/ethernet/renesas/ravb_ptp.c  |   2 +-
 3 files changed, 117 insertions(+), 100 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH/RFC net-next 1/5] ravb: fix inconsistent lock state at enabling tx timestamp
From: Simon Horman @ 2018-04-17  8:50 UTC (permalink / raw)
  To: Sergei Shtylyov
  Cc: Magnus Damm, netdev, linux-renesas-soc, Wolfram Sang,
	Masaru Nagai, Kazuya Mizuguchi, Simon Horman
In-Reply-To: <20180417085030.32650-1-horms+renesas@verge.net.au>

From: Masaru Nagai <masaru.nagai.vx@renesas.com>

[   58.490829] =================================
[   58.495205] [ INFO: inconsistent lock state ]
[   58.499583] 4.9.0-yocto-standard-00007-g2ef7caf #57 Not tainted
[   58.505529] ---------------------------------
[   58.509904] inconsistent {HARDIRQ-ON-W} -> {IN-HARDIRQ-W} usage.
[   58.515939] swapper/0/0 [HC1[1]:SC1[1]:HE0:SE0] takes:
[   58.521099]  (&(&list->lock)->rlock#2){?.-...}, at: [<ffff00000899f474>] skb_queue_tail+0x2c/0x68
{HARDIRQ-ON-W} state was registered at:
[   58.533654]   [   58.535155] [<ffff000008127e94>] mark_lock+0x1c4/0x718
[   58.540318]   [   58.541814] [<ffff000008129068>] __lock_acquire+0x660/0x1890
[   58.547501]   [   58.548997] [<ffff00000812a840>] lock_acquire+0xd0/0x290
[   58.554334]   [   58.555834] [<ffff000008acfd28>] _raw_spin_lock_bh+0x50/0x90
[   58.561520]   [   58.563018] [<ffff000008a31908>] first_packet_length+0x40/0x2b0
[   58.568965]   [   58.570461] [<ffff000008a31bd0>] udp_ioctl+0x58/0x120
[   58.575535]   [   58.577032] [<ffff000008a41df8>] inet_ioctl+0x58/0x128
[   58.582194]   [   58.583691] [<ffff0000089938e0>] sock_do_ioctl+0x40/0x88
[   58.589028]   [   58.590523] [<ffff000008994824>] sock_ioctl+0x284/0x350
[   58.595773]   [   58.597271] [<ffff000008295b88>] do_vfs_ioctl+0xb0/0x7c0
[   58.602607]   [   58.604103] [<ffff00000829632c>] SyS_ioctl+0x94/0xa8
[   58.609090]   [   58.610588] [<ffff00000808374c>] __sys_trace_return+0x0/0x4
[   58.616187] irq event stamp: 335205
[   58.619690] hardirqs last  enabled at (335204): [<ffff00000808180c>] __do_softirq+0xdc/0x5c4
[   58.628168] hardirqs last disabled at (335205): [<ffff000008082f70>] el1_irq+0x70/0x12c
[   58.636211] softirqs last  enabled at (335202): [<ffff0000080d7168>] _local_bh_enable+0x28/0x50
[   58.644950] softirqs last disabled at (335203): [<ffff0000080d76e4>] irq_exit+0xd4/0x100
[   58.653076]
[   58.653076] other info that might help us debug this:
[   58.659632]  Possible unsafe locking scenario:
[   58.659632]
[   58.665577]        CPU0
[   58.668031]        ----
[   58.670484]   lock(&(&list->lock)->rlock#2);
[   58.674799]   <Interrupt>
[   58.677427]     lock(&(&list->lock)->rlock#2);
[   58.681916]
[   58.681916]  *** DEADLOCK ***
[   58.681916]
[   58.687863] 1 lock held by swapper/0/0:
[   58.691713]  #0:  (&(&priv->lock)->rlock){-.-...}, at: [<ffff0000087981b0>] ravb_multi_interrupt+0x28/0x98
[   58.701456]
[   58.701456] stack backtrace:
[   58.705833] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.9.0-yocto-standard-00007-g2ef7caf #57
[   58.714396] Hardware name: Renesas Salvator-X board based on r8a7796 (DT)
[   58.721214] Call trace:
[   58.723672] [<ffff00000808a478>] dump_backtrace+0x0/0x1d8
[   58.729095] [<ffff00000808a674>] show_stack+0x24/0x30
[   58.734170] [<ffff000008490858>] dump_stack+0xb0/0xe8
[   58.740285] [<ffff0000081eb668>] print_usage_bug.part.24+0x264/0x27c
[   58.747697] [<ffff000008127e20>] mark_lock+0x150/0x718
[   58.753892] [<ffff000008129618>] __lock_acquire+0xc10/0x1890
[   58.760602] [<ffff00000812a840>] lock_acquire+0xd0/0x290
[   58.766956] [<ffff000008acfe50>] _raw_spin_lock_irqsave+0x58/0x98
[   58.774089] [<ffff00000899f474>] skb_queue_tail+0x2c/0x68
[   58.780518] [<ffff0000089a1678>] sock_queue_err_skb+0xc8/0x138
[   58.787364] [<ffff0000089a1774>] __skb_complete_tx_timestamp+0x8c/0xb8
[   58.794888] [<ffff0000089a7168>] __skb_tstamp_tx+0xd8/0x130
[   58.801437] [<ffff0000089a71f0>] skb_tstamp_tx+0x30/0x40
[   58.807723] [<ffff000008798144>] ravb_timestamp_interrupt+0x164/0x1a8
[   58.815144] [<ffff000008798210>] ravb_multi_interrupt+0x88/0x98
[   58.822043] [<ffff00000813add4>] __handle_irq_event_percpu+0x94/0x418
[   58.829464] [<ffff00000813b180>] handle_irq_event_percpu+0x28/0x60
[   58.836622] [<ffff00000813b208>] handle_irq_event+0x50/0x80
[   58.843166] [<ffff00000813f0f4>] handle_fasteoi_irq+0xdc/0x1e0
[   58.849968] [<ffff000008139cac>] generic_handle_irq+0x34/0x50
[   58.856681] [<ffff00000813a41c>] __handle_domain_irq+0x8c/0x100
[   58.863568] [<ffff000008081570>] gic_handle_irq+0x60/0xb8
[   58.869930] Exception stack(0xffff80063b0f9de0 to 0xffff80063b0f9f10)
[   58.877348] 9de0: ffff80063b0f9e10 0001000000000000 ffff80063b0f9f40 ffff000008081810
[   58.886159] 9e00: 0000000060000145 ffff000008082f70 ffff000009194b00 0000000000190f2c
[   58.894961] 9e20: 0000800632171000 000000000000000a 0000000000000000 000000000003a4d0
[   58.903767] 9e40: 0000000000000016 0000000000000023 ffff0000091952f8 0000000000000000
[   58.912568] 9e60: 0000000000000040 0000000000000000 0000000034d5d91d 0000000000000000
[   58.921363] 9e80: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
[   58.930133] 9ea0: 0000000000000000 ffff000009180000 ffff0000080d76e4 0000000000000052
[   58.938897] 9ec0: ffff000008d70000 0000000000000008 0000000000000000 0000000000000001
[   58.947660] 9ee0: ffff80063a428000 ffff000009185000 ffff000009180000 ffff80063b0f9f40
[   58.956430] 9f00: ffff00000808180c ffff80063b0f9f40
[   58.962253] [<ffff000008082fb4>] el1_irq+0xb4/0x12c
[   58.968096] [<ffff0000080d76e4>] irq_exit+0xd4/0x100
[   58.974025] [<ffff00000813a420>] __handle_domain_irq+0x90/0x100
[   58.980916] [<ffff000008081570>] gic_handle_irq+0x60/0xb8
[   58.987281] Exception stack(0xffff000009183d20 to 0xffff000009183e50)
[   58.994708] 3d20: ffff000009194b00 0000000000190f2b 0000800632171000 8c6318c6318c6320
[   59.003554] 3d40: 0000000000000000 000000000003a4d0 0000000000000016 000000000000002a
[   59.012416] 3d60: ffff0000091952f8 0000000000000000 0000000000001000 0000000000000000
[   59.021279] 3d80: 0000000034d5d91d 0000000000000000 0000000000000000 0000000000000000
[   59.030111] 3da0: 0000000000000000 0000000000000000 0000000000000000 0000000d9e3b53c4
[   59.038913] 3dc0: ffff800638fb1800 0000000000000001 ffff00000925ad40 0000000000000004
[   59.047726] 3de0: 0000000d9e0899ee 0000000000000001 ffff000008e3cc90 ffff00000918b000
[   59.056521] 3e00: ffff00000918b000 ffff000009183e50 ffff0000088d0acc ffff000009183e50
[   59.065298] 3e20: ffff0000088d0ad0 0000000060000145 0000000000000001 ffff0000088d0b70
[   59.074068] 3e40: ffffffffffffffff ffff0000088d0acc
[   59.079878] [<ffff000008082fb4>] el1_irq+0xb4/0x12c
[   59.085696] [<ffff0000088d0ad0>] cpuidle_enter_state+0x130/0x408
[   59.092656] [<ffff0000088d0e1c>] cpuidle_enter+0x34/0x48
[   59.098909] [<ffff00000811ff50>] call_cpuidle+0x40/0x70
[   59.105070] [<ffff00000812026c>] cpu_startup_entry+0x144/0x1f0
[   59.111845] [<ffff000008ac7f98>] rest_init+0x150/0x160
[   59.117925] [<ffff000008ec0b54>] start_kernel+0x38c/0x3a0
[   59.124261] [<ffff000008ec01d8>] __primary_switched+0x5c/0x64

Fixes: f51bdc236b6c ("ravb: Add dma queue interrupt support")
Signed-off-by: Masaru Nagai <masaru.nagai.vx@renesas.com>
Signed-off-by: Kazuya Mizuguchi <kazuya.mizuguchi.ks@renesas.com>
Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
---
 drivers/net/ethernet/renesas/ravb_main.c | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index 68f122140966..b311b1ac1286 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -481,7 +481,7 @@ static int ravb_dmac_init(struct net_device *ndev)
 	/* Receive FIFO full error, descriptor empty */
 	ravb_write(ndev, RIC2_QFE0 | RIC2_QFE1 | RIC2_RFFE, RIC2);
 	/* Frame transmitted, timestamp FIFO updated */
-	ravb_write(ndev, TIC_FTE0 | TIC_FTE1 | TIC_TFUE, TIC);
+	ravb_write(ndev, TIC_FTE0 | TIC_FTE1, TIC);
 
 	/* Setting the control will start the AVB-DMAC process. */
 	ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_OPERATION);
@@ -793,18 +793,6 @@ static bool ravb_queue_interrupt(struct net_device *ndev, int q)
 	return false;
 }
 
-static bool ravb_timestamp_interrupt(struct net_device *ndev)
-{
-	u32 tis = ravb_read(ndev, TIS);
-
-	if (tis & TIS_TFUF) {
-		ravb_write(ndev, ~TIS_TFUF, TIS);
-		ravb_get_tx_tstamp(ndev);
-		return true;
-	}
-	return false;
-}
-
 static irqreturn_t ravb_interrupt(int irq, void *dev_id)
 {
 	struct net_device *ndev = dev_id;
@@ -817,13 +805,9 @@ static irqreturn_t ravb_interrupt(int irq, void *dev_id)
 	iss = ravb_read(ndev, ISS);
 
 	/* Received and transmitted interrupts */
-	if (iss & (ISS_FRS | ISS_FTS | ISS_TFUS)) {
+	if (iss & (ISS_FRS | ISS_FTS)) {
 		int q;
 
-		/* Timestamp updated */
-		if (ravb_timestamp_interrupt(ndev))
-			result = IRQ_HANDLED;
-
 		/* Network control and best effort queue RX/TX */
 		for (q = RAVB_NC; q >= RAVB_BE; q--) {
 			if (ravb_queue_interrupt(ndev, q))
@@ -854,7 +838,7 @@ static irqreturn_t ravb_interrupt(int irq, void *dev_id)
 	return result;
 }
 
-/* Timestamp/Error/gPTP interrupt handler */
+/* Error/gPTP interrupt handler */
 static irqreturn_t ravb_multi_interrupt(int irq, void *dev_id)
 {
 	struct net_device *ndev = dev_id;
@@ -866,10 +850,6 @@ static irqreturn_t ravb_multi_interrupt(int irq, void *dev_id)
 	/* Get interrupt status */
 	iss = ravb_read(ndev, ISS);
 
-	/* Timestamp updated */
-	if ((iss & ISS_TFUS) && ravb_timestamp_interrupt(ndev))
-		result = IRQ_HANDLED;
-
 	/* Error status summary */
 	if (iss & ISS_ES) {
 		ravb_error_interrupt(ndev);
@@ -939,6 +919,10 @@ static int ravb_poll(struct napi_struct *napi, int budget)
 		}
 		/* Processing TX Descriptor Ring */
 		if (tis & mask) {
+			/* Timestamp updated */
+			if (q == RAVB_NC)
+				ravb_get_tx_tstamp(ndev);
+
 			spin_lock_irqsave(&priv->lock, flags);
 			/* Clear TX interrupt */
 			ravb_write(ndev, ~mask, TIS);
-- 
2.11.0

^ permalink raw reply related

* [PATCH/RFC net-next 2/5] ravb: correct ptp does failure after suspend and resume
From: Simon Horman @ 2018-04-17  8:50 UTC (permalink / raw)
  To: Sergei Shtylyov
  Cc: Magnus Damm, netdev, linux-renesas-soc, Wolfram Sang,
	Kazuya Mizuguchi, Simon Horman
In-Reply-To: <20180417085030.32650-1-horms+renesas@verge.net.au>

From: Kazuya Mizuguchi <kazuya.mizuguchi.ks@renesas.com>

This patch fixes the problem that ptp4l command does not work after
suspend and resume.
Add the initial setting in ravb_suspend() and ravb_resume(),
because ptp does not work.

Fixes: a0d2f20650e8 ("Renesas Ethernet AVB PTP clock driver")
Signed-off-by: Kazuya Mizuguchi <kazuya.mizuguchi.ks@renesas.com>
Signed-off-by: Simon Horman <horms+renesas@verge.net.au>
---
 drivers/net/ethernet/renesas/ravb_main.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index b311b1ac1286..dbde3d11458b 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -2295,6 +2295,9 @@ static int __maybe_unused ravb_suspend(struct device *dev)
 	else
 		ret = ravb_close(ndev);
 
+	if (priv->chip_id != RCAR_GEN2)
+		ravb_ptp_stop(ndev);
+
 	return ret;
 }
 
@@ -2302,6 +2305,7 @@ static int __maybe_unused ravb_resume(struct device *dev)
 {
 	struct net_device *ndev = dev_get_drvdata(dev);
 	struct ravb_private *priv = netdev_priv(ndev);
+	struct platform_device *pdev = priv->pdev;
 	int ret = 0;
 
 	/* If WoL is enabled set reset mode to rearm the WoL logic */
@@ -2330,6 +2334,9 @@ static int __maybe_unused ravb_resume(struct device *dev)
 	/* Restore descriptor base address table */
 	ravb_write(ndev, priv->desc_bat_dma, DBAT);
 
+	if (priv->chip_id != RCAR_GEN2)
+		ravb_ptp_init(ndev, pdev);
+
 	if (netif_running(ndev)) {
 		if (priv->wol_enabled) {
 			ret = ravb_wol_restore(ndev);
-- 
2.11.0

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox