Netdev List
 help / color / mirror / Atom feed
* [RFC bpf-next 3/7] bpf: sync tools/include/uapi/linux/bpf.h for pcap support
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire
In-Reply-To: <1567892444-16344-1-git-send-email-alan.maguire@oracle.com>

sync bpf.h updates for bpf_pcap helper and associated definitions

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 tools/include/uapi/linux/bpf.h | 92 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 91 insertions(+), 1 deletion(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 77c6be9..13f86d3 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2750,6 +2750,54 @@ struct bpf_stack_build_id {
  *		**-EOPNOTSUPP** kernel configuration does not enable SYN cookies
  *
  *		**-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_pcap(void *data, u32 size, struct bpf_map *map, int protocol,
+ *		u64 flags)
+ *	Description
+ *		Write packet data from *data* into a special BPF perf event
+ *              held by *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This
+ *		perf event has the same attributes as perf events generated
+ *		by bpf_perf_event_output.  For skb and xdp programs, *data*
+ *		is the relevant context, while for tracing programs,
+ *		*data* must be a pointer to a **struct sk_buff** derived
+ *		from kprobe or tracepoint arguments.
+ *
+ *		Metadata for this event is a **struct bpf_pcap_hdr**; this
+ *		contains the capture length, actual packet length and
+ *		the starting protocol.
+ *
+ *		The max number of bytes of context to store is specified via
+ *		*size*.
+ *
+ *		The flags value can be used to specify an id value of up
+ *		to 48 bits; the id can be used to correlate captured packets
+ *		with other trace data, since the passed-in flags value is
+ *		stored in the **struct bpf_pcap_hdr** in the **flags** field.
+ *		Specifying **BPF_F_PCAP_ID_IIFINDEX** and a non-zero value in
+ *		the id portion of the flags limits capture events to skbs
+ *		with the specified incoming ifindex, allowing limiting of
+ *		tracing to the associated interface.  Specifying
+ *		**BPF_F_PCAP_STRICT_TYPE** will cause *bpf_pcap* to return
+ *		-EPROTO and skip capture if a specific protocol is specified
+ *		and it does not match the current skb.  These additional flags
+ *		are only valid (and useful) for tracing programs.
+ *
+ *		The *protocol* value specifies the protocol type of the start
+ *		of the packet so that packet capture can carry out
+ *		interpretation.  See **pcap-linktype** (7) for details on
+ *		the supported values.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *		-ENOENT will be returned if the associated perf event
+ *		map entry is empty, the skb is zero-length,  or the incoming
+ *		ifindex was specified and we failed to match.
+ *		-EPROTO will be returned if **BPF_PCAP_TYPE_UNSET** is specified
+ *		and no protocol can be determined, or if we specify a protocol
+ *		along with **BPF_F_PCAP_STRICT_TYPE** and the skb protocol does
+ *		not match.
+ *		-EINVAL will be returned if the flags value is invalid.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2862,7 +2910,8 @@ struct bpf_stack_build_id {
 	FN(sk_storage_get),		\
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
-	FN(tcp_gen_syncookie),
+	FN(tcp_gen_syncookie),		\
+	FN(pcap),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2941,6 +2990,11 @@ enum bpf_func_id {
 /* BPF_FUNC_sk_storage_get flags */
 #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
 
+/* BPF_FUNC_pcap flags */
+#define	BPF_F_PCAP_ID_MASK		0xffffffffffff
+#define BPF_F_PCAP_ID_IIFINDEX		(1ULL << 48)
+#define BPF_F_PCAP_STRICT_TYPE         (1ULL << 56)
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
@@ -3613,4 +3667,40 @@ struct bpf_sockopt {
 	__s32	retval;
 };
 
+/* bpf_pcap_hdr contains information related to a particular packet capture
+ * flow.  It specifies
+ *
+ * - a magic number, BPF_PCAP_MAGIC, which identifies the perf event as
+ *   a pcap-related event.
+ * - a starting protocol, i.e. the protocol type of the start of the packet.
+ * - a flags value, copied from the flags value passed into bpf_pcap().
+ *   IDs can be used to correlate packet capture data and other tracing data.
+ *
+ * bpf_pcap_hdr also contains the information relating to the to-be-captured
+ * packet, and closely corresponds to the struct pcap_pkthdr used by
+ * pcap_dump (3PCAP).  The bpf_pcap helper sets ktime_ns to the time of
+ * capture in nanoseconds since boot; to get sensible pcap times this value
+ * should be converted to a time since the epoch (struct timeval) in pcap_pkthdr.
+ *
+ * When bpf_pcap() is used, a "struct bpf_pcap_hdr" is stored as we
+ * need both information about the particular packet and the protocol
+ * we are capturing.
+ */
+
+#define BPF_PCAP_MAGIC		0xb7fca7
+
+struct bpf_pcap_hdr {
+	__u32			magic;		/* BPF_PCAP_MAGIC */
+	int			protocol;	/* protocol at start of packet */
+	__u64			flags;		/* flags passed to bpf_pcap() */
+	__u64			ktime_ns;	/* capture time, ns since boot */
+	__u32			tot_len;	/* actual packet length */
+	__u32			cap_len;	/* captured length */
+	__u8			data[0];	/* captured packet data */
+};
+
+#define BPF_PCAP_TYPE_UNSET	-1
+#define BPF_PCAP_TYPE_ETH	1
+#define	BPF_PCAP_TYPE_IP	12
+
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
1.8.3.1


^ permalink raw reply related

* [RFC bpf-next 1/7] bpf: add bpf_pcap() helper to simplify packet capture
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire
In-Reply-To: <1567892444-16344-1-git-send-email-alan.maguire@oracle.com>

bpf_pcap() simplifies packet capture for skb and XDP
BPF programs by creating a BPF perf event containing information
relevant for packet capture (protocol, actual/captured packet
size, time of capture, etc) along with the packet payload itself.
All of this is stored in a "struct bpf_pcap_hdr".

This header information can then be retrieved from the perf
event map and used by packet capture frameworks such as libpcap
to carry out packet capture.

skb and XDP programs currently deal in Ethernet-based traffic
exclusively, so should specify BPF_PCAP_TYPE_ETH or
BPF_PCAP_TYPE_UNSET.  The protocol parameter will be used
in a later commit.

Note that libpcap assumes times are relative to the epoch while
we record nanoseconds since boot; as a result any times need
to be normalized with respect to the boot time for libpcap
storage; sysinfo(2) can be used to retrieve boot time to normalize
values appropriately.

Example usage for a tc-bpf program:

struct bpf_map_def SEC("maps") pcap_map = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(int),
	.max_entries = 1024,
};

SEC("cap")
int cap(struct __sk_buff *skb)
{
	bpf_pcap(skb, 1514, &pcap_map, BPF_PCAP_TYPE_ETH, 0);

	return TC_ACT_OK;
}

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 include/linux/bpf.h      | 20 +++++++++++++
 include/uapi/linux/bpf.h | 75 +++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/bpf/verifier.c    |  4 ++-
 net/core/filter.c        | 67 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 164 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b9d223..033c9cf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1145,4 +1145,24 @@ static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
 }
 #endif /* CONFIG_INET */
 
+/* Fill in *pcap*; returns 0, or -EINVAL if protocol < 0 or pcap is NULL. */
+static inline int bpf_pcap_prepare(int protocol, u32 cap_len, u32 tot_len,
+				   u64 flags, struct bpf_pcap_hdr *pcap)
+{
+	if (protocol < 0 || pcap == NULL)
+		return -EINVAL;
+	/* NOTE(review): the check above also rejects BPF_PCAP_TYPE_UNSET (-1). */
+	pcap->magic = BPF_PCAP_MAGIC;
+	pcap->protocol = protocol;
+	pcap->flags = flags;
+	/* A cap_len of 0, or one exceeding tot_len, means capture everything. */
+	if (cap_len == 0 || tot_len < cap_len)
+		cap_len = tot_len;
+	pcap->cap_len = cap_len;
+	pcap->tot_len = tot_len;
+	pcap->ktime_ns = ktime_get_mono_fast_ns();
+
+	return 0;
+}
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 77c6be9..a27e58e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2750,6 +2750,39 @@ struct bpf_stack_build_id {
  *		**-EOPNOTSUPP** kernel configuration does not enable SYN cookies
  *
  *		**-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_pcap(void *data, u32 size, struct bpf_map *map, int protocol,
+ *		u64 flags)
+ *	Description
+ *		Write packet data from *data* into a special BPF perf event
+ *              held by *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This
+ *		perf event has the same attributes as perf events generated
+ *		by bpf_perf_event_output.  For skb and xdp programs, *data*
+ *		is the relevant context.
+ *
+ *		Metadata for this event is a **struct bpf_pcap_hdr**; this
+ *		contains the capture length, actual packet length and
+ *		the starting protocol.
+ *
+ *		The max number of bytes of context to store is specified via
+ *		*size*.
+ *
+ *		The flags value can be used to specify an id value of up
+ *		to 48 bits; the id can be used to correlate captured packets
+ *		with other trace data, since the passed-in flags value is
+ *		stored in the **struct bpf_pcap_hdr** in the **flags** field.
+ *
+ *		The *protocol* value specifies the protocol type of the start
+ *		of the packet so that packet capture can carry out
+ *		interpretation.  See **pcap-linktype** (7) for details on
+ *		the supported values.
+ *
+ *	Return
+ *		0 on success, or a negative error in case of failure.
+ *		-ENOENT will be returned if the associated perf event
+ *		map entry is empty, or the skb is zero-length.
+ *		-EINVAL will be returned if the flags value is invalid.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2862,7 +2895,8 @@ struct bpf_stack_build_id {
 	FN(sk_storage_get),		\
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
-	FN(tcp_gen_syncookie),
+	FN(tcp_gen_syncookie),		\
+	FN(pcap),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -2941,6 +2975,9 @@ enum bpf_func_id {
 /* BPF_FUNC_sk_storage_get flags */
 #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
 
+/* BPF_FUNC_pcap flags */
+#define	BPF_F_PCAP_ID_MASK		0xffffffffffff
+
 /* Mode for BPF_FUNC_skb_adjust_room helper. */
 enum bpf_adj_room_mode {
 	BPF_ADJ_ROOM_NET,
@@ -3613,4 +3650,40 @@ struct bpf_sockopt {
 	__s32	retval;
 };
 
+/* bpf_pcap_hdr contains information related to a particular packet capture
+ * flow.  It specifies
+ *
+ * - a magic number, BPF_PCAP_MAGIC, which identifies the perf event as
+ *   a pcap-related event.
+ * - a starting protocol, i.e. the protocol type of the start of the packet.
+ * - a flags value, copied from the flags value passed into bpf_pcap().
+ *   IDs can be used to correlate packet capture data and other tracing data.
+ *
+ * bpf_pcap_hdr also contains the information relating to the to-be-captured
+ * packet, and closely corresponds to the struct pcap_pkthdr used by
+ * pcap_dump (3PCAP).  The bpf_pcap helper sets ktime_ns to the time of
+ * capture in nanoseconds since boot; to get sensible pcap times this value
+ * should be converted to a time since the epoch (struct timeval) in pcap_pkthdr.
+ *
+ * When bpf_pcap() is used, a "struct bpf_pcap_hdr" is stored as we
+ * need both information about the particular packet and the protocol
+ * we are capturing.
+ */
+
+#define BPF_PCAP_MAGIC		0xb7fca7
+
+struct bpf_pcap_hdr {
+	__u32			magic;		/* BPF_PCAP_MAGIC */
+	int			protocol;	/* protocol at start of packet */
+	__u64			flags;		/* flags passed to bpf_pcap() */
+	__u64			ktime_ns;	/* capture time, ns since boot */
+	__u32			tot_len;	/* actual packet length */
+	__u32			cap_len;	/* captured length, <= tot_len */
+	__u8			data[0];	/* captured packet data */
+};
+
+#define BPF_PCAP_TYPE_UNSET	-1
+#define BPF_PCAP_TYPE_ETH	1
+#define	BPF_PCAP_TYPE_IP	12
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 3fb5075..a33ed24 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3440,7 +3440,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
 		if (func_id != BPF_FUNC_perf_event_read &&
 		    func_id != BPF_FUNC_perf_event_output &&
-		    func_id != BPF_FUNC_perf_event_read_value)
+		    func_id != BPF_FUNC_perf_event_read_value &&
+		    func_id != BPF_FUNC_pcap)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_STACK_TRACE:
@@ -3527,6 +3528,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 	case BPF_FUNC_perf_event_read:
 	case BPF_FUNC_perf_event_output:
 	case BPF_FUNC_perf_event_read_value:
+	case BPF_FUNC_pcap:
 		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
 			goto error;
 		break;
diff --git a/net/core/filter.c b/net/core/filter.c
index ed65636..e0e23ee 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4158,6 +4158,35 @@ static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
 	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
+BPF_CALL_5(bpf_xdp_pcap, struct xdp_buff *, xdp, u32, size,
+	   struct bpf_map *, map, int, protocol, u64, flags)
+{
+	unsigned long len = (unsigned long)(xdp->data_end - xdp->data);
+	struct bpf_pcap_hdr pcap;
+	int ret;
+
+	if (unlikely(flags & ~BPF_F_PCAP_ID_MASK))
+		return -EINVAL;	/* only the id bits may be set in flags */
+	/* size is the requested capture limit, len the full packet length. */
+	ret = bpf_pcap_prepare(protocol, size, len, flags, &pcap);
+	if (ret)
+		return ret;
+	/* Emit the perf event: pcap header plus pcap.cap_len bytes of packet. */
+	return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
+				xdp->data, pcap.cap_len, bpf_xdp_copy);
+}
+
+static const struct bpf_func_proto bpf_xdp_pcap_proto = {
+	.func		= bpf_xdp_pcap,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,	/* xdp */
+	.arg2_type	= ARG_ANYTHING,		/* size */
+	.arg3_type	= ARG_CONST_MAP_PTR,	/* map */
+	.arg4_type	= ARG_ANYTHING,		/* protocol */
+	.arg5_type	= ARG_ANYTHING,		/* flags */
+};
+
 BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
 {
 	return skb->sk ? sock_gen_cookie(skb->sk) : 0;
@@ -5926,6 +5955,34 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
 
 #endif /* CONFIG_INET */
 
+BPF_CALL_5(bpf_skb_pcap, struct sk_buff *, skb, u32, size,
+	   struct bpf_map *, map, int, protocol, u64, flags)
+{
+	struct bpf_pcap_hdr pcap;
+	int ret;
+
+	if (unlikely(flags & ~BPF_F_PCAP_ID_MASK))
+		return -EINVAL;	/* only the id bits may be set in flags */
+	/* size is the requested capture limit, skb->len the full length. */
+	ret = bpf_pcap_prepare(protocol, size, skb->len, flags, &pcap);
+	if (ret)
+		return ret;
+	/* Emit the perf event: pcap header plus pcap.cap_len bytes of the skb. */
+	return bpf_event_output(map, BPF_F_CURRENT_CPU, &pcap, sizeof(pcap),
+				skb, pcap.cap_len, bpf_skb_copy);
+}
+
+static const struct bpf_func_proto bpf_skb_pcap_proto = {
+	.func		= bpf_skb_pcap,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,	/* skb */
+	.arg2_type	= ARG_ANYTHING,		/* size */
+	.arg3_type	= ARG_CONST_MAP_PTR,	/* map */
+	.arg4_type      = ARG_ANYTHING,		/* protocol */
+	.arg5_type	= ARG_ANYTHING,		/* flags */
+};
+
 bool bpf_helper_changes_pkt_data(void *func)
 {
 	if (func == bpf_skb_vlan_push ||
@@ -6075,6 +6132,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 		return &bpf_get_socket_uid_proto;
 	case BPF_FUNC_perf_event_output:
 		return &bpf_skb_event_output_proto;
+	case BPF_FUNC_pcap:
+		return &bpf_skb_pcap_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6216,6 +6275,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	case BPF_FUNC_tcp_gen_syncookie:
 		return &bpf_tcp_gen_syncookie_proto;
 #endif
+	case BPF_FUNC_pcap:
+		return &bpf_skb_pcap_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6256,6 +6317,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 		return &bpf_tcp_check_syncookie_proto;
 	case BPF_FUNC_tcp_gen_syncookie:
 		return &bpf_tcp_gen_syncookie_proto;
+	case BPF_FUNC_pcap:
+		return &bpf_xdp_pcap_proto;
 #endif
 	default:
 		return bpf_base_func_proto(func_id);
@@ -6361,6 +6424,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	case BPF_FUNC_skc_lookup_tcp:
 		return &bpf_skc_lookup_tcp_proto;
 #endif
+	case BPF_FUNC_pcap:
+		return &bpf_skb_pcap_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
@@ -6399,6 +6464,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_skb_under_cgroup:
 		return &bpf_skb_under_cgroup_proto;
+	case BPF_FUNC_pcap:
+		return &bpf_skb_pcap_proto;
 	default:
 		return bpf_base_func_proto(func_id);
 	}
-- 
1.8.3.1


^ permalink raw reply related

* [RFC bpf-next 6/7] bpf: add documentation for bpftool pcap subcommand
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire
In-Reply-To: <1567892444-16344-1-git-send-email-alan.maguire@oracle.com>

Document supported "bpf pcap" subcommands.

"prog" is used to capture packets from already-loaded programs.
"trace" loads/attaches tracing programs to capture packets.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 tools/bpf/bpftool/Documentation/bpftool-btf.rst    |   1 +
 tools/bpf/bpftool/Documentation/bpftool-cgroup.rst |   1 +
 .../bpf/bpftool/Documentation/bpftool-feature.rst  |   1 +
 tools/bpf/bpftool/Documentation/bpftool-map.rst    |   1 +
 tools/bpf/bpftool/Documentation/bpftool-net.rst    |   1 +
 tools/bpf/bpftool/Documentation/bpftool-pcap.rst   | 119 +++++++++++++++++++++
 tools/bpf/bpftool/Documentation/bpftool-perf.rst   |   1 +
 tools/bpf/bpftool/Documentation/bpftool-prog.rst   |   1 +
 tools/bpf/bpftool/Documentation/bpftool.rst        |   1 +
 9 files changed, 127 insertions(+)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-pcap.rst

diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
index 39615f8..54045f0 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst
@@ -235,4 +235,5 @@ SEE ALSO
 	**bpftool-cgroup**\ (8),
 	**bpftool-feature**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
index 06a28b0..1df98e1 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-cgroup.rst
@@ -164,5 +164,6 @@ SEE ALSO
 	**bpftool-map**\ (8),
 	**bpftool-feature**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8),
 	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-feature.rst b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
index 4d08f35..0f36ad8 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-feature.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-feature.rst
@@ -86,5 +86,6 @@ SEE ALSO
 	**bpftool-map**\ (8),
 	**bpftool-cgroup**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8),
 	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-map.rst b/tools/bpf/bpftool/Documentation/bpftool-map.rst
index 1c0f714..8408022 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-map.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-map.rst
@@ -271,5 +271,6 @@ SEE ALSO
 	**bpftool-cgroup**\ (8),
 	**bpftool-feature**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8),
 	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
index 8651b00..6bd24bb 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
@@ -198,5 +198,6 @@ SEE ALSO
 	**bpftool-map**\ (8),
 	**bpftool-cgroup**\ (8),
 	**bpftool-feature**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8),
 	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-pcap.rst b/tools/bpf/bpftool/Documentation/bpftool-pcap.rst
new file mode 100644
index 0000000..53ed226d
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-pcap.rst
@@ -0,0 +1,119 @@
+================
+bpftool-pcap
+================
+-------------------------------------------------------------------------------
+tool for capturing packet data using eBPF programs
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+	**bpftool** [*OPTIONS*] **pcap** *COMMAND*
+
+	*OPTIONS* := { { **-j** | **--json** } [{ **-p** | **--pretty** }] | { **-f** | **--bpffs** } }
+
+	*COMMANDS* :=
+	{ **prog** | **trace** | **help** }
+
+PCAP COMMANDS
+=============
+
+|	**bpftool** **pcap** **prog** *PROG* [{**data_out** *FILE* | **proto** *PROTOCOL* | **len** *MAXLEN* | **pages** *NUMPAGES*}]
+|	**bpftool** **pcap** **trace** [*OBJ*] *TRACE* [{**data_out** *FILE* | **proto** *PROTOCOL* | **len** *MAXLEN* | **dev** *DEV* | **pages** *NUMPAGES*}]
+|	**bpftool** **pcap help**
+|
+|	*PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
+|	*PROTOCOL* := {
+|		**eth** | **ip** | **ieee_80211** | ... }
+|       *TRACE* := {
+|		**kprobe**|**tracepoint**:*probename*[:arg{1-4}] }
+
+
+DESCRIPTION
+===========
+	**bpftool** **pcap** **prog** *PROG* [{**data_out** *FILE* | **proto** *PROTOCOL* | **len** *MAXLEN* | **pages** *NUMPAGES*}]
+
+		  Capture packet data from perf event map associated with
+		  program specified.  By default capture data is displayed on
+		  stdout, but if a capture file is preferred the data_out FILE
+		  option can be used.  The link type (termed DLT_TYPE in
+		  libpcap) is assumed to be Ethernet if not explicitly
+		  specified via the **proto** option.
+
+		  Maximum capture length can be adjusted via the **len**
+		  option.
+
+		  To work with bpftool pcap, the associated BPF program must
+		  at least define a perf event map, but if config options
+		  (protocol, max len) are to be supported it should also
+		  provide an array map with a single value of at least
+		  *struct bpf_pcap_conf* size.
+
+	**bpftool** **pcap** **trace** [*OBJ*] *TRACE* [{**data_out** *FILE* | **proto** *PROTOCOL* | **len** *MAXLEN* | **dev** *DEV* | **pages** *NUMPAGES*}]
+
+		  Attach the specified program in *OBJ* or load a
+		  pre-existing BPF kprobe/tracepoint program capable
+		  of capturing packets.
+
+		  Trace specification is of the form
+
+			trace_type:probe[:arg]
+
+		  For example tracepoint:iwlwifi_dev_tx_tb:arg2 will
+		  capture packet data from the second argument to the
+		  iwlwifi_dev_tx_tb tracepoint.  *DEV* can be used to
+		  limit capture to a specific incoming interface.
+
+	**bpftool pcap help**
+		  Print short help message.
+
+OPTIONS
+=======
+	-h, --help
+		  Print short generic help message (similar to **bpftool help**).
+
+	-V, --version
+		  Print version number (similar to **bpftool version**).
+
+	-j, --json
+		  Generate JSON output. For commands that cannot produce JSON, this
+		  option has no effect.
+
+	-p, --pretty
+		  Generate human-readable JSON output. Implies **-j**.
+
+	-f, --bpffs
+		  When showing BPF programs, show file names of pinned
+		  programs.
+
+	-m, --mapcompat
+		  Allow loading maps with unknown map definitions.
+
+	-n, --nomount
+		  Do not automatically attempt to mount any virtual file system
+		  (such as tracefs or BPF virtual file system) when necessary.
+
+	-d, --debug
+		  Print all logs available, even debug-level information. This
+		  includes logs from libbpf as well as from the verifier, when
+		  attempting to load programs.
+
+EXAMPLES
+========
+**# bpftool pcap trace tracepoint:net_dev_xmit:arg1 proto eth | tcpdump -r -**
+reading from file -, link-type EN10MB (Ethernet)
+00:16:49.150880 IP 10.11.12.13 > 10.11.12.14: ICMP echo reply, id 10519, seq 1, length 64
+
+SEE ALSO
+========
+	**bpf**\ (2),
+	**bpf-helpers**\ (7),
+	**bpftool**\ (8),
+	**bpftool-map**\ (8),
+	**bpftool-cgroup**\ (8),
+	**bpftool-feature**\ (8),
+	**bpftool-net**\ (8),
+	**bpftool-perf**\ (8),
+	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
index e252bd0..d618bbd 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-perf.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -90,4 +90,5 @@ SEE ALSO
 	**bpftool-cgroup**\ (8),
 	**bpftool-feature**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index 7a374b3..b4dd779 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -311,5 +311,6 @@ SEE ALSO
 	**bpftool-cgroup**\ (8),
 	**bpftool-feature**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8),
 	**bpftool-btf**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 6a9c52e..4126246 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -80,5 +80,6 @@ SEE ALSO
 	**bpftool-cgroup**\ (8),
 	**bpftool-feature**\ (8),
 	**bpftool-net**\ (8),
+	**bpftool-pcap**\ (8),
 	**bpftool-perf**\ (8),
 	**bpftool-btf**\ (8)
-- 
1.8.3.1


^ permalink raw reply related

* [RFC bpf-next 5/7] bpf: add pcap support to bpftool
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire
In-Reply-To: <1567892444-16344-1-git-send-email-alan.maguire@oracle.com>

bpftool is enhanced to be able to both capture from existing skb/xdp
programs ("bpftool pcap prog") and to load tracing programs - including
built-in simple kprobe/raw_tracepoint programs.  The end result is
to have a way of dynamically tapping BPF programs, kernel functions
and tracepoints to capture packet data.

"bpftool pcap" support depends on libpcap library and headers presence,
hence the new feature test is used to check for these.

If present, "bpftool pcap" can be used to capture pcap perf event
data from the perf event map associated with a program.  For example,

$ bpftool pcap prog id 32 data_out /tmp/cap

...will capture perf event data from the BPF program with id 32,
storing it in the capture file /tmp/cap.

bpftool looks for a perf event map associated with that program and
then captures packets from it in a loop until Ctrl-C is pressed.
By default stdout is used, so the following also works:

$ bpftool pcap prog id 32 | tcpdump -r -

Configuration can also be passed to pcap programs, provided they
define a single-element BPF_MAP_TYPE_ARRAY with value size of at
least "sizeof(struct bpf_pcap_hdr)".  Options include

 data_out FILE  packet capture file to use (stdout is default)
 proto PROTO    DLT_* type as per libpcap; by specifying a type
                BPF programs can query the map in-kernel and
                capture packets of that type.  A string
                or numeric value can be used.  It is set in the
                bpf_pcap_hdr associated with the configuration
                map as the "protocol" value.
 len MAXLEN     maximum capture length in bytes.  It is set in
                the bpf_pcap_hdr associated with the configuration
                map as the "cap_len" value.
 dev DEVICE     incoming interface.  Tracing will be restricted
                to skbs which have this incoming interface set.
                The flags associated with the bpf_pcap_hdr
                in the configuration map are adjusted to record
                the associated ifindex to limit tracing.

In addition to capturing from existing programs, it is possible
to load provided programs which trace kprobe entry and raw_tracepoints,
making the first four arguments of each available for tracing.
For example

$ bpftool pcap trace kprobe/ip_rcv proto ip | tcpdump -r -

...will load a provided kprobe program, set the configuration options
in its associated map and capture packets which the bpf_pcap()
helper identifies as of type IPv[4,6].  Similarly for tracepoints,

$ bpftool pcap trace tracepoint:net_dev_xmit:arg1 proto eth | tcpdump -r -

In this case we explicitly specify an argument (:arg1), but the
default assumption is the first argument is to be captured.

To achieve the built-in tracing capabilities, two BPF objects need
to be delivered with bpftool - bpftool_pcap_kprobe.o and
bpftool_pcap_tracepoint.o.  These are accordingly added to the
install target for bpftool.  Each contains a separate program for
extracting arg1, arg2, arg3 and arg4.  This may seem wasteful -
why not just have the arg number as a map parameter?  In practice
tracepoints fail to attach with that approach.

A question arises here: if we deliver a kprobe program, won't
it only work for the specific kernel?  Just by dumb luck on my part
the program appears to dodge the kernel version check in libbpf by not
passing an explicit program type at load time.  That said, the
program does not reference any data structures outside of the context
provided (struct pt_regs *), so maybe there's something else going
on too?

Note that a user-defined tracing program can also be passed in,
and that program will be attached to the target probe in a similar
manner.  We first look for programs with "arg[1-4]" in the name
if an argnum is specified, otherwise we fall back to using the
first program.

$ bpftool pcap trace mytraceprog.o tracepoint:net_dev_xmit:arg1 \
	data_out /tmp/cap

bpftool looks for a BPF_MAP_TYPE_ARRAY containing one value of
size >= "struct bpf_pcap_hdr", and assumes that configuration
provided to the program should be set in that map.  This allows
the user to provide a maximum packet length, starting protocol
etc to tracing programs.

The idea behind providing packet capture/tracing functionality in
bpftool is to simplify developer access to dynamic packet capture.
An alternative approach would be to provide libbpf interfaces, but
this would require linking libbpf with libpcap.

A possible approach would be to take the code from bpftool that
interacts with programs (to retrieve pcap-related maps and
set config) and move it to libbpf, but it may make sense to
start with the functionality in bpftool and see if other
consumers need/want it.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 tools/bpf/bpftool/Makefile                        |  39 +-
 tools/bpf/bpftool/main.c                          |   3 +-
 tools/bpf/bpftool/main.h                          |   1 +
 tools/bpf/bpftool/pcap.c                          | 496 ++++++++++++++++++++++
 tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c     |  80 ++++
 tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c |  68 +++
 tools/testing/selftests/bpf/bpf_helpers.h         |  11 +
 7 files changed, 690 insertions(+), 8 deletions(-)
 create mode 100644 tools/bpf/bpftool/pcap.c
 create mode 100644 tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c
 create mode 100644 tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 39bc6f0..16c4104 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 include ../../scripts/Makefile.include
+include ../../scripts/Makefile.arch
 include ../../scripts/utilities.mak
 
 ifeq ($(srctree),)
@@ -61,8 +62,8 @@ INSTALL ?= install
 RM ?= rm -f
 
 FEATURE_USER = .bpftool
-FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib
-FEATURE_DISPLAY = libbfd disassembler-four-args zlib
+FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib libpcap
+FEATURE_DISPLAY = libbfd disassembler-four-args zlib libpcap
 
 check_feat := 1
 NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall
@@ -90,7 +91,14 @@ endif
 
 include $(wildcard $(OUTPUT)*.d)
 
-all: $(OUTPUT)bpftool
+ifeq ($(feature-libpcap),1)
+  LIBS += -lpcap
+  CFLAGS += -DHAVE_LIBPCAP_SUPPORT
+  BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
+  BPF_SRCS = $(wildcard progs/*.c)
+endif
+
+all: $(OUTPUT)bpftool $(OUTPUT)$(BPF_OBJ_FILES)
 
 BFD_SRCS = jit_disasm.c
 
@@ -109,6 +117,18 @@ CFLAGS += -DHAVE_LIBBFD_SUPPORT
 SRCS += $(BFD_SRCS)
 endif
 
+CLANG           ?= clang
+LLC             ?= llc
+
+CLANG_SYS_INCLUDES := $(shell $(CLANG) -v -E - </dev/null 2>&1 \
+	| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }')
+
+CLANG_FLAGS = -I. -I$(srctree)/tools/include/uapi \
+	      -I$(srctree)/tools/testing/selftests/bpf \
+	      $(CLANG_SYS_INCLUDES) \
+	      -Wno-compare-distinct-pointer-types \
+	      -D__TARGET_ARCH_$(SRCARCH)
+
 OBJS = $(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o
 
 $(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c
@@ -122,24 +142,29 @@ $(OUTPUT)bpftool: $(OBJS) $(LIBBPF)
 $(OUTPUT)%.o: %.c
 	$(QUIET_CC)$(COMPILE.c) -MMD -o $@ $<
 
+$(OUTPUT)$(BPF_OBJ_FILES): $(BPF_SRCS)
+	($(CLANG) $(CLANG_FLAGS) -O2 -target bpf -emit-llvm \
+		-c $(patsubst %.o,progs/%.c,$@) -o - || echo "clang failed") | \
+	$(LLC) -march=bpf -mcpu=$(CPU) $(LLC_FLAGS) -filetype=obj -o $@
+
 clean: $(LIBBPF)-clean
 	$(call QUIET_CLEAN, bpftool)
-	$(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
+	$(Q)$(RM) -- $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d $(OUTPUT)$(BPF_OBJ_FILES)
 	$(Q)$(RM) -r -- $(OUTPUT)libbpf/
 	$(call QUIET_CLEAN, core-gen)
 	$(Q)$(RM) -- $(OUTPUT)FEATURE-DUMP.bpftool
 	$(Q)$(RM) -r -- $(OUTPUT)feature/
 
-install: $(OUTPUT)bpftool
+install: $(OUTPUT)bpftool $(OUTPUT)$(BPF_OBJ_FILES)
 	$(call QUIET_INSTALL, bpftool)
 	$(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(prefix)/sbin
-	$(Q)$(INSTALL) $(OUTPUT)bpftool $(DESTDIR)$(prefix)/sbin/bpftool
+	$(Q)$(INSTALL) $(OUTPUT)bpftool $(OUTPUT)$(BPF_OBJ_FILES) $(DESTDIR)$(prefix)/sbin/
 	$(Q)$(INSTALL) -m 0755 -d $(DESTDIR)$(bash_compdir)
 	$(Q)$(INSTALL) -m 0644 bash-completion/bpftool $(DESTDIR)$(bash_compdir)
 
 uninstall:
 	$(call QUIET_UNINST, bpftool)
-	$(Q)$(RM) -- $(DESTDIR)$(prefix)/sbin/bpftool
+	$(Q)$(RM) -- $(DESTDIR)$(prefix)/sbin/bpftool*
 	$(Q)$(RM) -- $(DESTDIR)$(bash_compdir)/bpftool
 
 doc:
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 93d0086..e7c7969 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -58,7 +58,7 @@ static int do_help(int argc, char **argv)
 		"       %s batch file FILE\n"
 		"       %s version\n"
 		"\n"
-		"       OBJECT := { prog | map | cgroup | perf | net | feature | btf }\n"
+		"       OBJECT := { prog | map | cgroup | perf | net | feature | btf | pcap }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, bin_name, bin_name);
@@ -227,6 +227,7 @@ static int make_args(char *line, char *n_argv[], int maxargs, int cmd_nb)
 	{ "net",	do_net },
 	{ "feature",	do_feature },
 	{ "btf",	do_btf },
+	{ "pcap",	do_pcap },
 	{ "version",	do_version },
 	{ 0 }
 };
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index af9ad56..079409c 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -155,6 +155,7 @@ int cmd_select(const struct cmd *cmds, int argc, char **argv,
 int do_tracelog(int argc, char **arg);
 int do_feature(int argc, char **argv);
 int do_btf(int argc, char **argv);
+int do_pcap(int argc, char **argv);
 
 int parse_u32_arg(int *argc, char ***argv, __u32 *val, const char *what);
 int prog_parse_fd(int *argc, char ***argv);
diff --git a/tools/bpf/bpftool/pcap.c b/tools/bpf/bpftool/pcap.c
new file mode 100644
index 0000000..ab18d1f
--- /dev/null
+++ b/tools/bpf/bpftool/pcap.c
@@ -0,0 +1,496 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <assert.h>
+#include <bpf.h>
+#include <errno.h>
+#include <libbpf.h>
+#include <libgen.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/perf_event.h>
+#include <linux/sysinfo.h>
+#include <net/if.h>
+#include <signal.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/sysinfo.h>
+#include <time.h>
+
+#include "json_writer.h"
+#include "main.h"
+
+#ifdef HAVE_LIBPCAP_SUPPORT
+
+/* To avoid conflicting definitions of bpf_insn */
+#define PCAP_DONT_INCLUDE_PCAP_BPF_H
+#include <pcap.h>
+
+#include <perf-sys.h>
+
+#define	PCAP_MAX_MAPS	8
+#define	PCAP_PROTOCOL_DEFAULT	BPF_PCAP_TYPE_ETH
+#define PCAP_NUM_PAGES_DEFAULT	16
+#define PCAP_NUM_PAGES_MIN	8
+#define PCAP_MAX_LEN		65536
+#define	PCAP_FILE_STDOUT	"-"
+#define	PCAP_FILE_DEFAULT	PCAP_FILE_STDOUT
+#define NANOSEC			1000000000
+
+
+pcap_dumper_t *pcap_dumper;
+static bool flush = true;
+volatile bool stop;
+
+static __u64 boottime;		/* seconds since epoch at boot time. */
+
+/* Translate a protocol argument into a link-layer type value.
+ *
+ * "eth" and "ip" are shorthand for BPF_PCAP_TYPE_ETH and BPF_PCAP_TYPE_IP;
+ * anything else is tried first as a libpcap datalink name and then as a
+ * non-negative decimal number.
+ *
+ * Returns the protocol value, or -1 (converted to unsigned int) when
+ * proto_str is neither a recognized name nor a valid number.
+ */
+static unsigned int proto_from_str(const char *proto_str)
+{
+	char *end;
+	long val;
+	int proto;
+
+	/* Names for DLT_ ethernet (en10mb) and IP (raw) aren't obvious. */
+	if (strcmp(proto_str, "eth") == 0)
+		return BPF_PCAP_TYPE_ETH;
+	if (strcmp(proto_str, "ip") == 0)
+		return BPF_PCAP_TYPE_IP;
+
+	proto = pcap_datalink_name_to_val(proto_str);
+	if (proto != PCAP_ERROR)
+		return proto;
+
+	/* strtol() only sets errno on failure, so a stale value must be
+	 * cleared first; the end pointer catches input that is not a number
+	 * at all, which would otherwise be accepted as protocol 0.
+	 */
+	errno = 0;
+	val = strtol(proto_str, &end, 10);
+	if (errno || end == proto_str || *end != '\0' ||
+	    val < 0 || val != (int)val)
+		return -1;
+
+	return val;
+}
+
+/* Check that the map behind map_fd has the expected properties.
+ *
+ * map_type must match exactly.  num_entries and min_value_size are only
+ * checked when non-zero: max_entries must equal num_entries and the value
+ * size must be at least min_value_size.
+ *
+ * Returns 0 if the map matches, -1 if it does not or if its info could
+ * not be retrieved.
+ */
+static int verify_map(int map_fd, enum bpf_map_type map_type,
+		      __u32 num_entries, __u32 min_value_size)
+{
+	/* Zero the structure before handing it to the kernel rather than
+	 * passing uninitialized stack contents.
+	 */
+	struct bpf_map_info info = {};
+	__u32 info_len = sizeof(info);
+
+	if (bpf_obj_get_info_by_fd(map_fd, &info, &info_len))
+		return -1;
+	if (info.type != map_type)
+		return -1;
+	if (num_entries && info.max_entries != num_entries)
+		return -1;
+	if (min_value_size && info.value_size < min_value_size)
+		return -1;
+	return 0;
+}
+
+/* Signal handler: ask the capture loop in handle_pcap() to terminate by
+ * setting the global stop flag it polls between perf buffer reads.
+ */
+static void int_exit(int signo)
+{
+	stop = true;
+}
+
+/* perf buffer sample callback: write one captured packet to the pcap file.
+ *
+ * ctx is the user-supplied configuration (struct bpf_pcap_hdr) and data
+ * the raw sample, which starts with a struct bpf_pcap_hdr followed by
+ * the packet bytes at hdr->data.  Samples without the expected magic,
+ * or whose protocol does not match a configured protocol, are ignored.
+ */
+static void handle_pcap_event(void *ctx, int cpu, void *data, __u32 size)
+{
+	const size_t hdr_len = offsetof(struct bpf_pcap_hdr, data);
+	struct bpf_pcap_hdr *conf = ctx;
+	struct bpf_pcap_hdr *hdr = data;
+	struct pcap_pkthdr caphdr;
+
+	/* Too short to even hold the metadata; not a capture event. */
+	if (size < hdr_len)
+		return;
+
+	if (hdr->magic != BPF_PCAP_MAGIC)
+		return;
+
+	/* If we are looking for a specific protocol, and this isn't a
+	 * match, ignore.
+	 */
+	if (conf->protocol != BPF_PCAP_TYPE_UNSET &&
+	    conf->protocol != hdr->protocol)
+		return;
+
+	caphdr.len = hdr->tot_len;
+	caphdr.caplen = hdr->cap_len;
+	/* Never let pcap_dump() read past the end of the sample. */
+	if (caphdr.caplen > size - hdr_len)
+		caphdr.caplen = size - hdr_len;
+	/* ktime_ns is relative to boot; boottime converts it to wall time. */
+	caphdr.ts.tv_sec = boottime + (hdr->ktime_ns/NANOSEC);
+	caphdr.ts.tv_usec = (hdr->ktime_ns % NANOSEC) / 1000;
+
+	pcap_dump((u_char *)pcap_dumper, &caphdr, hdr->data);
+	if (flush)
+		pcap_dump_flush(pcap_dumper);
+}
+
+/* Run the capture: open the pcap output, set up a perf buffer on the data
+ * map and poll it until a termination signal sets the stop flag.
+ *
+ * This function takes ownership of data_map_fd, conf_map_fd (if >= 0) and
+ * trace_link (if non-NULL); all of them are released before returning,
+ * on both the success and the error paths.
+ *
+ * Returns 0 when the capture loop ended, 1 if signal handlers could not
+ * be installed and -1 on any other setup failure.
+ */
+static int handle_pcap(int data_map_fd, int conf_map_fd,
+		       struct bpf_pcap_hdr *conf, const char *pcap_filename,
+		       int pages, struct bpf_link *trace_link)
+{
+	struct perf_buffer_opts pb_opts = {};
+	struct perf_buffer *pb = NULL;
+	struct sysinfo info;
+	pcap_t *pcap = NULL;
+	int ret = -1;
+	long err;
+
+	/* signal() returns the previous disposition, which is non-zero when
+	 * e.g. SIGHUP was being ignored; only SIG_ERR indicates failure.
+	 */
+	if (signal(SIGHUP, int_exit) == SIG_ERR ||
+	    signal(SIGTERM, int_exit) == SIG_ERR) {
+		perror("signal");
+		ret = 1;
+		goto out;
+	}
+	(void) signal(SIGINT, int_exit);
+
+	/* pcap expects time since epoch and bpf_pcap() records nanoseconds
+	 * since boot; get time of boot to add to pcap time to give a (rough)
+	 * time since epoch for capture event.
+	 */
+	if (sysinfo(&info) == 0)
+		boottime = time(NULL) - info.uptime;
+
+	pcap = pcap_open_dead(conf->protocol, conf->cap_len ?
+					      conf->cap_len : PCAP_MAX_LEN);
+	if (!pcap) {
+		perror("pcap_open");
+		goto out;
+	}
+	pcap_dumper = pcap_dump_open(pcap, pcap_filename);
+	if (!pcap_dumper) {
+		/* libpcap records the reason in the handle, not in errno. */
+		p_err("could not open capture file %s: %s", pcap_filename,
+		      pcap_geterr(pcap));
+		goto out;
+	}
+
+	pb_opts.sample_cb = handle_pcap_event;
+	pb_opts.ctx = conf;
+	pb = perf_buffer__new(data_map_fd, pages, &pb_opts);
+	err = libbpf_get_error(pb);
+	if (err) {
+		p_err("perf_buffer setup failed: %s (%ld)",
+		      strerror(-err), err);
+		pb = NULL;
+		goto out;
+	}
+
+	while (!stop) {
+		err = perf_buffer__poll(pb, 1000);
+		if (err < 0 && err != -EINTR) {
+			/* err is a negative errno; strerror() wants it > 0. */
+			p_err("perf buffer polling failed: %s (%ld)",
+			      strerror(-err), err);
+			break;
+		}
+	}
+	ret = 0;
+
+out:
+	/* detach program if we attached one. */
+	if (trace_link)
+		bpf_link__destroy(trace_link);
+	if (pb)
+		perf_buffer__free(pb);
+	close(data_map_fd);
+	if (conf_map_fd >= 0)
+		close(conf_map_fd);
+	if (pcap_dumper) {
+		pcap_dump_flush(pcap_dumper);
+		pcap_dump_close(pcap_dumper);
+		pcap_dumper = NULL;
+	}
+	if (pcap)
+		pcap_close(pcap);
+
+	return ret;
+}
+
+/* Parse str as a non-negative decimal integer that fits in an int.
+ * On bad input an error naming the option (what) is reported and -1 is
+ * returned; otherwise *val is set and 0 is returned.
+ */
+static int pcap_parse_count(const char *str, const char *what, int *val)
+{
+	char *end;
+	long num;
+
+	errno = 0;
+	num = strtol(str, &end, 10);
+	if (errno || end == str || *end != '\0' || num < 0 ||
+	    num != (int)num) {
+		p_err("invalid %s value '%s'", what, str);
+		return -1;
+	}
+	*val = num;
+	return 0;
+}
+
+/* Parse the "KEYWORD VALUE" option pairs that follow the program or probe
+ * specification: data_out, proto, len, dev and pages.
+ *
+ * proto, len and dev are stored in *conf; data_out and pages are returned
+ * through *pcap_filename and *pages.  Choosing an output file other than
+ * stdout also turns off per-packet flushing.
+ *
+ * Returns 1 if any option that must be passed to the BPF program via the
+ * config map was given (proto, len, dev), 0 if none was, -1 on error.
+ */
+static int handle_opts(int argc, char **argv,
+		       struct bpf_pcap_hdr *conf,
+		       const char **pcap_filename, int *pages)
+{
+	int conf_used = 0;
+
+	while (argc) {
+		if (!REQ_ARGS(2))
+			return -1;
+
+		if (is_prefix(*argv, "data_out")) {
+			NEXT_ARG();
+			*pcap_filename = *argv;
+			/* no need to flush to capture file if not stdout */
+			if (strcmp(*pcap_filename, PCAP_FILE_STDOUT) != 0)
+				flush = false;
+			NEXT_ARG();
+		} else if (is_prefix(*argv, "proto")) {
+			NEXT_ARG();
+			conf->protocol = proto_from_str(*argv);
+			if (conf->protocol == -1) {
+				p_err("unrecognized protocol %s", *argv);
+				return -1;
+			}
+			conf_used = 1;
+			NEXT_ARG();
+		} else if (is_prefix(*argv, "len")) {
+			int len;
+
+			NEXT_ARG();
+			/* atoi() would silently turn garbage into 0 and a
+			 * negative value into a huge capture length.
+			 */
+			if (pcap_parse_count(*argv, "len", &len))
+				return -1;
+			conf->cap_len = len;
+			conf_used = 1;
+			NEXT_ARG();
+		} else if (is_prefix(*argv, "dev")) {
+			unsigned long iifindex;
+
+			NEXT_ARG();
+			iifindex = if_nametoindex(*argv);
+			if (!iifindex) {
+				p_err("no such device %s", *argv);
+				return -1;
+			}
+			conf->flags |= (BPF_F_PCAP_ID_IIFINDEX |
+					(iifindex & BPF_F_PCAP_ID_MASK));
+			conf_used = 1;
+			NEXT_ARG();
+		} else if (is_prefix(*argv, "pages")) {
+			NEXT_ARG();
+			if (pcap_parse_count(*argv, "pages", pages))
+				return -1;
+			if (*pages < PCAP_NUM_PAGES_MIN) {
+				p_err("at least %d pages are required",
+				      PCAP_NUM_PAGES_MIN);
+				return -1;
+			}
+			NEXT_ARG();
+		} else {
+			p_err("unknown arg %s", *argv);
+			return -1;
+		}
+	}
+	return conf_used;
+}
+
+/* Store the capture configuration under key 0 of the program's config
+ * map.  Returns 0 on success; reports an error and returns -1 otherwise.
+ */
+static int handle_conf_map(int conf_map_fd, struct bpf_pcap_hdr *conf)
+{
+	int slot = 0;
+	int err;
+
+	err = bpf_map_update_elem(conf_map_fd, &slot, conf, BPF_ANY);
+	if (!err)
+		return 0;
+
+	p_err("could not populate config in map");
+	return -1;
+}
+
+/* Retrieve the type of the program behind prog_fd and locate the maps it
+ * uses for packet capture.
+ *
+ * *data_map_fd receives an fd for the program's perf event array and
+ * *conf_map_fd an fd for a single-entry array map whose value is large
+ * enough for a struct bpf_pcap_hdr; either is -1 if not found.  If several
+ * maps match, the last one wins.  The caller owns the returned fds.
+ *
+ * For the prog specified, the conf map is optional but the data map must
+ * be present to facilitate capture.
+ *
+ * Returns 0 on success, -1 (with both fds closed and set to -1) on error.
+ */
+static int prog_info(int prog_fd, enum bpf_prog_type *type,
+		     int *data_map_fd, int *conf_map_fd)
+{
+	__u32 info_len = sizeof(struct bpf_prog_info);
+	struct bpf_prog_info prog_info;
+	__u32 map_ids[PCAP_MAX_MAPS];
+	__u32 nr_maps;
+	int map_fd;
+	__u32 i;
+
+	*data_map_fd = -1;
+	*conf_map_fd = -1;
+
+	/* Find data and (optionally) conf map associated with program. */
+	memset(&prog_info, 0, sizeof(prog_info));
+	prog_info.nr_map_ids = PCAP_MAX_MAPS;
+	prog_info.map_ids =  ptr_to_u64(map_ids);
+	if (bpf_obj_get_info_by_fd(prog_fd, &prog_info, &info_len) < 0) {
+		p_err("could not retrieve info for program");
+		return -1;
+	}
+	*type = prog_info.type;
+
+	/* nr_map_ids is written back by the kernel and is not guaranteed to
+	 * stay within the array we supplied; never index past map_ids[].
+	 */
+	nr_maps = prog_info.nr_map_ids;
+	if (nr_maps > PCAP_MAX_MAPS)
+		nr_maps = PCAP_MAX_MAPS;
+
+	for (i = 0; i < nr_maps; i++) {
+		map_fd = bpf_map_get_fd_by_id(map_ids[i]);
+		if (map_fd < 0)
+			continue;
+
+		if (!verify_map(map_fd,
+				BPF_MAP_TYPE_PERF_EVENT_ARRAY, 0, 0)) {
+			if (*data_map_fd >= 0)
+				close(*data_map_fd);
+			*data_map_fd = map_fd;
+			continue;
+		}
+		if (!verify_map(map_fd,
+				BPF_MAP_TYPE_ARRAY, 1,
+				sizeof(struct bpf_pcap_hdr))) {
+			if (*conf_map_fd >= 0)
+				close(*conf_map_fd);
+			*conf_map_fd = map_fd;
+			continue;
+		}
+		/* Not a map we are interested in; do not leak its fd. */
+		close(map_fd);
+	}
+
+	if (*data_map_fd == -1) {
+		p_err("no perf event map associated with program");
+		if (*conf_map_fd >= 0) {
+			close(*conf_map_fd);
+			*conf_map_fd = -1;
+		}
+		return -1;
+	}
+	return 0;
+}
+
+/* Attach prog to the probe named by trace, according to the program type:
+ * a kprobe (entry, not return) for BPF_PROG_TYPE_KPROBE, a raw tracepoint
+ * for BPF_PROG_TYPE_RAW_TRACEPOINT.
+ *
+ * Returns the resulting link, or an ERR_PTR() on failure.  An unsupported
+ * program type is reported as ERR_PTR(-EINVAL) rather than NULL, because
+ * the caller detects failure with IS_ERR() and would otherwise carry on
+ * with nothing attached.
+ */
+static struct bpf_link *trace_attach(int prog_fd, enum bpf_prog_type prog_type,
+			struct bpf_program *prog, const char *trace)
+{
+	switch (prog_type) {
+	case BPF_PROG_TYPE_KPROBE:
+		return bpf_program__attach_kprobe(prog, false, trace);
+
+	case BPF_PROG_TYPE_RAW_TRACEPOINT:
+		return bpf_program__attach_raw_tracepoint(prog, trace);
+	default:
+		p_err("unexpected type; kprobes, raw tracepoints supported");
+		return ERR_PTR(-EINVAL);
+	}
+}
+
+/* Common tail of "pcap prog" and "pcap trace": find the program's capture
+ * maps, parse the remaining options, push configuration to the program,
+ * optionally attach it to the probe named by trace, and run the capture.
+ *
+ * prog and trace are either both set (trace mode) or both NULL.
+ *
+ * Returns the result of handle_pcap(), or -1 on any earlier failure.
+ */
+static int do_pcap_common(int argc, char **argv, int prog_fd,
+			  struct bpf_program *prog, char *trace)
+{
+	struct bpf_pcap_hdr conf = { .protocol = PCAP_PROTOCOL_DEFAULT,
+				     .cap_len = 0,
+				     .flags = 0 };
+	const char *pcap_filename = PCAP_FILE_DEFAULT;
+	int data_map_fd = -1, conf_map_fd = -1;
+	int pages = PCAP_NUM_PAGES_DEFAULT;
+	struct bpf_link *trace_link = NULL;
+	enum bpf_prog_type prog_type;
+	int conf_used;
+
+	if (prog_info(prog_fd, &prog_type, &data_map_fd, &conf_map_fd) < 0)
+		goto err_close;
+
+	conf_used = handle_opts(argc, argv, &conf, &pcap_filename, &pages);
+	if (conf_used < 0)
+		goto err_close;
+	/* Options that the BPF program must see need somewhere to live. */
+	if (conf_used && conf_map_fd < 0) {
+		p_err("no single-element BPF array map to store configuration found");
+		goto err_close;
+	}
+
+	set_max_rlimit();
+
+	if (conf_map_fd >= 0 && handle_conf_map(conf_map_fd, &conf) < 0)
+		goto err_close;
+
+	if (trace && !prog) {
+		p_err("to specify trace option, '%s pcap load' must be used",
+		      bin_name);
+		goto err_close;
+	}
+	if (trace) {
+		trace_link = trace_attach(prog_fd, prog_type, prog, trace);
+		/* Treat NULL as failure too: capturing with nothing
+		 * attached would just wait forever for events.
+		 */
+		if (!trace_link || IS_ERR(trace_link))
+			goto err_close;
+	}
+
+	/* handle_pcap() is handed the map fds and the link from here on. */
+	return handle_pcap(data_map_fd, conf_map_fd, &conf, pcap_filename,
+			   pages, trace_link);
+
+err_close:
+	if (data_map_fd >= 0)
+		close(data_map_fd);
+	if (conf_map_fd >= 0)
+		close(conf_map_fd);
+	return -1;
+}
+
+/* "bpftool pcap trace [OBJ] {kprobe|tracepoint}:PROBENAME[:argN] [OPTS]"
+ *
+ * Loads either the user-supplied object file (first argument ending in
+ * ".o") or the builtin object for the probe type, which is expected next
+ * to the running binary as "<binary>_pcap_<type>.o".  The program whose
+ * section title contains "argN" is selected, falling back to the first
+ * program in the object, and handed to do_pcap_common() for attachment.
+ *
+ * Returns the result of do_pcap_common(), or -1 on earlier failure.
+ */
+static int do_pcap_trace(int argc, char **argv)
+{
+	char trace_prog[PATH_MAX], trace[PATH_MAX], trace_type[PATH_MAX];
+	struct bpf_program *prog;
+	struct bpf_object *obj;
+	int trace_argnum = 1;
+	char argstr[8];
+	size_t arglen;
+	int prog_fd;
+	int ret;
+
+	if (!REQ_ARGS(1))
+		return -1;
+
+	trace_prog[0] = '\0';
+
+	/* Optional trace program; if not specified we load a builtin program
+	 * based on probe prefix (kprobe|tracepoint).  The length check keeps
+	 * the suffix comparison inside the string for 0/1-char arguments.
+	 */
+	arglen = strlen(*argv);
+	if (arglen >= 2 && strcmp(*argv + arglen - 2, ".o") == 0) {
+		if (arglen >= sizeof(trace_prog)) {
+			p_err("object file path too long");
+			return -1;
+		}
+		memcpy(trace_prog, *argv, arglen + 1);
+		if (!REQ_ARGS(2))
+			return -1;
+		NEXT_ARG();
+	}
+
+	/* The %[^:] conversions below are unbounded; an argument shorter
+	 * than the buffers cannot overflow either of them.
+	 */
+	if (strlen(*argv) >= sizeof(trace)) {
+		p_err("probe specification too long");
+		return -1;
+	}
+	/* The ":argN" suffix is optional, so two conversions suffice. */
+	if (sscanf(*argv, "%[^:]:%[^:]:arg%d", trace_type, trace,
+		   &trace_argnum) < 2) {
+		p_err("expected '[kprobe|tracepoint]:PROBENAME[:arg[1-4]]'");
+		return -1;
+	}
+	if (strcmp(trace_type, "kprobe") != 0 &&
+	    strcmp(trace_type, "tracepoint") != 0) {
+		p_err("invalid trace type %s, expected '[kprobe|tracepoint]:PROBENAME[:arg[1-4]]'",
+		      trace_type);
+		return -1;
+	}
+	if (trace_argnum < 1 || trace_argnum > 4) {
+		p_err("'arg%d' invalid, expected '[kprobe|tracepoint]:PROBENAME[:arg[1-4]]'",
+		      trace_argnum);
+		return -1;
+	}
+	NEXT_ARG();
+
+	if (strlen(trace_prog) == 0) {
+		char bin_path[PATH_MAX];
+		ssize_t len;
+		int n;
+
+		/* derive path of currently-executing command; BPF programs will
+		 * be in the same directory, with suffix based on trace type.
+		 * readlink() does not NUL-terminate, so leave room and do it.
+		 */
+		len = readlink("/proc/self/exe", bin_path,
+			       sizeof(bin_path) - 1);
+		if (len <= 0) {
+			p_err("could not determine path of %s", bin_name);
+			return -1;
+		}
+		bin_path[len] = '\0';
+		n = snprintf(trace_prog, sizeof(trace_prog), "%s_pcap_%s.o",
+			     bin_path, trace_type);
+		if (n < 0 || (size_t)n >= sizeof(trace_prog)) {
+			p_err("path of builtin trace program too long");
+			return -1;
+		}
+	}
+
+	if (bpf_prog_load(trace_prog, BPF_PROG_TYPE_UNSPEC, &obj, &prog_fd) < 0)
+		return -1;
+
+	snprintf(argstr, sizeof(argstr), "arg%d", trace_argnum);
+
+	bpf_object__for_each_program(prog, obj) {
+		if (strstr(bpf_program__title(prog, false), argstr))
+			break;
+	}
+	/* No argnum-specific program, fall back to first program. */
+	if (!prog)
+		prog = bpf_program__next(NULL, obj);
+	if (!prog) {
+		p_err("could not get program");
+		bpf_object__close(obj);
+		return -1;
+	}
+
+	/* bpf_prog_load() hands back the fd of the first program only; use
+	 * the fd of the program actually selected so that its type and maps
+	 * are the ones inspected.
+	 */
+	prog_fd = bpf_program__fd(prog);
+	if (prog_fd < 0) {
+		p_err("could not get program");
+		bpf_object__close(obj);
+		return -1;
+	}
+
+	ret = do_pcap_common(argc, argv, prog_fd, prog, trace);
+	bpf_object__close(obj);
+	return ret;
+}
+
+/* "bpftool pcap prog ...": capture from an already-loaded program, which
+ * is identified by the leading arguments consumed by prog_parse_fd().
+ * No probe is attached in this mode.
+ */
+static int do_pcap_prog(int argc, char **argv)
+{
+	const int fd = prog_parse_fd(&argc, &argv);
+
+	return fd == -1 ? -1 : do_pcap_common(argc, argv, fd, NULL, NULL);
+}
+
+/* Print usage for "bpftool pcap" to stderr, or emit a JSON null when JSON
+ * output was requested.  argv[-2] is used as the name of the object
+ * ("pcap") in the usage lines.  Always returns 0.
+ */
+static int do_help(int argc, char **argv)
+{
+	if (json_output) {
+		jsonw_null(json_wtr);
+		return 0;
+	}
+	/* The trace synopsis previously had one closing bracket too many. */
+	fprintf(stderr,
+		"        %s %s prog {id ID | pinned PATH }\n"
+		"              [data_out FILE] [proto PROTOCOL] [len MAXLEN]\n"
+		"              [pages NUMPAGES]\n"
+		"        %s %s trace [OBJ] {kprobe|tracepoint}:probename[:arg[1-4]]\n"
+		"              [data_out FILE] [proto PROTOCOL] [len MAXLEN]\n"
+		"              [dev DEVICE] [pages NUMPAGES]\n"
+		"        %s %s help\n",
+		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2]);
+
+	return 0;
+}
+
+/* Subcommands of "bpftool pcap"; do_pcap() passes this table to
+ * cmd_select().  The all-zero entry terminates the list.
+ */
+static const struct cmd cmds[] = {
+	{ "prog",		do_pcap_prog },
+	{ "trace",		do_pcap_trace },
+	{ "help",		do_help },
+	{ 0 }
+};
+
+#endif /* HAVE_LIBPCAP_SUPPORT */
+
+/* Entry point for the "pcap" object.  With libpcap support compiled in,
+ * dispatch to the subcommand table (do_help is handed to cmd_select() as
+ * the help handler); otherwise report that the feature is unavailable
+ * and return -1.
+ */
+int do_pcap(int argc, char **argv)
+{
+#ifdef HAVE_LIBPCAP_SUPPORT
+	return cmd_select(cmds, argc, argv, do_help);
+#else
+	p_err("pcap support was not compiled into bpftool as libpcap\n"
+	      "and associated headers were not available at build time.\n");
+	return -1;
+#endif /* HAVE_LIBPCAP_SUPPORT */
+}
diff --git a/tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c b/tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c
new file mode 100644
index 0000000..00a945d
--- /dev/null
+++ b/tools/bpf/bpftool/progs/bpftool_pcap_kprobe.c
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <stddef.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+
+#include <bpf_helpers.h>
+
+#define KBUILD_MODNAME "foo"
+
+struct bpf_map_def SEC("maps") pcap_data_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") pcap_conf_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct bpf_pcap_hdr),
+	.max_entries = 1,
+};
+
+/* Capture the packet passed as function argument number argnum (1-4) of
+ * the probed function, using the length, protocol and flags stored in
+ * slot 0 of pcap_conf_map.  Nothing is captured when the configuration
+ * is missing or argnum is out of range.  Always returns 0.
+ */
+static __always_inline int kprobe_pcap(struct pt_regs *ctx, int argnum)
+{
+	struct bpf_pcap_hdr *conf;
+	void *arg;
+	int key = 0;
+
+	conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
+	if (!conf)
+		return 0;
+
+	/* Pick the register holding the requested argument. */
+	if (argnum == 1)
+		arg = (void *)PT_REGS_PARM1(ctx);
+	else if (argnum == 2)
+		arg = (void *)PT_REGS_PARM2(ctx);
+	else if (argnum == 3)
+		arg = (void *)PT_REGS_PARM3(ctx);
+	else if (argnum == 4)
+		arg = (void *)PT_REGS_PARM4(ctx);
+	else
+		return 0;
+
+	bpf_pcap(arg, conf->cap_len, &pcap_data_map, conf->protocol,
+		 conf->flags);
+	return 0;
+}
+
+/* One kprobe entry point per supported argument position (1-4).  Each
+ * simply forwards its argument number to kprobe_pcap(); the "argN" part
+ * of the section name identifies which argument a program captures.
+ */
+SEC("kprobe/pcap_arg1")
+int pcap_arg1(struct pt_regs *ctx)
+{
+	return kprobe_pcap(ctx, 1);
+}
+
+SEC("kprobe/pcap_arg2")
+int pcap_arg2(struct pt_regs *ctx)
+{
+	return kprobe_pcap(ctx, 2);
+}
+
+SEC("kprobe/pcap_arg3")
+int pcap_arg3(struct pt_regs *ctx)
+{
+	return kprobe_pcap(ctx, 3);
+}
+
+SEC("kprobe/pcap_arg4")
+int pcap_arg4(struct pt_regs *ctx)
+{
+	return kprobe_pcap(ctx, 4);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c b/tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c
new file mode 100644
index 0000000..639806a
--- /dev/null
+++ b/tools/bpf/bpftool/progs/bpftool_pcap_tracepoint.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <stddef.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+
+#include <bpf_helpers.h>
+
+#define KBUILD_MODNAME "foo"
+
+struct bpf_map_def SEC("maps") pcap_data_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") pcap_conf_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct bpf_pcap_hdr),
+	.max_entries = 1,
+};
+
+/* Capture the packet found in raw tracepoint argument slot argnum, using
+ * the length, protocol and flags stored in slot 0 of pcap_conf_map.
+ *
+ * argnum is a zero-based index into ctx->args[]; the "argN" entry points
+ * below pass N - 1.  To attach to raw tracepoints, we need one program
+ * for each arg choice 1-4.  Otherwise attach fails.
+ *
+ * Always returns 0; nothing is captured if the configuration is missing.
+ */
+static __always_inline int trace_pcap(struct bpf_raw_tracepoint_args *ctx,
+				      int argnum)
+{
+	struct bpf_pcap_hdr *conf;
+	int key = 0;
+
+	conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
+	if (!conf)
+		return 0;
+
+	bpf_pcap((void *)ctx->args[argnum], conf->cap_len,
+		 &pcap_data_map, conf->protocol, conf->flags);
+	return 0;
+}
+
+/* One raw tracepoint entry point per supported argument position.  The
+ * section names count arguments from 1, while trace_pcap() is given the
+ * corresponding zero-based index into ctx->args[].
+ */
+SEC("raw_tracepoint/pcap_arg1")
+int trace_arg1(struct bpf_raw_tracepoint_args *ctx)
+{
+	return trace_pcap(ctx, 0);
+}
+
+SEC("raw_tracepoint/pcap_arg2")
+int trace_arg2(struct bpf_raw_tracepoint_args *ctx)
+{
+	return trace_pcap(ctx, 1);
+}
+
+SEC("raw_tracepoint/pcap_arg3")
+int trace_arg3(struct bpf_raw_tracepoint_args *ctx)
+{
+	return trace_pcap(ctx, 2);
+}
+
+SEC("raw_tracepoint/pcap_arg4")
+int trace_arg4(struct bpf_raw_tracepoint_args *ctx)
+{
+	return trace_pcap(ctx, 3);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 6c4930b..2a61126 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -231,6 +231,9 @@ static int (*bpf_sk_storage_delete)(void *map, struct bpf_sock *sk) =
 static long long (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *ip,
 					  int ip_len, void *tcp, int tcp_len) =
 	(void *) BPF_FUNC_tcp_gen_syncookie;
+static int (*bpf_pcap)(void *data, __u32 size, void *map, int protocol,
+		       __u64 flags) =
+	(void *) BPF_FUNC_pcap;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
@@ -520,8 +523,16 @@ static int (*bpf_skb_adjust_room)(void *ctx, __s32 len_diff, __u32 mode,
  * actual field offset, based on target kernel BTF type that matches original
  * (local) BTF, used to record relocation.
  */
+#ifdef __builtin_preserve_access_index
 #define BPF_CORE_READ(dst, src)						\
 	bpf_probe_read((dst), sizeof(*(src)),				\
 		       __builtin_preserve_access_index(src))
 
+#else
+
+#define	BPF_CORE_READ(dst, src)						\
+	bpf_probe_read((dst), sizeof(*(src)), src)
+
+#endif /* __builtin_preserve_access_index */
+
 #endif
-- 
1.8.3.1


^ permalink raw reply related

* [RFC bpf-next 7/7] bpf: add tests for bpftool packet capture
From: Alan Maguire @ 2019-09-07 21:40 UTC (permalink / raw)
  To: ast, daniel, kafai, songliubraving, yhs, davem, jakub.kicinski,
	hawk, john.fastabend, rostedt, mingo, quentin.monnet, rdna, joe,
	acme, jolsa, alexey.budankov, gregkh, namhyung, sdf, f.fainelli,
	shuah, peter, ivan, andriin, bhole_prashant_q7, david.calavera,
	danieltimlee, ctakshak, netdev, bpf, linux-kselftest
  Cc: Alan Maguire
In-Reply-To: <1567892444-16344-1-git-send-email-alan.maguire@oracle.com>

add tests which verify packet capture works for tracing of
kprobes and raw tracepoints, and for capturing packets from
existing skb/xdp programs.

Signed-off-by: Alan Maguire <alan.maguire@oracle.com>
---
 tools/testing/selftests/bpf/Makefile               |   3 +-
 .../testing/selftests/bpf/progs/bpftool_pcap_tc.c  |  41 +++++++
 .../testing/selftests/bpf/progs/bpftool_pcap_xdp.c |  39 ++++++
 tools/testing/selftests/bpf/test_bpftool_pcap.sh   | 132 +++++++++++++++++++++
 4 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c
 create mode 100644 tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c
 create mode 100755 tools/testing/selftests/bpf/test_bpftool_pcap.sh

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 7f3196a..1e8b68d 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -66,7 +66,8 @@ TEST_PROGS := test_kmod.sh \
 	test_tc_tunnel.sh \
 	test_tc_edt.sh \
 	test_xdping.sh \
-	test_bpftool_build.sh
+	test_bpftool_build.sh \
+	test_bpftool_pcap.sh
 
 TEST_PROGS_EXTENDED := with_addr.sh \
 	with_tunnels.sh \
diff --git a/tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c b/tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c
new file mode 100644
index 0000000..b51f8fc
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpftool_pcap_tc.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <stddef.h>
+#include <linux/ptrace.h>
+#include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+
+#include <bpf_helpers.h>
+
+#define KBUILD_MODNAME "foo"
+
+struct bpf_map_def SEC("maps") pcap_data_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") pcap_conf_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct bpf_pcap_hdr),
+	.max_entries = 1,
+};
+
+/* tc classifier test program: capture every packet it sees using the
+ * settings in slot 0 of pcap_conf_map, then let the packet continue.
+ * Without a configuration entry nothing is captured.
+ */
+SEC("tc_pcap")
+int tc_pcap(struct __sk_buff *skb)
+{
+	int slot = 0;
+	struct bpf_pcap_hdr *cfg;
+
+	cfg = bpf_map_lookup_elem(&pcap_conf_map, &slot);
+	if (cfg)
+		bpf_pcap(skb, cfg->cap_len, &pcap_data_map, cfg->protocol,
+			 cfg->flags);
+
+	/* TC_ACT_OK is 0, the same value the missing-config path returned. */
+	return TC_ACT_OK;
+}
diff --git a/tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c b/tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c
new file mode 100644
index 0000000..a7d6866
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/bpftool_pcap_xdp.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. */
+
+#include <stddef.h>
+#include <linux/bpf.h>
+
+#include <bpf_helpers.h>
+
+#define KBUILD_MODNAME "foo"
+
+struct bpf_map_def SEC("maps") pcap_data_map = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(int),
+	.max_entries = 1024,
+};
+
+struct bpf_map_def SEC("maps") pcap_conf_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct bpf_pcap_hdr),
+	.max_entries = 1,
+};
+
+/* XDP test program: capture every packet it sees using the settings in
+ * slot 0 of pcap_conf_map.  Capture is purely observational, so the
+ * packet is always passed on to the stack.
+ */
+SEC("xdp_pcap")
+int xdp_pcap(struct xdp_md *ctx)
+{
+	struct bpf_pcap_hdr *conf;
+	int key = 0;
+
+	conf = bpf_map_lookup_elem(&pcap_conf_map, &key);
+	/* A bare 0 here would be XDP_ABORTED, i.e. the packet would be
+	 * dropped just because no configuration is available.
+	 */
+	if (!conf)
+		return XDP_PASS;
+
+	bpf_pcap(ctx, conf->cap_len, &pcap_data_map, conf->protocol,
+		 conf->flags);
+
+	return XDP_PASS;
+}
diff --git a/tools/testing/selftests/bpf/test_bpftool_pcap.sh b/tools/testing/selftests/bpf/test_bpftool_pcap.sh
new file mode 100755
index 0000000..92b5438
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_bpftool_pcap.sh
@@ -0,0 +1,132 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+
+readonly src="../../../../"
+readonly bpftool="${src}/tools/bpf/bpftool/bpftool"
+readonly capfile="/tmp/cap.$$"
+readonly ns="ns-$$"
+readonly badport="5555"
+readonly addr1="192.168.1.1"
+readonly addr2="192.168.1.2"
+readonly pinpath="/sys/fs/bpf/"
+readonly veth1="${ns}-veth1"
+readonly veth2="${ns}-veth2"
+# 24 bytes for the pcap header
+readonly cap_minsize=24
+readonly caplens="0 8192"
+readonly addrs="127.0.0.1 ::1"
+readonly devs="none lo"
+
+cleanup() {  # EXIT trap; runs under set -e, so every step is best-effort
+  iptables -D INPUT -p tcp --dport $badport -j DROP 2>/dev/null || true
+  ip6tables -D INPUT -p tcp --dport $badport -j DROP 2>/dev/null || true
+  ip netns del $ns 2>/dev/null || true
+  rm -f $capfile
+}
+
+verify_capture() {
+  capsize=$(stat -c '%s' $capfile)
+  if [[ $capsize -le $cap_minsize ]]; then
+    exit 1
+  fi
+  if [[ $no_tcpdump == 0 ]]; then
+    count=$(tcpdump -lnr $capfile $1 2>/dev/null)
+    if [[ -z "$count" ]]; then
+      exit 1
+    fi
+  fi
+}
+
+which tcpdump > /dev/null 2>&1  # stdout first, so stderr is silenced too
+no_tcpdump=$?
+
+pcap_supported=$($bpftool pcap help >/dev/null 2>&1)  # probe the bpftool under test
+if [[ $? -ne 0 ]]; then
+	echo "no pcap support in bpftool, cannot test feature."
+	exit 0
+fi
+
+set -e
+
+trap cleanup EXIT
+
+iptables -A INPUT -p tcp --dport $badport -j DROP
+ip6tables -A INPUT -p tcp --dport $badport -j DROP
+
+# Test "bpftool pcap trace" - kprobe, tracepoint tracing
+for probe in kprobe tracepoint; do
+  for dev in $devs; do
+    devarg=
+    if [[ $dev != "none" ]]; then
+      devarg="dev $dev"
+    fi
+    args="$probe:kfree_skb proto ip data_out $capfile $devarg"
+    echo "Test trace $args"
+    for caplen in $caplens ; do
+      for progname in none $probe ; do
+        progpath=
+        if [[ $progname != "none" ]]; then
+          progpath=${bpftool}_pcap_${probe}.o
+        fi
+        allargs="$progpath $args len $caplen"
+        for addr in $addrs ; do
+          $bpftool pcap trace $allargs &
+          bpftool_pid=$!
+          set +e
+          timeout 2 nc $addr $badport 2>/dev/null
+          kill -TERM $bpftool_pid
+          set -e
+          sleep 1
+          verify_capture "host $addr and port $badport"
+          rm -f $capfile
+        done
+      done
+    done
+    echo "Test trace $args: PASS"
+  done
+done
+
+# Test "bpftool pcap prog" - skb, xdp program tracing
+ip netns add $ns
+ip link add dev $veth2 netns $ns type veth peer name $veth1
+ip link set $veth1 up
+ip addr add ${addr1}/24 dev $veth1
+ip -netns $ns link set $veth2 up
+ip netns exec $ns ip addr add ${addr2}/24 dev $veth2
+
+for prog in tc xdp ; do
+  if [[ $prog == tc ]]; then
+    ip netns exec $ns tc qdisc add dev $veth2 clsact
+    ip netns exec $ns tc filter add dev $veth2 ingress bpf da \
+      obj bpftool_pcap_${prog}.o sec ${prog}_pcap
+    id=$(ip netns exec $ns tc filter show dev $veth2 ingress | \
+         awk '/direct-action/ { for(i=1;i<=NF;i++)if($i=="id")print $(i+1)}')
+  else
+    ip netns exec $ns ip link set dev $veth2 xdp obj bpftool_pcap_${prog}.o \
+      sec ${prog}_pcap
+    id=$(ip netns exec $ns ip link show $veth2 | awk '/prog\/xdp/ { print $3 }')
+    sleep 5
+  fi
+  args="id $id data_out $capfile"
+  echo "Test prog $args"
+  for caplen in $caplens ; do
+    allargs="$args len $caplen"
+    $bpftool pcap prog $allargs &
+    bpftool_pid=$!
+    set +e
+    ping -q -c 5 $addr2 1>/dev/null
+    kill -TERM $bpftool_pid
+    set -e
+    sleep 1
+    verify_capture "host $addr1"
+    rm -f $capfile
+  done
+  if [[ $prog == tc ]]; then
+    ip netns exec $ns tc qdisc del dev $veth2 clsact
+    sleep 1
+  else
+    ip netns exec $ns ip link set dev $veth2 xdp off
+  fi
+  echo "Test prog $args: PASS"  # label matches the "Test prog" banner above
+done
-- 
1.8.3.1


^ permalink raw reply related

* Re: [PATCH net-next 2/3] net: dsa: mv88e6xxx: introduce .port_set_policy
From: Marek Behun @ 2019-09-07 23:54 UTC (permalink / raw)
  To: Vivien Didelot; +Cc: netdev, davem, f.fainelli, andrew
In-Reply-To: <20190907200049.25273-3-vivien.didelot@gmail.com>

On Sat,  7 Sep 2019 16:00:48 -0400
Vivien Didelot <vivien.didelot@gmail.com> wrote:

> @@ -3132,6 +3132,7 @@ static const struct mv88e6xxx_ops mv88e6172_ops = {
>  	.port_set_rgmii_delay = mv88e6352_port_set_rgmii_delay,
>  	.port_set_speed = mv88e6352_port_set_speed,
>  	.port_tag_remap = mv88e6095_port_tag_remap,
> +	.port_set_policy = mv88e6352_port_set_policy,
>  	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
>  	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
>  	.port_set_ether_type = mv88e6351_port_set_ether_type,

Topaz also supports this, 6141 and 6341.

^ permalink raw reply

* Re: general protection fault in dev_map_hash_update_elem
From: syzbot @ 2019-09-08  1:59 UTC (permalink / raw)
  To: alexei.starovoitov, ast, bpf, daniel, davem, hawk, jakub.kicinski,
	jbrouer, john.fastabend, kafai, linux-kernel, netdev,
	songliubraving, syzkaller-bugs, toke, yhs
In-Reply-To: <0000000000005091a70591d3e1d9@google.com>

syzbot has found a reproducer for the following crash on:

HEAD commit:    a2c11b03 kcm: use BPF_PROG_RUN
git tree:       bpf-next
console output: https://syzkaller.appspot.com/x/log.txt?x=13d46ec1600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=cf0c85d15c20ade3
dashboard link: https://syzkaller.appspot.com/bug?extid=4e7a85b1432052e8d6f8
compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=1220b2d1600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1360b26e600000

Bisection is inconclusive: the first bad commit could be any of:

116e7dbe Merge branch 'gen-syn-cookie'
91bc3578 selftests/bpf: add test for bpf_tcp_gen_syncookie
637f71c0 selftests/bpf: bpf_tcp_gen_syncookie->bpf_helpers
bf8ff0f8 selftests/bpf: fix clearing buffered output between tests/subtests
3745ee18 bpf: sync bpf.h to tools/
a98bf573 tools: bpftool: add support for reporting the effective cgroup  
progs
70d66244 bpf: add bpf_tcp_gen_syncookie helper
9babe825 bpf: always allocate at least 16 bytes for setsockopt hook
9349d600 tcp: add skb-less helpers to retrieve SYN cookie
fd5ef31f selftests/bpf: extend sockopt_sk selftest with TCP_CONGESTION use  
case
02bc2b64 Merge branch 'setsockopt-extra-mem'
96511278 tcp: tcp_syn_flood_action read port from socket
a78d0dbe selftests/bpf: add loop test 4
d3406913 Merge branch 'devmap_hash'
1375dc4a tools: Add definitions for devmap_hash map type
8c303960 selftests/bpf: add loop test 5
946152b3 selftests/bpf: test_progs: switch to open_memstream
e4234619 tools/libbpf_probes: Add new devmap_hash type
10fbe211 tools/include/uapi: Add devmap_hash BPF map type
66bd2ec1 selftests/bpf: test_progs: test__printf -> printf
16e910d4 selftests/bpf: test_progs: drop extra trailing tab
6f9d451a xdp: Add devmap_hash map type for looking up devices by hashed  
index
682cdbdc Merge branch 'test_progs-stdio'
fca16e51 xdp: Refactor devmap allocation code for reuse
6dbff13c include/bpf.h: Remove map_insert_ctx() stubs
ef20a9b2 libbpf: add helpers for working with BTF types
475e31f8 Merge branch 'revamp-test_progs'
b03bc685 libbpf: convert libbpf code to use new btf helpers
4cedc0da libbpf: add .BTF.ext offset relocation section loading
b207edfe selftests/bpf: convert send_signal.c to use subtests
51436ed7 selftests/bpf: convert bpf_verif_scale.c to sub-tests API
ddc7c304 libbpf: implement BPF CO-RE offset relocation algorithm
2dc26d5a selftests/bpf: add BPF_CORE_READ relocatable read macro
3a516a0a selftests/bpf: add sub-tests support for test_progs
0ff97e56 selftests/bpf: abstract away test log output
df36e621 selftests/bpf: add CO-RE relocs testing setup
002d3afc selftests/bpf: add CO-RE relocs struct flavors tests
329e38f7 selftest/bpf: centralize libbpf logging management for test_progs
e87fd8ba libbpf: return previous print callback from libbpf_set_print
ec6438a9 selftests/bpf: add CO-RE relocs nesting tests
20a9ad2e selftests/bpf: add CO-RE relocs array tests
8160bae2 selftests/bpf: add test selectors by number and name to test_progs
766f2a59 selftests/bpf: revamp test_progs to allow more control
d9db3550 selftests/bpf: add CO-RE relocs enum/ptr/func_proto tests
61098e89 selftests/bpf: prevent headers to be compiled as C code
9654e2ae selftests/bpf: add CO-RE relocs modifiers/typedef tests
943e398d Merge branch 'flow_dissector-input-flags'
d698f9db selftests/bpf: add CO-RE relocs ptr-as-array tests
c1f5e7dd selftests/bpf: add CO-RE relocs ints tests
e853ae77 selftests/bpf: support BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP
29e1c668 selftests/bpf: add CO-RE relocs misc tests
71c99e32 bpf/flow_dissector: support ipv6 flow_label and  
BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL
726e333f Merge branch 'compile-once-run-everywhere'
ae173a91 selftests/bpf: support BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG
57debff2 tools/bpf: sync bpf_flow_keys flags
b7076592 tools/bpf: fix core_reloc.c compilation error
b2ca4e1c bpf/flow_dissector: support flags in BPF_PROG_TEST_RUN
d9973cec xdp: xdp_umem: fix umem pages mapping for 32bits systems
1ac6b126 bpf/flow_dissector: document flags
3783d437 samples/bpf: xdp_fwd rename devmap name to be xdp_tx_ports
086f9568 bpf/flow_dissector: pass input flags to BPF flow dissector program
a32a32cb samples/bpf: make xdp_fwd more practically usable via devmap lookup
03cd1d1a selftests/bpf: Add selftests for bpf_perf_event_output
abcce733 samples/bpf: xdp_fwd explain bpf_fib_lookup return codes
7c4b90d7 bpf: Allow bpf_skb_event_output for a few prog types
9f30cd56 Merge branch 'bpf-xdp-fwd-sample-improvements'
5e31d507 Merge branch 'convert-tests-to-libbpf'
a664a834 tools: bpftool: fix reading from /proc/config.gz
341dfcf8 btf: expose BTF info through sysfs
47da6e4d selftests/bpf: remove perf buffer helpers
c17bec54 samples/bpf: switch trace_output sample to perf_buffer API
d66fa3c7 tools: bpftool: add feature check for zlib
9840a4ff selftests/bpf: fix race in flow dissector tests
f58a4d51 samples/bpf: convert xdp_sample_pkts_user to perf_buffer API
7fd78568 btf: rename /sys/kernel/btf/kernel into /sys/kernel/btf/vmlinux
898ca681 selftests/bpf: switch test_tcpnotify to perf_buffer API
58b80815 selftests/bpf: convert test_get_stack_raw_tp to perf_buffer API
a1916a15 libbpf: attempt to load kernel BTF from sysfs first
72ef80b5 Merge branch 'bpf-libbpf-read-sysfs-btf'
f2a3e4e9 libbpf: provide more helpful message on uninitialized global var
708852dc Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=1130846e600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+4e7a85b1432052e8d6f8@syzkaller.appspotmail.com

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 10210 Comm: syz-executor910 Not tainted 5.3.0-rc7+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
RIP: 0010:__write_once_size include/linux/compiler.h:226 [inline]
RIP: 0010:__hlist_del include/linux/list.h:762 [inline]
RIP: 0010:hlist_del_rcu include/linux/rculist.h:455 [inline]
RIP: 0010:__dev_map_hash_update_elem kernel/bpf/devmap.c:668 [inline]
RIP: 0010:dev_map_hash_update_elem+0x3c8/0x6e0 kernel/bpf/devmap.c:691
Code: 48 89 f1 48 89 75 c8 48 c1 e9 03 80 3c 11 00 0f 85 d3 02 00 00 48 b9  
00 00 00 00 00 fc ff df 48 8b 53 10 48 89 d6 48 c1 ee 03 <80> 3c 0e 00 0f  
85 97 02 00 00 48 85 c0 48 89 02 74 38 48 89 55 b8
RSP: 0018:ffff88808c757c30 EFLAGS: 00010046
RAX: 0000000000000000 RBX: ffff8880a216a980 RCX: dffffc0000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880a216a988
RBP: ffff88808c757c78 R08: 0000000000000004 R09: ffffed10118eaf73
R10: ffffed10118eaf72 R11: 0000000000000003 R12: ffff88808c7fb2c0
R13: ffff88808aa98800 R14: 0000000000000000 R15: ffff88808c7fb3e8
FS:  00007fd4c5528700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fff47596210 CR3: 000000008b442000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  map_update_elem+0xc82/0x10b0 kernel/bpf/syscall.c:966
  __do_sys_bpf+0x8b5/0x3350 kernel/bpf/syscall.c:2854
  __se_sys_bpf kernel/bpf/syscall.c:2825 [inline]
  __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:2825
  do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x446a29
Code: e8 0c e8 ff ff 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 5b 07 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007fd4c5527db8 EFLAGS: 00000246 ORIG_RAX: 0000000000000141
RAX: ffffffffffffffda RBX: 00000000006dbc28 RCX: 0000000000446a29
RDX: 0000000000000020 RSI: 0000000020000180 RDI: 0000000000000002
RBP: 00000000006dbc20 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006dbc2c
R13: 00007fff4759618f R14: 00007fd4c55289c0 R15: 0000000000000000
Modules linked in:
---[ end trace 9a6d00abce3fe1c8 ]---
RIP: 0010:__write_once_size include/linux/compiler.h:226 [inline]
RIP: 0010:__hlist_del include/linux/list.h:762 [inline]
RIP: 0010:hlist_del_rcu include/linux/rculist.h:455 [inline]
RIP: 0010:__dev_map_hash_update_elem kernel/bpf/devmap.c:668 [inline]
RIP: 0010:dev_map_hash_update_elem+0x3c8/0x6e0 kernel/bpf/devmap.c:691
Code: 48 89 f1 48 89 75 c8 48 c1 e9 03 80 3c 11 00 0f 85 d3 02 00 00 48 b9  
00 00 00 00 00 fc ff df 48 8b 53 10 48 89 d6 48 c1 ee 03 <80> 3c 0e 00 0f  
85 97 02 00 00 48 85 c0 48 89 02 74 38 48 89 55 b8
RSP: 0018:ffff88808c757c30 EFLAGS: 00010046
RAX: 0000000000000000 RBX: ffff8880a216a980 RCX: dffffc0000000000
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880a216a988
RBP: ffff88808c757c78 R08: 0000000000000004 R09: ffffed10118eaf73
R10: ffffed10118eaf72 R11: 0000000000000003 R12: ffff88808c7fb2c0
R13: ffff88808aa98800 R14: 0000000000000000 R15: ffff88808c7fb3e8
FS:  00007fd4c5528700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fff47596210 CR3: 000000008b442000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400


^ permalink raw reply

* Re: [PATCH net-next 3/3] net: dsa: mv88e6xxx: add RXNFC support
From: Florian Fainelli @ 2019-09-08  2:48 UTC (permalink / raw)
  To: Vivien Didelot, netdev; +Cc: davem, andrew
In-Reply-To: <20190907200049.25273-4-vivien.didelot@gmail.com>



On 9/7/2019 1:00 PM, Vivien Didelot wrote:
> Implement the .get_rxnfc and .set_rxnfc DSA operations to configure
> a port's Layer 2 Policy Control List (PCL) via ethtool.
> 
> Currently only dropping frames based on MAC Destination or Source
> Address (including the option VLAN parameter) is supported.
> 
> Signed-off-by: Vivien Didelot <vivien.didelot@gmail.com>

For the ethtool interface part:

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
-- 
Florian

^ permalink raw reply

* general protection fault in cbs_destroy
From: syzbot @ 2019-09-08  6:08 UTC (permalink / raw)
  To: davem, jhs, jiri, linux-kernel, netdev, syzkaller-bugs,
	xiyou.wangcong

Hello,

syzbot found the following crash on:

HEAD commit:    3b47fd5c Merge tag 'nfs-for-5.3-4' of git://git.linux-nfs...
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=14854e71600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=144488c6c6c6d2b6
dashboard link: https://syzkaller.appspot.com/bug?extid=3a8d6a998cbb73bcf337
compiler:       clang version 9.0.0 (/home/glider/llvm/clang  
80fee25776c2fb61e74c1ecb1a523375c2500b69)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=17998f9e600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=10421efa600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+3a8d6a998cbb73bcf337@syzkaller.appspotmail.com

8021q: adding VLAN 0 to HW filter on device batadv0
netlink: 24 bytes leftover after parsing attributes in process  
`syz-executor457'.
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 9249 Comm: syz-executor457 Not tainted 5.3.0-rc7+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
RIP: 0010:__list_del_entry_valid+0x6b/0x100 lib/list_debug.c:51
Code: 4c 89 f7 e8 97 d0 58 fe 48 ba 00 01 00 00 00 00 ad de 49 8b 1e 48 39  
d3 74 54 48 83 c2 22 49 39 d7 74 5e 4c 89 f8 48 c1 e8 03 <42> 80 3c 20 00  
74 08 4c 89 ff e8 66 d0 58 fe 49 8b 17 4c 39 f2 75
RSP: 0018:ffff88809898f568 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
RDX: dead000000000122 RSI: 0000000000000004 RDI: ffff88809fb5a7e8
RBP: ffff88809898f588 R08: dffffc0000000000 R09: ffffed1013131ea8
R10: ffffed1013131ea8 R11: 0000000000000000 R12: dffffc0000000000
R13: ffff88809fb5a480 R14: ffff88809fb5a7e0 R15: 0000000000000000
FS:  00005555568cb880(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000610 CR3: 00000000a3968000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  __list_del_entry include/linux/list.h:131 [inline]
  list_del include/linux/list.h:139 [inline]
  cbs_destroy+0x85/0x3e0 net/sched/sch_cbs.c:435
  qdisc_create+0xff8/0x13e0 net/sched/sch_api.c:1302
  tc_modify_qdisc+0x989/0x1ea0 net/sched/sch_api.c:1652
  rtnetlink_rcv_msg+0x889/0xd40 net/core/rtnetlink.c:5223
  netlink_rcv_skb+0x19e/0x3d0 net/netlink/af_netlink.c:2477
  rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:5241
  netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline]
  netlink_unicast+0x787/0x900 net/netlink/af_netlink.c:1328
  netlink_sendmsg+0x993/0xc50 net/netlink/af_netlink.c:1917
  sock_sendmsg_nosec net/socket.c:637 [inline]
  sock_sendmsg net/socket.c:657 [inline]
  ___sys_sendmsg+0x60d/0x910 net/socket.c:2311
  __sys_sendmsg net/socket.c:2356 [inline]
  __do_sys_sendmsg net/socket.c:2365 [inline]
  __se_sys_sendmsg net/socket.c:2363 [inline]
  __x64_sys_sendmsg+0x17c/0x200 net/socket.c:2363
  do_syscall_64+0xfe/0x140 arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x441b59
Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 7b 10 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007ffe29572cf8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000441b59
RDX: 0000000000000000 RSI: 0000000020000240 RDI: 0000000000000003
RBP: 00007ffe29572d10 R08: 0000000001bbbbbb R09: 0000000001bbbbbb
R10: 0000000001bbbbbb R11: 0000000000000246 R12: 0000000000000000
R13: 00000000004030f0 R14: 0000000000000000 R15: 0000000000000000
Modules linked in:
---[ end trace 226030e488aca074 ]---
RIP: 0010:__list_del_entry_valid+0x6b/0x100 lib/list_debug.c:51
Code: 4c 89 f7 e8 97 d0 58 fe 48 ba 00 01 00 00 00 00 ad de 49 8b 1e 48 39  
d3 74 54 48 83 c2 22 49 39 d7 74 5e 4c 89 f8 48 c1 e8 03 <42> 80 3c 20 00  
74 08 4c 89 ff e8 66 d0 58 fe 49 8b 17 4c 39 f2 75
RSP: 0018:ffff88809898f568 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
RDX: dead000000000122 RSI: 0000000000000004 RDI: ffff88809fb5a7e8
RBP: ffff88809898f588 R08: dffffc0000000000 R09: ffffed1013131ea8
R10: ffffed1013131ea8 R11: 0000000000000000 R12: dffffc0000000000
R13: ffff88809fb5a480 R14: ffff88809fb5a7e0 R15: 0000000000000000
FS:  00005555568cb880(0000) GS:ffff8880aea00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000610 CR3: 00000000a3968000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* WARNING in cbs_dequeue_soft
From: syzbot @ 2019-09-08  6:08 UTC (permalink / raw)
  To: davem, jhs, jiri, leandro.maciel.dorileo, linux-kernel, netdev,
	syzkaller-bugs, vedang.patel, xiyou.wangcong

Hello,

syzbot found the following crash on:

HEAD commit:    6d028043 Add linux-next specific files for 20190830
git tree:       linux-next
console output: https://syzkaller.appspot.com/x/log.txt?x=17f1421a600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=82a6bec43ab0cb69
dashboard link: https://syzkaller.appspot.com/bug?extid=cdbea9b616d35e2365ae
compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=147b54d1600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16c5da6e600000

The bug was bisected to:

commit e0a7683d30e91e30ee6cf96314ae58a0314a095e
Author: Leandro Dorileo <leandro.maciel.dorileo@intel.com>
Date:   Mon Apr 8 17:12:18 2019 +0000

     net/sched: cbs: fix port_rate miscalculation

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=130c614e600000
final crash:    https://syzkaller.appspot.com/x/report.txt?x=108c614e600000
console output: https://syzkaller.appspot.com/x/log.txt?x=170c614e600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+cdbea9b616d35e2365ae@syzkaller.appspotmail.com
Fixes: e0a7683d30e9 ("net/sched: cbs: fix port_rate miscalculation")

------------[ cut here ]------------
cbs: dequeue() called with unknown port rate.
WARNING: CPU: 1 PID: 8572 at net/sched/sch_cbs.c:185  
cbs_dequeue_soft+0x37e/0x4b0 net/sched/sch_cbs.c:185
Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 8572 Comm: kworker/1:2 Not tainted 5.3.0-rc6-next-20190830 #75
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Workqueue: ipv6_addrconf addrconf_dad_work
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x172/0x1f0 lib/dump_stack.c:113
  panic+0x2dc/0x755 kernel/panic.c:220
  __warn.cold+0x2f/0x3c kernel/panic.c:581
  report_bug+0x289/0x300 lib/bug.c:195
  fixup_bug arch/x86/kernel/traps.c:179 [inline]
  fixup_bug arch/x86/kernel/traps.c:174 [inline]
  do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:272
  do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:291
  invalid_op+0x23/0x30 arch/x86/entry/entry_64.S:1028
RIP: 0010:cbs_dequeue_soft+0x37e/0x4b0 net/sched/sch_cbs.c:185
Code: 1d 2c b3 f5 03 31 ff 89 de e8 fe 6d a6 fb 84 db 75 1a e8 b5 6c a6 fb  
48 c7 c7 80 7d 4a 88 c6 05 0c b3 f5 03 01 e8 0a bb 77 fb <0f> 0b 45 31 e4  
eb b1 49 bc ff ff ff ff ff ff ff 7f 48 89 55 d0 e8
RSP: 0018:ffff8880a129f3e8 EFLAGS: 00010282
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffffff815bf786 RDI: ffffed1014253e6f
RBP: ffff8880a129f430 R08: ffff8880a63f4040 R09: fffffbfff14ed341
R10: fffffbfff14ed340 R11: ffffffff8a769a07 R12: ffff8880911a5800
R13: ffff888095de92c8 R14: 0000000f8f3a4493 R15: ffffffffffffffff
  cbs_dequeue+0x34/0x40 net/sched/sch_cbs.c:237
  dequeue_skb net/sched/sch_generic.c:258 [inline]
  qdisc_restart net/sched/sch_generic.c:361 [inline]
  __qdisc_run+0x1e7/0x19d0 net/sched/sch_generic.c:379
  __dev_xmit_skb net/core/dev.c:3533 [inline]
  __dev_queue_xmit+0x16f1/0x37c0 net/core/dev.c:3838
  dev_queue_xmit+0x18/0x20 net/core/dev.c:3902
  neigh_resolve_output net/core/neighbour.c:1490 [inline]
  neigh_resolve_output+0x5a5/0x970 net/core/neighbour.c:1470
  neigh_output include/net/neighbour.h:511 [inline]
  ip6_finish_output2+0x1034/0x2550 net/ipv6/ip6_output.c:116
  __ip6_finish_output+0x444/0xaa0 net/ipv6/ip6_output.c:142
  ip6_finish_output+0x38/0x1f0 net/ipv6/ip6_output.c:152
  NF_HOOK_COND include/linux/netfilter.h:294 [inline]
  ip6_output+0x235/0x7f0 net/ipv6/ip6_output.c:175
  dst_output include/net/dst.h:436 [inline]
  NF_HOOK include/linux/netfilter.h:305 [inline]
  ndisc_send_skb+0xf29/0x14a0 net/ipv6/ndisc.c:505
  ndisc_send_ns+0x3a9/0x850 net/ipv6/ndisc.c:647
  addrconf_dad_work+0xb88/0x1150 net/ipv6/addrconf.c:4120
  process_one_work+0x9af/0x1740 kernel/workqueue.c:2269
  worker_thread+0x98/0xe40 kernel/workqueue.c:2415
  kthread+0x361/0x430 kernel/kthread.c:255
  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
Kernel Offset: disabled
Rebooting in 86400 seconds..


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
For information about bisection process see: https://goo.gl/tpsmEJ#bisection
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* general protection fault in qdisc_put
From: syzbot @ 2019-09-08  6:08 UTC (permalink / raw)
  To: akinobu.mita, akpm, davem, dvyukov, jhs, jiri, linux-kernel,
	mhocko, netdev, syzkaller-bugs, torvalds, xiyou.wangcong

Hello,

syzbot found the following crash on:

HEAD commit:    3b47fd5c Merge tag 'nfs-for-5.3-4' of git://git.linux-nfs...
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=10244dd6600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=b89bb446a3faaba4
dashboard link: https://syzkaller.appspot.com/bug?extid=d5870a903591faaca4ae
compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=174743fe600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=11f8c43e600000

The bug was bisected to:

commit e41d58185f1444368873d4d7422f7664a68be61d
Author: Dmitry Vyukov <dvyukov@google.com>
Date:   Wed Jul 12 21:34:35 2017 +0000

     fault-inject: support systematic fault injection

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=13f66bc6600000
final crash:    https://syzkaller.appspot.com/x/report.txt?x=100e6bc6600000
console output: https://syzkaller.appspot.com/x/log.txt?x=17f66bc6600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+d5870a903591faaca4ae@syzkaller.appspotmail.com
Fixes: e41d58185f14 ("fault-inject: support systematic fault injection")

RDX: 0000000000000000 RSI: 0000000020000240 RDI: 0000000000000003
RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000001bbbbbb
R10: 0000000000000000 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000005 R14: 0000000000000000 R15: 0000000000000000
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 9699 Comm: syz-executor169 Not tainted 5.3.0-rc7+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
RIP: 0010:qdisc_put+0x25/0x90 net/sched/sch_generic.c:983
Code: 00 00 00 00 00 55 48 89 e5 41 54 49 89 fc 53 e8 c1 52 bf fb 49 8d 7c  
24 10 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84  
c0 74 04 3c 03 7e 54 41 8b 5c 24 10 31 ff 83 e3 01
RSP: 0018:ffff8880944c7488 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: ffff8880945c8540 RCX: ffffffff85b49e8a
RDX: 0000000000000002 RSI: ffffffff85b3228f RDI: 0000000000000010
RBP: ffff8880944c7498 R08: ffff888099d50480 R09: ffffed1012898e45
R10: ffffed1012898e44 R11: 0000000000000003 R12: 0000000000000000
R13: ffff8880945c8540 R14: ffff888094894500 R15: ffff8880945c857c
FS:  0000555557553880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000610 CR3: 000000008c29d000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  sfb_destroy+0x61/0x80 net/sched/sch_sfb.c:468
  qdisc_create+0xbc6/0x1210 net/sched/sch_api.c:1285
  tc_modify_qdisc+0x524/0x1c50 net/sched/sch_api.c:1652
  rtnetlink_rcv_msg+0x463/0xb00 net/core/rtnetlink.c:5223
  netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477
  rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5241
  netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline]
  netlink_unicast+0x531/0x710 net/netlink/af_netlink.c:1328
  netlink_sendmsg+0x8a5/0xd60 net/netlink/af_netlink.c:1917
  sock_sendmsg_nosec net/socket.c:637 [inline]
  sock_sendmsg+0xd7/0x130 net/socket.c:657
  ___sys_sendmsg+0x803/0x920 net/socket.c:2311
  __sys_sendmsg+0x105/0x1d0 net/socket.c:2356
  __do_sys_sendmsg net/socket.c:2365 [inline]
  __se_sys_sendmsg net/socket.c:2363 [inline]
  __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2363
  do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4424f9
Code: e8 9c 07 03 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 3b 0a fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007fffed10bed8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004424f9
RDX: 0000000000000000 RSI: 0000000020000240 RDI: 0000000000000003
RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000001bbbbbb
R10: 0000000000000000 R11: 0000000000000246 R12: ffffffffffffffff
R13: 0000000000000005 R14: 0000000000000000 R15: 0000000000000000
Modules linked in:
---[ end trace 97e52c48ae7a3cc1 ]---
RIP: 0010:qdisc_put+0x25/0x90 net/sched/sch_generic.c:983
Code: 00 00 00 00 00 55 48 89 e5 41 54 49 89 fc 53 e8 c1 52 bf fb 49 8d 7c  
24 10 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84  
c0 74 04 3c 03 7e 54 41 8b 5c 24 10 31 ff 83 e3 01
RSP: 0018:ffff8880944c7488 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: ffff8880945c8540 RCX: ffffffff85b49e8a
RDX: 0000000000000002 RSI: ffffffff85b3228f RDI: 0000000000000010
RBP: ffff8880944c7498 R08: ffff888099d50480 R09: ffffed1012898e45
R10: ffffed1012898e44 R11: 0000000000000003 R12: 0000000000000000
R13: ffff8880945c8540 R14: ffff888094894500 R15: ffff8880945c857c
FS:  0000555557553880(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000610 CR3: 000000008c29d000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
For information about bisection process see: https://goo.gl/tpsmEJ#bisection
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* INFO: rcu detected stall in mld_ifc_timer_expire
From: syzbot @ 2019-09-08  6:08 UTC (permalink / raw)
  To: davem, jhs, jiri, linux-kernel, netdev, syzkaller-bugs,
	xiyou.wangcong

Hello,

syzbot found the following crash on:

HEAD commit:    3b47fd5c Merge tag 'nfs-for-5.3-4' of git://git.linux-nfs...
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=15807dc6600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=144488c6c6c6d2b6
dashboard link: https://syzkaller.appspot.com/bug?extid=bc6297c11f19ee807dc2
compiler:       clang version 9.0.0 (/home/glider/llvm/clang  
80fee25776c2fb61e74c1ecb1a523375c2500b69)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=119ee6c1600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=15c4eb0a600000

Bisection is inconclusive: the bug happens on the oldest tested release.

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=13b7343e600000
console output: https://syzkaller.appspot.com/x/log.txt?x=17b7343e600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+bc6297c11f19ee807dc2@syzkaller.appspotmail.com

rcu: INFO: rcu_preempt self-detected stall on CPU
rcu: 	0-...!: (10500 ticks this GP) idle=d6e/0/0x3 softirq=9083/9083 fqs=0
	(t=10501 jiffies g=6617 q=143)
rcu: rcu_preempt kthread starved for 10502 jiffies! g6617 f0x0  
RCU_GP_WAIT_FQS(5) ->state=0x402 ->cpu=0
rcu: RCU grace-period kthread stack dump:
rcu_preempt     I29080    10      2 0x80004000
Call Trace:
  context_switch kernel/sched/core.c:3254 [inline]
  __schedule+0x877/0xc50 kernel/sched/core.c:3880
  schedule+0x131/0x1e0 kernel/sched/core.c:3947
  schedule_timeout+0x14f/0x240 kernel/time/timer.c:1807
  rcu_gp_fqs_loop kernel/rcu/tree.c:1611 [inline]
  rcu_gp_kthread+0xef8/0x1790 kernel/rcu/tree.c:1768
  kthread+0x332/0x350 kernel/kthread.c:255
  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
NMI backtrace for cpu 0
CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.0-rc7+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  <IRQ>
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x1d8/0x2f8 lib/dump_stack.c:113
  nmi_cpu_backtrace+0xaf/0x1a0 lib/nmi_backtrace.c:101
  nmi_trigger_cpumask_backtrace+0x174/0x290 lib/nmi_backtrace.c:62
  arch_trigger_cpumask_backtrace+0x10/0x20 arch/x86/kernel/apic/hw_nmi.c:38
  trigger_single_cpu_backtrace include/linux/nmi.h:164 [inline]
  rcu_dump_cpu_stacks+0x15a/0x220 kernel/rcu/tree_stall.h:254
  print_cpu_stall kernel/rcu/tree_stall.h:455 [inline]
  check_cpu_stall kernel/rcu/tree_stall.h:529 [inline]
  rcu_pending kernel/rcu/tree.c:2736 [inline]
  rcu_sched_clock_irq+0xb95/0x16d0 kernel/rcu/tree.c:2183
  update_process_times+0x134/0x190 kernel/time/timer.c:1639
  tick_sched_handle kernel/time/tick-sched.c:167 [inline]
  tick_sched_timer+0x263/0x420 kernel/time/tick-sched.c:1296
  __run_hrtimer kernel/time/hrtimer.c:1389 [inline]
  __hrtimer_run_queues+0x403/0x850 kernel/time/hrtimer.c:1451
  hrtimer_interrupt+0x38c/0xda0 kernel/time/hrtimer.c:1509
  local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1106 [inline]
  smp_apic_timer_interrupt+0x109/0x280 arch/x86/kernel/apic/apic.c:1131
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830
RIP: 0010:__list_add_valid+0xc/0xc0 lib/list_debug.c:22
Code: 89 e5 53 48 89 fb e8 83 d6 1f fe 48 c7 c7 56 5a 45 88 48 89 de e8 44  
fd ff ff 5b 5d c3 90 55 48 89 e5 41 57 41 56 41 55 41 54 <53> 49 89 d6 49  
89 f4 49 89 ff 49 bd 00 00 00 00 00 fc ff df 48 8d
RSP: 0018:ffff8880aea09730 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
RAX: 1ffff11012f53d0a RBX: 1ffff11012f53d0b RCX: ffffffff88875a00
RDX: ffff888097a9e850 RSI: ffff888097a9e850 RDI: ffff888097a9e7b8
RBP: ffff8880aea09750 R08: ffffffff860c4d6a R09: 0000000000000000
R10: fffffbfff117be8d R11: 0000000000000000 R12: dffffc0000000000
R13: ffff888097a9e4c0 R14: ffff888097a9e850 R15: ffff888097a9e840
  __list_add include/linux/list.h:60 [inline]
  list_add_tail include/linux/list.h:93 [inline]
  list_move_tail include/linux/list.h:214 [inline]
  hhf_dequeue+0x535/0xaa0 net/sched/sch_hhf.c:439
  dequeue_skb net/sched/sch_generic.c:258 [inline]
  qdisc_restart net/sched/sch_generic.c:361 [inline]
  __qdisc_run+0x217/0x1b30 net/sched/sch_generic.c:379
  __dev_xmit_skb net/core/dev.c:3533 [inline]
  __dev_queue_xmit+0x1161/0x3020 net/core/dev.c:3838
  dev_queue_xmit+0x17/0x20 net/core/dev.c:3902
  neigh_hh_output include/net/neighbour.h:500 [inline]
  neigh_output include/net/neighbour.h:509 [inline]
  ip6_finish_output2+0xff2/0x13d0 net/ipv6/ip6_output.c:116
  __ip6_finish_output+0x693/0x910 net/ipv6/ip6_output.c:142
  ip6_finish_output+0x52/0x1e0 net/ipv6/ip6_output.c:152
  NF_HOOK_COND include/linux/netfilter.h:294 [inline]
  ip6_output+0x26f/0x390 net/ipv6/ip6_output.c:175
  dst_output include/net/dst.h:436 [inline]
  NF_HOOK include/linux/netfilter.h:305 [inline]
  mld_sendpack+0x770/0xb90 net/ipv6/mcast.c:1682
  mld_send_cr net/ipv6/mcast.c:1978 [inline]
  mld_ifc_timer_expire+0x820/0xb70 net/ipv6/mcast.c:2477
  call_timer_fn+0x95/0x170 kernel/time/timer.c:1322
  expire_timers kernel/time/timer.c:1366 [inline]
  __run_timers+0x79e/0x970 kernel/time/timer.c:1685
  run_timer_softirq+0x4a/0x90 kernel/time/timer.c:1698
  __do_softirq+0x333/0x7c4 arch/x86/include/asm/paravirt.h:778
  invoke_softirq kernel/softirq.c:373 [inline]
  irq_exit+0x227/0x230 kernel/softirq.c:413
  exiting_irq arch/x86/include/asm/apic.h:537 [inline]
  smp_apic_timer_interrupt+0x113/0x280 arch/x86/kernel/apic/apic.c:1133
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830
  </IRQ>
RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61
Code: 30 fa eb ae 89 d9 80 e1 07 80 c1 03 38 c1 7c ba 48 89 df e8 f4 b0 30  
fa eb b0 90 90 e9 07 00 00 00 0f 00 2d 96 b0 46 00 fb f4 <c3> 90 e9 07 00  
00 00 0f 00 2d 86 b0 46 00 f4 c3 90 90 55 48 89 e5
RSP: 0018:ffffffff88807dc0 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13
RAX: 1ffffffff11150f3 RBX: ffffffff88875a00 RCX: dffffc0000000000
RDX: 0000000000000000 RSI: ffffffff812b7a1a RDI: ffffffff877bd46a
RBP: ffffffff88807dc8 R08: ffffffff81790e14 R09: fffffbfff110eb41
R10: fffffbfff110eb41 R11: 0000000000000000 R12: dffffc0000000000
R13: 1ffffffff110eb40 R14: dffffc0000000000 R15: 0000000000000000
  arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571
  default_idle_call+0x59/0xa0 kernel/sched/idle.c:94
  cpuidle_idle_call kernel/sched/idle.c:154 [inline]
  do_idle+0x140/0x6d0 kernel/sched/idle.c:263
  cpu_startup_entry+0x25/0x30 kernel/sched/idle.c:354
  rest_init+0x29d/0x2b0 init/main.c:451
  arch_call_rest_init+0xe/0x10
  start_kernel+0x6f5/0x7f6 init/main.c:785
  x86_64_start_reservations+0x18/0x2e arch/x86/kernel/head64.c:472
  x86_64_start_kernel+0x7a/0x7d arch/x86/kernel/head64.c:453
  secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
For information about bisection process see: https://goo.gl/tpsmEJ#bisection
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* BUG: unable to handle kernel NULL pointer dereference in tc_bind_tclass
From: syzbot @ 2019-09-08  6:08 UTC (permalink / raw)
  To: davem, jhs, jiri, linux-kernel, netdev, syzkaller-bugs,
	xiyou.wangcong

Hello,

syzbot found the following crash on:

HEAD commit:    0e5b36bc r8152: adjust the settings of ups flags
git tree:       net-next
console output: https://syzkaller.appspot.com/x/log.txt?x=10e5ad76600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=67b69b427c3b2dbf
dashboard link: https://syzkaller.appspot.com/bug?extid=21b29db13c065852f64b
compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=16cebbda600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=15fb9d0a600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+21b29db13c065852f64b@syzkaller.appspotmail.com

8021q: adding VLAN 0 to HW filter on device batadv0
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor instruction fetch in kernel mode
#PF: error_code(0x0010) - not-present page
PGD a9ba0067 P4D a9ba0067 PUD a7851067 PMD 0
Oops: 0010 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 8672 Comm: syz-executor994 Not tainted 5.3.0-rc7+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
RIP: 0010:0x0
Code: Bad RIP value.
RSP: 0018:ffff888097fb74d8 EFLAGS: 00010246
RAX: dffffc0000000000 RBX: ffffffff884a7740 RCX: ffffffff85b55676
RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8880a4cd7400
RBP: ffff888097fb75d0 R08: ffff88808dc2e440 R09: ffff888097fb7658
R10: ffffed1012ff6ed9 R11: ffff888097fb76cf R12: ffff8880a4cd7400
R13: 0000000000000001 R14: ffff888097fb75a8 R15: ffffffff884a7740
FS:  0000555556952880(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffffffffd6 CR3: 000000009c578000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  tc_bind_tclass+0x13e/0x2f0 net/sched/sch_api.c:1923
  tc_ctl_tclass+0xadb/0xcd0 net/sched/sch_api.c:2059
  rtnetlink_rcv_msg+0x463/0xb00 net/core/rtnetlink.c:5223
  netlink_rcv_skb+0x177/0x450 net/netlink/af_netlink.c:2477
  rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5241
  netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline]
  netlink_unicast+0x531/0x710 net/netlink/af_netlink.c:1328
  netlink_sendmsg+0x8a5/0xd60 net/netlink/af_netlink.c:1917
  sock_sendmsg_nosec net/socket.c:637 [inline]
  sock_sendmsg+0xd7/0x130 net/socket.c:657
  ___sys_sendmsg+0x803/0x920 net/socket.c:2311
  __sys_sendmsg+0x105/0x1d0 net/socket.c:2356
  __do_sys_sendmsg net/socket.c:2365 [inline]
  __se_sys_sendmsg net/socket.c:2363 [inline]
  __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2363
  do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x441cd9
Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 7b 10 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007ffc9938bcf8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 0000315f6576616c RCX: 0000000000441cd9
RDX: 0000000000000000 RSI: 0000000020000240 RDI: 0000000000000005
RBP: 735f656764697262 R08: 0000000001bbbbbb R09: 0000000001bbbbbb
R10: 0000000001bbbbbb R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000403270 R14: 0000000000000000 R15: 0000000000000000
Modules linked in:
CR2: 0000000000000000
---[ end trace d5605e2bdb92fab7 ]---
RIP: 0010:0x0
Code: Bad RIP value.
RSP: 0018:ffff888097fb74d8 EFLAGS: 00010246
RAX: dffffc0000000000 RBX: ffffffff884a7740 RCX: ffffffff85b55676
RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff8880a4cd7400
RBP: ffff888097fb75d0 R08: ffff88808dc2e440 R09: ffff888097fb7658
R10: ffffed1012ff6ed9 R11: ffff888097fb76cf R12: ffff8880a4cd7400
R13: 0000000000000001 R14: ffff888097fb75a8 R15: ffffffff884a7740
FS:  0000555556952880(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffffffffd6 CR3: 000000009c578000 CR4: 00000000001406f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: WARNING in __vunmap
From: syzbot @ 2019-09-08  7:05 UTC (permalink / raw)
  To: davem, herbert, linux-kernel, netdev, steffen.klassert,
	syzkaller-bugs
In-Reply-To: <00000000000092839d0581fd74ad@google.com>

syzbot has found a reproducer for the following crash on:

HEAD commit:    b3a9964c Merge tag 'char-misc-5.3-rc8' of git://git.kernel..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=16c9f70a600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=144488c6c6c6d2b6
dashboard link: https://syzkaller.appspot.com/bug?extid=5ec9bb042ddfe9644773
compiler:       clang version 9.0.0 (/home/glider/llvm/clang  
80fee25776c2fb61e74c1ecb1a523375c2500b69)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=11a30371600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+5ec9bb042ddfe9644773@syzkaller.appspotmail.com

------------[ cut here ]------------
Trying to vfree() nonexistent vm area (00000000dddfa71b)
WARNING: CPU: 0 PID: 10463 at mm/vmalloc.c:2235 __vunmap+0x148/0xa20  
mm/vmalloc.c:2234
Kernel panic - not syncing: panic_on_warn set ...
CPU: 0 PID: 10463 Comm: syz-executor.0 Not tainted 5.3.0-rc7+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x1d8/0x2f8 lib/dump_stack.c:113
  panic+0x25c/0x799 kernel/panic.c:219
  __warn+0x22f/0x230 kernel/panic.c:576
  report_bug+0x190/0x290 lib/bug.c:186
  fixup_bug arch/x86/kernel/traps.c:179 [inline]
  do_error_trap+0xd7/0x440 arch/x86/kernel/traps.c:272
  do_invalid_op+0x36/0x40 arch/x86/kernel/traps.c:291
  invalid_op+0x23/0x30 arch/x86/entry/entry_64.S:1028
RIP: 0010:__vunmap+0x148/0xa20 mm/vmalloc.c:2234
Code: 0c e8 8c 36 d0 ff eb 24 e8 85 36 d0 ff 48 c7 c7 a8 f5 8f 88 e8 69 9b  
d8 05 48 c7 c7 e5 5c 3e 88 4c 89 f6 31 c0 e8 68 18 a3 ff <0f> 0b 48 83 c4  
60 5b 41 5c 41 5d 41 5e 41 5f 5d c3 48 c7 c7 a8 f5
RSP: 0018:ffff8880a81d75b8 EFLAGS: 00010246
RAX: 4324ba28a2c9f400 RBX: 0000000000000000 RCX: ffff8880a48ce080
RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000
RBP: ffff8880a81d7640 R08: ffffffff815cfa54 R09: ffffed1015d46088
R10: ffffed1015d46088 R11: 0000000000000000 R12: ffff88808a93f708
R13: dffffc0000000000 R14: ffffc900080f7000 R15: ffffc90008108000
  __vfree mm/vmalloc.c:2299 [inline]
  vfree+0x85/0x130 mm/vmalloc.c:2329
  ipcomp_free_scratches net/xfrm/xfrm_ipcomp.c:212 [inline]
  ipcomp_free_data+0x12a/0x1d0 net/xfrm/xfrm_ipcomp.c:321
  ipcomp_init_state+0x7bf/0x8b0 net/xfrm/xfrm_ipcomp.c:373
  ipcomp6_init_state+0xb7/0x630 net/ipv6/ipcomp6.c:153
  __xfrm_init_state+0x7d0/0xbf0 net/xfrm/xfrm_state.c:2493
  xfrm_state_construct net/xfrm/xfrm_user.c:626 [inline]
  xfrm_add_sa+0x223f/0x38e0 net/xfrm/xfrm_user.c:683
  xfrm_user_rcv_msg+0x3e6/0x650 net/xfrm/xfrm_user.c:2676
  netlink_rcv_skb+0x19e/0x3d0 net/netlink/af_netlink.c:2477
  xfrm_netlink_rcv+0x74/0x90 net/xfrm/xfrm_user.c:2684
  netlink_unicast_kernel net/netlink/af_netlink.c:1302 [inline]
  netlink_unicast+0x787/0x900 net/netlink/af_netlink.c:1328
  netlink_sendmsg+0x993/0xc50 net/netlink/af_netlink.c:1917
  sock_sendmsg_nosec net/socket.c:637 [inline]
  sock_sendmsg net/socket.c:657 [inline]
  ___sys_sendmsg+0x60d/0x910 net/socket.c:2311
  __sys_sendmsg net/socket.c:2356 [inline]
  __do_sys_sendmsg net/socket.c:2365 [inline]
  __se_sys_sendmsg net/socket.c:2363 [inline]
  __x64_sys_sendmsg+0x17c/0x200 net/socket.c:2363
  do_syscall_64+0xfe/0x140 arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4598e9
Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f4880699c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004598e9
RDX: 0000000000000000 RSI: 0000000020000840 RDI: 0000000000000003
RBP: 000000000075bfc8 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f488069a6d4
R13: 00000000004c7812 R14: 00000000004dd0b0 R15: 00000000ffffffff
Kernel Offset: disabled
Rebooting in 86400 seconds..


^ permalink raw reply

* INFO: rcu detected stall in pppoe_sendmsg
From: syzbot @ 2019-09-08  7:19 UTC (permalink / raw)
  To: davem, jhs, jiri, linux-kernel, netdev, syzkaller-bugs,
	xiyou.wangcong

Hello,

syzbot found the following crash on:

HEAD commit:    1e3778cb Merge tag 'scsi-fixes' of git://git.kernel.org/pu..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=137b2971600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=b89bb446a3faaba4
dashboard link: https://syzkaller.appspot.com/bug?extid=55be5f513bed37fc4367
compiler:       gcc (GCC) 9.0.0 20181231 (experimental)

Unfortunately, I don't have any reproducer for this crash yet.

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+55be5f513bed37fc4367@syzkaller.appspotmail.com

rcu: INFO: rcu_preempt self-detected stall on CPU
rcu: 	0-...!: (10501 ticks this GP) idle=06a/1/0x4000000000000002  
softirq=173683/173683 fqs=0
	(t=10502 jiffies g=271749 q=3228)
rcu: rcu_preempt kthread starved for 10503 jiffies! g271749 f0x0  
RCU_GP_WAIT_FQS(5) ->state=0x402 ->cpu=0
rcu: RCU grace-period kthread stack dump:
rcu_preempt     I29160    10      2 0x80004000
Call Trace:
  context_switch kernel/sched/core.c:3254 [inline]
  __schedule+0x755/0x1580 kernel/sched/core.c:3880
  schedule+0xd9/0x260 kernel/sched/core.c:3947
  schedule_timeout+0x486/0xc50 kernel/time/timer.c:1807
  rcu_gp_fqs_loop kernel/rcu/tree.c:1611 [inline]
  rcu_gp_kthread+0x9b2/0x18c0 kernel/rcu/tree.c:1768
  kthread+0x361/0x430 kernel/kthread.c:255
  ret_from_fork+0x24/0x30 arch/x86/entry/entry_64.S:352
NMI backtrace for cpu 0
CPU: 0 PID: 4124 Comm: syz-executor.2 Not tainted 5.3.0-rc7+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  <IRQ>
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x172/0x1f0 lib/dump_stack.c:113
  nmi_cpu_backtrace.cold+0x70/0xb2 lib/nmi_backtrace.c:101
  nmi_trigger_cpumask_backtrace+0x23b/0x28b lib/nmi_backtrace.c:62
  arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
  trigger_single_cpu_backtrace include/linux/nmi.h:164 [inline]
  rcu_dump_cpu_stacks+0x183/0x1cf kernel/rcu/tree_stall.h:254
  print_cpu_stall kernel/rcu/tree_stall.h:455 [inline]
  check_cpu_stall kernel/rcu/tree_stall.h:529 [inline]
  rcu_pending kernel/rcu/tree.c:2736 [inline]
  rcu_sched_clock_irq.cold+0x4dd/0xc13 kernel/rcu/tree.c:2183
  update_process_times+0x32/0x80 kernel/time/timer.c:1639
  tick_sched_handle+0xa2/0x190 kernel/time/tick-sched.c:167
  tick_sched_timer+0x53/0x140 kernel/time/tick-sched.c:1296
  __run_hrtimer kernel/time/hrtimer.c:1389 [inline]
  __hrtimer_run_queues+0x364/0xe40 kernel/time/hrtimer.c:1451
  hrtimer_interrupt+0x314/0x770 kernel/time/hrtimer.c:1509
  local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1106 [inline]
  smp_apic_timer_interrupt+0x160/0x610 arch/x86/kernel/apic/apic.c:1131
  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:830
  </IRQ>
RIP: 0010:preempt_count arch/x86/include/asm/preempt.h:26 [inline]
RIP: 0010:check_kcov_mode kernel/kcov.c:68 [inline]
RIP: 0010:__sanitizer_cov_trace_pc+0xd/0x50 kernel/kcov.c:102
Code: 6d 9f e9 ff 48 c7 05 1e 4d 19 09 00 00 00 00 e9 77 e9 ff ff 90 90 90  
90 90 90 90 90 90 55 48 89 e5 65 48 8b 04 25 40 fe 01 00 <65> 8b 15 04 89  
8f 7e 81 e2 00 01 1f 00 48 8b 75 08 75 2b 8b 90 f0
RSP: 0018:ffff88821b02f098 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff13
RAX: ffff88806c8524c0 RBX: ffff888096f52c38 RCX: ffffc9000bf4d000
RDX: 0000000000000000 RSI: ffffffff85c65fd2 RDI: ffff888096f52cec
RBP: ffff88821b02f098 R08: ffff88806c8524c0 R09: 0000000000000000
R10: fffffbfff134afaf R11: ffff88806c8524c0 R12: dffffc0000000000
R13: ffff888096f52940 R14: 0000000000000000 R15: 0000000000000000
  hhf_dequeue+0x586/0xa20 net/sched/sch_hhf.c:438
  dequeue_skb net/sched/sch_generic.c:258 [inline]
  qdisc_restart net/sched/sch_generic.c:361 [inline]
  __qdisc_run+0x1e7/0x19d0 net/sched/sch_generic.c:379
  __dev_xmit_skb net/core/dev.c:3533 [inline]
  __dev_queue_xmit+0x16f1/0x3650 net/core/dev.c:3838
  dev_queue_xmit+0x18/0x20 net/core/dev.c:3902
  br_dev_queue_push_xmit+0x3f3/0x5c0 net/bridge/br_forward.c:52
  NF_HOOK include/linux/netfilter.h:305 [inline]
  NF_HOOK include/linux/netfilter.h:299 [inline]
  br_forward_finish+0xfa/0x400 net/bridge/br_forward.c:65
  NF_HOOK include/linux/netfilter.h:305 [inline]
  NF_HOOK include/linux/netfilter.h:299 [inline]
  __br_forward+0x641/0xb00 net/bridge/br_forward.c:109
  br_forward+0x47c/0x500 net/bridge/br_forward.c:158
  br_dev_xmit+0xbf0/0x15a0 net/bridge/br_device.c:102
  __netdev_start_xmit include/linux/netdevice.h:4406 [inline]
  netdev_start_xmit include/linux/netdevice.h:4420 [inline]
  xmit_one net/core/dev.c:3280 [inline]
  dev_hard_start_xmit+0x1a3/0x9c0 net/core/dev.c:3296
  __dev_queue_xmit+0x2b15/0x3650 net/core/dev.c:3869
  dev_queue_xmit+0x18/0x20 net/core/dev.c:3902
  pppoe_sendmsg+0x661/0x7f0 drivers/net/ppp/pppoe.c:899
  sock_sendmsg_nosec net/socket.c:637 [inline]
  sock_sendmsg+0xd7/0x130 net/socket.c:657
  ___sys_sendmsg+0x3e2/0x920 net/socket.c:2311
  __sys_sendmmsg+0x1bf/0x4d0 net/socket.c:2413
  __do_sys_sendmmsg net/socket.c:2442 [inline]
  __se_sys_sendmmsg net/socket.c:2439 [inline]
  __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2439
  do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4598e9
Code: fd b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 cb b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f20901b4c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000133
RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00000000004598e9
RDX: 000000000000033b RSI: 000000002000d180 RDI: 0000000000000003
RBP: 000000000075bf20 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007f20901b56d4
R13: 00000000004c70a7 R14: 00000000004dc768 R15: 00000000ffffffff


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

^ permalink raw reply

* Q: fixed link
From: Ranran @ 2019-09-08  7:30 UTC (permalink / raw)
  To: netdev

Hello,

In documentation of fixed-link it is said:"
Some Ethernet MACs have a "fixed link", and are not connected to a
normal MDIO-managed PHY device. For those situations, a Device Tree
binding allows to describe a "fixed link".
"
Does it mean that when using an unmanaged switch ("no cpu" mode), it is
better to use it with fixed-link?

Thanks,
ranran

^ permalink raw reply

* Re: general protection fault in dev_map_hash_update_elem
From: Toke Høiland-Jørgensen @ 2019-09-08  8:09 UTC (permalink / raw)
  To: Hillf Danton, syzbot
  Cc: alexei.starovoitov, ast, bpf, daniel, davem, hawk, jakub.kicinski,
	jbrouer, john.fastabend, kafai, linux-kernel, netdev,
	songliubraving, syzkaller-bugs, yhs
In-Reply-To: <20190908030726.7520-1-hdanton@sina.com>

Hillf Danton <hdanton@sina.com> writes:

>> syzbot has found a reproducer for the following crash on Sat, 07 Sep 2019 18:59:06 -0700
>> 
>> HEAD commit:    a2c11b03 kcm: use BPF_PROG_RUN
>> git tree:       bpf-next
>> console output: https://syzkaller.appspot.com/x/log.txt?x=13d46ec1600000
>> kernel config:  https://syzkaller.appspot.com/x/.config?x=cf0c85d15c20ade3
>> dashboard link: https://syzkaller.appspot.com/bug?extid=4e7a85b1432052e8d6f8
>> compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
>> syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=1220b2d1600000
>> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1360b26e600000
>> 
>> general protection fault: 0000 [#1] PREEMPT SMP KASAN
>> CPU: 1 PID: 10210 Comm: syz-executor910 Not tainted 5.3.0-rc7+ #0
>> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
>> Google 01/01/2011
>> RIP: 0010:__write_once_size include/linux/compiler.h:226 [inline]
>> RIP: 0010:__hlist_del include/linux/list.h:762 [inline]
>> RIP: 0010:hlist_del_rcu include/linux/rculist.h:455 [inline]
>> RIP: 0010:__dev_map_hash_update_elem kernel/bpf/devmap.c:668 [inline]
>> RIP: 0010:dev_map_hash_update_elem+0x3c8/0x6e0 kernel/bpf/devmap.c:691
>
> Fix commit 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking
> up devices by hashed index")

While this minimal patch does fix the bug (as Jesper already noted), I
prefer to rework the logic instead of just repeating the lookup; a patch
is on its way :)

-Toke

^ permalink raw reply

* [PATCH bpf-next] xdp: Fix race in dev_map_hash_update_elem() when replacing element
From: Toke Høiland-Jørgensen @ 2019-09-08  8:20 UTC (permalink / raw)
  To: make-wifi-fast, linux-wireless, ast, bpf, daniel, davem, hawk,
	jakub.kicinski, john.fastabend, kafai, linux-kernel, netdev,
	songliubraving, syzkaller-bugs, yhs
  Cc: Toke Høiland-Jørgensen, syzbot+4e7a85b1432052e8d6f8
In-Reply-To: <0000000000005091a70591d3e1d9@google.com>

syzbot found a crash in dev_map_hash_update_elem(), when replacing an
element with a new one. Jesper correctly identified the cause of the crash
as a race condition between the initial lookup in the map (which is done
before taking the lock), and the removal of the old element.

Rather than just add a second lookup into the hashmap after taking the
lock, fix this by reworking the function logic to take the lock before the
initial lookup.

Fixes: 6f9d451ab1a3 ("xdp: Add devmap_hash map type for looking up devices by hashed index")
Reported-and-tested-by: syzbot+4e7a85b1432052e8d6f8@syzkaller.appspotmail.com
Signed-off-by: Toke Høiland-Jørgensen <toke@redhat.com>
---
 kernel/bpf/devmap.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 9af048a932b5..d27f3b60ff6d 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -650,19 +650,22 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
 	u32 ifindex = *(u32 *)value;
 	u32 idx = *(u32 *)key;
 	unsigned long flags;
+	int err = -EEXIST;
 
 	if (unlikely(map_flags > BPF_EXIST || !ifindex))
 		return -EINVAL;
 
+	spin_lock_irqsave(&dtab->index_lock, flags);
+
 	old_dev = __dev_map_hash_lookup_elem(map, idx);
 	if (old_dev && (map_flags & BPF_NOEXIST))
-		return -EEXIST;
+		goto out_err;
 
 	dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
-	if (IS_ERR(dev))
-		return PTR_ERR(dev);
-
-	spin_lock_irqsave(&dtab->index_lock, flags);
+	if (IS_ERR(dev)) {
+		err = PTR_ERR(dev);
+		goto out_err;
+	}
 
 	if (old_dev) {
 		hlist_del_rcu(&old_dev->index_hlist);
@@ -683,6 +686,10 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
 		call_rcu(&old_dev->rcu, __dev_map_entry_free);
 
 	return 0;
+
+out_err:
+	spin_unlock_irqrestore(&dtab->index_lock, flags);
+	return err;
 }
 
 static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
-- 
2.23.0


^ permalink raw reply related

* Load-balancing considering queue lengths
From: Daniel Schaffrath @ 2019-09-08  8:13 UTC (permalink / raw)
  To: netdev

Hello everybody,

when load balancing packets/bytes among several links it seems to be a 
natural choice to base the decisions about the outgoing device on the 
current queue lengths of the available devices. Looking at typical 
netfilter configurations or nftlb this does not seem to be a common 
choice, though.

Considering the abilities of eBPF and netfilter I think it would be 
totally possible. But I might be mistaken in either regard (good idea / 
technical possibility).

I would be very grateful, if you could provide me with any pointers that 
I could educate myself on that matter.

Thanks a lot in advance, Daniel

^ permalink raw reply

* Re: [PATCH 1/2] net: phy: dp83867: Add documentation for SGMII mode type
From: Andrew Lunn @ 2019-09-08  8:54 UTC (permalink / raw)
  To: Vitaly Gaiduk
  Cc: davem@davemloft.net, robh+dt@kernel.org, f.fainelli@gmail.com,
	Mark Rutland, netdev@vger.kernel.org, devicetree@vger.kernel.org,
	linux-kernel@vger.kernel.org, Trent Piepho
In-Reply-To: <2894361567896439@iva5-be053096037b.qloud-c.yandex.net>

On Sun, Sep 08, 2019 at 01:47:19AM +0300, Vitaly Gaiduk wrote:
> Hi, Andrew.<div>I’m ready to do this property with such name but is it good practice to do such long names? :)</div><div>Also, Trent Piepho wrote about sgmii-clk and merged all ideas we have “ti,sgmii-ref-clk”.</div><div>It’s better, isn’t it?</div><div>Vitaly.</div><div><div><br />07.09.2019, 18:39, "Andrew Lunn" &lt;andrew@lunn.ch&gt;:<br /><blockquote><p>On Thu, Sep 05, 2019 at 07:26:00PM +0300, Vitaly Gaiduk wrote:<br /></p><blockquote class="b4fd5cf2ec92bc68cb898700bb81355fwmi-quote"> Add documentation of ti,sgmii-type which can be used to select<br /> SGMII mode type (4 or 6-wire).<br /><br /> Signed-off-by: Vitaly Gaiduk &lt;<a href="mailto:vitaly.gaiduk@cloudbear.ru">vitaly.gaiduk@cloudbear.ru</a>&gt;<br /> ---<br />  Documentation/devicetree/bindings/net/ti,dp83867.txt | 1 +<br />  1 file changed, 1 insertion(+)<br /><br /> diff --git a/Documentation/devicetree/bindings/net/ti,dp83867.txt b/Documentation/devicetree/bindings/net/ti,dp83867.txt<br /> index db6aa3f2215b..18e7fd52897f 100644<br /> --- a/Documentation/devicetree/bindings/net/ti,dp83867.txt<br /> +++ b/Documentation/devicetree/bindings/net/ti,dp83867.txt<br /> @@ -37,6 +37,7 @@ Optional property:<br />                                for applicable values.  The CLK_OUT pin can also<br />                                be disabled by this property.  When omitted, the<br />                                PHY's default will be left as is.<br /> +	- ti,sgmii-type - This denotes the fact which SGMII mode is used (4 or 6-wire).<br /></blockquote><p><br />Hi Vitaly<br /><br />You probably want to make this a Boolean. I don't think SGMII type is<br />a good idea. This is about enabling the receive clock to be passed to<br />the MAC. So how about ti,sgmii-ref-clock-output-enable.<br /><br />    Andrew<br /></p></blockquote></div></div>

Hi Vitaly

Please reconfigure your mail client to not obfuscate with HTML.

The length should be O.K. For a PHY node, it should not be too deeply
indented, unless it happens to be part of an Ethernet switch.

	  Andrew

^ permalink raw reply

* Re: [PATCH net-next 3/3] net: dsa: mv88e6xxx: add RXNFC support
From: Andrew Lunn @ 2019-09-08  8:55 UTC (permalink / raw)
  To: Vivien Didelot; +Cc: netdev, davem, f.fainelli
In-Reply-To: <20190907172510.GB27514@t480s.localdomain>

On Sat, Sep 07, 2019 at 05:25:10PM -0400, Vivien Didelot wrote:
> Hi Andrew,
> 
> On Sat, 7 Sep 2019 22:32:56 +0200, Andrew Lunn <andrew@lunn.ch> wrote:
> > > +	policy = devm_kzalloc(chip->dev, sizeof(*policy), GFP_KERNEL);
> > > +	if (!policy)
> > > +		return -ENOMEM;
> > 
> > I think this might be the first time we have done dynamic memory
> > allocation in the mv88e6xxx driver. It might even be a first for a DSA
> > driver?
> > 
> > I'm not saying it is wrong, but maybe we should discuss it. 
> > 
> > I assume you are doing this because the ATU entry itself is not
> > sufficient?
> > 
> > How much memory is involved here, worst case? I assume one struct
> > mv88e6xxx_policy per ATU entry? Which you think is too much to
> > allocate as part of chip? I guess most users will never use this
> > feature, so for most users it would be wasted memory. So i do see the
> > point for dynamically allocating it.
> 
> A layer 2 policy is not limited to the ATU. It can also be based on a VTU
> entry, on the port's Etype, or frame's Etype. We can have 0, 1 or literally
> thousands of policies programmed by the user.

O.K, then it has to be dynamic memory.

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: Q: fixed link
From: Andrew Lunn @ 2019-09-08  9:05 UTC (permalink / raw)
  To: Ranran; +Cc: netdev
In-Reply-To: <CAJ2oMhKUTUU0eHTmS62itBw6L9Jut=ps6y8GuVDP44xadn03dw@mail.gmail.com>

On Sun, Sep 08, 2019 at 10:30:51AM +0300, Ranran wrote:
> Hello,
> 
> In documentation of fixed-link it is said:"
> Some Ethernet MACs have a "fixed link", and are not connected to a
> normal MDIO-managed PHY device. For those situations, a Device Tree
> binding allows to describe a "fixed link".
> "
> Does it mean that when using an unmanaged switch ("no cpu" mode), it is
> better to use it with fixed-link?

Hi Ranran

Is there a MAC to MAC connection, or PHY to PHY connection?

If the interface MAC is directly connected to the switch MAC, fixed
link is what you should use. The fixed link will then tell the
interface MAC what speed it should use.

If you have back to back PHYs, you need a PHY driver for the PHY
connected to the interface MAC, to configure its speed, duplex
etc. The dumb switch should be controlling its PHY, and auto-neg will
probably work.

	 Andrew

^ permalink raw reply

* Re: [PATCH v3 1/2] net: core: Notify on changes to dev->promiscuity.
From: Ido Schimmel @ 2019-09-08 10:15 UTC (permalink / raw)
  To: Allan W. Nielsen
  Cc: Jiri Pirko, David Miller, andrew, horatiu.vultur,
	alexandre.belloni, UNGLinuxDriver, ivecera, f.fainelli, netdev,
	linux-kernel
In-Reply-To: <20190903081410.zpcdm2dzqrxyg43c@lx-anielsen.microsemi.net>

On Tue, Sep 03, 2019 at 10:14:12AM +0200, Allan W. Nielsen wrote:
> The 09/03/2019 09:13, Ido Schimmel wrote:
> > On Mon, Sep 02, 2019 at 07:42:31PM +0200, Allan W. Nielsen wrote:
> > With these patches applied I assume I will see the following traffic
> > when running tcpdump on one of the netdevs exposed by the ocelot driver:
> > 
> > - Ingress: All
> > - Egress: Only locally generated traffic and traffic forwarded by the
> >   kernel from interfaces not belonging to the ocelot driver
> > 
> > The above means I will not see any offloaded traffic transmitted by the
> > port. Is that correct?
> Correct - but maybe we should change this.
> 
> In Ocelot and in LANxxxx (the part we are working on now), we can come pretty
> close. We can get the offloaded TX traffic to the CPU, but it will not be
> re-written (it will look like the ingress frame, which is not always the same as
> the egress frame, vlan tags an others may be re-written).

Yes, this is the same with mlxsw. You can trap the egress frames, but
they will reach the CPU unmodified via the ingress port.

> In some of our chips we can actually do this (not Ocelot, and not the LANxxxx
> part we are working on now) after the frame as been re-written.

Cool.

> > I see that the driver is setting 'offload_fwd_mark' for any traffic trapped
> > from bridged ports, which means the bridge will drop it before it traverses
> > the packet taps on egress.
> Correct.
> 
> > Large parts of the discussion revolve around the fact that switch ports
> > are not any different than other ports. Dave wrote "Please stop
> > portraying switches as special in this regard" and Andrew wrote "[The
> > user] just wants tcpdump to work like on their desktop."
> And we are trying to get as close to this as practical possible, knowing that it
> may not be exactly the same.
> 
> > But if anything, this discussion proves that switch ports are special in
> > this regard and that tcpdump will not work like on the desktop.
> I think it can come really close. Some drivers may be able to fix the TX issue
> you point out, others may not.
> 
> > Beside the fact that I don't agree (but gave up) with the new
> > interpretation of promisc mode, I wonder if we're not asking for trouble
> > with this patchset. Users will see all offloaded traffic on ingress, but
> > none of it on egress. This is in contrast to the sever/desktop, where
> > Linux is much more dominant in comparison to switches (let alone hw
> > accelerated ones) and where all the traffic is visible through tcpdump.
> > I can already see myself having to explain this over and over again to
> > confused users.
> > 
> > Now, I understand that showing egress traffic is inherently difficult.
> > It means one of two things:
> > 
> > 1. We allow packets to be forwarded by both the software and the
> > hardware
> > 2. We trap all ingressing traffic from all the ports
> If the HW cannot copy the egress traffic to the CPU (which our HW cannot), then
> you need to do both. All ingress traffic needs to go to the CPU, you need to
> make all the forwarding decisions in the CPU, to figure out what traffic happens
> to go to the port you want to monitor.
> 
> I really doubt this will work in real life. Too much traffic, and HW may make
> different forwarding decision that the SW (tc rules in HW but not in SW), which
> means that it will not be good for debugging anyway.

I agree.

> 
> > Both options can have devastating effects on the network and therefore
> > should not be triggered by a supposedly innocent invocation of tcpdump.
> Agree.
> 
> > I again wonder if it would not be wiser to solve this by introducing two
> > new flags to tcpdump for ingress/egress (similar to -Q in/out) capturing
> > of offloaded traffic. The capturing of egress offloaded traffic can be
> > documented with the appropriate warnings.
> Not sure I agree, but I will try to spend some more time considering it.
> 
> In the mean while, what TC action was it that Jiri suggestion we should use? The
> trap action is no good, and it prevents the forwarding in silicon, and I'm not
> aware of a "COPY-TO-CPU" action.

I agree. We would either need a new action or just extend the existing
one with a new attribute.

> > Anyway, I don't want to hold you up, I merely want to make sure that the
> > above (assuming it's correct) is considered before the patches are
> > applied.
> Sounds good, and thanks for all the time spend on reviewing and asking the
> critical questions.

Thanks for bringing up these issues. I will be happy to review future
patches.

^ permalink raw reply

* Re: [patch net-next v2 3/3] net: devlink: move reload fail indication to devlink core and expose to user
From: Ido Schimmel @ 2019-09-08 10:39 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev@vger.kernel.org, davem@davemloft.net, dsahern@gmail.com,
	jakub.kicinski@netronome.com, Tariq Toukan, mlxsw
In-Reply-To: <20190907205400.14589-4-jiri@resnulli.us>

On Sat, Sep 07, 2019 at 10:54:00PM +0200, Jiri Pirko wrote:
> diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h
> index 546e75dd74ac..7cb5e8c5ae0d 100644
> --- a/include/uapi/linux/devlink.h
> +++ b/include/uapi/linux/devlink.h
> @@ -410,6 +410,8 @@ enum devlink_attr {
>  	DEVLINK_ATTR_TRAP_METADATA,			/* nested */
>  	DEVLINK_ATTR_TRAP_GROUP_NAME,			/* string */
>  
> +	DEVLINK_ATTR_RELOAD_FAILED,			/* u8 0 or 1 */
> +
>  	/* add new attributes above here, update the policy in devlink.c */
>  
>  	__DEVLINK_ATTR_MAX,
> diff --git a/net/core/devlink.c b/net/core/devlink.c
> index 1e3a2288b0b2..e00a4a643d17 100644
> --- a/net/core/devlink.c
> +++ b/net/core/devlink.c
> @@ -471,6 +471,8 @@ static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
>  
>  	if (devlink_nl_put_handle(msg, devlink))
>  		goto nla_put_failure;
> +	if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed))

Why not use NLA_FLAG for this?

> +		goto nla_put_failure;
>  
>  	genlmsg_end(msg, hdr);
>  	return 0;
> @@ -2677,6 +2679,21 @@ static bool devlink_reload_supported(struct devlink *devlink)
>  	return devlink->ops->reload_down && devlink->ops->reload_up;
>  }

^ permalink raw reply

* Re: [PATCH 2/2] vhost: re-introducing metadata acceleration through kernel virtual address
From: Michael S. Tsirkin @ 2019-09-08 11:05 UTC (permalink / raw)
  To: Jason Wang
  Cc: kvm, virtualization, netdev, linux-kernel, jgg, aarcange, jglisse,
	linux-mm, James Bottomley, Christoph Hellwig, David Miller,
	linux-arm-kernel, linux-parisc
In-Reply-To: <20190905122736.19768-3-jasowang@redhat.com>

On Thu, Sep 05, 2019 at 08:27:36PM +0800, Jason Wang wrote:
> This is a rework on the commit 7f466032dc9e ("vhost: access vq
> metadata through kernel virtual address").
> 
> It was noticed that the copy_to/from_user() friends that was used to
> access virtqueue metdata tends to be very expensive for dataplane
> implementation like vhost since it involves lots of software checks,
> speculation barriers,

So if we drop the speculation barrier,
there's a problem here in that access will now be speculated.
This effectively disables the defence in depth effect of
b3bbfb3fb5d25776b8e3f361d2eedaabb0b496cd
    x86: Introduce __uaccess_begin_nospec() and uaccess_try_nospec


So now we need to sprinkle array_index_nospec or barrier_nospec over the
code whenever we use an index we got from userspace.
See below for some examples.


> hardware feature toggling (e.g SMAP). The
> extra cost will be more obvious when transferring small packets since
> the time spent on metadata accessing become more significant.
> 
> This patch tries to eliminate those overheads by accessing them
> through direct mapping of those pages. Invalidation callbacks is
> implemented for co-operation with general VM management (swap, KSM,
> THP or NUMA balancing). We will try to get the direct mapping of vq
> metadata before each round of packet processing if it doesn't
> exist. If we fail, we will simplely fallback to copy_to/from_user()
> friends.
> 
> This invalidation, direct mapping access and set are synchronized
> through spinlock. This takes a step back from the original commit
> 7f466032dc9e ("vhost: access vq metadata through kernel virtual
> address") which tries to RCU which is suspicious and hard to be
> reviewed. This won't perform as well as RCU because of the atomic,
> this could be addressed by the future optimization.
> 
> This method might does not work for high mem page which requires
> temporary mapping so we just fallback to normal
> copy_to/from_user() and may not for arch that has virtual tagged cache
> since extra cache flushing is needed to eliminate the alias. This will
> result complex logic and bad performance. For those archs, this patch
> simply go for copy_to/from_user() friends. This is done by ruling out
> kernel mapping codes through ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE.
> 
> Note that this is only done when device IOTLB is not enabled. We
> could use similar method to optimize IOTLB in the future.
> 
> Tests shows at most about 22% improvement on TX PPS when using
> virtio-user + vhost_net + xdp1 + TAP on 4.0GHz Kaby Lake.
> 
>         SMAP on | SMAP off
> Before: 4.9Mpps | 6.9Mpps
> After:  6.0Mpps | 7.5Mpps
> 
> On a elder CPU Sandy Bridge without SMAP support. TX PPS doesn't see
> any difference.

Why is not Kaby Lake with SMAP off the same as Sandy Bridge?


> Cc: Andrea Arcangeli <aarcange@redhat.com>
> Cc: James Bottomley <James.Bottomley@hansenpartnership.com>
> Cc: Christoph Hellwig <hch@infradead.org>
> Cc: David Miller <davem@davemloft.net>
> Cc: Jerome Glisse <jglisse@redhat.com>
> Cc: Jason Gunthorpe <jgg@mellanox.com>
> Cc: linux-mm@kvack.org
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-parisc@vger.kernel.org
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  drivers/vhost/vhost.c | 551 +++++++++++++++++++++++++++++++++++++++++-
>  drivers/vhost/vhost.h |  41 ++++
>  2 files changed, 589 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 791562e03fe0..f98155f28f02 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -298,6 +298,182 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
>  		__vhost_vq_meta_reset(d->vqs[i]);
>  }
>  
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +static void vhost_map_unprefetch(struct vhost_map *map)
> +{
> +	kfree(map->pages);
> +	kfree(map);
> +}
> +
> +static void vhost_set_map_dirty(struct vhost_virtqueue *vq,
> +				struct vhost_map *map, int index)
> +{
> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> +	int i;
> +
> +	if (uaddr->write) {
> +		for (i = 0; i < map->npages; i++)
> +			set_page_dirty(map->pages[i]);
> +	}
> +}
> +
> +static void vhost_uninit_vq_maps(struct vhost_virtqueue *vq)
> +{
> +	struct vhost_map *map[VHOST_NUM_ADDRS];
> +	int i;
> +
> +	spin_lock(&vq->mmu_lock);
> +	for (i = 0; i < VHOST_NUM_ADDRS; i++) {
> +		map[i] = vq->maps[i];
> +		if (map[i]) {
> +			vhost_set_map_dirty(vq, map[i], i);
> +			vq->maps[i] = NULL;
> +		}
> +	}
> +	spin_unlock(&vq->mmu_lock);
> +
> +	/* No need for synchronization since we are serialized with
> +	 * memory accessors (e.g vq mutex held).
> +	 */
> +
> +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> +		if (map[i])
> +			vhost_map_unprefetch(map[i]);
> +
> +}
> +
> +static void vhost_reset_vq_maps(struct vhost_virtqueue *vq)
> +{
> +	int i;
> +
> +	vhost_uninit_vq_maps(vq);
> +	for (i = 0; i < VHOST_NUM_ADDRS; i++)
> +		vq->uaddrs[i].size = 0;
> +}
> +
> +static bool vhost_map_range_overlap(struct vhost_uaddr *uaddr,
> +				     unsigned long start,
> +				     unsigned long end)
> +{
> +	if (unlikely(!uaddr->size))
> +		return false;
> +
> +	return !(end < uaddr->uaddr || start > uaddr->uaddr - 1 + uaddr->size);
> +}
> +
> +static void inline vhost_vq_access_map_begin(struct vhost_virtqueue *vq)
> +{
> +	spin_lock(&vq->mmu_lock);
> +}
> +
> +static void inline vhost_vq_access_map_end(struct vhost_virtqueue *vq)
> +{
> +	spin_unlock(&vq->mmu_lock);
> +}
> +
> +static int vhost_invalidate_vq_start(struct vhost_virtqueue *vq,
> +				     int index,
> +				     unsigned long start,
> +				     unsigned long end,
> +				     bool blockable)
> +{
> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> +	struct vhost_map *map;
> +
> +	if (!vhost_map_range_overlap(uaddr, start, end))
> +		return 0;
> +	else if (!blockable)
> +		return -EAGAIN;
> +
> +	spin_lock(&vq->mmu_lock);
> +	++vq->invalidate_count;
> +
> +	map = vq->maps[index];
> +	if (map)
> +		vq->maps[index] = NULL;
> +	spin_unlock(&vq->mmu_lock);
> +
> +	if (map) {
> +		vhost_set_map_dirty(vq, map, index);
> +		vhost_map_unprefetch(map);
> +	}
> +
> +	return 0;
> +}
> +
> +static void vhost_invalidate_vq_end(struct vhost_virtqueue *vq,
> +				    int index,
> +				    unsigned long start,
> +				    unsigned long end)
> +{
> +	if (!vhost_map_range_overlap(&vq->uaddrs[index], start, end))
> +		return;
> +
> +	spin_lock(&vq->mmu_lock);
> +	--vq->invalidate_count;
> +	spin_unlock(&vq->mmu_lock);
> +}
> +
> +static int vhost_invalidate_range_start(struct mmu_notifier *mn,
> +					const struct mmu_notifier_range *range)
> +{
> +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> +					     mmu_notifier);
> +	bool blockable = mmu_notifier_range_blockable(range);
> +	int i, j, ret;
> +
> +	for (i = 0; i < dev->nvqs; i++) {
> +		struct vhost_virtqueue *vq = dev->vqs[i];
> +
> +		for (j = 0; j < VHOST_NUM_ADDRS; j++) {
> +			ret = vhost_invalidate_vq_start(vq, j,
> +							range->start,
> +							range->end, blockable);
> +			if (ret)
> +				return ret;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static void vhost_invalidate_range_end(struct mmu_notifier *mn,
> +				       const struct mmu_notifier_range *range)
> +{
> +	struct vhost_dev *dev = container_of(mn, struct vhost_dev,
> +					     mmu_notifier);
> +	int i, j;
> +
> +	for (i = 0; i < dev->nvqs; i++) {
> +		struct vhost_virtqueue *vq = dev->vqs[i];
> +
> +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> +			vhost_invalidate_vq_end(vq, j,
> +						range->start,
> +						range->end);
> +	}
> +}
> +
> +static const struct mmu_notifier_ops vhost_mmu_notifier_ops = {
> +	.invalidate_range_start = vhost_invalidate_range_start,
> +	.invalidate_range_end = vhost_invalidate_range_end,
> +};
> +
> +static void vhost_init_maps(struct vhost_dev *dev)
> +{
> +	struct vhost_virtqueue *vq;
> +	int i, j;
> +
> +	dev->mmu_notifier.ops = &vhost_mmu_notifier_ops;
> +
> +	for (i = 0; i < dev->nvqs; ++i) {
> +		vq = dev->vqs[i];
> +		for (j = 0; j < VHOST_NUM_ADDRS; j++)
> +			vq->maps[j] = NULL;
> +	}
> +}
> +#endif
> +
>  static void vhost_vq_reset(struct vhost_dev *dev,
>  			   struct vhost_virtqueue *vq)
>  {
> @@ -326,7 +502,11 @@ static void vhost_vq_reset(struct vhost_dev *dev,
>  	vq->busyloop_timeout = 0;
>  	vq->umem = NULL;
>  	vq->iotlb = NULL;
> +	vq->invalidate_count = 0;
>  	__vhost_vq_meta_reset(vq);
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	vhost_reset_vq_maps(vq);
> +#endif
>  }
>  
>  static int vhost_worker(void *data)
> @@ -471,12 +651,15 @@ void vhost_dev_init(struct vhost_dev *dev,
>  	dev->iov_limit = iov_limit;
>  	dev->weight = weight;
>  	dev->byte_weight = byte_weight;
> +	dev->has_notifier = false;
>  	init_llist_head(&dev->work_list);
>  	init_waitqueue_head(&dev->wait);
>  	INIT_LIST_HEAD(&dev->read_list);
>  	INIT_LIST_HEAD(&dev->pending_list);
>  	spin_lock_init(&dev->iotlb_lock);
> -
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	vhost_init_maps(dev);
> +#endif
>  
>  	for (i = 0; i < dev->nvqs; ++i) {
>  		vq = dev->vqs[i];
> @@ -485,6 +668,7 @@ void vhost_dev_init(struct vhost_dev *dev,
>  		vq->heads = NULL;
>  		vq->dev = dev;
>  		mutex_init(&vq->mutex);
> +		spin_lock_init(&vq->mmu_lock);
>  		vhost_vq_reset(dev, vq);
>  		if (vq->handle_kick)
>  			vhost_poll_init(&vq->poll, vq->handle_kick,
> @@ -564,7 +748,19 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>  	if (err)
>  		goto err_cgroup;
>  
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	err = mmu_notifier_register(&dev->mmu_notifier, dev->mm);
> +	if (err)
> +		goto err_mmu_notifier;
> +#endif
> +	dev->has_notifier = true;
> +
>  	return 0;
> +
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +err_mmu_notifier:
> +	vhost_dev_free_iovecs(dev);
> +#endif
>  err_cgroup:
>  	kthread_stop(worker);
>  	dev->worker = NULL;
> @@ -655,6 +851,107 @@ static void vhost_clear_msg(struct vhost_dev *dev)
>  	spin_unlock(&dev->iotlb_lock);
>  }
>  
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +static void vhost_setup_uaddr(struct vhost_virtqueue *vq,
> +			      int index, unsigned long uaddr,
> +			      size_t size, bool write)
> +{
> +	struct vhost_uaddr *addr = &vq->uaddrs[index];
> +
> +	addr->uaddr = uaddr;
> +	addr->size = size;
> +	addr->write = write;
> +}
> +
> +static void vhost_setup_vq_uaddr(struct vhost_virtqueue *vq)
> +{
> +	vhost_setup_uaddr(vq, VHOST_ADDR_DESC,
> +			  (unsigned long)vq->desc,
> +			  vhost_get_desc_size(vq, vq->num),
> +			  false);
> +	vhost_setup_uaddr(vq, VHOST_ADDR_AVAIL,
> +			  (unsigned long)vq->avail,
> +			  vhost_get_avail_size(vq, vq->num),
> +			  false);
> +	vhost_setup_uaddr(vq, VHOST_ADDR_USED,
> +			  (unsigned long)vq->used,
> +			  vhost_get_used_size(vq, vq->num),
> +			  true);
> +}
> +
> +static int vhost_map_prefetch(struct vhost_virtqueue *vq,
> +			       int index)
> +{
> +	struct vhost_map *map;
> +	struct vhost_uaddr *uaddr = &vq->uaddrs[index];
> +	struct page **pages;
> +	int npages = DIV_ROUND_UP(uaddr->size, PAGE_SIZE);
> +	int npinned;
> +	void *vaddr, *v;
> +	int err;
> +	int i;
> +
> +	spin_lock(&vq->mmu_lock);
> +
> +	err = -EFAULT;
> +	if (vq->invalidate_count)
> +		goto err;
> +
> +	err = -ENOMEM;
> +	map = kmalloc(sizeof(*map), GFP_ATOMIC);
> +	if (!map)
> +		goto err;
> +
> +	pages = kmalloc_array(npages, sizeof(struct page *), GFP_ATOMIC);
> +	if (!pages)
> +		goto err_pages;
> +
> +	err = EFAULT;
> +	npinned = __get_user_pages_fast(uaddr->uaddr, npages,
> +					uaddr->write, pages);
> +	if (npinned > 0)
> +		release_pages(pages, npinned);
> +	if (npinned != npages)
> +		goto err_gup;
> +
> +	for (i = 0; i < npinned; i++)
> +		if (PageHighMem(pages[i]))
> +			goto err_gup;
> +
> +	vaddr = v = page_address(pages[0]);
> +
> +	/* For simplicity, fallback to userspace address if VA is not
> +	 * contigious.
> +	 */
> +	for (i = 1; i < npinned; i++) {
> +		v += PAGE_SIZE;
> +		if (v != page_address(pages[i]))
> +			goto err_gup;
> +	}
> +
> +	map->addr = vaddr + (uaddr->uaddr & (PAGE_SIZE - 1));
> +	map->npages = npages;
> +	map->pages = pages;
> +
> +	vq->maps[index] = map;
> +	/* No need for a synchronize_rcu(). This function should be
> +	 * called by dev->worker so we are serialized with all
> +	 * readers.
> +	 */
> +	spin_unlock(&vq->mmu_lock);
> +
> +	return 0;
> +
> +err_gup:
> +	kfree(pages);
> +err_pages:
> +	kfree(map);
> +err:
> +	spin_unlock(&vq->mmu_lock);
> +	return err;
> +}
> +#endif
> +
>  void vhost_dev_cleanup(struct vhost_dev *dev)
>  {
>  	int i;
> @@ -684,8 +981,20 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
>  		kthread_stop(dev->worker);
>  		dev->worker = NULL;
>  	}
> -	if (dev->mm)
> +	if (dev->mm) {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +		if (dev->has_notifier) {
> +			mmu_notifier_unregister(&dev->mmu_notifier,
> +						dev->mm);
> +			dev->has_notifier = false;
> +		}
> +#endif
>  		mmput(dev->mm);
> +	}
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	for (i = 0; i < dev->nvqs; i++)
> +		vhost_uninit_vq_maps(dev->vqs[i]);
> +#endif
>  	dev->mm = NULL;
>  }
>  EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> @@ -914,6 +1223,26 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
>  
>  static inline int vhost_put_avail_event(struct vhost_virtqueue *vq)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			*((__virtio16 *)&used->ring[vq->num]) =
> +				cpu_to_vhost16(vq, vq->avail_idx);
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->avail_idx),
>  			      vhost_avail_event(vq));
>  }
> @@ -922,6 +1251,27 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>  				 struct vring_used_elem *head, int idx,
>  				 int count)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +	size_t size;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			size = count * sizeof(*head);
> +			memcpy(used->ring + idx, head, size);
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_copy_to_user(vq, vq->used->ring + idx, head,
>  				  count * sizeof(*head));
>  }
> @@ -929,6 +1279,25 @@ static inline int vhost_put_used(struct vhost_virtqueue *vq,
>  static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>  
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			used->flags = cpu_to_vhost16(vq, vq->used_flags);
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->used_flags),
>  			      &vq->used->flags);
>  }
> @@ -936,6 +1305,25 @@ static inline int vhost_put_used_flags(struct vhost_virtqueue *vq)
>  static inline int vhost_put_used_idx(struct vhost_virtqueue *vq)
>  
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			used->idx = cpu_to_vhost16(vq, vq->last_used_idx);
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_put_user(vq, cpu_to_vhost16(vq, vq->last_used_idx),
>  			      &vq->used->idx);
>  }
> @@ -981,12 +1369,50 @@ static void vhost_dev_unlock_vqs(struct vhost_dev *d)
>  static inline int vhost_get_avail_idx(struct vhost_virtqueue *vq,
>  				      __virtio16 *idx)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_avail *avail;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*idx = avail->idx;

index can now be speculated.

> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_avail(vq, *idx, &vq->avail->idx);
>  }
>  
>  static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>  				       __virtio16 *head, int idx)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_avail *avail;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*head = avail->ring[idx & (vq->num - 1)];


Since idx can be speculated, I guess we need array_index_nospec here?


> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_avail(vq, *head,
>  			       &vq->avail->ring[idx & (vq->num - 1)]);
>  }
> @@ -994,24 +1420,98 @@ static inline int vhost_get_avail_head(struct vhost_virtqueue *vq,
>  static inline int vhost_get_avail_flags(struct vhost_virtqueue *vq,
>  					__virtio16 *flags)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_avail *avail;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*flags = avail->flags;
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_avail(vq, *flags, &vq->avail->flags);
>  }
>  
>  static inline int vhost_get_used_event(struct vhost_virtqueue *vq,
>  				       __virtio16 *event)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_avail *avail;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +		map = vq->maps[VHOST_ADDR_AVAIL];
> +		if (likely(map)) {
> +			avail = map->addr;
> +			*event = (__virtio16)avail->ring[vq->num];
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_avail(vq, *event, vhost_used_event(vq));
>  }
>  
>  static inline int vhost_get_used_idx(struct vhost_virtqueue *vq,
>  				     __virtio16 *idx)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_used *used;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_USED];
> +		if (likely(map)) {
> +			used = map->addr;
> +			*idx = used->idx;
> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_get_used(vq, *idx, &vq->used->idx);
>  }


This seems to be used during init. Why do we bother
accelerating this?


>  
>  static inline int vhost_get_desc(struct vhost_virtqueue *vq,
>  				 struct vring_desc *desc, int idx)
>  {
> +#if VHOST_ARCH_CAN_ACCEL_UACCESS
> +	struct vhost_map *map;
> +	struct vring_desc *d;
> +
> +	if (!vq->iotlb) {
> +		vhost_vq_access_map_begin(vq);
> +
> +		map = vq->maps[VHOST_ADDR_DESC];
> +		if (likely(map)) {
> +			d = map->addr;
> +			*desc = *(d + idx);


Since idx can be speculated, I guess we need array_index_nospec here?


> +			vhost_vq_access_map_end(vq);
> +			return 0;
> +		}
> +
> +		vhost_vq_access_map_end(vq);
> +	}
> +#endif
> +
>  	return vhost_copy_from_user(vq, desc, vq->desc + idx, sizeof(*desc));
>  }
>  

I also wonder about the userspace address we get eventually.
It would seem that we need to prevent that from speculating -
and that seems like a good idea even if this patch isn't
applied. As you are playing with micro-benchmarks, maybe
you could try the below patch?
It's unfortunately untested.
Thanks a lot in advance!

===>
vhost: block speculation of translated descriptors

iovec addresses coming from vhost are assumed to be
pre-validated, but in fact can be speculated to a value
out of range.

Userspace addresses are later validated with array_index_nospec so we can
be sure kernel info does not leak through these addresses, but vhost
must also not leak userspace info outside the allowed memory table to
guests.

Following the defence in depth principle, make sure
the address is not validated out of node range.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

---


diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 5dc174ac8cac..863e25011ef6 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2072,7 +2076,9 @@ static int translate_desc(struct vhost_virtqueue *vq, u64 addr, u32 len,
 		size = node->size - addr + node->start;
 		_iov->iov_len = min((u64)len - s, size);
 		_iov->iov_base = (void __user *)(unsigned long)
-			(node->userspace_addr + addr - node->start);
+			(node->userspace_addr +
+			 array_index_nospec(addr - node->start,
+					    node->size));
 		s += size;
 		addr += size;
 		++ret;

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox