- * [PATCH net-next v4 1/3] net: flow_dissector: extend bpf flow dissector support with vnet hdr
  2021-06-08 17:02 [PATCH net-next v4 0/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb Tanner Love
@ 2021-06-08 17:02 ` Tanner Love
  2021-06-08 22:08   ` Martin KaFai Lau
  2021-06-08 17:02 ` [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb Tanner Love
  2021-06-08 17:02 ` [PATCH net-next v4 3/3] selftests/net: amend bpf flow dissector prog to do vnet hdr validation Tanner Love
  2 siblings, 1 reply; 27+ messages in thread
From: Tanner Love @ 2021-06-08 17:02 UTC (permalink / raw)
  To: netdev
  Cc: davem, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eric Dumazet, Willem de Bruijn, Petar Penkov, Jakub Kicinski,
	Michael S . Tsirkin, Jason Wang, Tanner Love, Stanislav Fomichev
From: Tanner Love <tannerlove@google.com>
Amend the bpf flow dissector program type to be able to process
virtio-net headers. Do this to enable bpf flow dissector programs to
perform virtio-net header validation. The next patch in this series
will add a flow dissection hook in virtio_net_hdr_to_skb and make use
of this extended functionality. That commit message has more
background on the use case.
Add two new members to struct bpf_flow_keys: a pointer to struct
virtio_net_hdr, and vhdr_is_little_endian. The latter is required to
inform the BPF program of the endianness of the virtio-net header
fields, to handle the case of a version 1+ header on a big endian
machine.
Changes
v4:
  - Add virtio_net_hdr pointer to struct bpf_flow_keys
  - Add vhdr_is_little_endian to struct bpf_flow_keys
v2:
  - Describe parameter vhdr in __skb_flow_dissect documentation
Signed-off-by: Tanner Love <tannerlove@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Reviewed-by: Petar Penkov <ppenkov@google.com>
Reviewed-by: Stanislav Fomichev <sdf@google.com>
---
 drivers/net/bonding/bond_main.c |  2 +-
 include/linux/bpf.h             |  2 ++
 include/linux/skbuff.h          | 35 +++++++++++++++++++-----
 include/uapi/linux/bpf.h        |  2 ++
 kernel/bpf/verifier.c           | 48 +++++++++++++++++++++++++++++++--
 net/bpf/test_run.c              |  2 +-
 net/core/filter.c               | 26 ++++++++++++++++++
 net/core/flow_dissector.c       | 16 ++++++++---
 tools/include/uapi/linux/bpf.h  |  2 ++
 9 files changed, 120 insertions(+), 15 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index eb79a9f05914..36993636d56d 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3554,7 +3554,7 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	case BOND_XMIT_POLICY_ENCAP34:
 		memset(fk, 0, sizeof(*fk));
 		return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
-					  fk, NULL, 0, 0, 0, 0);
+					  fk, NULL, 0, 0, 0, 0, NULL, false);
 	default:
 		break;
 	}
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9dc44ba97584..a333e6177de1 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -430,6 +430,8 @@ enum bpf_reg_type {
 	PTR_TO_PERCPU_BTF_ID,	 /* reg points to a percpu kernel variable */
 	PTR_TO_FUNC,		 /* reg points to a bpf program function */
 	PTR_TO_MAP_KEY,		 /* reg points to a map element key */
+	PTR_TO_VNET_HDR,	 /* reg points to struct virtio_net_hdr */
+	PTR_TO_VNET_HDR_OR_NULL, /* reg points to virtio_net_hdr or NULL */
 	__BPF_REG_TYPE_MAX,
 };
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b2db9cd9a73f..4e390cd8f72a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1314,21 +1314,27 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
 			     unsigned int key_count);
 
 struct bpf_flow_dissector;
+struct virtio_net_hdr;
 bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
-		      __be16 proto, int nhoff, int hlen, unsigned int flags);
+		      __be16 proto, int nhoff, int hlen, unsigned int flags,
+		      const struct virtio_net_hdr *vhdr,
+		      bool vhdr_is_little_endian);
 
 bool __skb_flow_dissect(const struct net *net,
 			const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container, const void *data,
-			__be16 proto, int nhoff, int hlen, unsigned int flags);
+			__be16 proto, int nhoff, int hlen, unsigned int flags,
+			const struct virtio_net_hdr *vhdr,
+			bool vhdr_is_little_endian);
 
 static inline bool skb_flow_dissect(const struct sk_buff *skb,
 				    struct flow_dissector *flow_dissector,
 				    void *target_container, unsigned int flags)
 {
 	return __skb_flow_dissect(NULL, skb, flow_dissector,
-				  target_container, NULL, 0, 0, 0, flags);
+				  target_container, NULL, 0, 0, 0, flags, NULL,
+				  false);
 }
 
 static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
@@ -1337,7 +1343,22 @@ static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
 {
 	memset(flow, 0, sizeof(*flow));
 	return __skb_flow_dissect(NULL, skb, &flow_keys_dissector,
-				  flow, NULL, 0, 0, 0, flags);
+				  flow, NULL, 0, 0, 0, flags, NULL, false);
+}
+
+static inline bool
+__skb_flow_dissect_flow_keys_basic(const struct net *net,
+				   const struct sk_buff *skb,
+				   struct flow_keys_basic *flow,
+				   const void *data, __be16 proto,
+				   int nhoff, int hlen, unsigned int flags,
+				   const struct virtio_net_hdr *vhdr,
+				   bool vhdr_is_little_endian)
+{
+	memset(flow, 0, sizeof(*flow));
+	return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow,
+				  data, proto, nhoff, hlen, flags, vhdr,
+				  vhdr_is_little_endian);
 }
 
 static inline bool
@@ -1347,9 +1368,9 @@ skb_flow_dissect_flow_keys_basic(const struct net *net,
 				 const void *data, __be16 proto,
 				 int nhoff, int hlen, unsigned int flags)
 {
-	memset(flow, 0, sizeof(*flow));
-	return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow,
-				  data, proto, nhoff, hlen, flags);
+	return __skb_flow_dissect_flow_keys_basic(net, skb, flow, data, proto,
+						  nhoff, hlen, flags, NULL,
+						  false);
 }
 
 void skb_flow_dissect_meta(const struct sk_buff *skb,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 418b9b813d65..e1ac34548f9a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -6017,6 +6017,8 @@ struct bpf_flow_keys {
 	};
 	__u32	flags;
 	__be32	flow_label;
+	__bpf_md_ptr(const struct virtio_net_hdr *, vhdr);
+	__u8	vhdr_is_little_endian;
 };
 
 struct bpf_func_info {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 331b170d9fcc..2962b537da28 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -22,6 +22,7 @@
 #include <linux/error-injection.h>
 #include <linux/bpf_lsm.h>
 #include <linux/btf_ids.h>
+#include <linux/virtio_net.h>
 
 #include "disasm.h"
 
@@ -441,7 +442,8 @@ static bool reg_type_not_null(enum bpf_reg_type type)
 		type == PTR_TO_TCP_SOCK ||
 		type == PTR_TO_MAP_VALUE ||
 		type == PTR_TO_MAP_KEY ||
-		type == PTR_TO_SOCK_COMMON;
+		type == PTR_TO_SOCK_COMMON ||
+		type == PTR_TO_VNET_HDR;
 }
 
 static bool reg_type_may_be_null(enum bpf_reg_type type)
@@ -453,7 +455,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
 	       type == PTR_TO_BTF_ID_OR_NULL ||
 	       type == PTR_TO_MEM_OR_NULL ||
 	       type == PTR_TO_RDONLY_BUF_OR_NULL ||
-	       type == PTR_TO_RDWR_BUF_OR_NULL;
+	       type == PTR_TO_RDWR_BUF_OR_NULL ||
+	       type == PTR_TO_VNET_HDR_OR_NULL;
 }
 
 static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
@@ -576,6 +579,8 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
 	[PTR_TO_FUNC]		= "func",
 	[PTR_TO_MAP_KEY]	= "map_key",
+	[PTR_TO_VNET_HDR]	= "virtio_net_hdr",
+	[PTR_TO_VNET_HDR_OR_NULL] = "virtio_net_hdr_or_null",
 };
 
 static char slot_type_char[] = {
@@ -1166,6 +1171,9 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
 	case PTR_TO_RDWR_BUF_OR_NULL:
 		reg->type = PTR_TO_RDWR_BUF;
 		break;
+	case PTR_TO_VNET_HDR_OR_NULL:
+		reg->type = PTR_TO_VNET_HDR;
+		break;
 	default:
 		WARN_ONCE(1, "unknown nullable register type");
 	}
@@ -2528,6 +2536,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_MEM_OR_NULL:
 	case PTR_TO_FUNC:
 	case PTR_TO_MAP_KEY:
+	case PTR_TO_VNET_HDR:
+	case PTR_TO_VNET_HDR_OR_NULL:
 		return true;
 	default:
 		return false;
@@ -3384,6 +3394,18 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
 	return 0;
 }
 
+static int check_virtio_net_hdr_access(struct bpf_verifier_env *env, int off,
+				       int size)
+{
+	if (size < 0 || off < 0 ||
+	    (u64)off + size > sizeof(struct virtio_net_hdr)) {
+		verbose(env, "invalid access to virtio_net_hdr off=%d size=%d\n",
+			off, size);
+		return -EACCES;
+	}
+	return 0;
+}
+
 static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
 			     u32 regno, int off, int size,
 			     enum bpf_access_type t)
@@ -3568,6 +3590,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	case PTR_TO_XDP_SOCK:
 		pointer_desc = "xdp_sock ";
 		break;
+	case PTR_TO_VNET_HDR:
+		pointer_desc = "virtio_net_hdr ";
+		break;
 	default:
 		break;
 	}
@@ -4218,6 +4243,23 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 		}
 
 		err = check_flow_keys_access(env, off, size);
+		if (!err && t == BPF_READ && value_regno >= 0) {
+			if (off == offsetof(struct bpf_flow_keys, vhdr)) {
+				regs[value_regno].type = PTR_TO_VNET_HDR_OR_NULL;
+				/* required for dropping or_null */
+				regs[value_regno].id = ++env->id_gen;
+			} else {
+				mark_reg_unknown(env, regs, value_regno);
+			}
+		}
+	} else if (reg->type == PTR_TO_VNET_HDR) {
+		if (t == BPF_WRITE) {
+			verbose(env, "R%d cannot write into %s\n",
+				regno, reg_type_str[reg->type]);
+			return -EACCES;
+		}
+
+		err = check_virtio_net_hdr_access(env, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
 			mark_reg_unknown(env, regs, value_regno);
 	} else if (type_is_sk_pointer(reg->type)) {
@@ -9989,6 +10031,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_TCP_SOCK:
 	case PTR_TO_TCP_SOCK_OR_NULL:
 	case PTR_TO_XDP_SOCK:
+	case PTR_TO_VNET_HDR:
+	case PTR_TO_VNET_HDR_OR_NULL:
 		/* Only valid matches are exact, which memcmp() above
 		 * would have accepted
 		 */
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index aa47af349ba8..a11c5ce99ccb 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -797,7 +797,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
 	bpf_test_timer_enter(&t);
 	do {
 		retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
-					  size, flags);
+					  size, flags, NULL, false);
 	} while (bpf_test_timer_continue(&t, repeat, &ret, &duration));
 	bpf_test_timer_leave(&t);
 
diff --git a/net/core/filter.c b/net/core/filter.c
index 239de1306de9..58a8a43380ee 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -8358,6 +8358,8 @@ static bool flow_dissector_is_valid_access(int off, int size,
 			return false;
 		info->reg_type = PTR_TO_FLOW_KEYS;
 		return true;
+	case bpf_ctx_range(struct __sk_buff, len):
+		return size == size_default;
 	default:
 		return false;
 	}
@@ -8390,6 +8392,30 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
 				      si->dst_reg, si->src_reg,
 				      offsetof(struct bpf_flow_dissector, flow_keys));
 		break;
+
+	case offsetof(struct __sk_buff, len):
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, skb),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_flow_dissector, skb));
+		*insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, 0, 4);
+		/* bpf_flow_dissector->skb == NULL */
+		/* dst_reg = bpf_flow_dissector->data_end */
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end),
+				      si->dst_reg, si->src_reg,
+				      offsetof(struct bpf_flow_dissector, data_end));
+		/* TMP = bpf_flow_dissector->data */
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data),
+				      BPF_REG_TMP, si->src_reg,
+				      offsetof(struct bpf_flow_dissector, data));
+		/* dst_reg -= bpf_flow_dissector->data */
+		*insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_TMP);
+		*insn++ = BPF_JMP_A(1);
+		/* bpf_flow_dissector->skb != NULL */
+		/* bpf_flow_dissector->skb->len */
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+				      si->dst_reg, si->dst_reg,
+				      offsetof(struct sk_buff, len));
+		break;
 	}
 
 	return insn - insn_buf;
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 3ed7c98a98e1..4bdad2b1d3a0 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -28,6 +28,7 @@
 #include <scsi/fc/fc_fcoe.h>
 #include <uapi/linux/batadv_packet.h>
 #include <linux/bpf.h>
+#include <linux/virtio_net.h>
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack_labels.h>
@@ -864,7 +865,9 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
 }
 
 bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
-		      __be16 proto, int nhoff, int hlen, unsigned int flags)
+		      __be16 proto, int nhoff, int hlen, unsigned int flags,
+		      const struct virtio_net_hdr *vhdr,
+		      bool vhdr_is_little_endian)
 {
 	struct bpf_flow_keys *flow_keys = ctx->flow_keys;
 	u32 result;
@@ -874,6 +877,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
 	flow_keys->n_proto = proto;
 	flow_keys->nhoff = nhoff;
 	flow_keys->thoff = flow_keys->nhoff;
+	flow_keys->vhdr = vhdr;
+	flow_keys->vhdr_is_little_endian = vhdr_is_little_endian;
 
 	BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG !=
 		     (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG);
@@ -915,7 +920,9 @@ bool __skb_flow_dissect(const struct net *net,
 			const struct sk_buff *skb,
 			struct flow_dissector *flow_dissector,
 			void *target_container, const void *data,
-			__be16 proto, int nhoff, int hlen, unsigned int flags)
+			__be16 proto, int nhoff, int hlen, unsigned int flags,
+			const struct virtio_net_hdr *vhdr,
+			bool vhdr_is_little_endian)
 {
 	struct flow_dissector_key_control *key_control;
 	struct flow_dissector_key_basic *key_basic;
@@ -1012,7 +1019,8 @@ bool __skb_flow_dissect(const struct net *net,
 
 			prog = READ_ONCE(run_array->items[0].prog);
 			ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
-					       hlen, flags);
+					       hlen, flags, vhdr,
+					       vhdr_is_little_endian);
 			__skb_flow_bpf_to_target(&flow_keys, flow_dissector,
 						 target_container);
 			rcu_read_unlock();
@@ -1610,7 +1618,7 @@ u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
 	memset(&keys, 0, sizeof(keys));
 	__skb_flow_dissect(NULL, skb, &flow_keys_dissector_symmetric,
 			   &keys, NULL, 0, 0, 0,
-			   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+			   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL, NULL, false);
 
 	return __flow_hash_from_keys(&keys, &hashrnd);
 }
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 418b9b813d65..e1ac34548f9a 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -6017,6 +6017,8 @@ struct bpf_flow_keys {
 	};
 	__u32	flags;
 	__be32	flow_label;
+	__bpf_md_ptr(const struct virtio_net_hdr *, vhdr);
+	__u8	vhdr_is_little_endian;
 };
 
 struct bpf_func_info {
-- 
2.32.0.rc1.229.g3e70b5a671-goog
^ permalink raw reply related	[flat|nested] 27+ messages in thread
- * Re: [PATCH net-next v4 1/3] net: flow_dissector: extend bpf flow dissector support with vnet hdr
  2021-06-08 17:02 ` [PATCH net-next v4 1/3] net: flow_dissector: extend bpf flow dissector support with vnet hdr Tanner Love
@ 2021-06-08 22:08   ` Martin KaFai Lau
       [not found]     ` <CAOckAf8W04ynA4iXXzBe8kB_yauH9TKEJ_o6tt9tQuTJBx-G6A@mail.gmail.com>
  0 siblings, 1 reply; 27+ messages in thread
From: Martin KaFai Lau @ 2021-06-08 22:08 UTC (permalink / raw)
  To: Tanner Love
  Cc: netdev, davem, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Eric Dumazet, Willem de Bruijn, Petar Penkov,
	Jakub Kicinski, Michael S . Tsirkin, Jason Wang, Tanner Love,
	Stanislav Fomichev
On Tue, Jun 08, 2021 at 01:02:22PM -0400, Tanner Love wrote:
[ ... ]
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 9dc44ba97584..a333e6177de1 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -430,6 +430,8 @@ enum bpf_reg_type {
>  	PTR_TO_PERCPU_BTF_ID,	 /* reg points to a percpu kernel variable */
>  	PTR_TO_FUNC,		 /* reg points to a bpf program function */
>  	PTR_TO_MAP_KEY,		 /* reg points to a map element key */
> +	PTR_TO_VNET_HDR,	 /* reg points to struct virtio_net_hdr */
> +	PTR_TO_VNET_HDR_OR_NULL, /* reg points to virtio_net_hdr or NULL */
>  	__BPF_REG_TYPE_MAX,
>  };
>
[ ... ]
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 418b9b813d65..e1ac34548f9a 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -6017,6 +6017,8 @@ struct bpf_flow_keys {
>  	};
>  	__u32	flags;
>  	__be32	flow_label;
> +	__bpf_md_ptr(const struct virtio_net_hdr *, vhdr);
> +	__u8	vhdr_is_little_endian;
>  };
>  
>  struct bpf_func_info {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 331b170d9fcc..2962b537da28 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -22,6 +22,7 @@
>  #include <linux/error-injection.h>
>  #include <linux/bpf_lsm.h>
>  #include <linux/btf_ids.h>
> +#include <linux/virtio_net.h>
>  
>  #include "disasm.h"
>  
> @@ -441,7 +442,8 @@ static bool reg_type_not_null(enum bpf_reg_type type)
>  		type == PTR_TO_TCP_SOCK ||
>  		type == PTR_TO_MAP_VALUE ||
>  		type == PTR_TO_MAP_KEY ||
> -		type == PTR_TO_SOCK_COMMON;
> +		type == PTR_TO_SOCK_COMMON ||
> +		type == PTR_TO_VNET_HDR;
>  }
>  
>  static bool reg_type_may_be_null(enum bpf_reg_type type)
> @@ -453,7 +455,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
>  	       type == PTR_TO_BTF_ID_OR_NULL ||
>  	       type == PTR_TO_MEM_OR_NULL ||
>  	       type == PTR_TO_RDONLY_BUF_OR_NULL ||
> -	       type == PTR_TO_RDWR_BUF_OR_NULL;
> +	       type == PTR_TO_RDWR_BUF_OR_NULL ||
> +	       type == PTR_TO_VNET_HDR_OR_NULL;
>  }
>  
>  static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
> @@ -576,6 +579,8 @@ static const char * const reg_type_str[] = {
>  	[PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
>  	[PTR_TO_FUNC]		= "func",
>  	[PTR_TO_MAP_KEY]	= "map_key",
> +	[PTR_TO_VNET_HDR]	= "virtio_net_hdr",
> +	[PTR_TO_VNET_HDR_OR_NULL] = "virtio_net_hdr_or_null",
>  };
>  
>  static char slot_type_char[] = {
> @@ -1166,6 +1171,9 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
>  	case PTR_TO_RDWR_BUF_OR_NULL:
>  		reg->type = PTR_TO_RDWR_BUF;
>  		break;
> +	case PTR_TO_VNET_HDR_OR_NULL:
> +		reg->type = PTR_TO_VNET_HDR;
> +		break;
>  	default:
>  		WARN_ONCE(1, "unknown nullable register type");
>  	}
> @@ -2528,6 +2536,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
>  	case PTR_TO_MEM_OR_NULL:
>  	case PTR_TO_FUNC:
>  	case PTR_TO_MAP_KEY:
> +	case PTR_TO_VNET_HDR:
> +	case PTR_TO_VNET_HDR_OR_NULL:
>  		return true;
>  	default:
>  		return false;
> @@ -3384,6 +3394,18 @@ static int check_flow_keys_access(struct bpf_verifier_env *env, int off,
>  	return 0;
>  }
>  
> +static int check_virtio_net_hdr_access(struct bpf_verifier_env *env, int off,
> +				       int size)
> +{
> +	if (size < 0 || off < 0 ||
> +	    (u64)off + size > sizeof(struct virtio_net_hdr)) {
> +		verbose(env, "invalid access to virtio_net_hdr off=%d size=%d\n",
> +			off, size);
> +		return -EACCES;
> +	}
> +	return 0;
> +}
> +
>  static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
>  			     u32 regno, int off, int size,
>  			     enum bpf_access_type t)
> @@ -3568,6 +3590,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
>  	case PTR_TO_XDP_SOCK:
>  		pointer_desc = "xdp_sock ";
>  		break;
> +	case PTR_TO_VNET_HDR:
> +		pointer_desc = "virtio_net_hdr ";
> +		break;
>  	default:
>  		break;
>  	}
> @@ -4218,6 +4243,23 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>  		}
>  
>  		err = check_flow_keys_access(env, off, size);
> +		if (!err && t == BPF_READ && value_regno >= 0) {
> +			if (off == offsetof(struct bpf_flow_keys, vhdr)) {
> +				regs[value_regno].type = PTR_TO_VNET_HDR_OR_NULL;
check_flow_keys_access() needs some modifications
1. What if t == BPF_WRITE?  I think "keys->vhdr = 0xdead" has to be rejected.
   
2. It needs to check loading keys->vhdr is in sizeof(__u64) like other
   pointer loading does.  Take a look at the flow_keys case in
   flow_dissector_is_valid_access().
It also needs to convert the pointer loading like how
flow_dissector_convert_ctx_access() does on flow_keys.
A high level design question.  "struct virtio_net_hdr" is in uapi and
there is no need to do convert_ctx.  I think using PTR_TO_BTF_ID_OR_NULL
will be easier here and the new PTR_TO_VNET_HDR* related changes will go away.
The "else if (reg->type == PTR_TO_CTX)" case earlier could be a good example.
To get the btf_id for "struct virtio_net_hdr", take a look at
the BTF_ID_LIST_SINGLE() usage in filter.c
> +				/* required for dropping or_null */
> +				regs[value_regno].id = ++env->id_gen;
> +			} else {
> +				mark_reg_unknown(env, regs, value_regno);
> +			}
> +		}
> +	} else if (reg->type == PTR_TO_VNET_HDR) {
> +		if (t == BPF_WRITE) {
> +			verbose(env, "R%d cannot write into %s\n",
> +				regno, reg_type_str[reg->type]);
> +			return -EACCES;
> +		}
> +
> +		err = check_virtio_net_hdr_access(env, off, size);
>  		if (!err && t == BPF_READ && value_regno >= 0)
>  			mark_reg_unknown(env, regs, value_regno);
>  	} else if (type_is_sk_pointer(reg->type)) {
[ ... ]
> @@ -8390,6 +8392,30 @@ static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
>  				      si->dst_reg, si->src_reg,
>  				      offsetof(struct bpf_flow_dissector, flow_keys));
>  		break;
> +
> +	case offsetof(struct __sk_buff, len):
> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, skb),
> +				      si->dst_reg, si->src_reg,
> +				      offsetof(struct bpf_flow_dissector, skb));
> +		*insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, 0, 4);
> +		/* bpf_flow_dissector->skb == NULL */
> +		/* dst_reg = bpf_flow_dissector->data_end */
> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end),
> +				      si->dst_reg, si->src_reg,
> +				      offsetof(struct bpf_flow_dissector, data_end));
> +		/* TMP = bpf_flow_dissector->data */
> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data),
> +				      BPF_REG_TMP, si->src_reg,
I don't think BPF_REG_TMP can be used here.  My understanding is that is for
classic bpf.  Try BPF_REG_AX instead.
It will be a good idea to cover this case if it has not been done in patch 3.
> +				      offsetof(struct bpf_flow_dissector, data));
> +		/* dst_reg -= bpf_flow_dissector->data */
> +		*insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_TMP);
> +		*insn++ = BPF_JMP_A(1);
> +		/* bpf_flow_dissector->skb != NULL */
> +		/* bpf_flow_dissector->skb->len */
> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
> +				      si->dst_reg, si->dst_reg,
> +				      offsetof(struct sk_buff, len));
> +		break;
>  	}
>  
>  	return insn - insn_buf;
^ permalink raw reply	[flat|nested] 27+ messages in thread
 
- * [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-08 17:02 [PATCH net-next v4 0/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb Tanner Love
  2021-06-08 17:02 ` [PATCH net-next v4 1/3] net: flow_dissector: extend bpf flow dissector support with vnet hdr Tanner Love
@ 2021-06-08 17:02 ` Tanner Love
  2021-06-10  3:09   ` Jason Wang
  2021-06-08 17:02 ` [PATCH net-next v4 3/3] selftests/net: amend bpf flow dissector prog to do vnet hdr validation Tanner Love
  2 siblings, 1 reply; 27+ messages in thread
From: Tanner Love @ 2021-06-08 17:02 UTC (permalink / raw)
  To: netdev
  Cc: davem, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eric Dumazet, Willem de Bruijn, Petar Penkov, Jakub Kicinski,
	Michael S . Tsirkin, Jason Wang, Tanner Love
From: Tanner Love <tannerlove@google.com>
Syzkaller bugs have resulted from loose specification of
virtio_net_hdr[1]. Enable execution of a BPF flow dissector program
in virtio_net_hdr_to_skb to validate the vnet header and drop bad
input.
Introduce a new sysctl net.core.flow_dissect_vnet_hdr controlling a
static key to decide whether to perform flow dissection. When the key
is false, virtio_net_hdr_to_skb computes as before.
A permissive specification of vnet headers is part of the ABI. Some
applications now depend on it. Still, many of these packets are bogus.
Give admins the option to interpret behavior more strictly. For
instance, verifying that a VIRTIO_NET_HDR_GSO_TCPV6 header matches a
packet with unencapsulated IPv6/TCP without extension headers, with
payload length exceeding gso_size and hdr_len exactly at TCP payload
offset.
BPF flow dissection implements protocol parsing in an safe way. And is
configurable, so can be as pedantic as the workload allows (e.g.,
dropping UFO altogether).
Vnet_header flow dissection is *not* a substitute for fixing bugs when
reported. But even if not enabled continuously, offers a quick path to
mitigating vulnerabilities.
[1] https://syzkaller.appspot.com/bug?id=b419a5ca95062664fe1a60b764621eb4526e2cd0
Changes
v4:
  - Expand commit message with rationale for bpf flow dissector based
    implementation
v3:
  - Move sysctl_flow_dissect_vnet_hdr_key definition to
    flow_dissector.c to fix CONFIG_SYSCTL warning when building UML
Suggested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Tanner Love <tannerlove@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
---
 include/linux/virtio_net.h | 25 +++++++++++++++++++++----
 net/core/flow_dissector.c  |  3 +++
 net/core/sysctl_net_core.c |  9 +++++++++
 3 files changed, 33 insertions(+), 4 deletions(-)
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index b465f8f3e554..b67b5413f2ce 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -25,10 +25,13 @@ static inline int virtio_net_hdr_set_proto(struct sk_buff *skb,
 	return 0;
 }
 
+DECLARE_STATIC_KEY_FALSE(sysctl_flow_dissect_vnet_hdr_key);
+
 static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 					const struct virtio_net_hdr *hdr,
 					bool little_endian)
 {
+	struct flow_keys_basic keys;
 	unsigned int gso_type = 0;
 	unsigned int thlen = 0;
 	unsigned int p_off = 0;
@@ -78,13 +81,24 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 		p_off = skb_transport_offset(skb) + thlen;
 		if (!pskb_may_pull(skb, p_off))
 			return -EINVAL;
-	} else {
+	}
+
+	/* BPF flow dissection for optional strict validation.
+	 *
+	 * Admins can define permitted packets more strictly, such as dropping
+	 * deprecated UDP_UFO packets and requiring skb->protocol to be non-zero
+	 * and matching packet headers.
+	 */
+	if (static_branch_unlikely(&sysctl_flow_dissect_vnet_hdr_key) &&
+	    !__skb_flow_dissect_flow_keys_basic(NULL, skb, &keys, NULL, 0, 0, 0,
+						0, hdr, little_endian))
+		return -EINVAL;
+
+	if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)) {
 		/* gso packets without NEEDS_CSUM do not set transport_offset.
 		 * probe and drop if does not match one of the above types.
 		 */
 		if (gso_type && skb->network_header) {
-			struct flow_keys_basic keys;
-
 			if (!skb->protocol) {
 				__be16 protocol = dev_parse_header_protocol(skb);
 
@@ -92,8 +106,11 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 				if (protocol && protocol != skb->protocol)
 					return -EINVAL;
 			}
+
 retry:
-			if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
+			/* only if flow dissection not already done */
+			if (!static_branch_unlikely(&sysctl_flow_dissect_vnet_hdr_key) &&
+			    !skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
 							      NULL, 0, 0, 0,
 							      0)) {
 				/* UFO does not specify ipv4 or 6: try both */
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 4bdad2b1d3a0..27a6ad7c72da 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -35,6 +35,9 @@
 #endif
 #include <linux/bpf-netns.h>
 
+DEFINE_STATIC_KEY_FALSE(sysctl_flow_dissect_vnet_hdr_key);
+EXPORT_SYMBOL(sysctl_flow_dissect_vnet_hdr_key);
+
 static void dissector_set_key(struct flow_dissector *flow_dissector,
 			      enum flow_dissector_key_id key_id)
 {
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index c8496c1142c9..c01b9366bb75 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -36,6 +36,8 @@ static int net_msg_warn;	/* Unused, but still a sysctl */
 int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
 EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
 
+DECLARE_STATIC_KEY_FALSE(sysctl_flow_dissect_vnet_hdr_key);
+
 /* 0 - Keep current behavior:
  *     IPv4: inherit all current settings from init_net
  *     IPv6: reset all settings to default
@@ -580,6 +582,13 @@ static struct ctl_table net_core_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= &int_3600,
 	},
+	{
+		.procname       = "flow_dissect_vnet_hdr",
+		.data           = &sysctl_flow_dissect_vnet_hdr_key.key,
+		.maxlen         = sizeof(sysctl_flow_dissect_vnet_hdr_key),
+		.mode           = 0644,
+		.proc_handler   = proc_do_static_key,
+	},
 	{ }
 };
 
-- 
2.32.0.rc1.229.g3e70b5a671-goog
^ permalink raw reply related	[flat|nested] 27+ messages in thread
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-08 17:02 ` [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb Tanner Love
@ 2021-06-10  3:09   ` Jason Wang
  2021-06-10  3:15     ` Willem de Bruijn
  0 siblings, 1 reply; 27+ messages in thread
From: Jason Wang @ 2021-06-10  3:09 UTC (permalink / raw)
  To: Tanner Love, netdev
  Cc: davem, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eric Dumazet, Willem de Bruijn, Petar Penkov, Jakub Kicinski,
	Michael S . Tsirkin, Tanner Love
在 2021/6/9 上午1:02, Tanner Love 写道:
>   retry:
> -			if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
> +			/* only if flow dissection not already done */
> +			if (!static_branch_unlikely(&sysctl_flow_dissect_vnet_hdr_key) &&
> +			    !skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
>   							      NULL, 0, 0, 0,
>   							      0)) {
So I still wonder the benefit we could gain from reusing the bpf flow 
dissector here. Consider the only context we need is the flow keys, we 
had two choices
a1) embed the vnet header checking inside bpf flow dissector
a2) introduce a dedicated eBPF type for doing that
And we have two ways to access the vnet header
b1) via pesudo __sk_buff
b2) introduce bpf helpers
I second for a2 and b2. The main motivation is to hide the vnet header 
details from the bpf subsystem.
Thanks
^ permalink raw reply	[flat|nested] 27+ messages in thread
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-10  3:09   ` Jason Wang
@ 2021-06-10  3:15     ` Willem de Bruijn
  2021-06-10  3:53       ` Jason Wang
  0 siblings, 1 reply; 27+ messages in thread
From: Willem de Bruijn @ 2021-06-10  3:15 UTC (permalink / raw)
  To: Jason Wang
  Cc: Tanner Love, Network Development, David Miller,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eric Dumazet, Petar Penkov, Jakub Kicinski, Michael S . Tsirkin,
	Tanner Love
On Wed, Jun 9, 2021 at 11:09 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/6/9 上午1:02, Tanner Love 写道:
> >   retry:
> > -                     if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
> > +                     /* only if flow dissection not already done */
> > +                     if (!static_branch_unlikely(&sysctl_flow_dissect_vnet_hdr_key) &&
> > +                         !skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
> >                                                             NULL, 0, 0, 0,
> >                                                             0)) {
>
>
> So I still wonder the benefit we could gain from reusing the bpf flow
> dissector here. Consider the only context we need is the flow keys,
Maybe I misunderstand the comment, but the goal is not just to access
the flow keys, but to correlate data in the packet payload with the fields
of the virtio_net_header. E.g., to parse a packet payload to verify that
it matches the gso type and header length. I don't see how we could
make that two separate programs, one to parse a packet (flow dissector)
and one to validate the vnet_hdr.
> we
> had two choices
>
> a1) embed the vnet header checking inside bpf flow dissector
> a2) introduce a dedicated eBPF type for doing that
>
> And we have two ways to access the vnet header
>
> b1) via pesudo __sk_buff
> b2) introduce bpf helpers
>
> I second for a2 and b2. The main motivation is to hide the vnet header
> details from the bpf subsystem.
b2 assumes that we might have many different variants of
vnet_hdr, but that all will be able to back a functional
interface like "return the header length"? Historically, the
struct has not seen such a rapid rate of change that I think
would warrant introducing this level of indirection.
The little_endian field does demonstrate one form of how
the struct can be context sensitive -- and not self describing.
I think we can capture multiple variants of the struct, if it ever
comes to that, with a versioning field. We did not add that
right away, because that can be introduced later, when a
new version arrives. But we have a plan for the eventuality.
>
> Thanks
>
^ permalink raw reply	[flat|nested] 27+ messages in thread
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-10  3:15     ` Willem de Bruijn
@ 2021-06-10  3:53       ` Jason Wang
  2021-06-10  4:05         ` Alexei Starovoitov
  0 siblings, 1 reply; 27+ messages in thread
From: Jason Wang @ 2021-06-10  3:53 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Tanner Love, Network Development, David Miller,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eric Dumazet, Petar Penkov, Jakub Kicinski, Michael S . Tsirkin,
	Tanner Love
在 2021/6/10 上午11:15, Willem de Bruijn 写道:
> On Wed, Jun 9, 2021 at 11:09 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2021/6/9 上午1:02, Tanner Love 写道:
>>>    retry:
>>> -                     if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
>>> +                     /* only if flow dissection not already done */
>>> +                     if (!static_branch_unlikely(&sysctl_flow_dissect_vnet_hdr_key) &&
>>> +                         !skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
>>>                                                              NULL, 0, 0, 0,
>>>                                                              0)) {
>>
>> So I still wonder the benefit we could gain from reusing the bpf flow
>> dissector here. Consider the only context we need is the flow keys,
> Maybe I misunderstand the comment, but the goal is not just to access
> the flow keys, but to correlate data in the packet payload with the fields
> of the virtio_net_header. E.g., to parse a packet payload to verify that
> it matches the gso type and header length. I don't see how we could
> make that two separate programs, one to parse a packet (flow dissector)
> and one to validate the vnet_hdr.
I may miss something here. I think it could be done via passing flow 
keys built by flow dissectors to vnet header validation eBPF program.
(Looking at validate_vnet_hdr(), it needs only flow keys and skb length).
Having two programs may have other advantages:
1) bpf vnet header validation can work without bpf flow dissector
2) keep bpf flow dissector simpler (unaware of vnet header), otherwise 
you need to prepare a dedicated bpf dissector for virtio-net/tap etc.
>
>> we
>> had two choices
>>
>> a1) embed the vnet header checking inside bpf flow dissector
>> a2) introduce a dedicated eBPF type for doing that
>>
>> And we have two ways to access the vnet header
>>
>> b1) via pesudo __sk_buff
>> b2) introduce bpf helpers
>>
>> I second for a2 and b2. The main motivation is to hide the vnet header
>> details from the bpf subsystem.
> b2 assumes that we might have many different variants of
> vnet_hdr, but that all will be able to back a functional
> interface like "return the header length"?
Probably.
> Historically, the
> struct has not seen such a rapid rate of change that I think
> would warrant introducing this level of indirection.
For the rapid change, yes. But:
1) __sk_buff is part of the general uAPI, adding virtio-net fields looks 
like a layer violation and a duplication to the existing virtio-net uAPI
2) as you said below the vnet header is not self contained
3) using helpers we can make vnet header format transparent to bpf core 
(that's the way how bpf access TCP headers AFAIK)
>
> The little_endian field does demonstrate one form of how
> the struct can be context sensitive -- and not self describing.
Yes and we probably need to provide more context, e.g the negotiated 
features.
>
> I think we can capture multiple variants of the struct, if it ever
> comes to that, with a versioning field. We did not add that
> right away, because that can be introduced later, when a
> new version arrives. But we have a plan for the eventuality.
It works but it couples virtio with bpf.
Thanks
>> Thanks
>>
^ permalink raw reply	[flat|nested] 27+ messages in thread
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-10  3:53       ` Jason Wang
@ 2021-06-10  4:05         ` Alexei Starovoitov
  2021-06-10  4:13           ` Jason Wang
  0 siblings, 1 reply; 27+ messages in thread
From: Alexei Starovoitov @ 2021-06-10  4:05 UTC (permalink / raw)
  To: Jason Wang
  Cc: Willem de Bruijn, Tanner Love, Network Development, David Miller,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eric Dumazet, Petar Penkov, Jakub Kicinski, Michael S . Tsirkin,
	Tanner Love
On Wed, Jun 9, 2021 at 8:53 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> It works but it couples virtio with bpf.
Jason,
I think your main concern is that it makes virtio_net_hdr into uapi?
That's not the case. __sk_buff is uapi, but pointers to sockets
and other kernel data structures are not.
Yes. It's a bit weird that uapi struct has a pointer to kernel internal,
but I don't see it as a deal breaker.
Tracing progs have plenty of such cases.
In networking there is tcp-bpf where everything is kernel internal and non-uapi.
So after this patch virtio_net_hdr is free to change without worrying about bpf
progs reading it.
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-10  4:05         ` Alexei Starovoitov
@ 2021-06-10  4:13           ` Jason Wang
  2021-06-10  4:19             ` Alexei Starovoitov
  0 siblings, 1 reply; 27+ messages in thread
From: Jason Wang @ 2021-06-10  4:13 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Willem de Bruijn, Tanner Love, Network Development, David Miller,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eric Dumazet, Petar Penkov, Jakub Kicinski, Michael S . Tsirkin,
	Tanner Love
在 2021/6/10 下午12:05, Alexei Starovoitov 写道:
> On Wed, Jun 9, 2021 at 8:53 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> It works but it couples virtio with bpf.
> Jason,
>
> I think your main concern is that it makes virtio_net_hdr into uapi?
Actually not. Note that, vnet header is already a part of uAPI 
(include/uapi/linux/virtio_net.h).
The concern is that is may confuse the userspace since we will have two 
sets of vnet header uapi (and they are already out of sync).
> That's not the case. __sk_buff is uapi, but pointers to sockets
> and other kernel data structures are not.
> Yes. It's a bit weird that uapi struct has a pointer to kernel internal,
> but I don't see it as a deal breaker.
Yes, but looking at the existing fields of __sk_buff. All are pretty 
generic fields that are device agnostic. This patch breaks this.
> Tracing progs have plenty of such cases.
> In networking there is tcp-bpf where everything is kernel internal and non-uapi.
> So after this patch virtio_net_hdr is free to change without worrying about bpf
> progs reading it.
So I wonder why not simply use helpers to access the vnet header like 
how tcp-bpf access the tcp header?
Thanks
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-10  4:13           ` Jason Wang
@ 2021-06-10  4:19             ` Alexei Starovoitov
  2021-06-10  5:23               ` Jason Wang
  0 siblings, 1 reply; 27+ messages in thread
From: Alexei Starovoitov @ 2021-06-10  4:19 UTC (permalink / raw)
  To: Jason Wang
  Cc: Willem de Bruijn, Tanner Love, Network Development, David Miller,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eric Dumazet, Petar Penkov, Jakub Kicinski, Michael S . Tsirkin,
	Tanner Love
On Wed, Jun 9, 2021 at 9:13 PM Jason Wang <jasowang@redhat.com> wrote:
>
> So I wonder why not simply use helpers to access the vnet header like
> how tcp-bpf access the tcp header?
Short answer - speed.
tcp-bpf accesses all uapi and non-uapi structs directly.
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-10  4:19             ` Alexei Starovoitov
@ 2021-06-10  5:23               ` Jason Wang
  2021-06-10 14:04                 ` Willem de Bruijn
  0 siblings, 1 reply; 27+ messages in thread
From: Jason Wang @ 2021-06-10  5:23 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Willem de Bruijn, Tanner Love, Network Development, David Miller,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eric Dumazet, Petar Penkov, Jakub Kicinski, Michael S . Tsirkin,
	Tanner Love
在 2021/6/10 下午12:19, Alexei Starovoitov 写道:
> On Wed, Jun 9, 2021 at 9:13 PM Jason Wang <jasowang@redhat.com> wrote:
>> So I wonder why not simply use helpers to access the vnet header like
>> how tcp-bpf access the tcp header?
> Short answer - speed.
> tcp-bpf accesses all uapi and non-uapi structs directly.
>
Ok, this makes sense. But instead of coupling device specific stuffs 
like vnet header and neediness into general flow_keys as a context.
It would be better to introduce a vnet header context which contains
1) vnet header
2) flow keys
3) other contexts like endian and virtio-net features
So we preserve the performance and decouple the virtio-net stuffs from 
general structures like flow_keys or __sk_buff.
Thanks
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-10  5:23               ` Jason Wang
@ 2021-06-10 14:04                 ` Willem de Bruijn
  2021-06-11  2:10                   ` Jason Wang
  0 siblings, 1 reply; 27+ messages in thread
From: Willem de Bruijn @ 2021-06-10 14:04 UTC (permalink / raw)
  To: Jason Wang
  Cc: Alexei Starovoitov, Willem de Bruijn, Tanner Love,
	Network Development, David Miller, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Eric Dumazet, Petar Penkov,
	Jakub Kicinski, Michael S . Tsirkin, Tanner Love
On Thu, Jun 10, 2021 at 1:25 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/6/10 下午12:19, Alexei Starovoitov 写道:
> > On Wed, Jun 9, 2021 at 9:13 PM Jason Wang <jasowang@redhat.com> wrote:
> >> So I wonder why not simply use helpers to access the vnet header like
> >> how tcp-bpf access the tcp header?
> > Short answer - speed.
> > tcp-bpf accesses all uapi and non-uapi structs directly.
> >
>
> Ok, this makes sense. But instead of coupling device specific stuffs
> like vnet header and neediness into general flow_keys as a context.
>
> It would be better to introduce a vnet header context which contains
>
> 1) vnet header
> 2) flow keys
> 3) other contexts like endian and virtio-net features
>
> So we preserve the performance and decouple the virtio-net stuffs from
> general structures like flow_keys or __sk_buff.
You are advocating for a separate BPF program that takes a vnet hdr
and flow_keys as context and is run separately after flow dissection?
I don't understand the benefit of splitting the program in two in this manner.
Your previous comment mentions two vnet_hdr definitions that can get
out of sync. Do you mean v1 of this patch, that adds the individual
fields to bpf_flow_dissector? That is no longer the case: the latest
version directly access the real struct. As Alexei points out, doing
this does not set virtio_net_hdr in stone in the ABI. That is a valid
worry. But so this patch series will not restrict how that struct may
develop over time. A version field allows a BPF program to parse the
different variants of the struct -- in the same manner as other
protocol headers. If you prefer, we can add that field from the start.
I don't see a benefit to an extra layer of indirection in the form of
helper functions.
I do see downsides to splitting the program. The goal is to ensure
consistency between vnet_hdr and packet payload. A program split
limits to checking vnet_hdr against what the flow_keys struct has
extracted. That is a great reduction over full packet access. For
instance, does the packet contain IP options? No idea.
If stable ABI is not a concern and there are no different struct
definitions that can go out of sync, does that address your main
concerns?
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-10 14:04                 ` Willem de Bruijn
@ 2021-06-11  2:10                   ` Jason Wang
  2021-06-11  2:45                     ` Willem de Bruijn
  0 siblings, 1 reply; 27+ messages in thread
From: Jason Wang @ 2021-06-11  2:10 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Alexei Starovoitov, Tanner Love, Network Development,
	David Miller, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Eric Dumazet, Petar Penkov, Jakub Kicinski,
	Michael S . Tsirkin, Tanner Love
在 2021/6/10 下午10:04, Willem de Bruijn 写道:
> On Thu, Jun 10, 2021 at 1:25 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2021/6/10 下午12:19, Alexei Starovoitov 写道:
>>> On Wed, Jun 9, 2021 at 9:13 PM Jason Wang <jasowang@redhat.com> wrote:
>>>> So I wonder why not simply use helpers to access the vnet header like
>>>> how tcp-bpf access the tcp header?
>>> Short answer - speed.
>>> tcp-bpf accesses all uapi and non-uapi structs directly.
>>>
>> Ok, this makes sense. But instead of coupling device specific stuffs
>> like vnet header and neediness into general flow_keys as a context.
>>
>> It would be better to introduce a vnet header context which contains
>>
>> 1) vnet header
>> 2) flow keys
>> 3) other contexts like endian and virtio-net features
>>
>> So we preserve the performance and decouple the virtio-net stuffs from
>> general structures like flow_keys or __sk_buff.
> You are advocating for a separate BPF program that takes a vnet hdr
> and flow_keys as context and is run separately after flow dissection?
Yes.
>
> I don't understand the benefit of splitting the program in two in this manner.
It decouples a device specific attributes from the general structures 
like flow keys. We have xen-netfront, netvsc and a lot of drivers that 
works for the emulated devices. We could not add all those metadatas as 
the context of flow keys. That's why I suggest to use something more 
generic like XDP from the start. Yes, GSO stuffs was disabled by 
virtio-net on XDP but it's not something that can not be fixed. If the 
GSO and s/g support can not be done in short time, then a virtio-net 
specific BPF program still looks much better than coupling virtio-net 
metadata into flow keys or other general data structures.
>
> Your previous comment mentions two vnet_hdr definitions that can get
> out of sync. Do you mean v1 of this patch, that adds the individual
> fields to bpf_flow_dissector?
No, I meant this part of the patch:
+static int check_virtio_net_hdr_access(struct bpf_verifier_env *env, 
int off,
+                       int size)
+{
+    if (size < 0 || off < 0 ||
+        (u64)off + size > sizeof(struct virtio_net_hdr)) {
+        verbose(env, "invalid access to virtio_net_hdr off=%d size=%d\n",
+            off, size);
+        return -EACCES;
+    }
+    return 0;
+}
+
It prevents the program from accessing e.g num_buffers.
> That is no longer the case: the latest
> version directly access the real struct. As Alexei points out, doing
> this does not set virtio_net_hdr in stone in the ABI. That is a valid
> worry. But so this patch series will not restrict how that struct may
> develop over time. A version field allows a BPF program to parse the
> different variants of the struct -- in the same manner as other
> protocol headers.
The format of the virtio-net header depends on the virtio features, any 
reason for another version? The correct way is to provide features in 
the context, in this case you don't event need the endian hint.
> If you prefer, we can add that field from the start.
> I don't see a benefit to an extra layer of indirection in the form of
> helper functions.
>
> I do see downsides to splitting the program. The goal is to ensure
> consistency between vnet_hdr and packet payload. A program split
> limits to checking vnet_hdr against what the flow_keys struct has
> extracted. That is a great reduction over full packet access.
Full packet access could be still done in bpf flow dissector.
> For
> instance, does the packet contain IP options? No idea.
I don't understand here. You can figure out this in flow dissector, and 
you can extend the flow keys to carry out this information if necessary.
And if you want to have more capability, XDP which is designed for early 
packet filtering is the right way to go which have even more functions 
that a simple bpf flow dissector.
>
> If stable ABI is not a concern and there are no different struct
> definitions that can go out of sync, does that address your main
> concerns?
I think not. Assuming we provide sufficient contexts (e.g the virtio 
features), problem still: 1) coupling virtio-net with flow_keys 2) can't 
work for XDP.
Thanks
>
^ permalink raw reply	[flat|nested] 27+ messages in thread
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-11  2:10                   ` Jason Wang
@ 2021-06-11  2:45                     ` Willem de Bruijn
  2021-06-11  3:38                       ` Jason Wang
  0 siblings, 1 reply; 27+ messages in thread
From: Willem de Bruijn @ 2021-06-11  2:45 UTC (permalink / raw)
  To: Jason Wang
  Cc: Willem de Bruijn, Alexei Starovoitov, Tanner Love,
	Network Development, David Miller, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Eric Dumazet, Petar Penkov,
	Jakub Kicinski, Michael S . Tsirkin, Tanner Love
On Thu, Jun 10, 2021 at 10:11 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/6/10 下午10:04, Willem de Bruijn 写道:
> > On Thu, Jun 10, 2021 at 1:25 AM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> 在 2021/6/10 下午12:19, Alexei Starovoitov 写道:
> >>> On Wed, Jun 9, 2021 at 9:13 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>> So I wonder why not simply use helpers to access the vnet header like
> >>>> how tcp-bpf access the tcp header?
> >>> Short answer - speed.
> >>> tcp-bpf accesses all uapi and non-uapi structs directly.
> >>>
> >> Ok, this makes sense. But instead of coupling device specific stuffs
> >> like vnet header and neediness into general flow_keys as a context.
> >>
> >> It would be better to introduce a vnet header context which contains
> >>
> >> 1) vnet header
> >> 2) flow keys
> >> 3) other contexts like endian and virtio-net features
> >>
> >> So we preserve the performance and decouple the virtio-net stuffs from
> >> general structures like flow_keys or __sk_buff.
> > You are advocating for a separate BPF program that takes a vnet hdr
> > and flow_keys as context and is run separately after flow dissection?
>
>
> Yes.
>
>
> >
> > I don't understand the benefit of splitting the program in two in this manner.
>
>
> It decouples a device specific attributes from the general structures
> like flow keys. We have xen-netfront, netvsc and a lot of drivers that
> works for the emulated devices. We could not add all those metadatas as
> the context of flow keys.
What are device-specific attributes here? What kind of metadata?
The only metadata that can be passed with tuntap, pf_packet et al is
virtio_net_hdr.
I quite don't understand where xen-netfront et al come in.
> That's why I suggest to use something more
> generic like XDP from the start. Yes, GSO stuffs was disabled by
> virtio-net on XDP but it's not something that can not be fixed. If the
> GSO and s/g support can not be done in short time
An alternative interface does not address that we already have this
interface and it is already causing problems.
> then a virtio-net
> specific BPF program still looks much better than coupling virtio-net
> metadata into flow keys or other general data structures.
>
>
> >
> > Your previous comment mentions two vnet_hdr definitions that can get
> > out of sync. Do you mean v1 of this patch, that adds the individual
> > fields to bpf_flow_dissector?
>
>
> No, I meant this part of the patch:
>
>
> +static int check_virtio_net_hdr_access(struct bpf_verifier_env *env,
> int off,
> +                       int size)
> +{
> +    if (size < 0 || off < 0 ||
> +        (u64)off + size > sizeof(struct virtio_net_hdr)) {
> +        verbose(env, "invalid access to virtio_net_hdr off=%d size=%d\n",
> +            off, size);
> +        return -EACCES;
> +    }
> +    return 0;
> +}
> +
>
>
> It prevents the program from accessing e.g num_buffers.
I see, thanks. See my response to your following point.
>
>
> > That is no longer the case: the latest
> > version directly access the real struct. As Alexei points out, doing
> > this does not set virtio_net_hdr in stone in the ABI. That is a valid
> > worry. But so this patch series will not restrict how that struct may
> > develop over time. A version field allows a BPF program to parse the
> > different variants of the struct -- in the same manner as other
> > protocol headers.
>
>
> The format of the virtio-net header depends on the virtio features, any
> reason for another version? The correct way is to provide features in
> the context, in this case you don't event need the endian hint.
That might work. It clearly works for virtio. Not sure how to apply it
to pf_packet or tuntap callers of virtio_net_hdr_to_skb.
>
> > If you prefer, we can add that field from the start.
> > I don't see a benefit to an extra layer of indirection in the form of
> > helper functions.
> >
> > I do see downsides to splitting the program. The goal is to ensure
> > consistency between vnet_hdr and packet payload. A program split
> > limits to checking vnet_hdr against what the flow_keys struct has
> > extracted. That is a great reduction over full packet access.
>
>
> Full packet access could be still done in bpf flow dissector.
>
>
> > For
> > instance, does the packet contain IP options? No idea.
>
>
> I don't understand here. You can figure out this in flow dissector, and
> you can extend the flow keys to carry out this information if necessary.
This I disagree with. flow_keys are a distillation/simplification of
the packet contents. It is unlikely to capture every feature of every
packet. We end up having to extend it for every new case we're
interested in. That is ugly and a lot of busywork. And for what
purpose? The virtio_net_hdr field prefaces the protocol headers in the
same buffer in something like tpacket. Processing the metadata
together with the data is straightforward. I don't see what isolation
or layering that breaks.
> And if you want to have more capability, XDP which is designed for early
> packet filtering is the right way to go which have even more functions
> that a simple bpf flow dissector.
>
> >
> > If stable ABI is not a concern and there are no different struct
> > definitions that can go out of sync, does that address your main
> > concerns?
>
>
> I think not. Assuming we provide sufficient contexts (e.g the virtio
> features), problem still: 1) coupling virtio-net with flow_keys
A flow dissection program is allowed to read both contents of struct
virtio_net_hdr and packet contents. virtio_net_hdr is not made part of
struct bpf_flow_keys. The pointer there is just a way to give access
to multiple data sources through the single bpf program context.
> 2) can't work for XDP.
This future work (AF_?)XDP based alternative to
pf_packet/tuntap/virtio does not exist yet, so it's hard to fully
prepare for. But any replacement interface will observe the same
issue: when specifying offloads like GSO/csum, that metadata may not
agree with actual packet contents. We have to have a way to validate
that. I could imagine that this XDP program attached to the AF_XDP
interface would do the validation itself? Is that what you mean?
^ permalink raw reply	[flat|nested] 27+ messages in thread
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-11  2:45                     ` Willem de Bruijn
@ 2021-06-11  3:38                       ` Jason Wang
  2021-06-11 14:12                         ` Willem de Bruijn
  0 siblings, 1 reply; 27+ messages in thread
From: Jason Wang @ 2021-06-11  3:38 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Alexei Starovoitov, Tanner Love, Network Development,
	David Miller, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Eric Dumazet, Petar Penkov, Jakub Kicinski,
	Michael S . Tsirkin, Tanner Love
在 2021/6/11 上午10:45, Willem de Bruijn 写道:
> On Thu, Jun 10, 2021 at 10:11 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2021/6/10 下午10:04, Willem de Bruijn 写道:
>>> On Thu, Jun 10, 2021 at 1:25 AM Jason Wang <jasowang@redhat.com> wrote:
>>>> 在 2021/6/10 下午12:19, Alexei Starovoitov 写道:
>>>>> On Wed, Jun 9, 2021 at 9:13 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>> So I wonder why not simply use helpers to access the vnet header like
>>>>>> how tcp-bpf access the tcp header?
>>>>> Short answer - speed.
>>>>> tcp-bpf accesses all uapi and non-uapi structs directly.
>>>>>
>>>> Ok, this makes sense. But instead of coupling device specific stuffs
>>>> like vnet header and neediness into general flow_keys as a context.
>>>>
>>>> It would be better to introduce a vnet header context which contains
>>>>
>>>> 1) vnet header
>>>> 2) flow keys
>>>> 3) other contexts like endian and virtio-net features
>>>>
>>>> So we preserve the performance and decouple the virtio-net stuffs from
>>>> general structures like flow_keys or __sk_buff.
>>> You are advocating for a separate BPF program that takes a vnet hdr
>>> and flow_keys as context and is run separately after flow dissection?
>>
>> Yes.
>>
>>
>>> I don't understand the benefit of splitting the program in two in this manner.
>>
>> It decouples a device specific attributes from the general structures
>> like flow keys. We have xen-netfront, netvsc and a lot of drivers that
>> works for the emulated devices. We could not add all those metadatas as
>> the context of flow keys.
> What are device-specific attributes here? What kind of metadata?
Isn't virtio_net_hdr a virito-net specific metadata?
>
> The only metadata that can be passed with tuntap, pf_packet et al is
> virtio_net_hdr.
>
> I quite don't understand where xen-netfront et al come in.
The problem is, what kind of issue you want to solve. If you want to 
solve virtio specific issue, why do you need to do that in the general 
flow dissector?
If you want to solve the general issue of the packet metadata validation 
from untrusted source, you need validate not only virtio-net but all the 
others. Netfront is another dodgy source and there're a lot of implied 
untrusted source in the case of virtualization environment.
>
>> That's why I suggest to use something more
>> generic like XDP from the start. Yes, GSO stuffs was disabled by
>> virtio-net on XDP but it's not something that can not be fixed. If the
>> GSO and s/g support can not be done in short time
> An alternative interface does not address that we already have this
> interface and it is already causing problems.
What problems did you meant here?
>
>> then a virtio-net
>> specific BPF program still looks much better than coupling virtio-net
>> metadata into flow keys or other general data structures.
>>
>>
>>> Your previous comment mentions two vnet_hdr definitions that can get
>>> out of sync. Do you mean v1 of this patch, that adds the individual
>>> fields to bpf_flow_dissector?
>>
>> No, I meant this part of the patch:
>>
>>
>> +static int check_virtio_net_hdr_access(struct bpf_verifier_env *env,
>> int off,
>> +                       int size)
>> +{
>> +    if (size < 0 || off < 0 ||
>> +        (u64)off + size > sizeof(struct virtio_net_hdr)) {
>> +        verbose(env, "invalid access to virtio_net_hdr off=%d size=%d\n",
>> +            off, size);
>> +        return -EACCES;
>> +    }
>> +    return 0;
>> +}
>> +
>>
>>
>> It prevents the program from accessing e.g num_buffers.
> I see, thanks. See my response to your following point.
>
>>
>>> That is no longer the case: the latest
>>> version directly access the real struct. As Alexei points out, doing
>>> this does not set virtio_net_hdr in stone in the ABI. That is a valid
>>> worry. But so this patch series will not restrict how that struct may
>>> develop over time. A version field allows a BPF program to parse the
>>> different variants of the struct -- in the same manner as other
>>> protocol headers.
>>
>> The format of the virtio-net header depends on the virtio features, any
>> reason for another version? The correct way is to provide features in
>> the context, in this case you don't event need the endian hint.
> That might work. It clearly works for virtio. Not sure how to apply it
> to pf_packet or tuntap callers of virtio_net_hdr_to_skb.
This requires more thought but it also applies to the version. For 
tuntap, features could be deduced from the 1) TUN_SET_VET_HDR and 2) 
TUN_SET_OFFLOADS
Note that theatrically features could be provided by the userspace, but 
version is not (unless it's a part of uAPI but it became a duplication 
of the features).
Actually, this is a strong hint that the conext for packet, tuntap is 
different with virtio-net (though they are sharing the same (or patial) 
vnet header structure). E.g tuntap is unaware of mergerable buffers, it 
just leave the room for vhost-net or qemu to fill that fields.
>
>>> If you prefer, we can add that field from the start.
>>> I don't see a benefit to an extra layer of indirection in the form of
>>> helper functions.
>>>
>>> I do see downsides to splitting the program. The goal is to ensure
>>> consistency between vnet_hdr and packet payload. A program split
>>> limits to checking vnet_hdr against what the flow_keys struct has
>>> extracted. That is a great reduction over full packet access.
>>
>> Full packet access could be still done in bpf flow dissector.
>>
>>
>>> For
>>> instance, does the packet contain IP options? No idea.
>>
>> I don't understand here. You can figure out this in flow dissector, and
>> you can extend the flow keys to carry out this information if necessary.
> This I disagree with. flow_keys are a distillation/simplification of
> the packet contents. It is unlikely to capture every feature of every
> packet.
It depends on what kind of checks you want to do. When introducing a new 
API like this, we need to make sure all the fields could be validated 
instead of limiting it to some specific fields. Not all the fields are 
related to the flow, that's another point that validating it in the flow 
dissector is not a good choice.
For vnet_header fields that is related to the flow, they should be a 
subset of the current flow keys otherwise it's a hint a flow keys need 
to be extended. For vnet_header fields that is not related to the flow, 
validate it in flow dissector requires a lot of other context (features, 
and probably the virtqueue size for the num_buffers). And if you want to 
validate things that is totally unrelated to the vnet header (e.g IP 
option), it can be done in the flow dissector right now or via XDP.
>   We end up having to extend it for every new case we're
> interested in. That is ugly and a lot of busywork. And for what
> purpose? The virtio_net_hdr field prefaces the protocol headers in the
> same buffer in something like tpacket. Processing the metadata
> together with the data is straightforward. I don't see what isolation
> or layering that breaks.
Well, if you want to process metadata with the data, isn't XDP a much 
more better place to do that?
>
>> And if you want to have more capability, XDP which is designed for early
>> packet filtering is the right way to go which have even more functions
>> that a simple bpf flow dissector.
>>
>>> If stable ABI is not a concern and there are no different struct
>>> definitions that can go out of sync, does that address your main
>>> concerns?
>>
>> I think not. Assuming we provide sufficient contexts (e.g the virtio
>> features), problem still: 1) coupling virtio-net with flow_keys
> A flow dissection program is allowed to read both contents of struct
> virtio_net_hdr and packet contents. virtio_net_hdr is not made part of
> struct bpf_flow_keys.
It doesn't matter whether or not it's a pointer. And actually you had 
vhdr_is_little_endian:
@@ -6017,6 +6017,8 @@ struct bpf_flow_keys {
  	};
  	__u32	flags;
  	__be32	flow_label;
+	__bpf_md_ptr(const struct virtio_net_hdr *, vhdr);
+	__u8	vhdr_is_little_endian;
  };
And if we want to add the support to validate other untrusted sources, 
do we want:
struct bpf_flow_keys {
     /* general fields */
     virtio_net_context;
     xen_netfront_context;
     netvsc_context;
     ...
};
?
>   The pointer there is just a way to give access
> to multiple data sources through the single bpf program context.
>
>> 2) can't work for XDP.
> This future work (AF_?)XDP based alternative to
> pf_packet/tuntap/virtio does not exist yet, so it's hard to fully
> prepare for. But any replacement interface will observe the same
> issue: when specifying offloads like GSO/csum, that metadata may not
> agree with actual packet contents. We have to have a way to validate
> that. I could imagine that this XDP program attached to the AF_XDP
> interface would do the validation itself?
If the XDP can do validation itself, any reason to duplicate the work in 
the flow dissector?
> Is that what you mean?
Kind of, it's not specific to AF_XDP, assuming XDP supports GSO/sg. With 
XDP_REDIRECT/XDP_TX, you still need to validate the vnet header.
Thanks
>
^ permalink raw reply	[flat|nested] 27+ messages in thread
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-11  3:38                       ` Jason Wang
@ 2021-06-11 14:12                         ` Willem de Bruijn
  2021-06-14 20:41                           ` Tanner Love
  2021-06-15  8:55                           ` Jason Wang
  0 siblings, 2 replies; 27+ messages in thread
From: Willem de Bruijn @ 2021-06-11 14:12 UTC (permalink / raw)
  To: Jason Wang
  Cc: Willem de Bruijn, Alexei Starovoitov, Tanner Love,
	Network Development, David Miller, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Eric Dumazet, Petar Penkov,
	Jakub Kicinski, Michael S . Tsirkin, Tanner Love
On Thu, Jun 10, 2021 at 11:40 PM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/6/11 上午10:45, Willem de Bruijn 写道:
> > On Thu, Jun 10, 2021 at 10:11 PM Jason Wang <jasowang@redhat.com> wrote:
> >>
> >> 在 2021/6/10 下午10:04, Willem de Bruijn 写道:
> >>> On Thu, Jun 10, 2021 at 1:25 AM Jason Wang <jasowang@redhat.com> wrote:
> >>>> 在 2021/6/10 下午12:19, Alexei Starovoitov 写道:
> >>>>> On Wed, Jun 9, 2021 at 9:13 PM Jason Wang <jasowang@redhat.com> wrote:
> >>>>>> So I wonder why not simply use helpers to access the vnet header like
> >>>>>> how tcp-bpf access the tcp header?
> >>>>> Short answer - speed.
> >>>>> tcp-bpf accesses all uapi and non-uapi structs directly.
> >>>>>
> >>>> Ok, this makes sense. But instead of coupling device specific stuffs
> >>>> like vnet header and neediness into general flow_keys as a context.
> >>>>
> >>>> It would be better to introduce a vnet header context which contains
> >>>>
> >>>> 1) vnet header
> >>>> 2) flow keys
> >>>> 3) other contexts like endian and virtio-net features
> >>>>
> >>>> So we preserve the performance and decouple the virtio-net stuffs from
> >>>> general structures like flow_keys or __sk_buff.
> >>> You are advocating for a separate BPF program that takes a vnet hdr
> >>> and flow_keys as context and is run separately after flow dissection?
> >>
> >> Yes.
> >>
> >>
> >>> I don't understand the benefit of splitting the program in two in this manner.
> >>
> >> It decouples a device specific attributes from the general structures
> >> like flow keys. We have xen-netfront, netvsc and a lot of drivers that
> >> works for the emulated devices. We could not add all those metadatas as
> >> the context of flow keys.
> > What are device-specific attributes here? What kind of metadata?
>
>
> Isn't virtio_net_hdr a virito-net specific metadata?
I don't think it is ever since it was also adopted for tun, tap,
pf_packet and uml. Basically, all callers of virtio_net_hdr_to_skb.
>
>
> >
> > The only metadata that can be passed with tuntap, pf_packet et al is
> > virtio_net_hdr.
> >
> > I quite don't understand where xen-netfront et al come in.
>
>
> The problem is, what kind of issue you want to solve. If you want to
> solve virtio specific issue, why do you need to do that in the general
> flow dissector?
>
> If you want to solve the general issue of the packet metadata validation
> from untrusted source, you need validate not only virtio-net but all the
> others. Netfront is another dodgy source and there're a lot of implied
> untrusted source in the case of virtualization environment.
Ah understood.
Yes, the goal is to protect the kernel from untrusted packets coming
from userspace in general. There are only a handful such injection
APIs.
I have not had to deal with netfront as much as the virtio_net_hdr
users. I'll take a look at that and netvsc. I cannot recall seeing as
many syzkaller reports about those, but that may not necessarily imply
that they are more robust -- it could just be that syzkaller has no
easy way to exercise them, like with packet sockets.
>
> >
> >> That's why I suggest to use something more
> >> generic like XDP from the start. Yes, GSO stuffs was disabled by
> >> virtio-net on XDP but it's not something that can not be fixed. If the
> >> GSO and s/g support can not be done in short time
> > An alternative interface does not address that we already have this
> > interface and it is already causing problems.
>
>
> What problems did you meant here?
The long list of syzkaller bugs that required fixes either directly in
virtio_net_hdr_to_skb or deeper in the stack, e.g.,
924a9bc362a5 net: check if protocol extracted by
virtio_net_hdr_set_proto is correct
6dd912f82680 net: check untrusted gso_size at kernel entry
9274124f023b net: stricter validation of untrusted gso packets
d2aa125d6290 net: Don't set transport offset to invalid value
d5be7f632bad net: validate untrusted gso packets without csum offload
9d2f67e43b73 net/packet: fix packet drop as of virtio gso
7c68d1a6b4db net: qdisc_pkt_len_init() should be more robust
cb9f1b783850 ip: validate header length on virtual device xmit
418e897e0716 gso: validate gso_type on ipip style tunnels
121d57af308d gso: validate gso_type in GSO handlers
This is not necessarily an exhaustive list.
And others not directly due to gso/csum metadata, but malformed
packets from userspace nonetheless, such as
76c0ddd8c3a6 ip6_tunnel: be careful when accessing the inner header
> >
> >> then a virtio-net
> >> specific BPF program still looks much better than coupling virtio-net
> >> metadata into flow keys or other general data structures.
> >>
> >>
> >>> Your previous comment mentions two vnet_hdr definitions that can get
> >>> out of sync. Do you mean v1 of this patch, that adds the individual
> >>> fields to bpf_flow_dissector?
> >>
> >> No, I meant this part of the patch:
> >>
> >>
> >> +static int check_virtio_net_hdr_access(struct bpf_verifier_env *env,
> >> int off,
> >> +                       int size)
> >> +{
> >> +    if (size < 0 || off < 0 ||
> >> +        (u64)off + size > sizeof(struct virtio_net_hdr)) {
> >> +        verbose(env, "invalid access to virtio_net_hdr off=%d size=%d\n",
> >> +            off, size);
> >> +        return -EACCES;
> >> +    }
> >> +    return 0;
> >> +}
> >> +
> >>
> >>
> >> It prevents the program from accessing e.g num_buffers.
> > I see, thanks. See my response to your following point.
> >
> >>
> >>> That is no longer the case: the latest
> >>> version directly access the real struct. As Alexei points out, doing
> >>> this does not set virtio_net_hdr in stone in the ABI. That is a valid
> >>> worry. But so this patch series will not restrict how that struct may
> >>> develop over time. A version field allows a BPF program to parse the
> >>> different variants of the struct -- in the same manner as other
> >>> protocol headers.
> >>
> >> The format of the virtio-net header depends on the virtio features, any
> >> reason for another version? The correct way is to provide features in
> >> the context, in this case you don't event need the endian hint.
> > That might work. It clearly works for virtio. Not sure how to apply it
> > to pf_packet or tuntap callers of virtio_net_hdr_to_skb.
>
>
> This requires more thought but it also applies to the version. For
> tuntap, features could be deduced from the 1) TUN_SET_VET_HDR and 2)
> TUN_SET_OFFLOADS
>
> Note that theatrically features could be provided by the userspace, but
> version is not (unless it's a part of uAPI but it became a duplication
> of the features).
>
> Actually, this is a strong hint that the conext for packet, tuntap is
> different with virtio-net (though they are sharing the same (or patial)
> vnet header structure). E.g tuntap is unaware of mergerable buffers, it
> just leave the room for vhost-net or qemu to fill that fields.
True. We need to pass the length of the struct along with any
version/typing information (which is currently only that endianness
boolean). Also to address the bounds checking issue you raised above.
>
> >
> >>> If you prefer, we can add that field from the start.
> >>> I don't see a benefit to an extra layer of indirection in the form of
> >>> helper functions.
> >>>
> >>> I do see downsides to splitting the program. The goal is to ensure
> >>> consistency between vnet_hdr and packet payload. A program split
> >>> limits to checking vnet_hdr against what the flow_keys struct has
> >>> extracted. That is a great reduction over full packet access.
> >>
> >> Full packet access could be still done in bpf flow dissector.
> >>
> >>
> >>> For
> >>> instance, does the packet contain IP options? No idea.
> >>
> >> I don't understand here. You can figure out this in flow dissector, and
> >> you can extend the flow keys to carry out this information if necessary.
> > This I disagree with. flow_keys are a distillation/simplification of
> > the packet contents. It is unlikely to capture every feature of every
> > packet.
>
>
> It depends on what kind of checks you want to do. When introducing a new
> API like this, we need to make sure all the fields could be validated
> instead of limiting it to some specific fields. Not all the fields are
> related to the flow, that's another point that validating it in the flow
> dissector is not a good choice.
>
> For vnet_header fields that is related to the flow, they should be a
> subset of the current flow keys otherwise it's a hint a flow keys need
> to be extended. For vnet_header fields that is not related to the flow,
> validate it in flow dissector requires a lot of other context (features,
> and probably the virtqueue size for the num_buffers). And if you want to
> validate things that is totally unrelated to the vnet header (e.g IP
> option), it can be done in the flow dissector right now or via XDP.
But correlating this data with the metadata is needlessly more complex
with two programs vs one. The issue is not that flow keys can capture
all vnet_hdr fields. It is that it won't capture all relevant packet
contents.
>
> >   We end up having to extend it for every new case we're
> > interested in. That is ugly and a lot of busywork. And for what
> > purpose? The virtio_net_hdr field prefaces the protocol headers in the
> > same buffer in something like tpacket. Processing the metadata
> > together with the data is straightforward. I don't see what isolation
> > or layering that breaks.
>
>
> Well, if you want to process metadata with the data, isn't XDP a much
> more better place to do that?
XDP or bpf flow dissector: they are both BPF programs. The defining
point is what context to pass along.
> >
> >> And if you want to have more capability, XDP which is designed for early
> >> packet filtering is the right way to go which have even more functions
> >> that a simple bpf flow dissector.
> >>
> >>> If stable ABI is not a concern and there are no different struct
> >>> definitions that can go out of sync, does that address your main
> >>> concerns?
> >>
> >> I think not. Assuming we provide sufficient contexts (e.g the virtio
> >> features), problem still: 1) coupling virtio-net with flow_keys
> > A flow dissection program is allowed to read both contents of struct
> > virtio_net_hdr and packet contents. virtio_net_hdr is not made part of
> > struct bpf_flow_keys.
>
>
> It doesn't matter whether or not it's a pointer. And actually you had
> vhdr_is_little_endian:
>
> @@ -6017,6 +6017,8 @@ struct bpf_flow_keys {
>         };
>         __u32   flags;
>         __be32  flow_label;
> +       __bpf_md_ptr(const struct virtio_net_hdr *, vhdr);
> +       __u8    vhdr_is_little_endian;
>   };
>
>
> And if we want to add the support to validate other untrusted sources,
> do we want:
>
> struct bpf_flow_keys {
>      /* general fields */
>      virtio_net_context;
>      xen_netfront_context;
>      netvsc_context;
>      ...
> };
>
> ?
No, I think just a pointer to the metadata context and a
type/versioning field that tells the program how to interpret it. A
strict program can drop everything that it does not understand. It
does not have to be complete.
> >   The pointer there is just a way to give access
> > to multiple data sources through the single bpf program context.
> >
> >> 2) can't work for XDP.
> > This future work (AF_?)XDP based alternative to
> > pf_packet/tuntap/virtio does not exist yet, so it's hard to fully
> > prepare for. But any replacement interface will observe the same
> > issue: when specifying offloads like GSO/csum, that metadata may not
> > agree with actual packet contents. We have to have a way to validate
> > that. I could imagine that this XDP program attached to the AF_XDP
> > interface would do the validation itself?
>
>
> If the XDP can do validation itself, any reason to duplicate the work in
> the flow dissector?
>
>
> > Is that what you mean?
>
>
> Kind of, it's not specific to AF_XDP, assuming XDP supports GSO/sg. With
> XDP_REDIRECT/XDP_TX, you still need to validate the vnet header.
The purpose is validation of data coming from untrusted userspace into
the kernel.
XDP_REDIRECT and XDP_TX forward data within the kernel, so are out of scope.
In the case where they are used along with some ABI through which data
can be inserted into the kernel, such as AF_XDP, it is relevant. And I
agree that then the XDP program can do the validation directly.
That just does not address validating of legacy interfaces. Those
interfaces generally require CAP_NET_RAW to setup, but they are often
used after setup to accept untrusted contents, so I don't think they
should be considered "root, so anything goes".
> Thanks
Thanks for the discussion. As said, we'll start by looking at the
other similar netfront interfaces.
>
> >
>
^ permalink raw reply	[flat|nested] 27+ messages in thread
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-11 14:12                         ` Willem de Bruijn
@ 2021-06-14 20:41                           ` Tanner Love
  2021-06-15  8:57                             ` Jason Wang
  2021-06-15  8:55                           ` Jason Wang
  1 sibling, 1 reply; 27+ messages in thread
From: Tanner Love @ 2021-06-14 20:41 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Jason Wang, Alexei Starovoitov, Network Development, David Miller,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eric Dumazet, Petar Penkov, Jakub Kicinski, Michael S . Tsirkin,
	Tanner Love
On Fri, Jun 11, 2021 at 7:13 AM Willem de Bruijn
<willemdebruijn.kernel@gmail.com> wrote:
>
> On Thu, Jun 10, 2021 at 11:40 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> >
> > 在 2021/6/11 上午10:45, Willem de Bruijn 写道:
> > > On Thu, Jun 10, 2021 at 10:11 PM Jason Wang <jasowang@redhat.com> wrote:
> > >>
> > >> 在 2021/6/10 下午10:04, Willem de Bruijn 写道:
> > >>> On Thu, Jun 10, 2021 at 1:25 AM Jason Wang <jasowang@redhat.com> wrote:
> > >>>> 在 2021/6/10 下午12:19, Alexei Starovoitov 写道:
> > >>>>> On Wed, Jun 9, 2021 at 9:13 PM Jason Wang <jasowang@redhat.com> wrote:
> > >>>>>> So I wonder why not simply use helpers to access the vnet header like
> > >>>>>> how tcp-bpf access the tcp header?
> > >>>>> Short answer - speed.
> > >>>>> tcp-bpf accesses all uapi and non-uapi structs directly.
> > >>>>>
> > >>>> Ok, this makes sense. But instead of coupling device specific stuffs
> > >>>> like vnet header and neediness into general flow_keys as a context.
> > >>>>
> > >>>> It would be better to introduce a vnet header context which contains
> > >>>>
> > >>>> 1) vnet header
> > >>>> 2) flow keys
> > >>>> 3) other contexts like endian and virtio-net features
> > >>>>
> > >>>> So we preserve the performance and decouple the virtio-net stuffs from
> > >>>> general structures like flow_keys or __sk_buff.
> > >>> You are advocating for a separate BPF program that takes a vnet hdr
> > >>> and flow_keys as context and is run separately after flow dissection?
> > >>
> > >> Yes.
> > >>
> > >>
> > >>> I don't understand the benefit of splitting the program in two in this manner.
> > >>
> > >> It decouples a device specific attributes from the general structures
> > >> like flow keys. We have xen-netfront, netvsc and a lot of drivers that
> > >> works for the emulated devices. We could not add all those metadatas as
> > >> the context of flow keys.
> > > What are device-specific attributes here? What kind of metadata?
> >
> >
> > Isn't virtio_net_hdr a virito-net specific metadata?
>
> I don't think it is ever since it was also adopted for tun, tap,
> pf_packet and uml. Basically, all callers of virtio_net_hdr_to_skb.
>
> >
> >
> > >
> > > The only metadata that can be passed with tuntap, pf_packet et al is
> > > virtio_net_hdr.
> > >
> > > I quite don't understand where xen-netfront et al come in.
> >
> >
> > The problem is, what kind of issue you want to solve. If you want to
> > solve virtio specific issue, why do you need to do that in the general
> > flow dissector?
Suppose we determine that it would indeed also be good to add
support for xen-netfront, netvsc validation in this way. Is your
suggestion that these would need to be added all in the same patch
series? Could we not implement just virtio-net first, and add the
others subsequently, if we can demonstrate a feasible plan for
doing so? Thanks
>
> >
> > If you want to solve the general issue of the packet metadata validation
> > from untrusted source, you need validate not only virtio-net but all the
> > others. Netfront is another dodgy source and there're a lot of implied
> > untrusted source in the case of virtualization environment.
>
> Ah understood.
>
> Yes, the goal is to protect the kernel from untrusted packets coming
> from userspace in general. There are only a handful such injection
> APIs.
>
> I have not had to deal with netfront as much as the virtio_net_hdr
> users. I'll take a look at that and netvsc. I cannot recall seeing as
> many syzkaller reports about those, but that may not necessarily imply
> that they are more robust -- it could just be that syzkaller has no
> easy way to exercise them, like with packet sockets.
>
> >
> > >
> > >> That's why I suggest to use something more
> > >> generic like XDP from the start. Yes, GSO stuffs was disabled by
> > >> virtio-net on XDP but it's not something that can not be fixed. If the
> > >> GSO and s/g support can not be done in short time
> > > An alternative interface does not address that we already have this
> > > interface and it is already causing problems.
> >
> >
> > What problems did you meant here?
>
> The long list of syzkaller bugs that required fixes either directly in
> virtio_net_hdr_to_skb or deeper in the stack, e.g.,
>
> 924a9bc362a5 net: check if protocol extracted by
> virtio_net_hdr_set_proto is correct
> 6dd912f82680 net: check untrusted gso_size at kernel entry
> 9274124f023b net: stricter validation of untrusted gso packets
> d2aa125d6290 net: Don't set transport offset to invalid value
> d5be7f632bad net: validate untrusted gso packets without csum offload
> 9d2f67e43b73 net/packet: fix packet drop as of virtio gso
>
> 7c68d1a6b4db net: qdisc_pkt_len_init() should be more robust
> cb9f1b783850 ip: validate header length on virtual device xmit
> 418e897e0716 gso: validate gso_type on ipip style tunnels
> 121d57af308d gso: validate gso_type in GSO handlers
>
> This is not necessarily an exhaustive list.
>
> And others not directly due to gso/csum metadata, but malformed
> packets from userspace nonetheless, such as
>
> 76c0ddd8c3a6 ip6_tunnel: be careful when accessing the inner header
>
> > >
> > >> then a virtio-net
> > >> specific BPF program still looks much better than coupling virtio-net
> > >> metadata into flow keys or other general data structures.
> > >>
> > >>
> > >>> Your previous comment mentions two vnet_hdr definitions that can get
> > >>> out of sync. Do you mean v1 of this patch, that adds the individual
> > >>> fields to bpf_flow_dissector?
> > >>
> > >> No, I meant this part of the patch:
> > >>
> > >>
> > >> +static int check_virtio_net_hdr_access(struct bpf_verifier_env *env,
> > >> int off,
> > >> +                       int size)
> > >> +{
> > >> +    if (size < 0 || off < 0 ||
> > >> +        (u64)off + size > sizeof(struct virtio_net_hdr)) {
> > >> +        verbose(env, "invalid access to virtio_net_hdr off=%d size=%d\n",
> > >> +            off, size);
> > >> +        return -EACCES;
> > >> +    }
> > >> +    return 0;
> > >> +}
> > >> +
> > >>
> > >>
> > >> It prevents the program from accessing e.g num_buffers.
> > > I see, thanks. See my response to your following point.
> > >
> > >>
> > >>> That is no longer the case: the latest
> > >>> version directly access the real struct. As Alexei points out, doing
> > >>> this does not set virtio_net_hdr in stone in the ABI. That is a valid
> > >>> worry. But so this patch series will not restrict how that struct may
> > >>> develop over time. A version field allows a BPF program to parse the
> > >>> different variants of the struct -- in the same manner as other
> > >>> protocol headers.
> > >>
> > >> The format of the virtio-net header depends on the virtio features, any
> > >> reason for another version? The correct way is to provide features in
> > >> the context, in this case you don't event need the endian hint.
> > > That might work. It clearly works for virtio. Not sure how to apply it
> > > to pf_packet or tuntap callers of virtio_net_hdr_to_skb.
> >
> >
> > This requires more thought but it also applies to the version. For
> > tuntap, features could be deduced from the 1) TUN_SET_VET_HDR and 2)
> > TUN_SET_OFFLOADS
> >
> > Note that theatrically features could be provided by the userspace, but
> > version is not (unless it's a part of uAPI but it became a duplication
> > of the features).
> >
> > Actually, this is a strong hint that the conext for packet, tuntap is
> > different with virtio-net (though they are sharing the same (or patial)
> > vnet header structure). E.g tuntap is unaware of mergerable buffers, it
> > just leave the room for vhost-net or qemu to fill that fields.
>
> True. We need to pass the length of the struct along with any
> version/typing information (which is currently only that endianness
> boolean). Also to address the bounds checking issue you raised above.
>
> >
> > >
> > >>> If you prefer, we can add that field from the start.
> > >>> I don't see a benefit to an extra layer of indirection in the form of
> > >>> helper functions.
> > >>>
> > >>> I do see downsides to splitting the program. The goal is to ensure
> > >>> consistency between vnet_hdr and packet payload. A program split
> > >>> limits to checking vnet_hdr against what the flow_keys struct has
> > >>> extracted. That is a great reduction over full packet access.
> > >>
> > >> Full packet access could be still done in bpf flow dissector.
> > >>
> > >>
> > >>> For
> > >>> instance, does the packet contain IP options? No idea.
> > >>
> > >> I don't understand here. You can figure out this in flow dissector, and
> > >> you can extend the flow keys to carry out this information if necessary.
> > > This I disagree with. flow_keys are a distillation/simplification of
> > > the packet contents. It is unlikely to capture every feature of every
> > > packet.
> >
> >
> > It depends on what kind of checks you want to do. When introducing a new
> > API like this, we need to make sure all the fields could be validated
> > instead of limiting it to some specific fields. Not all the fields are
> > related to the flow, that's another point that validating it in the flow
> > dissector is not a good choice.
> >
> > For vnet_header fields that is related to the flow, they should be a
> > subset of the current flow keys otherwise it's a hint a flow keys need
> > to be extended. For vnet_header fields that is not related to the flow,
> > validate it in flow dissector requires a lot of other context (features,
> > and probably the virtqueue size for the num_buffers). And if you want to
> > validate things that is totally unrelated to the vnet header (e.g IP
> > option), it can be done in the flow dissector right now or via XDP.
>
> But correlating this data with the metadata is needlessly more complex
> with two programs vs one. The issue is not that flow keys can capture
> all vnet_hdr fields. It is that it won't capture all relevant packet
> contents.
>
> >
> > >   We end up having to extend it for every new case we're
> > > interested in. That is ugly and a lot of busywork. And for what
> > > purpose? The virtio_net_hdr field prefaces the protocol headers in the
> > > same buffer in something like tpacket. Processing the metadata
> > > together with the data is straightforward. I don't see what isolation
> > > or layering that breaks.
> >
> >
> > Well, if you want to process metadata with the data, isn't XDP a much
> > more better place to do that?
>
> XDP or bpf flow dissector: they are both BPF programs. The defining
> point is what context to pass along.
>
> > >
> > >> And if you want to have more capability, XDP which is designed for early
> > >> packet filtering is the right way to go which have even more functions
> > >> that a simple bpf flow dissector.
> > >>
> > >>> If stable ABI is not a concern and there are no different struct
> > >>> definitions that can go out of sync, does that address your main
> > >>> concerns?
> > >>
> > >> I think not. Assuming we provide sufficient contexts (e.g the virtio
> > >> features), problem still: 1) coupling virtio-net with flow_keys
> > > A flow dissection program is allowed to read both contents of struct
> > > virtio_net_hdr and packet contents. virtio_net_hdr is not made part of
> > > struct bpf_flow_keys.
> >
> >
> > It doesn't matter whether or not it's a pointer. And actually you had
> > vhdr_is_little_endian:
> >
> > @@ -6017,6 +6017,8 @@ struct bpf_flow_keys {
> >         };
> >         __u32   flags;
> >         __be32  flow_label;
> > +       __bpf_md_ptr(const struct virtio_net_hdr *, vhdr);
> > +       __u8    vhdr_is_little_endian;
> >   };
> >
> >
> > And if we want to add the support to validate other untrusted sources,
> > do we want:
> >
> > struct bpf_flow_keys {
> >      /* general fields */
> >      virtio_net_context;
> >      xen_netfront_context;
> >      netvsc_context;
> >      ...
> > };
> >
> > ?
>
> No, I think just a pointer to the metadata context and a
> type/versioning field that tells the program how to interpret it. A
> strict program can drop everything that it does not understand. It
> does not have to be complete.
>
> > >   The pointer there is just a way to give access
> > > to multiple data sources through the single bpf program context.
> > >
> > >> 2) can't work for XDP.
> > > This future work (AF_?)XDP based alternative to
> > > pf_packet/tuntap/virtio does not exist yet, so it's hard to fully
> > > prepare for. But any replacement interface will observe the same
> > > issue: when specifying offloads like GSO/csum, that metadata may not
> > > agree with actual packet contents. We have to have a way to validate
> > > that. I could imagine that this XDP program attached to the AF_XDP
> > > interface would do the validation itself?
> >
> >
> > If the XDP can do validation itself, any reason to duplicate the work in
> > the flow dissector?
> >
> >
> > > Is that what you mean?
> >
> >
> > Kind of, it's not specific to AF_XDP, assuming XDP supports GSO/sg. With
> > XDP_REDIRECT/XDP_TX, you still need to validate the vnet header.
>
> The purpose is validation of data coming from untrusted userspace into
> the kernel.
>
> XDP_REDIRECT and XDP_TX forward data within the kernel, so are out of scope.
>
> In the case where they are used along with some ABI through which data
> can be inserted into the kernel, such as AF_XDP, it is relevant. And I
> agree that then the XDP program can do the validation directly.
>
> That just does not address validating of legacy interfaces. Those
> interfaces generally require CAP_NET_RAW to setup, but they are often
> used after setup to accept untrusted contents, so I don't think they
> should be considered "root, so anything goes".
>
> > Thanks
>
> Thanks for the discussion. As said, we'll start by looking at the
> other similar netfront interfaces.
>
> >
> > >
> >
^ permalink raw reply	[flat|nested] 27+ messages in thread
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-14 20:41                           ` Tanner Love
@ 2021-06-15  8:57                             ` Jason Wang
  0 siblings, 0 replies; 27+ messages in thread
From: Jason Wang @ 2021-06-15  8:57 UTC (permalink / raw)
  To: Tanner Love, Willem de Bruijn
  Cc: Alexei Starovoitov, Network Development, David Miller,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eric Dumazet, Petar Penkov, Jakub Kicinski, Michael S . Tsirkin,
	Tanner Love
在 2021/6/15 上午4:41, Tanner Love 写道:
>>>> The only metadata that can be passed with tuntap, pf_packet et al is
>>>> virtio_net_hdr.
>>>>
>>>> I quite don't understand where xen-netfront et al come in.
>>> The problem is, what kind of issue you want to solve. If you want to
>>> solve virtio specific issue, why do you need to do that in the general
>>> flow dissector?
> Suppose we determine that it would indeed also be good to add
> support for xen-netfront, netvsc validation in this way. Is your
> suggestion that these would need to be added all in the same patch
> series?
No.
> Could we not implement just virtio-net first, and add the
> others subsequently, if we can demonstrate a feasible plan for
> doing so? Thanks
Yes, as replied in another thread. I want to make sure whether doing 
this via flow dissector is the best way.
Thanks
>
^ permalink raw reply	[flat|nested] 27+ messages in thread 
 
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-11 14:12                         ` Willem de Bruijn
  2021-06-14 20:41                           ` Tanner Love
@ 2021-06-15  8:55                           ` Jason Wang
  2021-06-15 14:47                             ` Willem de Bruijn
  1 sibling, 1 reply; 27+ messages in thread
From: Jason Wang @ 2021-06-15  8:55 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Alexei Starovoitov, Tanner Love, Network Development,
	David Miller, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Eric Dumazet, Petar Penkov, Jakub Kicinski,
	Michael S . Tsirkin, Tanner Love
在 2021/6/11 下午10:12, Willem de Bruijn 写道:
> On Thu, Jun 10, 2021 at 11:40 PM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2021/6/11 上午10:45, Willem de Bruijn 写道:
>>> On Thu, Jun 10, 2021 at 10:11 PM Jason Wang <jasowang@redhat.com> wrote:
>>>> 在 2021/6/10 下午10:04, Willem de Bruijn 写道:
>>>>> On Thu, Jun 10, 2021 at 1:25 AM Jason Wang <jasowang@redhat.com> wrote:
>>>>>> 在 2021/6/10 下午12:19, Alexei Starovoitov 写道:
>>>>>>> On Wed, Jun 9, 2021 at 9:13 PM Jason Wang <jasowang@redhat.com> wrote:
>>>>>>>> So I wonder why not simply use helpers to access the vnet header like
>>>>>>>> how tcp-bpf access the tcp header?
>>>>>>> Short answer - speed.
>>>>>>> tcp-bpf accesses all uapi and non-uapi structs directly.
>>>>>>>
>>>>>> Ok, this makes sense. But instead of coupling device specific stuffs
>>>>>> like vnet header and neediness into general flow_keys as a context.
>>>>>>
>>>>>> It would be better to introduce a vnet header context which contains
>>>>>>
>>>>>> 1) vnet header
>>>>>> 2) flow keys
>>>>>> 3) other contexts like endian and virtio-net features
>>>>>>
>>>>>> So we preserve the performance and decouple the virtio-net stuffs from
>>>>>> general structures like flow_keys or __sk_buff.
>>>>> You are advocating for a separate BPF program that takes a vnet hdr
>>>>> and flow_keys as context and is run separately after flow dissection?
>>>> Yes.
>>>>
>>>>
>>>>> I don't understand the benefit of splitting the program in two in this manner.
>>>> It decouples a device specific attributes from the general structures
>>>> like flow keys. We have xen-netfront, netvsc and a lot of drivers that
>>>> works for the emulated devices. We could not add all those metadatas as
>>>> the context of flow keys.
>>> What are device-specific attributes here? What kind of metadata?
>>
>> Isn't virtio_net_hdr a virito-net specific metadata?
> I don't think it is ever since it was also adopted for tun, tap,
> pf_packet and uml. Basically, all callers of virtio_net_hdr_to_skb.
For tun/tap it was used to serve the backend of the virtio-net datapath 
which is still virtio-net specific.
For pf_packet and uml, they looks more like a byproduct or shortcut of 
the virtio-net backend (vhost has code to use af_packet, but I'm not 
whether or not it still work), so still kind of.
But the most important thing is that, the semantic of vnet header is 
defined by virtio spec.
>
>>
>>> The only metadata that can be passed with tuntap, pf_packet et al is
>>> virtio_net_hdr.
>>>
>>> I quite don't understand where xen-netfront et al come in.
>>
>> The problem is, what kind of issue you want to solve. If you want to
>> solve virtio specific issue, why do you need to do that in the general
>> flow dissector?
>>
>> If you want to solve the general issue of the packet metadata validation
>> from untrusted source, you need validate not only virtio-net but all the
>> others. Netfront is another dodgy source and there're a lot of implied
>> untrusted source in the case of virtualization environment.
> Ah understood.
>
> Yes, the goal is to protect the kernel from untrusted packets coming
> from userspace in general. There are only a handful such injection
> APIs.
Userspace is not necessarily the source: it could come from a device or 
BPF.
Theoretically, in the case of virtualization/smart NIC, they could be 
other type of untrusted source. Do we need to validate/mitigate the issues
1) per source/device like what this patch did, looks like a lot of work
2) validate via the general eBPF facility like XDP which requires 
multi-buffer and gso/csum support (which we know will happen for sure)
?
>
> I have not had to deal with netfront as much as the virtio_net_hdr
> users. I'll take a look at that and netvsc.
For xen, it's interesting that dodgy was set by netfront but not netback.
>   I cannot recall seeing as
> many syzkaller reports about those, but that may not necessarily imply
> that they are more robust -- it could just be that syzkaller has no
> easy way to exercise them, like with packet sockets.
Yes.
>
>>>> That's why I suggest to use something more
>>>> generic like XDP from the start. Yes, GSO stuffs was disabled by
>>>> virtio-net on XDP but it's not something that can not be fixed. If the
>>>> GSO and s/g support can not be done in short time
>>> An alternative interface does not address that we already have this
>>> interface and it is already causing problems.
>>
>> What problems did you meant here?
> The long list of syzkaller bugs that required fixes either directly in
> virtio_net_hdr_to_skb or deeper in the stack, e.g.,
>
> 924a9bc362a5 net: check if protocol extracted by
> virtio_net_hdr_set_proto is correct
> 6dd912f82680 net: check untrusted gso_size at kernel entry
> 9274124f023b net: stricter validation of untrusted gso packets
> d2aa125d6290 net: Don't set transport offset to invalid value
> d5be7f632bad net: validate untrusted gso packets without csum offload
> 9d2f67e43b73 net/packet: fix packet drop as of virtio gso
>
> 7c68d1a6b4db net: qdisc_pkt_len_init() should be more robust
> cb9f1b783850 ip: validate header length on virtual device xmit
> 418e897e0716 gso: validate gso_type on ipip style tunnels
> 121d57af308d gso: validate gso_type in GSO handlers
>
> This is not necessarily an exhaustive list.
>
> And others not directly due to gso/csum metadata, but malformed
> packets from userspace nonetheless, such as
>
> 76c0ddd8c3a6 ip6_tunnel: be careful when accessing the inner header
I see. So if I understand correctly, introducing SKB_GSO_DODGY is a 
balance between security and performance. That is to say, defer the 
gso/header check until just before they are used. But a lot of codes 
were not wrote under this assumption (forget to deal with dodgy packets) 
which breaks things.
>
>>>> then a virtio-net
>>>> specific BPF program still looks much better than coupling virtio-net
>>>> metadata into flow keys or other general data structures.
>>>>
>>>>
>>>>> Your previous comment mentions two vnet_hdr definitions that can get
>>>>> out of sync. Do you mean v1 of this patch, that adds the individual
>>>>> fields to bpf_flow_dissector?
>>>> No, I meant this part of the patch:
>>>>
>>>>
>>>> +static int check_virtio_net_hdr_access(struct bpf_verifier_env *env,
>>>> int off,
>>>> +                       int size)
>>>> +{
>>>> +    if (size < 0 || off < 0 ||
>>>> +        (u64)off + size > sizeof(struct virtio_net_hdr)) {
>>>> +        verbose(env, "invalid access to virtio_net_hdr off=%d size=%d\n",
>>>> +            off, size);
>>>> +        return -EACCES;
>>>> +    }
>>>> +    return 0;
>>>> +}
>>>> +
>>>>
>>>>
>>>> It prevents the program from accessing e.g num_buffers.
>>> I see, thanks. See my response to your following point.
>>>
>>>>> That is no longer the case: the latest
>>>>> version directly access the real struct. As Alexei points out, doing
>>>>> this does not set virtio_net_hdr in stone in the ABI. That is a valid
>>>>> worry. But so this patch series will not restrict how that struct may
>>>>> develop over time. A version field allows a BPF program to parse the
>>>>> different variants of the struct -- in the same manner as other
>>>>> protocol headers.
>>>> The format of the virtio-net header depends on the virtio features, any
>>>> reason for another version? The correct way is to provide features in
>>>> the context, in this case you don't event need the endian hint.
>>> That might work. It clearly works for virtio. Not sure how to apply it
>>> to pf_packet or tuntap callers of virtio_net_hdr_to_skb.
>>
>> This requires more thought but it also applies to the version. For
>> tuntap, features could be deduced from the 1) TUN_SET_VET_HDR and 2)
>> TUN_SET_OFFLOADS
>>
>> Note that theatrically features could be provided by the userspace, but
>> version is not (unless it's a part of uAPI but it became a duplication
>> of the features).
>>
>> Actually, this is a strong hint that the conext for packet, tuntap is
>> different with virtio-net (though they are sharing the same (or patial)
>> vnet header structure). E.g tuntap is unaware of mergerable buffers, it
>> just leave the room for vhost-net or qemu to fill that fields.
> True. We need to pass the length of the struct along with any
> version/typing information (which is currently only that endianness
> boolean). Also to address the bounds checking issue you raised above.
For packet socket, it's work probably via assuming a fixed set of feature.
>>>>> If you prefer, we can add that field from the start.
>>>>> I don't see a benefit to an extra layer of indirection in the form of
>>>>> helper functions.
>>>>>
>>>>> I do see downsides to splitting the program. The goal is to ensure
>>>>> consistency between vnet_hdr and packet payload. A program split
>>>>> limits to checking vnet_hdr against what the flow_keys struct has
>>>>> extracted. That is a great reduction over full packet access.
>>>> Full packet access could be still done in bpf flow dissector.
>>>>
>>>>
>>>>> For
>>>>> instance, does the packet contain IP options? No idea.
>>>> I don't understand here. You can figure out this in flow dissector, and
>>>> you can extend the flow keys to carry out this information if necessary.
>>> This I disagree with. flow_keys are a distillation/simplification of
>>> the packet contents. It is unlikely to capture every feature of every
>>> packet.
>>
>> It depends on what kind of checks you want to do. When introducing a new
>> API like this, we need to make sure all the fields could be validated
>> instead of limiting it to some specific fields. Not all the fields are
>> related to the flow, that's another point that validating it in the flow
>> dissector is not a good choice.
>>
>> For vnet_header fields that is related to the flow, they should be a
>> subset of the current flow keys otherwise it's a hint a flow keys need
>> to be extended. For vnet_header fields that is not related to the flow,
>> validate it in flow dissector requires a lot of other context (features,
>> and probably the virtqueue size for the num_buffers). And if you want to
>> validate things that is totally unrelated to the vnet header (e.g IP
>> option), it can be done in the flow dissector right now or via XDP.
> But correlating this data with the metadata is needlessly more complex
> with two programs vs one. The issue is not that flow keys can capture
> all vnet_hdr fields. It is that it won't capture all relevant packet
> contents.
>
>>>    We end up having to extend it for every new case we're
>>> interested in. That is ugly and a lot of busywork. And for what
>>> purpose? The virtio_net_hdr field prefaces the protocol headers in the
>>> same buffer in something like tpacket. Processing the metadata
>>> together with the data is straightforward. I don't see what isolation
>>> or layering that breaks.
>>
>> Well, if you want to process metadata with the data, isn't XDP a much
>> more better place to do that?
> XDP or bpf flow dissector: they are both BPF programs. The defining
> point is what context to pass along.
I think XDP covers the use cases here and XDP can do more than just flow 
dissection.
A minor issue is that XDP doesn't work for af_packet, but it's not hard 
to add.
(Assuming XDP has multi-buffer support).
>
>>>> And if you want to have more capability, XDP which is designed for early
>>>> packet filtering is the right way to go which have even more functions
>>>> that a simple bpf flow dissector.
>>>>
>>>>> If stable ABI is not a concern and there are no different struct
>>>>> definitions that can go out of sync, does that address your main
>>>>> concerns?
>>>> I think not. Assuming we provide sufficient contexts (e.g the virtio
>>>> features), problem still: 1) coupling virtio-net with flow_keys
>>> A flow dissection program is allowed to read both contents of struct
>>> virtio_net_hdr and packet contents. virtio_net_hdr is not made part of
>>> struct bpf_flow_keys.
>>
>> It doesn't matter whether or not it's a pointer. And actually you had
>> vhdr_is_little_endian:
>>
>> @@ -6017,6 +6017,8 @@ struct bpf_flow_keys {
>>          };
>>          __u32   flags;
>>          __be32  flow_label;
>> +       __bpf_md_ptr(const struct virtio_net_hdr *, vhdr);
>> +       __u8    vhdr_is_little_endian;
>>    };
>>
>>
>> And if we want to add the support to validate other untrusted sources,
>> do we want:
>>
>> struct bpf_flow_keys {
>>       /* general fields */
>>       virtio_net_context;
>>       xen_netfront_context;
>>       netvsc_context;
>>       ...
>> };
>>
>> ?
> No, I think just a pointer to the metadata context and a
> type/versioning field that tells the program how to interpret it. A
> strict program can drop everything that it does not understand. It
> does not have to be complete.
Yes, but still a kind of coupling consider we have a way to have this. (XDP)
>
>>>    The pointer there is just a way to give access
>>> to multiple data sources through the single bpf program context.
>>>
>>>> 2) can't work for XDP.
>>> This future work (AF_?)XDP based alternative to
>>> pf_packet/tuntap/virtio does not exist yet, so it's hard to fully
>>> prepare for. But any replacement interface will observe the same
>>> issue: when specifying offloads like GSO/csum, that metadata may not
>>> agree with actual packet contents. We have to have a way to validate
>>> that. I could imagine that this XDP program attached to the AF_XDP
>>> interface would do the validation itself?
>>
>> If the XDP can do validation itself, any reason to duplicate the work in
>> the flow dissector?
>>
>>
>>> Is that what you mean?
>>
>> Kind of, it's not specific to AF_XDP, assuming XDP supports GSO/sg. With
>> XDP_REDIRECT/XDP_TX, you still need to validate the vnet header.
> The purpose is validation of data coming from untrusted userspace into
> the kernel.
>
> XDP_REDIRECT and XDP_TX forward data within the kernel, so are out of scope.
So the problem is that, the NIC driver is wrote under the assumption 
that the gso metadata is correct. I remember there's a kernel CVE 
(divide by zero) because of the gso_size or other fields several years 
ago for a driver.
>
> In the case where they are used along with some ABI through which data
> can be inserted into the kernel,
CPUMAP is something that the data can be inserted into the kernel.
> such as AF_XDP, it is relevant. And I
> agree that then the XDP program can do the validation directly.
>
> That just does not address validating of legacy interfaces. Those
> interfaces generally require CAP_NET_RAW to setup, but they are often
> used after setup to accept untrusted contents, so I don't think they
> should be considered "root, so anything goes".
I'm not sure I understand the meaning of "legacy" interface here.
>
>> Thanks
> Thanks for the discussion. As said, we'll start by looking at the
> other similar netfront interfaces.
Thanks, I'm not objecting the goal, but I want to figure out whether 
it's the best to choice to do that via flow dissector.
>
^ permalink raw reply	[flat|nested] 27+ messages in thread
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-15  8:55                           ` Jason Wang
@ 2021-06-15 14:47                             ` Willem de Bruijn
  2021-06-16 10:21                               ` Jason Wang
  0 siblings, 1 reply; 27+ messages in thread
From: Willem de Bruijn @ 2021-06-15 14:47 UTC (permalink / raw)
  To: Jason Wang
  Cc: Alexei Starovoitov, Tanner Love, Network Development,
	David Miller, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Eric Dumazet, Petar Penkov, Jakub Kicinski,
	Michael S . Tsirkin, Tanner Love
> >> Isn't virtio_net_hdr a virito-net specific metadata?
> > I don't think it is ever since it was also adopted for tun, tap,
> > pf_packet and uml. Basically, all callers of virtio_net_hdr_to_skb.
>
>
> For tun/tap it was used to serve the backend of the virtio-net datapath
> which is still virtio-net specific.
>
> For pf_packet and uml, they looks more like a byproduct or shortcut of
> the virtio-net backend (vhost has code to use af_packet, but I'm not
> whether or not it still work), so still kind of.
>
> But the most important thing is that, the semantic of vnet header is
> defined by virtio spec.
The packet socket and tuntap interfaces have use cases unrelated
to virtio. virtio_net_hdr is just a way to egress packets with offloads.
So yes and no. The current definition of struct virtio_net_hdr is part of
the Linux ABI as is, and used outside virtio.
If virtio changes its spec, let's say to add SO_TXTIME to give an
example, we may or may not bother to add a pf_packet setsockopt
to optionally support that.
>
> >
> >>
> >>> The only metadata that can be passed with tuntap, pf_packet et al is
> >>> virtio_net_hdr.
> >>>
> >>> I quite don't understand where xen-netfront et al come in.
> >>
> >> The problem is, what kind of issue you want to solve. If you want to
> >> solve virtio specific issue, why do you need to do that in the general
> >> flow dissector?
> >>
> >> If you want to solve the general issue of the packet metadata validation
> >> from untrusted source, you need validate not only virtio-net but all the
> >> others. Netfront is another dodgy source and there're a lot of implied
> >> untrusted source in the case of virtualization environment.
> > Ah understood.
> >
> > Yes, the goal is to protect the kernel from untrusted packets coming
> > from userspace in general. There are only a handful such injection
> > APIs.
>
> Userspace is not necessarily the source: it could come from a device or
> BPF.
>
> Theoretically, in the case of virtualization/smart NIC, they could be
> other type of untrusted source. Do we need to validate/mitigate the issues
Adding packets coming from NICs or BPF programs greatly expands
the scope.
The purpose of this patch series is to protect the kernel against packets
inserted from userspace, by adding validation at the entry point.
Agreed that BPF programs can do unspeakable things to packets, but
that is a different issue (with a different trust model), and out of scope.
> 1) per source/device like what this patch did, looks like a lot of work
> 2) validate via the general eBPF facility like XDP which requires
> multi-buffer and gso/csum support (which we know will happen for sure)
>
> ?
Are you thinking of the XDP support in virtio-net, which is specific
to that device and does not capture these other uses cases of
struct virtio_net_hdr_to_skb?
Or are you okay with the validation hook, but suggesting using
an XDP program type instead of a flow dissector program type?
>
> >
> > I have not had to deal with netfront as much as the virtio_net_hdr
> > users. I'll take a look at that and netvsc.
>
>
> For xen, it's interesting that dodgy was set by netfront but not netback.
>
>
> >   I cannot recall seeing as
> > many syzkaller reports about those, but that may not necessarily imply
> > that they are more robust -- it could just be that syzkaller has no
> > easy way to exercise them, like with packet sockets.
>
>
> Yes.
>
>
> >
> >>>> That's why I suggest to use something more
> >>>> generic like XDP from the start. Yes, GSO stuffs was disabled by
> >>>> virtio-net on XDP but it's not something that can not be fixed. If the
> >>>> GSO and s/g support can not be done in short time
> >>> An alternative interface does not address that we already have this
> >>> interface and it is already causing problems.
> >>
> >> What problems did you meant here?
> > The long list of syzkaller bugs that required fixes either directly in
> > virtio_net_hdr_to_skb or deeper in the stack, e.g.,
> >
> > 924a9bc362a5 net: check if protocol extracted by
> > virtio_net_hdr_set_proto is correct
> > 6dd912f82680 net: check untrusted gso_size at kernel entry
> > 9274124f023b net: stricter validation of untrusted gso packets
> > d2aa125d6290 net: Don't set transport offset to invalid value
> > d5be7f632bad net: validate untrusted gso packets without csum offload
> > 9d2f67e43b73 net/packet: fix packet drop as of virtio gso
> >
> > 7c68d1a6b4db net: qdisc_pkt_len_init() should be more robust
> > cb9f1b783850 ip: validate header length on virtual device xmit
> > 418e897e0716 gso: validate gso_type on ipip style tunnels
> > 121d57af308d gso: validate gso_type in GSO handlers
> >
> > This is not necessarily an exhaustive list.
> >
> > And others not directly due to gso/csum metadata, but malformed
> > packets from userspace nonetheless, such as
> >
> > 76c0ddd8c3a6 ip6_tunnel: be careful when accessing the inner header
>
>
> I see. So if I understand correctly, introducing SKB_GSO_DODGY is a
> balance between security and performance. That is to say, defer the
> gso/header check until just before they are used. But a lot of codes
> were not wrote under this assumption (forget to deal with dodgy packets)
> which breaks things.
DODGY and requiring robust kernel stack code is part of it, but
Making the core kernel stack robust against bad packets incurs cost on
every packet for the relatively rare case of these packets coming from
outside the stack. Ideally, this cost is incurred on such packets
alone. That's why I prefer validation at the source.
This patch series made it *optional* validation at the source to
address your concerns about performance regressions. I.e., if you know
that the path packets will take is robust, such as a simple bridging
case between VMs, it is perhaps fine to forgo the checks. But egress
over a physical NIC or even through the kernel egress path (or
loopback to ingress, which has a different set of expectations) has
higher integrity requirements.
Not all concerns are GSO, and thus captured by DODGY. Such as truncated hdrlen.
> So the problem is that, the NIC driver is wrote under the assumption
> that the gso metadata is correct. I remember there's a kernel CVE
> (divide by zero) because of the gso_size or other fields several years
> ago for a driver.
Right, and more: both NICs and the software stack itself have various
expectations on input. Such as whether protocol headers are in the
linear section, or that the csum offset is within bounds.
> > In the case where they are used along with some ABI through which data
> > can be inserted into the kernel,
>
>
> CPUMAP is something that the data can be inserted into the kernel.
>
>
> > such as AF_XDP, it is relevant. And I
> > agree that then the XDP program can do the validation directly.
> >
> > That just does not address validating of legacy interfaces. Those
> > interfaces generally require CAP_NET_RAW to setup, but they are often
> > used after setup to accept untrusted contents, so I don't think they
> > should be considered "root, so anything goes".
>
>
> I'm not sure I understand the meaning of "legacy" interface here.
the existing interfaces to insert packets from userspace:
tuntap/uml/pf_packet/..
>
>
> >
> >> Thanks
> > Thanks for the discussion. As said, we'll start by looking at the
> > other similar netfront interfaces.
>
>
> Thanks, I'm not objecting the goal, but I want to figure out whether
> it's the best to choice to do that via flow dissector.
Understood. My main question at this point is whether you are
suggesting a different bpf program type at the same hooks, or are
arguing that a XDP-based interface will make all this obsolete.
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-15 14:47                             ` Willem de Bruijn
@ 2021-06-16 10:21                               ` Jason Wang
  2021-06-17 14:43                                 ` Willem de Bruijn
  0 siblings, 1 reply; 27+ messages in thread
From: Jason Wang @ 2021-06-16 10:21 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Alexei Starovoitov, Tanner Love, Network Development,
	David Miller, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Eric Dumazet, Petar Penkov, Jakub Kicinski,
	Michael S . Tsirkin, Tanner Love
在 2021/6/15 下午10:47, Willem de Bruijn 写道:
>>>> Isn't virtio_net_hdr a virito-net specific metadata?
>>> I don't think it is ever since it was also adopted for tun, tap,
>>> pf_packet and uml. Basically, all callers of virtio_net_hdr_to_skb.
>>
>> For tun/tap it was used to serve the backend of the virtio-net datapath
>> which is still virtio-net specific.
>>
>> For pf_packet and uml, they looks more like a byproduct or shortcut of
>> the virtio-net backend (vhost has code to use af_packet, but I'm not
>> whether or not it still work), so still kind of.
>>
>> But the most important thing is that, the semantic of vnet header is
>> defined by virtio spec.
> The packet socket and tuntap interfaces have use cases unrelated
> to virtio. virtio_net_hdr is just a way to egress packets with offloads.
>
> So yes and no. The current definition of struct virtio_net_hdr is part of
> the Linux ABI as is, and used outside virtio.
>
> If virtio changes its spec, let's say to add SO_TXTIME to give an
> example, we may or may not bother to add a pf_packet setsockopt
> to optionally support that.
Right.
>
>>>>> The only metadata that can be passed with tuntap, pf_packet et al is
>>>>> virtio_net_hdr.
>>>>>
>>>>> I quite don't understand where xen-netfront et al come in.
>>>> The problem is, what kind of issue you want to solve. If you want to
>>>> solve virtio specific issue, why do you need to do that in the general
>>>> flow dissector?
>>>>
>>>> If you want to solve the general issue of the packet metadata validation
>>>> from untrusted source, you need validate not only virtio-net but all the
>>>> others. Netfront is another dodgy source and there're a lot of implied
>>>> untrusted source in the case of virtualization environment.
>>> Ah understood.
>>>
>>> Yes, the goal is to protect the kernel from untrusted packets coming
>>> from userspace in general. There are only a handful such injection
>>> APIs.
>> Userspace is not necessarily the source: it could come from a device or
>> BPF.
>>
>> Theoretically, in the case of virtualization/smart NIC, they could be
>> other type of untrusted source. Do we need to validate/mitigate the issues
> Adding packets coming from NICs or BPF programs greatly expands
> the scope.
>
> The purpose of this patch series is to protect the kernel against packets
> inserted from userspace, by adding validation at the entry point.
>
> Agreed that BPF programs can do unspeakable things to packets, but
> that is a different issue (with a different trust model), and out of scope.
Ok, I think I understand your point, so basically we had two types of 
untrusted sources for virtio_net_hdr_to_skb():
1) packet injected from userspace: tun, tap, packet
2) packet received from a NIC: virtio-net, uml
If I understand correctly, you only care about case 1). But the method 
could also be used by case 2).
For 1) the proposal should work, we only need to care about csum/gso 
stuffs instead of virtio specific attributes like num_buffers.
And 2) is the one that I want to make sure it works as expected since it 
requires more context to validate and we have other untrusted NICs
>
>> 1) per source/device like what this patch did, looks like a lot of work
>> 2) validate via the general eBPF facility like XDP which requires
>> multi-buffer and gso/csum support (which we know will happen for sure)
>>
>> ?
> Are you thinking of the XDP support in virtio-net, which is specific
> to that device and does not capture these other uses cases of
> struct virtio_net_hdr_to_skb?
Just to clarify, currently when using XDP, virtio-net will think the 
vnet header is invalid since there's no way to access the vnet header 
and XDP doesn't support s/g, csum and offloads. But I believe in the 
future it will get all those support, and then XDP should have a way to 
access the device specific packet header like vnet header. Assuming all 
of these are ready, virtio_net_hdr_to_skb() is not necessarily called in 
the RX path. This means, we may lose to track some packets if they were 
go via XDP which bypass the validation hooks in virtio_net_hdr_skb().
>
> Or are you okay with the validation hook, but suggesting using
> an XDP program type instead of a flow dissector program type?
Basically, I think the virtio_net_hdr_to_skb() is probably not the best 
place (see above).
I wonder whether XDP is the best place to do that. But XDP has its own 
issues e.g it can't work for packet and macvtap.
Or maybe we can simply tweak this patch to make sure the hooks works 
even if XDP is used.
>
>>> I have not had to deal with netfront as much as the virtio_net_hdr
>>> users. I'll take a look at that and netvsc.
>>
>> For xen, it's interesting that dodgy was set by netfront but not netback.
>>
>>
>>>    I cannot recall seeing as
>>> many syzkaller reports about those, but that may not necessarily imply
>>> that they are more robust -- it could just be that syzkaller has no
>>> easy way to exercise them, like with packet sockets.
>>
>> Yes.
>>
>>
>>>>>> That's why I suggest to use something more
>>>>>> generic like XDP from the start. Yes, GSO stuffs was disabled by
>>>>>> virtio-net on XDP but it's not something that can not be fixed. If the
>>>>>> GSO and s/g support can not be done in short time
>>>>> An alternative interface does not address that we already have this
>>>>> interface and it is already causing problems.
>>>> What problems did you meant here?
>>> The long list of syzkaller bugs that required fixes either directly in
>>> virtio_net_hdr_to_skb or deeper in the stack, e.g.,
>>>
>>> 924a9bc362a5 net: check if protocol extracted by
>>> virtio_net_hdr_set_proto is correct
>>> 6dd912f82680 net: check untrusted gso_size at kernel entry
>>> 9274124f023b net: stricter validation of untrusted gso packets
>>> d2aa125d6290 net: Don't set transport offset to invalid value
>>> d5be7f632bad net: validate untrusted gso packets without csum offload
>>> 9d2f67e43b73 net/packet: fix packet drop as of virtio gso
>>>
>>> 7c68d1a6b4db net: qdisc_pkt_len_init() should be more robust
>>> cb9f1b783850 ip: validate header length on virtual device xmit
>>> 418e897e0716 gso: validate gso_type on ipip style tunnels
>>> 121d57af308d gso: validate gso_type in GSO handlers
>>>
>>> This is not necessarily an exhaustive list.
>>>
>>> And others not directly due to gso/csum metadata, but malformed
>>> packets from userspace nonetheless, such as
>>>
>>> 76c0ddd8c3a6 ip6_tunnel: be careful when accessing the inner header
>>
>> I see. So if I understand correctly, introducing SKB_GSO_DODGY is a
>> balance between security and performance. That is to say, defer the
>> gso/header check until just before they are used. But a lot of codes
>> were not wrote under this assumption (forget to deal with dodgy packets)
>> which breaks things.
> DODGY and requiring robust kernel stack code is part of it, but
>
> Making the core kernel stack robust against bad packets incurs cost on
> every packet for the relatively rare case of these packets coming from
> outside the stack. Ideally, this cost is incurred on such packets
> alone. That's why I prefer validation at the source.
>
> This patch series made it *optional* validation at the source to
> address your concerns about performance regressions. I.e., if you know
> that the path packets will take is robust, such as a simple bridging
> case between VMs, it is perhaps fine to forgo the checks. But egress
> over a physical NIC or even through the kernel egress path (or
> loopback to ingress, which has a different set of expectations) has
> higher integrity requirements.
>
> Not all concerns are GSO, and thus captured by DODGY. Such as truncated hdrlen.
Yes, what I understand that the major use case is the mitigation and we 
still need to make sure the stack is robust.
>
>> So the problem is that, the NIC driver is wrote under the assumption
>> that the gso metadata is correct. I remember there's a kernel CVE
>> (divide by zero) because of the gso_size or other fields several years
>> ago for a driver.
> Right, and more: both NICs and the software stack itself have various
> expectations on input. Such as whether protocol headers are in the
> linear section, or that the csum offset is within bounds.
>
>>> In the case where they are used along with some ABI through which data
>>> can be inserted into the kernel,
>>
>> CPUMAP is something that the data can be inserted into the kernel.
>>
>>
>>> such as AF_XDP, it is relevant. And I
>>> agree that then the XDP program can do the validation directly.
>>>
>>> That just does not address validating of legacy interfaces. Those
>>> interfaces generally require CAP_NET_RAW to setup, but they are often
>>> used after setup to accept untrusted contents, so I don't think they
>>> should be considered "root, so anything goes".
>>
>> I'm not sure I understand the meaning of "legacy" interface here.
> the existing interfaces to insert packets from userspace:
> tuntap/uml/pf_packet/..
Ok.
>
>>
>>>> Thanks
>>> Thanks for the discussion. As said, we'll start by looking at the
>>> other similar netfront interfaces.
>>
>> Thanks, I'm not objecting the goal, but I want to figure out whether
>> it's the best to choice to do that via flow dissector.
> Understood. My main question at this point is whether you are
> suggesting a different bpf program type at the same hooks, or are
> arguing that a XDP-based interface will make all this obsolete.
Ideally, I think XDP is probably the best. It is designed to do such 
early packet dropping per device. But it misses some important features 
which makes it can't work today.
The method proposed in this patch should work for userspace injected 
packets, but it may not work very well in the case of XDP in the future. 
Using another bpf program type may only solve the issue of vnet header 
coupling with vnet header.
I wonder whether or not we can do something in the middle:
1) make sure the flow dissector works even for the case of XDP (note 
that tun support XDP)
2) use some general fields instead of virtio-net specific fields, e.g 
using device header instead of vnet header in the flow keys strcuture
Or if the maintainers are happy with the current method, I don't want to 
block it.
Thanks
>
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-16 10:21                               ` Jason Wang
@ 2021-06-17 14:43                                 ` Willem de Bruijn
  2021-06-21  6:33                                   ` Jason Wang
  0 siblings, 1 reply; 27+ messages in thread
From: Willem de Bruijn @ 2021-06-17 14:43 UTC (permalink / raw)
  To: Jason Wang
  Cc: Willem de Bruijn, Alexei Starovoitov, Tanner Love,
	Network Development, David Miller, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Eric Dumazet, Petar Penkov,
	Jakub Kicinski, Michael S . Tsirkin, Tanner Love
On Wed, Jun 16, 2021 at 6:22 AM Jason Wang <jasowang@redhat.com> wrote:
>
>
> 在 2021/6/15 下午10:47, Willem de Bruijn 写道:
> >>>> Isn't virtio_net_hdr a virito-net specific metadata?
> >>> I don't think it is ever since it was also adopted for tun, tap,
> >>> pf_packet and uml. Basically, all callers of virtio_net_hdr_to_skb.
> >>
> >> For tun/tap it was used to serve the backend of the virtio-net datapath
> > The purpose of this patch series is to protect the kernel against packets
> > inserted from userspace, by adding validation at the entry point.
> >
> > Agreed that BPF programs can do unspeakable things to packets, but
> > that is a different issue (with a different trust model), and out of scope.
>
>
> Ok, I think I understand your point, so basically we had two types of
> untrusted sources for virtio_net_hdr_to_skb():
>
> 1) packet injected from userspace: tun, tap, packet
> 2) packet received from a NIC: virtio-net, uml
>
> If I understand correctly, you only care about case 1). But the method
> could also be used by case 2).
>
> For 1) the proposal should work, we only need to care about csum/gso
> stuffs instead of virtio specific attributes like num_buffers.
>
> And 2) is the one that I want to make sure it works as expected since it
> requires more context to validate and we have other untrusted NICs
Yes. I did not fully appreciate the two different use cases of this
interface. For packets entering the the receive stack from a network
device, I agree that XDP is a suitable drop filter in principle. No
need for another layer. In practice, the single page constraint is a
large constraint today. But as you point out multi-buffer is a work in
progress.
> Ideally, I think XDP is probably the best. It is designed to do such
> early packet dropping per device. But it misses some important features
> which makes it can't work today.
>
> The method proposed in this patch should work for userspace injected
> packets, but it may not work very well in the case of XDP in the future.
> Using another bpf program type may only solve the issue of vnet header
> coupling with vnet header.
>
> I wonder whether or not we can do something in the middle:
>
> 1) make sure the flow dissector works even for the case of XDP (note
> that tun support XDP)
I think that wherever an XDP program exists in the datapath, that can
do the filtering, so there is no need for additional flow dissection.
If restricting this patch series to the subset of callers that inject
into the egress path *and* one of them has an XDP hook, the scope for
this feature set is narrower.
> 2) use some general fields instead of virtio-net specific fields, e.g
> using device header instead of vnet header in the flow keys strcuture
Can you give an example of what would be in the device header?
Specific for GSO, we have two sets of constants: VIRTIO_NET_HDR_GSO_..
and SKB_GSO_.. Is the suggestion to replace the current use of the
first in field flow_keys->virtio_net_hdr.gso_type with the second in
flow_keys->gso_type?
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-17 14:43                                 ` Willem de Bruijn
@ 2021-06-21  6:33                                   ` Jason Wang
  2021-06-21 13:18                                     ` Willem de Bruijn
  0 siblings, 1 reply; 27+ messages in thread
From: Jason Wang @ 2021-06-21  6:33 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Alexei Starovoitov, Tanner Love, Network Development,
	David Miller, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Eric Dumazet, Petar Penkov, Jakub Kicinski,
	Michael S . Tsirkin, Tanner Love
在 2021/6/17 下午10:43, Willem de Bruijn 写道:
> On Wed, Jun 16, 2021 at 6:22 AM Jason Wang <jasowang@redhat.com> wrote:
>>
>> 在 2021/6/15 下午10:47, Willem de Bruijn 写道:
>>>>>> Isn't virtio_net_hdr a virito-net specific metadata?
>>>>> I don't think it is ever since it was also adopted for tun, tap,
>>>>> pf_packet and uml. Basically, all callers of virtio_net_hdr_to_skb.
>>>> For tun/tap it was used to serve the backend of the virtio-net datapath
>>> The purpose of this patch series is to protect the kernel against packets
>>> inserted from userspace, by adding validation at the entry point.
>>>
>>> Agreed that BPF programs can do unspeakable things to packets, but
>>> that is a different issue (with a different trust model), and out of scope.
>>
>> Ok, I think I understand your point, so basically we had two types of
>> untrusted sources for virtio_net_hdr_to_skb():
>>
>> 1) packet injected from userspace: tun, tap, packet
>> 2) packet received from a NIC: virtio-net, uml
>>
>> If I understand correctly, you only care about case 1). But the method
>> could also be used by case 2).
>>
>> For 1) the proposal should work, we only need to care about csum/gso
>> stuffs instead of virtio specific attributes like num_buffers.
>>
>> And 2) is the one that I want to make sure it works as expected since it
>> requires more context to validate and we have other untrusted NICs
> Yes. I did not fully appreciate the two different use cases of this
> interface. For packets entering the the receive stack from a network
> device, I agree that XDP is a suitable drop filter in principle. No
> need for another layer.
So strictly speaking, tuntap is also the path that the packet entering 
the receive stack.
> In practice, the single page constraint is a
> large constraint today. But as you point out multi-buffer is a work in
> progress.
Another advantage is that XDP is per device, but it looks to me flow 
dissector is per ns.
>
>> Ideally, I think XDP is probably the best. It is designed to do such
>> early packet dropping per device. But it misses some important features
>> which makes it can't work today.
>>
>> The method proposed in this patch should work for userspace injected
>> packets, but it may not work very well in the case of XDP in the future.
>> Using another bpf program type may only solve the issue of vnet header
>> coupling with vnet header.
>>
>> I wonder whether or not we can do something in the middle:
>>
>> 1) make sure the flow dissector works even for the case of XDP (note
>> that tun support XDP)
> I think that wherever an XDP program exists in the datapath, that can
> do the filtering, so there is no need for additional flow dissection.
I agree.
>
> If restricting this patch series to the subset of callers that inject
> into the egress path *and* one of them has an XDP hook, the scope for
> this feature set is narrower.
Right, if we don't restrict, we can't prevent user from using this in 2).
>
>> 2) use some general fields instead of virtio-net specific fields, e.g
>> using device header instead of vnet header in the flow keys strcuture
> Can you give an example of what would be in the device header?
>
> Specific for GSO, we have two sets of constants: VIRTIO_NET_HDR_GSO_..
> and SKB_GSO_.. Is the suggestion to replace the current use of the
> first in field flow_keys->virtio_net_hdr.gso_type with the second in
> flow_keys->gso_type?
No, I meant using a general fields like flow_keys->device_hdr. And use 
bpf helpers to access the field.
I think for mitigation, we don't care too much about the performance lose.
Thanks
>
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-21  6:33                                   ` Jason Wang
@ 2021-06-21 13:18                                     ` Willem de Bruijn
  2021-06-22  2:37                                       ` Jason Wang
  0 siblings, 1 reply; 27+ messages in thread
From: Willem de Bruijn @ 2021-06-21 13:18 UTC (permalink / raw)
  To: Jason Wang
  Cc: Willem de Bruijn, Alexei Starovoitov, Tanner Love,
	Network Development, David Miller, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Eric Dumazet, Petar Penkov,
	Jakub Kicinski, Michael S . Tsirkin, Tanner Love
> >> 2) use some general fields instead of virtio-net specific fields, e.g
> >> using device header instead of vnet header in the flow keys strcuture
> > Can you give an example of what would be in the device header?
> >
> > Specific for GSO, we have two sets of constants: VIRTIO_NET_HDR_GSO_..
> > and SKB_GSO_.. Is the suggestion to replace the current use of the
> > first in field flow_keys->virtio_net_hdr.gso_type with the second in
> > flow_keys->gso_type?
>
>
> No, I meant using a general fields like flow_keys->device_hdr. And use
> bpf helpers to access the field.
What would be in this device_hdr field, and what would the bpf helpers
access? I don't fully follow what this is if not vnet_hdr.
^ permalink raw reply	[flat|nested] 27+ messages in thread 
- * Re: [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb
  2021-06-21 13:18                                     ` Willem de Bruijn
@ 2021-06-22  2:37                                       ` Jason Wang
  0 siblings, 0 replies; 27+ messages in thread
From: Jason Wang @ 2021-06-22  2:37 UTC (permalink / raw)
  To: Willem de Bruijn
  Cc: Alexei Starovoitov, Tanner Love, Network Development,
	David Miller, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Eric Dumazet, Petar Penkov, Jakub Kicinski,
	Michael S . Tsirkin, Tanner Love
在 2021/6/21 下午9:18, Willem de Bruijn 写道:
>>>> 2) use some general fields instead of virtio-net specific fields, e.g
>>>> using device header instead of vnet header in the flow keys strcuture
>>> Can you give an example of what would be in the device header?
>>>
>>> Specific for GSO, we have two sets of constants: VIRTIO_NET_HDR_GSO_..
>>> and SKB_GSO_.. Is the suggestion to replace the current use of the
>>> first in field flow_keys->virtio_net_hdr.gso_type with the second in
>>> flow_keys->gso_type?
>>
>> No, I meant using a general fields like flow_keys->device_hdr. And use
>> bpf helpers to access the field.
> What would be in this device_hdr field, and what would the bpf helpers
> access? I don't fully follow what this is if not vnet_hdr.
For virtio-net, it should be just vnet_hdr. Maybe "device_hdr" is not 
accurate, "packet_hdr" should be better.
This allows the field to be reused by other type of userspace injected 
packet, like tun packet info.
Bpf helpers could be used to access the packet header in this case.
Thanks
>
^ permalink raw reply	[flat|nested] 27+ messages in thread 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
- * [PATCH net-next v4 3/3] selftests/net: amend bpf flow dissector prog to do vnet hdr validation
  2021-06-08 17:02 [PATCH net-next v4 0/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb Tanner Love
  2021-06-08 17:02 ` [PATCH net-next v4 1/3] net: flow_dissector: extend bpf flow dissector support with vnet hdr Tanner Love
  2021-06-08 17:02 ` [PATCH net-next v4 2/3] virtio_net: add optional flow dissection in virtio_net_hdr_to_skb Tanner Love
@ 2021-06-08 17:02 ` Tanner Love
  2 siblings, 0 replies; 27+ messages in thread
From: Tanner Love @ 2021-06-08 17:02 UTC (permalink / raw)
  To: netdev
  Cc: davem, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Eric Dumazet, Willem de Bruijn, Petar Penkov, Jakub Kicinski,
	Michael S . Tsirkin, Jason Wang, Tanner Love
From: Tanner Love <tannerlove@google.com>
Change the BPF flow dissector program to perform various checks on the
virtio_net_hdr fields after doing flow dissection.
Amend test_flow_dissector.(c|sh) to add test cases that inject packets
with reasonable or unreasonable virtio-net headers and assert that bad
packets are dropped and good packets are not. Do this via packet
socket; the kernel executes tpacket_snd, which enters
virtio_net_hdr_to_skb, where flow dissection / vnet header validation
occurs.
Changes
v4:
  - Read virtio_net_hdr pointer from struct bpf_flow_keys
  - Add vnet header endianness logic to BPF program
Signed-off-by: Tanner Love <tannerlove@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
---
 tools/testing/selftests/bpf/progs/bpf_flow.c  | 208 ++++++++++++++----
 .../selftests/bpf/test_flow_dissector.c       | 181 +++++++++++++--
 .../selftests/bpf/test_flow_dissector.sh      |  19 ++
 3 files changed, 342 insertions(+), 66 deletions(-)
diff --git a/tools/testing/selftests/bpf/progs/bpf_flow.c b/tools/testing/selftests/bpf/progs/bpf_flow.c
index 95a5a0778ed7..556e1cb1f781 100644
--- a/tools/testing/selftests/bpf/progs/bpf_flow.c
+++ b/tools/testing/selftests/bpf/progs/bpf_flow.c
@@ -13,6 +13,7 @@
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/if_packet.h>
+#include <linux/virtio_net.h>
 #include <sys/socket.h>
 #include <linux/if_tunnel.h>
 #include <linux/mpls.h>
@@ -71,15 +72,141 @@ struct {
 	__type(value, struct bpf_flow_keys);
 } last_dissection SEC(".maps");
 
-static __always_inline int export_flow_keys(struct bpf_flow_keys *keys,
+static __always_inline __u16 __virtio16_to_cpu(bool little_endian,
+					       __virtio16 val)
+{
+	if (little_endian)
+		return __le16_to_cpu((__le16)val);
+	else
+		return __be16_to_cpu((__be16)val);
+}
+
+/* Drops invalid virtio-net headers */
+static __always_inline int validate_vnet_hdr(const struct bpf_flow_keys *keys,
+					     __u32 skb_len)
+{
+	__u16 gso_type, hdr_len, gso_size, csum_start, csum_offset;
+	const struct virtio_net_hdr *vhdr = keys->vhdr;
+
+	if (!vhdr)
+		return BPF_OK;
+
+	gso_type = __virtio16_to_cpu(keys->vhdr_is_little_endian,
+				     vhdr->gso_type);
+	hdr_len = __virtio16_to_cpu(keys->vhdr_is_little_endian,
+				    vhdr->hdr_len);
+	gso_size = __virtio16_to_cpu(keys->vhdr_is_little_endian,
+				     vhdr->gso_size);
+	csum_start = __virtio16_to_cpu(keys->vhdr_is_little_endian,
+				       vhdr->csum_start);
+	csum_offset = __virtio16_to_cpu(keys->vhdr_is_little_endian,
+					vhdr->csum_offset);
+
+	/* Check gso */
+	if (gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+		if (!(vhdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM))
+			return BPF_DROP;
+
+		if (keys->is_encap)
+			return BPF_DROP;
+
+		switch (gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+		case VIRTIO_NET_HDR_GSO_TCPV4:
+			if (keys->addr_proto != ETH_P_IP ||
+			    keys->ip_proto != IPPROTO_TCP)
+				return BPF_DROP;
+
+			if (gso_size >= skb_len - keys->thoff -
+					sizeof(struct tcphdr))
+				return BPF_DROP;
+
+			break;
+		case VIRTIO_NET_HDR_GSO_TCPV6:
+			if (keys->addr_proto != ETH_P_IPV6 ||
+			    keys->ip_proto != IPPROTO_TCP)
+				return BPF_DROP;
+
+			if (gso_size >= skb_len - keys->thoff -
+					sizeof(struct tcphdr))
+				return BPF_DROP;
+
+			break;
+		case VIRTIO_NET_HDR_GSO_UDP:
+			if (keys->ip_proto != IPPROTO_UDP)
+				return BPF_DROP;
+
+			if (gso_size >= skb_len - keys->thoff -
+					sizeof(struct udphdr))
+				return BPF_DROP;
+
+			break;
+		default:
+			return BPF_DROP;
+		}
+	}
+
+	/* Check hdr_len */
+	if (hdr_len) {
+		switch (keys->ip_proto) {
+		case IPPROTO_TCP:
+			if (hdr_len != keys->thoff + sizeof(struct tcphdr))
+				return BPF_DROP;
+
+			break;
+		case IPPROTO_UDP:
+			if (hdr_len != keys->thoff + sizeof(struct udphdr))
+				return BPF_DROP;
+
+			break;
+		}
+	}
+
+	/* Check csum */
+	if (vhdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+		if (keys->addr_proto != ETH_P_IP &&
+		    keys->addr_proto != ETH_P_IPV6)
+			return BPF_DROP;
+
+		if (csum_start != keys->thoff)
+			return BPF_DROP;
+
+		switch (keys->ip_proto) {
+		case IPPROTO_TCP:
+			if (csum_offset != offsetof(struct tcphdr, check))
+				return BPF_DROP;
+
+			break;
+		case IPPROTO_UDP:
+			if (csum_offset != offsetof(struct udphdr, check))
+				return BPF_DROP;
+
+			break;
+		default:
+			return BPF_DROP;
+		}
+	}
+
+	return BPF_OK;
+}
+
+/* Common steps to perform regardless of where protocol parsing finishes:
+ * 1. store flow keys in map
+ * 2. if parse result is BPF_OK, parse the vnet hdr if present
+ * 3. return the parse result
+ */
+static __always_inline int parse_epilogue(struct __sk_buff *skb,
 					    int ret)
 {
+	const struct bpf_flow_keys *keys = skb->flow_keys;
 	__u32 key = (__u32)(keys->sport) << 16 | keys->dport;
 	struct bpf_flow_keys val;
 
 	memcpy(&val, keys, sizeof(val));
 	bpf_map_update_elem(&last_dissection, &key, &val, BPF_ANY);
-	return ret;
+
+	if (ret != BPF_OK)
+		return ret;
+	return validate_vnet_hdr(keys, skb->len);
 }
 
 #define IPV6_FLOWLABEL_MASK		__bpf_constant_htonl(0x000FFFFF)
@@ -114,8 +241,6 @@ static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb,
 /* Dispatches on ETHERTYPE */
 static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto)
 {
-	struct bpf_flow_keys *keys = skb->flow_keys;
-
 	switch (proto) {
 	case bpf_htons(ETH_P_IP):
 		bpf_tail_call_static(skb, &jmp_table, IP);
@@ -131,12 +256,10 @@ static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto)
 	case bpf_htons(ETH_P_8021AD):
 		bpf_tail_call_static(skb, &jmp_table, VLAN);
 		break;
-	default:
-		/* Protocol not supported */
-		return export_flow_keys(keys, BPF_DROP);
 	}
 
-	return export_flow_keys(keys, BPF_DROP);
+	/* Protocol not supported */
+	return parse_epilogue(skb, BPF_DROP);
 }
 
 SEC("flow_dissector")
@@ -162,28 +285,28 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 	case IPPROTO_ICMP:
 		icmp = bpf_flow_dissect_get_header(skb, sizeof(*icmp), &_icmp);
 		if (!icmp)
-			return export_flow_keys(keys, BPF_DROP);
-		return export_flow_keys(keys, BPF_OK);
+			return parse_epilogue(skb, BPF_DROP);
+		return parse_epilogue(skb, BPF_OK);
 	case IPPROTO_IPIP:
 		keys->is_encap = true;
 		if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP)
-			return export_flow_keys(keys, BPF_OK);
+			return parse_epilogue(skb, BPF_OK);
 
 		return parse_eth_proto(skb, bpf_htons(ETH_P_IP));
 	case IPPROTO_IPV6:
 		keys->is_encap = true;
 		if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP)
-			return export_flow_keys(keys, BPF_OK);
+			return parse_epilogue(skb, BPF_OK);
 
 		return parse_eth_proto(skb, bpf_htons(ETH_P_IPV6));
 	case IPPROTO_GRE:
 		gre = bpf_flow_dissect_get_header(skb, sizeof(*gre), &_gre);
 		if (!gre)
-			return export_flow_keys(keys, BPF_DROP);
+			return parse_epilogue(skb, BPF_DROP);
 
 		if (bpf_htons(gre->flags & GRE_VERSION))
 			/* Only inspect standard GRE packets with version 0 */
-			return export_flow_keys(keys, BPF_OK);
+			return parse_epilogue(skb, BPF_OK);
 
 		keys->thoff += sizeof(*gre); /* Step over GRE Flags and Proto */
 		if (GRE_IS_CSUM(gre->flags))
@@ -195,13 +318,13 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 
 		keys->is_encap = true;
 		if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP)
-			return export_flow_keys(keys, BPF_OK);
+			return parse_epilogue(skb, BPF_OK);
 
 		if (gre->proto == bpf_htons(ETH_P_TEB)) {
 			eth = bpf_flow_dissect_get_header(skb, sizeof(*eth),
 							  &_eth);
 			if (!eth)
-				return export_flow_keys(keys, BPF_DROP);
+				return parse_epilogue(skb, BPF_DROP);
 
 			keys->thoff += sizeof(*eth);
 
@@ -212,37 +335,35 @@ static __always_inline int parse_ip_proto(struct __sk_buff *skb, __u8 proto)
 	case IPPROTO_TCP:
 		tcp = bpf_flow_dissect_get_header(skb, sizeof(*tcp), &_tcp);
 		if (!tcp)
-			return export_flow_keys(keys, BPF_DROP);
+			return parse_epilogue(skb, BPF_DROP);
 
 		if (tcp->doff < 5)
-			return export_flow_keys(keys, BPF_DROP);
+			return parse_epilogue(skb, BPF_DROP);
 
 		if ((__u8 *)tcp + (tcp->doff << 2) > data_end)
-			return export_flow_keys(keys, BPF_DROP);
+			return parse_epilogue(skb, BPF_DROP);
 
 		keys->sport = tcp->source;
 		keys->dport = tcp->dest;
-		return export_flow_keys(keys, BPF_OK);
+		return parse_epilogue(skb, BPF_OK);
 	case IPPROTO_UDP:
 	case IPPROTO_UDPLITE:
 		udp = bpf_flow_dissect_get_header(skb, sizeof(*udp), &_udp);
 		if (!udp)
-			return export_flow_keys(keys, BPF_DROP);
+			return parse_epilogue(skb, BPF_DROP);
 
 		keys->sport = udp->source;
 		keys->dport = udp->dest;
-		return export_flow_keys(keys, BPF_OK);
+		return parse_epilogue(skb, BPF_OK);
 	default:
-		return export_flow_keys(keys, BPF_DROP);
+		return parse_epilogue(skb, BPF_DROP);
 	}
 
-	return export_flow_keys(keys, BPF_DROP);
+	return parse_epilogue(skb, BPF_DROP);
 }
 
 static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
 {
-	struct bpf_flow_keys *keys = skb->flow_keys;
-
 	switch (nexthdr) {
 	case IPPROTO_HOPOPTS:
 	case IPPROTO_DSTOPTS:
@@ -255,7 +376,7 @@ static __always_inline int parse_ipv6_proto(struct __sk_buff *skb, __u8 nexthdr)
 		return parse_ip_proto(skb, nexthdr);
 	}
 
-	return export_flow_keys(keys, BPF_DROP);
+	return parse_epilogue(skb, BPF_DROP);
 }
 
 PROG(IP)(struct __sk_buff *skb)
@@ -268,11 +389,11 @@ PROG(IP)(struct __sk_buff *skb)
 
 	iph = bpf_flow_dissect_get_header(skb, sizeof(*iph), &_iph);
 	if (!iph)
-		return export_flow_keys(keys, BPF_DROP);
+		return parse_epilogue(skb, BPF_DROP);
 
 	/* IP header cannot be smaller than 20 bytes */
 	if (iph->ihl < 5)
-		return export_flow_keys(keys, BPF_DROP);
+		return parse_epilogue(skb, BPF_DROP);
 
 	keys->addr_proto = ETH_P_IP;
 	keys->ipv4_src = iph->saddr;
@@ -281,7 +402,7 @@ PROG(IP)(struct __sk_buff *skb)
 
 	keys->thoff += iph->ihl << 2;
 	if (data + keys->thoff > data_end)
-		return export_flow_keys(keys, BPF_DROP);
+		return parse_epilogue(skb, BPF_DROP);
 
 	if (iph->frag_off & bpf_htons(IP_MF | IP_OFFSET)) {
 		keys->is_frag = true;
@@ -302,7 +423,7 @@ PROG(IP)(struct __sk_buff *skb)
 	}
 
 	if (done)
-		return export_flow_keys(keys, BPF_OK);
+		return parse_epilogue(skb, BPF_OK);
 
 	return parse_ip_proto(skb, iph->protocol);
 }
@@ -314,7 +435,7 @@ PROG(IPV6)(struct __sk_buff *skb)
 
 	ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h);
 	if (!ip6h)
-		return export_flow_keys(keys, BPF_DROP);
+		return parse_epilogue(skb, BPF_DROP);
 
 	keys->addr_proto = ETH_P_IPV6;
 	memcpy(&keys->ipv6_src, &ip6h->saddr, 2*sizeof(ip6h->saddr));
@@ -324,7 +445,7 @@ PROG(IPV6)(struct __sk_buff *skb)
 	keys->flow_label = ip6_flowlabel(ip6h);
 
 	if (keys->flags & BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)
-		return export_flow_keys(keys, BPF_OK);
+		return parse_epilogue(skb, BPF_OK);
 
 	return parse_ipv6_proto(skb, ip6h->nexthdr);
 }
@@ -336,7 +457,7 @@ PROG(IPV6OP)(struct __sk_buff *skb)
 
 	ip6h = bpf_flow_dissect_get_header(skb, sizeof(*ip6h), &_ip6h);
 	if (!ip6h)
-		return export_flow_keys(keys, BPF_DROP);
+		return parse_epilogue(skb, BPF_DROP);
 
 	/* hlen is in 8-octets and does not include the first 8 bytes
 	 * of the header
@@ -354,7 +475,7 @@ PROG(IPV6FR)(struct __sk_buff *skb)
 
 	fragh = bpf_flow_dissect_get_header(skb, sizeof(*fragh), &_fragh);
 	if (!fragh)
-		return export_flow_keys(keys, BPF_DROP);
+		return parse_epilogue(skb, BPF_DROP);
 
 	keys->thoff += sizeof(*fragh);
 	keys->is_frag = true;
@@ -367,9 +488,9 @@ PROG(IPV6FR)(struct __sk_buff *skb)
 		 * explicitly asked for.
 		 */
 		if (!(keys->flags & BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG))
-			return export_flow_keys(keys, BPF_OK);
+			return parse_epilogue(skb, BPF_OK);
 	} else {
-		return export_flow_keys(keys, BPF_OK);
+		return parse_epilogue(skb, BPF_OK);
 	}
 
 	return parse_ipv6_proto(skb, fragh->nexthdr);
@@ -377,14 +498,13 @@ PROG(IPV6FR)(struct __sk_buff *skb)
 
 PROG(MPLS)(struct __sk_buff *skb)
 {
-	struct bpf_flow_keys *keys = skb->flow_keys;
 	struct mpls_label *mpls, _mpls;
 
 	mpls = bpf_flow_dissect_get_header(skb, sizeof(*mpls), &_mpls);
 	if (!mpls)
-		return export_flow_keys(keys, BPF_DROP);
+		return parse_epilogue(skb, BPF_DROP);
 
-	return export_flow_keys(keys, BPF_OK);
+	return parse_epilogue(skb, BPF_OK);
 }
 
 PROG(VLAN)(struct __sk_buff *skb)
@@ -396,10 +516,10 @@ PROG(VLAN)(struct __sk_buff *skb)
 	if (keys->n_proto == bpf_htons(ETH_P_8021AD)) {
 		vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan);
 		if (!vlan)
-			return export_flow_keys(keys, BPF_DROP);
+			return parse_epilogue(skb, BPF_DROP);
 
 		if (vlan->h_vlan_encapsulated_proto != bpf_htons(ETH_P_8021Q))
-			return export_flow_keys(keys, BPF_DROP);
+			return parse_epilogue(skb, BPF_DROP);
 
 		keys->nhoff += sizeof(*vlan);
 		keys->thoff += sizeof(*vlan);
@@ -407,14 +527,14 @@ PROG(VLAN)(struct __sk_buff *skb)
 
 	vlan = bpf_flow_dissect_get_header(skb, sizeof(*vlan), &_vlan);
 	if (!vlan)
-		return export_flow_keys(keys, BPF_DROP);
+		return parse_epilogue(skb, BPF_DROP);
 
 	keys->nhoff += sizeof(*vlan);
 	keys->thoff += sizeof(*vlan);
 	/* Only allow 8021AD + 8021Q double tagging and no triple tagging.*/
 	if (vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021AD) ||
 	    vlan->h_vlan_encapsulated_proto == bpf_htons(ETH_P_8021Q))
-		return export_flow_keys(keys, BPF_DROP);
+		return parse_epilogue(skb, BPF_DROP);
 
 	keys->n_proto = vlan->h_vlan_encapsulated_proto;
 	return parse_eth_proto(skb, vlan->h_vlan_encapsulated_proto);
diff --git a/tools/testing/selftests/bpf/test_flow_dissector.c b/tools/testing/selftests/bpf/test_flow_dissector.c
index 571cc076dd7d..aa80055a5518 100644
--- a/tools/testing/selftests/bpf/test_flow_dissector.c
+++ b/tools/testing/selftests/bpf/test_flow_dissector.c
@@ -17,6 +17,8 @@
 #include <linux/if_packet.h>
 #include <linux/if_ether.h>
 #include <linux/ipv6.h>
+#include <linux/virtio_net.h>
+#include <net/if.h>
 #include <netinet/ip.h>
 #include <netinet/in.h>
 #include <netinet/udp.h>
@@ -65,7 +67,8 @@ struct guehdr {
 static uint8_t	cfg_dsfield_inner;
 static uint8_t	cfg_dsfield_outer;
 static uint8_t	cfg_encap_proto;
-static bool	cfg_expect_failure = false;
+static bool	cfg_expect_norx;
+static bool	cfg_expect_snd_failure;
 static int	cfg_l3_extra = AF_UNSPEC;	/* optional SIT prefix */
 static int	cfg_l3_inner = AF_UNSPEC;
 static int	cfg_l3_outer = AF_UNSPEC;
@@ -77,8 +80,14 @@ static int	cfg_port_gue = 6080;
 static bool	cfg_only_rx;
 static bool	cfg_only_tx;
 static int	cfg_src_port = 9;
+static bool	cfg_tx_pf_packet;
+static bool	cfg_use_vnet;
+static bool	cfg_vnet_use_hdr_len_bad;
+static bool	cfg_vnet_use_gso;
+static bool	cfg_vnet_use_csum_off;
+static bool	cfg_partial_udp_hdr;
 
-static char	buf[ETH_DATA_LEN];
+static char	buf[ETH_MAX_MTU];
 
 #define INIT_ADDR4(name, addr4, port)				\
 	static struct sockaddr_in name = {			\
@@ -273,8 +282,48 @@ static int l3_length(int family)
 		return sizeof(struct ipv6hdr);
 }
 
+static int build_vnet_header(void *header, int il3_len)
+{
+	struct virtio_net_hdr *vh = header;
+
+	vh->hdr_len = ETH_HLEN + il3_len + sizeof(struct udphdr);
+
+	if (cfg_partial_udp_hdr) {
+		vh->hdr_len -= (sizeof(struct udphdr) >> 1);
+		return sizeof(*vh);
+	}
+
+	/* Alteration must increase hdr_len; if not, kernel overwrites it */
+	if (cfg_vnet_use_hdr_len_bad)
+		vh->hdr_len++;
+
+	if (cfg_vnet_use_csum_off) {
+		vh->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
+		vh->csum_start = ETH_HLEN + il3_len;
+		vh->csum_offset = __builtin_offsetof(struct udphdr, check);
+	}
+
+	if (cfg_vnet_use_gso) {
+		vh->gso_type = VIRTIO_NET_HDR_GSO_UDP;
+		vh->gso_size = ETH_DATA_LEN - il3_len;
+	}
+
+	return sizeof(*vh);
+}
+
+static int build_eth_header(void *header)
+{
+	struct ethhdr *eth = header;
+	uint16_t proto = cfg_l3_inner == PF_INET ? ETH_P_IP : ETH_P_IPV6;
+
+	eth->h_proto = htons(proto);
+
+	return ETH_HLEN;
+}
+
 static int build_packet(void)
 {
+	int l2_len = 0;
 	int ol3_len = 0, ol4_len = 0, il3_len = 0, il4_len = 0;
 	int el3_len = 0;
 
@@ -294,23 +343,29 @@ static int build_packet(void)
 	il3_len = l3_length(cfg_l3_inner);
 	il4_len = sizeof(struct udphdr);
 
-	if (el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len >=
-	    sizeof(buf))
+	if (cfg_use_vnet)
+		l2_len += build_vnet_header(buf, il3_len);
+	if (cfg_tx_pf_packet)
+		l2_len += build_eth_header(buf + l2_len);
+
+	if (l2_len + el3_len + ol3_len + ol4_len + il3_len + il4_len +
+	    cfg_payload_len >= sizeof(buf))
 		error(1, 0, "packet too large\n");
 
 	/*
 	 * Fill packet from inside out, to calculate correct checksums.
 	 * But create ip before udp headers, as udp uses ip for pseudo-sum.
 	 */
-	memset(buf + el3_len + ol3_len + ol4_len + il3_len + il4_len,
+	memset(buf + l2_len + el3_len + ol3_len + ol4_len + il3_len + il4_len,
 	       cfg_payload_char, cfg_payload_len);
 
 	/* add zero byte for udp csum padding */
-	buf[el3_len + ol3_len + ol4_len + il3_len + il4_len + cfg_payload_len] = 0;
+	buf[l2_len + el3_len + ol3_len + ol4_len + il3_len + il4_len +
+	    cfg_payload_len] = 0;
 
 	switch (cfg_l3_inner) {
 	case PF_INET:
-		build_ipv4_header(buf + el3_len + ol3_len + ol4_len,
+		build_ipv4_header(buf + l2_len + el3_len + ol3_len + ol4_len,
 				  IPPROTO_UDP,
 				  in_saddr4.sin_addr.s_addr,
 				  in_daddr4.sin_addr.s_addr,
@@ -318,7 +373,7 @@ static int build_packet(void)
 				  cfg_dsfield_inner);
 		break;
 	case PF_INET6:
-		build_ipv6_header(buf + el3_len + ol3_len + ol4_len,
+		build_ipv6_header(buf + l2_len + el3_len + ol3_len + ol4_len,
 				  IPPROTO_UDP,
 				  &in_saddr6, &in_daddr6,
 				  il4_len + cfg_payload_len,
@@ -326,22 +381,25 @@ static int build_packet(void)
 		break;
 	}
 
-	build_udp_header(buf + el3_len + ol3_len + ol4_len + il3_len,
+	build_udp_header(buf + l2_len + el3_len + ol3_len + ol4_len + il3_len,
 			 cfg_payload_len, CFG_PORT_INNER, cfg_l3_inner);
 
+	if (cfg_partial_udp_hdr)
+		return l2_len + il3_len + (il4_len >> 1);
+
 	if (!cfg_encap_proto)
-		return il3_len + il4_len + cfg_payload_len;
+		return l2_len + il3_len + il4_len + cfg_payload_len;
 
 	switch (cfg_l3_outer) {
 	case PF_INET:
-		build_ipv4_header(buf + el3_len, cfg_encap_proto,
+		build_ipv4_header(buf + l2_len + el3_len, cfg_encap_proto,
 				  out_saddr4.sin_addr.s_addr,
 				  out_daddr4.sin_addr.s_addr,
 				  ol4_len + il3_len + il4_len + cfg_payload_len,
 				  cfg_dsfield_outer);
 		break;
 	case PF_INET6:
-		build_ipv6_header(buf + el3_len, cfg_encap_proto,
+		build_ipv6_header(buf + l2_len + el3_len, cfg_encap_proto,
 				  &out_saddr6, &out_daddr6,
 				  ol4_len + il3_len + il4_len + cfg_payload_len,
 				  cfg_dsfield_outer);
@@ -350,17 +408,17 @@ static int build_packet(void)
 
 	switch (cfg_encap_proto) {
 	case IPPROTO_UDP:
-		build_gue_header(buf + el3_len + ol3_len + ol4_len -
+		build_gue_header(buf + l2_len + el3_len + ol3_len + ol4_len -
 				 sizeof(struct guehdr),
 				 cfg_l3_inner == PF_INET ? IPPROTO_IPIP
 							 : IPPROTO_IPV6);
-		build_udp_header(buf + el3_len + ol3_len,
+		build_udp_header(buf + l2_len + el3_len + ol3_len,
 				 sizeof(struct guehdr) + il3_len + il4_len +
 				 cfg_payload_len,
 				 cfg_port_gue, cfg_l3_outer);
 		break;
 	case IPPROTO_GRE:
-		build_gre_header(buf + el3_len + ol3_len,
+		build_gre_header(buf + l2_len + el3_len + ol3_len,
 				 cfg_l3_inner == PF_INET ? ETH_P_IP
 							 : ETH_P_IPV6);
 		break;
@@ -368,7 +426,7 @@ static int build_packet(void)
 
 	switch (cfg_l3_extra) {
 	case PF_INET:
-		build_ipv4_header(buf,
+		build_ipv4_header(buf + l2_len,
 				  cfg_l3_outer == PF_INET ? IPPROTO_IPIP
 							  : IPPROTO_IPV6,
 				  extra_saddr4.sin_addr.s_addr,
@@ -377,7 +435,7 @@ static int build_packet(void)
 				  cfg_payload_len, 0);
 		break;
 	case PF_INET6:
-		build_ipv6_header(buf,
+		build_ipv6_header(buf + l2_len,
 				  cfg_l3_outer == PF_INET ? IPPROTO_IPIP
 							  : IPPROTO_IPV6,
 				  &extra_saddr6, &extra_daddr6,
@@ -386,15 +444,46 @@ static int build_packet(void)
 		break;
 	}
 
-	return el3_len + ol3_len + ol4_len + il3_len + il4_len +
+	return l2_len + el3_len + ol3_len + ol4_len + il3_len + il4_len +
 	       cfg_payload_len;
 }
 
+static int setup_tx_pfpacket(void)
+{
+	struct sockaddr_ll laddr = {0};
+	const int one = 1;
+	uint16_t proto;
+	int fd;
+
+	fd = socket(PF_PACKET, SOCK_RAW, 0);
+	if (fd == -1)
+		error(1, errno, "socket tx");
+
+	if (cfg_use_vnet &&
+	    setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, &one, sizeof(one)))
+		error(1, errno, "setsockopt vnet");
+
+	proto = cfg_l3_inner == PF_INET ? ETH_P_IP : ETH_P_IPV6;
+	laddr.sll_family = AF_PACKET;
+	laddr.sll_protocol = htons(proto);
+	laddr.sll_ifindex = if_nametoindex("lo");
+	if (!laddr.sll_ifindex)
+		error(1, errno, "if_nametoindex");
+
+	if (bind(fd, (void *)&laddr, sizeof(laddr)))
+		error(1, errno, "bind");
+
+	return fd;
+}
+
 /* sender transmits encapsulated over RAW or unencap'd over UDP */
 static int setup_tx(void)
 {
 	int family, fd, ret;
 
+	if (cfg_tx_pf_packet)
+		return setup_tx_pfpacket();
+
 	if (cfg_l3_extra)
 		family = cfg_l3_extra;
 	else if (cfg_l3_outer)
@@ -464,6 +553,13 @@ static int do_tx(int fd, const char *pkt, int len)
 	int ret;
 
 	ret = write(fd, pkt, len);
+
+	if (cfg_expect_snd_failure) {
+		if (ret == -1)
+			return 0;
+		error(1, 0, "expected tx to fail but it did not");
+	}
+
 	if (ret == -1)
 		error(1, errno, "send");
 	if (ret != len)
@@ -571,7 +667,7 @@ static int do_main(void)
 	 * success (== 0) only if received all packets
 	 * unless failure is expected, in which case none must arrive.
 	 */
-	if (cfg_expect_failure)
+	if (cfg_expect_norx || cfg_expect_snd_failure)
 		return rx != 0;
 	else
 		return rx != tx;
@@ -623,8 +719,12 @@ static void parse_opts(int argc, char **argv)
 {
 	int c;
 
-	while ((c = getopt(argc, argv, "d:D:e:f:Fhi:l:n:o:O:Rs:S:t:Tx:X:")) != -1) {
+	while ((c = getopt(argc, argv,
+			   "cd:D:e:Ef:FGghi:l:Ln:o:O:pRs:S:t:TUvx:X:")) != -1) {
 		switch (c) {
+		case 'c':
+			cfg_vnet_use_csum_off = true;
+			break;
 		case 'd':
 			if (cfg_l3_outer == AF_UNSPEC)
 				error(1, 0, "-d must be preceded by -o");
@@ -653,11 +753,17 @@ static void parse_opts(int argc, char **argv)
 			else
 				usage(argv[0]);
 			break;
+		case 'E':
+			cfg_expect_snd_failure = true;
+			break;
 		case 'f':
 			cfg_src_port = strtol(optarg, NULL, 0);
 			break;
 		case 'F':
-			cfg_expect_failure = true;
+			cfg_expect_norx = true;
+			break;
+		case 'g':
+			cfg_vnet_use_gso = true;
 			break;
 		case 'h':
 			usage(argv[0]);
@@ -673,6 +779,9 @@ static void parse_opts(int argc, char **argv)
 		case 'l':
 			cfg_payload_len = strtol(optarg, NULL, 0);
 			break;
+		case 'L':
+			cfg_vnet_use_hdr_len_bad = true;
+			break;
 		case 'n':
 			cfg_num_pkt = strtol(optarg, NULL, 0);
 			break;
@@ -682,6 +791,9 @@ static void parse_opts(int argc, char **argv)
 		case 'O':
 			cfg_l3_extra = parse_protocol_family(argv[0], optarg);
 			break;
+		case 'p':
+			cfg_tx_pf_packet = true;
+			break;
 		case 'R':
 			cfg_only_rx = true;
 			break;
@@ -703,6 +815,12 @@ static void parse_opts(int argc, char **argv)
 		case 'T':
 			cfg_only_tx = true;
 			break;
+		case 'U':
+			cfg_partial_udp_hdr = true;
+			break;
+		case 'v':
+			cfg_use_vnet = true;
+			break;
 		case 'x':
 			cfg_dsfield_outer = strtol(optarg, NULL, 0);
 			break;
@@ -733,7 +851,26 @@ static void parse_opts(int argc, char **argv)
 	 */
 	if (((cfg_dsfield_outer & 0x3) == 0x3) &&
 	    ((cfg_dsfield_inner & 0x3) == 0x0))
-		cfg_expect_failure = true;
+		cfg_expect_norx = true;
+
+	/* Don't wait around for packets that we expect to fail to send */
+	if (cfg_expect_snd_failure && !cfg_num_secs)
+		cfg_num_secs = 3;
+
+	if (cfg_partial_udp_hdr && cfg_encap_proto)
+		error(1, 0,
+		      "ops: can't specify partial UDP hdr (-U) and encap (-e)");
+
+	if (cfg_use_vnet && cfg_encap_proto)
+		error(1, 0, "options: cannot specify encap (-e) with vnet (-v)");
+	if (cfg_use_vnet && !cfg_tx_pf_packet)
+		error(1, 0, "options: vnet (-v) requires psock for tx (-p)");
+	if (cfg_vnet_use_gso && !cfg_use_vnet)
+		error(1, 0, "options: gso (-g) requires vnet (-v)");
+	if (cfg_vnet_use_csum_off && !cfg_use_vnet)
+		error(1, 0, "options: vnet csum (-c) requires vnet (-v)");
+	if (cfg_vnet_use_hdr_len_bad && !cfg_use_vnet)
+		error(1, 0, "options: bad vnet hdrlen (-L) requires vnet (-v)");
 }
 
 static void print_opts(void)
diff --git a/tools/testing/selftests/bpf/test_flow_dissector.sh b/tools/testing/selftests/bpf/test_flow_dissector.sh
index 174b72a64a4c..5852cf815eeb 100755
--- a/tools/testing/selftests/bpf/test_flow_dissector.sh
+++ b/tools/testing/selftests/bpf/test_flow_dissector.sh
@@ -51,6 +51,9 @@ if [[ -z $(ip netns identify $$) ]]; then
 		echo "Skipping root flow dissector test, bpftool not found" >&2
 	fi
 
+	orig_flow_dissect_sysctl=$(</proc/sys/net/core/flow_dissect_vnet_hdr)
+	sysctl net.core.flow_dissect_vnet_hdr=1
+
 	# Run the rest of the tests in a net namespace.
 	../net/in_netns.sh "$0" "$@"
 	err=$(( $err + $? ))
@@ -61,6 +64,7 @@ if [[ -z $(ip netns identify $$) ]]; then
 		echo "selftests: $TESTNAME [FAILED]";
 	fi
 
+	sysctl net.core.flow_dissect_vnet_hdr=$orig_flow_dissect_sysctl
 	exit $err
 fi
 
@@ -165,4 +169,19 @@ tc filter add dev lo parent ffff: protocol ipv6 pref 1337 flower ip_proto \
 # Send 10 IPv6/UDP packets from port 10. Filter should not drop any.
 ./test_flow_dissector -i 6 -f 10
 
+
+echo "Testing virtio-net header validation..."
+echo "Testing valid vnet headers. Should *not* be dropped."
+./with_addr.sh ./test_flow_dissector -i 4 -D 192.168.0.1 -S 1.1.1.1 -p -v
+echo "Testing partial transport header. Should be dropped."
+./with_addr.sh ./test_flow_dissector -i 4 -D 192.168.0.1 -S 1.1.1.1 -p -v -U -E
+echo "Testing valid vnet gso spec. Should *not* be dropped."
+./with_addr.sh ./test_flow_dissector -i 4 -D 192.168.0.1 -S 1.1.1.1 -p -v -g -c -l 8000
+echo "Testing invalid vnet gso size. Should be dropped."
+./with_addr.sh ./test_flow_dissector -i 4 -D 192.168.0.1 -S 1.1.1.1 -p -v -g -c -l 100 -E
+echo "Testing invalid vnet header len. Should be dropped."
+./with_addr.sh ./test_flow_dissector -i 4 -D 192.168.0.1 -S 1.1.1.1 -p -v -L -E
+echo "Testing vnet gso without csum. Should be dropped."
+./with_addr.sh ./test_flow_dissector -i 4 -D 192.168.0.1 -S 1.1.1.1 -p -v -g -l 8000 -E
+
 exit 0
-- 
2.32.0.rc1.229.g3e70b5a671-goog
^ permalink raw reply related	[flat|nested] 27+ messages in thread