Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH bpf-next v2 3/8] bpf, sockmap: convert to generic sk_msg interface
From: Daniel Borkmann @ 2018-10-13  0:45 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: john.fastabend, davejwatson, netdev, Daniel Borkmann
In-Reply-To: <20181013004603.3747-1-daniel@iogearbox.net>

Add a generic sk_msg layer, and convert current sockmap and later
kTLS over to make use of it. While sk_buff handles network packet
representation from netdevice up to socket, sk_msg handles data
representation from application to socket layer.

This means that the sk_msg framework spans across ULP users in the
kernel, and enables features such as introspection or filtering
of data with the help of BPF programs that operate on this data
structure.

The latter becomes particularly useful for kTLS, where data encryption
is deferred into the kernel, thereby enabling the kernel to perform
L7 introspection and policy based on BPF for TLS connections, where
the record is encrypted after BPF has run and come to a verdict. In
order to get there, the first step is to transform the open-coded
scatter-gather list handling into a common core framework that
subsystems can use.

The code itself has been split and refactored into three bigger
pieces: i) the generic sk_msg API which deals with managing the
scatter gather ring, providing helpers for walking and mangling,
transferring application data from user space into it, and preparing
it for BPF pre/post-processing, ii) the plain sock map itself
where sockets can be attached to or detached from; these bits
are independent of i) which can now be used also without sock
map, and iii) the integration with plain TCP as one protocol
to be used for processing L7 application data (later this could
e.g. also be extended to other protocols like UDP). The semantics
are the same as with the old sock map code, and therefore there is
no change in user-facing behavior or APIs. Pursuing this work also
helped to find a number of bugs in the old sockmap code that we've
already fixed in earlier commits. The test_sockmap kselftest suite
passes fine as well.

Joint work with John.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/linux/bpf.h       |   33 +-
 include/linux/bpf_types.h |    2 +-
 include/linux/filter.h    |   21 -
 include/linux/skmsg.h     |  371 +++++++
 include/net/tcp.h         |   27 +
 kernel/bpf/Makefile       |    5 -
 kernel/bpf/core.c         |    2 -
 kernel/bpf/sockmap.c      | 2610 ---------------------------------------------
 kernel/bpf/syscall.c      |    6 +-
 net/Kconfig               |   11 +
 net/core/Makefile         |    2 +
 net/core/filter.c         |  270 ++---
 net/core/skmsg.c          |  763 +++++++++++++
 net/core/sock_map.c       | 1002 +++++++++++++++++
 net/ipv4/Makefile         |    1 +
 net/ipv4/tcp_bpf.c        |  655 ++++++++++++
 net/strparser/Kconfig     |    4 +-
 17 files changed, 2925 insertions(+), 2860 deletions(-)
 create mode 100644 include/linux/skmsg.h
 delete mode 100644 kernel/bpf/sockmap.c
 create mode 100644 net/core/skmsg.c
 create mode 100644 net/core/sock_map.c
 create mode 100644 net/ipv4/tcp_bpf.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 9b55871..e60fff4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -737,33 +737,18 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map)
 }
 #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
 
-#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET)
-struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key);
-struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key);
-int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type);
-int sockmap_get_from_fd(const union bpf_attr *attr, int type,
-			struct bpf_prog *prog);
+#if defined(CONFIG_BPF_STREAM_PARSER)
+int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, u32 which);
+int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
 #else
-static inline struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
-{
-	return NULL;
-}
-
-static inline struct sock  *__sock_hash_lookup_elem(struct bpf_map *map,
-						    void *key)
-{
-	return NULL;
-}
-
-static inline int sock_map_prog(struct bpf_map *map,
-				struct bpf_prog *prog,
-				u32 type)
+static inline int sock_map_prog_update(struct bpf_map *map,
+				       struct bpf_prog *prog, u32 which)
 {
 	return -EOPNOTSUPP;
 }
 
-static inline int sockmap_get_from_fd(const union bpf_attr *attr, int type,
-				      struct bpf_prog *prog)
+static inline int sock_map_get_from_fd(const union bpf_attr *attr,
+				       struct bpf_prog *prog)
 {
 	return -EINVAL;
 }
@@ -839,6 +824,10 @@ extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
+extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
+extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
+extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
+extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
 
 extern const struct bpf_func_proto bpf_get_local_storage_proto;
 
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5432f4c..fa48343 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -57,7 +57,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
-#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET)
+#if defined(CONFIG_BPF_STREAM_PARSER)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops)
 #endif
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 6791a0a..5771874 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -520,24 +520,6 @@ struct bpf_skb_data_end {
 	void *data_end;
 };
 
-struct sk_msg_buff {
-	void *data;
-	void *data_end;
-	__u32 apply_bytes;
-	__u32 cork_bytes;
-	int sg_copybreak;
-	int sg_start;
-	int sg_curr;
-	int sg_end;
-	struct scatterlist sg_data[MAX_SKB_FRAGS];
-	bool sg_copy[MAX_SKB_FRAGS];
-	__u32 flags;
-	struct sock *sk_redir;
-	struct sock *sk;
-	struct sk_buff *skb;
-	struct list_head list;
-};
-
 struct bpf_redirect_info {
 	u32 ifindex;
 	u32 flags;
@@ -833,9 +815,6 @@ void xdp_do_flush_map(void);
 
 void bpf_warn_invalid_xdp_action(u32 act);
 
-struct sock *do_sk_redirect_map(struct sk_buff *skb);
-struct sock *do_msg_redirect_map(struct sk_msg_buff *md);
-
 #ifdef CONFIG_INET
 struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 				  struct bpf_prog *prog, struct sk_buff *skb,
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
new file mode 100644
index 0000000..9567810
--- /dev/null
+++ b/include/linux/skmsg.h
@@ -0,0 +1,371 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#ifndef _LINUX_SKMSG_H
+#define _LINUX_SKMSG_H
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/scatterlist.h>
+#include <linux/skbuff.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/strparser.h>
+
+#define MAX_MSG_FRAGS			MAX_SKB_FRAGS
+
+enum __sk_action {
+	__SK_DROP = 0,
+	__SK_PASS,
+	__SK_REDIRECT,
+	__SK_NONE,
+};
+
+struct sk_msg_sg {
+	u32				start;
+	u32				curr;
+	u32				end;
+	u32				size;
+	u32				copybreak;
+	bool				copy[MAX_MSG_FRAGS];
+	struct scatterlist		data[MAX_MSG_FRAGS];
+};
+
+struct sk_msg {
+	struct sk_msg_sg		sg;
+	void				*data;
+	void				*data_end;
+	u32				apply_bytes;
+	u32				cork_bytes;
+	u32				flags;
+	struct sk_buff			*skb;
+	struct sock			*sk_redir;
+	struct sock			*sk;
+	struct list_head		list;
+};
+
+struct sk_psock_progs {
+	struct bpf_prog			*msg_parser;
+	struct bpf_prog			*skb_parser;
+	struct bpf_prog			*skb_verdict;
+};
+
+enum sk_psock_state_bits {
+	SK_PSOCK_TX_ENABLED,
+};
+
+struct sk_psock_link {
+	struct list_head		list;
+	struct bpf_map			*map;
+	void				*link_raw;
+};
+
+struct sk_psock_parser {
+	struct strparser		strp;
+	bool				enabled;
+	void (*saved_data_ready)(struct sock *sk);
+};
+
+struct sk_psock_work_state {
+	struct sk_buff			*skb;
+	u32				len;
+	u32				off;
+};
+
+struct sk_psock {
+	struct sock			*sk;
+	struct sock			*sk_redir;
+	u32				apply_bytes;
+	u32				cork_bytes;
+	u32				eval;
+	struct sk_msg			*cork;
+	struct sk_psock_progs		progs;
+	struct sk_psock_parser		parser;
+	struct sk_buff_head		ingress_skb;
+	struct list_head		ingress_msg;
+	unsigned long			state;
+	struct list_head		link;
+	spinlock_t			link_lock;
+	refcount_t			refcnt;
+	void (*saved_unhash)(struct sock *sk);
+	void (*saved_close)(struct sock *sk, long timeout);
+	void (*saved_write_space)(struct sock *sk);
+	struct proto			*sk_proto;
+	struct sk_psock_work_state	work_state;
+	struct work_struct		work;
+	union {
+		struct rcu_head		rcu;
+		struct work_struct	gc;
+	};
+};
+
+int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
+		 int elem_first_coalesce);
+void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len);
+int sk_msg_free(struct sock *sk, struct sk_msg *msg);
+int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg);
+void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes);
+void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
+				  u32 bytes);
+
+void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes);
+
+int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+			      struct sk_msg *msg, u32 bytes);
+int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+			     struct sk_msg *msg, u32 bytes);
+
+static inline void sk_msg_check_to_free(struct sk_msg *msg, u32 i, u32 bytes)
+{
+	WARN_ON(i == msg->sg.end && bytes);
+}
+
+static inline void sk_msg_apply_bytes(struct sk_psock *psock, u32 bytes)
+{
+	if (psock->apply_bytes) {
+		if (psock->apply_bytes < bytes)
+			psock->apply_bytes = 0;
+		else
+			psock->apply_bytes -= bytes;
+	}
+}
+
+#define sk_msg_iter_var_prev(var)			\
+	do {						\
+		if (var == 0)				\
+			var = MAX_MSG_FRAGS - 1;	\
+		else					\
+			var--;				\
+	} while (0)
+
+#define sk_msg_iter_var_next(var)			\
+	do {						\
+		var++;					\
+		if (var == MAX_MSG_FRAGS)		\
+			var = 0;			\
+	} while (0)
+
+#define sk_msg_iter_prev(msg, which)			\
+	sk_msg_iter_var_prev(msg->sg.which)
+
+#define sk_msg_iter_next(msg, which)			\
+	sk_msg_iter_var_next(msg->sg.which)
+
+static inline void sk_msg_clear_meta(struct sk_msg *msg)
+{
+	memset(&msg->sg, 0, offsetofend(struct sk_msg_sg, copy));
+}
+
+static inline void sk_msg_init(struct sk_msg *msg)
+{
+	memset(msg, 0, sizeof(*msg));
+	sg_init_marker(msg->sg.data, ARRAY_SIZE(msg->sg.data));
+}
+
+static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src,
+			       int which, u32 size)
+{
+	dst->sg.data[which] = src->sg.data[which];
+	dst->sg.data[which].length  = size;
+	src->sg.data[which].length -= size;
+	src->sg.data[which].offset += size;
+}
+
+static inline u32 sk_msg_elem_used(const struct sk_msg *msg)
+{
+	return msg->sg.end >= msg->sg.start ?
+		msg->sg.end - msg->sg.start :
+		msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start);
+}
+
+static inline bool sk_msg_full(const struct sk_msg *msg)
+{
+	return (msg->sg.end == msg->sg.start) && msg->sg.size;
+}
+
+static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which)
+{
+	return &msg->sg.data[which];
+}
+
+static inline struct page *sk_msg_page(struct sk_msg *msg, int which)
+{
+	return sg_page(sk_msg_elem(msg, which));
+}
+
+static inline bool sk_msg_to_ingress(const struct sk_msg *msg)
+{
+	return msg->flags & BPF_F_INGRESS;
+}
+
+static inline void sk_msg_compute_data_pointers(struct sk_msg *msg)
+{
+	struct scatterlist *sge = sk_msg_elem(msg, msg->sg.start);
+
+	if (msg->sg.copy[msg->sg.start]) {
+		msg->data = NULL;
+		msg->data_end = NULL;
+	} else {
+		msg->data = sg_virt(sge);
+		msg->data_end = msg->data + sge->length;
+	}
+}
+
+static inline void sk_msg_page_add(struct sk_msg *msg, struct page *page,
+				   u32 len, u32 offset)
+{
+	struct scatterlist *sge;
+
+	get_page(page);
+	sge = sk_msg_elem(msg, msg->sg.end);
+	sg_set_page(sge, page, len, offset);
+	sg_unmark_end(sge);
+
+	msg->sg.copy[msg->sg.end] = true;
+	msg->sg.size += len;
+	sk_msg_iter_next(msg, end);
+}
+
+static inline struct sk_psock *sk_psock(const struct sock *sk)
+{
+	return rcu_dereference_sk_user_data(sk);
+}
+
+static inline bool sk_has_psock(struct sock *sk)
+{
+	return sk_psock(sk) != NULL && sk->sk_prot->recvmsg == tcp_bpf_recvmsg;
+}
+
+static inline void sk_psock_queue_msg(struct sk_psock *psock,
+				      struct sk_msg *msg)
+{
+	list_add_tail(&msg->list, &psock->ingress_msg);
+}
+
+static inline void sk_psock_report_error(struct sk_psock *psock, int err)
+{
+	struct sock *sk = psock->sk;
+
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+}
+
+struct sk_psock *sk_psock_init(struct sock *sk, int node);
+
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock);
+
+int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
+			 struct sk_msg *msg);
+
+static inline struct sk_psock_link *sk_psock_init_link(void)
+{
+	return kzalloc(sizeof(struct sk_psock_link),
+		       GFP_ATOMIC | __GFP_NOWARN);
+}
+
+static inline void sk_psock_free_link(struct sk_psock_link *link)
+{
+	kfree(link);
+}
+
+struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock);
+#if defined(CONFIG_BPF_STREAM_PARSER)
+void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link);
+#else
+static inline void sk_psock_unlink(struct sock *sk,
+				   struct sk_psock_link *link)
+{
+}
+#endif
+
+void __sk_psock_purge_ingress_msg(struct sk_psock *psock);
+
+static inline void sk_psock_cork_free(struct sk_psock *psock)
+{
+	if (psock->cork) {
+		sk_msg_free(psock->sk, psock->cork);
+		kfree(psock->cork);
+		psock->cork = NULL;
+	}
+}
+
+static inline void sk_psock_update_proto(struct sock *sk,
+					 struct sk_psock *psock,
+					 struct proto *ops)
+{
+	psock->saved_unhash = sk->sk_prot->unhash;
+	psock->saved_close = sk->sk_prot->close;
+	psock->saved_write_space = sk->sk_write_space;
+
+	psock->sk_proto = sk->sk_prot;
+	sk->sk_prot = ops;
+}
+
+static inline void sk_psock_restore_proto(struct sock *sk,
+					  struct sk_psock *psock)
+{
+	if (psock->sk_proto) {
+		sk->sk_prot = psock->sk_proto;
+		psock->sk_proto = NULL;
+	}
+}
+
+static inline void sk_psock_set_state(struct sk_psock *psock,
+				      enum sk_psock_state_bits bit)
+{
+	set_bit(bit, &psock->state);
+}
+
+static inline void sk_psock_clear_state(struct sk_psock *psock,
+					enum sk_psock_state_bits bit)
+{
+	clear_bit(bit, &psock->state);
+}
+
+static inline bool sk_psock_test_state(const struct sk_psock *psock,
+				       enum sk_psock_state_bits bit)
+{
+	return test_bit(bit, &psock->state);
+}
+
+static inline struct sk_psock *sk_psock_get(struct sock *sk)
+{
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (psock && !refcount_inc_not_zero(&psock->refcnt))
+		psock = NULL;
+	rcu_read_unlock();
+	return psock;
+}
+
+void sk_psock_stop(struct sock *sk, struct sk_psock *psock);
+void sk_psock_destroy(struct rcu_head *rcu);
+void sk_psock_drop(struct sock *sk, struct sk_psock *psock);
+
+static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
+{
+	if (refcount_dec_and_test(&psock->refcnt))
+		sk_psock_drop(sk, psock);
+}
+
+static inline void psock_set_prog(struct bpf_prog **pprog,
+				  struct bpf_prog *prog)
+{
+	prog = xchg(pprog, prog);
+	if (prog)
+		bpf_prog_put(prog);
+}
+
+static inline void psock_progs_drop(struct sk_psock_progs *progs)
+{
+	psock_set_prog(&progs->msg_parser, NULL);
+	psock_set_prog(&progs->skb_parser, NULL);
+	psock_set_prog(&progs->skb_verdict, NULL);
+}
+
+#endif /* _LINUX_SKMSG_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8f5cef6..3600ae0 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -858,6 +858,21 @@ static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
 	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
 }
 
+static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS;
+}
+
+static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb)
+{
+	return TCP_SKB_CB(skb)->bpf.sk_redir;
+}
+
+static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
+}
+
 #if IS_ENABLED(CONFIG_IPV6)
 /* This is the variant of inet6_iif() that must be used by TCP,
  * as TCP moves IP6CB into a different location in skb->cb[]
@@ -2064,6 +2079,18 @@ void tcp_cleanup_ulp(struct sock *sk);
 	__MODULE_INFO(alias, alias_userspace, name);		\
 	__MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name)
 
+struct sk_msg;
+struct sk_psock;
+
+int tcp_bpf_init(struct sock *sk);
+void tcp_bpf_reinit(struct sock *sk);
+int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes,
+			  int flags);
+int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+		    int nonblock, int flags, int *addr_len);
+int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
+		      struct msghdr *msg, int len);
+
 /* Call BPF_SOCK_OPS program that returns an int. If the return value
  * is < 0, then the BPF op failed (for example if the loaded BPF
  * program does not support the chosen operation or there is no BPF
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0488b82..ff82626 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -13,11 +13,6 @@ ifeq ($(CONFIG_XDP_SOCKETS),y)
 obj-$(CONFIG_BPF_SYSCALL) += xskmap.o
 endif
 obj-$(CONFIG_BPF_SYSCALL) += offload.o
-ifeq ($(CONFIG_STREAM_PARSER),y)
-ifeq ($(CONFIG_INET),y)
-obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
-endif
-endif
 endif
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 3f5bf1a..defcf4d 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1792,8 +1792,6 @@ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak;
 const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak;
 const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
-const struct bpf_func_proto bpf_sock_map_update_proto __weak;
-const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
 const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
 const struct bpf_func_proto bpf_get_local_storage_proto __weak;
 
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
deleted file mode 100644
index de6f7a6..0000000
--- a/kernel/bpf/sockmap.c
+++ /dev/null
@@ -1,2610 +0,0 @@
-/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-
-/* A BPF sock_map is used to store sock objects. This is primarly used
- * for doing socket redirect with BPF helper routines.
- *
- * A sock map may have BPF programs attached to it, currently a program
- * used to parse packets and a program to provide a verdict and redirect
- * decision on the packet are supported. Any programs attached to a sock
- * map are inherited by sock objects when they are added to the map. If
- * no BPF programs are attached the sock object may only be used for sock
- * redirect.
- *
- * A sock object may be in multiple maps, but can only inherit a single
- * parse or verdict program. If adding a sock object to a map would result
- * in having multiple parsing programs the update will return an EBUSY error.
- *
- * For reference this program is similar to devmap used in XDP context
- * reviewing these together may be useful. For an example please review
- * ./samples/bpf/sockmap/.
- */
-#include <linux/bpf.h>
-#include <net/sock.h>
-#include <linux/filter.h>
-#include <linux/errno.h>
-#include <linux/file.h>
-#include <linux/kernel.h>
-#include <linux/net.h>
-#include <linux/skbuff.h>
-#include <linux/workqueue.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <net/strparser.h>
-#include <net/tcp.h>
-#include <linux/ptr_ring.h>
-#include <net/inet_common.h>
-#include <linux/sched/signal.h>
-
-#define SOCK_CREATE_FLAG_MASK \
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
-
-struct bpf_sock_progs {
-	struct bpf_prog *bpf_tx_msg;
-	struct bpf_prog *bpf_parse;
-	struct bpf_prog *bpf_verdict;
-};
-
-struct bpf_stab {
-	struct bpf_map map;
-	struct sock **sock_map;
-	struct bpf_sock_progs progs;
-	raw_spinlock_t lock;
-};
-
-struct bucket {
-	struct hlist_head head;
-	raw_spinlock_t lock;
-};
-
-struct bpf_htab {
-	struct bpf_map map;
-	struct bucket *buckets;
-	atomic_t count;
-	u32 n_buckets;
-	u32 elem_size;
-	struct bpf_sock_progs progs;
-	struct rcu_head rcu;
-};
-
-struct htab_elem {
-	struct rcu_head rcu;
-	struct hlist_node hash_node;
-	u32 hash;
-	struct sock *sk;
-	char key[0];
-};
-
-enum smap_psock_state {
-	SMAP_TX_RUNNING,
-};
-
-struct smap_psock_map_entry {
-	struct list_head list;
-	struct bpf_map *map;
-	struct sock **entry;
-	struct htab_elem __rcu *hash_link;
-};
-
-struct smap_psock {
-	struct rcu_head	rcu;
-	refcount_t refcnt;
-
-	/* datapath variables */
-	struct sk_buff_head rxqueue;
-	bool strp_enabled;
-
-	/* datapath error path cache across tx work invocations */
-	int save_rem;
-	int save_off;
-	struct sk_buff *save_skb;
-
-	/* datapath variables for tx_msg ULP */
-	struct sock *sk_redir;
-	int apply_bytes;
-	int cork_bytes;
-	int sg_size;
-	int eval;
-	struct sk_msg_buff *cork;
-	struct list_head ingress;
-
-	struct strparser strp;
-	struct bpf_prog *bpf_tx_msg;
-	struct bpf_prog *bpf_parse;
-	struct bpf_prog *bpf_verdict;
-	struct list_head maps;
-	spinlock_t maps_lock;
-
-	/* Back reference used when sock callback trigger sockmap operations */
-	struct sock *sock;
-	unsigned long state;
-
-	struct work_struct tx_work;
-	struct work_struct gc_work;
-
-	struct proto *sk_proto;
-	void (*save_unhash)(struct sock *sk);
-	void (*save_close)(struct sock *sk, long timeout);
-	void (*save_data_ready)(struct sock *sk);
-	void (*save_write_space)(struct sock *sk);
-};
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
-			   int nonblock, int flags, int *addr_len);
-static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
-static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
-			    int offset, size_t size, int flags);
-static void bpf_tcp_unhash(struct sock *sk);
-static void bpf_tcp_close(struct sock *sk, long timeout);
-
-static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
-{
-	return rcu_dereference_sk_user_data(sk);
-}
-
-static bool bpf_tcp_stream_read(const struct sock *sk)
-{
-	struct smap_psock *psock;
-	bool empty = true;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock))
-		goto out;
-	empty = list_empty(&psock->ingress);
-out:
-	rcu_read_unlock();
-	return !empty;
-}
-
-enum {
-	SOCKMAP_IPV4,
-	SOCKMAP_IPV6,
-	SOCKMAP_NUM_PROTS,
-};
-
-enum {
-	SOCKMAP_BASE,
-	SOCKMAP_TX,
-	SOCKMAP_NUM_CONFIGS,
-};
-
-static struct proto *saved_tcpv6_prot __read_mostly;
-static DEFINE_SPINLOCK(tcpv6_prot_lock);
-static struct proto bpf_tcp_prots[SOCKMAP_NUM_PROTS][SOCKMAP_NUM_CONFIGS];
-
-static void build_protos(struct proto prot[SOCKMAP_NUM_CONFIGS],
-			 struct proto *base)
-{
-	prot[SOCKMAP_BASE]			= *base;
-	prot[SOCKMAP_BASE].unhash		= bpf_tcp_unhash;
-	prot[SOCKMAP_BASE].close		= bpf_tcp_close;
-	prot[SOCKMAP_BASE].recvmsg		= bpf_tcp_recvmsg;
-	prot[SOCKMAP_BASE].stream_memory_read	= bpf_tcp_stream_read;
-
-	prot[SOCKMAP_TX]			= prot[SOCKMAP_BASE];
-	prot[SOCKMAP_TX].sendmsg		= bpf_tcp_sendmsg;
-	prot[SOCKMAP_TX].sendpage		= bpf_tcp_sendpage;
-}
-
-static void update_sk_prot(struct sock *sk, struct smap_psock *psock)
-{
-	int family = sk->sk_family == AF_INET6 ? SOCKMAP_IPV6 : SOCKMAP_IPV4;
-	int conf = psock->bpf_tx_msg ? SOCKMAP_TX : SOCKMAP_BASE;
-
-	sk->sk_prot = &bpf_tcp_prots[family][conf];
-}
-
-static int bpf_tcp_init(struct sock *sk)
-{
-	struct smap_psock *psock;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock)) {
-		rcu_read_unlock();
-		return -EINVAL;
-	}
-
-	if (unlikely(psock->sk_proto)) {
-		rcu_read_unlock();
-		return -EBUSY;
-	}
-
-	psock->save_unhash = sk->sk_prot->unhash;
-	psock->save_close = sk->sk_prot->close;
-	psock->sk_proto = sk->sk_prot;
-
-	/* Build IPv6 sockmap whenever the address of tcpv6_prot changes */
-	if (sk->sk_family == AF_INET6 &&
-	    unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
-		spin_lock_bh(&tcpv6_prot_lock);
-		if (likely(sk->sk_prot != saved_tcpv6_prot)) {
-			build_protos(bpf_tcp_prots[SOCKMAP_IPV6], sk->sk_prot);
-			smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
-		}
-		spin_unlock_bh(&tcpv6_prot_lock);
-	}
-	update_sk_prot(sk, psock);
-	rcu_read_unlock();
-	return 0;
-}
-
-static int __init bpf_sock_init(void)
-{
-	build_protos(bpf_tcp_prots[SOCKMAP_IPV4], &tcp_prot);
-	return 0;
-}
-core_initcall(bpf_sock_init);
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge);
-
-static void bpf_tcp_release(struct sock *sk)
-{
-	struct smap_psock *psock;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock))
-		goto out;
-
-	if (psock->cork) {
-		free_start_sg(psock->sock, psock->cork, true);
-		kfree(psock->cork);
-		psock->cork = NULL;
-	}
-
-	if (psock->sk_proto) {
-		sk->sk_prot = psock->sk_proto;
-		psock->sk_proto = NULL;
-	}
-out:
-	rcu_read_unlock();
-}
-
-static struct htab_elem *lookup_elem_raw(struct hlist_head *head,
-					 u32 hash, void *key, u32 key_size)
-{
-	struct htab_elem *l;
-
-	hlist_for_each_entry_rcu(l, head, hash_node) {
-		if (l->hash == hash && !memcmp(&l->key, key, key_size))
-			return l;
-	}
-
-	return NULL;
-}
-
-static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
-{
-	return &htab->buckets[hash & (htab->n_buckets - 1)];
-}
-
-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
-{
-	return &__select_bucket(htab, hash)->head;
-}
-
-static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
-{
-	atomic_dec(&htab->count);
-	kfree_rcu(l, rcu);
-}
-
-static struct smap_psock_map_entry *psock_map_pop(struct sock *sk,
-						  struct smap_psock *psock)
-{
-	struct smap_psock_map_entry *e;
-
-	spin_lock_bh(&psock->maps_lock);
-	e = list_first_entry_or_null(&psock->maps,
-				     struct smap_psock_map_entry,
-				     list);
-	if (e)
-		list_del(&e->list);
-	spin_unlock_bh(&psock->maps_lock);
-	return e;
-}
-
-static void bpf_tcp_remove(struct sock *sk, struct smap_psock *psock)
-{
-	struct smap_psock_map_entry *e;
-	struct sk_msg_buff *md, *mtmp;
-	struct sock *osk;
-
-	if (psock->cork) {
-		free_start_sg(psock->sock, psock->cork, true);
-		kfree(psock->cork);
-		psock->cork = NULL;
-	}
-
-	list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
-		list_del(&md->list);
-		free_start_sg(psock->sock, md, true);
-		kfree(md);
-	}
-
-	e = psock_map_pop(sk, psock);
-	while (e) {
-		if (e->entry) {
-			struct bpf_stab *stab = container_of(e->map, struct bpf_stab, map);
-
-			raw_spin_lock_bh(&stab->lock);
-			osk = *e->entry;
-			if (osk == sk) {
-				*e->entry = NULL;
-				smap_release_sock(psock, sk);
-			}
-			raw_spin_unlock_bh(&stab->lock);
-		} else {
-			struct htab_elem *link = rcu_dereference(e->hash_link);
-			struct bpf_htab *htab = container_of(e->map, struct bpf_htab, map);
-			struct hlist_head *head;
-			struct htab_elem *l;
-			struct bucket *b;
-
-			b = __select_bucket(htab, link->hash);
-			head = &b->head;
-			raw_spin_lock_bh(&b->lock);
-			l = lookup_elem_raw(head,
-					    link->hash, link->key,
-					    htab->map.key_size);
-			/* If another thread deleted this object skip deletion.
-			 * The refcnt on psock may or may not be zero.
-			 */
-			if (l && l == link) {
-				hlist_del_rcu(&link->hash_node);
-				smap_release_sock(psock, link->sk);
-				free_htab_elem(htab, link);
-			}
-			raw_spin_unlock_bh(&b->lock);
-		}
-		kfree(e);
-		e = psock_map_pop(sk, psock);
-	}
-}
-
-static void bpf_tcp_unhash(struct sock *sk)
-{
-	void (*unhash_fun)(struct sock *sk);
-	struct smap_psock *psock;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock)) {
-		rcu_read_unlock();
-		if (sk->sk_prot->unhash)
-			sk->sk_prot->unhash(sk);
-		return;
-	}
-	unhash_fun = psock->save_unhash;
-	bpf_tcp_remove(sk, psock);
-	rcu_read_unlock();
-	unhash_fun(sk);
-}
-
-static void bpf_tcp_close(struct sock *sk, long timeout)
-{
-	void (*close_fun)(struct sock *sk, long timeout);
-	struct smap_psock *psock;
-
-	lock_sock(sk);
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock)) {
-		rcu_read_unlock();
-		release_sock(sk);
-		return sk->sk_prot->close(sk, timeout);
-	}
-	close_fun = psock->save_close;
-	bpf_tcp_remove(sk, psock);
-	rcu_read_unlock();
-	release_sock(sk);
-	close_fun(sk, timeout);
-}
-
-enum __sk_action {
-	__SK_DROP = 0,
-	__SK_PASS,
-	__SK_REDIRECT,
-	__SK_NONE,
-};
-
-static int memcopy_from_iter(struct sock *sk,
-			     struct sk_msg_buff *md,
-			     struct iov_iter *from, int bytes)
-{
-	struct scatterlist *sg = md->sg_data;
-	int i = md->sg_curr, rc = -ENOSPC;
-
-	do {
-		int copy;
-		char *to;
-
-		if (md->sg_copybreak >= sg[i].length) {
-			md->sg_copybreak = 0;
-
-			if (++i == MAX_SKB_FRAGS)
-				i = 0;
-
-			if (i == md->sg_end)
-				break;
-		}
-
-		copy = sg[i].length - md->sg_copybreak;
-		to = sg_virt(&sg[i]) + md->sg_copybreak;
-		md->sg_copybreak += copy;
-
-		if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
-			rc = copy_from_iter_nocache(to, copy, from);
-		else
-			rc = copy_from_iter(to, copy, from);
-
-		if (rc != copy) {
-			rc = -EFAULT;
-			goto out;
-		}
-
-		bytes -= copy;
-		if (!bytes)
-			break;
-
-		md->sg_copybreak = 0;
-		if (++i == MAX_SKB_FRAGS)
-			i = 0;
-	} while (i != md->sg_end);
-out:
-	md->sg_curr = i;
-	return rc;
-}
-
-static int bpf_tcp_push(struct sock *sk, int apply_bytes,
-			struct sk_msg_buff *md,
-			int flags, bool uncharge)
-{
-	bool apply = apply_bytes;
-	struct scatterlist *sg;
-	int offset, ret = 0;
-	struct page *p;
-	size_t size;
-
-	while (1) {
-		sg = md->sg_data + md->sg_start;
-		size = (apply && apply_bytes < sg->length) ?
-			apply_bytes : sg->length;
-		offset = sg->offset;
-
-		tcp_rate_check_app_limited(sk);
-		p = sg_page(sg);
-retry:
-		ret = do_tcp_sendpages(sk, p, offset, size, flags);
-		if (ret != size) {
-			if (ret > 0) {
-				if (apply)
-					apply_bytes -= ret;
-
-				sg->offset += ret;
-				sg->length -= ret;
-				size -= ret;
-				offset += ret;
-				if (uncharge)
-					sk_mem_uncharge(sk, ret);
-				goto retry;
-			}
-
-			return ret;
-		}
-
-		if (apply)
-			apply_bytes -= ret;
-		sg->offset += ret;
-		sg->length -= ret;
-		if (uncharge)
-			sk_mem_uncharge(sk, ret);
-
-		if (!sg->length) {
-			put_page(p);
-			md->sg_start++;
-			if (md->sg_start == MAX_SKB_FRAGS)
-				md->sg_start = 0;
-			sg_init_table(sg, 1);
-
-			if (md->sg_start == md->sg_end)
-				break;
-		}
-
-		if (apply && !apply_bytes)
-			break;
-	}
-	return 0;
-}
-
-static inline void bpf_compute_data_pointers_sg(struct sk_msg_buff *md)
-{
-	struct scatterlist *sg = md->sg_data + md->sg_start;
-
-	if (md->sg_copy[md->sg_start]) {
-		md->data = md->data_end = 0;
-	} else {
-		md->data = sg_virt(sg);
-		md->data_end = md->data + sg->length;
-	}
-}
-
-static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
-{
-	struct scatterlist *sg = md->sg_data;
-	int i = md->sg_start;
-
-	do {
-		int uncharge = (bytes < sg[i].length) ? bytes : sg[i].length;
-
-		sk_mem_uncharge(sk, uncharge);
-		bytes -= uncharge;
-		if (!bytes)
-			break;
-		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-	} while (i != md->sg_end);
-}
-
-static void free_bytes_sg(struct sock *sk, int bytes,
-			  struct sk_msg_buff *md, bool charge)
-{
-	struct scatterlist *sg = md->sg_data;
-	int i = md->sg_start, free;
-
-	while (bytes && sg[i].length) {
-		free = sg[i].length;
-		if (bytes < free) {
-			sg[i].length -= bytes;
-			sg[i].offset += bytes;
-			if (charge)
-				sk_mem_uncharge(sk, bytes);
-			break;
-		}
-
-		if (charge)
-			sk_mem_uncharge(sk, sg[i].length);
-		put_page(sg_page(&sg[i]));
-		bytes -= sg[i].length;
-		sg[i].length = 0;
-		sg[i].page_link = 0;
-		sg[i].offset = 0;
-		i++;
-
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-	}
-	md->sg_start = i;
-}
-
-static int free_sg(struct sock *sk, int start,
-		   struct sk_msg_buff *md, bool charge)
-{
-	struct scatterlist *sg = md->sg_data;
-	int i = start, free = 0;
-
-	while (sg[i].length) {
-		free += sg[i].length;
-		if (charge)
-			sk_mem_uncharge(sk, sg[i].length);
-		if (!md->skb)
-			put_page(sg_page(&sg[i]));
-		sg[i].length = 0;
-		sg[i].page_link = 0;
-		sg[i].offset = 0;
-		i++;
-
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-	}
-	consume_skb(md->skb);
-
-	return free;
-}
-
-static int free_start_sg(struct sock *sk, struct sk_msg_buff *md, bool charge)
-{
-	int free = free_sg(sk, md->sg_start, md, charge);
-
-	md->sg_start = md->sg_end;
-	return free;
-}
-
-static int free_curr_sg(struct sock *sk, struct sk_msg_buff *md)
-{
-	return free_sg(sk, md->sg_curr, md, true);
-}
-
-static int bpf_map_msg_verdict(int _rc, struct sk_msg_buff *md)
-{
-	return ((_rc == SK_PASS) ?
-	       (md->sk_redir ? __SK_REDIRECT : __SK_PASS) :
-	       __SK_DROP);
-}
-
-static unsigned int smap_do_tx_msg(struct sock *sk,
-				   struct smap_psock *psock,
-				   struct sk_msg_buff *md)
-{
-	struct bpf_prog *prog;
-	unsigned int rc, _rc;
-
-	preempt_disable();
-	rcu_read_lock();
-
-	/* If the policy was removed mid-send then default to 'accept' */
-	prog = READ_ONCE(psock->bpf_tx_msg);
-	if (unlikely(!prog)) {
-		_rc = SK_PASS;
-		goto verdict;
-	}
-
-	bpf_compute_data_pointers_sg(md);
-	md->sk = sk;
-	rc = (*prog->bpf_func)(md, prog->insnsi);
-	psock->apply_bytes = md->apply_bytes;
-
-	/* Moving return codes from UAPI namespace into internal namespace */
-	_rc = bpf_map_msg_verdict(rc, md);
-
-	/* The psock has a refcount on the sock but not on the map and because
-	 * we need to drop rcu read lock here its possible the map could be
-	 * removed between here and when we need it to execute the sock
-	 * redirect. So do the map lookup now for future use.
-	 */
-	if (_rc == __SK_REDIRECT) {
-		if (psock->sk_redir)
-			sock_put(psock->sk_redir);
-		psock->sk_redir = do_msg_redirect_map(md);
-		if (!psock->sk_redir) {
-			_rc = __SK_DROP;
-			goto verdict;
-		}
-		sock_hold(psock->sk_redir);
-	}
-verdict:
-	rcu_read_unlock();
-	preempt_enable();
-
-	return _rc;
-}
-
-static int bpf_tcp_ingress(struct sock *sk, int apply_bytes,
-			   struct smap_psock *psock,
-			   struct sk_msg_buff *md, int flags)
-{
-	bool apply = apply_bytes;
-	size_t size, copied = 0;
-	struct sk_msg_buff *r;
-	int err = 0, i;
-
-	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL);
-	if (unlikely(!r))
-		return -ENOMEM;
-
-	lock_sock(sk);
-	r->sg_start = md->sg_start;
-	i = md->sg_start;
-
-	do {
-		size = (apply && apply_bytes < md->sg_data[i].length) ?
-			apply_bytes : md->sg_data[i].length;
-
-		if (!sk_wmem_schedule(sk, size)) {
-			if (!copied)
-				err = -ENOMEM;
-			break;
-		}
-
-		sk_mem_charge(sk, size);
-		r->sg_data[i] = md->sg_data[i];
-		r->sg_data[i].length = size;
-		md->sg_data[i].length -= size;
-		md->sg_data[i].offset += size;
-		copied += size;
-
-		if (md->sg_data[i].length) {
-			get_page(sg_page(&r->sg_data[i]));
-			r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1;
-		} else {
-			i++;
-			if (i == MAX_SKB_FRAGS)
-				i = 0;
-			r->sg_end = i;
-		}
-
-		if (apply) {
-			apply_bytes -= size;
-			if (!apply_bytes)
-				break;
-		}
-	} while (i != md->sg_end);
-
-	md->sg_start = i;
-
-	if (!err) {
-		list_add_tail(&r->list, &psock->ingress);
-		sk->sk_data_ready(sk);
-	} else {
-		free_start_sg(sk, r, true);
-		kfree(r);
-	}
-
-	release_sock(sk);
-	return err;
-}
-
-static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
-				       struct sk_msg_buff *md,
-				       int flags)
-{
-	bool ingress = !!(md->flags & BPF_F_INGRESS);
-	struct smap_psock *psock;
-	int err = 0;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock))
-		goto out_rcu;
-
-	if (!refcount_inc_not_zero(&psock->refcnt))
-		goto out_rcu;
-
-	rcu_read_unlock();
-
-	if (ingress) {
-		err = bpf_tcp_ingress(sk, send, psock, md, flags);
-	} else {
-		lock_sock(sk);
-		err = bpf_tcp_push(sk, send, md, flags, false);
-		release_sock(sk);
-	}
-	smap_release_sock(psock, sk);
-	return err;
-out_rcu:
-	rcu_read_unlock();
-	return 0;
-}
-
-static inline void bpf_md_init(struct smap_psock *psock)
-{
-	if (!psock->apply_bytes) {
-		psock->eval =  __SK_NONE;
-		if (psock->sk_redir) {
-			sock_put(psock->sk_redir);
-			psock->sk_redir = NULL;
-		}
-	}
-}
-
-static void apply_bytes_dec(struct smap_psock *psock, int i)
-{
-	if (psock->apply_bytes) {
-		if (psock->apply_bytes < i)
-			psock->apply_bytes = 0;
-		else
-			psock->apply_bytes -= i;
-	}
-}
-
-static int bpf_exec_tx_verdict(struct smap_psock *psock,
-			       struct sk_msg_buff *m,
-			       struct sock *sk,
-			       int *copied, int flags)
-{
-	bool cork = false, enospc = (m->sg_start == m->sg_end);
-	struct sock *redir;
-	int err = 0;
-	int send;
-
-more_data:
-	if (psock->eval == __SK_NONE)
-		psock->eval = smap_do_tx_msg(sk, psock, m);
-
-	if (m->cork_bytes &&
-	    m->cork_bytes > psock->sg_size && !enospc) {
-		psock->cork_bytes = m->cork_bytes - psock->sg_size;
-		if (!psock->cork) {
-			psock->cork = kcalloc(1,
-					sizeof(struct sk_msg_buff),
-					GFP_ATOMIC | __GFP_NOWARN);
-
-			if (!psock->cork) {
-				err = -ENOMEM;
-				goto out_err;
-			}
-		}
-		memcpy(psock->cork, m, sizeof(*m));
-		goto out_err;
-	}
-
-	send = psock->sg_size;
-	if (psock->apply_bytes && psock->apply_bytes < send)
-		send = psock->apply_bytes;
-
-	switch (psock->eval) {
-	case __SK_PASS:
-		err = bpf_tcp_push(sk, send, m, flags, true);
-		if (unlikely(err)) {
-			*copied -= free_start_sg(sk, m, true);
-			break;
-		}
-
-		apply_bytes_dec(psock, send);
-		psock->sg_size -= send;
-		break;
-	case __SK_REDIRECT:
-		redir = psock->sk_redir;
-		apply_bytes_dec(psock, send);
-
-		if (psock->cork) {
-			cork = true;
-			psock->cork = NULL;
-		}
-
-		return_mem_sg(sk, send, m);
-		release_sock(sk);
-
-		err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
-		lock_sock(sk);
-
-		if (unlikely(err < 0)) {
-			int free = free_start_sg(sk, m, false);
-
-			psock->sg_size = 0;
-			if (!cork)
-				*copied -= free;
-		} else {
-			psock->sg_size -= send;
-		}
-
-		if (cork) {
-			free_start_sg(sk, m, true);
-			psock->sg_size = 0;
-			kfree(m);
-			m = NULL;
-			err = 0;
-		}
-		break;
-	case __SK_DROP:
-	default:
-		free_bytes_sg(sk, send, m, true);
-		apply_bytes_dec(psock, send);
-		*copied -= send;
-		psock->sg_size -= send;
-		err = -EACCES;
-		break;
-	}
-
-	if (likely(!err)) {
-		bpf_md_init(psock);
-		if (m &&
-		    m->sg_data[m->sg_start].page_link &&
-		    m->sg_data[m->sg_start].length)
-			goto more_data;
-	}
-
-out_err:
-	return err;
-}
-
-static int bpf_wait_data(struct sock *sk,
-			 struct smap_psock *psk, int flags,
-			 long timeo, int *err)
-{
-	int rc;
-
-	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
-	add_wait_queue(sk_sleep(sk), &wait);
-	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	rc = sk_wait_event(sk, &timeo,
-			   !list_empty(&psk->ingress) ||
-			   !skb_queue_empty(&sk->sk_receive_queue),
-			   &wait);
-	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
-	remove_wait_queue(sk_sleep(sk), &wait);
-
-	return rc;
-}
-
-static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
-			   int nonblock, int flags, int *addr_len)
-{
-	struct iov_iter *iter = &msg->msg_iter;
-	struct smap_psock *psock;
-	int copied = 0;
-
-	if (unlikely(flags & MSG_ERRQUEUE))
-		return inet_recv_error(sk, msg, len, addr_len);
-	if (!skb_queue_empty(&sk->sk_receive_queue))
-		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock))
-		goto out;
-
-	if (unlikely(!refcount_inc_not_zero(&psock->refcnt)))
-		goto out;
-	rcu_read_unlock();
-
-	lock_sock(sk);
-bytes_ready:
-	while (copied != len) {
-		struct scatterlist *sg;
-		struct sk_msg_buff *md;
-		int i;
-
-		md = list_first_entry_or_null(&psock->ingress,
-					      struct sk_msg_buff, list);
-		if (unlikely(!md))
-			break;
-		i = md->sg_start;
-		do {
-			struct page *page;
-			int n, copy;
-
-			sg = &md->sg_data[i];
-			copy = sg->length;
-			page = sg_page(sg);
-
-			if (copied + copy > len)
-				copy = len - copied;
-
-			n = copy_page_to_iter(page, sg->offset, copy, iter);
-			if (n != copy) {
-				md->sg_start = i;
-				release_sock(sk);
-				smap_release_sock(psock, sk);
-				return -EFAULT;
-			}
-
-			copied += copy;
-			sg->offset += copy;
-			sg->length -= copy;
-			sk_mem_uncharge(sk, copy);
-
-			if (!sg->length) {
-				i++;
-				if (i == MAX_SKB_FRAGS)
-					i = 0;
-				if (!md->skb)
-					put_page(page);
-			}
-			if (copied == len)
-				break;
-		} while (i != md->sg_end);
-		md->sg_start = i;
-
-		if (!sg->length && md->sg_start == md->sg_end) {
-			list_del(&md->list);
-			consume_skb(md->skb);
-			kfree(md);
-		}
-	}
-
-	if (!copied) {
-		long timeo;
-		int data;
-		int err = 0;
-
-		timeo = sock_rcvtimeo(sk, nonblock);
-		data = bpf_wait_data(sk, psock, flags, timeo, &err);
-
-		if (data) {
-			if (!skb_queue_empty(&sk->sk_receive_queue)) {
-				release_sock(sk);
-				smap_release_sock(psock, sk);
-				copied = tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-				return copied;
-			}
-			goto bytes_ready;
-		}
-
-		if (err)
-			copied = err;
-	}
-
-	release_sock(sk);
-	smap_release_sock(psock, sk);
-	return copied;
-out:
-	rcu_read_unlock();
-	return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
-}
-
-
-static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
-{
-	int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
-	struct sk_msg_buff md = {0};
-	unsigned int sg_copy = 0;
-	struct smap_psock *psock;
-	int copied = 0, err = 0;
-	struct scatterlist *sg;
-	long timeo;
-
-	/* Its possible a sock event or user removed the psock _but_ the ops
-	 * have not been reprogrammed yet so we get here. In this case fallback
-	 * to tcp_sendmsg. Note this only works because we _only_ ever allow
-	 * a single ULP there is no hierarchy here.
-	 */
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock)) {
-		rcu_read_unlock();
-		return tcp_sendmsg(sk, msg, size);
-	}
-
-	/* Increment the psock refcnt to ensure its not released while sending a
-	 * message. Required because sk lookup and bpf programs are used in
-	 * separate rcu critical sections. Its OK if we lose the map entry
-	 * but we can't lose the sock reference.
-	 */
-	if (!refcount_inc_not_zero(&psock->refcnt)) {
-		rcu_read_unlock();
-		return tcp_sendmsg(sk, msg, size);
-	}
-
-	sg = md.sg_data;
-	sg_init_marker(sg, MAX_SKB_FRAGS);
-	rcu_read_unlock();
-
-	lock_sock(sk);
-	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
-
-	while (msg_data_left(msg)) {
-		struct sk_msg_buff *m = NULL;
-		bool enospc = false;
-		int copy;
-
-		if (sk->sk_err) {
-			err = -sk->sk_err;
-			goto out_err;
-		}
-
-		copy = msg_data_left(msg);
-		if (!sk_stream_memory_free(sk))
-			goto wait_for_sndbuf;
-
-		m = psock->cork_bytes ? psock->cork : &md;
-		m->sg_curr = m->sg_copybreak ? m->sg_curr : m->sg_end;
-		err = sk_alloc_sg(sk, copy, m->sg_data,
-				  m->sg_start, &m->sg_end, &sg_copy,
-				  m->sg_end - 1);
-		if (err) {
-			if (err != -ENOSPC)
-				goto wait_for_memory;
-			enospc = true;
-			copy = sg_copy;
-		}
-
-		err = memcopy_from_iter(sk, m, &msg->msg_iter, copy);
-		if (err < 0) {
-			free_curr_sg(sk, m);
-			goto out_err;
-		}
-
-		psock->sg_size += copy;
-		copied += copy;
-		sg_copy = 0;
-
-		/* When bytes are being corked skip running BPF program and
-		 * applying verdict unless there is no more buffer space. In
-		 * the ENOSPC case simply run BPF prorgram with currently
-		 * accumulated data. We don't have much choice at this point
-		 * we could try extending the page frags or chaining complex
-		 * frags but even in these cases _eventually_ we will hit an
-		 * OOM scenario. More complex recovery schemes may be
-		 * implemented in the future, but BPF programs must handle
-		 * the case where apply_cork requests are not honored. The
-		 * canonical method to verify this is to check data length.
-		 */
-		if (psock->cork_bytes) {
-			if (copy > psock->cork_bytes)
-				psock->cork_bytes = 0;
-			else
-				psock->cork_bytes -= copy;
-
-			if (psock->cork_bytes && !enospc)
-				goto out_cork;
-
-			/* All cork bytes accounted for re-run filter */
-			psock->eval = __SK_NONE;
-			psock->cork_bytes = 0;
-		}
-
-		err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
-		if (unlikely(err < 0))
-			goto out_err;
-		continue;
-wait_for_sndbuf:
-		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
-		err = sk_stream_wait_memory(sk, &timeo);
-		if (err) {
-			if (m && m != psock->cork)
-				free_start_sg(sk, m, true);
-			goto out_err;
-		}
-	}
-out_err:
-	if (err < 0)
-		err = sk_stream_error(sk, msg->msg_flags, err);
-out_cork:
-	release_sock(sk);
-	smap_release_sock(psock, sk);
-	return copied ? copied : err;
-}
-
-static int bpf_tcp_sendpage(struct sock *sk, struct page *page,
-			    int offset, size_t size, int flags)
-{
-	struct sk_msg_buff md = {0}, *m = NULL;
-	int err = 0, copied = 0;
-	struct smap_psock *psock;
-	struct scatterlist *sg;
-	bool enospc = false;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (unlikely(!psock))
-		goto accept;
-
-	if (!refcount_inc_not_zero(&psock->refcnt))
-		goto accept;
-	rcu_read_unlock();
-
-	lock_sock(sk);
-
-	if (psock->cork_bytes) {
-		m = psock->cork;
-		sg = &m->sg_data[m->sg_end];
-	} else {
-		m = &md;
-		sg = m->sg_data;
-		sg_init_marker(sg, MAX_SKB_FRAGS);
-	}
-
-	/* Catch case where ring is full and sendpage is stalled. */
-	if (unlikely(m->sg_end == m->sg_start &&
-	    m->sg_data[m->sg_end].length))
-		goto out_err;
-
-	psock->sg_size += size;
-	sg_set_page(sg, page, size, offset);
-	get_page(page);
-	m->sg_copy[m->sg_end] = true;
-	sk_mem_charge(sk, size);
-	m->sg_end++;
-	copied = size;
-
-	if (m->sg_end == MAX_SKB_FRAGS)
-		m->sg_end = 0;
-
-	if (m->sg_end == m->sg_start)
-		enospc = true;
-
-	if (psock->cork_bytes) {
-		if (size > psock->cork_bytes)
-			psock->cork_bytes = 0;
-		else
-			psock->cork_bytes -= size;
-
-		if (psock->cork_bytes && !enospc)
-			goto out_err;
-
-		/* All cork bytes accounted for re-run filter */
-		psock->eval = __SK_NONE;
-		psock->cork_bytes = 0;
-	}
-
-	err = bpf_exec_tx_verdict(psock, m, sk, &copied, flags);
-out_err:
-	release_sock(sk);
-	smap_release_sock(psock, sk);
-	return copied ? copied : err;
-accept:
-	rcu_read_unlock();
-	return tcp_sendpage(sk, page, offset, size, flags);
-}
-
-static void bpf_tcp_msg_add(struct smap_psock *psock,
-			    struct sock *sk,
-			    struct bpf_prog *tx_msg)
-{
-	struct bpf_prog *orig_tx_msg;
-
-	orig_tx_msg = xchg(&psock->bpf_tx_msg, tx_msg);
-	if (orig_tx_msg)
-		bpf_prog_put(orig_tx_msg);
-}
-
-static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
-{
-	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
-	int rc;
-
-	if (unlikely(!prog))
-		return __SK_DROP;
-
-	skb_orphan(skb);
-	/* We need to ensure that BPF metadata for maps is also cleared
-	 * when we orphan the skb so that we don't have the possibility
-	 * to reference a stale map.
-	 */
-	TCP_SKB_CB(skb)->bpf.sk_redir = NULL;
-	skb->sk = psock->sock;
-	bpf_compute_data_end_sk_skb(skb);
-	preempt_disable();
-	rc = (*prog->bpf_func)(skb, prog->insnsi);
-	preempt_enable();
-	skb->sk = NULL;
-
-	/* Moving return codes from UAPI namespace into internal namespace */
-	return rc == SK_PASS ?
-		(TCP_SKB_CB(skb)->bpf.sk_redir ? __SK_REDIRECT : __SK_PASS) :
-		__SK_DROP;
-}
-
-static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb)
-{
-	struct sock *sk = psock->sock;
-	int copied = 0, num_sg;
-	struct sk_msg_buff *r;
-
-	r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC);
-	if (unlikely(!r))
-		return -EAGAIN;
-
-	if (!sk_rmem_schedule(sk, skb, skb->len)) {
-		kfree(r);
-		return -EAGAIN;
-	}
-
-	sg_init_table(r->sg_data, MAX_SKB_FRAGS);
-	num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len);
-	if (unlikely(num_sg < 0)) {
-		kfree(r);
-		return num_sg;
-	}
-	sk_mem_charge(sk, skb->len);
-	copied = skb->len;
-	r->sg_start = 0;
-	r->sg_end = num_sg == MAX_SKB_FRAGS ? 0 : num_sg;
-	r->skb = skb;
-	list_add_tail(&r->list, &psock->ingress);
-	sk->sk_data_ready(sk);
-	return copied;
-}
-
-static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
-{
-	struct smap_psock *peer;
-	struct sock *sk;
-	__u32 in;
-	int rc;
-
-	rc = smap_verdict_func(psock, skb);
-	switch (rc) {
-	case __SK_REDIRECT:
-		sk = do_sk_redirect_map(skb);
-		if (!sk) {
-			kfree_skb(skb);
-			break;
-		}
-
-		peer = smap_psock_sk(sk);
-		in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
-
-		if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) ||
-			     !test_bit(SMAP_TX_RUNNING, &peer->state))) {
-			kfree_skb(skb);
-			break;
-		}
-
-		if (!in && sock_writeable(sk)) {
-			skb_set_owner_w(skb, sk);
-			skb_queue_tail(&peer->rxqueue, skb);
-			schedule_work(&peer->tx_work);
-			break;
-		} else if (in &&
-			   atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
-			skb_queue_tail(&peer->rxqueue, skb);
-			schedule_work(&peer->tx_work);
-			break;
-		}
-	/* Fall through and free skb otherwise */
-	case __SK_DROP:
-	default:
-		kfree_skb(skb);
-	}
-}
-
-static void smap_report_sk_error(struct smap_psock *psock, int err)
-{
-	struct sock *sk = psock->sock;
-
-	sk->sk_err = err;
-	sk->sk_error_report(sk);
-}
-
-static void smap_read_sock_strparser(struct strparser *strp,
-				     struct sk_buff *skb)
-{
-	struct smap_psock *psock;
-
-	rcu_read_lock();
-	psock = container_of(strp, struct smap_psock, strp);
-	smap_do_verdict(psock, skb);
-	rcu_read_unlock();
-}
-
-/* Called with lock held on socket */
-static void smap_data_ready(struct sock *sk)
-{
-	struct smap_psock *psock;
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (likely(psock)) {
-		write_lock_bh(&sk->sk_callback_lock);
-		strp_data_ready(&psock->strp);
-		write_unlock_bh(&sk->sk_callback_lock);
-	}
-	rcu_read_unlock();
-}
-
-static void smap_tx_work(struct work_struct *w)
-{
-	struct smap_psock *psock;
-	struct sk_buff *skb;
-	int rem, off, n;
-
-	psock = container_of(w, struct smap_psock, tx_work);
-
-	/* lock sock to avoid losing sk_socket at some point during loop */
-	lock_sock(psock->sock);
-	if (psock->save_skb) {
-		skb = psock->save_skb;
-		rem = psock->save_rem;
-		off = psock->save_off;
-		psock->save_skb = NULL;
-		goto start;
-	}
-
-	while ((skb = skb_dequeue(&psock->rxqueue))) {
-		__u32 flags;
-
-		rem = skb->len;
-		off = 0;
-start:
-		flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS;
-		do {
-			if (likely(psock->sock->sk_socket)) {
-				if (flags)
-					n = smap_do_ingress(psock, skb);
-				else
-					n = skb_send_sock_locked(psock->sock,
-								 skb, off, rem);
-			} else {
-				n = -EINVAL;
-			}
-
-			if (n <= 0) {
-				if (n == -EAGAIN) {
-					/* Retry when space is available */
-					psock->save_skb = skb;
-					psock->save_rem = rem;
-					psock->save_off = off;
-					goto out;
-				}
-				/* Hard errors break pipe and stop xmit */
-				smap_report_sk_error(psock, n ? -n : EPIPE);
-				clear_bit(SMAP_TX_RUNNING, &psock->state);
-				kfree_skb(skb);
-				goto out;
-			}
-			rem -= n;
-			off += n;
-		} while (rem);
-
-		if (!flags)
-			kfree_skb(skb);
-	}
-out:
-	release_sock(psock->sock);
-}
-
-static void smap_write_space(struct sock *sk)
-{
-	struct smap_psock *psock;
-	void (*write_space)(struct sock *sk);
-
-	rcu_read_lock();
-	psock = smap_psock_sk(sk);
-	if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
-		schedule_work(&psock->tx_work);
-	write_space = psock->save_write_space;
-	rcu_read_unlock();
-	write_space(sk);
-}
-
-static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
-{
-	if (!psock->strp_enabled)
-		return;
-	sk->sk_data_ready = psock->save_data_ready;
-	sk->sk_write_space = psock->save_write_space;
-	psock->save_data_ready = NULL;
-	psock->save_write_space = NULL;
-	strp_stop(&psock->strp);
-	psock->strp_enabled = false;
-}
-
-static void smap_destroy_psock(struct rcu_head *rcu)
-{
-	struct smap_psock *psock = container_of(rcu,
-						  struct smap_psock, rcu);
-
-	/* Now that a grace period has passed there is no longer
-	 * any reference to this sock in the sockmap so we can
-	 * destroy the psock, strparser, and bpf programs. But,
-	 * because we use workqueue sync operations we can not
-	 * do it in rcu context
-	 */
-	schedule_work(&psock->gc_work);
-}
-
-static bool psock_is_smap_sk(struct sock *sk)
-{
-	return inet_csk(sk)->icsk_ulp_ops == &bpf_tcp_ulp_ops;
-}
-
-static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
-{
-	if (refcount_dec_and_test(&psock->refcnt)) {
-		if (psock_is_smap_sk(sock))
-			bpf_tcp_release(sock);
-		write_lock_bh(&sock->sk_callback_lock);
-		smap_stop_sock(psock, sock);
-		write_unlock_bh(&sock->sk_callback_lock);
-		clear_bit(SMAP_TX_RUNNING, &psock->state);
-		rcu_assign_sk_user_data(sock, NULL);
-		call_rcu_sched(&psock->rcu, smap_destroy_psock);
-	}
-}
-
-static int smap_parse_func_strparser(struct strparser *strp,
-				       struct sk_buff *skb)
-{
-	struct smap_psock *psock;
-	struct bpf_prog *prog;
-	int rc;
-
-	rcu_read_lock();
-	psock = container_of(strp, struct smap_psock, strp);
-	prog = READ_ONCE(psock->bpf_parse);
-
-	if (unlikely(!prog)) {
-		rcu_read_unlock();
-		return skb->len;
-	}
-
-	/* Attach socket for bpf program to use if needed we can do this
-	 * because strparser clones the skb before handing it to a upper
-	 * layer, meaning skb_orphan has been called. We NULL sk on the
-	 * way out to ensure we don't trigger a BUG_ON in skb/sk operations
-	 * later and because we are not charging the memory of this skb to
-	 * any socket yet.
-	 */
-	skb->sk = psock->sock;
-	bpf_compute_data_end_sk_skb(skb);
-	rc = (*prog->bpf_func)(skb, prog->insnsi);
-	skb->sk = NULL;
-	rcu_read_unlock();
-	return rc;
-}
-
-static int smap_read_sock_done(struct strparser *strp, int err)
-{
-	return err;
-}
-
-static int smap_init_sock(struct smap_psock *psock,
-			  struct sock *sk)
-{
-	static const struct strp_callbacks cb = {
-		.rcv_msg = smap_read_sock_strparser,
-		.parse_msg = smap_parse_func_strparser,
-		.read_sock_done = smap_read_sock_done,
-	};
-
-	return strp_init(&psock->strp, sk, &cb);
-}
-
-static void smap_init_progs(struct smap_psock *psock,
-			    struct bpf_prog *verdict,
-			    struct bpf_prog *parse)
-{
-	struct bpf_prog *orig_parse, *orig_verdict;
-
-	orig_parse = xchg(&psock->bpf_parse, parse);
-	orig_verdict = xchg(&psock->bpf_verdict, verdict);
-
-	if (orig_verdict)
-		bpf_prog_put(orig_verdict);
-	if (orig_parse)
-		bpf_prog_put(orig_parse);
-}
-
-static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
-{
-	if (sk->sk_data_ready == smap_data_ready)
-		return;
-	psock->save_data_ready = sk->sk_data_ready;
-	psock->save_write_space = sk->sk_write_space;
-	sk->sk_data_ready = smap_data_ready;
-	sk->sk_write_space = smap_write_space;
-	psock->strp_enabled = true;
-}
-
-static void sock_map_remove_complete(struct bpf_stab *stab)
-{
-	bpf_map_area_free(stab->sock_map);
-	kfree(stab);
-}
-
-static void smap_gc_work(struct work_struct *w)
-{
-	struct smap_psock_map_entry *e, *tmp;
-	struct sk_msg_buff *md, *mtmp;
-	struct smap_psock *psock;
-
-	psock = container_of(w, struct smap_psock, gc_work);
-
-	/* no callback lock needed because we already detached sockmap ops */
-	if (psock->strp_enabled)
-		strp_done(&psock->strp);
-
-	cancel_work_sync(&psock->tx_work);
-	__skb_queue_purge(&psock->rxqueue);
-
-	/* At this point all strparser and xmit work must be complete */
-	if (psock->bpf_parse)
-		bpf_prog_put(psock->bpf_parse);
-	if (psock->bpf_verdict)
-		bpf_prog_put(psock->bpf_verdict);
-	if (psock->bpf_tx_msg)
-		bpf_prog_put(psock->bpf_tx_msg);
-
-	if (psock->cork) {
-		free_start_sg(psock->sock, psock->cork, true);
-		kfree(psock->cork);
-	}
-
-	list_for_each_entry_safe(md, mtmp, &psock->ingress, list) {
-		list_del(&md->list);
-		free_start_sg(psock->sock, md, true);
-		kfree(md);
-	}
-
-	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		list_del(&e->list);
-		kfree(e);
-	}
-
-	if (psock->sk_redir)
-		sock_put(psock->sk_redir);
-
-	sock_put(psock->sock);
-	kfree(psock);
-}
-
-static struct smap_psock *smap_init_psock(struct sock *sock, int node)
-{
-	struct smap_psock *psock;
-
-	psock = kzalloc_node(sizeof(struct smap_psock),
-			     GFP_ATOMIC | __GFP_NOWARN,
-			     node);
-	if (!psock)
-		return ERR_PTR(-ENOMEM);
-
-	psock->eval =  __SK_NONE;
-	psock->sock = sock;
-	skb_queue_head_init(&psock->rxqueue);
-	INIT_WORK(&psock->tx_work, smap_tx_work);
-	INIT_WORK(&psock->gc_work, smap_gc_work);
-	INIT_LIST_HEAD(&psock->maps);
-	INIT_LIST_HEAD(&psock->ingress);
-	refcount_set(&psock->refcnt, 1);
-	spin_lock_init(&psock->maps_lock);
-
-	rcu_assign_sk_user_data(sock, psock);
-	sock_hold(sock);
-	return psock;
-}
-
-static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
-{
-	struct bpf_stab *stab;
-	u64 cost;
-	int err;
-
-	if (!capable(CAP_NET_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	/* check sanity of attributes */
-	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
-		return ERR_PTR(-EINVAL);
-
-	stab = kzalloc(sizeof(*stab), GFP_USER);
-	if (!stab)
-		return ERR_PTR(-ENOMEM);
-
-	bpf_map_init_from_attr(&stab->map, attr);
-	raw_spin_lock_init(&stab->lock);
-
-	/* make sure page count doesn't overflow */
-	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
-	err = -EINVAL;
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_stab;
-
-	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-
-	/* if map size is larger than memlock limit, reject it early */
-	err = bpf_map_precharge_memlock(stab->map.pages);
-	if (err)
-		goto free_stab;
-
-	err = -ENOMEM;
-	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
-					    sizeof(struct sock *),
-					    stab->map.numa_node);
-	if (!stab->sock_map)
-		goto free_stab;
-
-	return &stab->map;
-free_stab:
-	kfree(stab);
-	return ERR_PTR(err);
-}
-
-static void smap_list_map_remove(struct smap_psock *psock,
-				 struct sock **entry)
-{
-	struct smap_psock_map_entry *e, *tmp;
-
-	spin_lock_bh(&psock->maps_lock);
-	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		if (e->entry == entry) {
-			list_del(&e->list);
-			kfree(e);
-		}
-	}
-	spin_unlock_bh(&psock->maps_lock);
-}
-
-static void smap_list_hash_remove(struct smap_psock *psock,
-				  struct htab_elem *hash_link)
-{
-	struct smap_psock_map_entry *e, *tmp;
-
-	spin_lock_bh(&psock->maps_lock);
-	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
-		struct htab_elem *c = rcu_dereference(e->hash_link);
-
-		if (c == hash_link) {
-			list_del(&e->list);
-			kfree(e);
-		}
-	}
-	spin_unlock_bh(&psock->maps_lock);
-}
-
-static void sock_map_free(struct bpf_map *map)
-{
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-	int i;
-
-	synchronize_rcu();
-
-	/* At this point no update, lookup or delete operations can happen.
-	 * However, be aware we can still get a socket state event updates,
-	 * and data ready callabacks that reference the psock from sk_user_data
-	 * Also psock worker threads are still in-flight. So smap_release_sock
-	 * will only free the psock after cancel_sync on the worker threads
-	 * and a grace period expire to ensure psock is really safe to remove.
-	 */
-	rcu_read_lock();
-	raw_spin_lock_bh(&stab->lock);
-	for (i = 0; i < stab->map.max_entries; i++) {
-		struct smap_psock *psock;
-		struct sock *sock;
-
-		sock = stab->sock_map[i];
-		if (!sock)
-			continue;
-		stab->sock_map[i] = NULL;
-		psock = smap_psock_sk(sock);
-		/* This check handles a racing sock event that can get the
-		 * sk_callback_lock before this case but after xchg happens
-		 * causing the refcnt to hit zero and sock user data (psock)
-		 * to be null and queued for garbage collection.
-		 */
-		if (likely(psock)) {
-			smap_list_map_remove(psock, &stab->sock_map[i]);
-			smap_release_sock(psock, sock);
-		}
-	}
-	raw_spin_unlock_bh(&stab->lock);
-	rcu_read_unlock();
-
-	sock_map_remove_complete(stab);
-}
-
-static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
-{
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-	u32 i = key ? *(u32 *)key : U32_MAX;
-	u32 *next = (u32 *)next_key;
-
-	if (i >= stab->map.max_entries) {
-		*next = 0;
-		return 0;
-	}
-
-	if (i == stab->map.max_entries - 1)
-		return -ENOENT;
-
-	*next = i + 1;
-	return 0;
-}
-
-struct sock  *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
-{
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
-	if (key >= map->max_entries)
-		return NULL;
-
-	return READ_ONCE(stab->sock_map[key]);
-}
-
-static int sock_map_delete_elem(struct bpf_map *map, void *key)
-{
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-	struct smap_psock *psock;
-	int k = *(u32 *)key;
-	struct sock *sock;
-
-	if (k >= map->max_entries)
-		return -EINVAL;
-
-	raw_spin_lock_bh(&stab->lock);
-	sock = stab->sock_map[k];
-	stab->sock_map[k] = NULL;
-	raw_spin_unlock_bh(&stab->lock);
-	if (!sock)
-		return -EINVAL;
-
-	psock = smap_psock_sk(sock);
-	if (!psock)
-		return 0;
-	if (psock->bpf_parse) {
-		write_lock_bh(&sock->sk_callback_lock);
-		smap_stop_sock(psock, sock);
-		write_unlock_bh(&sock->sk_callback_lock);
-	}
-	smap_list_map_remove(psock, &stab->sock_map[k]);
-	smap_release_sock(psock, sock);
-	return 0;
-}
-
-/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
- * done inside rcu critical sections. This ensures on updates that the psock
- * will not be released via smap_release_sock() until concurrent updates/deletes
- * complete. All operations operate on sock_map using cmpxchg and xchg
- * operations to ensure we do not get stale references. Any reads into the
- * map must be done with READ_ONCE() because of this.
- *
- * A psock is destroyed via call_rcu and after any worker threads are cancelled
- * and syncd so we are certain all references from the update/lookup/delete
- * operations as well as references in the data path are no longer in use.
- *
- * Psocks may exist in multiple maps, but only a single set of parse/verdict
- * programs may be inherited from the maps it belongs to. A reference count
- * is kept with the total number of references to the psock from all maps. The
- * psock will not be released until this reaches zero. The psock and sock
- * user data data use the sk_callback_lock to protect critical data structures
- * from concurrent access. This allows us to avoid two updates from modifying
- * the user data in sock and the lock is required anyways for modifying
- * callbacks, we simply increase its scope slightly.
- *
- * Rules to follow,
- *  - psock must always be read inside RCU critical section
- *  - sk_user_data must only be modified inside sk_callback_lock and read
- *    inside RCU critical section.
- *  - psock->maps list must only be read & modified inside sk_callback_lock
- *  - sock_map must use READ_ONCE and (cmp)xchg operations
- *  - BPF verdict/parse programs must use READ_ONCE and xchg operations
- */
-
-static int __sock_map_ctx_update_elem(struct bpf_map *map,
-				      struct bpf_sock_progs *progs,
-				      struct sock *sock,
-				      void *key)
-{
-	struct bpf_prog *verdict, *parse, *tx_msg;
-	struct smap_psock *psock;
-	bool new = false;
-	int err = 0;
-
-	/* 1. If sock map has BPF programs those will be inherited by the
-	 * sock being added. If the sock is already attached to BPF programs
-	 * this results in an error.
-	 */
-	verdict = READ_ONCE(progs->bpf_verdict);
-	parse = READ_ONCE(progs->bpf_parse);
-	tx_msg = READ_ONCE(progs->bpf_tx_msg);
-
-	if (parse && verdict) {
-		/* bpf prog refcnt may be zero if a concurrent attach operation
-		 * removes the program after the above READ_ONCE() but before
-		 * we increment the refcnt. If this is the case abort with an
-		 * error.
-		 */
-		verdict = bpf_prog_inc_not_zero(verdict);
-		if (IS_ERR(verdict))
-			return PTR_ERR(verdict);
-
-		parse = bpf_prog_inc_not_zero(parse);
-		if (IS_ERR(parse)) {
-			bpf_prog_put(verdict);
-			return PTR_ERR(parse);
-		}
-	}
-
-	if (tx_msg) {
-		tx_msg = bpf_prog_inc_not_zero(tx_msg);
-		if (IS_ERR(tx_msg)) {
-			if (parse && verdict) {
-				bpf_prog_put(parse);
-				bpf_prog_put(verdict);
-			}
-			return PTR_ERR(tx_msg);
-		}
-	}
-
-	psock = smap_psock_sk(sock);
-
-	/* 2. Do not allow inheriting programs if psock exists and has
-	 * already inherited programs. This would create confusion on
-	 * which parser/verdict program is running. If no psock exists
-	 * create one. Inside sk_callback_lock to ensure concurrent create
-	 * doesn't update user data.
-	 */
-	if (psock) {
-		if (!psock_is_smap_sk(sock)) {
-			err = -EBUSY;
-			goto out_progs;
-		}
-		if (READ_ONCE(psock->bpf_parse) && parse) {
-			err = -EBUSY;
-			goto out_progs;
-		}
-		if (READ_ONCE(psock->bpf_tx_msg) && tx_msg) {
-			err = -EBUSY;
-			goto out_progs;
-		}
-		if (!refcount_inc_not_zero(&psock->refcnt)) {
-			err = -EAGAIN;
-			goto out_progs;
-		}
-	} else {
-		psock = smap_init_psock(sock, map->numa_node);
-		if (IS_ERR(psock)) {
-			err = PTR_ERR(psock);
-			goto out_progs;
-		}
-
-		set_bit(SMAP_TX_RUNNING, &psock->state);
-		new = true;
-	}
-
-	/* 3. At this point we have a reference to a valid psock that is
-	 * running. Attach any BPF programs needed.
-	 */
-	if (tx_msg)
-		bpf_tcp_msg_add(psock, sock, tx_msg);
-	if (new) {
-		err = bpf_tcp_init(sock);
-		if (err)
-			goto out_free;
-	}
-
-	if (parse && verdict && !psock->strp_enabled) {
-		err = smap_init_sock(psock, sock);
-		if (err)
-			goto out_free;
-		smap_init_progs(psock, verdict, parse);
-		write_lock_bh(&sock->sk_callback_lock);
-		smap_start_sock(psock, sock);
-		write_unlock_bh(&sock->sk_callback_lock);
-	}
-
-	return err;
-out_free:
-	smap_release_sock(psock, sock);
-out_progs:
-	if (parse && verdict) {
-		bpf_prog_put(parse);
-		bpf_prog_put(verdict);
-	}
-	if (tx_msg)
-		bpf_prog_put(tx_msg);
-	return err;
-}
-
-static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
-				    struct bpf_map *map,
-				    void *key, u64 flags)
-{
-	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-	struct bpf_sock_progs *progs = &stab->progs;
-	struct sock *osock, *sock = skops->sk;
-	struct smap_psock_map_entry *e;
-	struct smap_psock *psock;
-	u32 i = *(u32 *)key;
-	int err;
-
-	if (unlikely(flags > BPF_EXIST))
-		return -EINVAL;
-	if (unlikely(i >= stab->map.max_entries))
-		return -E2BIG;
-
-	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
-	if (!e)
-		return -ENOMEM;
-
-	err = __sock_map_ctx_update_elem(map, progs, sock, key);
-	if (err)
-		goto out;
-
-	/* psock guaranteed to be present. */
-	psock = smap_psock_sk(sock);
-	raw_spin_lock_bh(&stab->lock);
-	osock = stab->sock_map[i];
-	if (osock && flags == BPF_NOEXIST) {
-		err = -EEXIST;
-		goto out_unlock;
-	}
-	if (!osock && flags == BPF_EXIST) {
-		err = -ENOENT;
-		goto out_unlock;
-	}
-
-	e->entry = &stab->sock_map[i];
-	e->map = map;
-	spin_lock_bh(&psock->maps_lock);
-	list_add_tail(&e->list, &psock->maps);
-	spin_unlock_bh(&psock->maps_lock);
-
-	stab->sock_map[i] = sock;
-	if (osock) {
-		psock = smap_psock_sk(osock);
-		smap_list_map_remove(psock, &stab->sock_map[i]);
-		smap_release_sock(psock, osock);
-	}
-	raw_spin_unlock_bh(&stab->lock);
-	return 0;
-out_unlock:
-	smap_release_sock(psock, sock);
-	raw_spin_unlock_bh(&stab->lock);
-out:
-	kfree(e);
-	return err;
-}
-
-int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
-{
-	struct bpf_sock_progs *progs;
-	struct bpf_prog *orig;
-
-	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
-		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
-		progs = &stab->progs;
-	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH) {
-		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-
-		progs = &htab->progs;
-	} else {
-		return -EINVAL;
-	}
-
-	switch (type) {
-	case BPF_SK_MSG_VERDICT:
-		orig = xchg(&progs->bpf_tx_msg, prog);
-		break;
-	case BPF_SK_SKB_STREAM_PARSER:
-		orig = xchg(&progs->bpf_parse, prog);
-		break;
-	case BPF_SK_SKB_STREAM_VERDICT:
-		orig = xchg(&progs->bpf_verdict, prog);
-		break;
-	default:
-		return -EOPNOTSUPP;
-	}
-
-	if (orig)
-		bpf_prog_put(orig);
-
-	return 0;
-}
-
-int sockmap_get_from_fd(const union bpf_attr *attr, int type,
-			struct bpf_prog *prog)
-{
-	int ufd = attr->target_fd;
-	struct bpf_map *map;
-	struct fd f;
-	int err;
-
-	f = fdget(ufd);
-	map = __bpf_map_get(f);
-	if (IS_ERR(map))
-		return PTR_ERR(map);
-
-	err = sock_map_prog(map, prog, attr->attach_type);
-	fdput(f);
-	return err;
-}
-
-static void *sock_map_lookup(struct bpf_map *map, void *key)
-{
-	return ERR_PTR(-EOPNOTSUPP);
-}
-
-static int sock_map_update_elem(struct bpf_map *map,
-				void *key, void *value, u64 flags)
-{
-	struct bpf_sock_ops_kern skops;
-	u32 fd = *(u32 *)value;
-	struct socket *socket;
-	int err;
-
-	socket = sockfd_lookup(fd, &err);
-	if (!socket)
-		return err;
-
-	skops.sk = socket->sk;
-	if (!skops.sk) {
-		fput(socket->file);
-		return -EINVAL;
-	}
-
-	/* ULPs are currently supported only for TCP sockets in ESTABLISHED
-	 * state.
-	 */
-	if (skops.sk->sk_type != SOCK_STREAM ||
-	    skops.sk->sk_protocol != IPPROTO_TCP ||
-	    skops.sk->sk_state != TCP_ESTABLISHED) {
-		fput(socket->file);
-		return -EOPNOTSUPP;
-	}
-
-	lock_sock(skops.sk);
-	preempt_disable();
-	rcu_read_lock();
-	err = sock_map_ctx_update_elem(&skops, map, key, flags);
-	rcu_read_unlock();
-	preempt_enable();
-	release_sock(skops.sk);
-	fput(socket->file);
-	return err;
-}
-
-static void sock_map_release(struct bpf_map *map)
-{
-	struct bpf_sock_progs *progs;
-	struct bpf_prog *orig;
-
-	if (map->map_type == BPF_MAP_TYPE_SOCKMAP) {
-		struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
-
-		progs = &stab->progs;
-	} else {
-		struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-
-		progs = &htab->progs;
-	}
-
-	orig = xchg(&progs->bpf_parse, NULL);
-	if (orig)
-		bpf_prog_put(orig);
-	orig = xchg(&progs->bpf_verdict, NULL);
-	if (orig)
-		bpf_prog_put(orig);
-
-	orig = xchg(&progs->bpf_tx_msg, NULL);
-	if (orig)
-		bpf_prog_put(orig);
-}
-
-static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
-{
-	struct bpf_htab *htab;
-	int i, err;
-	u64 cost;
-
-	if (!capable(CAP_NET_ADMIN))
-		return ERR_PTR(-EPERM);
-
-	/* check sanity of attributes */
-	if (attr->max_entries == 0 ||
-	    attr->key_size == 0 ||
-	    attr->value_size != 4 ||
-	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
-		return ERR_PTR(-EINVAL);
-
-	if (attr->key_size > MAX_BPF_STACK)
-		/* eBPF programs initialize keys on stack, so they cannot be
-		 * larger than max stack size
-		 */
-		return ERR_PTR(-E2BIG);
-
-	htab = kzalloc(sizeof(*htab), GFP_USER);
-	if (!htab)
-		return ERR_PTR(-ENOMEM);
-
-	bpf_map_init_from_attr(&htab->map, attr);
-
-	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
-	htab->elem_size = sizeof(struct htab_elem) +
-			  round_up(htab->map.key_size, 8);
-	err = -EINVAL;
-	if (htab->n_buckets == 0 ||
-	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
-		goto free_htab;
-
-	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
-	       (u64) htab->elem_size * htab->map.max_entries;
-
-	if (cost >= U32_MAX - PAGE_SIZE)
-		goto free_htab;
-
-	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
-	err = bpf_map_precharge_memlock(htab->map.pages);
-	if (err)
-		goto free_htab;
-
-	err = -ENOMEM;
-	htab->buckets = bpf_map_area_alloc(
-				htab->n_buckets * sizeof(struct bucket),
-				htab->map.numa_node);
-	if (!htab->buckets)
-		goto free_htab;
-
-	for (i = 0; i < htab->n_buckets; i++) {
-		INIT_HLIST_HEAD(&htab->buckets[i].head);
-		raw_spin_lock_init(&htab->buckets[i].lock);
-	}
-
-	return &htab->map;
-free_htab:
-	kfree(htab);
-	return ERR_PTR(err);
-}
-
-static void __bpf_htab_free(struct rcu_head *rcu)
-{
-	struct bpf_htab *htab;
-
-	htab = container_of(rcu, struct bpf_htab, rcu);
-	bpf_map_area_free(htab->buckets);
-	kfree(htab);
-}
-
-static void sock_hash_free(struct bpf_map *map)
-{
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	int i;
-
-	synchronize_rcu();
-
-	/* At this point no update, lookup or delete operations can happen.
-	 * However, be aware we can still get a socket state event updates,
-	 * and data ready callabacks that reference the psock from sk_user_data
-	 * Also psock worker threads are still in-flight. So smap_release_sock
-	 * will only free the psock after cancel_sync on the worker threads
-	 * and a grace period expire to ensure psock is really safe to remove.
-	 */
-	rcu_read_lock();
-	for (i = 0; i < htab->n_buckets; i++) {
-		struct bucket *b = __select_bucket(htab, i);
-		struct hlist_head *head;
-		struct hlist_node *n;
-		struct htab_elem *l;
-
-		raw_spin_lock_bh(&b->lock);
-		head = &b->head;
-		hlist_for_each_entry_safe(l, n, head, hash_node) {
-			struct sock *sock = l->sk;
-			struct smap_psock *psock;
-
-			hlist_del_rcu(&l->hash_node);
-			psock = smap_psock_sk(sock);
-			/* This check handles a racing sock event that can get
-			 * the sk_callback_lock before this case but after xchg
-			 * causing the refcnt to hit zero and sock user data
-			 * (psock) to be null and queued for garbage collection.
-			 */
-			if (likely(psock)) {
-				smap_list_hash_remove(psock, l);
-				smap_release_sock(psock, sock);
-			}
-			free_htab_elem(htab, l);
-		}
-		raw_spin_unlock_bh(&b->lock);
-	}
-	rcu_read_unlock();
-	call_rcu(&htab->rcu, __bpf_htab_free);
-}
-
-static struct htab_elem *alloc_sock_hash_elem(struct bpf_htab *htab,
-					      void *key, u32 key_size, u32 hash,
-					      struct sock *sk,
-					      struct htab_elem *old_elem)
-{
-	struct htab_elem *l_new;
-
-	if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
-		if (!old_elem) {
-			atomic_dec(&htab->count);
-			return ERR_PTR(-E2BIG);
-		}
-	}
-	l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
-			     htab->map.numa_node);
-	if (!l_new) {
-		atomic_dec(&htab->count);
-		return ERR_PTR(-ENOMEM);
-	}
-
-	memcpy(l_new->key, key, key_size);
-	l_new->sk = sk;
-	l_new->hash = hash;
-	return l_new;
-}
-
-static inline u32 htab_map_hash(const void *key, u32 key_len)
-{
-	return jhash(key, key_len, 0);
-}
-
-static int sock_hash_get_next_key(struct bpf_map *map,
-				  void *key, void *next_key)
-{
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct htab_elem *l, *next_l;
-	struct hlist_head *h;
-	u32 hash, key_size;
-	int i = 0;
-
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	key_size = map->key_size;
-	if (!key)
-		goto find_first_elem;
-	hash = htab_map_hash(key, key_size);
-	h = select_bucket(htab, hash);
-
-	l = lookup_elem_raw(h, hash, key, key_size);
-	if (!l)
-		goto find_first_elem;
-	next_l = hlist_entry_safe(
-		     rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
-		     struct htab_elem, hash_node);
-	if (next_l) {
-		memcpy(next_key, next_l->key, key_size);
-		return 0;
-	}
-
-	/* no more elements in this hash list, go to the next bucket */
-	i = hash & (htab->n_buckets - 1);
-	i++;
-
-find_first_elem:
-	/* iterate over buckets */
-	for (; i < htab->n_buckets; i++) {
-		h = select_bucket(htab, i);
-
-		/* pick first element in the bucket */
-		next_l = hlist_entry_safe(
-				rcu_dereference_raw(hlist_first_rcu(h)),
-				struct htab_elem, hash_node);
-		if (next_l) {
-			/* if it's not empty, just return it */
-			memcpy(next_key, next_l->key, key_size);
-			return 0;
-		}
-	}
-
-	/* iterated over all buckets and all elements */
-	return -ENOENT;
-}
-
-static int sock_hash_ctx_update_elem(struct bpf_sock_ops_kern *skops,
-				     struct bpf_map *map,
-				     void *key, u64 map_flags)
-{
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct bpf_sock_progs *progs = &htab->progs;
-	struct htab_elem *l_new = NULL, *l_old;
-	struct smap_psock_map_entry *e = NULL;
-	struct hlist_head *head;
-	struct smap_psock *psock;
-	u32 key_size, hash;
-	struct sock *sock;
-	struct bucket *b;
-	int err;
-
-	sock = skops->sk;
-
-	if (sock->sk_type != SOCK_STREAM ||
-	    sock->sk_protocol != IPPROTO_TCP)
-		return -EOPNOTSUPP;
-
-	if (unlikely(map_flags > BPF_EXIST))
-		return -EINVAL;
-
-	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
-	if (!e)
-		return -ENOMEM;
-
-	WARN_ON_ONCE(!rcu_read_lock_held());
-	key_size = map->key_size;
-	hash = htab_map_hash(key, key_size);
-	b = __select_bucket(htab, hash);
-	head = &b->head;
-
-	err = __sock_map_ctx_update_elem(map, progs, sock, key);
-	if (err)
-		goto err;
-
-	/* psock is valid here because otherwise above *ctx_update_elem would
-	 * have thrown an error. It is safe to skip error check.
-	 */
-	psock = smap_psock_sk(sock);
-	raw_spin_lock_bh(&b->lock);
-	l_old = lookup_elem_raw(head, hash, key, key_size);
-	if (l_old && map_flags == BPF_NOEXIST) {
-		err = -EEXIST;
-		goto bucket_err;
-	}
-	if (!l_old && map_flags == BPF_EXIST) {
-		err = -ENOENT;
-		goto bucket_err;
-	}
-
-	l_new = alloc_sock_hash_elem(htab, key, key_size, hash, sock, l_old);
-	if (IS_ERR(l_new)) {
-		err = PTR_ERR(l_new);
-		goto bucket_err;
-	}
-
-	rcu_assign_pointer(e->hash_link, l_new);
-	e->map = map;
-	spin_lock_bh(&psock->maps_lock);
-	list_add_tail(&e->list, &psock->maps);
-	spin_unlock_bh(&psock->maps_lock);
-
-	/* add new element to the head of the list, so that
-	 * concurrent search will find it before old elem
-	 */
-	hlist_add_head_rcu(&l_new->hash_node, head);
-	if (l_old) {
-		psock = smap_psock_sk(l_old->sk);
-
-		hlist_del_rcu(&l_old->hash_node);
-		smap_list_hash_remove(psock, l_old);
-		smap_release_sock(psock, l_old->sk);
-		free_htab_elem(htab, l_old);
-	}
-	raw_spin_unlock_bh(&b->lock);
-	return 0;
-bucket_err:
-	smap_release_sock(psock, sock);
-	raw_spin_unlock_bh(&b->lock);
-err:
-	kfree(e);
-	return err;
-}
-
-static int sock_hash_update_elem(struct bpf_map *map,
-				void *key, void *value, u64 flags)
-{
-	struct bpf_sock_ops_kern skops;
-	u32 fd = *(u32 *)value;
-	struct socket *socket;
-	int err;
-
-	socket = sockfd_lookup(fd, &err);
-	if (!socket)
-		return err;
-
-	skops.sk = socket->sk;
-	if (!skops.sk) {
-		fput(socket->file);
-		return -EINVAL;
-	}
-
-	/* ULPs are currently supported only for TCP sockets in ESTABLISHED
-	 * state.
-	 */
-	if (skops.sk->sk_type != SOCK_STREAM ||
-	    skops.sk->sk_protocol != IPPROTO_TCP ||
-	    skops.sk->sk_state != TCP_ESTABLISHED) {
-		fput(socket->file);
-		return -EOPNOTSUPP;
-	}
-
-	lock_sock(skops.sk);
-	preempt_disable();
-	rcu_read_lock();
-	err = sock_hash_ctx_update_elem(&skops, map, key, flags);
-	rcu_read_unlock();
-	preempt_enable();
-	release_sock(skops.sk);
-	fput(socket->file);
-	return err;
-}
-
-static int sock_hash_delete_elem(struct bpf_map *map, void *key)
-{
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
-	struct bucket *b;
-	struct htab_elem *l;
-	u32 hash, key_size;
-	int ret = -ENOENT;
-
-	key_size = map->key_size;
-	hash = htab_map_hash(key, key_size);
-	b = __select_bucket(htab, hash);
-	head = &b->head;
-
-	raw_spin_lock_bh(&b->lock);
-	l = lookup_elem_raw(head, hash, key, key_size);
-	if (l) {
-		struct sock *sock = l->sk;
-		struct smap_psock *psock;
-
-		hlist_del_rcu(&l->hash_node);
-		psock = smap_psock_sk(sock);
-		/* This check handles a racing sock event that can get the
-		 * sk_callback_lock before this case but after xchg happens
-		 * causing the refcnt to hit zero and sock user data (psock)
-		 * to be null and queued for garbage collection.
-		 */
-		if (likely(psock)) {
-			smap_list_hash_remove(psock, l);
-			smap_release_sock(psock, sock);
-		}
-		free_htab_elem(htab, l);
-		ret = 0;
-	}
-	raw_spin_unlock_bh(&b->lock);
-	return ret;
-}
-
-struct sock  *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
-{
-	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
-	struct hlist_head *head;
-	struct htab_elem *l;
-	u32 key_size, hash;
-	struct bucket *b;
-	struct sock *sk;
-
-	key_size = map->key_size;
-	hash = htab_map_hash(key, key_size);
-	b = __select_bucket(htab, hash);
-	head = &b->head;
-
-	l = lookup_elem_raw(head, hash, key, key_size);
-	sk = l ? l->sk : NULL;
-	return sk;
-}
-
-const struct bpf_map_ops sock_map_ops = {
-	.map_alloc = sock_map_alloc,
-	.map_free = sock_map_free,
-	.map_lookup_elem = sock_map_lookup,
-	.map_get_next_key = sock_map_get_next_key,
-	.map_update_elem = sock_map_update_elem,
-	.map_delete_elem = sock_map_delete_elem,
-	.map_release_uref = sock_map_release,
-	.map_check_btf = map_check_no_btf,
-};
-
-const struct bpf_map_ops sock_hash_ops = {
-	.map_alloc = sock_hash_alloc,
-	.map_free = sock_hash_free,
-	.map_lookup_elem = sock_map_lookup,
-	.map_get_next_key = sock_hash_get_next_key,
-	.map_update_elem = sock_hash_update_elem,
-	.map_delete_elem = sock_hash_delete_elem,
-	.map_release_uref = sock_map_release,
-	.map_check_btf = map_check_no_btf,
-};
-
-static bool bpf_is_valid_sock_op(struct bpf_sock_ops_kern *ops)
-{
-	return ops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB ||
-	       ops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB;
-}
-BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
-	   struct bpf_map *, map, void *, key, u64, flags)
-{
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	/* ULPs are currently supported only for TCP sockets in ESTABLISHED
-	 * state. This checks that the sock ops triggering the update is
-	 * one indicating we are (or will be soon) in an ESTABLISHED state.
-	 */
-	if (!bpf_is_valid_sock_op(bpf_sock))
-		return -EOPNOTSUPP;
-	return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
-}
-
-const struct bpf_func_proto bpf_sock_map_update_proto = {
-	.func		= bpf_sock_map_update,
-	.gpl_only	= false,
-	.pkt_access	= true,
-	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_CONST_MAP_PTR,
-	.arg3_type	= ARG_PTR_TO_MAP_KEY,
-	.arg4_type	= ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, bpf_sock,
-	   struct bpf_map *, map, void *, key, u64, flags)
-{
-	WARN_ON_ONCE(!rcu_read_lock_held());
-
-	if (!bpf_is_valid_sock_op(bpf_sock))
-		return -EOPNOTSUPP;
-	return sock_hash_ctx_update_elem(bpf_sock, map, key, flags);
-}
-
-const struct bpf_func_proto bpf_sock_hash_update_proto = {
-	.func		= bpf_sock_hash_update,
-	.gpl_only	= false,
-	.pkt_access	= true,
-	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type	= ARG_CONST_MAP_PTR,
-	.arg3_type	= ARG_PTR_TO_MAP_KEY,
-	.arg4_type	= ARG_ANYTHING,
-};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4f41623..4d41371 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1651,7 +1651,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	switch (ptype) {
 	case BPF_PROG_TYPE_SK_SKB:
 	case BPF_PROG_TYPE_SK_MSG:
-		ret = sockmap_get_from_fd(attr, ptype, prog);
+		ret = sock_map_get_from_fd(attr, prog);
 		break;
 	case BPF_PROG_TYPE_LIRC_MODE2:
 		ret = lirc_prog_attach(attr, prog);
@@ -1705,10 +1705,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
 		break;
 	case BPF_SK_MSG_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_MSG, NULL);
+		return sock_map_get_from_fd(attr, NULL);
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL);
+		return sock_map_get_from_fd(attr, NULL);
 	case BPF_LIRC_MODE2:
 		return lirc_prog_detach(attr);
 	case BPF_FLOW_DISSECTOR:
diff --git a/net/Kconfig b/net/Kconfig
index 228dfa3..f235edb 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -300,8 +300,11 @@ config BPF_JIT
 
 config BPF_STREAM_PARSER
 	bool "enable BPF STREAM_PARSER"
+	depends on INET
 	depends on BPF_SYSCALL
+	depends on CGROUP_BPF
 	select STREAM_PARSER
+	select NET_SOCK_MSG
 	---help---
 	 Enabling this allows a stream parser to be used with
 	 BPF_MAP_TYPE_SOCKMAP.
@@ -413,6 +416,14 @@ config GRO_CELLS
 config SOCK_VALIDATE_XMIT
 	bool
 
+config NET_SOCK_MSG
+	bool
+	default n
+	help
+	  The NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or
+	  ULPs (upper layer protocols, e.g. TLS) to process L7 application data
+	  with the help of BPF programs.
+
 config NET_DEVLINK
 	tristate "Network physical/parent device Netlink interface"
 	help
diff --git a/net/core/Makefile b/net/core/Makefile
index 80175e6..fccd31e0 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -16,6 +16,7 @@ obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
 obj-y += net-sysfs.o
 obj-$(CONFIG_PAGE_POOL) += page_pool.o
 obj-$(CONFIG_PROC_FS) += net-procfs.o
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_NETPOLL) += netpoll.o
 obj-$(CONFIG_FIB_RULES) += fib_rules.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
 obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
 obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
 obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
+obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o
 obj-$(CONFIG_DST_CACHE) += dst_cache.o
 obj-$(CONFIG_HWBM) += hwbm.o
 obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/filter.c b/net/core/filter.c
index 4bbc656..5ffb5cf 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -38,6 +38,7 @@
 #include <net/protocol.h>
 #include <net/netlink.h>
 #include <linux/skbuff.h>
+#include <linux/skmsg.h>
 #include <net/sock.h>
 #include <net/flow_dissector.h>
 #include <linux/errno.h>
@@ -2142,123 +2143,7 @@ static const struct bpf_func_proto bpf_redirect_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
-	   struct bpf_map *, map, void *, key, u64, flags)
-{
-	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
-	/* If user passes invalid input drop the packet. */
-	if (unlikely(flags & ~(BPF_F_INGRESS)))
-		return SK_DROP;
-
-	tcb->bpf.flags = flags;
-	tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
-	if (!tcb->bpf.sk_redir)
-		return SK_DROP;
-
-	return SK_PASS;
-}
-
-static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
-	.func           = bpf_sk_redirect_hash,
-	.gpl_only       = false,
-	.ret_type       = RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type      = ARG_CONST_MAP_PTR,
-	.arg3_type      = ARG_PTR_TO_MAP_KEY,
-	.arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
-	   struct bpf_map *, map, u32, key, u64, flags)
-{
-	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
-	/* If user passes invalid input drop the packet. */
-	if (unlikely(flags & ~(BPF_F_INGRESS)))
-		return SK_DROP;
-
-	tcb->bpf.flags = flags;
-	tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
-	if (!tcb->bpf.sk_redir)
-		return SK_DROP;
-
-	return SK_PASS;
-}
-
-struct sock *do_sk_redirect_map(struct sk_buff *skb)
-{
-	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-
-	return tcb->bpf.sk_redir;
-}
-
-static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
-	.func           = bpf_sk_redirect_map,
-	.gpl_only       = false,
-	.ret_type       = RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type      = ARG_CONST_MAP_PTR,
-	.arg3_type      = ARG_ANYTHING,
-	.arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
-	   struct bpf_map *, map, void *, key, u64, flags)
-{
-	/* If user passes invalid input drop the packet. */
-	if (unlikely(flags & ~(BPF_F_INGRESS)))
-		return SK_DROP;
-
-	msg->flags = flags;
-	msg->sk_redir = __sock_hash_lookup_elem(map, key);
-	if (!msg->sk_redir)
-		return SK_DROP;
-
-	return SK_PASS;
-}
-
-static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
-	.func           = bpf_msg_redirect_hash,
-	.gpl_only       = false,
-	.ret_type       = RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type      = ARG_CONST_MAP_PTR,
-	.arg3_type      = ARG_PTR_TO_MAP_KEY,
-	.arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
-	   struct bpf_map *, map, u32, key, u64, flags)
-{
-	/* If user passes invalid input drop the packet. */
-	if (unlikely(flags & ~(BPF_F_INGRESS)))
-		return SK_DROP;
-
-	msg->flags = flags;
-	msg->sk_redir = __sock_map_lookup_elem(map, key);
-	if (!msg->sk_redir)
-		return SK_DROP;
-
-	return SK_PASS;
-}
-
-struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
-{
-	return msg->sk_redir;
-}
-
-static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
-	.func           = bpf_msg_redirect_map,
-	.gpl_only       = false,
-	.ret_type       = RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_CTX,
-	.arg2_type      = ARG_CONST_MAP_PTR,
-	.arg3_type      = ARG_ANYTHING,
-	.arg4_type      = ARG_ANYTHING,
-};
-
-BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
 {
 	msg->apply_bytes = bytes;
 	return 0;
@@ -2272,7 +2157,7 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
-BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
 {
 	msg->cork_bytes = bytes;
 	return 0;
@@ -2286,45 +2171,37 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
-#define sk_msg_iter_var(var)			\
-	do {					\
-		var++;				\
-		if (var == MAX_SKB_FRAGS)	\
-			var = 0;		\
-	} while (0)
-
-BPF_CALL_4(bpf_msg_pull_data,
-	   struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
+	   u32, end, u64, flags)
 {
-	unsigned int len = 0, offset = 0, copy = 0, poffset = 0;
-	int bytes = end - start, bytes_sg_total;
-	struct scatterlist *sg = msg->sg_data;
-	int first_sg, last_sg, i, shift;
-	unsigned char *p, *to, *from;
+	u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
+	u32 first_sge, last_sge, i, shift, bytes_sg_total;
+	struct scatterlist *sge;
+	u8 *raw, *to, *from;
 	struct page *page;
 
 	if (unlikely(flags || end <= start))
 		return -EINVAL;
 
 	/* First find the starting scatterlist element */
-	i = msg->sg_start;
+	i = msg->sg.start;
 	do {
-		len = sg[i].length;
+		len = sk_msg_elem(msg, i)->length;
 		if (start < offset + len)
 			break;
 		offset += len;
-		sk_msg_iter_var(i);
-	} while (i != msg->sg_end);
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
 
 	if (unlikely(start >= offset + len))
 		return -EINVAL;
 
-	first_sg = i;
+	first_sge = i;
 	/* The start may point into the sg element so we need to also
 	 * account for the headroom.
 	 */
 	bytes_sg_total = start - offset + bytes;
-	if (!msg->sg_copy[i] && bytes_sg_total <= len)
+	if (!msg->sg.copy[i] && bytes_sg_total <= len)
 		goto out;
 
 	/* At this point we need to linearize multiple scatterlist
@@ -2338,12 +2215,12 @@ BPF_CALL_4(bpf_msg_pull_data,
 	 * will copy the entire sg entry.
 	 */
 	do {
-		copy += sg[i].length;
-		sk_msg_iter_var(i);
+		copy += sk_msg_elem(msg, i)->length;
+		sk_msg_iter_var_next(i);
 		if (bytes_sg_total <= copy)
 			break;
-	} while (i != msg->sg_end);
-	last_sg = i;
+	} while (i != msg->sg.end);
+	last_sge = i;
 
 	if (unlikely(bytes_sg_total > copy))
 		return -EINVAL;
@@ -2352,63 +2229,61 @@ BPF_CALL_4(bpf_msg_pull_data,
 			   get_order(copy));
 	if (unlikely(!page))
 		return -ENOMEM;
-	p = page_address(page);
 
-	i = first_sg;
+	raw = page_address(page);
+	i = first_sge;
 	do {
-		from = sg_virt(&sg[i]);
-		len = sg[i].length;
-		to = p + poffset;
+		sge = sk_msg_elem(msg, i);
+		from = sg_virt(sge);
+		len = sge->length;
+		to = raw + poffset;
 
 		memcpy(to, from, len);
 		poffset += len;
-		sg[i].length = 0;
-		put_page(sg_page(&sg[i]));
+		sge->length = 0;
+		put_page(sg_page(sge));
 
-		sk_msg_iter_var(i);
-	} while (i != last_sg);
+		sk_msg_iter_var_next(i);
+	} while (i != last_sge);
 
-	sg[first_sg].length = copy;
-	sg_set_page(&sg[first_sg], page, copy, 0);
+	sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
 
 	/* To repair sg ring we need to shift entries. If we only
 	 * had a single entry though we can just replace it and
 	 * be done. Otherwise walk the ring and shift the entries.
 	 */
-	WARN_ON_ONCE(last_sg == first_sg);
-	shift = last_sg > first_sg ?
-		last_sg - first_sg - 1 :
-		MAX_SKB_FRAGS - first_sg + last_sg - 1;
+	WARN_ON_ONCE(last_sge == first_sge);
+	shift = last_sge > first_sge ?
+		last_sge - first_sge - 1 :
+		MAX_SKB_FRAGS - first_sge + last_sge - 1;
 	if (!shift)
 		goto out;
 
-	i = first_sg;
-	sk_msg_iter_var(i);
+	i = first_sge;
+	sk_msg_iter_var_next(i);
 	do {
-		int move_from;
+		u32 move_from;
 
-		if (i + shift >= MAX_SKB_FRAGS)
-			move_from = i + shift - MAX_SKB_FRAGS;
+		if (i + shift >= MAX_MSG_FRAGS)
+			move_from = i + shift - MAX_MSG_FRAGS;
 		else
 			move_from = i + shift;
-
-		if (move_from == msg->sg_end)
+		if (move_from == msg->sg.end)
 			break;
 
-		sg[i] = sg[move_from];
-		sg[move_from].length = 0;
-		sg[move_from].page_link = 0;
-		sg[move_from].offset = 0;
-
-		sk_msg_iter_var(i);
+		msg->sg.data[i] = msg->sg.data[move_from];
+		msg->sg.data[move_from].length = 0;
+		msg->sg.data[move_from].page_link = 0;
+		msg->sg.data[move_from].offset = 0;
+		sk_msg_iter_var_next(i);
 	} while (1);
-	msg->sg_end -= shift;
-	if (msg->sg_end < 0)
-		msg->sg_end += MAX_SKB_FRAGS;
+
+	msg->sg.end = msg->sg.end - shift > msg->sg.end ?
+		      msg->sg.end - shift + MAX_MSG_FRAGS :
+		      msg->sg.end - shift;
 out:
-	msg->data = sg_virt(&sg[first_sg]) + start - offset;
+	msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
 	msg->data_end = msg->data + bytes;
-
 	return 0;
 }
 
@@ -5200,6 +5075,9 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+
 static const struct bpf_func_proto *
 sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -5223,6 +5101,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
+
 static const struct bpf_func_proto *
 sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -5244,6 +5125,9 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	}
 }
 
+const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
+
 static const struct bpf_func_proto *
 sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -6998,22 +6882,22 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 
 	switch (si->off) {
 	case offsetof(struct sk_msg_md, data):
-		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, data));
+				      offsetof(struct sk_msg, data));
 		break;
 	case offsetof(struct sk_msg_md, data_end):
-		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, data_end));
+				      offsetof(struct sk_msg, data_end));
 		break;
 	case offsetof(struct sk_msg_md, family):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-					      struct sk_msg_buff, sk),
+					      struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_family));
 		break;
@@ -7022,9 +6906,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-						struct sk_msg_buff, sk),
+						struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_daddr));
 		break;
@@ -7034,9 +6918,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 					  skc_rcv_saddr) != 4);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-					      struct sk_msg_buff, sk),
+					      struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common,
 					       skc_rcv_saddr));
@@ -7051,9 +6935,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 		off = si->off;
 		off -= offsetof(struct sk_msg_md, remote_ip6[0]);
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-						struct sk_msg_buff, sk),
+						struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common,
 					       skc_v6_daddr.s6_addr32[0]) +
@@ -7072,9 +6956,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 		off = si->off;
 		off -= offsetof(struct sk_msg_md, local_ip6[0]);
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-						struct sk_msg_buff, sk),
+						struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common,
 					       skc_v6_rcv_saddr.s6_addr32[0]) +
@@ -7088,9 +6972,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-						struct sk_msg_buff, sk),
+						struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_dport));
 #ifndef __BIG_ENDIAN_BITFIELD
@@ -7102,9 +6986,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
 		BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-						struct sk_msg_buff, sk),
+						struct sk_msg, sk),
 				      si->dst_reg, si->src_reg,
-				      offsetof(struct sk_msg_buff, sk));
+				      offsetof(struct sk_msg, sk));
 		*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
 				      offsetof(struct sock_common, skc_num));
 		break;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
new file mode 100644
index 0000000..ae2b281
--- /dev/null
+++ b/net/core/skmsg.c
@@ -0,0 +1,763 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/skbuff.h>
+#include <linux/scatterlist.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+
+/* Decide whether sk_msg_alloc() may extend the last element in place,
+ * based on where @elem_first_coalesce lies relative to msg->sg.start and
+ * msg->sg.end. Returns false when start and end are equal.
+ */
+static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
+{
+	if (msg->sg.end > msg->sg.start) {
+		/* end is numerically ahead of start */
+		return elem_first_coalesce < msg->sg.end;
+	}
+
+	if (msg->sg.end < msg->sg.start) {
+		/* end is numerically behind start */
+		return elem_first_coalesce > msg->sg.start ||
+		       elem_first_coalesce < msg->sg.end;
+	}
+
+	return false;
+}
+
+/* Grow @msg until msg->sg.size reaches @len, taking memory from the page
+ * frag returned by sk_page_frag(@sk).
+ *
+ * Each iteration either extends the element before msg->sg.end in place
+ * (when sk_msg_try_coalesce_ok() allows it and the frag continues directly
+ * after that element) or fills a new element at msg->sg.end and takes a
+ * page reference for it. Every byte added is charged to @sk.
+ *
+ * Returns 0 when the loop completes, -ENOMEM when the frag cannot be
+ * refilled or sk_wmem_schedule() fails, and -ENOSPC when a new element is
+ * needed but sk_msg_full() reports no room. Data added before a failure is
+ * left in @msg.
+ */
+int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
+		 int elem_first_coalesce)
+{
+	struct page_frag *pfrag = sk_page_frag(sk);
+	int ret = 0;
+
+	/* Only the shortfall beyond what @msg already holds is allocated. */
+	len -= msg->sg.size;
+	while (len > 0) {
+		struct scatterlist *sge;
+		u32 orig_offset;
+		int use, i;
+
+		if (!sk_page_frag_refill(sk, pfrag))
+			return -ENOMEM;
+
+		orig_offset = pfrag->offset;
+		/* Take no more than what is left in the current frag. */
+		use = min_t(int, len, pfrag->size - orig_offset);
+		if (!sk_wmem_schedule(sk, use))
+			return -ENOMEM;
+
+		/* Candidate for coalescing: the element just before end. */
+		i = msg->sg.end;
+		sk_msg_iter_var_prev(i);
+		sge = &msg->sg.data[i];
+
+		if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
+		    sg_page(sge) == pfrag->page &&
+		    sge->offset + sge->length == orig_offset) {
+			sge->length += use;
+		} else {
+			if (sk_msg_full(msg)) {
+				ret = -ENOSPC;
+				break;
+			}
+
+			sge = &msg->sg.data[msg->sg.end];
+			sg_unmark_end(sge);
+			sg_set_page(sge, pfrag->page, use, orig_offset);
+			get_page(pfrag->page);
+			sk_msg_iter_next(msg, end);
+		}
+
+		sk_mem_charge(sk, use);
+		msg->sg.size += use;
+		pfrag->offset += use;
+		len -= use;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_alloc);
+
+/* Uncharge up to @bytes from @sk, walking @msg from msg->sg.start.
+ *
+ * An element only partly covered has its length reduced and its offset
+ * advanced by the remaining byte count; fully covered elements get length
+ * and offset set to zero. msg->sg.start is moved to the element where the
+ * walk stopped. Page references and msg->sg.size are not touched here.
+ */
+void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+	int i = msg->sg.start;
+
+	do {
+		struct scatterlist *sge = sk_msg_elem(msg, i);
+
+		if (bytes < sge->length) {
+			/* Partial element: shrink it from the front and stop. */
+			sge->length -= bytes;
+			sge->offset += bytes;
+			sk_mem_uncharge(sk, bytes);
+			break;
+		}
+
+		sk_mem_uncharge(sk, sge->length);
+		bytes -= sge->length;
+		sge->length = 0;
+		sge->offset = 0;
+		sk_msg_iter_var_next(i);
+	} while (bytes && i != msg->sg.end);
+	msg->sg.start = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_return_zero);
+
+/* Uncharge up to @bytes from @sk, visiting every element of @msg from
+ * msg->sg.start until msg->sg.end. For each element the smaller of the
+ * remaining byte count and the element length is uncharged. @msg itself is
+ * not modified.
+ */
+void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+	int i = msg->sg.start;
+
+	do {
+		struct scatterlist *sge = &msg->sg.data[i];
+		int uncharge = (bytes < sge->length) ? bytes : sge->length;
+
+		sk_mem_uncharge(sk, uncharge);
+		bytes -= uncharge;
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
+}
+EXPORT_SYMBOL_GPL(sk_msg_return);
+
+/* Release element @i of @msg and return the length it had.
+ *
+ * @charge: when true, the element's length is uncharged from @sk.
+ *
+ * The page reference is dropped only when @msg is not backed by an skb
+ * (msg->skb is NULL). The element is zeroed afterwards.
+ */
+static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
+			    bool charge)
+{
+	struct scatterlist *sge = sk_msg_elem(msg, i);
+	u32 len = sge->length;
+
+	if (charge)
+		sk_mem_uncharge(sk, len);
+	if (!msg->skb)
+		put_page(sg_page(sge));
+	memset(sge, 0, sizeof(*sge));
+	return len;
+}
+
+/* Free elements of @msg starting at index @i until msg->sg.size drops to
+ * zero, then consume msg->skb if one is attached and reinitialise @msg with
+ * sk_msg_init().
+ *
+ * @charge is passed through to sk_msg_free_elem() for every element.
+ * Returns the sum of the freed element lengths.
+ */
+static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
+			 bool charge)
+{
+	struct scatterlist *sge = sk_msg_elem(msg, i);
+	int freed = 0;
+
+	while (msg->sg.size) {
+		/* Account the element before sk_msg_free_elem() zeroes it. */
+		msg->sg.size -= sge->length;
+		freed += sk_msg_free_elem(sk, msg, i, charge);
+		sk_msg_iter_var_next(i);
+		sk_msg_check_to_free(msg, i, msg->sg.size);
+		sge = sk_msg_elem(msg, i);
+	}
+	if (msg->skb)
+		consume_skb(msg->skb);
+	sk_msg_init(msg);
+	return freed;
+}
+
+/* Free all of @msg starting at msg->sg.start without uncharging @sk.
+ * Returns the number of bytes freed.
+ */
+int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
+{
+	const bool charge = false;
+	u32 first = msg->sg.start;
+
+	return __sk_msg_free(sk, msg, first, charge);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
+
+/* Free all of @msg starting at msg->sg.start, uncharging the freed bytes
+ * from @sk. Returns the number of bytes freed.
+ */
+int sk_msg_free(struct sock *sk, struct sk_msg *msg)
+{
+	const bool charge = true;
+	u32 first = msg->sg.start;
+
+	return __sk_msg_free(sk, msg, first, charge);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free);
+
+/* Release up to @bytes from the front of @msg, i.e. from msg->sg.start.
+ *
+ * Whole elements are released through sk_msg_free_elem(); an element only
+ * partly covered is shortened in place. The walk also stops at an element
+ * of zero length. msg->sg.size is reduced by what was released and
+ * msg->sg.start ends up at the element where the walk stopped.
+ *
+ * @charge: when true, the released bytes are uncharged from @sk.
+ */
+static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
+				  u32 bytes, bool charge)
+{
+	struct scatterlist *sge;
+	u32 i = msg->sg.start;
+
+	while (bytes) {
+		sge = sk_msg_elem(msg, i);
+		if (!sge->length)
+			break;
+		if (bytes < sge->length) {
+			/* Partial element: trim its front and stop. */
+			if (charge)
+				sk_mem_uncharge(sk, bytes);
+			sge->length -= bytes;
+			sge->offset += bytes;
+			msg->sg.size -= bytes;
+			break;
+		}
+
+		msg->sg.size -= sge->length;
+		bytes -= sge->length;
+		sk_msg_free_elem(sk, msg, i, charge);
+		sk_msg_iter_var_next(i);
+		sk_msg_check_to_free(msg, i, bytes);
+	}
+	msg->sg.start = i;
+}
+
+/* Release up to @bytes from the front of @msg and uncharge them from @sk. */
+void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
+{
+	const bool charge = true;
+
+	__sk_msg_free_partial(sk, msg, bytes, charge);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_partial);
+
+/* Release up to @bytes from the front of @msg without uncharging @sk. */
+void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
+				  u32 bytes)
+{
+	const bool charge = false;
+
+	__sk_msg_free_partial(sk, msg, bytes, charge);
+}
+
+/* Shrink @msg to @len bytes by removing data from its tail.
+ *
+ * Returns without changes when @msg already holds @len bytes or fewer; the
+ * "fewer" case additionally triggers a WARN_ON. Whole tail elements are
+ * released through sk_msg_free_elem() with uncharging; a final partial
+ * element is shortened and the difference uncharged from @sk.
+ * msg->sg.end is moved to just past the last remaining element.
+ */
+void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
+{
+	int trim = msg->sg.size - len;
+	u32 i = msg->sg.end;
+
+	if (trim <= 0) {
+		WARN_ON(trim < 0);
+		return;
+	}
+
+	/* Start at the element just before end and walk backwards. */
+	sk_msg_iter_var_prev(i);
+	msg->sg.size = len;
+	while (msg->sg.data[i].length &&
+	       trim >= msg->sg.data[i].length) {
+		trim -= msg->sg.data[i].length;
+		sk_msg_free_elem(sk, msg, i, true);
+		sk_msg_iter_var_prev(i);
+		if (!trim)
+			goto out;
+	}
+
+	msg->sg.data[i].length -= trim;
+	sk_mem_uncharge(sk, trim);
+out:
+	/* If we trim data before the curr pointer, update copybreak and curr
+	 * so that any future copy operations start at the new copy location.
+	 * However, trimmed data that has not yet been used in a copy op
+	 * does not require an update.
+	 */
+	if (msg->sg.curr >= i) {
+		msg->sg.curr = i;
+		msg->sg.copybreak = msg->sg.data[i].length;
+	}
+	sk_msg_iter_var_next(i);
+	msg->sg.end = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_trim);
+
+/* Attach up to @bytes of the pages behind @from to @msg.
+ *
+ * Pages are obtained with iov_iter_get_pages() and appended at msg->sg.end,
+ * one element per page, with each element's length charged to @sk. After
+ * every batch msg->sg.copybreak is cleared and msg->sg.curr set to
+ * msg->sg.end.
+ *
+ * Returns 0 on success, or -EFAULT when no free element is left or
+ * iov_iter_get_pages() returns no data. On error @from is reverted by the
+ * number of bytes this call added to @msg; the elements already added stay
+ * in @msg and the caller has to trim them.
+ */
+int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+			      struct sk_msg *msg, u32 bytes)
+{
+	int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
+	const int to_max_pages = MAX_MSG_FRAGS;
+	struct page *pages[MAX_MSG_FRAGS];
+	ssize_t orig, copied, use, offset;
+
+	/* Remember the starting size so an error can revert @from. */
+	orig = msg->sg.size;
+	while (bytes > 0) {
+		i = 0;
+		maxpages = to_max_pages - num_elems;
+		if (maxpages == 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		copied = iov_iter_get_pages(from, pages, bytes, maxpages,
+					    &offset);
+		if (copied <= 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		iov_iter_advance(from, copied);
+		bytes -= copied;
+		msg->sg.size += copied;
+
+		while (copied) {
+			/* Only the first page of a batch starts at @offset. */
+			use = min_t(int, copied, PAGE_SIZE - offset);
+			sg_set_page(&msg->sg.data[msg->sg.end],
+				    pages[i], use, offset);
+			sg_unmark_end(&msg->sg.data[msg->sg.end]);
+			sk_mem_charge(sk, use);
+
+			offset = 0;
+			copied -= use;
+			sk_msg_iter_next(msg, end);
+			num_elems++;
+			i++;
+		}
+		/* When zerocopy is mixed with sk_msg_*copy* operations we
+		 * may have a copybreak set in this case clear and prefer
+		 * zerocopy remainder when possible.
+		 */
+		msg->sg.copybreak = 0;
+		msg->sg.curr = msg->sg.end;
+	}
+out:
+	/* Revert iov_iter updates, msg will need to use 'trim' later if it
+	 * also needs to be cleared.
+	 */
+	if (ret)
+		iov_iter_revert(from, msg->sg.size - orig);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
+
+/* Copy up to @bytes from @from into the existing elements of @msg, starting
+ * at element msg->sg.curr and byte offset msg->sg.copybreak within it.
+ *
+ * The non-caching copy variant is used when @sk advertises
+ * NETIF_F_NOCACHE_COPY in sk_route_caps.
+ *
+ * Returns the size of the last chunk copied on success, -EFAULT on a short
+ * copy, or -ENOSPC when msg->sg.end is reached before anything was copied.
+ * msg->sg.curr is updated to the element where copying stopped.
+ */
+int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+			     struct sk_msg *msg, u32 bytes)
+{
+	int ret = -ENOSPC, i = msg->sg.curr;
+	struct scatterlist *sge;
+	u32 copy, buf_size;
+	void *to;
+
+	do {
+		sge = sk_msg_elem(msg, i);
+		/* This is possible if a trim operation shrunk the buffer */
+		if (msg->sg.copybreak >= sge->length) {
+			msg->sg.copybreak = 0;
+			sk_msg_iter_var_next(i);
+			if (i == msg->sg.end)
+				break;
+			sge = sk_msg_elem(msg, i);
+		}
+
+		/* Copy no more than the room left in this element. */
+		buf_size = sge->length - msg->sg.copybreak;
+		copy = (buf_size > bytes) ? bytes : buf_size;
+		to = sg_virt(sge) + msg->sg.copybreak;
+		msg->sg.copybreak += copy;
+		if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
+			ret = copy_from_iter_nocache(to, copy, from);
+		else
+			ret = copy_from_iter(to, copy, from);
+		if (ret != copy) {
+			ret = -EFAULT;
+			goto out;
+		}
+		bytes -= copy;
+		if (!bytes)
+			break;
+		msg->sg.copybreak = 0;
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
+out:
+	msg->sg.curr = i;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
+
+/* Wrap @skb in a newly allocated sk_msg, hand it to sk_psock_queue_msg()
+ * and invoke the socket's sk_data_ready callback.
+ *
+ * The skb is mapped into the message with skb_to_sgvec() and stored in
+ * msg->skb; skb->len bytes are charged to the socket.
+ *
+ * Returns skb->len on success, -EAGAIN when the sk_msg cannot be allocated
+ * or sk_rmem_schedule() fails, or the negative value returned by
+ * skb_to_sgvec().
+ */
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
+{
+	struct sock *sk = psock->sk;
+	int copied = 0, num_sge;
+	struct sk_msg *msg;
+
+	msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC);
+	if (unlikely(!msg))
+		return -EAGAIN;
+	if (!sk_rmem_schedule(sk, skb, skb->len)) {
+		kfree(msg);
+		return -EAGAIN;
+	}
+
+	sk_msg_init(msg);
+	num_sge = skb_to_sgvec(skb, msg->sg.data, 0, skb->len);
+	if (unlikely(num_sge < 0)) {
+		kfree(msg);
+		return num_sge;
+	}
+
+	sk_mem_charge(sk, skb->len);
+	copied = skb->len;
+	msg->sg.start = 0;
+	/* When every slot is in use, end wraps around to slot 0. */
+	msg->sg.end = num_sge == MAX_MSG_FRAGS ? 0 : num_sge;
+	msg->skb = skb;
+
+	sk_psock_queue_msg(psock, msg);
+	sk->sk_data_ready(sk);
+	return copied;
+}
+
+/* Dispatch @skb according to direction: @ingress skbs go to
+ * sk_psock_skb_ingress(), all others are sent on psock->sk starting at
+ * @off for @len bytes. Returns whatever the chosen handler returns.
+ */
+static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
+			       u32 off, u32 len, bool ingress)
+{
+	if (!ingress)
+		return skb_send_sock_locked(psock->sk, skb, off, len);
+
+	return sk_psock_skb_ingress(psock, skb);
+}
+
+/* Work handler: drain psock->ingress_skb, passing each skb to
+ * sk_psock_handle_skb() until all of its bytes have been consumed.
+ *
+ * On -EAGAIN the current skb, offset and remaining length are saved in
+ * psock->work_state; the next run resumes with that skb before dequeuing
+ * new ones. Any other non-positive result is reported through
+ * sk_psock_report_error(), clears SK_PSOCK_TX_ENABLED, frees the skb and
+ * ends this run.
+ */
+static void sk_psock_backlog(struct work_struct *work)
+{
+	struct sk_psock *psock = container_of(work, struct sk_psock, work);
+	struct sk_psock_work_state *state = &psock->work_state;
+	struct sk_buff *skb;
+	bool ingress;
+	u32 len, off;
+	int ret;
+
+	/* Lock sock to avoid losing sk_socket during loop. */
+	lock_sock(psock->sk);
+	if (state->skb) {
+		/* Resume the skb that hit -EAGAIN on a previous run. */
+		skb = state->skb;
+		len = state->len;
+		off = state->off;
+		state->skb = NULL;
+		goto start;
+	}
+
+	while ((skb = skb_dequeue(&psock->ingress_skb))) {
+		len = skb->len;
+		off = 0;
+start:
+		ingress = tcp_skb_bpf_ingress(skb);
+		do {
+			/* Without an attached sk_socket this is a hard error. */
+			ret = -EIO;
+			if (likely(psock->sk->sk_socket))
+				ret = sk_psock_handle_skb(psock, skb, off,
+							  len, ingress);
+			if (ret <= 0) {
+				if (ret == -EAGAIN) {
+					state->skb = skb;
+					state->len = len;
+					state->off = off;
+					goto end;
+				}
+				/* Hard errors break pipe and stop xmit. */
+				sk_psock_report_error(psock, ret ? -ret : EPIPE);
+				sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+				kfree_skb(skb);
+				goto end;
+			}
+			off += ret;
+			len -= ret;
+		} while (len);
+
+		/* On the ingress path sk_psock_skb_ingress() stored the skb
+		 * in msg->skb, so it is not freed here.
+		 */
+		if (!ingress)
+			kfree_skb(skb);
+	}
+end:
+	release_sock(psock->sk);
+}
+
+/* Allocate and initialise a psock for @sk on NUMA node @node.
+ *
+ * The new psock starts with a reference count of 1 and SK_PSOCK_TX_ENABLED
+ * set, is published through rcu_assign_sk_user_data() and takes a reference
+ * on @sk. Returns the psock, or NULL if the allocation fails.
+ */
+struct sk_psock *sk_psock_init(struct sock *sk, int node)
+{
+	struct sk_psock *psock = kzalloc_node(sizeof(*psock),
+					      GFP_ATOMIC | __GFP_NOWARN,
+					      node);
+	if (!psock)
+		return NULL;
+
+	psock->sk = sk;
+	psock->eval =  __SK_NONE;
+
+	INIT_LIST_HEAD(&psock->link);
+	spin_lock_init(&psock->link_lock);
+
+	INIT_WORK(&psock->work, sk_psock_backlog);
+	INIT_LIST_HEAD(&psock->ingress_msg);
+	skb_queue_head_init(&psock->ingress_skb);
+
+	sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
+	refcount_set(&psock->refcnt, 1);
+
+	/* Publish only after the psock is fully set up. */
+	rcu_assign_sk_user_data(sk, psock);
+	sock_hold(sk);
+
+	return psock;
+}
+EXPORT_SYMBOL_GPL(sk_psock_init);
+
+/* Detach and return the first entry of psock->link, or NULL when the list
+ * is empty. The list is accessed under psock->link_lock; the returned link
+ * is no longer on the list.
+ */
+struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
+{
+	struct sk_psock_link *link;
+
+	spin_lock_bh(&psock->link_lock);
+	link = list_first_entry_or_null(&psock->link, struct sk_psock_link,
+					list);
+	if (link)
+		list_del(&link->list);
+	spin_unlock_bh(&psock->link_lock);
+	return link;
+}
+
+/* Remove every message from psock->ingress_msg, release its data with
+ * sk_msg_free() against psock->sk and free the message itself. No lock is
+ * taken here.
+ */
+void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+{
+	struct sk_msg *msg, *tmp;
+
+	list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
+		list_del(&msg->list);
+		sk_msg_free(psock->sk, msg);
+		kfree(msg);
+	}
+}
+
+/* Discard everything queued for ingress on @psock: first the skbs still on
+ * ingress_skb, then the messages on ingress_msg.
+ */
+static void sk_psock_zap_ingress(struct sk_psock *psock)
+{
+	__skb_queue_purge(&psock->ingress_skb);
+	__sk_psock_purge_ingress_msg(psock);
+}
+
+/* Unlink and free every entry on psock->link. psock->link_lock is not
+ * taken here.
+ */
+static void sk_psock_link_destroy(struct sk_psock *psock)
+{
+	struct sk_psock_link *link, *tmp;
+
+	list_for_each_entry_safe(link, tmp, &psock->link, list) {
+		list_del(&link->list);
+		sk_psock_free_link(link);
+	}
+}
+
+/* Work handler doing the final teardown of a psock (scheduled from
+ * sk_psock_destroy()).
+ *
+ * Calls strp_done() if the parser is enabled, waits for pending backlog
+ * work, drops the programs, links, cork and queued ingress data, releases
+ * the redirect socket (if any) and the owning socket, and frees the psock.
+ */
+static void sk_psock_destroy_deferred(struct work_struct *gc)
+{
+	struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
+
+	/* No sk_callback_lock since already detached. */
+	if (psock->parser.enabled)
+		strp_done(&psock->parser.strp);
+
+	cancel_work_sync(&psock->work);
+
+	psock_progs_drop(&psock->progs);
+
+	sk_psock_link_destroy(psock);
+	sk_psock_cork_free(psock);
+	sk_psock_zap_ingress(psock);
+
+	if (psock->sk_redir)
+		sock_put(psock->sk_redir);
+	/* Pairs with the sock_hold() in sk_psock_init(). */
+	sock_put(psock->sk);
+	kfree(psock);
+}
+
+/* RCU callback (see the call_rcu_sched() in sk_psock_drop()): hand the
+ * actual teardown to sk_psock_destroy_deferred() via a work item.
+ * NOTE(review): presumably deferred because the teardown uses
+ * cancel_work_sync(), which cannot run in this context -- confirm.
+ */
+void sk_psock_destroy(struct rcu_head *rcu)
+{
+	struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu);
+
+	INIT_WORK(&psock->gc, sk_psock_destroy_deferred);
+	schedule_work(&psock->gc);
+}
+EXPORT_SYMBOL_GPL(sk_psock_destroy);
+
+/* Detach @psock from @sk and schedule its destruction.
+ *
+ * Clears the socket's user data pointer, frees the cork, restores the
+ * socket's proto via sk_psock_restore_proto(), stops the strparser under
+ * sk_callback_lock when an skb_parser program is set, clears
+ * SK_PSOCK_TX_ENABLED and queues sk_psock_destroy() as an RCU callback.
+ */
+void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
+{
+	rcu_assign_sk_user_data(sk, NULL);
+	sk_psock_cork_free(psock);
+	sk_psock_restore_proto(sk, psock);
+
+	write_lock_bh(&sk->sk_callback_lock);
+	if (psock->progs.skb_parser)
+		sk_psock_stop_strp(sk, psock);
+	write_unlock_bh(&sk->sk_callback_lock);
+	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+
+	call_rcu_sched(&psock->rcu, sk_psock_destroy);
+}
+EXPORT_SYMBOL_GPL(sk_psock_drop);
+
+/* Translate a program return value into an internal __SK_* verdict.
+ *
+ * SK_PASS becomes __SK_REDIRECT when @redir is set and __SK_PASS otherwise;
+ * every other value, SK_DROP included, becomes __SK_DROP.
+ */
+static int sk_psock_map_verd(int verdict, bool redir)
+{
+	if (verdict != SK_PASS)
+		return __SK_DROP;
+	if (redir)
+		return __SK_REDIRECT;
+	return __SK_PASS;
+}
+
+/* Run the msg_parser program attached to @psock on @msg and return the
+ * resulting __SK_* verdict.
+ *
+ * Returns __SK_PASS when no program is attached. Otherwise the program's
+ * return value is mapped with sk_psock_map_verd() and msg->apply_bytes is
+ * copied to @psock. On __SK_REDIRECT any previously held psock->sk_redir
+ * is released and replaced by msg->sk_redir, on which a reference is
+ * taken; if msg->sk_redir is NULL the verdict becomes __SK_DROP.
+ *
+ * The program runs with preemption disabled inside an RCU read section.
+ */
+int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
+			 struct sk_msg *msg)
+{
+	struct bpf_prog *prog;
+	int ret;
+
+	preempt_disable();
+	rcu_read_lock();
+	prog = READ_ONCE(psock->progs.msg_parser);
+	if (unlikely(!prog)) {
+		ret = __SK_PASS;
+		goto out;
+	}
+
+	sk_msg_compute_data_pointers(msg);
+	msg->sk = sk;
+	ret = BPF_PROG_RUN(prog, msg);
+	ret = sk_psock_map_verd(ret, msg->sk_redir);
+	psock->apply_bytes = msg->apply_bytes;
+	if (ret == __SK_REDIRECT) {
+		if (psock->sk_redir)
+			sock_put(psock->sk_redir);
+		psock->sk_redir = msg->sk_redir;
+		if (!psock->sk_redir) {
+			ret = __SK_DROP;
+			goto out;
+		}
+		sock_hold(psock->sk_redir);
+	}
+out:
+	rcu_read_unlock();
+	preempt_enable();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
+
+/* Run @prog on @skb with preemption disabled and return the program's
+ * return value. skb->sk points at psock->sk only for the duration of the
+ * run and is reset to NULL before returning.
+ */
+static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
+			    struct sk_buff *skb)
+{
+	int ret;
+
+	skb->sk = psock->sk;
+	bpf_compute_data_end_sk_skb(skb);
+	preempt_disable();
+	ret = BPF_PROG_RUN(prog, skb);
+	preempt_enable();
+	/* strparser clones the skb before handing it to a upper layer,
+	 * meaning skb_orphan has been called. We NULL sk on the way out
+	 * to ensure we don't trigger a BUG_ON() in skb/sk operations
+	 * later and because we are not charging the memory of this skb
+	 * to any socket yet.
+	 */
+	skb->sk = NULL;
+	return ret;
+}
+
+/* Map an embedded strparser back to its psock: @strp is the strp member of
+ * a struct sk_psock_parser, which in turn is the parser member of the
+ * struct sk_psock.
+ */
+static struct sk_psock *sk_psock_from_strp(struct strparser *strp)
+{
+	struct sk_psock_parser *outer =
+		container_of(strp, struct sk_psock_parser, strp);
+
+	return container_of(outer, struct sk_psock, parser);
+}
+
+/* Act on the skb verdict computed for @skb.
+ *
+ * __SK_REDIRECT: the skb is queued on the target psock's ingress_skb and
+ * the target's backlog work is scheduled, provided the target socket has a
+ * psock, is not SOCK_DEAD, has SK_PSOCK_TX_ENABLED set and has room
+ * (writeable for egress; sk_rmem_alloc not above sk_rcvbuf for ingress).
+ * For egress the skb is first made owned by the target socket.
+ *
+ * In every other case, including a redirect that fails one of the checks
+ * above, the skb is freed.
+ */
+static void sk_psock_verdict_apply(struct sk_psock *psock,
+				   struct sk_buff *skb, int verdict)
+{
+	struct sk_psock *psock_other;
+	struct sock *sk_other;
+	bool ingress;
+
+	switch (verdict) {
+	case __SK_REDIRECT:
+		sk_other = tcp_skb_bpf_redirect_fetch(skb);
+		if (unlikely(!sk_other))
+			goto out_free;
+		psock_other = sk_psock(sk_other);
+		if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
+		    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED))
+			goto out_free;
+		ingress = tcp_skb_bpf_ingress(skb);
+		if ((!ingress && sock_writeable(sk_other)) ||
+		    (ingress &&
+		     atomic_read(&sk_other->sk_rmem_alloc) <=
+		     sk_other->sk_rcvbuf)) {
+			if (!ingress)
+				skb_set_owner_w(skb, sk_other);
+			skb_queue_tail(&psock_other->ingress_skb, skb);
+			schedule_work(&psock_other->work);
+			break;
+		}
+		/* fall-through */
+	case __SK_DROP:
+		/* fall-through */
+	default:
+out_free:
+		kfree_skb(skb);
+	}
+}
+
+/* strparser rcv_msg callback (see sk_psock_init_strp()): run the
+ * skb_verdict program on @skb and apply the resulting verdict.
+ *
+ * Without a program the verdict stays __SK_DROP. With one, the skb is
+ * orphaned and its redirect state cleared before the program runs; the
+ * return value is then mapped with sk_psock_map_verd().
+ */
+static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
+{
+	struct sk_psock *psock = sk_psock_from_strp(strp);
+	struct bpf_prog *prog;
+	int ret = __SK_DROP;
+
+	rcu_read_lock();
+	prog = READ_ONCE(psock->progs.skb_verdict);
+	if (likely(prog)) {
+		skb_orphan(skb);
+		tcp_skb_bpf_redirect_clear(skb);
+		ret = sk_psock_bpf_run(psock, prog, skb);
+		ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb));
+	}
+	rcu_read_unlock();
+	sk_psock_verdict_apply(psock, skb, ret);
+}
+
+/* strparser read_sock_done callback (see sk_psock_init_strp()): passes
+ * @err through unchanged; @strp is unused.
+ */
+static int sk_psock_strp_read_done(struct strparser *strp, int err)
+{
+	return err;
+}
+
+/* strparser parse_msg callback (see sk_psock_init_strp()).
+ *
+ * Returns the skb_parser program's return value for @skb, or skb->len when
+ * no program is attached.
+ */
+static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
+{
+	struct sk_psock *psock = sk_psock_from_strp(strp);
+	struct bpf_prog *parser_prog;
+	int parsed;
+
+	rcu_read_lock();
+	parser_prog = READ_ONCE(psock->progs.skb_parser);
+	if (likely(parser_prog))
+		parsed = sk_psock_bpf_run(psock, parser_prog, skb);
+	else
+		parsed = skb->len;
+	rcu_read_unlock();
+
+	return parsed;
+}
+
+/* sk_data_ready callback installed by sk_psock_start_strp().
+ *
+ * Called with socket lock held. Looks up the psock under RCU and, if one
+ * is attached, calls strp_data_ready() on its strparser while holding
+ * sk_callback_lock for writing. Does nothing when no psock is attached.
+ */
+static void sk_psock_data_ready(struct sock *sk)
+{
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (likely(psock)) {
+		write_lock_bh(&sk->sk_callback_lock);
+		strp_data_ready(&psock->parser.strp);
+		write_unlock_bh(&sk->sk_callback_lock);
+	}
+	rcu_read_unlock();
+}
+
+/* sk_write_space callback installed by sk_psock_start_strp().
+ *
+ * Schedules the psock backlog work while SK_PSOCK_TX_ENABLED is set, then
+ * chains to the write_space callback saved in the psock.
+ *
+ * sk_psock() may return NULL here, so the saved callback is only read when
+ * a psock is present and only invoked when it is non-NULL; without a psock
+ * this function does nothing.
+ */
+static void sk_psock_write_space(struct sock *sk)
+{
+	void (*write_space)(struct sock *sk) = NULL;
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (likely(psock)) {
+		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+			schedule_work(&psock->work);
+		write_space = psock->saved_write_space;
+	}
+	rcu_read_unlock();
+
+	/* Call the saved callback outside the RCU read section. */
+	if (write_space)
+		write_space(sk);
+}
+
+/* Initialise the psock's strparser for @sk with the psock read, read-done
+ * and parse callbacks. The parser is left marked as not enabled; returns
+ * the result of strp_init().
+ */
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
+{
+	static const struct strp_callbacks cb = {
+		.rcv_msg	= sk_psock_strp_read,
+		.read_sock_done	= sk_psock_strp_read_done,
+		.parse_msg	= sk_psock_strp_parse,
+	};
+
+	psock->parser.enabled = false;
+	return strp_init(&psock->parser.strp, sk, &cb);
+}
+
+/* Enable the psock parser on @sk; a no-op if it is already enabled.
+ *
+ * Saves the socket's current sk_data_ready callback in the parser and
+ * installs the psock data_ready and write_space callbacks on the socket.
+ * The previous sk_write_space is not saved by this function.
+ */
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
+{
+	struct sk_psock_parser *parser = &psock->parser;
+
+	if (parser->enabled)
+		return;
+
+	parser->saved_data_ready = sk->sk_data_ready;
+	sk->sk_data_ready = sk_psock_data_ready;
+	sk->sk_write_space = sk_psock_write_space;
+	parser->enabled = true;
+}
+
+/* Disable the psock parser on @sk; a no-op if it is not enabled.
+ *
+ * Restores the saved sk_data_ready callback, stops the strparser and marks
+ * the parser as not enabled. sk_write_space is not restored here.
+ * NOTE(review): presumably sk_write_space is restored elsewhere (e.g. by
+ * sk_psock_restore_proto()) -- confirm.
+ */
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock)
+{
+	struct sk_psock_parser *parser = &psock->parser;
+
+	if (!parser->enabled)
+		return;
+
+	sk->sk_data_ready = parser->saved_data_ready;
+	parser->saved_data_ready = NULL;
+	strp_stop(&parser->strp);
+	parser->enabled = false;
+}
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
new file mode 100644
index 0000000..3c0e44c
--- /dev/null
+++ b/net/core/sock_map.c
@@ -0,0 +1,1002 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/net.h>
+#include <linux/workqueue.h>
+#include <linux/skmsg.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+
+/* Array-backed sock map. */
+struct bpf_stab {
+	struct bpf_map map;		/* embedded generic map; container_of() anchor */
+	struct sock **sks;		/* map.max_entries socket slots, NULL when empty */
+	struct sk_psock_progs progs;	/* programs applied to sockets added to the map */
+	raw_spinlock_t lock;		/* taken around updates and deletes of sks[] */
+};
+
+#define SOCK_CREATE_FLAG_MASK				\
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+/* Allocate a sock map described by @attr.
+ *
+ * Requires CAP_NET_ADMIN (-EPERM otherwise). Keys and values must both be
+ * 4 bytes, max_entries must be non-zero and only the flags in
+ * SOCK_CREATE_FLAG_MASK are accepted (-EINVAL otherwise). The slot array
+ * size is checked against U32_MAX and precharged against the memlock
+ * limit before it is allocated.
+ *
+ * Returns the embedded bpf_map on success or an ERR_PTR() on failure.
+ */
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_stab *stab;
+	u64 cost;
+	int err;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+	if (attr->max_entries == 0 ||
+	    attr->key_size    != 4 ||
+	    attr->value_size  != 4 ||
+	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+		return ERR_PTR(-EINVAL);
+
+	stab = kzalloc(sizeof(*stab), GFP_USER);
+	if (!stab)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&stab->map, attr);
+	raw_spin_lock_init(&stab->lock);
+
+	/* Make sure page count doesn't overflow. */
+	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
+	if (cost >= U32_MAX - PAGE_SIZE) {
+		err = -EINVAL;
+		goto free_stab;
+	}
+
+	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+	err = bpf_map_precharge_memlock(stab->map.pages);
+	if (err)
+		goto free_stab;
+
+	stab->sks = bpf_map_area_alloc(stab->map.max_entries *
+				       sizeof(struct sock *),
+				       stab->map.numa_node);
+	if (stab->sks)
+		return &stab->map;
+	err = -ENOMEM;
+free_stab:
+	kfree(stab);
+	return ERR_PTR(err);
+}
+
+/* Resolve attr->target_fd to a map and update its program of type
+ * attr->attach_type to @prog via sock_map_prog_update().
+ *
+ * Returns the error encoded by __bpf_map_get() when the fd does not yield
+ * a map, otherwise the result of sock_map_prog_update().
+ */
+int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+	u32 ufd = attr->target_fd;
+	struct bpf_map *map;
+	struct fd f;
+	int ret;
+
+	f = fdget(ufd);
+	map = __bpf_map_get(f);
+	/* NOTE(review): no fdput() on this path; presumably __bpf_map_get()
+	 * releases f itself on error -- confirm.
+	 */
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+	ret = sock_map_prog_update(map, prog, attr->attach_type);
+	fdput(f);
+	return ret;
+}
+
+/* Enter the context used for map updates from syscall side: socket lock
+ * first, then preemption disabled, then an RCU read section. Undone in
+ * reverse order by sock_map_sk_release().
+ */
+static void sock_map_sk_acquire(struct sock *sk)
+	__acquires(&sk->sk_lock.slock)
+{
+	lock_sock(sk);
+	preempt_disable();
+	rcu_read_lock();
+}
+
+/* Leave the context entered by sock_map_sk_acquire(): end the RCU read
+ * section, re-enable preemption and release the socket lock.
+ */
+static void sock_map_sk_release(struct sock *sk)
+	__releases(&sk->sk_lock.slock)
+{
+	rcu_read_unlock();
+	preempt_enable();
+	release_sock(sk);
+}
+
+/* Record on @psock that it is referenced from @map.
+ *
+ * @link:     preallocated link entry to fill in and enqueue
+ * @link_raw: opaque cookie identifying the referencing location; for the
+ *            array map this is the address of the slot (see
+ *            sock_map_update_common())
+ *
+ * The link is appended to psock->link under psock->link_lock.
+ */
+static void sock_map_add_link(struct sk_psock *psock,
+			      struct sk_psock_link *link,
+			      struct bpf_map *map, void *link_raw)
+{
+	link->link_raw = link_raw;
+	link->map = map;
+	spin_lock_bh(&psock->link_lock);
+	list_add_tail(&link->list, &psock->link);
+	spin_unlock_bh(&psock->link_lock);
+}
+
+/* Remove and free every link of @psock whose cookie equals @link_raw.
+ *
+ * If a removed link belongs to a map that has an skb_parser program and
+ * the psock parser is enabled, the strparser is stopped afterwards under
+ * sk_callback_lock, once psock->link_lock has been released.
+ */
+static void sock_map_del_link(struct sock *sk,
+			      struct sk_psock *psock, void *link_raw)
+{
+	struct sk_psock_link *link, *tmp;
+	bool strp_stop = false;
+
+	spin_lock_bh(&psock->link_lock);
+	list_for_each_entry_safe(link, tmp, &psock->link, list) {
+		if (link->link_raw == link_raw) {
+			struct bpf_map *map = link->map;
+			struct bpf_stab *stab = container_of(map, struct bpf_stab,
+							     map);
+			if (psock->parser.enabled && stab->progs.skb_parser)
+				strp_stop = true;
+			list_del(&link->list);
+			sk_psock_free_link(link);
+		}
+	}
+	spin_unlock_bh(&psock->link_lock);
+	if (strp_stop) {
+		write_lock_bh(&sk->sk_callback_lock);
+		sk_psock_stop_strp(sk, psock);
+		write_unlock_bh(&sk->sk_callback_lock);
+	}
+}
+
+/* Undo one map reference on @sk: delete the link identified by @link_raw
+ * and put the psock. Nothing happens when @sk has no psock.
+ */
+static void sock_map_unref(struct sock *sk, void *link_raw)
+{
+	struct sk_psock *psock = sk_psock(sk);
+
+	if (unlikely(!psock))
+		return;
+
+	sock_map_del_link(sk, psock, link_raw);
+	sk_psock_put(sk, psock);
+}
+
+/* Attach the programs in @progs to @sk, creating a psock if @sk has none.
+ *
+ * References are taken on the programs first: the skb parser and verdict
+ * programs only as a pair, the msg parser on its own. An existing psock is
+ * reused through sk_psock_get(); otherwise a new one is created with
+ * sk_psock_init() on @map's NUMA node. A new psock is followed by
+ * tcp_bpf_init(), an existing one by tcp_bpf_reinit(). The strparser is
+ * initialised and started under sk_callback_lock when the skb programs are
+ * present and the parser is not yet enabled.
+ *
+ * Returns 0 on success. On failure the references taken here are dropped
+ * again and a negative errno is returned:
+ *   -EBUSY   sk_has_psock() fails for a socket that has a psock, or the
+ *            psock already carries a program of a kind being attached
+ *   -ENOMEM  no psock could be allocated
+ *   other    error from bpf_prog_inc_not_zero(), tcp_bpf_init() or
+ *            sk_psock_init_strp()
+ */
+static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs,
+			 struct sock *sk)
+{
+	struct bpf_prog *msg_parser, *skb_parser, *skb_verdict;
+	bool skb_progs, sk_psock_is_new = false;
+	struct sk_psock *psock;
+	int ret;
+
+	skb_verdict = READ_ONCE(progs->skb_verdict);
+	skb_parser = READ_ONCE(progs->skb_parser);
+	/* The skb programs are only used when both are present. */
+	skb_progs = skb_parser && skb_verdict;
+	if (skb_progs) {
+		skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+		if (IS_ERR(skb_verdict))
+			return PTR_ERR(skb_verdict);
+		skb_parser = bpf_prog_inc_not_zero(skb_parser);
+		if (IS_ERR(skb_parser)) {
+			bpf_prog_put(skb_verdict);
+			return PTR_ERR(skb_parser);
+		}
+	}
+
+	msg_parser = READ_ONCE(progs->msg_parser);
+	if (msg_parser) {
+		msg_parser = bpf_prog_inc_not_zero(msg_parser);
+		if (IS_ERR(msg_parser)) {
+			ret = PTR_ERR(msg_parser);
+			goto out;
+		}
+	}
+
+	psock = sk_psock_get(sk);
+	if (psock) {
+		/* Every -EBUSY exit must drop the reference obtained by
+		 * sk_psock_get() above, otherwise it is leaked.
+		 */
+		if (!sk_has_psock(sk) ||
+		    (msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
+		    (skb_progs  && READ_ONCE(psock->progs.skb_parser))) {
+			sk_psock_put(sk, psock);
+			ret = -EBUSY;
+			goto out_progs;
+		}
+	} else {
+		psock = sk_psock_init(sk, map->numa_node);
+		if (!psock) {
+			ret = -ENOMEM;
+			goto out_progs;
+		}
+		sk_psock_is_new = true;
+	}
+
+	if (msg_parser)
+		psock_set_prog(&psock->progs.msg_parser, msg_parser);
+	if (sk_psock_is_new) {
+		ret = tcp_bpf_init(sk);
+		if (ret < 0)
+			goto out_drop;
+	} else {
+		tcp_bpf_reinit(sk);
+	}
+
+	write_lock_bh(&sk->sk_callback_lock);
+	if (skb_progs && !psock->parser.enabled) {
+		ret = sk_psock_init_strp(sk, psock);
+		if (ret) {
+			write_unlock_bh(&sk->sk_callback_lock);
+			goto out_drop;
+		}
+		psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+		psock_set_prog(&psock->progs.skb_parser, skb_parser);
+		sk_psock_start_strp(sk, psock);
+	}
+	write_unlock_bh(&sk->sk_callback_lock);
+	return 0;
+out_drop:
+	sk_psock_put(sk, psock);
+out_progs:
+	if (msg_parser)
+		bpf_prog_put(msg_parser);
+out:
+	if (skb_progs) {
+		bpf_prog_put(skb_verdict);
+		bpf_prog_put(skb_parser);
+	}
+	return ret;
+}
+
+/* Tear down a sock map: after an RCU grace period, clear every slot under
+ * stab->lock and drop the map's reference on each socket found, then free
+ * the slot array and the map itself.
+ */
+static void sock_map_free(struct bpf_map *map)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	int i;
+
+	synchronize_rcu();
+	rcu_read_lock();
+	raw_spin_lock_bh(&stab->lock);
+	for (i = 0; i < stab->map.max_entries; i++) {
+		struct sock **psk = &stab->sks[i];
+		struct sock *sk;
+
+		/* The slot address doubles as the link cookie. */
+		sk = xchg(psk, NULL);
+		if (sk)
+			sock_map_unref(sk, psk);
+	}
+	raw_spin_unlock_bh(&stab->lock);
+	rcu_read_unlock();
+
+	bpf_map_area_free(stab->sks);
+	kfree(stab);
+}
+
+/* Drop the programs held by the sock map that embeds @map. */
+static void sock_map_release_progs(struct bpf_map *map)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+	psock_progs_drop(&stab->progs);
+}
+
+/* Return the socket stored in slot @key of @map, or NULL when the slot is
+ * empty or @key is out of range. Warns once if called outside an RCU read
+ * section.
+ */
+static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (likely(key < map->max_entries))
+		return READ_ONCE(stab->sks[key]);
+
+	return NULL;
+}
+
+/* Always returns ERR_PTR(-EOPNOTSUPP); @map and @key are ignored. */
+static void *sock_map_lookup(struct bpf_map *map, void *key)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
+
+/* Clear slot @psk of @stab and drop the map's reference on its socket.
+ *
+ * @sk_test: when non-NULL, the slot is only cleared if it currently holds
+ *           this socket.
+ *
+ * Returns 0 after unreferencing the socket read from the slot, or -EINVAL
+ * when the slot was empty.
+ */
+static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
+			     struct sock **psk)
+{
+	struct sock *sk;
+
+	raw_spin_lock_bh(&stab->lock);
+	sk = *psk;
+	if (!sk_test || sk_test == sk)
+		*psk = NULL;
+	raw_spin_unlock_bh(&stab->lock);
+	if (unlikely(!sk))
+		return -EINVAL;
+	/* NOTE(review): when @sk_test is set and differs from the slot's
+	 * socket, the slot is left untouched but that socket is still
+	 * unreferenced below -- confirm this is intended.
+	 */
+	sock_map_unref(sk, psk);
+	return 0;
+}
+
+/* Delete the map entry identified by link cookie @link_raw, but only if it
+ * still holds @sk. The result of the delete is discarded.
+ */
+static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk,
+				      void *link_raw)
+{
+	__sock_map_delete(container_of(map, struct bpf_stab, map), sk,
+			  link_raw);
+}
+
+/* Delete the entry at the u32 index pointed to by @key.
+ *
+ * Returns -EINVAL when the index is out of range, otherwise the result of
+ * __sock_map_delete() on that slot.
+ */
+static int sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	u32 idx = *(u32 *)key;
+
+	if (likely(idx < map->max_entries))
+		return __sock_map_delete(stab, NULL, &stab->sks[idx]);
+
+	return -EINVAL;
+}
+
+/* Iterate over the map's indices.
+ *
+ * Writes the index following *@key to *@next. A NULL @key or an
+ * out-of-range index restarts at 0. Returns -ENOENT when *@key is the last
+ * index, 0 otherwise.
+ */
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	u32 limit = stab->map.max_entries;
+	u32 cur = U32_MAX;
+
+	if (key)
+		cur = *(u32 *)key;
+
+	if (cur == limit - 1)
+		return -ENOENT;
+
+	*(u32 *)next = cur < limit ? cur + 1 : 0;
+	return 0;
+}
+
+/* Store @sk in slot @idx of @map.
+ *
+ * @flags: values above BPF_EXIST are rejected; BPF_NOEXIST fails on an
+ *         occupied slot and BPF_EXIST fails on an empty one.
+ *
+ * The socket is first attached to the map's programs with sock_map_link().
+ * Under stab->lock a link is then recorded on the psock with the slot
+ * address as cookie, the slot is updated and a replaced socket, if any,
+ * is unreferenced.
+ *
+ * Returns 0 on success, -EINVAL for bad @flags, -E2BIG for an out-of-range
+ * @idx, -ENOMEM when no link can be allocated, -EEXIST / -ENOENT for the
+ * flag conflicts above, or the error from sock_map_link(). Warns once if
+ * called outside an RCU read section.
+ */
+static int sock_map_update_common(struct bpf_map *map, u32 idx,
+				  struct sock *sk, u64 flags)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	struct sk_psock_link *link;
+	struct sk_psock *psock;
+	struct sock *osk;
+	int ret;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(idx >= map->max_entries))
+		return -E2BIG;
+
+	link = sk_psock_init_link();
+	if (!link)
+		return -ENOMEM;
+
+	ret = sock_map_link(map, &stab->progs, sk);
+	if (ret < 0)
+		goto out_free;
+
+	psock = sk_psock(sk);
+	WARN_ON_ONCE(!psock);
+
+	raw_spin_lock_bh(&stab->lock);
+	osk = stab->sks[idx];
+	if (osk && flags == BPF_NOEXIST) {
+		ret = -EEXIST;
+		goto out_unlock;
+	} else if (!osk && flags == BPF_EXIST) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	sock_map_add_link(psock, link, map, &stab->sks[idx]);
+	stab->sks[idx] = sk;
+	if (osk)
+		sock_map_unref(osk, &stab->sks[idx]);
+	raw_spin_unlock_bh(&stab->lock);
+	return 0;
+out_unlock:
+	raw_spin_unlock_bh(&stab->lock);
+	/* Undo the psock reference obtained through sock_map_link(). */
+	if (psock)
+		sk_psock_put(sk, psock);
+out_free:
+	sk_psock_free_link(link);
+	return ret;
+}
+
+/* True only for the two "connection established" sock_ops callbacks
+ * (passive and active); every other op is refused.
+ */
+static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
+{
+	switch (ops->op) {
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* A socket qualifies for the map only if it is a SOCK_STREAM socket whose
+ * protocol is IPPROTO_TCP.
+ */
+static bool sock_map_sk_is_suitable(const struct sock *sk)
+{
+	if (sk->sk_type != SOCK_STREAM)
+		return false;
+
+	return sk->sk_protocol == IPPROTO_TCP;
+}
+
+/* .map_update_elem for sockmap: @key points at the u32 index, @value at the
+ * u32 socket fd to install.
+ *
+ * Returns the sockfd_lookup() error when the fd does not resolve, -EINVAL
+ * when the socket has no struct sock, -EOPNOTSUPP unless it is a TCP stream
+ * socket in TCP_ESTABLISHED, otherwise the result of
+ * sock_map_update_common() run between sock_map_sk_acquire() and
+ * sock_map_sk_release(). The file reference taken by the lookup is always
+ * dropped before returning.
+ */
+static int sock_map_update_elem(struct bpf_map *map, void *key,
+				void *value, u64 flags)
+{
+	u32 ufd = *(u32 *)value;
+	u32 idx = *(u32 *)key;
+	struct socket *sock;
+	struct sock *sk;
+	int ret;
+
+	sock = sockfd_lookup(ufd, &ret);
+	if (!sock)
+		return ret;
+
+	sk = sock->sk;
+	if (!sk) {
+		ret = -EINVAL;
+	} else if (!sock_map_sk_is_suitable(sk) ||
+		   sk->sk_state != TCP_ESTABLISHED) {
+		ret = -EOPNOTSUPP;
+	} else {
+		sock_map_sk_acquire(sk);
+		ret = sock_map_update_common(map, idx, sk, flags);
+		sock_map_sk_release(sk);
+	}
+
+	fput(sock->file);
+	return ret;
+}
+
+/* BPF helper: insert the sock_ops context's socket into sockmap @map at the
+ * u32 index @key points to. Returns -EOPNOTSUPP unless the socket is a TCP
+ * stream socket and the current op is one of the "established" callbacks;
+ * otherwise the result of sock_map_update_common().
+ */
+BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (unlikely(!sock_map_sk_is_suitable(sops->sk) ||
+		     !sock_map_op_okay(sops)))
+		return -EOPNOTSUPP;
+
+	return sock_map_update_common(map, *(u32 *)key, sops->sk, flags);
+}
+
+/* Helper prototype for bpf_sock_map_update(): integer return; arguments are
+ * (ctx, constant map pointer, pointer to map key, arbitrary flags value).
+ */
+const struct bpf_func_proto bpf_sock_map_update_proto = {
+	.func		= bpf_sock_map_update,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_PTR_TO_MAP_KEY,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+/* BPF helper: pick the redirect target for @skb from sockmap @map at index
+ * @key. Only BPF_F_INGRESS is accepted in @flags. The flags and the looked-up
+ * socket are stored in the skb's TCP control block; returns SK_PASS when a
+ * socket was found and SK_DROP on bad flags or an empty slot.
+ */
+BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
+	   struct bpf_map *, map, u32, key, u64, flags)
+{
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+
+	tcb->bpf.flags = flags;
+	tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
+
+	return tcb->bpf.sk_redir ? SK_PASS : SK_DROP;
+}
+
+/* Helper prototype for bpf_sk_redirect_map(): integer return; arguments are
+ * (ctx, constant map pointer, arbitrary key value, arbitrary flags value).
+ */
+const struct bpf_func_proto bpf_sk_redirect_map_proto = {
+	.func           = bpf_sk_redirect_map,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_ANYTHING,
+	.arg4_type      = ARG_ANYTHING,
+};
+
+/* BPF helper: pick the redirect target for @msg from sockmap @map at index
+ * @key. Only BPF_F_INGRESS is accepted in @flags. The flags and the looked-up
+ * socket are recorded in @msg; returns SK_PASS when a socket was found and
+ * SK_DROP on bad flags or an empty slot.
+ */
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
+	   struct bpf_map *, map, u32, key, u64, flags)
+{
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+
+	msg->flags = flags;
+	msg->sk_redir = __sock_map_lookup_elem(map, key);
+
+	return msg->sk_redir ? SK_PASS : SK_DROP;
+}
+
+/* Helper prototype for bpf_msg_redirect_map(): integer return; arguments are
+ * (ctx, constant map pointer, arbitrary key value, arbitrary flags value).
+ */
+const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+	.func           = bpf_msg_redirect_map,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_ANYTHING,
+	.arg4_type      = ARG_ANYTHING,
+};
+
+/* Map operations table for the array-style sockmap. Lookup is wired to
+ * sock_map_lookup(), which always fails with -EOPNOTSUPP.
+ */
+const struct bpf_map_ops sock_map_ops = {
+	.map_alloc		= sock_map_alloc,
+	.map_free		= sock_map_free,
+	.map_get_next_key	= sock_map_get_next_key,
+	.map_update_elem	= sock_map_update_elem,
+	.map_delete_elem	= sock_map_delete_elem,
+	.map_lookup_elem	= sock_map_lookup,
+	.map_release_uref	= sock_map_release_progs,
+	.map_check_btf		= map_check_no_btf,
+};
+
+/* One sockhash entry: a socket stored under a variable-length key. */
+struct bpf_htab_elem {
+	struct rcu_head rcu;		/* used by kfree_rcu() when freeing */
+	u32 hash;			/* hash of the key, cached for compares */
+	struct sock *sk;		/* socket stored under this key */
+	struct hlist_node node;		/* linkage in the owning bucket's list */
+	u8 key[0];			/* key bytes, map->key_size long */
+};
+
+/* One sockhash bucket: an element list plus the lock taken around updates. */
+struct bpf_htab_bucket {
+	struct hlist_head head;
+	raw_spinlock_t lock;
+};
+
+/* Sockhash map instance, embedding the generic bpf_map. */
+struct bpf_htab {
+	struct bpf_map map;
+	struct bpf_htab_bucket *buckets;	/* array of buckets_num buckets */
+	u32 buckets_num;			/* max_entries rounded up to a power of two */
+	u32 elem_size;				/* allocation size of one bpf_htab_elem incl. key */
+	struct sk_psock_progs progs;		/* programs attached to this map */
+	atomic_t count;				/* number of allocated elements */
+};
+
+/* Hash @len bytes of @key with jhash() and a fixed initval of 0. */
+static inline u32 sock_hash_bucket_hash(const void *key, u32 len)
+{
+	return jhash(key, len, 0);
+}
+
+/* Map @hash to its bucket by masking with buckets_num - 1 (buckets_num is
+ * set from roundup_pow_of_two() at allocation time).
+ */
+static struct bpf_htab_bucket *sock_hash_select_bucket(struct bpf_htab *htab,
+						       u32 hash)
+{
+	u32 slot = hash & (htab->buckets_num - 1);
+
+	return &htab->buckets[slot];
+}
+
+/* Walk bucket list @head and return the first element whose cached hash
+ * equals @hash and whose key matches @key over @key_size bytes, or NULL.
+ */
+static struct bpf_htab_elem *
+sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key,
+			  u32 key_size)
+{
+	struct bpf_htab_elem *cur;
+
+	hlist_for_each_entry_rcu(cur, head, node) {
+		/* Cheap hash compare first, full key compare only on a hit. */
+		if (cur->hash != hash)
+			continue;
+		if (memcmp(&cur->key, key, key_size) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/* Return the socket stored under @key in sockhash @map, or NULL when the key
+ * is absent. Warns if not called under rcu_read_lock().
+ */
+static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct bpf_htab_bucket *bucket;
+	struct bpf_htab_elem *found;
+	u32 klen = map->key_size;
+	u32 hash;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	hash = sock_hash_bucket_hash(key, klen);
+	bucket = sock_hash_select_bucket(htab, hash);
+	found = sock_hash_lookup_elem_raw(&bucket->head, hash, key, klen);
+	if (!found)
+		return NULL;
+
+	return found->sk;
+}
+
+/* Account for and release one element: decrement the map's element count
+ * and hand @elem to kfree_rcu(). The caller unlinks @elem from its bucket.
+ */
+static void sock_hash_free_elem(struct bpf_htab *htab,
+				struct bpf_htab_elem *elem)
+{
+	atomic_dec(&htab->count);
+	kfree_rcu(elem, rcu);
+}
+
+/* Remove the element referenced by link cookie @link_raw from sockhash @map.
+ * The element is only unlinked, unref'd and freed if a lookup under the
+ * bucket lock still finds that very element. Warns if not called under
+ * rcu_read_lock(). @sk is not used by the body.
+ */
+static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
+				       void *link_raw)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct bpf_htab_elem *elem_probe, *elem = link_raw;
+	struct bpf_htab_bucket *bucket;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	bucket = sock_hash_select_bucket(htab, elem->hash);
+
+	/* elem may be deleted in parallel from the map, but access here
+	 * is okay since it's going away only after RCU grace period.
+	 * However, we need to check whether it's still present.
+	 */
+	raw_spin_lock_bh(&bucket->lock);
+	elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash,
+					       elem->key, map->key_size);
+	if (elem_probe && elem_probe == elem) {
+		hlist_del_rcu(&elem->node);
+		sock_map_unref(elem->sk, elem);
+		sock_hash_free_elem(htab, elem);
+	}
+	raw_spin_unlock_bh(&bucket->lock);
+}
+
+/* .map_delete_elem for sockhash: under the bucket lock, unlink, unref and
+ * free the element stored under @key. Returns 0 on success and -ENOENT when
+ * the key is not present.
+ */
+static int sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct bpf_htab_bucket *bucket;
+	struct bpf_htab_elem *victim;
+	u32 klen = map->key_size;
+	u32 hash;
+
+	hash = sock_hash_bucket_hash(key, klen);
+	bucket = sock_hash_select_bucket(htab, hash);
+
+	raw_spin_lock_bh(&bucket->lock);
+	victim = sock_hash_lookup_elem_raw(&bucket->head, hash, key, klen);
+	if (!victim) {
+		raw_spin_unlock_bh(&bucket->lock);
+		return -ENOENT;
+	}
+
+	hlist_del_rcu(&victim->node);
+	sock_map_unref(victim->sk, victim);
+	sock_hash_free_elem(htab, victim);
+	raw_spin_unlock_bh(&bucket->lock);
+	return 0;
+}
+
+/* Allocate and fill a new sockhash element for (@key, @sk).
+ *
+ * The element count is bumped first. Exceeding max_entries is only an error
+ * (-E2BIG) when no @old element is being replaced. Returns the new element,
+ * or ERR_PTR(-E2BIG) / ERR_PTR(-ENOMEM) with the count restored.
+ */
+static struct bpf_htab_elem *sock_hash_alloc_elem(struct bpf_htab *htab,
+						  void *key, u32 key_size,
+						  u32 hash, struct sock *sk,
+						  struct bpf_htab_elem *old)
+{
+	struct bpf_htab_elem *elem;
+
+	if (atomic_inc_return(&htab->count) > htab->map.max_entries && !old) {
+		atomic_dec(&htab->count);
+		return ERR_PTR(-E2BIG);
+	}
+
+	elem = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN,
+			    htab->map.numa_node);
+	if (!elem) {
+		atomic_dec(&htab->count);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	elem->hash = hash;
+	elem->sk = sk;
+	memcpy(elem->key, key, key_size);
+	return elem;
+}
+
+/* Insert or replace the sockhash entry for @key with @sk.
+ *
+ * @flags: BPF_NOEXIST fails with -EEXIST when the key is present, BPF_EXIST
+ *         fails with -ENOENT when it is absent; any value above BPF_EXIST is
+ *         rejected with -EINVAL.
+ *
+ * Other errors: -ENOMEM when the link cannot be allocated, a negative return
+ * from sock_map_link(), or the error encoded by sock_hash_alloc_elem().
+ * Returns 0 on success. Warns if not called under rcu_read_lock().
+ */
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+				   struct sock *sk, u64 flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	u32 key_size = map->key_size, hash;
+	struct bpf_htab_elem *elem, *elem_new;
+	struct bpf_htab_bucket *bucket;
+	struct sk_psock_link *link;
+	struct sk_psock *psock;
+	int ret;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	/* Allocate the link up front, before the raw spinlock is taken. */
+	link = sk_psock_init_link();
+	if (!link)
+		return -ENOMEM;
+
+	ret = sock_map_link(map, &htab->progs, sk);
+	if (ret < 0)
+		goto out_free;
+
+	psock = sk_psock(sk);
+	WARN_ON_ONCE(!psock);
+
+	hash = sock_hash_bucket_hash(key, key_size);
+	bucket = sock_hash_select_bucket(htab, hash);
+
+	raw_spin_lock_bh(&bucket->lock);
+	elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+	if (elem && flags == BPF_NOEXIST) {
+		ret = -EEXIST;
+		goto out_unlock;
+	} else if (!elem && flags == BPF_EXIST) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem);
+	if (IS_ERR(elem_new)) {
+		ret = PTR_ERR(elem_new);
+		goto out_unlock;
+	}
+
+	sock_map_add_link(psock, link, map, elem_new);
+	/* Add new element to the head of the list, so that
+	 * concurrent search will find it before old elem.
+	 */
+	hlist_add_head_rcu(&elem_new->node, &bucket->head);
+	if (elem) {
+		/* Replacing: retire the old element under the same lock. */
+		hlist_del_rcu(&elem->node);
+		sock_map_unref(elem->sk, elem);
+		sock_hash_free_elem(htab, elem);
+	}
+	raw_spin_unlock_bh(&bucket->lock);
+	return 0;
+out_unlock:
+	raw_spin_unlock_bh(&bucket->lock);
+	/* NOTE(review): unlike the sockmap counterpart, psock is not checked
+	 * for NULL before the put here - confirm sk_psock() cannot return
+	 * NULL after a successful sock_map_link().
+	 */
+	sk_psock_put(sk, psock);
+out_free:
+	sk_psock_free_link(link);
+	return ret;
+}
+
+/* .map_update_elem for sockhash: @value points at the u32 socket fd to store
+ * under @key.
+ *
+ * Returns the sockfd_lookup() error when the fd does not resolve, -EINVAL
+ * when the socket has no struct sock, -EOPNOTSUPP unless it is a TCP stream
+ * socket in TCP_ESTABLISHED, otherwise the result of
+ * sock_hash_update_common() run between sock_map_sk_acquire() and
+ * sock_map_sk_release(). The file reference taken by the lookup is always
+ * dropped before returning.
+ */
+static int sock_hash_update_elem(struct bpf_map *map, void *key,
+				 void *value, u64 flags)
+{
+	u32 ufd = *(u32 *)value;
+	struct socket *sock;
+	struct sock *sk;
+	int ret;
+
+	sock = sockfd_lookup(ufd, &ret);
+	if (!sock)
+		return ret;
+
+	sk = sock->sk;
+	if (!sk) {
+		ret = -EINVAL;
+	} else if (!sock_map_sk_is_suitable(sk) ||
+		   sk->sk_state != TCP_ESTABLISHED) {
+		ret = -EOPNOTSUPP;
+	} else {
+		sock_map_sk_acquire(sk);
+		ret = sock_hash_update_common(map, key, sk, flags);
+		sock_map_sk_release(sk);
+	}
+
+	fput(sock->file);
+	return ret;
+}
+
+/* .map_get_next_key for sockhash.
+ *
+ * With a NULL @key, or a @key that is not in the table, iteration starts
+ * from the first element of the first non-empty bucket. Otherwise the next
+ * element in @key's bucket is returned, falling through to the first element
+ * of the following non-empty bucket. The key found is copied to @key_next.
+ * Returns 0 when a key was produced and -ENOENT when none is left.
+ */
+static int sock_hash_get_next_key(struct bpf_map *map, void *key,
+				  void *key_next)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct bpf_htab_elem *elem, *elem_next;
+	u32 hash, key_size = map->key_size;
+	struct hlist_head *head;
+	int i = 0;
+
+	if (!key)
+		goto find_first_elem;
+	hash = sock_hash_bucket_hash(key, key_size);
+	head = &sock_hash_select_bucket(htab, hash)->head;
+	elem = sock_hash_lookup_elem_raw(head, hash, key, key_size);
+	if (!elem)
+		goto find_first_elem;
+
+	/* Try the successor within the same bucket first. */
+	elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&elem->node)),
+				     struct bpf_htab_elem, node);
+	if (elem_next) {
+		memcpy(key_next, elem_next->key, key_size);
+		return 0;
+	}
+
+	/* Bucket exhausted: continue scanning from the next bucket index. */
+	i = hash & (htab->buckets_num - 1);
+	i++;
+find_first_elem:
+	for (; i < htab->buckets_num; i++) {
+		head = &sock_hash_select_bucket(htab, i)->head;
+		elem_next = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+					     struct bpf_htab_elem, node);
+		if (elem_next) {
+			memcpy(key_next, elem_next->key, key_size);
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
+/* .map_alloc for sockhash: validate @attr and build an empty hash table.
+ *
+ * Requires CAP_NET_ADMIN (-EPERM otherwise). Rejects with -EINVAL a zero
+ * max_entries or key_size, a value_size other than 4, unknown map_flags, a
+ * bucket count that is zero or too large, or a computed cost of
+ * U32_MAX - PAGE_SIZE or more. A key_size above MAX_BPF_STACK gives -E2BIG,
+ * allocation failures give -ENOMEM. Returns the embedded bpf_map on success.
+ */
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+	struct bpf_htab *htab;
+	int i, err;
+	u64 cost;
+
+	if (!capable(CAP_NET_ADMIN))
+		return ERR_PTR(-EPERM);
+	if (attr->max_entries == 0 ||
+	    attr->key_size    == 0 ||
+	    attr->value_size  != 4 ||
+	    attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+		return ERR_PTR(-EINVAL);
+	if (attr->key_size > MAX_BPF_STACK)
+		return ERR_PTR(-E2BIG);
+
+	htab = kzalloc(sizeof(*htab), GFP_USER);
+	if (!htab)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&htab->map, attr);
+
+	/* Power-of-two bucket count lets bucket selection use a mask. */
+	htab->buckets_num = roundup_pow_of_two(htab->map.max_entries);
+	htab->elem_size = sizeof(struct bpf_htab_elem) +
+			  round_up(htab->map.key_size, 8);
+	if (htab->buckets_num == 0 ||
+	    htab->buckets_num > U32_MAX / sizeof(struct bpf_htab_bucket)) {
+		err = -EINVAL;
+		goto free_htab;
+	}
+
+	/* Cost is bucket array plus worst-case element storage. */
+	cost = (u64) htab->buckets_num * sizeof(struct bpf_htab_bucket) +
+	       (u64) htab->elem_size * htab->map.max_entries;
+	if (cost >= U32_MAX - PAGE_SIZE) {
+		err = -EINVAL;
+		goto free_htab;
+	}
+
+	htab->buckets = bpf_map_area_alloc(htab->buckets_num *
+					   sizeof(struct bpf_htab_bucket),
+					   htab->map.numa_node);
+	if (!htab->buckets) {
+		err = -ENOMEM;
+		goto free_htab;
+	}
+
+	for (i = 0; i < htab->buckets_num; i++) {
+		INIT_HLIST_HEAD(&htab->buckets[i].head);
+		raw_spin_lock_init(&htab->buckets[i].lock);
+	}
+
+	return &htab->map;
+free_htab:
+	kfree(htab);
+	return ERR_PTR(err);
+}
+
+/* .map_free for sockhash: tear down every remaining element, then release
+ * the bucket array and the map itself.
+ *
+ * After synchronize_rcu(), each bucket is emptied under its lock. Every
+ * element is unlinked, its socket reference dropped, and the element handed
+ * to sock_hash_free_elem(). The two other removal paths in this file
+ * (delete_elem and delete_from_link) already free the element after the
+ * unref; without the same call here, each element still in the table at
+ * map destruction time was leaked.
+ */
+static void sock_hash_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct bpf_htab_bucket *bucket;
+	struct bpf_htab_elem *elem;
+	struct hlist_node *node;
+	int i;
+
+	synchronize_rcu();
+	rcu_read_lock();
+	for (i = 0; i < htab->buckets_num; i++) {
+		bucket = sock_hash_select_bucket(htab, i);
+		raw_spin_lock_bh(&bucket->lock);
+		hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
+			hlist_del_rcu(&elem->node);
+			sock_map_unref(elem->sk, elem);
+			/* kfree_rcu() based, so freeing while iterating with
+			 * the _safe walker is fine.
+			 */
+			sock_hash_free_elem(htab, elem);
+		}
+		raw_spin_unlock_bh(&bucket->lock);
+	}
+	rcu_read_unlock();
+
+	bpf_map_area_free(htab->buckets);
+	kfree(htab);
+}
+
+/* .map_release_uref for sockhash: drop the programs held by this map. */
+static void sock_hash_release_progs(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+	psock_progs_drop(&htab->progs);
+}
+
+/* BPF helper: insert the sock_ops context's socket into sockhash @map under
+ * @key. Returns -EOPNOTSUPP unless the socket is a TCP stream socket and the
+ * current op is one of the "established" callbacks; otherwise the result of
+ * sock_hash_update_common().
+ */
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (unlikely(!sock_map_sk_is_suitable(sops->sk) ||
+		     !sock_map_op_okay(sops)))
+		return -EOPNOTSUPP;
+
+	return sock_hash_update_common(map, key, sops->sk, flags);
+}
+
+/* Helper prototype for bpf_sock_hash_update(): integer return; arguments are
+ * (ctx, constant map pointer, pointer to map key, arbitrary flags value).
+ */
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+	.func		= bpf_sock_hash_update,
+	.gpl_only	= false,
+	.pkt_access	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_PTR_TO_MAP_KEY,
+	.arg4_type	= ARG_ANYTHING,
+};
+
+/* BPF helper: pick the redirect target for @skb from sockhash @map under
+ * @key. Only BPF_F_INGRESS is accepted in @flags. The flags and the looked-up
+ * socket are stored in the skb's TCP control block; returns SK_PASS when a
+ * socket was found and SK_DROP on bad flags or a missing key.
+ */
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+
+	tcb->bpf.flags = flags;
+	tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
+
+	return tcb->bpf.sk_redir ? SK_PASS : SK_DROP;
+}
+
+/* Helper prototype for bpf_sk_redirect_hash(): integer return; arguments are
+ * (ctx, constant map pointer, pointer to map key, arbitrary flags value).
+ */
+const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+	.func           = bpf_sk_redirect_hash,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_PTR_TO_MAP_KEY,
+	.arg4_type      = ARG_ANYTHING,
+};
+
+/* BPF helper: pick the redirect target for @msg from sockhash @map under
+ * @key. Only BPF_F_INGRESS is accepted in @flags. The flags and the looked-up
+ * socket are recorded in @msg; returns SK_PASS when a socket was found and
+ * SK_DROP on bad flags or a missing key.
+ */
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
+	   struct bpf_map *, map, void *, key, u64, flags)
+{
+	if (unlikely(flags & ~(BPF_F_INGRESS)))
+		return SK_DROP;
+
+	msg->flags = flags;
+	msg->sk_redir = __sock_hash_lookup_elem(map, key);
+
+	return msg->sk_redir ? SK_PASS : SK_DROP;
+}
+
+/* Helper prototype for bpf_msg_redirect_hash(): integer return; arguments are
+ * (ctx, constant map pointer, pointer to map key, arbitrary flags value).
+ */
+const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+	.func           = bpf_msg_redirect_hash,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_CONST_MAP_PTR,
+	.arg3_type      = ARG_PTR_TO_MAP_KEY,
+	.arg4_type      = ARG_ANYTHING,
+};
+
+/* Map operations table for sockhash. Lookup reuses sock_map_lookup(), which
+ * always fails with -EOPNOTSUPP.
+ */
+const struct bpf_map_ops sock_hash_ops = {
+	.map_alloc		= sock_hash_alloc,
+	.map_free		= sock_hash_free,
+	.map_get_next_key	= sock_hash_get_next_key,
+	.map_update_elem	= sock_hash_update_elem,
+	.map_delete_elem	= sock_hash_delete_elem,
+	.map_lookup_elem	= sock_map_lookup,
+	.map_release_uref	= sock_hash_release_progs,
+	.map_check_btf		= map_check_no_btf,
+};
+
+/* Return the program set embedded in @map for the two socket map types, or
+ * NULL for any other map type.
+ */
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
+{
+	if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+		return &container_of(map, struct bpf_stab, map)->progs;
+
+	if (map->map_type == BPF_MAP_TYPE_SOCKHASH)
+		return &container_of(map, struct bpf_htab, map)->progs;
+
+	return NULL;
+}
+
+/* Set program @prog on @map for attach type @which.
+ *
+ * BPF_SK_MSG_VERDICT sets progs->msg_parser, BPF_SK_SKB_STREAM_PARSER sets
+ * progs->skb_parser and BPF_SK_SKB_STREAM_VERDICT sets progs->skb_verdict.
+ * Returns 0 on success and -EOPNOTSUPP when @map is not a socket map or
+ * @which is not one of those three types.
+ */
+int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+			 u32 which)
+{
+	struct sk_psock_progs *progs = sock_map_progs(map);
+
+	if (!progs)
+		return -EOPNOTSUPP;
+
+	if (which == BPF_SK_MSG_VERDICT)
+		psock_set_prog(&progs->msg_parser, prog);
+	else if (which == BPF_SK_SKB_STREAM_PARSER)
+		psock_set_prog(&progs->skb_parser, prog);
+	else if (which == BPF_SK_SKB_STREAM_VERDICT)
+		psock_set_prog(&progs->skb_verdict, prog);
+	else
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
+/* Remove @sk from the map that @link points at, dispatching on the map type
+ * to the sockmap or sockhash delete-from-link routine. Links to any other
+ * map type are ignored.
+ */
+void sk_psock_unlink(struct sock *sk, struct sk_psock_link *link)
+{
+	struct bpf_map *map = link->map;
+
+	switch (map->map_type) {
+	case BPF_MAP_TYPE_SOCKMAP:
+		sock_map_delete_from_link(map, sk, link->link_raw);
+		break;
+	case BPF_MAP_TYPE_SOCKHASH:
+		sock_hash_delete_from_link(map, sk, link->link_raw);
+		break;
+	default:
+		break;
+	}
+}
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7446b98..5862931 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -63,6 +63,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
 obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
new file mode 100644
index 0000000..80debb0
--- /dev/null
+++ b/net/ipv4/tcp_bpf.c
@@ -0,0 +1,655 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+
+#include <net/inet_common.h>
+
+/* stream_memory_read hook: true when the socket's psock has at least one
+ * message queued on ingress_msg. A socket without a psock reports false.
+ */
+static bool tcp_bpf_stream_read(const struct sock *sk)
+{
+	struct sk_psock *psock;
+	bool has_data = false;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (likely(psock))
+		has_data = !list_empty(&psock->ingress_msg);
+	rcu_read_unlock();
+
+	return has_data;
+}
+
+/* Sleep on the socket's wait queue until either the psock ingress queue or
+ * the socket receive queue is non-empty, or @timeo expires.
+ *
+ * Returns the value of sk_wait_event() for that condition. @flags and @err
+ * are accepted but not used by the body; in particular *err is never
+ * written here.
+ */
+static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock,
+			     int flags, long timeo, int *err)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int ret;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	ret = sk_wait_event(sk, &timeo,
+			    !list_empty(&psock->ingress_msg) ||
+			    !skb_queue_empty(&sk->sk_receive_queue), &wait);
+	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+	remove_wait_queue(sk_sleep(sk), &wait);
+	return ret;
+}
+
+/* Copy up to @len bytes from the messages queued on psock->ingress_msg into
+ * the iterator of @msg.
+ *
+ * Messages are consumed front to back; within a message the scatterlist ring
+ * is walked from sg.start towards sg.end. Each copied chunk is uncharged
+ * from @sk. A fully drained message is unlinked and freed.
+ *
+ * Returns the number of bytes copied (possibly 0 when the queue is empty),
+ * or -EFAULT as soon as copy_page_to_iter() copies less than requested -
+ * even if earlier chunks were already copied.
+ */
+int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
+		      struct msghdr *msg, int len)
+{
+	struct iov_iter *iter = &msg->msg_iter;
+	int i, ret, copied = 0;
+
+	while (copied != len) {
+		struct scatterlist *sge;
+		struct sk_msg *msg_rx;
+
+		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
+						  struct sk_msg, list);
+		if (unlikely(!msg_rx))
+			break;
+
+		i = msg_rx->sg.start;
+		do {
+			struct page *page;
+			int copy;
+
+			sge = sk_msg_elem(msg_rx, i);
+			copy = sge->length;
+			page = sg_page(sge);
+			/* Clamp the chunk to what the caller still wants. */
+			if (copied + copy > len)
+				copy = len - copied;
+			ret = copy_page_to_iter(page, sge->offset, copy, iter);
+			if (ret != copy) {
+				/* Remember where we stopped for the next call. */
+				msg_rx->sg.start = i;
+				return -EFAULT;
+			}
+
+			copied += copy;
+			sge->offset += copy;
+			sge->length -= copy;
+			sk_mem_uncharge(sk, copy);
+			if (!sge->length) {
+				/* Element drained: advance, wrapping the ring. */
+				i++;
+				if (i == MAX_SKB_FRAGS)
+					i = 0;
+				/* Page ref is only dropped when no skb backs
+				 * this message.
+				 */
+				if (!msg_rx->skb)
+					put_page(page);
+			}
+
+			if (copied == len)
+				break;
+		} while (i != msg_rx->sg.end);
+
+		msg_rx->sg.start = i;
+		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
+			list_del(&msg_rx->list);
+			if (msg_rx->skb)
+				consume_skb(msg_rx->skb);
+			kfree(msg_rx);
+		}
+	}
+
+	return copied;
+}
+EXPORT_SYMBOL_GPL(__tcp_bpf_recvmsg);
+
+/* recvmsg hook installed on psock-enabled TCP sockets.
+ *
+ * MSG_ERRQUEUE requests go to inet_recv_error(). If the regular receive
+ * queue has data, or the socket has no psock, the call is handed to
+ * tcp_recvmsg(). Otherwise data is taken from the psock ingress queue under
+ * the socket lock; when nothing is queued the caller waits (per the socket's
+ * receive timeout) and then retries the ingress queue or falls back to
+ * tcp_recvmsg() if the receive queue filled up meanwhile.
+ *
+ * Returns the value produced by whichever path handled the request.
+ */
+int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+		    int nonblock, int flags, int *addr_len)
+{
+	struct sk_psock *psock;
+	int copied, ret;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return inet_recv_error(sk, msg, len, addr_len);
+	if (!skb_queue_empty(&sk->sk_receive_queue))
+		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+	lock_sock(sk);
+msg_bytes_ready:
+	copied = __tcp_bpf_recvmsg(sk, psock, msg, len);
+	if (!copied) {
+		int data, err = 0;
+		long timeo;
+
+		timeo = sock_rcvtimeo(sk, nonblock);
+		data = tcp_bpf_wait_data(sk, psock, flags, timeo, &err);
+		if (data) {
+			/* Woken for psock data: retry the ingress queue. */
+			if (skb_queue_empty(&sk->sk_receive_queue))
+				goto msg_bytes_ready;
+			/* Woken for regular TCP data: use the stock path. */
+			release_sock(sk);
+			sk_psock_put(sk, psock);
+			return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len);
+		}
+		/* NOTE(review): tcp_bpf_wait_data() as written never stores
+		 * to err, so this branch cannot trigger and a wait that ends
+		 * without data returns 0 - confirm that is intended.
+		 */
+		if (err) {
+			ret = err;
+			goto out;
+		}
+	}
+	ret = copied;
+out:
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+	return ret;
+}
+
+/* Move data from @msg onto the ingress queue of @sk's @psock.
+ *
+ * A new sk_msg is allocated and, under the socket lock, scatterlist elements
+ * are transferred from @msg one by one. With a non-zero @apply_bytes at most
+ * that many bytes are moved; with zero, elements are moved until sg.end.
+ * Each transferred chunk is charged to @sk. On success the new message is
+ * queued and sk_data_ready() is invoked.
+ *
+ * Returns 0 on success (including a partial transfer), or -ENOMEM when the
+ * message cannot be allocated or not even the first chunk could be scheduled.
+ * @flags is not used by the body.
+ */
+static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
+			   struct sk_msg *msg, u32 apply_bytes, int flags)
+{
+	bool apply = apply_bytes;
+	struct scatterlist *sge;
+	u32 size, copied = 0;
+	struct sk_msg *tmp;
+	int i, ret = 0;
+
+	tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL);
+	if (unlikely(!tmp))
+		return -ENOMEM;
+
+	lock_sock(sk);
+	tmp->sg.start = msg->sg.start;
+	i = msg->sg.start;
+	do {
+		sge = sk_msg_elem(msg, i);
+		/* Take the whole element unless the apply budget is smaller. */
+		size = (apply && apply_bytes < sge->length) ?
+			apply_bytes : sge->length;
+		if (!sk_wmem_schedule(sk, size)) {
+			/* Only an error if nothing at all was transferred. */
+			if (!copied)
+				ret = -ENOMEM;
+			break;
+		}
+
+		sk_mem_charge(sk, size);
+		sk_msg_xfer(tmp, msg, i, size);
+		copied += size;
+		/* Source element still has bytes left: page is now shared. */
+		if (sge->length)
+			get_page(sk_msg_page(tmp, i));
+		sk_msg_iter_var_next(i);
+		tmp->sg.end = i;
+		if (apply) {
+			apply_bytes -= size;
+			if (!apply_bytes)
+				break;
+		}
+	} while (i != msg->sg.end);
+
+	if (!ret) {
+		msg->sg.start = i;
+		/* NOTE(review): apply_bytes has been decremented by the loop,
+		 * so this subtracts the remaining apply budget rather than
+		 * the bytes actually moved (copied). Whether that is intended
+		 * depends on sk_msg_xfer()'s own size accounting, which is
+		 * not visible here - confirm.
+		 */
+		msg->sg.size -= apply_bytes;
+		sk_psock_queue_msg(psock, tmp);
+		sk->sk_data_ready(sk);
+	} else {
+		sk_msg_free(sk, tmp);
+		kfree(tmp);
+	}
+
+	release_sock(sk);
+	return ret;
+}
+
+/* Transmit data from @msg on @sk via do_tcp_sendpages().
+ *
+ * Elements are sent starting at msg->sg.start. With a non-zero @apply_bytes
+ * sending stops once that many bytes went out; otherwise it stops when the
+ * ring is drained (sg.start reaches sg.end). Short sends are retried for the
+ * remainder of the current element. When @uncharge is set, every sent chunk
+ * is uncharged from @sk.
+ *
+ * Returns 0 when done, or the non-positive value returned by
+ * do_tcp_sendpages() as soon as a send makes no progress or fails.
+ */
+static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
+			int flags, bool uncharge)
+{
+	bool apply = apply_bytes;
+	struct scatterlist *sge;
+	struct page *page;
+	int size, ret = 0;
+	u32 off;
+
+	while (1) {
+		sge = sk_msg_elem(msg, msg->sg.start);
+		size = (apply && apply_bytes < sge->length) ?
+			apply_bytes : sge->length;
+		off  = sge->offset;
+		page = sg_page(sge);
+
+		tcp_rate_check_app_limited(sk);
+retry:
+		ret = do_tcp_sendpages(sk, page, off, size, flags);
+		if (ret <= 0)
+			return ret;
+		if (apply)
+			apply_bytes -= ret;
+		msg->sg.size -= ret;
+		sge->offset += ret;
+		sge->length -= ret;
+		if (uncharge)
+			sk_mem_uncharge(sk, ret);
+		if (ret != size) {
+			/* Short send: push the rest of this chunk. */
+			size -= ret;
+			off  += ret;
+			goto retry;
+		}
+		if (!sge->length) {
+			/* Element fully sent: release it and advance start. */
+			put_page(page);
+			sk_msg_iter_next(msg, start);
+			sg_init_table(sge, 1);
+			if (msg->sg.start == msg->sg.end)
+				break;
+		}
+		if (apply && !apply_bytes)
+			break;
+	}
+
+	return 0;
+}
+
+/* tcp_bpf_push() wrapped in lock_sock()/release_sock() for callers that do
+ * not already own the socket lock. Returns tcp_bpf_push()'s result.
+ */
+static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
+			       u32 apply_bytes, int flags, bool uncharge)
+{
+	int ret;
+
+	lock_sock(sk);
+	ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
+	release_sock(sk);
+	return ret;
+}
+
+/* Deliver @bytes of @msg to the redirect target @sk.
+ *
+ * If @sk has no psock the message is freed and 0 is returned. Otherwise the
+ * data goes to the target's ingress queue when the message is marked for
+ * ingress, or out through the target's send path (without uncharging) when
+ * it is not. Returns the result of the chosen path.
+ */
+int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
+			  u32 bytes, int flags)
+{
+	bool ingress = sk_msg_to_ingress(msg);
+	struct sk_psock *psock = sk_psock_get(sk);
+	int ret;
+
+	if (unlikely(!psock)) {
+		sk_msg_free(sk, msg);
+		return 0;
+	}
+
+	if (ingress)
+		ret = bpf_tcp_ingress(sk, psock, msg, bytes, flags);
+	else
+		ret = tcp_bpf_push_locked(sk, msg, bytes, flags, false);
+
+	sk_psock_put(sk, psock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
+
+/* Run the msg verdict for @msg (if none is cached) and act on it.
+ *
+ * - If the program asked to cork more bytes than @msg holds and the ring is
+ *   not out of space, @msg is copied into psock->cork and 0 is returned.
+ * - __SK_PASS: the data is sent on @sk via tcp_bpf_push().
+ * - __SK_REDIRECT: the data is handed to psock->sk_redir with the socket
+ *   lock of @sk dropped around the call.
+ * - __SK_DROP / anything else: the bytes are freed and -EACCES is returned.
+ *
+ * The amount handled per round is the message size capped by
+ * psock->apply_bytes; the function loops while @msg still has data. @copied
+ * is reduced by bytes that were freed instead of delivered.
+ *
+ * Returns 0 on success or a negative error.
+ */
+static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
+				struct sk_msg *msg, int *copied, int flags)
+{
+	bool cork = false, enospc = msg->sg.start == msg->sg.end;
+	struct sock *sk_redir;
+	u32 tosend;
+	int ret;
+
+more_data:
+	/* Only run the program when no verdict is pending from before. */
+	if (psock->eval == __SK_NONE)
+		psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+
+	if (msg->cork_bytes &&
+	    msg->cork_bytes > msg->sg.size && !enospc) {
+		psock->cork_bytes = msg->cork_bytes - msg->sg.size;
+		if (!psock->cork) {
+			psock->cork = kzalloc(sizeof(*psock->cork),
+					      GFP_ATOMIC | __GFP_NOWARN);
+			if (!psock->cork)
+				return -ENOMEM;
+		}
+		memcpy(psock->cork, msg, sizeof(*msg));
+		return 0;
+	}
+
+	tosend = msg->sg.size;
+	if (psock->apply_bytes && psock->apply_bytes < tosend)
+		tosend = psock->apply_bytes;
+
+	switch (psock->eval) {
+	case __SK_PASS:
+		ret = tcp_bpf_push(sk, msg, tosend, flags, true);
+		if (unlikely(ret)) {
+			*copied -= sk_msg_free(sk, msg);
+			break;
+		}
+		sk_msg_apply_bytes(psock, tosend);
+		break;
+	case __SK_REDIRECT:
+		sk_redir = psock->sk_redir;
+		sk_msg_apply_bytes(psock, tosend);
+		/* Detach the cork buffer; it is freed below once sent. */
+		if (psock->cork) {
+			cork = true;
+			psock->cork = NULL;
+		}
+		sk_msg_return(sk, msg, tosend);
+		/* The redirect path takes the target's lock; drop ours. */
+		release_sock(sk);
+		ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
+		lock_sock(sk);
+		if (unlikely(ret < 0)) {
+			int free = sk_msg_free_nocharge(sk, msg);
+
+			if (!cork)
+				*copied -= free;
+		}
+		if (cork) {
+			sk_msg_free(sk, msg);
+			kfree(msg);
+			msg = NULL;
+			ret = 0;
+		}
+		break;
+	case __SK_DROP:
+	default:
+		sk_msg_free_partial(sk, msg, tosend);
+		sk_msg_apply_bytes(psock, tosend);
+		*copied -= tosend;
+		return -EACCES;
+	}
+
+	if (likely(!ret)) {
+		/* Apply window used up: forget the verdict and the target. */
+		if (!psock->apply_bytes) {
+			psock->eval =  __SK_NONE;
+			if (psock->sk_redir) {
+				sock_put(psock->sk_redir);
+				psock->sk_redir = NULL;
+			}
+		}
+		/* More data left at the ring start: run another round. */
+		if (msg &&
+		    msg->sg.data[msg->sg.start].page_link &&
+		    msg->sg.data[msg->sg.start].length)
+			goto more_data;
+	}
+	return ret;
+}
+
+/* sendmsg hook for sockets with a msg verdict program attached.
+ *
+ * Without a psock the call is forwarded to tcp_sendmsg(). Otherwise, under
+ * the socket lock, user data is copied chunk by chunk into an sk_msg (the
+ * pending cork buffer if one exists, else a stack-local one) and each chunk
+ * is passed to tcp_bpf_send_verdict(). While cork bytes are outstanding the
+ * verdict is deferred until they are accounted for or the ring is full.
+ *
+ * Returns the number of bytes accepted if any, otherwise the error (mapped
+ * through sk_stream_error() when negative).
+ */
+static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	struct sk_msg tmp, *msg_tx = NULL;
+	int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS;
+	int copied = 0, err = 0;
+	struct sk_psock *psock;
+	long timeo;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return tcp_sendmsg(sk, msg, size);
+
+	lock_sock(sk);
+	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+	while (msg_data_left(msg)) {
+		bool enospc = false;
+		u32 copy, osize;
+
+		if (sk->sk_err) {
+			err = -sk->sk_err;
+			goto out_err;
+		}
+
+		copy = msg_data_left(msg);
+		if (!sk_stream_memory_free(sk))
+			goto wait_for_sndbuf;
+		/* Continue filling a corked message if there is one. */
+		if (psock->cork) {
+			msg_tx = psock->cork;
+		} else {
+			msg_tx = &tmp;
+			sk_msg_init(msg_tx);
+		}
+
+		osize = msg_tx->sg.size;
+		err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1);
+		if (err) {
+			if (err != -ENOSPC)
+				goto wait_for_memory;
+			/* Ring full: copy only what was actually allocated. */
+			enospc = true;
+			copy = msg_tx->sg.size - osize;
+		}
+
+		err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
+					       copy);
+		if (err < 0) {
+			/* Roll the message back to its size before this chunk. */
+			sk_msg_trim(sk, msg_tx, osize);
+			goto out_err;
+		}
+
+		copied += copy;
+		if (psock->cork_bytes) {
+			if (size > psock->cork_bytes)
+				psock->cork_bytes = 0;
+			else
+				psock->cork_bytes -= size;
+			if (psock->cork_bytes && !enospc)
+				goto out_err;
+			/* All cork bytes are accounted, rerun the prog. */
+			psock->eval = __SK_NONE;
+			psock->cork_bytes = 0;
+		}
+
+		err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
+		if (unlikely(err < 0))
+			goto out_err;
+		continue;
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err) {
+			/* Never free the cork buffer here; psock owns it. */
+			if (msg_tx && msg_tx != psock->cork)
+				sk_msg_free(sk, msg_tx);
+			goto out_err;
+		}
+	}
+out_err:
+	if (err < 0)
+		err = sk_stream_error(sk, msg->msg_flags, err);
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+	return copied ? copied : err;
+}
+
+/* sendpage hook for sockets with a msg verdict program attached.
+ *
+ * Without a psock the call is forwarded to tcp_sendpage(). Otherwise the
+ * page is appended to an sk_msg (the pending cork buffer if one exists, else
+ * a stack-local one), charged to @sk, and passed to tcp_bpf_send_verdict()
+ * unless cork bytes are still outstanding and the ring has room.
+ *
+ * Returns the number of bytes accepted if non-zero, otherwise the error
+ * value (0 when the ring was already full on entry).
+ */
+static int tcp_bpf_sendpage(struct sock *sk, struct page *page, int offset,
+			    size_t size, int flags)
+{
+	struct sk_msg tmp, *msg = NULL;
+	int err = 0, copied = 0;
+	struct sk_psock *psock;
+	bool enospc = false;
+
+	psock = sk_psock_get(sk);
+	if (unlikely(!psock))
+		return tcp_sendpage(sk, page, offset, size, flags);
+
+	lock_sock(sk);
+	if (psock->cork) {
+		msg = psock->cork;
+	} else {
+		msg = &tmp;
+		sk_msg_init(msg);
+	}
+
+	/* Catch case where ring is full and sendpage is stalled. */
+	if (unlikely(sk_msg_full(msg)))
+		goto out_err;
+
+	sk_msg_page_add(msg, page, size, offset);
+	sk_mem_charge(sk, size);
+	copied = size;
+	if (sk_msg_full(msg))
+		enospc = true;
+	if (psock->cork_bytes) {
+		if (size > psock->cork_bytes)
+			psock->cork_bytes = 0;
+		else
+			psock->cork_bytes -= size;
+		/* Still corking and room left: wait for more data. */
+		if (psock->cork_bytes && !enospc)
+			goto out_err;
+		/* All cork bytes are accounted, rerun the prog. */
+		psock->eval = __SK_NONE;
+		psock->cork_bytes = 0;
+	}
+
+	err = tcp_bpf_send_verdict(sk, psock, msg, &copied, flags);
+out_err:
+	release_sock(sk);
+	sk_psock_put(sk, psock);
+	return copied ? copied : err;
+}
+
+/* Detach @sk from everything its @psock is tied to: free the cork buffer,
+ * purge queued ingress messages, then pop every map link, unlink the socket
+ * from that map and free the link.
+ */
+static void tcp_bpf_remove(struct sock *sk, struct sk_psock *psock)
+{
+	struct sk_psock_link *link;
+
+	sk_psock_cork_free(psock);
+	__sk_psock_purge_ingress_msg(psock);
+
+	for (link = sk_psock_link_pop(psock); link;
+	     link = sk_psock_link_pop(psock)) {
+		sk_psock_unlink(sk, link);
+		sk_psock_free_link(link);
+	}
+}
+
+/* unhash hook: tear down the psock's map links, then chain to the unhash
+ * callback saved in the psock. If the socket no longer has a psock, the
+ * current sk_prot's unhash (when set) is called instead.
+ */
+static void tcp_bpf_unhash(struct sock *sk)
+{
+	void (*saved_unhash)(struct sock *sk);
+	struct sk_psock *psock;
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (likely(psock)) {
+		saved_unhash = psock->saved_unhash;
+		tcp_bpf_remove(sk, psock);
+		rcu_read_unlock();
+		saved_unhash(sk);
+		return;
+	}
+
+	rcu_read_unlock();
+	if (sk->sk_prot->unhash)
+		sk->sk_prot->unhash(sk);
+}
+
+/* close hook: with the socket lock held, tear down the psock's map links,
+ * then chain to the close callback saved in the psock. If the socket no
+ * longer has a psock, the current sk_prot's close is called instead. In both
+ * cases the socket lock is released before chaining.
+ */
+static void tcp_bpf_close(struct sock *sk, long timeout)
+{
+	void (*saved_close)(struct sock *sk, long timeout);
+	struct sk_psock *psock;
+
+	lock_sock(sk);
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (likely(psock)) {
+		saved_close = psock->saved_close;
+		tcp_bpf_remove(sk, psock);
+		rcu_read_unlock();
+		release_sock(sk);
+		saved_close(sk, timeout);
+		return;
+	}
+
+	rcu_read_unlock();
+	release_sock(sk);
+	sk->sk_prot->close(sk, timeout);
+}
+
+/* First index of tcp_bpf_prots[][]: the socket's address family. */
+enum {
+	TCP_BPF_IPV4,
+	TCP_BPF_IPV6,
+	TCP_BPF_NUM_PROTS,
+};
+
+/* Second index of tcp_bpf_prots[][]: BASE overrides the receive/teardown
+ * hooks only, TX additionally overrides sendmsg/sendpage.
+ */
+enum {
+	TCP_BPF_BASE,
+	TCP_BPF_TX,
+	TCP_BPF_NUM_CFGS,
+};
+
+/* Base proto the IPv6 entries of tcp_bpf_prots were last built from. */
+static struct proto *tcpv6_prot_saved __read_mostly;
+/* Serialises rebuilding of the IPv6 proto entries. */
+static DEFINE_SPINLOCK(tcpv6_prot_lock);
+/* Overridden proto tables, indexed by [address family][configuration]. */
+static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
+
+/* Fill the per-family proto set @prot from @base.
+ *
+ * The BASE variant is a copy of @base with unhash, close, recvmsg and
+ * stream_memory_read replaced by the tcp_bpf_* hooks. The TX variant is a
+ * copy of BASE that additionally replaces sendmsg and sendpage.
+ */
+static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
+				   struct proto *base)
+{
+	prot[TCP_BPF_BASE]			= *base;
+	prot[TCP_BPF_BASE].unhash		= tcp_bpf_unhash;
+	prot[TCP_BPF_BASE].close		= tcp_bpf_close;
+	prot[TCP_BPF_BASE].recvmsg		= tcp_bpf_recvmsg;
+	prot[TCP_BPF_BASE].stream_memory_read	= tcp_bpf_stream_read;
+
+	prot[TCP_BPF_TX]			= prot[TCP_BPF_BASE];
+	prot[TCP_BPF_TX].sendmsg		= tcp_bpf_sendmsg;
+	prot[TCP_BPF_TX].sendpage		= tcp_bpf_sendpage;
+}
+
+/* For AF_INET6 sockets, make sure the IPv6 entries of tcp_bpf_prots were
+ * built from @ops. The saved base pointer is compared without the lock
+ * first and again under tcpv6_prot_lock; only if it still differs are the
+ * entries rebuilt and the saved pointer updated (store-release, paired with
+ * the load-acquire of the first check). No-op for other families.
+ */
+static void tcp_bpf_check_v6_needs_rebuild(struct sock *sk, struct proto *ops)
+{
+	if (sk->sk_family == AF_INET6 &&
+	    unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
+		spin_lock_bh(&tcpv6_prot_lock);
+		if (likely(ops != tcpv6_prot_saved)) {
+			tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
+			smp_store_release(&tcpv6_prot_saved, ops);
+		}
+		spin_unlock_bh(&tcpv6_prot_lock);
+	}
+}
+
+/* Build the IPv4 entries of tcp_bpf_prots from tcp_prot once, registered as
+ * a core_initcall. Always returns 0.
+ */
+static int __init tcp_bpf_v4_build_proto(void)
+{
+	tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot);
+	return 0;
+}
+core_initcall(tcp_bpf_v4_build_proto);
+
+/* Pick the overridden proto matching @sk's address family and whether the
+ * psock has a msg_parser program (TX variant) or not (BASE variant), and
+ * install it through sk_psock_update_proto().
+ */
+static void tcp_bpf_update_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+	int family = TCP_BPF_IPV4;
+	int config = TCP_BPF_BASE;
+
+	if (sk->sk_family == AF_INET6)
+		family = TCP_BPF_IPV6;
+	if (psock->progs.msg_parser)
+		config = TCP_BPF_TX;
+
+	sk_psock_update_proto(sk, psock, &tcp_bpf_prots[family][config]);
+}
+
+/* Re-select the overridden proto for @sk after its program set changed
+ * (e.g. a TX program was added or removed). Only sk->sk_prot is switched
+ * here; the hooks originally saved in the psock are left as they are.
+ */
+static void tcp_bpf_reinit_sk_prot(struct sock *sk, struct sk_psock *psock)
+{
+	int family = TCP_BPF_IPV4;
+	int config = TCP_BPF_BASE;
+
+	if (sk->sk_family == AF_INET6)
+		family = TCP_BPF_IPV6;
+	if (psock->progs.msg_parser)
+		config = TCP_BPF_TX;
+
+	sk->sk_prot = &tcp_bpf_prots[family][config];
+}
+
+/* The hooks in this file call tcp_recvmsg/tcp_sendmsg/tcp_sendpage directly
+ * (avoiding an indirect call) when no psock is present. Verify that @ops
+ * really uses those three functions. Returns 0 if so, -ENOTSUPP otherwise.
+ */
+static int tcp_bpf_assert_proto_ops(struct proto *ops)
+{
+	if (ops->recvmsg != tcp_recvmsg)
+		return -ENOTSUPP;
+	if (ops->sendmsg != tcp_sendmsg)
+		return -ENOTSUPP;
+	if (ops->sendpage != tcp_sendpage)
+		return -ENOTSUPP;
+
+	return 0;
+}
+
+/* Re-select @sk's overridden proto from its psock's current program set.
+ * The caller must own the socket lock (checked by sock_owned_by_me()).
+ *
+ * NOTE(review): the psock returned by sk_psock() is passed on without a
+ * NULL check and is dereferenced by tcp_bpf_reinit_sk_prot() - presumably
+ * callers guarantee the socket has a psock; confirm.
+ */
+void tcp_bpf_reinit(struct sock *sk)
+{
+	struct sk_psock *psock;
+
+	sock_owned_by_me(sk);
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	tcp_bpf_reinit_sk_prot(sk, psock);
+	rcu_read_unlock();
+}
+
+/* Switch @sk over to the tcp_bpf proto overrides for the first time.
+ * The caller must own the socket lock (checked by sock_owned_by_me()).
+ *
+ * Fails with -EINVAL when the socket has no psock, when the psock already
+ * has a saved proto (sk_proto set), or when the socket's current proto does
+ * not use the stock TCP recvmsg/sendmsg/sendpage. Returns 0 on success.
+ */
+int tcp_bpf_init(struct sock *sk)
+{
+	struct proto *ops = READ_ONCE(sk->sk_prot);
+	struct sk_psock *psock;
+
+	sock_owned_by_me(sk);
+
+	rcu_read_lock();
+	psock = sk_psock(sk);
+	if (unlikely(!psock || psock->sk_proto ||
+		     tcp_bpf_assert_proto_ops(ops))) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+	/* IPv6 proto entries are built lazily from the socket's own proto. */
+	tcp_bpf_check_v6_needs_rebuild(sk, ops);
+	tcp_bpf_update_sk_prot(sk, psock);
+	rcu_read_unlock();
+	return 0;
+}
diff --git a/net/strparser/Kconfig b/net/strparser/Kconfig
index 6cff3f6..94da19a 100644
--- a/net/strparser/Kconfig
+++ b/net/strparser/Kconfig
@@ -1,4 +1,2 @@
-
 config STREAM_PARSER
-	tristate
-	default n
+	def_bool n
-- 
2.9.5

^ permalink raw reply related

* [PATCH net] ipv6: mcast: fix a use-after-free in inet6_mc_check
From: Eric Dumazet @ 2018-10-13  1:58 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet

syzbot found a use-after-free in inet6_mc_check [1]

The problem here is that inet6_mc_check() uses rcu
and read_lock(&iml->sflock)

So the fact that ip6_mc_leave_src() is called under RTNL
and the socket lock does not help us, we need to acquire
iml->sflock in write mode.

In the future, we should convert all this stuff to RCU.

[1]
BUG: KASAN: use-after-free in ipv6_addr_equal include/net/ipv6.h:521 [inline]
BUG: KASAN: use-after-free in inet6_mc_check+0xae7/0xb40 net/ipv6/mcast.c:649
Read of size 8 at addr ffff8801ce7f2510 by task syz-executor0/22432

CPU: 1 PID: 22432 Comm: syz-executor0 Not tainted 4.19.0-rc7+ #280
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c4/0x2b4 lib/dump_stack.c:113
 print_address_description.cold.8+0x9/0x1ff mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.9+0x242/0x309 mm/kasan/report.c:412
 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:433
 ipv6_addr_equal include/net/ipv6.h:521 [inline]
 inet6_mc_check+0xae7/0xb40 net/ipv6/mcast.c:649
 __raw_v6_lookup+0x320/0x3f0 net/ipv6/raw.c:98
 ipv6_raw_deliver net/ipv6/raw.c:183 [inline]
 raw6_local_deliver+0x3d3/0xcb0 net/ipv6/raw.c:240
 ip6_input_finish+0x467/0x1aa0 net/ipv6/ip6_input.c:345
 NF_HOOK include/linux/netfilter.h:289 [inline]
 ip6_input+0xe9/0x600 net/ipv6/ip6_input.c:426
 ip6_mc_input+0x48a/0xd20 net/ipv6/ip6_input.c:503
 dst_input include/net/dst.h:450 [inline]
 ip6_rcv_finish+0x17a/0x330 net/ipv6/ip6_input.c:76
 NF_HOOK include/linux/netfilter.h:289 [inline]
 ipv6_rcv+0x120/0x640 net/ipv6/ip6_input.c:271
 __netif_receive_skb_one_core+0x14d/0x200 net/core/dev.c:4913
 __netif_receive_skb+0x2c/0x1e0 net/core/dev.c:5023
 netif_receive_skb_internal+0x12c/0x620 net/core/dev.c:5126
 napi_frags_finish net/core/dev.c:5664 [inline]
 napi_gro_frags+0x75a/0xc90 net/core/dev.c:5737
 tun_get_user+0x3189/0x4250 drivers/net/tun.c:1923
 tun_chr_write_iter+0xb9/0x154 drivers/net/tun.c:1968
 call_write_iter include/linux/fs.h:1808 [inline]
 do_iter_readv_writev+0x8b0/0xa80 fs/read_write.c:680
 do_iter_write+0x185/0x5f0 fs/read_write.c:959
 vfs_writev+0x1f1/0x360 fs/read_write.c:1004
 do_writev+0x11a/0x310 fs/read_write.c:1039
 __do_sys_writev fs/read_write.c:1112 [inline]
 __se_sys_writev fs/read_write.c:1109 [inline]
 __x64_sys_writev+0x75/0xb0 fs/read_write.c:1109
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x457421
Code: 75 14 b8 14 00 00 00 0f 05 48 3d 01 f0 ff ff 0f 83 34 b5 fb ff c3 48 83 ec 08 e8 1a 2d 00 00 48 89 04 24 b8 14 00 00 00 0f 05 <48> 8b 3c 24 48 89 c2 e8 63 2d 00 00 48 89 d0 48 83 c4 08 48 3d 01
RSP: 002b:00007f2d30ecaba0 EFLAGS: 00000293 ORIG_RAX: 0000000000000014
RAX: ffffffffffffffda RBX: 000000000000003e RCX: 0000000000457421
RDX: 0000000000000001 RSI: 00007f2d30ecabf0 RDI: 00000000000000f0
RBP: 0000000020000500 R08: 00000000000000f0 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000293 R12: 00007f2d30ecb6d4
R13: 00000000004c4890 R14: 00000000004d7b90 R15: 00000000ffffffff

Allocated by task 22437:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xc7/0xe0 mm/kasan/kasan.c:553
 __do_kmalloc mm/slab.c:3718 [inline]
 __kmalloc+0x14e/0x760 mm/slab.c:3727
 kmalloc include/linux/slab.h:518 [inline]
 sock_kmalloc+0x15a/0x1f0 net/core/sock.c:1983
 ip6_mc_source+0x14dd/0x1960 net/ipv6/mcast.c:427
 do_ipv6_setsockopt.isra.9+0x3afb/0x45d0 net/ipv6/ipv6_sockglue.c:743
 ipv6_setsockopt+0xbd/0x170 net/ipv6/ipv6_sockglue.c:933
 rawv6_setsockopt+0x59/0x140 net/ipv6/raw.c:1069
 sock_common_setsockopt+0x9a/0xe0 net/core/sock.c:3038
 __sys_setsockopt+0x1ba/0x3c0 net/socket.c:1902
 __do_sys_setsockopt net/socket.c:1913 [inline]
 __se_sys_setsockopt net/socket.c:1910 [inline]
 __x64_sys_setsockopt+0xbe/0x150 net/socket.c:1910
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 22430:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x102/0x150 mm/kasan/kasan.c:521
 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
 __cache_free mm/slab.c:3498 [inline]
 kfree+0xcf/0x230 mm/slab.c:3813
 __sock_kfree_s net/core/sock.c:2004 [inline]
 sock_kfree_s+0x29/0x60 net/core/sock.c:2010
 ip6_mc_leave_src+0x11a/0x1d0 net/ipv6/mcast.c:2448
 __ipv6_sock_mc_close+0x20b/0x4e0 net/ipv6/mcast.c:310
 ipv6_sock_mc_close+0x158/0x1d0 net/ipv6/mcast.c:328
 inet6_release+0x40/0x70 net/ipv6/af_inet6.c:452
 __sock_release+0xd7/0x250 net/socket.c:579
 sock_close+0x19/0x20 net/socket.c:1141
 __fput+0x385/0xa30 fs/file_table.c:278
 ____fput+0x15/0x20 fs/file_table.c:309
 task_work_run+0x1e8/0x2a0 kernel/task_work.c:113
 tracehook_notify_resume include/linux/tracehook.h:193 [inline]
 exit_to_usermode_loop+0x318/0x380 arch/x86/entry/common.c:166
 prepare_exit_to_usermode arch/x86/entry/common.c:197 [inline]
 syscall_return_slowpath arch/x86/entry/common.c:268 [inline]
 do_syscall_64+0x6be/0x820 arch/x86/entry/common.c:293
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at ffff8801ce7f2500
 which belongs to the cache kmalloc-192 of size 192
The buggy address is located 16 bytes inside of
 192-byte region [ffff8801ce7f2500, ffff8801ce7f25c0)
The buggy address belongs to the page:
page:ffffea000739fc80 count:1 mapcount:0 mapping:ffff8801da800040 index:0x0
flags: 0x2fffc0000000100(slab)
raw: 02fffc0000000100 ffffea0006f6e548 ffffea000737b948 ffff8801da800040
raw: 0000000000000000 ffff8801ce7f2000 0000000100000010 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 ffff8801ce7f2400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff8801ce7f2480: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
>ffff8801ce7f2500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                         ^
 ffff8801ce7f2580: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
 ffff8801ce7f2600: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
 net/ipv6/mcast.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 4ae54aaca3736d168cceb0cefd254727486f8048..dbab62e3f0d78ab6ab996cb70627f675bb42e487 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -2436,17 +2436,17 @@ static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
 {
 	int err;
 
-	/* callers have the socket lock and rtnl lock
-	 * so no other readers or writers of iml or its sflist
-	 */
+	write_lock_bh(&iml->sflock);
 	if (!iml->sflist) {
 		/* any-source empty exclude case */
-		return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
+		err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
+	} else {
+		err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
+				iml->sflist->sl_count, iml->sflist->sl_addr, 0);
+		sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max));
+		iml->sflist = NULL;
 	}
-	err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
-		iml->sflist->sl_count, iml->sflist->sl_addr, 0);
-	sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max));
-	iml->sflist = NULL;
+	write_unlock_bh(&iml->sflock);
 	return err;
 }
 
-- 
2.19.0.605.g01d371f741-goog

^ permalink raw reply related

* Re: [net-next,v2,2/4] net/smc: ipv6 support for smc_diag.c
From: Eugene Syromiatnikov @ 2018-10-13  3:45 UTC (permalink / raw)
  To: Ursula Braun
  Cc: davem, netdev, linux-s390, schwidefsky, heiko.carstens, raspl,
	kgraul
In-Reply-To: <3cb67163-53c6-dd63-71b6-d34581918708@linux.ibm.com>

On Tue, Oct 09, 2018 at 04:41:43PM +0200, Ursula Braun wrote:
> Eugene,
> 
> we are considering the following patch:
> 
> ---
>  net/smc/smc_diag.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
> index dbf64a93d68a..371b4cf31fcd 100644
> --- a/net/smc/smc_diag.c
> +++ b/net/smc/smc_diag.c
> @@ -38,6 +38,7 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
>  {
>  	struct smc_sock *smc = smc_sk(sk);
>  
> +	r->diag_family = sk->sk_family;
>  	if (!smc->clcsock)
>  		return;
>  	r->id.idiag_sport = htons(smc->clcsock->sk->sk_num);
> @@ -45,14 +46,12 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk)
>  	r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if;
>  	sock_diag_save_cookie(sk, r->id.idiag_cookie);
>  	if (sk->sk_protocol == SMCPROTO_SMC) {
> -		r->diag_family = PF_INET;
>  		memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
>  		memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
>  		r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr;
>  		r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr;
>  #if IS_ENABLED(CONFIG_IPV6)
>  	} else if (sk->sk_protocol == SMCPROTO_SMC6) {
> -		r->diag_family = PF_INET6;
>  		memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr,
>  		       sizeof(smc->clcsock->sk->sk_v6_rcv_saddr));
>  		memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr,
> --
> 
> Tools would then need to derive the PF_INET/PF_INET6 info from the inet_diag_sockid info
> in the smc_diag_msg.

Unfortunately, struct inet_diag_sockid doesn't provide any clue
in order to derive address family.

There's unused (if I haven't missed anything) diag_shutdown field
in struct smc_diag_msg (it looks like the shutdown state is
reported via a separate attribute in the current implementation),
it probably might be repurposed for providing information about
the underlying socket address family.

> However, this problem is in the mainline kernel since 4.18. So, we are not sure if we are allowed
> to change the user interface again.

Well, anything is better than the current state.

^ permalink raw reply

* Re: [PATCH] rtlwifi: rtl8821ae: add in a missing break in switch statement
From: Kalle Valo @ 2018-10-13 11:59 UTC (permalink / raw)
  To: Colin King
  Cc: Ping-Ke Shih, David S . Miller, Larry Finger, Tsang-Shian Lin,
	linux-wireless, netdev, kernel-janitors, linux-kernel
In-Reply-To: <20181006184246.29985-1-colin.king@canonical.com>

Colin King <colin.king@canonical.com> wrote:

> From: Colin Ian King <colin.king@canonical.com>
> 
> The switch case RATR_INX_WIRELESS_MC has a missing break, this seems
> to be unintentional as the setting of variable ret gets overwritten
> when the case falls through to the following RATR_INX_WIRELESS_AC_5N
> case.  Fix this by adding in the missing break.
> 
> Detected by CoverityScan, CID#1167237 ("Missing break in switch")
> 
> Fixes: 3c05bedb5fef ("Staging: rtl8812ae: Add Realtek 8821 PCI WIFI driver")
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Dropping this patch per discussion.

Patch set to Changes Requested.

-- 
https://patchwork.kernel.org/patch/10629291/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH] rtlwifi: rtl8821ae: replace _rtl8821ae_mrate_idx_to_arfr_id with generic version
From: Kalle Valo @ 2018-10-13 12:01 UTC (permalink / raw)
  To: Colin King
  Cc: Ping-Ke Shih, David S . Miller, Larry Finger, linux-wireless,
	netdev, kernel-janitors, linux-kernel
In-Reply-To: <20181008085028.23874-1-colin.king@canonical.com>

Colin King <colin.king@canonical.com> wrote:

> From: Colin Ian King <colin.king@canonical.com>
> 
> Function _rtl8821ae_mrate_idx_to_arfr_id is functionally identical to
> the generic version rtl_mrate_idx_to_arfr_id, so remove 
> _rtl8821ae_mrate_idx_to_arfr_id and use the generic one instead.
> 
> This also fixes a missing break statement found by CoverityScan in
> _rtl8821ae_mrate_idx_to_arfr_id, namely: CID#1167237 ("Missing break
> in switch")
> 
> Thanks to Joe Perches for spotting this when I submitted an earlier patch.
> 
> Fixes: 3c05bedb5fef ("Staging: rtl8812ae: Add Realtek 8821 PCI WIFI driver")
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
> ACKed-by: Larry Finger <Larry.Finger@lwfinger.net>

Patch applied to wireless-drivers-next.git, thanks.

c894696188d5 rtlwifi: rtl8821ae: replace _rtl8821ae_mrate_idx_to_arfr_id with generic version

-- 
https://patchwork.kernel.org/patch/10630249/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH bpf-next] tools: bpftool: add map create command
From: Alexei Starovoitov @ 2018-10-13  6:16 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: daniel, netdev, oss-drivers
In-Reply-To: <20181012180614.22611-1-jakub.kicinski@netronome.com>

On Fri, Oct 12, 2018 at 11:06:14AM -0700, Jakub Kicinski wrote:
> Add a way of creating maps from user space.  The command takes
> as parameters most of the attributes of the map creation system
> call command.  After map is created it's pinned to bpffs.  This makes
> it possible to easily and dynamically (without rebuilding programs)
> test various corner cases related to map creation.
> 
> Map type names are taken from bpftool's array used for printing.
> In general these days we try to make use of libbpf type names, but
> there are no map type names in libbpf as of today.
> 
> As with most features I add the motivation is testing (offloads) :)
> 
> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
...
>  	fprintf(stderr,
>  		"Usage: %s %s { show | list }   [MAP]\n"
> +		"       %s %s create     FILE type TYPE key KEY_SIZE value VALUE_SIZE \\\n"
> +		"                              entries MAX_ENTRIES [name NAME] [flags FLAGS] \\\n"
> +		"                              [dev NAME]\n"

I suspect as soon as bpftool has an ability to create standalone maps
some folks will start relying on such interface.
Therefore I'd like to request to make 'name' argument to be mandatory.
I think in the future we will require BTF to be mandatory too.
We need to move towards more transparent and debuggable infra.
Do you think requiring json description of key/value would be manageable to implement?
Then bpftool could convert it to BTF and the map would be fully defined.
I certainly understand that bpf prog can disregard the key/value layout today,
but we will make verifier to enforce that in the future too.

^ permalink raw reply

* Re: [PATCH 1/1] crypto:chelsio: Update ntx queue received from cxgb4
From: Harsh Jain @ 2018-10-13  7:23 UTC (permalink / raw)
  To: Lino Sanfilippo, herbert, atul.gupta, indranil, swise, varun,
	ganeshgr, netdev, linux-crypto
In-Reply-To: <7e220dd5-fb54-84b4-20c9-a318d18d747b@gmx.de>



On 13-10-2018 05:46, Lino Sanfilippo wrote:
> Hi,
>
>> +	if (uld_type == CXGB4_ULD_CRYPTO) {
>> +		i = min_t(int, adap->vres.ncrypto_fc,
>> +			  num_online_cpus());
>> +		txq_info->ntxq = rounddown(i, adap->params.nports);
>> +		if (txq_info->ntxq <= 0) {
>> +			dev_warn(adap->pdev_dev, "Crypto Tx Queues can't be zero\n");
>> +			return -EINVAL;
>> +		}
> Shouldn't we free txq_info in the error case?
Yes, Will fix this in V2. Thanks.
>
> Regards,
> Lino

^ permalink raw reply

* Re: [PATCH net-next 1/3] veth: Account for packet drops in ndo_xdp_xmit
From: Jesper Dangaard Brouer @ 2018-10-13  7:48 UTC (permalink / raw)
  To: Toshiaki Makita; +Cc: David S. Miller, netdev, brouer
In-Reply-To: <1539250610-2557-2-git-send-email-makita.toshiaki@lab.ntt.co.jp>

On Thu, 11 Oct 2018 18:36:48 +0900
Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> wrote:

> Use existing atomic drop counter. Since drop path is really an
> exceptional case here, I'm thinking atomic ops would not hurt the
> performance.

Hmm... we try very hard not to add atomic ops to XDP code path. The
XDP_DROP case is also considered hot-path.  In below code, the
atomic64_add happens for a bulk of dropped packets (currently up-to
16), so it might be okay.

> XDP packets and bytes are not counted in ndo_xdp_xmit, but will be
> accounted on rx side by the following commit.
> 
> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
> ---
>  drivers/net/veth.c | 30 ++++++++++++++++++++++--------
>  1 file changed, 22 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index 224c56a..452193f2 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -308,16 +308,20 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>  {
>  	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
>  	struct net_device *rcv;
> +	int i, ret, drops = n;
>  	unsigned int max_len;
>  	struct veth_rq *rq;
> -	int i, drops = 0;
>  
> -	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
> -		return -EINVAL;
> +	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
> +		ret = -EINVAL;
> +		goto drop;
> +	}
>  
>  	rcv = rcu_dereference(priv->peer);
> -	if (unlikely(!rcv))
> -		return -ENXIO;
> +	if (unlikely(!rcv)) {
> +		ret = -ENXIO;
> +		goto drop;
> +	}
>  
>  	rcv_priv = netdev_priv(rcv);
>  	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
> @@ -325,9 +329,12 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>  	 * side. This means an XDP program is loaded on the peer and the peer
>  	 * device is up.
>  	 */
> -	if (!rcu_access_pointer(rq->xdp_prog))
> -		return -ENXIO;
> +	if (!rcu_access_pointer(rq->xdp_prog)) {
> +		ret = -ENXIO;
> +		goto drop;
> +	}
>  
> +	drops = 0;
>  	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;
>  
>  	spin_lock(&rq->xdp_ring.producer_lock);
> @@ -346,7 +353,14 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>  	if (flags & XDP_XMIT_FLUSH)
>  		__veth_xdp_flush(rq);
>  
> -	return n - drops;
> +	if (likely(!drops))
> +		return n;
> +
> +	ret = n - drops;
> +drop:
> +	atomic64_add(drops, &priv->dropped);
> +
> +	return ret;
>  }
>  
>  static void veth_xdp_flush(struct net_device *dev)



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply

* [PATCH] rsi: fix spelling mistake "Initialzing" -> "Initializing"
From: Colin King @ 2018-10-13 15:37 UTC (permalink / raw)
  To: Kalle Valo, David S . Miller, Amitkumar Karwar,
	Prameela Rani Garnepudi, linux-wireless, netdev
  Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Trivial fix to spelling mistake in rsi_dbg debug message

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/net/wireless/rsi/rsi_91x_sdio_ops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c b/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c
index 612c211e21a1..449f6d23c5e3 100644
--- a/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c
+++ b/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c
@@ -210,7 +210,7 @@ int rsi_init_sdio_slave_regs(struct rsi_hw *adapter)
 	}
 
 	/* This tells SDIO FIFO when to start read to host */
-	rsi_dbg(INIT_ZONE, "%s: Initialzing SDIO read start level\n", __func__);
+	rsi_dbg(INIT_ZONE, "%s: Initializing SDIO read start level\n", __func__);
 	byte = 0x24;
 
 	status = rsi_sdio_write_register(adapter,
@@ -223,7 +223,7 @@ int rsi_init_sdio_slave_regs(struct rsi_hw *adapter)
 		return -1;
 	}
 
-	rsi_dbg(INIT_ZONE, "%s: Initialzing FIFO ctrl registers\n", __func__);
+	rsi_dbg(INIT_ZONE, "%s: Initializing FIFO ctrl registers\n", __func__);
 	byte = (128 - 32);
 
 	status = rsi_sdio_write_register(adapter,
-- 
2.17.1

^ permalink raw reply related

* [PATCH] qed: fix spelling mistake "Ireelevant" -> "Irrelevant"
From: Colin King @ 2018-10-13 15:48 UTC (permalink / raw)
  To: Ariel Elior, everest-linux-l2, David S . Miller, netdev
  Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Trivial fix to spelling mistake in DP_INFO message

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/net/ethernet/qlogic/qed/qed_int.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_int.c b/drivers/net/ethernet/qlogic/qed/qed_int.c
index af3a28ec04eb..0f0aba793352 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_int.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_int.c
@@ -228,7 +228,7 @@ static int qed_grc_attn_cb(struct qed_hwfn *p_hwfn)
 		attn_master_to_str(GET_FIELD(tmp, QED_GRC_ATTENTION_MASTER)),
 		GET_FIELD(tmp2, QED_GRC_ATTENTION_PF),
 		(GET_FIELD(tmp2, QED_GRC_ATTENTION_PRIV) ==
-		 QED_GRC_ATTENTION_PRIV_VF) ? "VF" : "(Ireelevant)",
+		 QED_GRC_ATTENTION_PRIV_VF) ? "VF" : "(Irrelevant)",
 		GET_FIELD(tmp2, QED_GRC_ATTENTION_VF));
 
 out:
-- 
2.17.1

^ permalink raw reply related

* [PATCH v2 1/1]  crypto:chelsio: Update ntx queue received from cxgb4
From: Harsh Jain @ 2018-10-13  8:16 UTC (permalink / raw)
  To: herbert, atul.gupta, indranil, swise, varun, ganeshgr, netdev,
	linux-crypto, LinoSanfilippo
  Cc: Harsh Jain
In-Reply-To: <cover.1539418425.git.harsh@chelsio.com>

Update cxgb4 to send No. of Tx Queue created in lldinfo struct
and use the same ntxq in chcr driver.

This patch depends on following commit
commit  add92a817e60e308a419693413a38d9d1e663aff
"Fix memory corruption in DMA Mapped buffers"

v2:
Free txq_info in error case as pointed out by Lino Sanfilippo.

Signed-off-by: Harsh Jain <harsh@chelsio.com>
---
 drivers/crypto/chelsio/chcr_algo.c             |  3 +--
 drivers/crypto/chelsio/chcr_core.c             |  2 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c | 20 ++++++++++++++++----
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c
index 010bbf6..9b937cb 100644
--- a/drivers/crypto/chelsio/chcr_algo.c
+++ b/drivers/crypto/chelsio/chcr_algo.c
@@ -1337,8 +1337,7 @@ static int chcr_device_init(struct chcr_context *ctx)
 		}
 		ctx->dev = u_ctx->dev;
 		adap = padap(ctx->dev);
-		ntxq = min_not_zero((unsigned int)u_ctx->lldi.nrxq,
-				    adap->vres.ncrypto_fc);
+		ntxq = u_ctx->lldi.ntxq;
 		rxq_perchan = u_ctx->lldi.nrxq / u_ctx->lldi.nchan;
 		txq_perchan = ntxq / u_ctx->lldi.nchan;
 		spin_lock(&ctx->dev->lock_chcr_dev);
diff --git a/drivers/crypto/chelsio/chcr_core.c b/drivers/crypto/chelsio/chcr_core.c
index 04f277c..2399ce3 100644
--- a/drivers/crypto/chelsio/chcr_core.c
+++ b/drivers/crypto/chelsio/chcr_core.c
@@ -43,7 +43,7 @@ static chcr_handler_func work_handlers[NUM_CPL_CMDS] = {
 static struct cxgb4_uld_info chcr_uld_info = {
 	.name = DRV_MODULE_NAME,
 	.nrxq = MAX_ULD_QSETS,
-	.ntxq = MAX_ULD_QSETS,
+	/* Max ntxq will be derived from fw config file*/
 	.rxq_size = 1024,
 	.add = chcr_uld_add,
 	.state_change = chcr_uld_state_change,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
index 4bc2110..7947ae7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.c
@@ -520,10 +520,20 @@ setup_sge_txq_uld(struct adapter *adap, unsigned int uld_type,
 	txq_info = kzalloc(sizeof(*txq_info), GFP_KERNEL);
 	if (!txq_info)
 		return -ENOMEM;
+	if (uld_type == CXGB4_ULD_CRYPTO) {
+		i = min_t(int, adap->vres.ncrypto_fc,
+			  num_online_cpus());
+		txq_info->ntxq = rounddown(i, adap->params.nports);
+		if (txq_info->ntxq <= 0) {
+			dev_warn(adap->pdev_dev, "Crypto Tx Queues can't be zero\n");
+			kfree(txq_info);
+			return -EINVAL;
+		}
 
-	i = min_t(int, uld_info->ntxq, num_online_cpus());
-	txq_info->ntxq = roundup(i, adap->params.nports);
-
+	} else {
+		i = min_t(int, uld_info->ntxq, num_online_cpus());
+		txq_info->ntxq = roundup(i, adap->params.nports);
+	}
 	txq_info->uldtxq = kcalloc(txq_info->ntxq, sizeof(struct sge_uld_txq),
 				   GFP_KERNEL);
 	if (!txq_info->uldtxq) {
@@ -546,11 +556,14 @@ static void uld_queue_init(struct adapter *adap, unsigned int uld_type,
 			   struct cxgb4_lld_info *lli)
 {
 	struct sge_uld_rxq_info *rxq_info = adap->sge.uld_rxq_info[uld_type];
+	int tx_uld_type = TX_ULD(uld_type);
+	struct sge_uld_txq_info *txq_info = adap->sge.uld_txq_info[tx_uld_type];
 
 	lli->rxq_ids = rxq_info->rspq_id;
 	lli->nrxq = rxq_info->nrxq;
 	lli->ciq_ids = rxq_info->rspq_id + rxq_info->nrxq;
 	lli->nciq = rxq_info->nciq;
+	lli->ntxq = txq_info->ntxq;
 }
 
 int t4_uld_mem_alloc(struct adapter *adap)
@@ -634,7 +647,6 @@ static void uld_init(struct adapter *adap, struct cxgb4_lld_info *lld)
 	lld->ports = adap->port;
 	lld->vr = &adap->vres;
 	lld->mtus = adap->params.mtus;
-	lld->ntxq = adap->sge.ofldqsets;
 	lld->nchan = adap->params.nports;
 	lld->nports = adap->params.nports;
 	lld->wr_cred = adap->params.ofldq_wr_cred;
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH net-next 1/3] veth: Account for packet drops in ndo_xdp_xmit
From: Toshiaki Makita @ 2018-10-13  8:57 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Toshiaki Makita, David S. Miller, netdev
In-Reply-To: <20181013094828.00979d39@redhat.com>

On 18/10/13 (土) 16:48, Jesper Dangaard Brouer wrote:
> On Thu, 11 Oct 2018 18:36:48 +0900
> Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp> wrote:
> 
>> Use existing atomic drop counter. Since drop path is really an
>> exceptional case here, I'm thinking atomic ops would not hurt the
>> performance.
> 
> Hmm... we try very hard not to add atomic ops to XDP code path. The
> XDP_DROP case is also considered hot-path.  In below code, the
> atomic64_add happens for a bulk of dropped packets (currently up-to
> 16), so it might be okay.

Yes, this happens only once in a bulk sending.
Note that this drop does not include XDP_DROP. This drop is counted when
- ndo_xdp_xmit "flags" arg is invalid
- peer is detached
- XDP is not loaded on peer
- XDP ring (256 slots) overflow
So really exceptional. XDP_DROP is counted per-queue basis (non-atomic) 
in the patch 2/3.

Toshiaki Makita

> 
>> XDP packets and bytes are not counted in ndo_xdp_xmit, but will be
>> accounted on rx side by the following commit.
>>
>> Signed-off-by: Toshiaki Makita <makita.toshiaki@lab.ntt.co.jp>
>> ---
>>   drivers/net/veth.c | 30 ++++++++++++++++++++++--------
>>   1 file changed, 22 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
>> index 224c56a..452193f2 100644
>> --- a/drivers/net/veth.c
>> +++ b/drivers/net/veth.c
>> @@ -308,16 +308,20 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>>   {
>>   	struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
>>   	struct net_device *rcv;
>> +	int i, ret, drops = n;
>>   	unsigned int max_len;
>>   	struct veth_rq *rq;
>> -	int i, drops = 0;
>>   
>> -	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
>> -		return -EINVAL;
>> +	if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) {
>> +		ret = -EINVAL;
>> +		goto drop;
>> +	}
>>   
>>   	rcv = rcu_dereference(priv->peer);
>> -	if (unlikely(!rcv))
>> -		return -ENXIO;
>> +	if (unlikely(!rcv)) {
>> +		ret = -ENXIO;
>> +		goto drop;
>> +	}
>>   
>>   	rcv_priv = netdev_priv(rcv);
>>   	rq = &rcv_priv->rq[veth_select_rxq(rcv)];
>> @@ -325,9 +329,12 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>>   	 * side. This means an XDP program is loaded on the peer and the peer
>>   	 * device is up.
>>   	 */
>> -	if (!rcu_access_pointer(rq->xdp_prog))
>> -		return -ENXIO;
>> +	if (!rcu_access_pointer(rq->xdp_prog)) {
>> +		ret = -ENXIO;
>> +		goto drop;
>> +	}
>>   
>> +	drops = 0;
>>   	max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;
>>   
>>   	spin_lock(&rq->xdp_ring.producer_lock);
>> @@ -346,7 +353,14 @@ static int veth_xdp_xmit(struct net_device *dev, int n,
>>   	if (flags & XDP_XMIT_FLUSH)
>>   		__veth_xdp_flush(rq);
>>   
>> -	return n - drops;
>> +	if (likely(!drops))
>> +		return n;
>> +
>> +	ret = n - drops;
>> +drop:
>> +	atomic64_add(drops, &priv->dropped);
>> +
>> +	return ret;
>>   }
>>   
>>   static void veth_xdp_flush(struct net_device *dev)

^ permalink raw reply

* Re: Grant--
From: M. M. Fridman @ 2018-10-13  8:36 UTC (permalink / raw)
  To: Recipients

I, Mikhail Fridman have selected you specifically as one of my beneficiaries for my Charitable Donation of $5 Million Dollars,

Check the link below for confirmation:

https://www.rt.com/business/343781-mikhail-fridman-will-charity/

I await your earliest response for further directives.

Best Regards,
Mikhail Fridman.

^ permalink raw reply

* Re: [PATCH] iwlegacy: Add a lock assertion in il4965_send_rxon_assoc()
From: Kalle Valo @ 2018-10-13 17:02 UTC (permalink / raw)
  To: Jia-Ju Bai
  Cc: sgruszka, davem, linux-wireless, netdev, linux-kernel, Jia-Ju Bai
In-Reply-To: <20181005135546.21011-1-baijiaju1990@gmail.com>

Jia-Ju Bai <baijiaju1990@gmail.com> wrote:

> The variables il->staging.filter_flags, rxon1->filter_flags and 
> rxon2->filter_flags need to be protected by the mutex lock il->mutex.
> This patch adds a lock assertion of il->mutex to check whether 
> this lock is held.
> 
> Signed-off-by: Jia-Ju Bai <baijiaju1990@gmail.com>
> Acked-by: Stanislaw Gruszka <sgruszka@redhat.com>

Patch applied to wireless-drivers-next.git, thanks.

52a312673aff iwlegacy: Add a lock assertion in il4965_send_rxon_assoc()

-- 
https://patchwork.kernel.org/patch/10628205/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH] libertas: don't set URB_ZERO_PACKET on IN USB transfer
From: Kalle Valo @ 2018-10-13 17:03 UTC (permalink / raw)
  To: Lubomir Rintel
  Cc: David S. Miller, libertas-dev, linux-wireless, netdev,
	linux-kernel, Lubomir Rintel, stable
In-Reply-To: <20181006201232.2789936-1-lkundrak@v3.sk>

Lubomir Rintel <lkundrak@v3.sk> wrote:

> The USB core gets rightfully upset:
> 
>   usb 1-1: BOGUS urb flags, 240 --> 200
>   WARNING: CPU: 0 PID: 60 at drivers/usb/core/urb.c:503 usb_submit_urb+0x2f8/0x3ed
>   Modules linked in:
>   CPU: 0 PID: 60 Comm: kworker/0:3 Not tainted 4.19.0-rc6-00319-g5206d00a45c7 #39
>   Hardware name: OLPC XO/XO, BIOS OLPC Ver 1.00.01 06/11/2014
>   Workqueue: events request_firmware_work_func
>   EIP: usb_submit_urb+0x2f8/0x3ed
>   Code: 75 06 8b 8f 80 00 00 00 8d 47 78 89 4d e4 89 55 e8 e8 35 1c f6 ff 8b 55 e8 56 52 8b 4d e4 51 50 68 e3 ce c7 c0 e8 ed 18 c6 ff <0f> 0b 83 c4 14 80 7d ef 01 74 0a 80 7d ef 03 0f 85 b8 00 00 00 8b
>   EAX: 00000025 EBX: ce7d4980 ECX: 00000000 EDX: 00000001
>   ESI: 00000200 EDI: ce7d8800 EBP: ce7f5ea8 ESP: ce7f5e70
>   DS: 007b ES: 007b FS: 0000 GS: 00e0 SS: 0068 EFLAGS: 00210292
>   CR0: 80050033 CR2: 00000000 CR3: 00e80000 CR4: 00000090
>   Call Trace:
>    ? if_usb_fw_timeo+0x64/0x64
>    __if_usb_submit_rx_urb+0x85/0xe6
>    ? if_usb_fw_timeo+0x64/0x64
>    if_usb_submit_rx_urb_fwload+0xd/0xf
>    if_usb_prog_firmware+0xc0/0x3db
>    ? _request_firmware+0x54/0x47b
>    ? _request_firmware+0x89/0x47b
>    ? if_usb_probe+0x412/0x412
>    lbs_fw_loaded+0x55/0xa6
>    ? debug_smp_processor_id+0x12/0x14
>    helper_firmware_cb+0x3c/0x3f
>    request_firmware_work_func+0x37/0x6f
>    process_one_work+0x164/0x25a
>    worker_thread+0x1c4/0x284
>    kthread+0xec/0xf1
>    ? cancel_delayed_work_sync+0xf/0xf
>    ? kthread_create_on_node+0x1a/0x1a
>    ret_from_fork+0x2e/0x38
>   ---[ end trace 3ef1e3b2dd53852f ]---
> 
> Cc: stable@vger.kernel.org
> Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>

Patch applied to wireless-drivers-next.git, thanks.

6528d8804780 libertas: don't set URB_ZERO_PACKET on IN USB transfer

-- 
https://patchwork.kernel.org/patch/10629305/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH v2] libertas: return errno from lbs_add_card()
From: Kalle Valo @ 2018-10-13 17:04 UTC (permalink / raw)
  To: Lubomir Rintel
  Cc: David S. Miller, libertas-dev, linux-wireless, netdev,
	linux-kernel, Lubomir Rintel
In-Reply-To: <20181007003327.2806010-1-lkundrak@v3.sk>

Lubomir Rintel <lkundrak@v3.sk> wrote:

> This makes the error handling somewhat cleaner -- lbs_add_card() no
> longer throws away the errno and lets its callers propagate it.
> 
> Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>

Patch applied to wireless-drivers-next.git, thanks.

bbc2a101f06b libertas: return errno from lbs_add_card()

-- 
https://patchwork.kernel.org/patch/10629357/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH] rtl8xxxu: Remove set but not used variables 'usedesc40' and 'seq_number'
From: Kalle Valo @ 2018-10-13 17:05 UTC (permalink / raw)
  To: YueHaibing
  Cc: Jes Sorensen, YueHaibing, linux-wireless, netdev, linux-kernel,
	kernel-janitors
In-Reply-To: <1539008219-149058-1-git-send-email-yuehaibing@huawei.com>

YueHaibing <yuehaibing@huawei.com> wrote:

> Fixes gcc '-Wunused-but-set-variable' warning:
> 
> drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c: In function 'rtl8xxxu_tx':
> drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c:4925:7: warning:
>  variable 'usedesc40' set but not used [-Wunused-but-set-variable]
> 
> drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c:4921:6: warning:
>  variable 'seq_number' set but not used [-Wunused-but-set-variable]
> 
> 'usedesc40' and 'seq_number' are not used any more after
> commit b59415c2dd08 ("rtl8xxxu: Split filling of TX descriptors into separate functions")
> 
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>

Patch applied to wireless-drivers-next.git, thanks.

03ce6f8a6776 rtl8xxxu: Remove set but not used variables 'usedesc40' and 'seq_number'

-- 
https://patchwork.kernel.org/patch/10630829/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* [PATCH net-next 00/18] mlxsw: Add VxLAN support
From: Ido Schimmel @ 2018-10-13 17:18 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: ivecera@redhat.com, andrew@lunn.ch, f.fainelli@gmail.com, mlxsw,
	Ido Schimmel, vivien.didelot@savoirfairelinux.com,
	nikolay@cumulusnetworks.com, roopa@cumulusnetworks.com,
	bridge@lists.linux-foundation.org, Jiri Pirko, Petr Machata,
	davem@davemloft.net

This patchset adds support for VxLAN offload in the mlxsw driver.

With regard to the forwarding plane, VxLAN support is composed of two
main parts: Encapsulation and decapsulation.

In the device, NVE encapsulation (and VxLAN in particular) takes place
in the bridge. A packet can be encapsulated using VxLAN either because
it hit an FDB entry that forwards it to the router with the IP of the
remote VTEP or because it was flooded, in which case it is sent to a
list of remote VTEPs (in addition to local ports). In either case, the
VNI is derived from the filtering identifier (FID) the packet was
classified to at ingress and the underlay source IP is taken from a
device global configuration.

VxLAN decapsulation takes place in the underlay router, where packets
that hit a local route that corresponds to the source IP of the local
VTEP are decapsulated and injected to the bridge. The packets are
classified to a FID based on the VNI they came with.

The first six patches export the required APIs in the VxLAN and mlxsw
drivers in order to allow for the introduction of the NVE core in the
next two patches. The NVE core is designed to support a variety of NVE
encapsulations (e.g., VxLAN, NVGRE) and different ASICs, but currently
only VxLAN and Spectrum are supported. Spectrum-2 support will be added
in the future.

The last 10 patches add support for VxLAN decapsulation and
encapsulation and include the addition of the required switchdev APIs in
the VxLAN driver. These APIs allow capable drivers to get a notification
about the addition / deletion of FDB entries to / from the VxLAN's FDB.

A subsequent patchset will add selftests (generic and mlxsw-specific),
data plane learning, FDB extack and vetoing and support for VLAN-aware
bridges (one VNI per VxLAN device model).

Ido Schimmel (14):
  mlxsw: spectrum_fid: Allow setting and clearing NVE properties on FID
  mlxsw: spectrum_fid: Add APIs to lookup FID without creating it
  mlxsw: spectrum_router: Enable local routes promotion to perform NVE
    decap
  mlxsw: spectrum_router: Allow querying VR ID based on table ID
  vxlan: Export address checking functions
  inet: Refactor INET_ECN_decapsulate()
  mlxsw: spectrum_nve: Implement common NVE core
  mlxsw: spectrum_nve: Implement VxLAN operations
  mlxsw: spectrum_fid: Clear NVE configuration when destroying 802.1D
    FIDs
  mlxsw: spectrum_router: Configure matching local routes for NVE decap
  vxlan: Add netif_is_vxlan()
  bridge: switchdev: Allow clearing FDB entry offload indication
  mlxsw: spectrum: Enable VxLAN enslavement to bridges
  mlxsw: spectrum_switchdev: Add support for VxLAN encapsulation

Petr Machata (4):
  vxlan: Add switchdev notifications
  vxlan: Add vxlan_fdb_find_uc() for FDB querying
  vxlan: Support marking RDSTs as offloaded
  vxlan: Notify for each remote of a removed FDB entry

 drivers/net/ethernet/mellanox/mlxsw/Makefile  |   3 +-
 .../net/ethernet/mellanox/mlxsw/spectrum.c    | 125 +++
 .../net/ethernet/mellanox/mlxsw/spectrum.h    |  88 ++
 .../ethernet/mellanox/mlxsw/spectrum_fid.c    | 225 +++-
 .../ethernet/mellanox/mlxsw/spectrum_nve.c    | 982 ++++++++++++++++++
 .../ethernet/mellanox/mlxsw/spectrum_nve.h    |  49 +
 .../mellanox/mlxsw/spectrum_nve_vxlan.c       | 249 +++++
 .../ethernet/mellanox/mlxsw/spectrum_router.c | 138 ++-
 .../mellanox/mlxsw/spectrum_switchdev.c       | 552 +++++++++-
 drivers/net/ethernet/rocker/rocker_main.c     |   1 +
 drivers/net/vxlan.c                           | 177 +++-
 include/linux/netdevice.h                     |   8 +
 include/net/inet_ecn.h                        |  18 +-
 include/net/switchdev.h                       |   7 +-
 include/net/vxlan.h                           |  57 +
 net/bridge/br.c                               |   4 +-
 net/bridge/br_fdb.c                           |   4 +-
 net/bridge/br_private.h                       |   2 +-
 net/bridge/br_switchdev.c                     |   9 +-
 net/dsa/slave.c                               |   1 +
 20 files changed, 2643 insertions(+), 56 deletions(-)
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.c
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_nve.h
 create mode 100644 drivers/net/ethernet/mellanox/mlxsw/spectrum_nve_vxlan.c

-- 
2.17.2

^ permalink raw reply

* [PATCH net-next 03/18] mlxsw: spectrum_router: Enable local routes promotion to perform NVE decap
From: Ido Schimmel @ 2018-10-13 17:18 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: ivecera@redhat.com, andrew@lunn.ch, f.fainelli@gmail.com, mlxsw,
	Ido Schimmel, vivien.didelot@savoirfairelinux.com,
	nikolay@cumulusnetworks.com, roopa@cumulusnetworks.com,
	bridge@lists.linux-foundation.org, Jiri Pirko, Petr Machata,
	davem@davemloft.net
In-Reply-To: <20181013171725.3261-1-idosch@mellanox.com>

When an NVE tunnel with an IP underlay (e.g., VxLAN) is configured the
local route to the tunnel's source IP needs to be promoted to perform
NVE decapsulation.

Expose an API in the unicast IP router to promote / demote local routes.

The case where a local route is configured after the creation of the NVE
tunnel will be handled in a subsequent patch in the set.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum.h    |   7 ++
 .../ethernet/mellanox/mlxsw/spectrum_router.c | 115 +++++++++++++++++-
 2 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index f463be58c6dc..739a62d024c9 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -431,6 +431,13 @@ struct mlxsw_sp_rif *mlxsw_sp_rif_find_by_dev(const struct mlxsw_sp *mlxsw_sp,
 					      const struct net_device *dev);
 u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp);
 struct mlxsw_sp_fid *mlxsw_sp_rif_fid(const struct mlxsw_sp_rif *rif);
+int mlxsw_sp_router_nve_promote_decap(struct mlxsw_sp *mlxsw_sp, u32 ul_tb_id,
+				      enum mlxsw_sp_l3proto ul_proto,
+				      const union mlxsw_sp_l3addr *ul_sip,
+				      u32 tunnel_index);
+void mlxsw_sp_router_nve_demote_decap(struct mlxsw_sp *mlxsw_sp, u32 ul_tb_id,
+				      enum mlxsw_sp_l3proto ul_proto,
+				      const union mlxsw_sp_l3addr *ul_sip);
 
 /* spectrum_kvdl.c */
 enum mlxsw_sp_kvdl_entry_type {
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 2ab9cf25a08a..ca4289d15561 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -366,6 +366,7 @@ enum mlxsw_sp_fib_entry_type {
 	 * encapsulating entries.)
 	 */
 	MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP,
+	MLXSW_SP_FIB_ENTRY_TYPE_NVE_DECAP,
 };
 
 struct mlxsw_sp_nexthop_group;
@@ -1128,6 +1129,52 @@ mlxsw_sp_ipip_entry_promote_decap(struct mlxsw_sp *mlxsw_sp,
 		mlxsw_sp_ipip_entry_demote_decap(mlxsw_sp, ipip_entry);
 }
 
+static struct mlxsw_sp_fib_entry *
+mlxsw_sp_router_ip2me_fib_entry_find(struct mlxsw_sp *mlxsw_sp, u32 tb_id,
+				     enum mlxsw_sp_l3proto proto,
+				     const union mlxsw_sp_l3addr *addr,
+				     enum mlxsw_sp_fib_entry_type type)
+{
+	struct mlxsw_sp_fib_entry *fib_entry;
+	struct mlxsw_sp_fib_node *fib_node;
+	unsigned char addr_prefix_len;
+	struct mlxsw_sp_fib *fib;
+	struct mlxsw_sp_vr *vr;
+	const void *addrp;
+	size_t addr_len;
+	u32 addr4;
+
+	vr = mlxsw_sp_vr_find(mlxsw_sp, tb_id);
+	if (!vr)
+		return NULL;
+	fib = mlxsw_sp_vr_fib(vr, proto);
+
+	switch (proto) {
+	case MLXSW_SP_L3_PROTO_IPV4:
+		addr4 = be32_to_cpu(addr->addr4);
+		addrp = &addr4;
+		addr_len = 4;
+		addr_prefix_len = 32;
+		break;
+	case MLXSW_SP_L3_PROTO_IPV6: /* fall through */
+	default:
+		WARN_ON(1);
+		return NULL;
+	}
+
+	fib_node = mlxsw_sp_fib_node_lookup(fib, addrp, addr_len,
+					    addr_prefix_len);
+	if (!fib_node || list_empty(&fib_node->entry_list))
+		return NULL;
+
+	fib_entry = list_first_entry(&fib_node->entry_list,
+				     struct mlxsw_sp_fib_entry, list);
+	if (fib_entry->type != type)
+		return NULL;
+
+	return fib_entry;
+}
+
 /* Given an IPIP entry, find the corresponding decap route. */
 static struct mlxsw_sp_fib_entry *
 mlxsw_sp_ipip_entry_find_decap(struct mlxsw_sp *mlxsw_sp,
@@ -1765,6 +1812,56 @@ mlxsw_sp_netdevice_ipip_ul_event(struct mlxsw_sp *mlxsw_sp,
 	return 0;
 }
 
+int mlxsw_sp_router_nve_promote_decap(struct mlxsw_sp *mlxsw_sp, u32 ul_tb_id,
+				      enum mlxsw_sp_l3proto ul_proto,
+				      const union mlxsw_sp_l3addr *ul_sip,
+				      u32 tunnel_index)
+{
+	enum mlxsw_sp_fib_entry_type type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP;
+	struct mlxsw_sp_fib_entry *fib_entry;
+	int err;
+
+	/* It is valid to create a tunnel with a local IP and only later
+	 * assign this IP address to a local interface
+	 */
+	fib_entry = mlxsw_sp_router_ip2me_fib_entry_find(mlxsw_sp, ul_tb_id,
+							 ul_proto, ul_sip,
+							 type);
+	if (!fib_entry)
+		return 0;
+
+	fib_entry->decap.tunnel_index = tunnel_index;
+	fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_NVE_DECAP;
+
+	err = mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry);
+	if (err)
+		goto err_fib_entry_update;
+
+	return 0;
+
+err_fib_entry_update:
+	fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP;
+	mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry);
+	return err;
+}
+
+void mlxsw_sp_router_nve_demote_decap(struct mlxsw_sp *mlxsw_sp, u32 ul_tb_id,
+				      enum mlxsw_sp_l3proto ul_proto,
+				      const union mlxsw_sp_l3addr *ul_sip)
+{
+	enum mlxsw_sp_fib_entry_type type = MLXSW_SP_FIB_ENTRY_TYPE_NVE_DECAP;
+	struct mlxsw_sp_fib_entry *fib_entry;
+
+	fib_entry = mlxsw_sp_router_ip2me_fib_entry_find(mlxsw_sp, ul_tb_id,
+							 ul_proto, ul_sip,
+							 type);
+	if (!fib_entry)
+		return;
+
+	fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP;
+	mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry);
+}
+
 struct mlxsw_sp_neigh_key {
 	struct neighbour *n;
 };
@@ -3815,6 +3912,7 @@ mlxsw_sp_fib_entry_should_offload(const struct mlxsw_sp_fib_entry *fib_entry)
 	case MLXSW_SP_FIB_ENTRY_TYPE_LOCAL:
 		return !!nh_group->nh_rif;
 	case MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP:
+	case MLXSW_SP_FIB_ENTRY_TYPE_NVE_DECAP:
 		return true;
 	default:
 		return false;
@@ -3848,7 +3946,8 @@ mlxsw_sp_fib4_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry)
 	int i;
 
 	if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL ||
-	    fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP) {
+	    fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP ||
+	    fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_NVE_DECAP) {
 		nh_grp->nexthops->key.fib_nh->nh_flags |= RTNH_F_OFFLOAD;
 		return;
 	}
@@ -4072,6 +4171,18 @@ mlxsw_sp_fib_entry_op_ipip_decap(struct mlxsw_sp *mlxsw_sp,
 				      fib_entry->decap.tunnel_index);
 }
 
+static int mlxsw_sp_fib_entry_op_nve_decap(struct mlxsw_sp *mlxsw_sp,
+					   struct mlxsw_sp_fib_entry *fib_entry,
+					   enum mlxsw_reg_ralue_op op)
+{
+	char ralue_pl[MLXSW_REG_RALUE_LEN];
+
+	mlxsw_sp_fib_entry_ralue_pack(ralue_pl, fib_entry, op);
+	mlxsw_reg_ralue_act_ip2me_tun_pack(ralue_pl,
+					   fib_entry->decap.tunnel_index);
+	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl);
+}
+
 static int __mlxsw_sp_fib_entry_op(struct mlxsw_sp *mlxsw_sp,
 				   struct mlxsw_sp_fib_entry *fib_entry,
 				   enum mlxsw_reg_ralue_op op)
@@ -4086,6 +4197,8 @@ static int __mlxsw_sp_fib_entry_op(struct mlxsw_sp *mlxsw_sp,
 	case MLXSW_SP_FIB_ENTRY_TYPE_IPIP_DECAP:
 		return mlxsw_sp_fib_entry_op_ipip_decap(mlxsw_sp,
 							fib_entry, op);
+	case MLXSW_SP_FIB_ENTRY_TYPE_NVE_DECAP:
+		return mlxsw_sp_fib_entry_op_nve_decap(mlxsw_sp, fib_entry, op);
 	}
 	return -EINVAL;
 }
-- 
2.17.2

^ permalink raw reply related

* Re: [PATCH] ath10k: htt_rx: Fix signedness bug in ath10k_update_per_peer_tx_stats
From: Kalle Valo @ 2018-10-13 17:23 UTC (permalink / raw)
  To: Gustavo A. R. Silva
  Cc: Gustavo A. R. Silva, netdev, linux-wireless, linux-kernel, ath10k,
	David S. Miller
In-Reply-To: <20181005184245.GA11700@embeddedor.com>

"Gustavo A. R. Silva" <gustavo@embeddedor.com> wrote:

> Currently, the error handling for the call to function
> ath10k_get_legacy_rate_idx() doesn't work because
> *rate_idx* is of type u8 (8 bits, unsigned), which
> makes it impossible for it to hold a value less
> than 0.
> 
> Fix this by changing the type of variable *rate_idx*
> to s8 (8 bits, signed).
> 
> Addresses-Coverity-ID: 1473914 ("Unsigned compared against 0")
> Fixes: 0189dbd71cbd ("ath10k: get the legacy rate index to update the txrate table")
> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
> Signed-off-by: Kalle Valo <kvalo@codeaurora.org>

Patch applied to ath-next branch of ath.git, thanks.

9d9cdbf3f9ed ath10k: htt_rx: fix signedness bug in ath10k_update_per_peer_tx_stats

-- 
https://patchwork.kernel.org/patch/10628675/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH] ath10k: htt_rx: Fix signedness bug in ath10k_update_per_peer_tx_stats
From: Kalle Valo @ 2018-10-13 17:23 UTC (permalink / raw)
  To: Gustavo A. R. Silva
  Cc: David S. Miller, ath10k, linux-wireless, netdev, linux-kernel,
	Gustavo A. R. Silva
In-Reply-To: <20181005184245.GA11700@embeddedor.com>

"Gustavo A. R. Silva" <gustavo@embeddedor.com> wrote:

> Currently, the error handling for the call to function
> ath10k_get_legacy_rate_idx() doesn't work because
> *rate_idx* is of type u8 (8 bits, unsigned), which
> makes it impossible for it to hold a value less
> than 0.
> 
> Fix this by changing the type of variable *rate_idx*
> to s8 (8 bits, signed).
> 
> Addresses-Coverity-ID: 1473914 ("Unsigned compared against 0")
> Fixes: 0189dbd71cbd ("ath10k: get the legacy rate index to update the txrate table")
> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
> Signed-off-by: Kalle Valo <kvalo@codeaurora.org>

Patch applied to ath-next branch of ath.git, thanks.

9d9cdbf3f9ed ath10k: htt_rx: fix signedness bug in ath10k_update_per_peer_tx_stats

-- 
https://patchwork.kernel.org/patch/10628675/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH] ath10k: remove unnecessary comparison of unsigned integer with < 0
From: Kalle Valo @ 2018-10-13 17:24 UTC (permalink / raw)
  To: Gustavo A. R. Silva
  Cc: Gustavo A. R. Silva, netdev, linux-wireless, linux-kernel, ath10k,
	David S. Miller
In-Reply-To: <20181005185623.GA14405@embeddedor.com>

"Gustavo A. R. Silva" <gustavo@embeddedor.com> wrote:

> There is no need to compare *ps_state_enable* with < 0 because
> such variable is of type u8 (8 bits, unsigned), making it
> impossible to hold a negative value.
> 
> Fix this by removing such comparison.
> 
> Addresses-Coverity-ID: 1473921 ("Unsigned compared against 0")
> Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
> Signed-off-by: Kalle Valo <kvalo@codeaurora.org>

Patch applied to ath-next branch of ath.git, thanks.

7bfd82bff60e ath10k: remove unnecessary comparison of unsigned integer with < 0

-- 
https://patchwork.kernel.org/patch/10628679/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH] ath9k: fix RX_STAT_INC() etc macros
From: Kalle Valo @ 2018-10-13 17:28 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: QCA ath9k Development, David S. Miller, Arnd Bergmann,
	Simon Wunderlich, linux-wireless, netdev, linux-kernel
In-Reply-To: <20181009160351.680666-1-arnd@arndb.de>

Arnd Bergmann <arnd@arndb.de> wrote:

> A couple of macros that deal with statistics in ath9k rely on the
> declaration of the 'sc' variable, which they dereference.
> 
> However, when the statistics are disabled, the new instance in
> ath_cmn_process_fft() causes a warning for an unused variable:
> 
> drivers/net/wireless/ath/ath9k/common-spectral.c: In function 'ath_cmn_process_fft':
> drivers/net/wireless/ath/ath9k/common-spectral.c:474:20: error: unused variable 'sc' [-Werror=unused-variable]
> 
> It's better if those macros only operate on their arguments instead of
> known variable names, and adding a cast to (void) kills off that warning.
> 
> Fixes: 03224678c013 ("ath9k: add counters for good and errorneous FFT/spectral frames")
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
> Signed-off-by: Kalle Valo <kvalo@codeaurora.org>

Patch applied to ath-next branch of ath.git, thanks.

72569b7be461 ath9k: fix RX_STAT_INC() etc macros

-- 
https://patchwork.kernel.org/patch/10632911/

https://wireless.wiki.kernel.org/en/developers/documentation/submittingpatches

^ permalink raw reply

* Re: [PATCH] ath10k: htt_rx: Fix signedness bug in ath10k_update_per_peer_tx_stats
From: Gustavo A. R. Silva @ 2018-10-13 18:26 UTC (permalink / raw)
  To: Kalle Valo; +Cc: David S. Miller, ath10k, linux-wireless, netdev, linux-kernel
In-Reply-To: <20181013172330.1CC8E60BFE@smtp.codeaurora.org>

On 10/13/18 7:23 PM, Kalle Valo wrote:
> 
> Patch applied to ath-next branch of ath.git, thanks.
> 
> 9d9cdbf3f9ed ath10k: htt_rx: fix signedness bug in ath10k_update_per_peer_tx_stats
> 

Thank you, Kalle.
--
Gustavo

^ permalink raw reply

* [PATCH net-next v2] net: dsa: mv88e6xxx: Fix 88E6141/6341 2500mbps SERDES speed
From: Marek Behún @ 2018-10-13 12:40 UTC (permalink / raw)
  To: netdev; +Cc: Andrew Lunn, David S . Miller, Marek Behún

This is a fix for the port_set_speed method for the Topaz family.
Currently the same method is used as for the Peridot family, but
this is wrong for the SERDES port.

On Topaz, the SERDES port is port 5, not 9 and 10 as in Peridot.
Moreover setting alt_bit on Topaz only makes sense for port 0 (for
differentiating 100mbps vs 200mbps). The SERDES port does not
support more than 2500mbps, so alt_bit does not make any difference.

Signed-off-by: Marek Behún <marek.behun@nic.cz>
---
 drivers/net/dsa/mv88e6xxx/chip.c |  4 ++--
 drivers/net/dsa/mv88e6xxx/port.c | 25 +++++++++++++++++++++++--
 drivers/net/dsa/mv88e6xxx/port.h |  1 +
 3 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
index 78ce820b5257..e05d4eddc935 100644
--- a/drivers/net/dsa/mv88e6xxx/chip.c
+++ b/drivers/net/dsa/mv88e6xxx/chip.c
@@ -2907,7 +2907,7 @@ static const struct mv88e6xxx_ops mv88e6141_ops = {
 	.port_set_link = mv88e6xxx_port_set_link,
 	.port_set_duplex = mv88e6xxx_port_set_duplex,
 	.port_set_rgmii_delay = mv88e6390_port_set_rgmii_delay,
-	.port_set_speed = mv88e6390_port_set_speed,
+	.port_set_speed = mv88e6341_port_set_speed,
 	.port_tag_remap = mv88e6095_port_tag_remap,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
@@ -3528,7 +3528,7 @@ static const struct mv88e6xxx_ops mv88e6341_ops = {
 	.port_set_link = mv88e6xxx_port_set_link,
 	.port_set_duplex = mv88e6xxx_port_set_duplex,
 	.port_set_rgmii_delay = mv88e6390_port_set_rgmii_delay,
-	.port_set_speed = mv88e6390_port_set_speed,
+	.port_set_speed = mv88e6341_port_set_speed,
 	.port_tag_remap = mv88e6095_port_tag_remap,
 	.port_set_frame_mode = mv88e6351_port_set_frame_mode,
 	.port_set_egress_floods = mv88e6352_port_set_egress_floods,
diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c
index 92945841c8e8..cd7db60a508b 100644
--- a/drivers/net/dsa/mv88e6xxx/port.c
+++ b/drivers/net/dsa/mv88e6xxx/port.c
@@ -228,8 +228,11 @@ static int mv88e6xxx_port_set_speed(struct mv88e6xxx_chip *chip, int port,
 		ctrl = MV88E6XXX_PORT_MAC_CTL_SPEED_1000;
 		break;
 	case 2500:
-		ctrl = MV88E6390_PORT_MAC_CTL_SPEED_10000 |
-			MV88E6390_PORT_MAC_CTL_ALTSPEED;
+		if (alt_bit)
+			ctrl = MV88E6390_PORT_MAC_CTL_SPEED_10000 |
+				MV88E6390_PORT_MAC_CTL_ALTSPEED;
+		else
+			ctrl = MV88E6390_PORT_MAC_CTL_SPEED_10000;
 		break;
 	case 10000:
 		/* all bits set, fall through... */
@@ -291,6 +294,24 @@ int mv88e6185_port_set_speed(struct mv88e6xxx_chip *chip, int port, int speed)
 	return mv88e6xxx_port_set_speed(chip, port, speed, false, false);
 }
 
+/* Support 10, 100, 200, 1000, 2500 Mbps (e.g. 88E6341) */
+int mv88e6341_port_set_speed(struct mv88e6xxx_chip *chip, int port, int speed)
+{
+	if (speed == SPEED_MAX)
+		speed = port < 5 ? 1000 : 2500;
+
+	if (speed > 2500)
+		return -EOPNOTSUPP;
+
+	if (speed == 200 && port != 0)
+		return -EOPNOTSUPP;
+
+	if (speed == 2500 && port < 5)
+		return -EOPNOTSUPP;
+
+	return mv88e6xxx_port_set_speed(chip, port, speed, !port, true);
+}
+
 /* Support 10, 100, 200, 1000 Mbps (e.g. 88E6352 family) */
 int mv88e6352_port_set_speed(struct mv88e6xxx_chip *chip, int port, int speed)
 {
diff --git a/drivers/net/dsa/mv88e6xxx/port.h b/drivers/net/dsa/mv88e6xxx/port.h
index f32f56af8e35..36904c9bf955 100644
--- a/drivers/net/dsa/mv88e6xxx/port.h
+++ b/drivers/net/dsa/mv88e6xxx/port.h
@@ -269,6 +269,7 @@ int mv88e6xxx_port_set_duplex(struct mv88e6xxx_chip *chip, int port, int dup);
 
 int mv88e6065_port_set_speed(struct mv88e6xxx_chip *chip, int port, int speed);
 int mv88e6185_port_set_speed(struct mv88e6xxx_chip *chip, int port, int speed);
+int mv88e6341_port_set_speed(struct mv88e6xxx_chip *chip, int port, int speed);
 int mv88e6352_port_set_speed(struct mv88e6xxx_chip *chip, int port, int speed);
 int mv88e6390_port_set_speed(struct mv88e6xxx_chip *chip, int port, int speed);
 int mv88e6390x_port_set_speed(struct mv88e6xxx_chip *chip, int port, int speed);
-- 
2.18.1

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox