Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v4 net-next 1/6] lwt: Add net to build_state argument
From: Tom Herbert @ 2017-12-15 18:27 UTC (permalink / raw)
  To: davem; +Cc: netdev, roopa, rohit, Tom Herbert
In-Reply-To: <20171215182800.10248-1-tom@quantonium.net>

Users of LWT need access to the network namespace (struct net) if they
want to perform per-net operations in LWT.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/net/lwtunnel.h    |  6 +++---
 net/core/lwt_bpf.c        |  2 +-
 net/core/lwtunnel.c       |  4 ++--
 net/ipv4/fib_semantics.c  | 13 ++++++++-----
 net/ipv4/ip_tunnel_core.c |  4 ++--
 net/ipv6/ila/ila_lwt.c    |  2 +-
 net/ipv6/route.c          |  2 +-
 net/ipv6/seg6_iptunnel.c  |  2 +-
 net/ipv6/seg6_local.c     |  5 +++--
 net/mpls/mpls_iptunnel.c  |  2 +-
 10 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index d747ef975cd8..da5e51e0d122 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -34,7 +34,7 @@ struct lwtunnel_state {
 };
 
 struct lwtunnel_encap_ops {
-	int (*build_state)(struct nlattr *encap,
+	int (*build_state)(struct net *net, struct nlattr *encap,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts,
 			   struct netlink_ext_ack *extack);
@@ -113,7 +113,7 @@ int lwtunnel_valid_encap_type(u16 encap_type,
 			      struct netlink_ext_ack *extack);
 int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len,
 				   struct netlink_ext_ack *extack);
-int lwtunnel_build_state(u16 encap_type,
+int lwtunnel_build_state(struct net *net, u16 encap_type,
 			 struct nlattr *encap,
 			 unsigned int family, const void *cfg,
 			 struct lwtunnel_state **lws,
@@ -192,7 +192,7 @@ static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len,
 	return 0;
 }
 
-static inline int lwtunnel_build_state(u16 encap_type,
+static inline int lwtunnel_build_state(struct net *net, u16 encap_type,
 				       struct nlattr *encap,
 				       unsigned int family, const void *cfg,
 				       struct lwtunnel_state **lws,
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index e7e626fb87bb..3a3ac13fcf06 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -238,7 +238,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
 	[LWT_BPF_XMIT_HEADROOM]	= { .type = NLA_U32 },
 };
 
-static int bpf_build_state(struct nlattr *nla,
+static int bpf_build_state(struct net *net, struct nlattr *nla,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts,
 			   struct netlink_ext_ack *extack)
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 0b171756453c..b3f2f77dfe72 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -103,7 +103,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
 }
 EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops);
 
-int lwtunnel_build_state(u16 encap_type,
+int lwtunnel_build_state(struct net *net, u16 encap_type,
 			 struct nlattr *encap, unsigned int family,
 			 const void *cfg, struct lwtunnel_state **lws,
 			 struct netlink_ext_ack *extack)
@@ -124,7 +124,7 @@ int lwtunnel_build_state(u16 encap_type,
 	ops = rcu_dereference(lwtun_encaps[encap_type]);
 	if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
 		found = true;
-		ret = ops->build_state(encap, family, cfg, lws, extack);
+		ret = ops->build_state(net, encap, family, cfg, lws, extack);
 		if (ret)
 			module_put(ops->owner);
 	}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f04d944f8abe..4979e5c6b9b8 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -523,6 +523,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 			if (nla) {
 				struct lwtunnel_state *lwtstate;
 				struct nlattr *nla_entype;
+				struct net *net = cfg->fc_nlinfo.nl_net;
 
 				nla_entype = nla_find(attrs, attrlen,
 						      RTA_ENCAP_TYPE);
@@ -533,7 +534,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
 					goto err_inval;
 				}
 
-				ret = lwtunnel_build_state(nla_get_u16(
+				ret = lwtunnel_build_state(net, nla_get_u16(
 							   nla_entype),
 							   nla,  AF_INET, cfg,
 							   &lwtstate, extack);
@@ -607,7 +608,7 @@ static void fib_rebalance(struct fib_info *fi)
 
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
-static int fib_encap_match(u16 encap_type,
+static int fib_encap_match(struct net *net, u16 encap_type,
 			   struct nlattr *encap,
 			   const struct fib_nh *nh,
 			   const struct fib_config *cfg,
@@ -619,7 +620,7 @@ static int fib_encap_match(u16 encap_type,
 	if (encap_type == LWTUNNEL_ENCAP_NONE)
 		return 0;
 
-	ret = lwtunnel_build_state(encap_type, encap, AF_INET,
+	ret = lwtunnel_build_state(net, encap_type, encap, AF_INET,
 				   cfg, &lwtstate, extack);
 	if (!ret) {
 		result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
@@ -632,6 +633,7 @@ static int fib_encap_match(u16 encap_type,
 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 		 struct netlink_ext_ack *extack)
 {
+	struct net *net = cfg->fc_nlinfo.nl_net;
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	struct rtnexthop *rtnh;
 	int remaining;
@@ -642,7 +644,8 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
 
 	if (cfg->fc_oif || cfg->fc_gw) {
 		if (cfg->fc_encap) {
-			if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
+			if (fib_encap_match(net, cfg->fc_encap_type,
+					    cfg->fc_encap,
 					    fi->fib_nh, cfg, extack))
 				return 1;
 		}
@@ -1180,7 +1183,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 					       "LWT encap type not specified");
 				goto err_inval;
 			}
-			err = lwtunnel_build_state(cfg->fc_encap_type,
+			err = lwtunnel_build_state(net, cfg->fc_encap_type,
 						   cfg->fc_encap, AF_INET, cfg,
 						   &lwtstate, extack);
 			if (err)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 2f39479be92f..32e05aa6117d 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -228,7 +228,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
 	[LWTUNNEL_IP_FLAGS]	= { .type = NLA_U16 },
 };
 
-static int ip_tun_build_state(struct nlattr *attr,
+static int ip_tun_build_state(struct net *net, struct nlattr *attr,
 			      unsigned int family, const void *cfg,
 			      struct lwtunnel_state **ts,
 			      struct netlink_ext_ack *extack)
@@ -327,7 +327,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
 	[LWTUNNEL_IP6_FLAGS]		= { .type = NLA_U16 },
 };
 
-static int ip6_tun_build_state(struct nlattr *attr,
+static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
 			       unsigned int family, const void *cfg,
 			       struct lwtunnel_state **ts,
 			       struct netlink_ext_ack *extack)
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 3d56a2fb6f86..9f1e46a1468e 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -125,7 +125,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
 	[ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, },
 };
 
-static int ila_build_state(struct nlattr *nla,
+static int ila_build_state(struct net *net, struct nlattr *nla,
 			   unsigned int family, const void *cfg,
 			   struct lwtunnel_state **ts,
 			   struct netlink_ext_ack *extack)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b3f4d19b3ca5..0e0cc97e8f42 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2565,7 +2565,7 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
 	if (cfg->fc_encap) {
 		struct lwtunnel_state *lwtstate;
 
-		err = lwtunnel_build_state(cfg->fc_encap_type,
+		err = lwtunnel_build_state(net, cfg->fc_encap_type,
 					   cfg->fc_encap, AF_INET6, cfg,
 					   &lwtstate, extack);
 		if (err)
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index bd6cc688bd19..a6cf2fba15f3 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -359,7 +359,7 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	return err;
 }
 
-static int seg6_build_state(struct nlattr *nla,
+static int seg6_build_state(struct net *net, struct nlattr *nla,
 			    unsigned int family, const void *cfg,
 			    struct lwtunnel_state **ts,
 			    struct netlink_ext_ack *extack)
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 825b8e01f947..45dc670c5a93 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -779,8 +779,9 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
 	return 0;
 }
 
-static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
-				  const void *cfg, struct lwtunnel_state **ts,
+static int seg6_local_build_state(struct net *net, struct nlattr *nla,
+				  unsigned int family, const void *cfg,
+				  struct lwtunnel_state **ts,
 				  struct netlink_ext_ack *extack)
 {
 	struct nlattr *tb[SEG6_LOCAL_MAX + 1];
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 6e558a419f60..c947310cc04f 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -157,7 +157,7 @@ static int mpls_xmit(struct sk_buff *skb)
 	return -EINVAL;
 }
 
-static int mpls_build_state(struct nlattr *nla,
+static int mpls_build_state(struct net *net, struct nlattr *nla,
 			    unsigned int family, const void *cfg,
 			    struct lwtunnel_state **ts,
 			    struct netlink_ext_ack *extack)
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 2/6] ila: Fix use of rhashtable walk in ila_xlat.c
From: Tom Herbert @ 2017-12-15 18:27 UTC (permalink / raw)
  To: davem; +Cc: netdev, roopa, rohit, Tom Herbert
In-Reply-To: <20171215182800.10248-1-tom@quantonium.net>

Perform better EAGAIN handling, handle the case where ila_dump_info
fails and we miss objects in the dump, and add a skip index
to skip over ila entries in a list on a rhashtable node that have
already been visited (by a previous call to ila_nl_dump).

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 net/ipv6/ila/ila_xlat.c | 70 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 16 deletions(-)

diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 44c39c5f0638..887dd5b785b5 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -474,24 +474,31 @@ static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
 
 struct ila_dump_iter {
 	struct rhashtable_iter rhiter;
+	int skip;
 };
 
 static int ila_nl_dump_start(struct netlink_callback *cb)
 {
 	struct net *net = sock_net(cb->skb->sk);
 	struct ila_net *ilan = net_generic(net, ila_net_id);
-	struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
+	struct ila_dump_iter *iter;
+	int ret;
 
-	if (!iter) {
-		iter = kmalloc(sizeof(*iter), GFP_KERNEL);
-		if (!iter)
-			return -ENOMEM;
+	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
 
-		cb->args[0] = (long)iter;
+	ret = rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
+				   GFP_KERNEL);
+	if (ret) {
+		kfree(iter);
+		return ret;
 	}
 
-	return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
-				    GFP_KERNEL);
+	iter->skip = 0;
+	cb->args[0] = (long)iter;
+
+	return ret;
 }
 
 static int ila_nl_dump_done(struct netlink_callback *cb)
@@ -509,20 +516,45 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
 	struct rhashtable_iter *rhiter = &iter->rhiter;
+	int skip = iter->skip;
 	struct ila_map *ila;
 	int ret;
 
 	rhashtable_walk_start(rhiter);
 
-	for (;;) {
-		ila = rhashtable_walk_next(rhiter);
+	/* Get first entry */
+	ila = rhashtable_walk_peek(rhiter);
+
+	if (ila && !IS_ERR(ila) && skip) {
+		/* Skip over visited entries */
+
+		while (ila && skip) {
+			/* Skip over any ila entries in this list that we
+			 * have already dumped.
+			 */
+			ila = rcu_access_pointer(ila->next);
+			skip--;
+		}
+	}
 
+	skip = 0;
+
+	for (;;) {
 		if (IS_ERR(ila)) {
-			if (PTR_ERR(ila) == -EAGAIN)
-				continue;
 			ret = PTR_ERR(ila);
-			goto done;
+			if (ret == -EAGAIN) {
+				/* Table has changed and iter has reset. Return
+				 * -EAGAIN to the application even if we have
+				 * written data to the skb. The application
+				 * needs to deal with this.
+				 */
+
+				goto out_ret;
+			} else {
+				break;
+			}
 		} else if (!ila) {
+			ret = 0;
 			break;
 		}
 
@@ -531,15 +563,21 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 					     cb->nlh->nlmsg_seq, NLM_F_MULTI,
 					     skb, ILA_CMD_GET);
 			if (ret)
-				goto done;
+				goto out;
 
+			skip++;
 			ila = rcu_access_pointer(ila->next);
 		}
+
+		skip = 0;
+		ila = rhashtable_walk_next(rhiter);
 	}
 
-	ret = skb->len;
+out:
+	iter->skip = skip;
+	ret = (skb->len ? : ret);
 
-done:
+out_ret:
 	rhashtable_walk_stop(rhiter);
 	return ret;
 }
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 3/6] ila: Call library function alloc_bucket_spinlocks
From: Tom Herbert @ 2017-12-15 18:27 UTC (permalink / raw)
  To: davem; +Cc: netdev, roopa, rohit, Tom Herbert
In-Reply-To: <20171215182800.10248-1-tom@quantonium.net>

To allocate the array of bucket locks for the hash table we now
call library function alloc_bucket_spinlocks.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 net/ipv6/ila/ila_xlat.c | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 887dd5b785b5..3ef8869ac508 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -31,26 +31,14 @@ struct ila_net {
 	bool hooks_registered;
 };
 
+#define MAX_LOCKS 1024
 #define	LOCKS_PER_CPU 10
 
 static int alloc_ila_locks(struct ila_net *ilan)
 {
-	unsigned int i, size;
-	unsigned int nr_pcpus = num_possible_cpus();
-
-	nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
-	size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU);
-
-	if (sizeof(spinlock_t) != 0) {
-		ilan->locks = kvmalloc(size * sizeof(spinlock_t), GFP_KERNEL);
-		if (!ilan->locks)
-			return -ENOMEM;
-		for (i = 0; i < size; i++)
-			spin_lock_init(&ilan->locks[i]);
-	}
-	ilan->locks_mask = size - 1;
-
-	return 0;
+	return alloc_bucket_spinlocks(&ilan->xlat.locks, &ilan->xlat.locks_mask,
+				      MAX_LOCKS, LOCKS_PER_CPU,
+				      GFP_KERNEL);
 }
 
 static u32 hashrnd __read_mostly;
@@ -639,7 +627,7 @@ static __net_exit void ila_exit_net(struct net *net)
 
 	rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL);
 
-	kvfree(ilan->locks);
+	free_bucket_spinlocks(ilan->xlat.locks);
 
 	if (ilan->hooks_registered)
 		nf_unregister_net_hooks(net, ila_nf_hook_ops,
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 4/6] ila: create main ila source file
From: Tom Herbert @ 2017-12-15 18:27 UTC (permalink / raw)
  To: davem; +Cc: netdev, roopa, rohit, Tom Herbert
In-Reply-To: <20171215182800.10248-1-tom@quantonium.net>

Create a main ila file that contains the module initialization functions
as well as netlink definitions. Previously these were defined in
ila_xlat and ila_common. This approach allows better extensibility.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 net/ipv6/ila/Makefile     |   2 +-
 net/ipv6/ila/ila.h        |  26 ++++++++-
 net/ipv6/ila/ila_common.c |  30 ----------
 net/ipv6/ila/ila_main.c   | 115 ++++++++++++++++++++++++++++++++++++++
 net/ipv6/ila/ila_xlat.c   | 138 +++++++++-------------------------------------
 5 files changed, 166 insertions(+), 145 deletions(-)
 create mode 100644 net/ipv6/ila/ila_main.c

diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile
index 4b32e5921e5c..b7739aba6e68 100644
--- a/net/ipv6/ila/Makefile
+++ b/net/ipv6/ila/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_IPV6_ILA) += ila.o
 
-ila-objs := ila_common.o ila_lwt.o ila_xlat.o
+ila-objs := ila_main.o ila_common.o ila_lwt.o ila_xlat.o
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index 3c7a11b62334..faba7824ea56 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -19,6 +19,7 @@
 #include <linux/skbuff.h>
 #include <linux/types.h>
 #include <net/checksum.h>
+#include <net/genetlink.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <uapi/linux/ila.h>
@@ -104,9 +105,30 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
 
 void ila_init_saved_csum(struct ila_params *p);
 
+struct ila_net {
+	struct {
+		struct rhashtable rhash_table;
+		spinlock_t *locks; /* Bucket locks for entry manipulation */
+		unsigned int locks_mask;
+		bool hooks_registered;
+	} xlat;
+};
+
 int ila_lwt_init(void);
 void ila_lwt_fini(void);
-int ila_xlat_init(void);
-void ila_xlat_fini(void);
+
+int ila_xlat_init_net(struct net *net);
+void ila_xlat_exit_net(struct net *net);
+
+int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_dump_start(struct netlink_callback *cb);
+int ila_xlat_nl_dump_done(struct netlink_callback *cb);
+int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb);
+
+extern unsigned int ila_net_id;
+
+extern struct genl_family ila_nl_family;
 
 #endif /* __ILA_H */
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index 8c88ecf29b93..579310466eac 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -154,33 +154,3 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
 	iaddr->loc = p->locator;
 }
 
-static int __init ila_init(void)
-{
-	int ret;
-
-	ret = ila_lwt_init();
-
-	if (ret)
-		goto fail_lwt;
-
-	ret = ila_xlat_init();
-	if (ret)
-		goto fail_xlat;
-
-	return 0;
-fail_xlat:
-	ila_lwt_fini();
-fail_lwt:
-	return ret;
-}
-
-static void __exit ila_fini(void)
-{
-	ila_xlat_fini();
-	ila_lwt_fini();
-}
-
-module_init(ila_init);
-module_exit(ila_fini);
-MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
-MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c
new file mode 100644
index 000000000000..f6ac6b14577e
--- /dev/null
+++ b/net/ipv6/ila/ila_main.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <net/genetlink.h>
+#include <net/ila.h>
+#include <net/netns/generic.h>
+#include <uapi/linux/genetlink.h>
+#include "ila.h"
+
+static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
+	[ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
+	[ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
+	[ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
+	[ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
+	[ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
+};
+
+static const struct genl_ops ila_nl_ops[] = {
+	{
+		.cmd = ILA_CMD_ADD,
+		.doit = ila_xlat_nl_cmd_add_mapping,
+		.policy = ila_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = ILA_CMD_DEL,
+		.doit = ila_xlat_nl_cmd_del_mapping,
+		.policy = ila_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
+		.cmd = ILA_CMD_GET,
+		.doit = ila_xlat_nl_cmd_get_mapping,
+		.start = ila_xlat_nl_dump_start,
+		.dumpit = ila_xlat_nl_dump,
+		.done = ila_xlat_nl_dump_done,
+		.policy = ila_nl_policy,
+	},
+};
+
+unsigned int ila_net_id;
+
+struct genl_family ila_nl_family __ro_after_init = {
+	.hdrsize	= 0,
+	.name		= ILA_GENL_NAME,
+	.version	= ILA_GENL_VERSION,
+	.maxattr	= ILA_ATTR_MAX,
+	.netnsok	= true,
+	.parallel_ops	= true,
+	.module		= THIS_MODULE,
+	.ops		= ila_nl_ops,
+	.n_ops		= ARRAY_SIZE(ila_nl_ops),
+};
+
+static __net_init int ila_init_net(struct net *net)
+{
+	int err;
+
+	err = ila_xlat_init_net(net);
+	if (err)
+		goto ila_xlat_init_fail;
+
+	return 0;
+
+ila_xlat_init_fail:
+	return err;
+}
+
+static __net_exit void ila_exit_net(struct net *net)
+{
+	ila_xlat_exit_net(net);
+}
+
+static struct pernet_operations ila_net_ops = {
+	.init = ila_init_net,
+	.exit = ila_exit_net,
+	.id   = &ila_net_id,
+	.size = sizeof(struct ila_net),
+};
+
+static int __init ila_init(void)
+{
+	int ret;
+
+	ret = register_pernet_device(&ila_net_ops);
+	if (ret)
+		goto register_device_fail;
+
+	ret = genl_register_family(&ila_nl_family);
+	if (ret)
+		goto register_family_fail;
+
+	ret = ila_lwt_init();
+	if (ret)
+		goto fail_lwt;
+
+	return 0;
+
+fail_lwt:
+	genl_unregister_family(&ila_nl_family);
+register_family_fail:
+	unregister_pernet_device(&ila_net_ops);
+register_device_fail:
+	return ret;
+}
+
+static void __exit ila_fini(void)
+{
+	ila_lwt_fini();
+	genl_unregister_family(&ila_nl_family);
+	unregister_pernet_device(&ila_net_ops);
+}
+
+module_init(ila_init);
+module_exit(ila_fini);
+MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 3ef8869ac508..d05de891dfb6 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -22,15 +22,6 @@ struct ila_map {
 	struct rcu_head rcu;
 };
 
-static unsigned int ila_net_id;
-
-struct ila_net {
-	struct rhashtable rhash_table;
-	spinlock_t *locks; /* Bucket locks for entry manipulation */
-	unsigned int locks_mask;
-	bool hooks_registered;
-};
-
 #define MAX_LOCKS 1024
 #define	LOCKS_PER_CPU 10
 
@@ -58,7 +49,7 @@ static inline u32 ila_locator_hash(struct ila_locator loc)
 static inline spinlock_t *ila_get_lock(struct ila_net *ilan,
 				       struct ila_locator loc)
 {
-	return &ilan->locks[ila_locator_hash(loc) & ilan->locks_mask];
+	return &ilan->xlat.locks[ila_locator_hash(loc) & ilan->xlat.locks_mask];
 }
 
 static inline int ila_cmp_wildcards(struct ila_map *ila,
@@ -102,16 +93,6 @@ static const struct rhashtable_params rht_params = {
 	.obj_cmpfn = ila_cmpfn,
 };
 
-static struct genl_family ila_nl_family;
-
-static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
-	[ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
-	[ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
-	[ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
-	[ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
-	[ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
-};
-
 static int parse_nl_config(struct genl_info *info,
 			   struct ila_xlat_params *xp)
 {
@@ -149,7 +130,7 @@ static inline struct ila_map *ila_lookup_wildcards(struct ila_addr *iaddr,
 {
 	struct ila_map *ila;
 
-	ila = rhashtable_lookup_fast(&ilan->rhash_table, &iaddr->loc,
+	ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table, &iaddr->loc,
 				     rht_params);
 	while (ila) {
 		if (!ila_cmp_wildcards(ila, iaddr, ifindex))
@@ -166,7 +147,7 @@ static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *xp,
 {
 	struct ila_map *ila;
 
-	ila = rhashtable_lookup_fast(&ilan->rhash_table,
+	ila = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
 				     &xp->ip.locator_match,
 				     rht_params);
 	while (ila) {
@@ -222,7 +203,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
 	spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match);
 	int err = 0, order;
 
-	if (!ilan->hooks_registered) {
+	if (!ilan->xlat.hooks_registered) {
 		/* We defer registering net hooks in the namespace until the
 		 * first mapping is added.
 		 */
@@ -231,7 +212,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
 		if (err)
 			return err;
 
-		ilan->hooks_registered = true;
+		ilan->xlat.hooks_registered = true;
 	}
 
 	ila = kzalloc(sizeof(*ila), GFP_KERNEL);
@@ -246,12 +227,12 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
 
 	spin_lock(lock);
 
-	head = rhashtable_lookup_fast(&ilan->rhash_table,
+	head = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
 				      &xp->ip.locator_match,
 				      rht_params);
 	if (!head) {
 		/* New entry for the rhash_table */
-		err = rhashtable_lookup_insert_fast(&ilan->rhash_table,
+		err = rhashtable_lookup_insert_fast(&ilan->xlat.rhash_table,
 						    &ila->node, rht_params);
 	} else {
 		struct ila_map *tila = head, *prev = NULL;
@@ -277,7 +258,7 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
 		} else {
 			/* Make this ila new head */
 			RCU_INIT_POINTER(ila->next, head);
-			err = rhashtable_replace_fast(&ilan->rhash_table,
+			err = rhashtable_replace_fast(&ilan->xlat.rhash_table,
 						      &head->node,
 						      &ila->node, rht_params);
 			if (err)
@@ -303,7 +284,7 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp)
 
 	spin_lock(lock);
 
-	head = rhashtable_lookup_fast(&ilan->rhash_table,
+	head = rhashtable_lookup_fast(&ilan->xlat.rhash_table,
 				      &xp->ip.locator_match, rht_params);
 	ila = head;
 
@@ -333,15 +314,15 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp)
 				 * table
 				 */
 				err = rhashtable_replace_fast(
-					&ilan->rhash_table, &ila->node,
+					&ilan->xlat.rhash_table, &ila->node,
 					&head->node, rht_params);
 				if (err)
 					goto out;
 			} else {
 				/* Entry no longer used */
-				err = rhashtable_remove_fast(&ilan->rhash_table,
-							     &ila->node,
-							     rht_params);
+				err = rhashtable_remove_fast(
+						&ilan->xlat.rhash_table,
+						&ila->node, rht_params);
 			}
 		}
 
@@ -356,7 +337,7 @@ static int ila_del_mapping(struct net *net, struct ila_xlat_params *xp)
 	return err;
 }
 
-static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
+int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
 {
 	struct net *net = genl_info_net(info);
 	struct ila_xlat_params p;
@@ -369,7 +350,7 @@ static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
 	return ila_add_mapping(net, &p);
 }
 
-static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
+int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
 {
 	struct net *net = genl_info_net(info);
 	struct ila_xlat_params xp;
@@ -421,7 +402,7 @@ static int ila_dump_info(struct ila_map *ila,
 	return -EMSGSIZE;
 }
 
-static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
+int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
 {
 	struct net *net = genl_info_net(info);
 	struct ila_net *ilan = net_generic(net, ila_net_id);
@@ -465,7 +446,7 @@ struct ila_dump_iter {
 	int skip;
 };
 
-static int ila_nl_dump_start(struct netlink_callback *cb)
+int ila_xlat_nl_dump_start(struct netlink_callback *cb)
 {
 	struct net *net = sock_net(cb->skb->sk);
 	struct ila_net *ilan = net_generic(net, ila_net_id);
@@ -476,7 +457,7 @@ static int ila_nl_dump_start(struct netlink_callback *cb)
 	if (!iter)
 		return -ENOMEM;
 
-	ret = rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter,
+	ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter->rhiter,
 				   GFP_KERNEL);
 	if (ret) {
 		kfree(iter);
@@ -489,7 +470,7 @@ static int ila_nl_dump_start(struct netlink_callback *cb)
 	return ret;
 }
 
-static int ila_nl_dump_done(struct netlink_callback *cb)
+int ila_xlat_nl_dump_done(struct netlink_callback *cb)
 {
 	struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
 
@@ -500,7 +481,7 @@ static int ila_nl_dump_done(struct netlink_callback *cb)
 	return 0;
 }
 
-static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
+int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args[0];
 	struct rhashtable_iter *rhiter = &iter->rhiter;
@@ -570,77 +551,35 @@ static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	return ret;
 }
 
-static const struct genl_ops ila_nl_ops[] = {
-	{
-		.cmd = ILA_CMD_ADD,
-		.doit = ila_nl_cmd_add_mapping,
-		.policy = ila_nl_policy,
-		.flags = GENL_ADMIN_PERM,
-	},
-	{
-		.cmd = ILA_CMD_DEL,
-		.doit = ila_nl_cmd_del_mapping,
-		.policy = ila_nl_policy,
-		.flags = GENL_ADMIN_PERM,
-	},
-	{
-		.cmd = ILA_CMD_GET,
-		.doit = ila_nl_cmd_get_mapping,
-		.start = ila_nl_dump_start,
-		.dumpit = ila_nl_dump,
-		.done = ila_nl_dump_done,
-		.policy = ila_nl_policy,
-	},
-};
-
-static struct genl_family ila_nl_family __ro_after_init = {
-	.hdrsize	= 0,
-	.name		= ILA_GENL_NAME,
-	.version	= ILA_GENL_VERSION,
-	.maxattr	= ILA_ATTR_MAX,
-	.netnsok	= true,
-	.parallel_ops	= true,
-	.module		= THIS_MODULE,
-	.ops		= ila_nl_ops,
-	.n_ops		= ARRAY_SIZE(ila_nl_ops),
-};
-
 #define ILA_HASH_TABLE_SIZE 1024
 
-static __net_init int ila_init_net(struct net *net)
+int ila_xlat_init_net(struct net *net)
 {
-	int err;
 	struct ila_net *ilan = net_generic(net, ila_net_id);
+	int err;
 
 	err = alloc_ila_locks(ilan);
 	if (err)
 		return err;
 
-	rhashtable_init(&ilan->rhash_table, &rht_params);
+	rhashtable_init(&ilan->xlat.rhash_table, &rht_params);
 
 	return 0;
 }
 
-static __net_exit void ila_exit_net(struct net *net)
+void ila_xlat_exit_net(struct net *net)
 {
 	struct ila_net *ilan = net_generic(net, ila_net_id);
 
-	rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL);
+	rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL);
 
 	free_bucket_spinlocks(ilan->xlat.locks);
 
-	if (ilan->hooks_registered)
+	if (ilan->xlat.hooks_registered)
 		nf_unregister_net_hooks(net, ila_nf_hook_ops,
 					ARRAY_SIZE(ila_nf_hook_ops));
 }
 
-static struct pernet_operations ila_net_ops = {
-	.init = ila_init_net,
-	.exit = ila_exit_net,
-	.id   = &ila_net_id,
-	.size = sizeof(struct ila_net),
-};
-
 static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
 {
 	struct ila_map *ila;
@@ -667,28 +606,3 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
 	return 0;
 }
 
-int __init ila_xlat_init(void)
-{
-	int ret;
-
-	ret = register_pernet_device(&ila_net_ops);
-	if (ret)
-		goto exit;
-
-	ret = genl_register_family(&ila_nl_family);
-	if (ret < 0)
-		goto unregister;
-
-	return 0;
-
-unregister:
-	unregister_pernet_device(&ila_net_ops);
-exit:
-	return ret;
-}
-
-void ila_xlat_fini(void)
-{
-	genl_unregister_family(&ila_nl_family);
-	unregister_pernet_device(&ila_net_ops);
-}
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 5/6] ila: Flush netlink command to clear xlat table
From: Tom Herbert @ 2017-12-15 18:27 UTC (permalink / raw)
  To: davem; +Cc: netdev, roopa, rohit, Tom Herbert
In-Reply-To: <20171215182800.10248-1-tom@quantonium.net>

Add ILA_CMD_FLUSH netlink command to clear the ILA translation table.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/uapi/linux/ila.h |  1 +
 net/ipv6/ila/ila.h       |  1 +
 net/ipv6/ila/ila_main.c  |  6 +++++
 net/ipv6/ila/ila_xlat.c  | 62 ++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h
index 483b77af4eb8..db45d3e49a12 100644
--- a/include/uapi/linux/ila.h
+++ b/include/uapi/linux/ila.h
@@ -30,6 +30,7 @@ enum {
 	ILA_CMD_ADD,
 	ILA_CMD_DEL,
 	ILA_CMD_GET,
+	ILA_CMD_FLUSH,
 
 	__ILA_CMD_MAX,
 };
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index faba7824ea56..1f747bcbec29 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -123,6 +123,7 @@ void ila_xlat_exit_net(struct net *net);
 int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info);
 int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info);
 int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info);
+int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info);
 int ila_xlat_nl_dump_start(struct netlink_callback *cb);
 int ila_xlat_nl_dump_done(struct netlink_callback *cb);
 int ila_xlat_nl_dump(struct sk_buff *skb, struct netlink_callback *cb);
diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c
index f6ac6b14577e..18fac76b9520 100644
--- a/net/ipv6/ila/ila_main.c
+++ b/net/ipv6/ila/ila_main.c
@@ -27,6 +27,12 @@ static const struct genl_ops ila_nl_ops[] = {
 		.flags = GENL_ADMIN_PERM,
 	},
 	{
+		.cmd = ILA_CMD_FLUSH,
+		.doit = ila_xlat_nl_cmd_flush,
+		.policy = ila_nl_policy,
+		.flags = GENL_ADMIN_PERM,
+	},
+	{
 		.cmd = ILA_CMD_GET,
 		.doit = ila_xlat_nl_cmd_get_mapping,
 		.start = ila_xlat_nl_dump_start,
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index d05de891dfb6..51a15ce50a64 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -164,9 +164,9 @@ static inline void ila_release(struct ila_map *ila)
 	kfree_rcu(ila, rcu);
 }
 
-static void ila_free_cb(void *ptr, void *arg)
+static void ila_free_node(struct ila_map *ila)
 {
-	struct ila_map *ila = (struct ila_map *)ptr, *next;
+	struct ila_map *next;
 
 	/* Assume rcu_readlock held */
 	while (ila) {
@@ -176,6 +176,11 @@ static void ila_free_cb(void *ptr, void *arg)
 	}
 }
 
+static void ila_free_cb(void *ptr, void *arg)
+{
+	ila_free_node((struct ila_map *)ptr);
+}
+
 static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila);
 
 static unsigned int
@@ -365,6 +370,59 @@ int ila_xlat_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
 	return 0;
 }
 
+static inline spinlock_t *lock_from_ila_map(struct ila_net *ilan,
+					    struct ila_map *ila)
+{
+	return ila_get_lock(ilan, ila->xp.ip.locator_match);
+}
+
+int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = genl_info_net(info);
+	struct ila_net *ilan = net_generic(net, ila_net_id);
+	struct rhashtable_iter iter;
+	struct ila_map *ila;
+	spinlock_t *lock;
+	int ret;
+
+	ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter, GFP_KERNEL);
+	if (ret)
+		goto done;
+
+	rhashtable_walk_start(&iter);
+
+	for (;;) {
+		ila = rhashtable_walk_next(&iter);
+
+		if (IS_ERR(ila)) {
+			if (PTR_ERR(ila) == -EAGAIN)
+				continue;
+			ret = PTR_ERR(ila);
+			goto done;
+		} else if (!ila) {
+			break;
+		}
+
+		lock = lock_from_ila_map(ilan, ila);
+
+		spin_lock(lock);
+
+		ret = rhashtable_remove_fast(&ilan->xlat.rhash_table,
+					     &ila->node, rht_params);
+		if (!ret)
+			ila_free_node(ila);
+
+		spin_unlock(lock);
+
+		if (ret)
+			break;
+	}
+
+done:
+	rhashtable_walk_stop(&iter);
+	return ret;
+}
+
 static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg)
 {
 	if (nla_put_u64_64bit(msg, ILA_ATTR_LOCATOR,
-- 
2.11.0

^ permalink raw reply related

* [PATCH v4 net-next 6/6] ila: Route notify
From: Tom Herbert @ 2017-12-15 18:28 UTC (permalink / raw)
  To: davem; +Cc: netdev, roopa, rohit, Tom Herbert
In-Reply-To: <20171215182800.10248-1-tom@quantonium.net>

Implement RTM notifications for ILA routers. This adds support to
ILA LWT to send a netlink RTM message when a route is used.

The ILA notify mechanism can be used in two contexts:

- On an ILA forwarding cache a route prefix can be configured to
  do an ILA notification. This method is used when address
  resolution needs to be done on an address.
- On an ILA router an ILA host route entry may include a
  notification. The purpose of this is to get a notification
  to a userspace daemon to send an ILA redirect.

Signed-off-by: Tom Herbert <tom@quantonium.net>
---
 include/uapi/linux/ila.h       |   2 +
 include/uapi/linux/rtnetlink.h |   8 +-
 net/ipv6/ila/ila_lwt.c         | 268 ++++++++++++++++++++++++++++-------------
 3 files changed, 193 insertions(+), 85 deletions(-)

diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h
index db45d3e49a12..5675f3e71fac 100644
--- a/include/uapi/linux/ila.h
+++ b/include/uapi/linux/ila.h
@@ -19,6 +19,8 @@ enum {
 	ILA_ATTR_CSUM_MODE,			/* u8 */
 	ILA_ATTR_IDENT_TYPE,			/* u8 */
 	ILA_ATTR_HOOK_TYPE,			/* u8 */
+	ILA_ATTR_NOTIFY_DST,			/* flag */
+	ILA_ATTR_NOTIFY_SRC,			/* flag */
 
 	__ILA_ATTR_MAX,
 };
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index d8b5f80c2ea6..8d358a300d8a 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -13,7 +13,8 @@
  */
 #define RTNL_FAMILY_IPMR		128
 #define RTNL_FAMILY_IP6MR		129
-#define RTNL_FAMILY_MAX			129
+#define RTNL_FAMILY_ILA			130
+#define RTNL_FAMILY_MAX			130
 
 /****
  *		Routing/neighbour discovery messages.
@@ -150,6 +151,9 @@ enum {
 	RTM_NEWCACHEREPORT = 96,
 #define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT
 
+	RTM_ADDR_RESOLVE = 98,
+#define RTM_ADDR_RESOLVE RTM_ADDR_RESOLVE
+
 	__RTM_MAX,
 #define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
 };
@@ -676,6 +680,8 @@ enum rtnetlink_groups {
 #define RTNLGRP_IPV4_MROUTE_R	RTNLGRP_IPV4_MROUTE_R
 	RTNLGRP_IPV6_MROUTE_R,
 #define RTNLGRP_IPV6_MROUTE_R	RTNLGRP_IPV6_MROUTE_R
+	RTNLGRP_ILA_NOTIFY,
+#define RTNLGRP_ILA_NOTIFY	RTNLGRP_ILA_NOTIFY
 	__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 9f1e46a1468e..303c91e3bf76 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -19,10 +19,15 @@
 struct ila_lwt {
 	struct ila_params p;
 	struct dst_cache dst_cache;
+	u8 hook_type;
 	u32 connected : 1;
-	u32 lwt_output : 1;
+	u32 xlat : 1;
+	u32 notify : 2;
 };
 
+#define ILA_NOTIFY_DST 1
+#define ILA_NOTIFY_SRC 2
+
 static inline struct ila_lwt *ila_lwt_lwtunnel(
 	struct lwtunnel_state *lwt)
 {
@@ -35,6 +40,67 @@ static inline struct ila_params *ila_params_lwtunnel(
 	return &ila_lwt_lwtunnel(lwt)->p;
 }
 
+static size_t ila_rslv_msgsize(void)
+{
+	size_t len =
+		NLMSG_ALIGN(sizeof(struct rtmsg))
+		+ nla_total_size(16)     /* RTA_DST */
+		+ nla_total_size(16)     /* RTA_SRC */
+		;
+
+	return len;
+}
+
+void ila_notify(struct net *net, struct sk_buff *skb, struct ila_lwt *lwt)
+{
+	struct ipv6hdr *ip6h = ipv6_hdr(skb);
+	int flags = NLM_F_MULTI;
+	struct sk_buff *nlskb;
+	struct nlmsghdr *nlh;
+	struct rtmsg *rtm;
+	int err = 0;
+
+	/* Send ILA notification to user */
+	nlskb = nlmsg_new(ila_rslv_msgsize(), GFP_KERNEL);
+	if (!nlskb)
+		return;
+
+	nlh = nlmsg_put(nlskb, 0, 0, RTM_ADDR_RESOLVE, sizeof(*rtm), flags);
+	if (!nlh) {
+		err = -EMSGSIZE;
+		goto errout;
+	}
+
+	rtm = nlmsg_data(nlh);
+	rtm->rtm_family   = AF_INET6;
+	rtm->rtm_dst_len  = 128;
+	rtm->rtm_src_len  = 0;
+	rtm->rtm_tos      = 0;
+	rtm->rtm_table    = RT6_TABLE_UNSPEC;
+	rtm->rtm_type     = RTN_UNICAST;
+	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
+
+	if (((lwt->notify & ILA_NOTIFY_DST) &&
+	     nla_put_in6_addr(nlskb, RTA_DST, &ip6h->daddr)) ||
+	    ((lwt->notify & ILA_NOTIFY_SRC) &&
+	     nla_put_in6_addr(nlskb, RTA_SRC, &ip6h->saddr))) {
+		nlmsg_cancel(nlskb, nlh);
+		err = -EMSGSIZE;
+		goto errout;
+	}
+
+	nlmsg_end(nlskb, nlh);
+
+	rtnl_notify(nlskb, net, 0, RTNLGRP_ILA_NOTIFY, NULL, GFP_ATOMIC);
+
+	return;
+
+errout:
+	kfree_skb(nlskb);
+	WARN_ON(err == -EMSGSIZE);
+	rtnl_set_sk_err(net, RTNLGRP_ILA_NOTIFY, err);
+}
+
 static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct dst_entry *orig_dst = skb_dst(skb);
@@ -46,11 +112,14 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	if (skb->protocol != htons(ETH_P_IPV6))
 		goto drop;
 
-	if (ilwt->lwt_output)
+	if (ilwt->xlat)
 		ila_update_ipv6_locator(skb,
 					ila_params_lwtunnel(orig_dst->lwtstate),
 					true);
 
+	if (ilwt->notify)
+		ila_notify(net, skb, ilwt);
+
 	if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) {
 		/* Already have a next hop address in route, no need for
 		 * dest cache route.
@@ -106,11 +175,14 @@ static int ila_input(struct sk_buff *skb)
 	if (skb->protocol != htons(ETH_P_IPV6))
 		goto drop;
 
-	if (!ilwt->lwt_output)
+	if (ilwt->xlat)
 		ila_update_ipv6_locator(skb,
 					ila_params_lwtunnel(dst->lwtstate),
 					false);
 
+	if (ilwt->notify)
+		ila_notify(dev_net(dst->dev), skb, ilwt);
+
 	return dst->lwtstate->orig_input(skb);
 
 drop:
@@ -123,6 +195,8 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
 	[ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, },
 	[ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, },
 	[ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, },
+	[ILA_ATTR_NOTIFY_DST] = { .type = NLA_FLAG },
+	[ILA_ATTR_NOTIFY_SRC] = { .type = NLA_FLAG },
 };
 
 static int ila_build_state(struct net *net, struct nlattr *nla,
@@ -130,64 +204,73 @@ static int ila_build_state(struct net *net, struct nlattr *nla,
 			   struct lwtunnel_state **ts,
 			   struct netlink_ext_ack *extack)
 {
-	struct ila_lwt *ilwt;
-	struct ila_params *p;
-	struct nlattr *tb[ILA_ATTR_MAX + 1];
-	struct lwtunnel_state *newts;
 	const struct fib6_config *cfg6 = cfg;
-	struct ila_addr *iaddr;
+	struct ila_addr *iaddr = (struct ila_addr *)&cfg6->fc_dst;
 	u8 ident_type = ILA_ATYPE_USE_FORMAT;
 	u8 hook_type = ILA_HOOK_ROUTE_OUTPUT;
+	struct nlattr *tb[ILA_ATTR_MAX + 1];
 	u8 csum_mode = ILA_CSUM_NO_ACTION;
-	bool lwt_output = true;
+	struct lwtunnel_state *newts;
+	struct ila_lwt *ilwt;
+	struct ila_params *p;
 	u8 eff_ident_type;
-	int ret;
+	int err;
 
 	if (family != AF_INET6)
 		return -EINVAL;
 
-	ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack);
-	if (ret < 0)
-		return ret;
+	err = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack);
+	if (err < 0)
+		return err;
 
-	if (!tb[ILA_ATTR_LOCATOR])
-		return -EINVAL;
+	if (tb[ILA_ATTR_LOCATOR]) {
+		/* Doing ILA translation */
 
-	iaddr = (struct ila_addr *)&cfg6->fc_dst;
+		if (tb[ILA_ATTR_IDENT_TYPE])
+			ident_type = nla_get_u8(tb[ILA_ATTR_IDENT_TYPE]);
 
-	if (tb[ILA_ATTR_IDENT_TYPE])
-		ident_type = nla_get_u8(tb[ILA_ATTR_IDENT_TYPE]);
+		if (ident_type == ILA_ATYPE_USE_FORMAT) {
+			/* Infer identifier type from type field in formatted
+			 * identifier.
+			 */
 
-	if (ident_type == ILA_ATYPE_USE_FORMAT) {
-		/* Infer identifier type from type field in formatted
-		 * identifier.
-		 */
+			if (cfg6->fc_dst_len < 8 *
+			    sizeof(struct ila_locator) + 3) {
+				/* Need to have full locator and at least type
+				 * field included in destination
+				 */
+				return -EINVAL;
+			}
+
+			eff_ident_type = iaddr->ident.type;
+		} else {
+			eff_ident_type = ident_type;
+		}
 
-		if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) {
-			/* Need to have full locator and at least type field
-			 * included in destination
-			 */
+		switch (eff_ident_type) {
+		case ILA_ATYPE_IID:
+			/* Don't allow ILA for IID type */
+			return -EINVAL;
+		case ILA_ATYPE_LUID:
+			break;
+		case ILA_ATYPE_VIRT_V4:
+		case ILA_ATYPE_VIRT_UNI_V6:
+		case ILA_ATYPE_VIRT_MULTI_V6:
+		case ILA_ATYPE_NONLOCAL_ADDR:
+			/* These ILA formats are not supported yet. */
+		default:
 			return -EINVAL;
 		}
 
-		eff_ident_type = iaddr->ident.type;
-	} else {
-		eff_ident_type = ident_type;
-	}
+		csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]);
 
-	switch (eff_ident_type) {
-	case ILA_ATYPE_IID:
-		/* Don't allow ILA for IID type */
-		return -EINVAL;
-	case ILA_ATYPE_LUID:
-		break;
-	case ILA_ATYPE_VIRT_V4:
-	case ILA_ATYPE_VIRT_UNI_V6:
-	case ILA_ATYPE_VIRT_MULTI_V6:
-	case ILA_ATYPE_NONLOCAL_ADDR:
-		/* These ILA formats are not supported yet. */
-	default:
-		return -EINVAL;
+		if (csum_mode == ILA_CSUM_NEUTRAL_MAP &&
+		    ila_csum_neutral_set(iaddr->ident)) {
+			/* Don't allow translation if checksum neutral bit is
+			 * configured and it's set in the SIR address.
+			 */
+			return -EINVAL;
+		}
 	}
 
 	if (tb[ILA_ATTR_HOOK_TYPE])
@@ -195,58 +278,62 @@ static int ila_build_state(struct net *net, struct nlattr *nla,
 
 	switch (hook_type) {
 	case ILA_HOOK_ROUTE_OUTPUT:
-		lwt_output = true;
-		break;
 	case ILA_HOOK_ROUTE_INPUT:
-		lwt_output = false;
 		break;
 	default:
 		return -EINVAL;
 	}
 
-	if (tb[ILA_ATTR_CSUM_MODE])
-		csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]);
-
-	if (csum_mode == ILA_CSUM_NEUTRAL_MAP &&
-	    ila_csum_neutral_set(iaddr->ident)) {
-		/* Don't allow translation if checksum neutral bit is
-		 * configured and it's set in the SIR address.
-		 */
-		return -EINVAL;
-	}
-
 	newts = lwtunnel_state_alloc(sizeof(*ilwt));
 	if (!newts)
 		return -ENOMEM;
 
 	ilwt = ila_lwt_lwtunnel(newts);
-	ret = dst_cache_init(&ilwt->dst_cache, GFP_ATOMIC);
-	if (ret) {
+
+	err = dst_cache_init(&ilwt->dst_cache, GFP_ATOMIC);
+	if (err) {
 		kfree(newts);
-		return ret;
+		return err;
 	}
 
-	ilwt->lwt_output = !!lwt_output;
+	newts->type = LWTUNNEL_ENCAP_ILA;
 
-	p = ila_params_lwtunnel(newts);
+	switch (hook_type) {
+	case ILA_HOOK_ROUTE_OUTPUT:
+		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+		break;
+	case ILA_HOOK_ROUTE_INPUT:
+		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+		break;
+	}
 
-	p->csum_mode = csum_mode;
-	p->ident_type = ident_type;
-	p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]);
+	ilwt->hook_type = hook_type;
 
-	/* Precompute checksum difference for translation since we
-	 * know both the old locator and the new one.
-	 */
-	p->locator_match = iaddr->loc;
+	if (tb[ILA_ATTR_NOTIFY_DST])
+		ilwt->notify |= ILA_NOTIFY_DST;
 
-	ila_init_saved_csum(p);
+	if (tb[ILA_ATTR_NOTIFY_SRC])
+		ilwt->notify |= ILA_NOTIFY_SRC;
 
-	newts->type = LWTUNNEL_ENCAP_ILA;
-	newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT |
-			LWTUNNEL_STATE_INPUT_REDIRECT;
+	p = ila_params_lwtunnel(newts);
 
-	if (cfg6->fc_dst_len == 8 * sizeof(struct in6_addr))
-		ilwt->connected = 1;
+	if (tb[ILA_ATTR_LOCATOR]) {
+		ilwt->xlat = true;
+		p->csum_mode = csum_mode;
+		p->ident_type = ident_type;
+		p->locator.v64 = (__force __be64)nla_get_u64(
+							tb[ILA_ATTR_LOCATOR]);
+
+		/* Precompute checksum difference for translation since we
+		 * know both the old locator and the new one.
+		 */
+		p->locator_match = iaddr->loc;
+
+		ila_init_saved_csum(p);
+
+		if (cfg6->fc_dst_len == 8 * sizeof(struct in6_addr))
+			ilwt->connected = 1;
+	}
 
 	*ts = newts;
 
@@ -264,21 +351,32 @@ static int ila_fill_encap_info(struct sk_buff *skb,
 	struct ila_params *p = ila_params_lwtunnel(lwtstate);
 	struct ila_lwt *ilwt = ila_lwt_lwtunnel(lwtstate);
 
-	if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64,
-			      ILA_ATTR_PAD))
+	if (ilwt->xlat) {
+		if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR,
+				      (__force u64)p->locator.v64,
+				      ILA_ATTR_PAD))
 		goto nla_put_failure;
 
-	if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode))
-		goto nla_put_failure;
+		if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE,
+			       (__force u8)p->csum_mode))
+			goto nla_put_failure;
 
-	if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type))
-		goto nla_put_failure;
+		if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE,
+			       (__force u8)p->ident_type))
+			goto nla_put_failure;
+	}
 
-	if (nla_put_u8(skb, ILA_ATTR_HOOK_TYPE,
-		       ilwt->lwt_output ? ILA_HOOK_ROUTE_OUTPUT :
-					  ILA_HOOK_ROUTE_INPUT))
+	if (nla_put_u8(skb, ILA_ATTR_HOOK_TYPE, ilwt->hook_type))
 		goto nla_put_failure;
 
+	if (ilwt->notify & ILA_NOTIFY_DST)
+		if (nla_put_flag(skb, ILA_ATTR_NOTIFY_DST))
+			goto nla_put_failure;
+
+	if (ilwt->notify & ILA_NOTIFY_SRC)
+		if (nla_put_flag(skb, ILA_ATTR_NOTIFY_SRC))
+			goto nla_put_failure;
+
 	return 0;
 
 nla_put_failure:
@@ -291,6 +389,8 @@ static int ila_encap_nlsize(struct lwtunnel_state *lwtstate)
 	       nla_total_size(sizeof(u8)) +        /* ILA_ATTR_CSUM_MODE */
 	       nla_total_size(sizeof(u8)) +        /* ILA_ATTR_IDENT_TYPE */
 	       nla_total_size(sizeof(u8)) +        /* ILA_ATTR_HOOK_TYPE */
+	       nla_total_size(0) +		   /* ILA_ATTR_NOTIFY_DST */
+	       nla_total_size(0) +		   /* ILA_ATTR_NOTIFY_SRC */
 	       0;
 }
 
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH] net: alteon: acenic: clean up indentation issue
From: David Miller @ 2017-12-15 18:28 UTC (permalink / raw)
  To: colin.king; +Cc: jes, linux-acenic, netdev, kernel-janitors, linux-kernel
In-Reply-To: <20171214114021.5300-1-colin.king@canonical.com>

From: Colin King <colin.king@canonical.com>
Date: Thu, 14 Dec 2017 11:40:21 +0000

> From: Colin Ian King <colin.king@canonical.com>
> 
> There is a hunk of code that is incorrectly indented with spaces
> rather than a tab.  Clean this up.
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Applied to net-next, thanks.

^ permalink raw reply

* Re: [PATCH net 0/3] net: sched: Make qdisc offload uapi uniform
From: David Miller @ 2017-12-15 18:36 UTC (permalink / raw)
  To: yuvalm; +Cc: netdev, mlxsw
In-Reply-To: <1513259671-1183-1-git-send-email-yuvalm@mellanox.com>

From: Yuval Mintz <yuvalm@mellanox.com>
Date: Thu, 14 Dec 2017 15:54:28 +0200

> Several qdiscs can already be offloaded to hardware, but there's an
> inconsistency in regard to the uapi through which they indicate such
> an offload is taking place - indication is passed to the user via
> TCA_OPTIONS where each qdisc retains private logic for setting it.
> 
> The recent addition of offloading to RED in
> 602f3baf2218 ("net_sch: red: Add offload ability to RED qdisc") caused
> the addition of yet another uapi field for this purpose -
> TC_RED_OFFLOADED.
> 
> For clarity and prevention of bloat in the uapi we want to eliminate
> said added uapi, replacing it with a common mechanism that can be used
> to reflect offload status of the various qdiscs.
> 
> The first patch introduces TCA_HW_OFFLOAD as the generic message meant
> for this purpose. The second changes the current RED implementation into
> setting the internal bits necessary for passing it, and the third removes
> TC_RED_OFFLOADED as it's no longer needed.
> 
> Dave,
> 
> A bit unorthodox as it's not a fix per-se, but it's the last chance
> for killing the unneeded uapi and replacing it with something better
> before getting stuck with it forever.

I agree, let's take care of this now while we can.

Series applied, thanks.

^ permalink raw reply

* [PATCH bpf-next] nfp: bpf: correct printk formats for size_t
From: Jakub Kicinski @ 2017-12-15 18:39 UTC (permalink / raw)
  To: netdev; +Cc: oss-drivers, alexei.starovoitov, daniel, Jakub Kicinski

Build bot reported warning about invalid printk formats on 32bit
architectures.  Use %zu for size_t and %zd for pointer differences.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/bpf/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index e76e637bfd25..c50a54bcca63 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -225,7 +225,7 @@ static int nfp_bpf_parse_capabilities(struct nfp_app *app)
 		}
 	}
 	if (mem - start != nfp_cpp_area_size(area)) {
-		nfp_err(cpp, "BPF capabilities left after parsing, parsed:%lu total length:%lu\n",
+		nfp_err(cpp, "BPF capabilities left after parsing, parsed:%zd total length:%zu\n",
 			mem - start, nfp_cpp_area_size(area));
 		goto err_release_free;
 	}
@@ -235,7 +235,7 @@ static int nfp_bpf_parse_capabilities(struct nfp_app *app)
 	return 0;
 
 err_release_free:
-	nfp_err(cpp, "invalid BPF capabilities at offset:%ld\n", mem - start);
+	nfp_err(cpp, "invalid BPF capabilities at offset:%zd\n", mem - start);
 	nfp_cpp_area_release_free(area);
 	return -EINVAL;
 }
-- 
2.15.1

^ permalink raw reply related

* Re: [PATCH net-next v3] ip6_vti: adjust vti mtu according to mtu of output device
From: David Miller @ 2017-12-15 18:45 UTC (permalink / raw)
  To: alexey.kodanev; +Cc: netdev, steffen.klassert, pvorel, shannon.nelson
In-Reply-To: <1513265870-29851-1-git-send-email-alexey.kodanev@oracle.com>

From: Alexey Kodanev <alexey.kodanev@oracle.com>
Date: Thu, 14 Dec 2017 18:37:50 +0300

Two minor pieces of feedback:

> LTP/udp6_ipsec_vti tests fail when sending large UDP datagrams that
> require fragmentation and the underlying device has MTU <= 1500. This
> happens because ip6_vti sets mtu to ETH_DATA_LEN and not updating it
> depending on a destination address or link parameter.
> 
> Further attempts to send UDP packets may succeed because pmtu gets
> updated on ICMPV6_PKT_TOOBIG in vti6_err().
> 
> Here is the example when the output device MTU is set to 9000:

You are fixing a problem that occurs when the underlying device has
an MTU smaller than 1500, yet you show an example involving an MTU
of 9000.

Care to adjust that inconsistency or explain why it's legit here?

> +	if (p->flags & IP6_TNL_F_CAP_XMIT) {
> +		int strict = (ipv6_addr_type(&p->raddr) &
> +			      (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
> +
> +		struct rt6_info *rt = rt6_lookup(t->net,
> +						 &p->raddr, &p->laddr,
> +						 p->link, strict);

Please do not place empty lines between local variable declarations.

Thank you.

^ permalink raw reply

* Re: [PATCH 1/1] net: usb: qmi_wwan: add Telit ME910 PID 0x1101 support
From: David Miller @ 2017-12-15 18:47 UTC (permalink / raw)
  To: dnlplm; +Cc: bjorn, netdev
In-Reply-To: <1513266974-15516-1-git-send-email-dnlplm@gmail.com>

From: Daniele Palmas <dnlplm@gmail.com>
Date: Thu, 14 Dec 2017 16:56:14 +0100

> This patch adds support for Telit ME910 PID 0x1101.
> 
> Signed-off-by: Daniele Palmas <dnlplm@gmail.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH] net/tls: Fix inverted error codes to avoid endless loop
From: David Miller @ 2017-12-15 18:49 UTC (permalink / raw)
  To: r.hering; +Cc: netdev
In-Reply-To: <OFB355F51D.12739AB9-ONC12581F6.0067C72B-C12581F6.0067EF4E@avm.de>

From: r.hering@avm.de
Date: Thu, 14 Dec 2017 19:55:14 +0100

> sendfile() calls can hang endlessly when using Kernel TLS if a socket error
> occurs.
> Socket error codes must be inverted by Kernel TLS before returning because
> they are stored with positive sign. If returned non-inverted they are
> interpreted as number of bytes sent, causing endless looping of the
> splice mechanic behind sendfile().
> 
> Signed-off-by: Robert Hering <r.hering@avm.de>

Your patch is corrupted again, exactly the same like last time.

I asked you politely to send a test patch to yourself, and make sure you
could apply the patch cleanly.

Because TABs have been corrupted into spaces, exactly like last time,
I cannot see how you could have possibly successfully done such a test
before posting here again.

Please fix this properly, get your email client sending patches without
modifying the text, and only then resubmit this patch.

Thank you.

^ permalink raw reply

* Re: [PATCH net-next] qmi_wwan: set FLAG_SEND_ZLP to avoid network initiated disconnect
From: David Miller @ 2017-12-15 18:50 UTC (permalink / raw)
  To: bjorn-yOkvZcmFvRU
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA, linux-usb-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20171214185550.15779-1-bjorn-yOkvZcmFvRU@public.gmane.org>

From: Bjørn Mork <bjorn-yOkvZcmFvRU@public.gmane.org>
Date: Thu, 14 Dec 2017 19:55:50 +0100

> It has been reported that the dummy byte we add to avoid
> ZLPs can be forwarded by the modem to the PGW/GGSN, and that
> some operators will drop the connection if this happens.
> 
> In theory, QMI devices are based on CDC ECM and should as such
> both support ZLPs and silently ignore the dummy byte.  The latter
> assumption failed.  Let's test out the first.
> 
> Signed-off-by: Bjørn Mork <bjorn-yOkvZcmFvRU@public.gmane.org>
> ---
> I am a bit worried about the effect of this change on all the
> devices I can't test myself. But trying it is the only way we
> can ever find out....

:-)  Applied to net-next, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [net-next:master 355/378] drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c:236:16: sparse: restricted __le16 degrades to integer
From: kbuild test robot @ 2017-12-15 18:50 UTC (permalink / raw)
  To: Salil Mehta; +Cc: kbuild-all, netdev, lipeng

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git master
head:   82f67bc6beb27120e8ccd1de8bd704edf3f79813
commit: dde1a86e93cadf9b17ec0a95a78c99505c48fd83 [355/378] net: hns3: Add mailbox support to PF driver
reproduce:
        # apt-get install sparse
        git checkout dde1a86e93cadf9b17ec0a95a78c99505c48fd83
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)


vim +236 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mbx.c

   226	
   227	void hclge_mbx_handler(struct hclge_dev *hdev)
   228	{
   229		struct hclge_cmq_ring *crq = &hdev->hw.cmq.crq;
   230		struct hclge_mbx_vf_to_pf_cmd *req;
   231		struct hclge_vport *vport;
   232		struct hclge_desc *desc;
   233		int ret;
   234	
   235		/* handle all the mailbox requests in the queue */
 > 236		while (hnae_get_bit(crq->desc[crq->next_to_use].flag,

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

^ permalink raw reply

* Re: [PATCH net-next] net: phy: phylink: Handle NULL fwnode_handle
From: David Miller @ 2017-12-15 18:51 UTC (permalink / raw)
  To: f.fainelli; +Cc: netdev, rmk+kernel, andrew, linux-kernel
In-Reply-To: <20171214235758.26122-1-f.fainelli@gmail.com>

From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 14 Dec 2017 15:57:58 -0800

> Unlike the various of_* routines to fetch properties, fwnode_* routines can
> have an early check against a NULL fwnode_handle reference which makes them
> return -EINVAL (see fwnode_call_int_op), thus making it virtually impossible to
> differentiate what type of error is going on.
> 
> Have an early check in phylink_register_sfp() so we can keep proceeding with
> the initialization, there is not much we can do without a valid fwnode_handle
> except return early and treat this similarly to -ENOENT.
> 
> Fixes: 8fa7b9b6af25 ("phylink: convert to fwnode")
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>

Applied.

^ permalink raw reply

* Re: [PATCHv2 net-next 0/8] sctp: Implement Stream Interleave: Interaction with Other SCTP Extensions
From: David Miller @ 2017-12-15 18:53 UTC (permalink / raw)
  To: lucien.xin; +Cc: netdev, linux-sctp, marcelo.leitner, nhorman
In-Reply-To: <cover.1513269224.git.lucien.xin@gmail.com>

From: Xin Long <lucien.xin@gmail.com>
Date: Fri, 15 Dec 2017 00:41:24 +0800

> Stream Interleave would be implemented in two Parts:
> 
>    1. The I-DATA Chunk Supporting User Message Interleaving
>    2. Interaction with Other SCTP Extensions
> 
> Overview in section 2.3 of RFC8260 for Part 2:
> 
>    The usage of the I-DATA chunk might interfere with other SCTP
>    extensions.  Future SCTP extensions MUST describe if and how they
>    interfere with the usage of I-DATA chunks.  For the SCTP extensions
>    already defined when this document was published, the details are
>    given in the following subsections.
> 
> As the 2nd part of Stream Interleave Implementation, this patchset mostly
> adds the support for SCTP Partial Reliability Extension with I-FORWARD-TSN
> chunk. Then adjusts stream scheduler and stream reconfig to make them work
> properly with I-DATA chunks.
> 
> In the last patch, all stream interleave codes will be enabled by adding
> sysctl to allow users to use this feature.
> 
> v1 -> v2:
>   - removed the intl_enable check from sctp_chunk_event_lookup, as Marcelo's
>     suggestion.
>   - fixed a typo in changelog.

Series applied, thanks.

^ permalink raw reply

* Re: v4.15-rc2 on thinkpad x60: ethernet stopped working
From: Pavel Machek @ 2017-12-15 19:04 UTC (permalink / raw)
  To: Keller, Jacob E
  Cc: Gabriel C, kernel list, netdev@vger.kernel.org,
	intel-wired-lan@lists.osuosl.org
In-Reply-To: <02874ECE860811409154E81DA85FBB5882B5FB49@ORSMSX115.amr.corp.intel.com>

[-- Attachment #1: Type: text/plain, Size: 1169 bytes --]

Hi!

> > > > Any ideas ?
> > >
> > > Yes , 19110cfbb34d4af0cdfe14cd243f3b09dc95b013 broke it.
> > >
> > > See:
> > > https://bugzilla.kernel.org/show_bug.cgi?id=198047
> > >
> > > Fix there :
> > > https://marc.info/?l=linux-kernel&m=151272209903675&w=2
> > >
> > > Regards,
> > >
> > > Gabriel C
> > 
> > Hi,
> > 
> > Digging into this, the problem is complicated. The original bug assumed behavior
> > of the .check_for_link call, which is universally not implemented.
> > 
> > I think the correct fix is to revert 19110cfbb34d ("e1000e: Separate signaling for
> > link check/link up", 2017-10-10) and find a more proper solution.
> > 
> > I don't think any other code which uses check_for_link expects the interface to
> > return in the way this patch attempted.

> Alternatively, we can go a step farther and make sure every implementation of .check_for_link follows the modified interface.
> 

We are at -rc3 now. Reverting 19110cfbb34d seems like a good solution at
the moment.
									Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 181 bytes --]

^ permalink raw reply

* Re: [PATCH] ip_gre: fix wrong return value of erspan_rcv
From: David Miller @ 2017-12-15 19:10 UTC (permalink / raw)
  To: yanhaishuang; +Cc: kuznet, yoshfuji, netdev, linux-kernel, u9012063
In-Reply-To: <1513305976-20707-1-git-send-email-yanhaishuang@cmss.chinamobile.com>

From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Fri, 15 Dec 2017 10:46:16 +0800

> If pskb_may_pull() fails, return PACKET_REJECT instead of -ENOMEM.
> 
> Fixes: 84e54fe0a5ea ("gre: introduce native tunnel support for ERSPAN")
> Cc: William Tu <u9012063@gmail.com>
> Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>

Applied.

^ permalink raw reply

* Re: [PATCH] ip6_gre: fix a pontential issue in ip6erspan_rcv
From: David Miller @ 2017-12-15 19:11 UTC (permalink / raw)
  To: yanhaishuang; +Cc: kuznet, yoshfuji, netdev, linux-kernel, u9012063
In-Reply-To: <1513305998-20750-1-git-send-email-yanhaishuang@cmss.chinamobile.com>

From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
Date: Fri, 15 Dec 2017 10:46:38 +0800

> pskb_may_pull() can change skb->data, so we need to load ipv6h/ershdr at
> the right place.
> 
> Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support")
> Cc: William Tu <u9012063@gmail.com>
> Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>

The mentioned commit ID only exists in net-next, and this patch does not apply
cleanly there.

^ permalink raw reply

* [PATCH v10 2/5] btrfs: make open_ctree error injectable
From: Josef Bacik @ 2017-12-15 19:12 UTC (permalink / raw)
  To: rostedt, mingo, davem, netdev, linux-kernel, ast, kernel-team,
	daniel, linux-btrfs, darrick.wong, mhiramat
  Cc: Josef Bacik
In-Reply-To: <1513365176-6744-1-git-send-email-josef@toxicpanda.com>

From: Josef Bacik <jbacik@fb.com>

This allows us to do error injection with BPF for open_ctree.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
---
 fs/btrfs/disk-io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 10a2a579cc7f..02b5f5667754 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -30,6 +30,7 @@
 #include <linux/ratelimit.h>
 #include <linux/uuid.h>
 #include <linux/semaphore.h>
+#include <linux/bpf.h>
 #include <asm/unaligned.h>
 #include "ctree.h"
 #include "disk-io.h"
@@ -3123,6 +3124,7 @@ int open_ctree(struct super_block *sb,
 		goto fail_block_groups;
 	goto retry_root_backup;
 }
+BPF_ALLOW_ERROR_INJECTION(open_ctree);
 
 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-- 
2.7.5

^ permalink raw reply related

* [PATCH v10 3/5] bpf: add a bpf_override_function helper
From: Josef Bacik @ 2017-12-15 19:12 UTC (permalink / raw)
  To: rostedt, mingo, davem, netdev, linux-kernel, ast, kernel-team,
	daniel, linux-btrfs, darrick.wong, mhiramat
  Cc: Josef Bacik
In-Reply-To: <1513365176-6744-1-git-send-email-josef@toxicpanda.com>

From: Josef Bacik <jbacik@fb.com>

Error injection is sloppy and very ad-hoc.  BPF could fill this niche
perfectly with its kprobe functionality.  We could make sure errors are
only triggered in specific call chains that we care about with very
specific situations.  Accomplish this with the bpf_override_return
helper.  This will modify the probed caller's return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function.  This gives us a nice
clean way to implement systematic error injection for all of our code
paths.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 arch/Kconfig                     |  3 ++
 arch/x86/Kconfig                 |  1 +
 arch/x86/include/asm/kprobes.h   |  4 +++
 arch/x86/include/asm/ptrace.h    |  5 ++++
 arch/x86/kernel/kprobes/ftrace.c | 14 +++++++++
 include/linux/filter.h           |  3 +-
 include/linux/trace_events.h     |  1 +
 include/uapi/linux/bpf.h         |  7 ++++-
 kernel/bpf/core.c                |  3 ++
 kernel/bpf/verifier.c            |  2 ++
 kernel/events/core.c             |  7 +++++
 kernel/trace/Kconfig             | 11 +++++++
 kernel/trace/bpf_trace.c         | 38 ++++++++++++++++++++++++
 kernel/trace/trace_kprobe.c      | 64 +++++++++++++++++++++++++++++++++++-----
 kernel/trace/trace_probe.h       | 12 ++++++++
 15 files changed, 165 insertions(+), 10 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 400b9e1b2f27..d3f4aaf9cb7a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -196,6 +196,9 @@ config HAVE_OPTPROBES
 config HAVE_KPROBES_ON_FTRACE
 	bool
 
+config HAVE_KPROBE_OVERRIDE
+	bool
+
 config HAVE_NMI
 	bool
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 8eed3f94bfc7..04d66e6fa447 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -154,6 +154,7 @@ config X86
 	select HAVE_KERNEL_XZ
 	select HAVE_KPROBES
 	select HAVE_KPROBES_ON_FTRACE
+	select HAVE_KPROBE_OVERRIDE
 	select HAVE_KRETPROBES
 	select HAVE_KVM
 	select HAVE_LIVEPATCH			if X86_64
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 9f2e3102e0bb..36abb23a7a35 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size;
 void arch_remove_kprobe(struct kprobe *p);
 asmlinkage void kretprobe_trampoline(void);
 
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs);
+#endif
+
 /* Architecture specific copy of original instruction*/
 struct arch_specific_insn {
 	/* copy of the original instruction */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 14131dd06b29..6de1fd3d0097 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 	return regs->ax;
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+	regs->ax = rc;
+}
+
 /*
  * user_mode(regs) determines whether a register set came from user
  * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c
index 8dc0161cec8f..1ea748d682fd 100644
--- a/arch/x86/kernel/kprobes/ftrace.c
+++ b/arch/x86/kernel/kprobes/ftrace.c
@@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p)
 	p->ainsn.boostable = false;
 	return 0;
 }
+
+asmlinkage void override_func(void);
+asm(
+	".type override_func, @function\n"
+	"override_func:\n"
+	"	ret\n"
+	".size override_func, .-override_func\n"
+);
+
+void arch_ftrace_kprobe_override_function(struct pt_regs *regs)
+{
+	regs->ip = (unsigned long)&override_func;
+}
+NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function);
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 0062302e1285..5feb441d3dd9 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -458,7 +458,8 @@ struct bpf_prog {
 				locked:1,	/* Program image locked? */
 				gpl_compatible:1, /* Is filter GPL compatible? */
 				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1;	/* Do we need dst entry? */
+				dst_needed:1,	/* Do we need dst entry? */
+				kprobe_override:1; /* Do we override a kprobe? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	u32			len;		/* Number of filter blocks */
 	u32			jited_len;	/* Size of jited insns in bytes */
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index af44e7c2d577..5fea451f6e28 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -528,6 +528,7 @@ do {									\
 struct perf_event;
 
 DECLARE_PER_CPU(struct pt_regs, perf_trace_regs);
+DECLARE_PER_CPU(int, bpf_kprobe_override);
 
 extern int  perf_trace_init(struct perf_event *event);
 extern void perf_trace_destroy(struct perf_event *event);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 80d62e88590c..595bda120cfb 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -677,6 +677,10 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ *	@pt_regs: pointer to struct pt_regs
+ *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -736,7 +740,8 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),
+	FN(getsockopt),			\
+	FN(override_return),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index b9f8686a84cf..fc5a8ab4239a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1320,6 +1320,9 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 bool bpf_prog_array_compatible(struct bpf_array *array,
 			       const struct bpf_prog *fp)
 {
+	if (fp->kprobe_override)
+		return false;
+
 	if (!array->owner_prog_type) {
 		/* There's no owner yet where we could check for
 		 * compatibility.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 7afa92e9b409..e807bda7fe29 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4413,6 +4413,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			prog->dst_needed = 1;
 		if (insn->imm == BPF_FUNC_get_prandom_u32)
 			bpf_user_rnd_init_once();
+		if (insn->imm == BPF_FUNC_override_return)
+			prog->kprobe_override = 1;
 		if (insn->imm == BPF_FUNC_tail_call) {
 			/* If we tail call into other programs, we
 			 * cannot make any assumptions since they can
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 16beab4767e1..6e3862bbe9c2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8077,6 +8077,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 		return -EINVAL;
 	}
 
+	/* Kprobe override only works for kprobes, not uprobes. */
+	if (prog->kprobe_override &&
+	    !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
 	if (is_tracepoint || is_syscall_tp) {
 		int off = trace_event_get_offsets(event->tp_event);
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index af7dad126c13..3e6fd580fe7f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -529,6 +529,17 @@ config FUNCTION_PROFILER
 
 	  If in doubt, say N.
 
+config BPF_KPROBE_OVERRIDE
+	bool "Enable BPF programs to override a kprobed function"
+	depends on BPF_EVENTS
+	depends on KPROBES_ON_FTRACE
+	depends on HAVE_KPROBE_OVERRIDE
+	depends on DYNAMIC_FTRACE_WITH_REGS
+	default n
+	help
+	 Allows BPF to override the execution of a probed function and
+	 set a different return value.  This is used for error injection.
+
 config FTRACE_MCOUNT_RECORD
 	def_bool y
 	depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 27d1f4ffa3de..e4bfdbc5a905 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -13,6 +13,10 @@
 #include <linux/filter.h>
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
+#include <linux/kprobes.h>
+#include <asm/kprobes.h>
+
+#include "trace_probe.h"
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
@@ -76,6 +80,29 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 }
 EXPORT_SYMBOL_GPL(trace_call_bpf);
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
+{
+	__this_cpu_write(bpf_kprobe_override, 1);
+	regs_set_return_value(regs, rc);
+	arch_ftrace_kprobe_override_function(regs);
+	return 0;
+}
+#else
+BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
+{
+	return -EINVAL;
+}
+#endif
+
+static const struct bpf_func_proto bpf_override_return_proto = {
+	.func		= bpf_override_return,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
 {
 	int ret;
@@ -551,6 +578,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_stackid_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
+	case BPF_FUNC_override_return:
+		return &bpf_override_return_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
@@ -766,6 +795,15 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 	struct bpf_prog_array *new_array;
 	int ret = -EEXIST;
 
+	/*
+	 * Kprobe override only works for ftrace based kprobes, and only if they
+	 * are on the opt-in list.
+	 */
+	if (prog->kprobe_override &&
+	    (!trace_kprobe_ftrace(event->tp_event) ||
+	     !trace_kprobe_error_injectable(event->tp_event)))
+		return -EINVAL;
+
 	mutex_lock(&bpf_event_mutex);
 
 	if (event->prog)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 492700c5fb4d..91f4b57dab82 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -42,6 +42,7 @@ struct trace_kprobe {
 	(offsetof(struct trace_kprobe, tp.args) +	\
 	(sizeof(struct probe_arg) * (n)))
 
+DEFINE_PER_CPU(int, bpf_kprobe_override);
 
 static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
 {
@@ -87,6 +88,27 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
 	return nhit;
 }
 
+int trace_kprobe_ftrace(struct trace_event_call *call)
+{
+	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+	return kprobe_ftrace(&tk->rp.kp);
+}
+
+int trace_kprobe_error_injectable(struct trace_event_call *call)
+{
+	struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
+	unsigned long addr;
+
+	if (tk->symbol) {
+		addr = (unsigned long)
+			kallsyms_lookup_name(trace_kprobe_symbol(tk));
+		addr += tk->rp.kp.offset;
+	} else {
+		addr = (unsigned long)tk->rp.kp.addr;
+	}
+	return within_kprobe_error_injection_list(addr);
+}
+
 static int register_kprobe_event(struct trace_kprobe *tk);
 static int unregister_kprobe_event(struct trace_kprobe *tk);
 
@@ -1170,7 +1192,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call)
 #ifdef CONFIG_PERF_EVENTS
 
 /* Kprobe profile handler */
-static void
+static int
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
@@ -1179,12 +1201,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	int size, __size, dsize;
 	int rctx;
 
-	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
-		return;
+	if (bpf_prog_array_valid(call)) {
+		int ret;
+
+		ret = trace_call_bpf(call, regs);
+
+		/*
+		 * We need to check and see if we modified the pc of the
+		 * pt_regs, and if so clear the kprobe and return 1 so that we
+		 * don't do the instruction skipping.  Also reset our state so
+		 * we are clean the next pass through.
+		 */
+		if (__this_cpu_read(bpf_kprobe_override)) {
+			__this_cpu_write(bpf_kprobe_override, 0);
+			reset_current_kprobe();
+			return 1;
+		}
+		if (!ret)
+			return 0;
+	}
 
 	head = this_cpu_ptr(call->perf_events);
 	if (hlist_empty(head))
-		return;
+		return 0;
 
 	dsize = __get_data_size(&tk->tp, regs);
 	__size = sizeof(*entry) + tk->tp.size + dsize;
@@ -1193,13 +1232,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 
 	entry = perf_trace_buf_alloc(size, NULL, &rctx);
 	if (!entry)
-		return;
+		return 0;
 
 	entry->ip = (unsigned long)tk->rp.kp.addr;
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
 			      head, NULL);
+	return 0;
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1275,16 +1315,24 @@ static int kprobe_register(struct trace_event_call *event,
 static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
 {
 	struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
+	int ret = 0;
 
 	raw_cpu_inc(*tk->nhit);
 
 	if (tk->tp.flags & TP_FLAG_TRACE)
 		kprobe_trace_func(tk, regs);
 #ifdef CONFIG_PERF_EVENTS
-	if (tk->tp.flags & TP_FLAG_PROFILE)
-		kprobe_perf_func(tk, regs);
+	if (tk->tp.flags & TP_FLAG_PROFILE) {
+		ret = kprobe_perf_func(tk, regs);
+		/*
+		 * The ftrace kprobe handler leaves it up to us to re-enable
+		 * preemption here before returning if we've modified the ip.
+		 */
+		if (ret)
+			preempt_enable_no_resched();
+	}
 #endif
-	return 0;	/* We don't tweek kernel, so just return 0 */
+	return ret;
 }
 NOKPROBE_SYMBOL(kprobe_dispatcher);
 
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index fb66e3eaa192..5e54d748c84c 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -252,6 +252,8 @@ struct symbol_cache;
 unsigned long update_symbol_cache(struct symbol_cache *sc);
 void free_symbol_cache(struct symbol_cache *sc);
 struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
+int trace_kprobe_ftrace(struct trace_event_call *call);
+int trace_kprobe_error_injectable(struct trace_event_call *call);
 #else
 /* uprobes do not support symbol fetch methods */
 #define fetch_symbol_u8			NULL
@@ -277,6 +279,16 @@ alloc_symbol_cache(const char *sym, long offset)
 {
 	return NULL;
 }
+
+static inline int trace_kprobe_ftrace(struct trace_event_call *call)
+{
+	return 0;
+}
+
+static inline int trace_kprobe_error_injectable(struct trace_event_call *call)
+{
+	return 0;
+}
 #endif /* CONFIG_KPROBE_EVENTS */
 
 struct probe_arg {
-- 
2.7.5


^ permalink raw reply related

* [PATCH v10 4/5] samples/bpf: add a test for bpf_override_return
From: Josef Bacik @ 2017-12-15 19:12 UTC (permalink / raw)
  To: rostedt, mingo, davem, netdev, linux-kernel, ast, kernel-team,
	daniel, linux-btrfs, darrick.wong, mhiramat
  Cc: Josef Bacik
In-Reply-To: <1513365176-6744-1-git-send-email-josef@toxicpanda.com>

From: Josef Bacik <jbacik@fb.com>

This adds a basic test for bpf_override_return to verify it works.  We
override the main function for mounting a btrfs fs so it'll return
-ENOMEM and then make sure that trying to mount a btrfs fs will fail.

Acked-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Josef Bacik <jbacik@fb.com>
---
 samples/bpf/Makefile                      |  4 ++++
 samples/bpf/test_override_return.sh       | 15 +++++++++++++++
 samples/bpf/tracex7_kern.c                | 16 ++++++++++++++++
 samples/bpf/tracex7_user.c                | 28 ++++++++++++++++++++++++++++
 tools/include/uapi/linux/bpf.h            |  7 ++++++-
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 6 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100755 samples/bpf/test_override_return.sh
 create mode 100644 samples/bpf/tracex7_kern.c
 create mode 100644 samples/bpf/tracex7_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index adeaa1302f34..4fb944a7ecf8 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -12,6 +12,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
 hostprogs-y += lathist
@@ -58,6 +59,7 @@ tracex3-objs := bpf_load.o $(LIBBPF) tracex3_user.o
 tracex4-objs := bpf_load.o $(LIBBPF) tracex4_user.o
 tracex5-objs := bpf_load.o $(LIBBPF) tracex5_user.o
 tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
+tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
@@ -101,6 +103,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
@@ -155,6 +158,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_tracex7 += -lelf
 HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
diff --git a/samples/bpf/test_override_return.sh b/samples/bpf/test_override_return.sh
new file mode 100755
index 000000000000..e68b9ee6814b
--- /dev/null
+++ b/samples/bpf/test_override_return.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+rm -f testfile.img
+dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1
+DEVICE=$(losetup --show -f testfile.img)
+mkfs.btrfs -f $DEVICE
+mkdir tmpmnt
+./tracex7 $DEVICE
+if [ $? -eq 0 ]
+then
+	echo "SUCCESS!"
+else
+	echo "FAILED!"
+fi
+losetup -d $DEVICE
diff --git a/samples/bpf/tracex7_kern.c b/samples/bpf/tracex7_kern.c
new file mode 100644
index 000000000000..1ab308a43e0f
--- /dev/null
+++ b/samples/bpf/tracex7_kern.c
@@ -0,0 +1,16 @@
+#include <uapi/linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/open_ctree")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	unsigned long rc = -12;
+
+	bpf_override_return(ctx, rc);
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/tracex7_user.c b/samples/bpf/tracex7_user.c
new file mode 100644
index 000000000000..8a52ac492e8b
--- /dev/null
+++ b/samples/bpf/tracex7_user.c
@@ -0,0 +1,28 @@
+#define _GNU_SOURCE
+
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int argc, char **argv)
+{
+	FILE *f;
+	char filename[256];
+	char command[256];
+	int ret;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	snprintf(command, 256, "mount %s tmpmnt/", argv[1]);
+	f = popen(command, "r");
+	ret = pclose(f);
+
+	return ret ? 0 : 1;
+}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4c223ab30293..cf446c25c0ec 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -677,6 +677,10 @@ union bpf_attr {
  *     @buf: buf to fill
  *     @buf_size: size of the buf
  *     Return : 0 on success or negative error code
+ *
+ * int bpf_override_return(pt_regs, rc)
+ *	@pt_regs: pointer to struct pt_regs
+ *	@rc: the return value to set
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -736,7 +740,8 @@ union bpf_attr {
 	FN(xdp_adjust_meta),		\
 	FN(perf_event_read_value),	\
 	FN(perf_prog_read_value),	\
-	FN(getsockopt),
+	FN(getsockopt),			\
+	FN(override_return),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index fd9a17fa8a8b..33cb00e46c49 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -82,7 +82,8 @@ static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags,
 static int (*bpf_perf_prog_read_value)(void *ctx, void *buf,
 				       unsigned int buf_size) =
 	(void *) BPF_FUNC_perf_prog_read_value;
-
+static int (*bpf_override_return)(void *ctx, unsigned long rc) =
+	(void *) BPF_FUNC_override_return;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
2.7.5

^ permalink raw reply related

* [PATCH v10 5/5] btrfs: allow us to inject errors at io_ctl_init
From: Josef Bacik @ 2017-12-15 19:12 UTC (permalink / raw)
  To: rostedt, mingo, davem, netdev, linux-kernel, ast, kernel-team,
	daniel, linux-btrfs, darrick.wong, mhiramat
  Cc: Josef Bacik
In-Reply-To: <1513365176-6744-1-git-send-email-josef@toxicpanda.com>

From: Josef Bacik <jbacik@fb.com>

This was instrumental in reproducing a space cache bug.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
---
 fs/btrfs/free-space-cache.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 4426d1c73e50..fb1382893bfc 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -22,6 +22,7 @@
 #include <linux/slab.h>
 #include <linux/math64.h>
 #include <linux/ratelimit.h>
+#include <linux/bpf.h>
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
@@ -332,6 +333,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
 
 	return 0;
 }
+BPF_ALLOW_ERROR_INJECTION(io_ctl_init);
 
 static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
 {
-- 
2.7.5

^ permalink raw reply related

* [PATCH v10 0/5] Add the ability to do BPF directed error injection
From: Josef Bacik @ 2017-12-15 19:12 UTC (permalink / raw)
  To: rostedt, mingo, davem, netdev, linux-kernel, ast, kernel-team,
	daniel, linux-btrfs, darrick.wong, mhiramat

Just one last go around I hope, fixed the preemption thing that Darrick
reported.

v9->v10:
- the kprobe dispatcher now requires us to re-enable preemption if we change the
  ip ourselves, so do that.

v8->v9:
- rebased onto the bpf tree.

v7->v8:
- removed the _ASM_KPROBE_ERROR_INJECT since it was not needed.

v6->v7:
- moved the opt-in macro to bpf.h out of kprobes.h.

v5->v6:
- add BPF_ALLOW_ERROR_INJECTION() tagging for functions that will support this
  feature.  This way only functions that opt-in will be allowed to be
  overridden.
- added a btrfs patch to allow error injection for open_ctree() so that the bpf
  sample actually works.

v4->v5:
- disallow kprobe_override programs from being put in the prog map array so we
  don't tail call into something we didn't check.  This allows us to make the
  normal path still fast without a bunch of percpu operations.

v3->v4:
- fix a build error found by kbuild test bot (I didn't wait long enough
  apparently.)
- Added a warning message as per Daniel's suggestion.

v2->v3:
- added a ->kprobe_override flag to bpf_prog.
- added some sanity checks to disallow attaching bpf progs that have
  ->kprobe_override set that aren't for ftrace kprobes.
- added the trace_kprobe_ftrace helper to check if the trace_event_call is a
  ftrace kprobe.
- renamed bpf_kprobe_state to bpf_kprobe_override, fixed it so we only read this
  value in the kprobe path, and thus only write to it if we're overriding or
  clearing the override.

v1->v2:
- moved things around to make sure that bpf_override_return could really only be
  used for an ftrace kprobe.
- killed the special return values from trace_call_bpf.
- renamed pc_modified to bpf_kprobe_state so bpf_override_return could tell if
  it was being called from an ftrace kprobe context.
- reworked the logic in kprobe_perf_func to take advantage of bpf_kprobe_state.
- updated the test as per Alexei's review.

- Original message -

A lot of our error paths are not well tested because we have no good way of
injecting errors generically.  Some subsystems (block, memory) have ways to
inject errors, but they are random so it's hard to get reproducible results.

With BPF we can add determinism to our error injection.  We can use kprobes and
other things to verify we are injecting errors at the exact case we are trying
to test.  This patch gives us the tool to actually do the error injection part.
It is very simple, we just set the return value of the pt_regs we're given to
whatever we provide, and then override the PC with a dummy function that simply
returns.

Right now this only works on x86, but it would be simple enough to expand to
other architectures.  Thanks,

Josef

^ permalink raw reply

* [PATCH v10 1/5] add infrastructure for tagging functions as error injectable
From: Josef Bacik @ 2017-12-15 19:12 UTC (permalink / raw)
  To: rostedt, mingo, davem, netdev, linux-kernel, ast, kernel-team,
	daniel, linux-btrfs, darrick.wong, mhiramat
  Cc: Josef Bacik
In-Reply-To: <1513365176-6744-1-git-send-email-josef@toxicpanda.com>

From: Josef Bacik <jbacik@fb.com>

Using BPF we can override kprobed functions and return arbitrary
values.  Obviously this can be a bit unsafe, so make this feature opt-in
for functions.  Simply tag a function with BPF_ALLOW_ERROR_INJECTION() in
order to give BPF access to that function for error injection purposes.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
---
 include/asm-generic/vmlinux.lds.h |  10 +++
 include/linux/bpf.h               |  11 +++
 include/linux/kprobes.h           |   1 +
 include/linux/module.h            |   5 ++
 kernel/kprobes.c                  | 163 ++++++++++++++++++++++++++++++++++++++
 kernel/module.c                   |   6 +-
 6 files changed, 195 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index ee8b707d9fa9..a2e8582d094a 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -136,6 +136,15 @@
 #define KPROBE_BLACKLIST()
 #endif
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define ERROR_INJECT_LIST()	. = ALIGN(8);						\
+				VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .;	\
+				KEEP(*(_kprobe_error_inject_list))			\
+				VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) = .;
+#else
+#define ERROR_INJECT_LIST()
+#endif
+
 #ifdef CONFIG_EVENT_TRACING
 #define FTRACE_EVENTS()	. = ALIGN(8);					\
 			VMLINUX_SYMBOL(__start_ftrace_events) = .;	\
@@ -564,6 +573,7 @@
 	FTRACE_EVENTS()							\
 	TRACE_SYSCALLS()						\
 	KPROBE_BLACKLIST()						\
+	ERROR_INJECT_LIST()						\
 	MEM_DISCARD(init.rodata)					\
 	CLK_OF_TABLES()							\
 	RESERVEDMEM_OF_TABLES()						\
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e55e4255a210..7f4d2a953173 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -576,4 +576,15 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto;
 void bpf_user_rnd_init_once(void);
 u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
+#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+#define BPF_ALLOW_ERROR_INJECTION(fname)				\
+static unsigned long __used						\
+	__attribute__((__section__("_kprobe_error_inject_list")))	\
+	_eil_addr_##fname = (unsigned long)fname;
+#else
+#define BPF_ALLOW_ERROR_INJECTION(fname)
+#endif
+#endif
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 9440a2fc8893..963fd364f3d6 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -271,6 +271,7 @@ extern bool arch_kprobe_on_func_entry(unsigned long offset);
 extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset);
 
 extern bool within_kprobe_blacklist(unsigned long addr);
+extern bool within_kprobe_error_injection_list(unsigned long addr);
 
 struct kprobe_insn_cache {
 	struct mutex mutex;
diff --git a/include/linux/module.h b/include/linux/module.h
index c69b49abe877..548fa09fa806 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -475,6 +475,11 @@ struct module {
 	ctor_fn_t *ctors;
 	unsigned int num_ctors;
 #endif
+
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+	unsigned int num_kprobe_ei_funcs;
+	unsigned long *kprobe_ei_funcs;
+#endif
 } ____cacheline_aligned __randomize_layout;
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index da2ccf142358..b4aab48ad258 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -83,6 +83,16 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
 	return &(kretprobe_table_locks[hash].lock);
 }
 
+/* List of symbols that can be overridden for error injection. */
+static LIST_HEAD(kprobe_error_injection_list);
+static DEFINE_MUTEX(kprobe_ei_mutex);
+struct kprobe_ei_entry {
+	struct list_head list;
+	unsigned long start_addr;
+	unsigned long end_addr;
+	void *priv;
+};
+
 /* Blacklist -- list of struct kprobe_blacklist_entry */
 static LIST_HEAD(kprobe_blacklist);
 
@@ -1394,6 +1404,17 @@ bool within_kprobe_blacklist(unsigned long addr)
 	return false;
 }
 
+bool within_kprobe_error_injection_list(unsigned long addr)
+{
+	struct kprobe_ei_entry *ent;
+
+	list_for_each_entry(ent, &kprobe_error_injection_list, list) {
+		if (addr >= ent->start_addr && addr < ent->end_addr)
+			return true;
+	}
+	return false;
+}
+
 /*
  * If we have a symbol_name argument, look it up and add the offset field
  * to it. This way, we can specify a relative address to a symbol.
@@ -2168,6 +2189,86 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
 	return 0;
 }
 
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+/* Markers of the _kprobe_error_inject_list section */
+extern unsigned long __start_kprobe_error_inject_list[];
+extern unsigned long __stop_kprobe_error_inject_list[];
+
+/*
+ * Lookup and populate the kprobe_error_injection_list.
+ *
+ * For safety reasons we only allow certain functions to be overridden with
+ * bpf_error_injection, so we need to populate the list of the symbols that have
+ * been marked as safe for overriding.
+ */
+static void populate_kprobe_error_injection_list(unsigned long *start,
+						 unsigned long *end,
+						 void *priv)
+{
+	unsigned long *iter;
+	struct kprobe_ei_entry *ent;
+	unsigned long entry, offset = 0, size = 0;
+
+	mutex_lock(&kprobe_ei_mutex);
+	for (iter = start; iter < end; iter++) {
+		entry = arch_deref_entry_point((void *)*iter);
+
+		if (!kernel_text_address(entry) ||
+		    !kallsyms_lookup_size_offset(entry, &size, &offset)) {
+			pr_err("Failed to find error inject entry at %p\n",
+				(void *)entry);
+			continue;
+		}
+
+		ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+		if (!ent)
+			break;
+		ent->start_addr = entry;
+		ent->end_addr = entry + size;
+		ent->priv = priv;
+		INIT_LIST_HEAD(&ent->list);
+		list_add_tail(&ent->list, &kprobe_error_injection_list);
+	}
+	mutex_unlock(&kprobe_ei_mutex);
+}
+
+static void __init populate_kernel_kprobe_ei_list(void)
+{
+	populate_kprobe_error_injection_list(__start_kprobe_error_inject_list,
+					     __stop_kprobe_error_inject_list,
+					     NULL);
+}
+
+static void module_load_kprobe_ei_list(struct module *mod)
+{
+	if (!mod->num_kprobe_ei_funcs)
+		return;
+	populate_kprobe_error_injection_list(mod->kprobe_ei_funcs,
+					     mod->kprobe_ei_funcs +
+					     mod->num_kprobe_ei_funcs, mod);
+}
+
+static void module_unload_kprobe_ei_list(struct module *mod)
+{
+	struct kprobe_ei_entry *ent, *n;
+	if (!mod->num_kprobe_ei_funcs)
+		return;
+
+	mutex_lock(&kprobe_ei_mutex);
+	list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) {
+		if (ent->priv == mod) {
+			list_del_init(&ent->list);
+			kfree(ent);
+		}
+	}
+	mutex_unlock(&kprobe_ei_mutex);
+}
+#else
+static inline void __init populate_kernel_kprobe_ei_list(void) {}
+static inline void module_load_kprobe_ei_list(struct module *m) {}
+static inline void module_unload_kprobe_ei_list(struct module *m) {}
+#endif
+
 /* Module notifier call back, checking kprobes on the module */
 static int kprobes_module_callback(struct notifier_block *nb,
 				   unsigned long val, void *data)
@@ -2178,6 +2279,11 @@ static int kprobes_module_callback(struct notifier_block *nb,
 	unsigned int i;
 	int checkcore = (val == MODULE_STATE_GOING);
 
+	if (val == MODULE_STATE_COMING)
+		module_load_kprobe_ei_list(mod);
+	else if (val == MODULE_STATE_GOING)
+		module_unload_kprobe_ei_list(mod);
+
 	if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
 		return NOTIFY_DONE;
 
@@ -2240,6 +2346,8 @@ static int __init init_kprobes(void)
 		pr_err("Please take care of using kprobes.\n");
 	}
 
+	populate_kernel_kprobe_ei_list();
+
 	if (kretprobe_blacklist_size) {
 		/* lookup the function address from its name */
 		for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
@@ -2407,6 +2515,56 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = {
 	.release        = seq_release,
 };
 
+/*
+ * kprobes/error_injection_list -- shows which functions can be overridden for
+ * error injection.
+ */
+static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&kprobe_ei_mutex);
+	return seq_list_start(&kprobe_error_injection_list, *pos);
+}
+
+static void kprobe_ei_seq_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&kprobe_ei_mutex);
+}
+
+static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &kprobe_error_injection_list, pos);
+}
+
+static int kprobe_ei_seq_show(struct seq_file *m, void *v)
+{
+	char buffer[KSYM_SYMBOL_LEN];
+	struct kprobe_ei_entry *ent =
+		list_entry(v, struct kprobe_ei_entry, list);
+
+	sprint_symbol(buffer, ent->start_addr);
+	seq_printf(m, "%s\n", buffer);
+	return 0;
+}
+
+static const struct seq_operations kprobe_ei_seq_ops = {
+	.start = kprobe_ei_seq_start,
+	.next  = kprobe_ei_seq_next,
+	.stop  = kprobe_ei_seq_stop,
+	.show  = kprobe_ei_seq_show,
+};
+
+static int kprobe_ei_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &kprobe_ei_seq_ops);
+}
+
+static const struct file_operations debugfs_kprobe_ei_ops = {
+	.open           = kprobe_ei_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+};
+
 static void arm_all_kprobes(void)
 {
 	struct hlist_head *head;
@@ -2548,6 +2706,11 @@ static int __init debugfs_kprobe_init(void)
 	if (!file)
 		goto error;
 
+	file = debugfs_create_file("error_injection_list", 0444, dir, NULL,
+				  &debugfs_kprobe_ei_ops);
+	if (!file)
+		goto error;
+
 	return 0;
 
 error:
diff --git a/kernel/module.c b/kernel/module.c
index dea01ac9cb74..bd695bfdc5c4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3118,7 +3118,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 					     sizeof(*mod->ftrace_callsites),
 					     &mod->num_ftrace_callsites);
 #endif
-
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+	mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list",
+					    sizeof(*mod->kprobe_ei_funcs),
+					    &mod->num_kprobe_ei_funcs);
+#endif
 	mod->extable = section_objs(info, "__ex_table",
 				    sizeof(*mod->extable), &mod->num_exentries);
 
-- 
2.7.5

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox