* [RFC v2 bpf-next 5/9] net/ipv6: Add fib6_lookup
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180429180752.15428-1-dsahern@gmail.com>
Add IPv6 equivalent to fib_lookup. Does a fib lookup, including rules,
but returns a FIB entry, fib6_info, rather than a dst based rt6_info.
fib6_lookup is on the order of 40% faster than any of the dst based
lookup methods without custom rules and 25% faster with custom rules.
Since the lookup function has a completely different signature,
fib6_rule_action is split into 2 paths: the existing one is
renamed __fib6_rule_action and a new one for the fib6_info path
is added. fib6_rule_action decides which to call based on the
lookup_ptr. If it is fib6_table_lookup then the new path is taken.
Caller must hold rcu lock as no reference is taken on the returned
fib entry.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/net/ip6_fib.h | 6 ++++
net/ipv6/fib6_rules.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++--
net/ipv6/ip6_fib.c | 7 +++++
3 files changed, 97 insertions(+), 2 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 4f7b8f59ea6d..d920dd00139b 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,12 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb,
int flags, pol_lookup_t lookup);
+/* called with rcu lock held; can return error pointer
+ * caller needs to select path
+ */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags);
+
/* called with rcu lock held; caller needs to select path */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
int oif, struct flowi6 *fl6, int strict);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index d040c4bff3a0..f590446595d8 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -60,6 +60,39 @@ unsigned int fib6_rules_seq_read(struct net *net)
return fib_rules_seq_read(net, AF_INET6);
}
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags)
+{
+ struct fib6_info *f6i;
+ int err;
+
+ if (net->ipv6.fib6_has_custom_rules) {
+ struct fib_lookup_arg arg = {
+ .lookup_ptr = fib6_table_lookup,
+ .lookup_data = &oif,
+ .flags = FIB_LOOKUP_NOREF,
+ };
+
+ l3mdev_update_flow(net, flowi6_to_flowi(fl6));
+
+ err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
+ flowi6_to_flowi(fl6), flags, &arg);
+ if (err)
+ return ERR_PTR(err);
+
+ f6i = arg.result ? : net->ipv6.fib6_null_entry;
+ } else {
+ f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
+ oif, fl6, flags);
+ if (!f6i || f6i == net->ipv6.fib6_null_entry)
+ f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+ oif, fl6, flags);
+ }
+
+ return f6i;
+}
+
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb,
int flags, pol_lookup_t lookup)
@@ -121,8 +154,48 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
return 0;
}
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
- int flags, struct fib_lookup_arg *arg)
+static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ struct flowi6 *flp6 = &flp->u.ip6;
+ struct net *net = rule->fr_net;
+ struct fib6_table *table;
+ struct fib6_info *f6i;
+ int err = -EAGAIN, *oif;
+ u32 tb_id;
+
+ switch (rule->action) {
+ case FR_ACT_TO_TBL:
+ break;
+ case FR_ACT_UNREACHABLE:
+ return -ENETUNREACH;
+ case FR_ACT_PROHIBIT:
+ return -EACCES;
+ case FR_ACT_BLACKHOLE:
+ default:
+ return -EINVAL;
+ }
+
+ tb_id = fib_rule_get_table(rule, arg);
+ table = fib6_get_table(net, tb_id);
+ if (!table)
+ return -EAGAIN;
+
+ oif = (int *)arg->lookup_data;
+ f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
+ if (f6i != net->ipv6.fib6_null_entry) {
+ err = fib6_rule_saddr(net, rule, flags, flp6,
+ fib6_info_nh_dev(f6i));
+
+ if (likely(!err))
+ arg->result = f6i;
+ }
+
+ return err;
+}
+
+static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
{
struct flowi6 *flp6 = &flp->u.ip6;
struct rt6_info *rt = NULL;
@@ -182,6 +255,15 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
return err;
}
+static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+ int flags, struct fib_lookup_arg *arg)
+{
+ if (arg->lookup_ptr == fib6_table_lookup)
+ return fib6_rule_action_alt(rule, flp, flags, arg);
+
+ return __fib6_rule_action(rule, flp, flags, arg);
+}
+
static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
{
struct rt6_info *rt = (struct rt6_info *) arg->result;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 4cfffa0f676e..0b94c0a631cb 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -354,6 +354,13 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
return &rt->dst;
}
+/* called with rcu lock held; no reference taken on fib6_info */
+struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ int flags)
+{
+ return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
+}
+
static void __net_init fib6_tables_init(struct net *net)
{
fib6_link_table(net, net->ipv6.fib6_main_tbl);
--
2.11.0
^ permalink raw reply related
* [RFC v2 bpf-next 4/9] net/ipv6: Refactor fib6_rule_action
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180429180752.15428-1-dsahern@gmail.com>
Move source address lookup from fib6_rule_action to a helper. It will be
used in a later patch by a second variant for fib6_rule_action.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
net/ipv6/fib6_rules.c | 52 ++++++++++++++++++++++++++++++---------------------
1 file changed, 31 insertions(+), 21 deletions(-)
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 6547fc6491a6..d040c4bff3a0 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -96,6 +96,31 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
return &net->ipv6.ip6_null_entry->dst;
}
+static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
+ struct flowi6 *flp6, const struct net_device *dev)
+{
+ struct fib6_rule *r = (struct fib6_rule *)rule;
+
+ /* If we need to find a source address for this traffic,
+ * we check the result if it meets requirement of the rule.
+ */
+ if ((rule->flags & FIB_RULE_FIND_SADDR) &&
+ r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
+ struct in6_addr saddr;
+
+ if (ipv6_dev_get_saddr(net, dev, &flp6->daddr,
+ rt6_flags2srcprefs(flags), &saddr))
+ return -EAGAIN;
+
+ if (!ipv6_prefix_equal(&saddr, &r->src.addr, r->src.plen))
+ return -EAGAIN;
+
+ flp6->saddr = saddr;
+ }
+
+ return 0;
+}
+
static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
int flags, struct fib_lookup_arg *arg)
{
@@ -134,27 +159,12 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
rt = lookup(net, table, flp6, arg->lookup_data, flags);
if (rt != net->ipv6.ip6_null_entry) {
- struct fib6_rule *r = (struct fib6_rule *)rule;
-
- /*
- * If we need to find a source address for this traffic,
- * we check the result if it meets requirement of the rule.
- */
- if ((rule->flags & FIB_RULE_FIND_SADDR) &&
- r->src.plen && !(flags & RT6_LOOKUP_F_HAS_SADDR)) {
- struct in6_addr saddr;
-
- if (ipv6_dev_get_saddr(net,
- ip6_dst_idev(&rt->dst)->dev,
- &flp6->daddr,
- rt6_flags2srcprefs(flags),
- &saddr))
- goto again;
- if (!ipv6_prefix_equal(&saddr, &r->src.addr,
- r->src.plen))
- goto again;
- flp6->saddr = saddr;
- }
+ err = fib6_rule_saddr(net, rule, flags, flp6,
+ ip6_dst_idev(&rt->dst)->dev);
+
+ if (err == -EAGAIN)
+ goto again;
+
err = rt->dst.error;
if (err != -EAGAIN)
goto out;
--
2.11.0
^ permalink raw reply related
* [RFC v2 bpf-next 3/9] net/ipv6: Extract table lookup from ip6_pol_route
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180429180752.15428-1-dsahern@gmail.com>
ip6_pol_route is used for ingress and egress FIB lookups. Refactor it
moving the table lookup into a separate fib6_table_lookup that can be
invoked separately and export the new function.
ip6_pol_route now calls fib6_table_lookup and uses the result to generate
a dst based rt6_info.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/net/ip6_fib.h | 4 ++++
net/ipv6/route.c | 39 +++++++++++++++++++++++++--------------
2 files changed, 29 insertions(+), 14 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 80d76d8dc683..4f7b8f59ea6d 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb,
int flags, pol_lookup_t lookup);
+/* called with rcu lock held; caller needs to select path */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6, int strict);
+
struct fib6_info *fib6_multipath_select(const struct net *net,
struct fib6_info *match,
struct flowi6 *fl6, int oif,
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 58af969f3a2c..d0ace0c5c3e9 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1800,21 +1800,12 @@ void rt6_age_exceptions(struct fib6_info *rt,
rcu_read_unlock_bh();
}
-struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
- int oif, struct flowi6 *fl6,
- const struct sk_buff *skb, int flags)
+/* must be called with rcu lock held */
+struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6, int strict)
{
struct fib6_node *fn, *saved_fn;
struct fib6_info *f6i;
- struct rt6_info *rt;
- int strict = 0;
-
- strict |= flags & RT6_LOOKUP_F_IFACE;
- strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
- if (net->ipv6.devconf_all->forwarding == 0)
- strict |= RT6_LOOKUP_F_REACHABLE;
-
- rcu_read_lock();
fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
saved_fn = fn;
@@ -1824,8 +1815,6 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
redo_rt6_select:
f6i = rt6_select(net, fn, oif, strict);
- if (f6i->fib6_nsiblings)
- f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
if (f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
@@ -1838,6 +1827,28 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
}
}
+ return f6i;
+}
+
+struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
+ int oif, struct flowi6 *fl6,
+ const struct sk_buff *skb, int flags)
+{
+ struct fib6_info *f6i;
+ struct rt6_info *rt;
+ int strict = 0;
+
+ strict |= flags & RT6_LOOKUP_F_IFACE;
+ strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
+ if (net->ipv6.devconf_all->forwarding == 0)
+ strict |= RT6_LOOKUP_F_REACHABLE;
+
+ rcu_read_lock();
+
+ f6i = fib6_table_lookup(net, table, oif, fl6, strict);
+ if (f6i->fib6_nsiblings)
+ f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
+
if (f6i == net->ipv6.fib6_null_entry) {
rt = net->ipv6.ip6_null_entry;
rcu_read_unlock();
--
2.11.0
^ permalink raw reply related
* [RFC v2 bpf-next 2/9] net/ipv6: Rename rt6_multipath_select
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180429180752.15428-1-dsahern@gmail.com>
Rename rt6_multipath_select to fib6_multipath_select and export it.
A later patch wants access to it similar to IPv4's fib_select_path.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/net/ip6_fib.h | 5 +++++
net/ipv6/route.c | 17 +++++++++--------
2 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 5a16630179cb..80d76d8dc683 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,6 +376,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb,
int flags, pol_lookup_t lookup);
+struct fib6_info *fib6_multipath_select(const struct net *net,
+ struct fib6_info *match,
+ struct flowi6 *fl6, int oif,
+ const struct sk_buff *skb, int strict);
+
struct fib6_node *fib6_node_lookup(struct fib6_node *root,
const struct in6_addr *daddr,
const struct in6_addr *saddr);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d903db30dfff..58af969f3a2c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -419,11 +419,11 @@ static bool rt6_check_expired(const struct rt6_info *rt)
return false;
}
-static struct fib6_info *rt6_multipath_select(const struct net *net,
- struct fib6_info *match,
- struct flowi6 *fl6, int oif,
- const struct sk_buff *skb,
- int strict)
+struct fib6_info *fib6_multipath_select(const struct net *net,
+ struct fib6_info *match,
+ struct flowi6 *fl6, int oif,
+ const struct sk_buff *skb,
+ int strict)
{
struct fib6_info *sibling, *next_sibling;
@@ -1068,8 +1068,9 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
f6i = rt6_device_match(net, f6i, &fl6->saddr,
fl6->flowi6_oif, flags);
if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
- f6i = rt6_multipath_select(net, f6i, fl6,
- fl6->flowi6_oif, skb, flags);
+ f6i = fib6_multipath_select(net, f6i, fl6,
+ fl6->flowi6_oif, skb,
+ flags);
}
if (f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
@@ -1824,7 +1825,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
redo_rt6_select:
f6i = rt6_select(net, fn, oif, strict);
if (f6i->fib6_nsiblings)
- f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
+ f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
if (f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
--
2.11.0
^ permalink raw reply related
* [RFC v2 bpf-next 1/9] net/ipv6: Rename fib6_lookup to fib6_node_lookup
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
In-Reply-To: <20180429180752.15428-1-dsahern@gmail.com>
Rename fib6_lookup to fib6_node_lookup to better reflect what it
returns. The fib6_lookup name will be used in a later patch for
an IPv6 equivalent to IPv4's fib_lookup.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/net/ip6_fib.h | 6 +++---
net/ipv6/ip6_fib.c | 14 ++++++++------
net/ipv6/route.c | 8 ++++----
3 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1af450d4e923..5a16630179cb 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -376,9 +376,9 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
const struct sk_buff *skb,
int flags, pol_lookup_t lookup);
-struct fib6_node *fib6_lookup(struct fib6_node *root,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr);
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr);
struct fib6_node *fib6_locate(struct fib6_node *root,
const struct in6_addr *daddr, int dst_len,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 6421c893466e..4cfffa0f676e 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1354,8 +1354,8 @@ struct lookup_args {
const struct in6_addr *addr; /* search key */
};
-static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
- struct lookup_args *args)
+static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
+ struct lookup_args *args)
{
struct fib6_node *fn;
__be32 dir;
@@ -1400,7 +1400,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
#ifdef CONFIG_IPV6_SUBTREES
if (subtree) {
struct fib6_node *sfn;
- sfn = fib6_lookup_1(subtree, args + 1);
+ sfn = fib6_node_lookup_1(subtree,
+ args + 1);
if (!sfn)
goto backtrack;
fn = sfn;
@@ -1422,8 +1423,9 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root,
/* called with rcu_read_lock() held
*/
-struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr,
- const struct in6_addr *saddr)
+struct fib6_node *fib6_node_lookup(struct fib6_node *root,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
struct fib6_node *fn;
struct lookup_args args[] = {
@@ -1442,7 +1444,7 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad
}
};
- fn = fib6_lookup_1(root, daddr ? args : args + 1);
+ fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
if (!fn || fn->fn_flags & RTN_TL_ROOT)
fn = root;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 7ee0a34fba46..d903db30dfff 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1006,7 +1006,7 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
pn = rcu_dereference(fn->parent);
sn = FIB6_SUBTREE(pn);
if (sn && sn != fn)
- fn = fib6_lookup(sn, NULL, saddr);
+ fn = fib6_node_lookup(sn, NULL, saddr);
else
fn = pn;
if (fn->fn_flags & RTN_RTINFO)
@@ -1059,7 +1059,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
flags &= ~RT6_LOOKUP_F_IFACE;
rcu_read_lock();
- fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+ fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
f6i = rcu_dereference(fn->leaf);
if (!f6i) {
@@ -1815,7 +1815,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
rcu_read_lock();
- fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+ fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
saved_fn = fn;
if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
@@ -2420,7 +2420,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
*/
rcu_read_lock();
- fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+ fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
for_each_fib6_node_rt_rcu(fn) {
if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
--
2.11.0
^ permalink raw reply related
* [RFC v2 bpf-next 0/9] bpf: Add helper to do FIB lookups
From: David Ahern @ 2018-04-29 18:07 UTC (permalink / raw)
To: netdev, borkmann, ast
Cc: davem, shm, roopa, brouer, toke, john.fastabend, David Ahern
Provide a helper for doing a FIB and neighbor lookups in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet is expected to continue up the stack
for full processing.
Patches 1-6 do some more refactoring to IPv6 with the end goal of
extracting a FIB lookup function that aligns with fib_lookup for IPv4,
basically returning a fib6_info without creating a dst based entry.
Patch 7 adds lookup functions to the ipv6 stub. These are needed since
bpf is built into the kernel and ipv6 may not be built or loaded.
Patch 8 adds the bpf helper and 9 adds a sample program.
v2
- fixed use of foward helper from cls_act as noted by Daniel
- in patch 1 rename fib6_lookup_1 as well for consistency
David Ahern (9):
net/ipv6: Rename fib6_lookup to fib6_node_lookup
net/ipv6: Rename rt6_multipath_select
net/ipv6: Extract table lookup from ip6_pol_route
net/ipv6: Refactor fib6_rule_action
net/ipv6: Add fib6_lookup
net/ipv6: Update fib6 tracepoint to take fib6_info
net/ipv6: Add fib lookup stubs for use in bpf helper
bpf: Provide helper to do lookups in kernel FIB table
samples/bpf: Add examples of ipv4 and ipv6 forwarding in XDP
include/net/addrconf.h | 14 ++
include/net/ip6_fib.h | 21 ++-
include/trace/events/fib6.h | 14 +-
include/uapi/linux/bpf.h | 83 +++++++++-
net/core/filter.c | 263 ++++++++++++++++++++++++++++++
net/ipv6/addrconf_core.c | 33 +++-
net/ipv6/af_inet6.c | 6 +-
net/ipv6/fib6_rules.c | 138 +++++++++++++---
net/ipv6/ip6_fib.c | 21 ++-
net/ipv6/route.c | 76 +++++----
samples/bpf/Makefile | 4 +
samples/bpf/xdp_fwd_kern.c | 110 +++++++++++++
samples/bpf/xdp_fwd_user.c | 136 +++++++++++++++
tools/testing/selftests/bpf/bpf_helpers.h | 3 +
14 files changed, 847 insertions(+), 75 deletions(-)
create mode 100644 samples/bpf/xdp_fwd_kern.c
create mode 100644 samples/bpf/xdp_fwd_user.c
--
2.11.0
^ permalink raw reply
* [PATCH net] ipv6: fix uninit-value in ip6_multipath_l3_keys()
From: Eric Dumazet @ 2018-04-29 16:54 UTC (permalink / raw)
To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet, Jakub Sitnicki
syzbot/KMSAN reported an uninit-value in ip6_multipath_l3_keys(),
root caused to a bad assumption of ICMP header being already
pulled in skb->head
ip_multipath_l3_keys() does the correct thing, so it is an IPv6 only bug.
BUG: KMSAN: uninit-value in ip6_multipath_l3_keys net/ipv6/route.c:1830 [inline]
BUG: KMSAN: uninit-value in rt6_multipath_hash+0x5c4/0x640 net/ipv6/route.c:1858
CPU: 0 PID: 4507 Comm: syz-executor661 Not tainted 4.16.0+ #87
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x185/0x1d0 lib/dump_stack.c:53
kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
__msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
ip6_multipath_l3_keys net/ipv6/route.c:1830 [inline]
rt6_multipath_hash+0x5c4/0x640 net/ipv6/route.c:1858
ip6_route_input+0x65a/0x920 net/ipv6/route.c:1884
ip6_rcv_finish+0x413/0x6e0 net/ipv6/ip6_input.c:69
NF_HOOK include/linux/netfilter.h:288 [inline]
ipv6_rcv+0x1e16/0x2340 net/ipv6/ip6_input.c:208
__netif_receive_skb_core+0x47df/0x4a90 net/core/dev.c:4562
__netif_receive_skb net/core/dev.c:4627 [inline]
netif_receive_skb_internal+0x49d/0x630 net/core/dev.c:4701
netif_receive_skb+0x230/0x240 net/core/dev.c:4725
tun_rx_batched drivers/net/tun.c:1555 [inline]
tun_get_user+0x740f/0x7c60 drivers/net/tun.c:1962
tun_chr_write_iter+0x1d4/0x330 drivers/net/tun.c:1990
call_write_iter include/linux/fs.h:1782 [inline]
new_sync_write fs/read_write.c:469 [inline]
__vfs_write+0x7fb/0x9f0 fs/read_write.c:482
vfs_write+0x463/0x8d0 fs/read_write.c:544
SYSC_write+0x172/0x360 fs/read_write.c:589
SyS_write+0x55/0x80 fs/read_write.c:581
do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
Fixes: 23aebdacb05d ("ipv6: Compute multipath hash for ICMP errors from offending packet")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
Cc: Jakub Sitnicki <jkbs@redhat.com>
---
net/ipv6/route.c | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index cde7d8251377c1a115e02c46843d361d3c0b4313..f4d61736c41abe8cd7f439c4a37100e90c1eacca 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1835,11 +1835,16 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb,
const struct ipv6hdr *inner_iph;
const struct icmp6hdr *icmph;
struct ipv6hdr _inner_iph;
+ struct icmp6hdr _icmph;
if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
goto out;
- icmph = icmp6_hdr(skb);
+ icmph = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_icmph), &_icmph);
+ if (!icmph)
+ goto out;
+
if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
--
2.17.0.441.gb46fe60e1d-goog
^ permalink raw reply related
* Re: [PATCH bpf-next v3 2/4] bpf: sockmap, add hash map support
From: John Fastabend @ 2018-04-29 16:41 UTC (permalink / raw)
To: Alexei Starovoitov; +Cc: daniel, ast, netdev
In-Reply-To: <20180429161620.qrtjehd4qozixgfx@ast-mbp>
On 04/29/2018 09:16 AM, Alexei Starovoitov wrote:
> On Sat, Apr 28, 2018 at 08:41:28PM -0700, John Fastabend wrote:
>> Sockmap is currently backed by an array and enforces keys to be
>> four bytes. This works well for many use cases and was originally
>> modeled after devmap which also uses four bytes keys. However,
>> this has become limiting in larger use cases where a hash would
>> be more appropriate. For example users may want to use the 5-tuple
>> of the socket as the lookup key.
>>
>> To support this add hash support.
>>
>> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
> ...
>> enum bpf_prog_type {
>> @@ -1835,7 +1836,10 @@ struct bpf_stack_build_id {
>> FN(msg_pull_data), \
>> FN(bind), \
>> FN(xdp_adjust_tail), \
>> - FN(skb_get_xfrm_state),
>> + FN(skb_get_xfrm_state), \
>> + FN(sock_hash_update), \
>> + FN(msg_redirect_hash), \
>> + FN(sk_redirect_hash),
>
> Documentation for new helpers is missing. Please add it in this commit.
>
OK.
> Also running test_sockmap in the latest bpf-next I see:
> [TEST 81]: (10, 1024, 256, sendpage, pass,apply,): [ 14.227128] WARNING: CPU: 1 PID: 202 at ../net/core/stream.c:206 sk_stream_kill_queues+0x3ca/0x540
> [ 14.228209] Modules linked in:
> [ 14.228583] CPU: 1 PID: 202 Comm: test_sockmap Not tainted 4.17.0-rc2-00438-gfcf85729d8ef #941
> [ 14.229595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
> [ 14.230649] RIP: 0010:sk_stream_kill_queues+0x3ca/0x540
> [ 14.231251] RSP: 0018:ffff880111717d00 EFLAGS: 00010202
> [ 14.231855] RAX: 0000000000000000 RBX: ffff8801127f8978 RCX: 1ffff100224ff138
> [ 14.232664] RDX: 0000000000000000 RSI: 0000000000000f01 RDI: ffff8801127f89c0
> [ 14.233521] RBP: ffff8801127f8948 R08: ffffed00224ff122 R09: ffffed00224ff121
> [ 14.234342] R10: ffff8801127f890b R11: ffffed00224ff122 R12: ffff8801127f8998
> [ 14.235169] R13: ffff8801127f88a8 R14: ffff8801127f8880 R15: ffff8801127f8930
> [ 14.235987] FS: 00007f3d46898700(0000) GS:ffff88011b080000(0000) knlGS:0000000000000000
> [ 14.236946] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 14.237607] CR2: 00007f3d45e3caf0 CR3: 0000000112da0001 CR4: 00000000000606e0
> [ 14.238433] Call Trace:
> [ 14.238741] ? tcp_v4_destroy_sock+0x273/0x4d0
> [ 14.239259] inet_csk_destroy_sock+0x157/0x390
> [ 14.239815] tcp_close+0x683/0xf60
> [ 14.240214] inet_release+0xd6/0x1b0
> [ 14.240636] sock_release+0x7e/0x190
> [ 14.241090] sock_close+0xe/0x20
> [ 14.241593] __fput+0x22f/0x720
> [ 14.241981] task_work_run+0x10b/0x190
> [ 14.242425] exit_to_usermode_loop+0xd2/0xf0
> [ 14.242963] do_syscall_64+0x1d5/0x270
> [ 14.243410] ? page_fault+0x8/0x30
> [ 14.243830] entry_SYSCALL_64_after_hwframe+0x44/0xa9
>
> and
>
> [TEST 84]: (1, 1, 1, sendmsg, pass,cork,): [ 14.317100] WARNING: CPU: 1 PID: 66 at ../net/ipv4/af_inet.c:156 inet_sock_destruct+0x570/0x7b0
> [ 14.318147] Modules linked in:
> [ 14.318522] CPU: 1 PID: 66 Comm: kworker/1:1 Tainted: G W 4.17.0-rc2-00438-gfcf85729d8ef #941
> [ 14.319674] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
> [ 14.320764] Workqueue: events smap_gc_work
> [ 14.321262] RIP: 0010:inet_sock_destruct+0x570/0x7b0
> [ 14.321829] RSP: 0018:ffff880118617d08 EFLAGS: 00010206
> [ 14.322415] RAX: 0000000000000ffc RBX: ffff8801127f9980 RCX: ffffffff82c7ac64
> [ 14.323221] RDX: 1ffff100224ff34f RSI: 0000000000000004 RDI: ffff8801127f9a78
> [ 14.324054] RBP: ffff8801127f9ac4 R08: ffffed00224ff359 R09: ffffed00224ff358
> [ 14.324860] R10: ffff8801127f9ac7 R11: ffffed00224ff359 R12: ffff880111107688
> [ 14.325661] R13: dffffc0000000000 R14: ffff880111107688 R15: dead000000000100
> [ 14.326503] FS: 0000000000000000(0000) GS:ffff88011b080000(0000) knlGS:0000000000000000
> [ 14.327413] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 14.328091] CR2: 00007f3d45e3caf0 CR3: 0000000003a0e004 CR4: 00000000000606e0
> [ 14.328936] Call Trace:
> [ 14.329225] __sk_destruct+0x45/0x460
> [ 14.329646] smap_gc_work+0x69f/0x950
> [ 14.330102] process_one_work+0x7be/0x1230
> [ 14.330600] worker_thread+0xd9/0x1080
> [ 14.331059] ? rescuer_thread+0xd70/0xd70
> [ 14.331543] kthread+0x29e/0x390
> [ 14.331945] ? kthread_create_worker_on_cpu+0xb0/0xb0
> [ 14.332555] ret_from_fork+0x1f/0x30
>
> Please send a fix for this issue first.
> I'd like to see test_sockmap being clean before adding hash support to sockmap.
>
Sure, I'll have a fix shortly.
^ permalink raw reply
* [PATCH net-next 2/2] sctp: add sctp_make_op_error_limited and reuse inner functions
From: Marcelo Ricardo Leitner @ 2018-04-29 15:56 UTC (permalink / raw)
To: netdev; +Cc: linux-sctp, Vlad Yasevich, Neil Horman, Xin Long
In-Reply-To: <cover.1525017179.git.marcelo.leitner@gmail.com>
The idea is quite similar to the old functions, but note that the _fixed
function wasn't "fixed" as in that it would generate a packet with a fixed
size, but rather limited/bounded to PMTU.
Also, now with sctp_mtu_payload(), we have a more accurate limit.
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
net/sctp/sm_make_chunk.c | 130 +++++++++++++++++------------------------------
1 file changed, 46 insertions(+), 84 deletions(-)
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index e518eb64ccf3578f7892da050c160a56cf3cc833..4d7b3ccea0789f3a695f710046b50855e4cc41fc 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -81,8 +81,6 @@ static int sctp_process_param(struct sctp_association *asoc,
gfp_t gfp);
static void *sctp_addto_param(struct sctp_chunk *chunk, int len,
const void *data);
-static void *sctp_addto_chunk_fixed(struct sctp_chunk *, int len,
- const void *data);
/* Control chunk destructor */
static void sctp_control_release_owner(struct sk_buff *skb)
@@ -154,9 +152,8 @@ static const struct sctp_paramhdr prsctp_param = {
cpu_to_be16(sizeof(struct sctp_paramhdr)),
};
-/* A helper to initialize an op error inside a
- * provided chunk, as most cause codes will be embedded inside an
- * abort chunk.
+/* A helper to initialize an op error inside a provided chunk, as most
+ * cause codes will be embedded inside an abort chunk.
*/
int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
size_t paylen)
@@ -177,29 +174,6 @@ int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
return 0;
}
-/* A helper to initialize an op error inside a
- * provided chunk, as most cause codes will be embedded inside an
- * abort chunk. Differs from sctp_init_cause in that it won't oops
- * if there isn't enough space in the op error chunk
- */
-static int sctp_init_cause_fixed(struct sctp_chunk *chunk, __be16 cause_code,
- size_t paylen)
-{
- struct sctp_errhdr err;
- __u16 len;
-
- /* Cause code constants are now defined in network order. */
- err.cause = cause_code;
- len = sizeof(err) + paylen;
- err.length = htons(len);
-
- if (skb_tailroom(chunk->skb) < len)
- return -ENOSPC;
-
- chunk->subh.err_hdr = sctp_addto_chunk_fixed(chunk, sizeof(err), &err);
-
- return 0;
-}
/* 3.3.2 Initiation (INIT) (1)
*
* This chunk is used to initiate a SCTP association between two
@@ -1263,20 +1237,26 @@ static struct sctp_chunk *sctp_make_op_error_space(
return retval;
}
-/* Create an Operation Error chunk of a fixed size,
- * specifically, max(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT)
- * This is a helper function to allocate an error chunk for
- * for those invalid parameter codes in which we may not want
- * to report all the errors, if the incoming chunk is large
+/* Create an Operation Error chunk of a fixed size, specifically,
+ * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads.
+ * This is a helper function to allocate an error chunk for for those
+ * invalid parameter codes in which we may not want to report all the
+ * errors, if the incoming chunk is large. If it can't fit in a single
+ * packet, we ignore it.
*/
-static inline struct sctp_chunk *sctp_make_op_error_fixed(
+static inline struct sctp_chunk *sctp_make_op_error_limited(
const struct sctp_association *asoc,
const struct sctp_chunk *chunk)
{
- size_t size = asoc ? asoc->pathmtu : 0;
+ size_t size = SCTP_DEFAULT_MAXSEGMENT;
+ struct sctp_sock *sp = NULL;
+
+ if (asoc) {
+ size = min_t(size_t, size, asoc->pathmtu);
+ sp = sctp_sk(asoc->base.sk);
+ }
- if (!size)
- size = SCTP_DEFAULT_MAXSEGMENT;
+ size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr));
return sctp_make_op_error_space(asoc, chunk, size);
}
@@ -1528,18 +1508,6 @@ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data)
return target;
}
-/* Append bytes to the end of a chunk. Returns NULL if there isn't sufficient
- * space in the chunk
- */
-static void *sctp_addto_chunk_fixed(struct sctp_chunk *chunk,
- int len, const void *data)
-{
- if (skb_tailroom(chunk->skb) >= len)
- return sctp_addto_chunk(chunk, len, data);
- else
- return NULL;
-}
-
/* Append bytes from user space to the end of a chunk. Will panic if
* chunk is not big enough.
* Returns a kernel err value.
@@ -1834,6 +1802,9 @@ struct sctp_association *sctp_unpack_cookie(
kt = ktime_get_real();
if (!asoc && ktime_before(bear_cookie->expiration, kt)) {
+ suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration));
+ __be32 n = htonl(usecs);
+
/*
* Section 3.3.10.3 Stale Cookie Error (3)
*
@@ -1842,17 +1813,12 @@ struct sctp_association *sctp_unpack_cookie(
* Stale Cookie Error: Indicates the receipt of a valid State
* Cookie that has expired.
*/
- len = ntohs(chunk->chunk_hdr->length);
- *errp = sctp_make_op_error_space(asoc, chunk, len);
- if (*errp) {
- suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration));
- __be32 n = htonl(usecs);
-
- sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE,
- sizeof(n));
- sctp_addto_chunk(*errp, sizeof(n), &n);
+ *errp = sctp_make_op_error(asoc, chunk,
+ SCTP_ERROR_STALE_COOKIE, &n,
+ sizeof(n), 0);
+ if (*errp)
*error = -SCTP_IERROR_STALE_COOKIE;
- } else
+ else
*error = -SCTP_IERROR_NOMEM;
goto fail;
@@ -2003,12 +1969,8 @@ static int sctp_process_hn_param(const struct sctp_association *asoc,
if (*errp)
sctp_chunk_free(*errp);
- *errp = sctp_make_op_error_space(asoc, chunk, len);
-
- if (*errp) {
- sctp_init_cause(*errp, SCTP_ERROR_DNS_FAILED, len);
- sctp_addto_chunk(*errp, len, param.v);
- }
+ *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED,
+ param.v, len, 0);
/* Stop processing this chunk. */
return 0;
@@ -2133,23 +2095,23 @@ static enum sctp_ierror sctp_process_unk_param(
/* Make an ERROR chunk, preparing enough room for
* returning multiple unknown parameters.
*/
- if (NULL == *errp)
- *errp = sctp_make_op_error_fixed(asoc, chunk);
-
- if (*errp) {
- if (!sctp_init_cause_fixed(*errp, SCTP_ERROR_UNKNOWN_PARAM,
- SCTP_PAD4(ntohs(param.p->length))))
- sctp_addto_chunk_fixed(*errp,
- SCTP_PAD4(ntohs(param.p->length)),
- param.v);
- } else {
- /* If there is no memory for generating the ERROR
- * report as specified, an ABORT will be triggered
- * to the peer and the association won't be
- * established.
- */
- retval = SCTP_IERROR_NOMEM;
+ if (!*errp) {
+ *errp = sctp_make_op_error_limited(asoc, chunk);
+ if (!*errp) {
+ /* If there is no memory for generating the
+ * ERROR report as specified, an ABORT will be
+ * triggered to the peer and the association
+ * won't be established.
+ */
+ retval = SCTP_IERROR_NOMEM;
+ break;
+ }
}
+
+ if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM,
+ ntohs(param.p->length)))
+ sctp_addto_chunk(*errp, ntohs(param.p->length),
+ param.v);
break;
default:
break;
@@ -2225,10 +2187,10 @@ static enum sctp_ierror sctp_verify_param(struct net *net,
* MUST be aborted. The ABORT chunk SHOULD contain the error
* cause 'Protocol Violation'.
*/
- if (SCTP_AUTH_RANDOM_LENGTH !=
- ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) {
+ if (SCTP_AUTH_RANDOM_LENGTH != ntohs(param.p->length) -
+ sizeof(struct sctp_paramhdr)) {
sctp_process_inv_paramlength(asoc, param.p,
- chunk, err_chunk);
+ chunk, err_chunk);
retval = SCTP_IERROR_ABORT;
}
break;
--
2.14.3
^ permalink raw reply related
* [PATCH net-next 1/2] sctp: allow sctp_init_cause to return errors
From: Marcelo Ricardo Leitner @ 2018-04-29 15:56 UTC (permalink / raw)
To: netdev; +Cc: linux-sctp, Vlad Yasevich, Neil Horman, Xin Long
In-Reply-To: <cover.1525017179.git.marcelo.leitner@gmail.com>
And do so if the skb doesn't have enough space for the payload.
This is a preparation for the next patch.
Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
---
include/net/sctp/sm.h | 2 +-
net/sctp/sm_make_chunk.c | 12 +++++++++---
2 files changed, 10 insertions(+), 4 deletions(-)
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index f4b657478a304050851f33d92c71162a4a4a2e50..5ef1bad81ef54906b375dbfd0e4fd897d078abce 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -215,7 +215,7 @@ struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc,
struct sctp_chunk *sctp_make_shutdown_complete(
const struct sctp_association *asoc,
const struct sctp_chunk *chunk);
-void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause, size_t paylen);
+int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause, size_t paylen);
struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc,
const struct sctp_chunk *chunk,
const size_t hint);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index db93eabd6ef500ab20be6e091ee06bd3b60923c9..e518eb64ccf3578f7892da050c160a56cf3cc833 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -158,8 +158,8 @@ static const struct sctp_paramhdr prsctp_param = {
* provided chunk, as most cause codes will be embedded inside an
* abort chunk.
*/
-void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
- size_t paylen)
+int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
+ size_t paylen)
{
struct sctp_errhdr err;
__u16 len;
@@ -167,8 +167,14 @@ void sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code,
/* Cause code constants are now defined in network order. */
err.cause = cause_code;
len = sizeof(err) + paylen;
- err.length = htons(len);
+ err.length = htons(len);
+
+ if (skb_tailroom(chunk->skb) < len)
+ return -ENOSPC;
+
chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err);
+
+ return 0;
}
/* A helper to initialize an op error inside a
--
2.14.3
^ permalink raw reply related
* [PATCH net-next 0/2] sctp: unify sctp_make_op_error_fixed and sctp_make_op_error_space
From: Marcelo Ricardo Leitner @ 2018-04-29 15:56 UTC (permalink / raw)
To: netdev; +Cc: linux-sctp, Vlad Yasevich, Neil Horman, Xin Long
These two variants are very close to each other and can be merged
to avoid code duplication. That's what this patchset does.
First, we allow sctp_init_cause to return errors, which then allow us to
add sctp_make_op_error_limited that handles both situations.
Marcelo Ricardo Leitner (2):
sctp: allow sctp_init_cause to return errors
sctp: add sctp_make_op_error_limited and reuse inner functions
include/net/sctp/sm.h | 2 +-
net/sctp/sm_make_chunk.c | 134 ++++++++++++++++++-----------------------------
2 files changed, 52 insertions(+), 84 deletions(-)
^ permalink raw reply
* INFO: rcu detected stall in skb_free_head
From: syzbot @ 2018-04-29 16:33 UTC (permalink / raw)
To: avagin, davem, ktkhai, linux-kernel, netdev, syzkaller-bugs
Hello,
syzbot hit the following crash on upstream commit
a27fc14219f2e3c4a46ba9177b04d9b52c875532 (Mon Apr 16 21:07:39 2018 +0000)
Merge branch 'parisc-4.17-3' of
git://git.kernel.org/pub/scm/linux/kernel/git/deller/parisc-linux
syzbot dashboard link:
https://syzkaller.appspot.com/bug?extid=cac7c17ec0aca89d3c45
Unfortunately, I don't have any reproducer for this crash yet.
Raw console output:
https://syzkaller.appspot.com/x/log.txt?id=6517400396627968
Kernel config:
https://syzkaller.appspot.com/x/.config?id=-5914490758943236750
compiler: gcc (GCC) 8.0.1 20180413 (experimental)
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+cac7c17ec0aca89d3c45@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for
details.
If you forward the report, please keep this part and the footer.
INFO: rcu_sched self-detected stall on CPU
1-...!: (117917 ticks this GP) idle=036/1/4611686018427387906
softirq=114416/114416 fqs=32
(t=125000 jiffies g=60712 c=60711 q=345938)
rcu_sched kthread starved for 124847 jiffies! g60712 c60711 f0x2
RCU_GP_WAIT_FQS(3) ->state=0x0 ->cpu=0
RCU grace-period kthread stack dump:
rcu_sched R running task 23592 9 2 0x80000000
Call Trace:
context_switch kernel/sched/core.c:2848 [inline]
__schedule+0x801/0x1e30 kernel/sched/core.c:3490
schedule+0xef/0x430 kernel/sched/core.c:3549
schedule_timeout+0x138/0x240 kernel/time/timer.c:1801
rcu_gp_kthread+0x6b5/0x1940 kernel/rcu/tree.c:2231
kthread+0x345/0x410 kernel/kthread.c:238
ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412
NMI backtrace for cpu 1
CPU: 1 PID: 24 Comm: kworker/1:1 Not tainted 4.17.0-rc1+ #6
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Workqueue: events rht_deferred_worker
Call Trace:
<IRQ>
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1b9/0x294 lib/dump_stack.c:113
nmi_cpu_backtrace.cold.4+0x19/0xce lib/nmi_backtrace.c:103
nmi_trigger_cpumask_backtrace+0x151/0x192 lib/nmi_backtrace.c:62
arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
trigger_single_cpu_backtrace include/linux/nmi.h:156 [inline]
rcu_dump_cpu_stacks+0x175/0x1c2 kernel/rcu/tree.c:1376
print_cpu_stall kernel/rcu/tree.c:1525 [inline]
check_cpu_stall.isra.61.cold.80+0x36c/0x59a kernel/rcu/tree.c:1593
__rcu_pending kernel/rcu/tree.c:3356 [inline]
rcu_pending kernel/rcu/tree.c:3401 [inline]
rcu_check_callbacks+0x21b/0xad0 kernel/rcu/tree.c:2763
update_process_times+0x2d/0x70 kernel/time/timer.c:1636
tick_sched_handle+0x9f/0x180 kernel/time/tick-sched.c:173
tick_sched_timer+0x45/0x130 kernel/time/tick-sched.c:1283
__run_hrtimer kernel/time/hrtimer.c:1386 [inline]
__hrtimer_run_queues+0x3e3/0x10a0 kernel/time/hrtimer.c:1448
hrtimer_interrupt+0x286/0x650 kernel/time/hrtimer.c:1506
local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1025 [inline]
smp_apic_timer_interrupt+0x15d/0x710 arch/x86/kernel/apic/apic.c:1050
apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:863
RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783
[inline]
RIP: 0010:kfree+0x124/0x260 mm/slab.c:3814
RSP: 0018:ffff8801db105450 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13
RAX: 0000000000000007 RBX: ffff88006c118040 RCX: 1ffff1003b3059e7
RDX: 0000000000000000 RSI: ffff8801d982cf90 RDI: 0000000000000286
RBP: ffff8801db105470 R08: ffff8801d982ce78 R09: 0000000000000002
R10: ffff8801d982c640 R11: 0000000000000000 R12: 0000000000000286
R13: ffff8801dac00ac0 R14: ffffffff85bd7b69 R15: ffff88006c0f8180
skb_free_head+0x99/0xc0 net/core/skbuff.c:550
skb_release_data+0x690/0x860 net/core/skbuff.c:570
skb_release_all+0x4a/0x60 net/core/skbuff.c:627
__kfree_skb net/core/skbuff.c:641 [inline]
kfree_skb+0x195/0x560 net/core/skbuff.c:659
enqueue_to_backlog+0x2fc/0xc90 net/core/dev.c:3968
netif_rx_internal+0x14d/0xae0 net/core/dev.c:4181
netif_rx+0xba/0x400 net/core/dev.c:4206
loopback_xmit+0x283/0x741 drivers/net/loopback.c:91
__netdev_start_xmit include/linux/netdevice.h:4087 [inline]
netdev_start_xmit include/linux/netdevice.h:4096 [inline]
xmit_one net/core/dev.c:3053 [inline]
dev_hard_start_xmit+0x264/0xc10 net/core/dev.c:3069
__dev_queue_xmit+0x2724/0x34c0 net/core/dev.c:3584
dev_queue_xmit+0x17/0x20 net/core/dev.c:3617
neigh_hh_output include/net/neighbour.h:472 [inline]
neigh_output include/net/neighbour.h:480 [inline]
ip_finish_output2+0x1046/0x1840 net/ipv4/ip_output.c:229
ip_finish_output+0x828/0xf80 net/ipv4/ip_output.c:317
NF_HOOK_COND include/linux/netfilter.h:277 [inline]
ip_output+0x21b/0x850 net/ipv4/ip_output.c:405
dst_output include/net/dst.h:444 [inline]
ip_local_out+0xc5/0x1b0 net/ipv4/ip_output.c:124
ip_queue_xmit+0x9d7/0x1f70 net/ipv4/ip_output.c:504
sctp_v4_xmit+0x108/0x140 net/sctp/protocol.c:983
sctp_packet_transmit+0x26f6/0x3ba0 net/sctp/output.c:650
sctp_outq_flush+0x1373/0x4370 net/sctp/outqueue.c:1197
sctp_outq_uncork+0x6a/0x80 net/sctp/outqueue.c:776
sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
sctp_do_sm+0x596/0x7160 net/sctp/sm_sideeffect.c:1191
sctp_generate_heartbeat_event+0x218/0x450 net/sctp/sm_sideeffect.c:406
call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
expire_timers kernel/time/timer.c:1363 [inline]
__run_timers+0x79e/0xc50 kernel/time/timer.c:1666
run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
__do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
do_softirq_own_stack+0x2a/0x40 arch/x86/entry/entry_64.S:1046
</IRQ>
do_softirq.part.17+0x14d/0x190 kernel/softirq.c:329
do_softirq arch/x86/include/asm/preempt.h:23 [inline]
__local_bh_enable_ip+0x1ec/0x230 kernel/softirq.c:182
__raw_spin_unlock_bh include/linux/spinlock_api_smp.h:176 [inline]
_raw_spin_unlock_bh+0x30/0x40 kernel/locking/spinlock.c:200
spin_unlock_bh include/linux/spinlock.h:355 [inline]
rhashtable_rehash_chain lib/rhashtable.c:292 [inline]
rhashtable_rehash_table lib/rhashtable.c:333 [inline]
rht_deferred_worker+0x1058/0x1fb0 lib/rhashtable.c:432
process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145
worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279
kthread+0x345/0x410 kernel/kthread.c:238
ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412
BUG: workqueue lockup - pool cpus=0 node=0 flags=0x0 nice=0 stuck for 126s!
BUG: workqueue lockup - pool cpus=0-1 flags=0x4 nice=0 stuck for 126s!
Showing busy workqueues and worker pools:
workqueue events: flags=0x0
pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=8/256
in-flight: 24:rht_deferred_worker
pending: rht_deferred_worker, defense_work_handler,
defense_work_handler, perf_sched_delayed, defense_work_handler,
switchdev_deferred_process_work, cache_reap
pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=11/256
pending: jump_label_update_timeout, defense_work_handler,
defense_work_handler, defense_work_handler, defense_work_handler,
defense_work_handler, defense_work_handler, check_corruption,
key_garbage_collector, vmstat_shepherd, cache_reap
workqueue events_long: flags=0x0
pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=3/256
pending: br_fdb_cleanup, br_fdb_cleanup, br_fdb_cleanup
pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=5/256
pending: br_fdb_cleanup, br_fdb_cleanup, br_fdb_cleanup,
br_fdb_cleanup, br_fdb_cleanup
workqueue events_unbound: flags=0x2
pwq 4: cpus=0-1 flags=0x4 nice=0 active=1/512
in-flight: 10434:fsnotify_connector_destroy_workfn
workqueue events_power_efficient: flags=0x80
pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/256
pending: neigh_periodic_work
pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=4/256
pending: check_lifetime, gc_worker, do_cache_clean, neigh_periodic_work
workqueue rcu_gp: flags=0x8
pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/256
pending: process_srcu
workqueue mm_percpu_wq: flags=0x8
pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/256
pending: vmstat_update
pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256
pending: vmstat_update
workqueue writeback: flags=0x4e
pwq 4: cpus=0-1 flags=0x4 nice=0 active=1/256
in-flight: 13940:wb_workfn
workqueue kblockd: flags=0x18
pwq 1: cpus=0 node=0 flags=0x0 nice=-20 active=2/256
pending: blk_mq_timeout_work, blk_mq_timeout_work
workqueue dm_bufio_cache: flags=0x8
pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256
pending: work_fn
workqueue ipv6_addrconf: flags=0x40008
pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/1
pending: addrconf_verify_work
workqueue krdsd: flags=0xe000a
pwq 4: cpus=0-1 flags=0x4 nice=0 active=1/1
in-flight: 301:rds_connect_worker
pool 2: cpus=1 node=0 flags=0x0 nice=0 hung=127s workers=3 idle: 4831 4860
pool 4: cpus=0-1 flags=0x4 nice=0 hung=0s workers=8 idle: 301 22 6 7437
6882 50
---
This bug is generated by a dumb bot. It may contain errors.
See https://goo.gl/tpsmEJ for details.
Direct all questions to syzkaller@googlegroups.com.
syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is
merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug
report.
Note: all commands must start from beginning of the line in the email body.
^ permalink raw reply
* KMSAN: uninit-value in _decode_session4
From: syzbot @ 2018-04-29 16:32 UTC (permalink / raw)
To: davem, herbert, kuznet, linux-kernel, netdev, steffen.klassert,
syzkaller-bugs, yoshfuji
Hello,
syzbot hit the following crash on
https://github.com/google/kmsan.git/master commit
d2d741e5d1898dfde1a75ea3d29a9a3e2edf0617 (Sun Apr 22 15:05:22 2018 +0000)
kmsan: add initialization for shmem pages
syzbot dashboard link:
https://syzkaller.appspot.com/bug?extid=e7fec512bc2eb4ae0781
So far this crash happened 6 times on
https://github.com/google/kmsan.git/master.
C reproducer: https://syzkaller.appspot.com/x/repro.c?id=5844177157881856
syzkaller reproducer:
https://syzkaller.appspot.com/x/repro.syz?id=6093669123751936
Raw console output:
https://syzkaller.appspot.com/x/log.txt?id=4545366699540480
Kernel config: https://syzkaller.appspot.com/x/.config?id=328654897048964367
compiler: clang version 7.0.0 (trunk 329391)
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+e7fec512bc2eb4ae0781@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for
details.
If you forward the report, please keep this part and the footer.
IPv6: ADDRCONF(NETDEV_UP): veth1: link is not ready
IPv6: ADDRCONF(NETDEV_CHANGE): veth1: link becomes ready
IPv6: ADDRCONF(NETDEV_CHANGE): veth0: link becomes ready
8021q: adding VLAN 0 to HW filter on device team0
==================================================================
BUG: KMSAN: uninit-value in _decode_session4+0x11d3/0x1ce0
net/ipv4/xfrm4_policy.c:126
CPU: 0 PID: 4502 Comm: syz-executor427 Not tainted 4.16.0+ #87
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:17 [inline]
dump_stack+0x185/0x1d0 lib/dump_stack.c:53
kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
__msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
_decode_session4+0x11d3/0x1ce0 net/ipv4/xfrm4_policy.c:126
__xfrm_decode_session+0x151/0x200 net/xfrm/xfrm_policy.c:2368
xfrm_decode_session include/net/xfrm.h:1206 [inline]
vti6_tnl_xmit+0x49b/0x2070 net/ipv6/ip6_vti.c:546
__netdev_start_xmit include/linux/netdevice.h:4066 [inline]
netdev_start_xmit include/linux/netdevice.h:4075 [inline]
xmit_one net/core/dev.c:3026 [inline]
dev_hard_start_xmit+0x5f1/0xc70 net/core/dev.c:3042
__dev_queue_xmit+0x27ee/0x3520 net/core/dev.c:3557
dev_queue_xmit+0x4b/0x60 net/core/dev.c:3590
packet_snd net/packet/af_packet.c:2944 [inline]
packet_sendmsg+0x7c70/0x8a30 net/packet/af_packet.c:2969
sock_sendmsg_nosec net/socket.c:630 [inline]
sock_sendmsg net/socket.c:640 [inline]
SYSC_sendto+0x6c3/0x7e0 net/socket.c:1747
SyS_sendto+0x8a/0xb0 net/socket.c:1715
do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x4418f9
RSP: 002b:00007ffcea97afc8 EFLAGS: 00000216 ORIG_RAX: 000000000000002c
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004418f9
RDX: 0000000000000000 RSI: 00000000200001c0 RDI: 0000000000000003
RBP: 00000000006cd018 R08: 0000000020000000 R09: 000000000000001c
R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004025f0
R13: 0000000000402680 R14: 0000000000000000 R15: 0000000000000000
Uninit was created at:
kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:188
kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:314
kmsan_slab_alloc+0x11/0x20 mm/kmsan/kmsan.c:321
slab_post_alloc_hook mm/slab.h:445 [inline]
slab_alloc_node mm/slub.c:2737 [inline]
__kmalloc_node_track_caller+0xaed/0x11c0 mm/slub.c:4369
__kmalloc_reserve net/core/skbuff.c:138 [inline]
__alloc_skb+0x2cf/0x9f0 net/core/skbuff.c:206
alloc_skb include/linux/skbuff.h:984 [inline]
alloc_skb_with_frags+0x1d4/0xb20 net/core/skbuff.c:5234
sock_alloc_send_pskb+0xb56/0x1190 net/core/sock.c:2085
packet_alloc_skb net/packet/af_packet.c:2803 [inline]
packet_snd net/packet/af_packet.c:2894 [inline]
packet_sendmsg+0x6454/0x8a30 net/packet/af_packet.c:2969
sock_sendmsg_nosec net/socket.c:630 [inline]
sock_sendmsg net/socket.c:640 [inline]
SYSC_sendto+0x6c3/0x7e0 net/socket.c:1747
SyS_sendto+0x8a/0xb0 net/socket.c:1715
do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
==================================================================
---
This bug is generated by a dumb bot. It may contain errors.
See https://goo.gl/tpsmEJ for details.
Direct all questions to syzkaller@googlegroups.com.
syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is
merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
If you want to test a patch for this bug, please reply with:
#syz test: git://repo/address.git branch
and provide the patch inline or as an attachment.
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug
report.
Note: all commands must start from beginning of the line in the email body.
^ permalink raw reply
* WARNING: ODEBUG bug in __sk_destruct
From: syzbot @ 2018-04-29 16:31 UTC (permalink / raw)
To: davem, linux-kernel, linux-s390, netdev, syzkaller-bugs, ubraun
Hello,
syzbot hit the following crash on net-next commit
af201bab50a89aa6cf4df952b2c3bf55895c8eee (Fri Apr 27 15:12:10 2018 +0000)
udp: remove stray export symbol
syzbot dashboard link:
https://syzkaller.appspot.com/bug?extid=92209502e7aab127c75f
So far this crash happened 5 times on net-next.
Unfortunately, I don't have any reproducer for this crash yet.
Raw console output:
https://syzkaller.appspot.com/x/log.txt?id=6049832271609856
Kernel config:
https://syzkaller.appspot.com/x/.config?id=4410550353033654931
compiler: gcc (GCC) 8.0.1 20180413 (experimental)
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+92209502e7aab127c75f@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for
details.
If you forward the report, please keep this part and the footer.
------------[ cut here ]------------
ODEBUG: free active (active state 0) object type: work_struct hint:
smc_tcp_listen_work+0x0/0xec0 net/smc/af_smc.c:1014
WARNING: CPU: 0 PID: 9815 at lib/debugobjects.c:329
debug_print_object+0x16a/0x210 lib/debugobjects.c:326
Kernel panic - not syncing: panic_on_warn set ...
CPU: 0 PID: 9815 Comm: syz-executor7 Not tainted 4.17.0-rc2+ #23
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1b9/0x294 lib/dump_stack.c:113
panic+0x22f/0x4de kernel/panic.c:184
__warn.cold.8+0x163/0x1b3 kernel/panic.c:536
report_bug+0x252/0x2d0 lib/bug.c:186
fixup_bug arch/x86/kernel/traps.c:178 [inline]
do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296
do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
RIP: 0010:debug_print_object+0x16a/0x210 lib/debugobjects.c:326
RSP: 0018:ffff88019790ecf0 EFLAGS: 00010082
RAX: 0000000000000061 RBX: 0000000000000003 RCX: ffffffff818433e8
RDX: 0000000000000000 RSI: ffffffff8160f561 RDI: 0000000000000001
RBP: ffff88019790ed30 R08: ffff8801aced62c0 R09: ffffed003b5c3eb2
R10: ffffed003b5c3eb2 R11: ffff8801dae1f597 R12: 0000000000000001
R13: ffffffff88d5f700 R14: ffffffff87fa3340 R15: ffffffff814ccec0
__debug_check_no_obj_freed lib/debugobjects.c:783 [inline]
debug_check_no_obj_freed+0x3a6/0x584 lib/debugobjects.c:815
kmem_cache_free+0x216/0x2d0 mm/slab.c:3755
sk_prot_free net/core/sock.c:1512 [inline]
__sk_destruct+0x6fe/0xa40 net/core/sock.c:1596
sk_destruct+0x78/0x90 net/core/sock.c:1604
__sk_free+0x22e/0x340 net/core/sock.c:1615
sk_free+0x42/0x50 net/core/sock.c:1626
sock_put include/net/sock.h:1664 [inline]
smc_release+0x459/0x610 net/smc/af_smc.c:162
sock_release+0x96/0x1b0 net/socket.c:594
sock_close+0x16/0x20 net/socket.c:1149
__fput+0x34d/0x890 fs/file_table.c:209
____fput+0x15/0x20 fs/file_table.c:243
task_work_run+0x1e4/0x290 kernel/task_work.c:113
exit_task_work include/linux/task_work.h:22 [inline]
do_exit+0x1aee/0x2730 kernel/exit.c:865
do_group_exit+0x16f/0x430 kernel/exit.c:968
get_signal+0x886/0x1960 kernel/signal.c:2469
do_signal+0x98/0x2040 arch/x86/kernel/signal.c:810
exit_to_usermode_loop+0x28a/0x310 arch/x86/entry/common.c:162
prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
do_syscall_64+0x6ac/0x800 arch/x86/entry/common.c:290
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x455979
RSP: 002b:00007f6e1a1b9ce8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
RAX: fffffffffffffe00 RBX: 000000000072bec8 RCX: 0000000000455979
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000072bec8
RBP: 000000000072bec8 R08: 0000000000000000 R09: 000000000072bea0
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 0000000000a3e81f R14: 00007f6e1a1ba9c0 R15: 0000000000000000
======================================================
WARNING: possible circular locking dependency detected
4.17.0-rc2+ #23 Not tainted
------------------------------------------------------
syz-executor7/9815 is trying to acquire lock:
(ptrval) ((console_sem).lock){-.-.}, at: down_trylock+0x13/0x70
kernel/locking/semaphore.c:136
but task is already holding lock:
(ptrval) (&obj_hash[i].lock){-.-.}, at: __debug_check_no_obj_freed
lib/debugobjects.c:774 [inline]
(ptrval) (&obj_hash[i].lock){-.-.}, at:
debug_check_no_obj_freed+0x159/0x584 lib/debugobjects.c:815
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #3 (&obj_hash[i].lock){-.-.}:
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
_raw_spin_lock_irqsave+0x96/0xc0 kernel/locking/spinlock.c:152
__debug_object_init+0x11f/0x12c0 lib/debugobjects.c:381
debug_object_init+0x16/0x20 lib/debugobjects.c:429
debug_hrtimer_init kernel/time/hrtimer.c:400 [inline]
debug_init kernel/time/hrtimer.c:448 [inline]
hrtimer_init+0x8f/0x460 kernel/time/hrtimer.c:1296
init_dl_task_timer+0x1b/0x50 kernel/sched/deadline.c:1056
__sched_fork+0x2ae/0xc20 kernel/sched/core.c:2163
init_idle+0x75/0x7a0 kernel/sched/core.c:5406
sched_init+0xbeb/0xd10 kernel/sched/core.c:6104
start_kernel+0x475/0x92d init/main.c:601
x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:445
x86_64_start_kernel+0x76/0x79 arch/x86/kernel/head64.c:426
secondary_startup_64+0xa5/0xb0 arch/x86/kernel/head_64.S:242
-> #2 (&rq->lock){-.-.}:
__raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
_raw_spin_lock+0x2a/0x40 kernel/locking/spinlock.c:144
rq_lock kernel/sched/sched.h:1799 [inline]
task_fork_fair+0x8a/0x660 kernel/sched/fair.c:9967
sched_fork+0x43e/0xb30 kernel/sched/core.c:2379
copy_process.part.38+0x1c13/0x6e90 kernel/fork.c:1764
copy_process kernel/fork.c:1607 [inline]
_do_fork+0x291/0x12a0 kernel/fork.c:2088
kernel_thread+0x34/0x40 kernel/fork.c:2147
rest_init+0x22/0xe4 init/main.c:407
start_kernel+0x906/0x92d init/main.c:737
x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:445
x86_64_start_kernel+0x76/0x79 arch/x86/kernel/head64.c:426
secondary_startup_64+0xa5/0xb0 arch/x86/kernel/head_64.S:242
-> #1 (&p->pi_lock){-.-.}:
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
_raw_spin_lock_irqsave+0x96/0xc0 kernel/locking/spinlock.c:152
try_to_wake_up+0xca/0x1190 kernel/sched/core.c:1963
wake_up_process+0x10/0x20 kernel/sched/core.c:2126
__up.isra.1+0x1b8/0x290 kernel/locking/semaphore.c:262
up+0x12f/0x1b0 kernel/locking/semaphore.c:187
__up_console_sem+0xbe/0x1b0 kernel/printk/printk.c:242
console_unlock+0x7d6/0x1100 kernel/printk/printk.c:2417
vprintk_emit+0x6ad/0xdd0 kernel/printk/printk.c:1907
vprintk_default+0x28/0x30 kernel/printk/printk.c:1947
vprintk_func+0x7a/0xe7 kernel/printk/printk_safe.c:379
printk+0x9e/0xba kernel/printk/printk.c:1980
regdb_fw_cb.cold.35+0x18/0x89 net/wireless/reg.c:1074
request_firmware_work_func+0x154/0x2c0
drivers/base/firmware_loader/main.c:749
process_one_work+0xc1e/0x1b50 kernel/workqueue.c:2145
worker_thread+0x1cc/0x1440 kernel/workqueue.c:2279
kthread+0x345/0x410 kernel/kthread.c:238
ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:412
-> #0 ((console_sem).lock){-.-.}:
lock_acquire+0x1dc/0x520 kernel/locking/lockdep.c:3920
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
_raw_spin_lock_irqsave+0x96/0xc0 kernel/locking/spinlock.c:152
down_trylock+0x13/0x70 kernel/locking/semaphore.c:136
__down_trylock_console_sem+0xae/0x200 kernel/printk/printk.c:225
console_trylock+0x15/0xa0 kernel/printk/printk.c:2229
console_trylock_spinning kernel/printk/printk.c:1643 [inline]
vprintk_emit+0x694/0xdd0 kernel/printk/printk.c:1906
vprintk_default+0x28/0x30 kernel/printk/printk.c:1947
vprintk_func+0x7a/0xe7 kernel/printk/printk_safe.c:379
printk+0x9e/0xba kernel/printk/printk.c:1980
__warn_printk+0x83/0xd0 kernel/panic.c:590
debug_print_object+0x16a/0x210 lib/debugobjects.c:326
__debug_check_no_obj_freed lib/debugobjects.c:783 [inline]
debug_check_no_obj_freed+0x3a6/0x584 lib/debugobjects.c:815
kmem_cache_free+0x216/0x2d0 mm/slab.c:3755
sk_prot_free net/core/sock.c:1512 [inline]
__sk_destruct+0x6fe/0xa40 net/core/sock.c:1596
sk_destruct+0x78/0x90 net/core/sock.c:1604
__sk_free+0x22e/0x340 net/core/sock.c:1615
sk_free+0x42/0x50 net/core/sock.c:1626
sock_put include/net/sock.h:1664 [inline]
smc_release+0x459/0x610 net/smc/af_smc.c:162
sock_release+0x96/0x1b0 net/socket.c:594
sock_close+0x16/0x20 net/socket.c:1149
__fput+0x34d/0x890 fs/file_table.c:209
____fput+0x15/0x20 fs/file_table.c:243
task_work_run+0x1e4/0x290 kernel/task_work.c:113
exit_task_work include/linux/task_work.h:22 [inline]
do_exit+0x1aee/0x2730 kernel/exit.c:865
do_group_exit+0x16f/0x430 kernel/exit.c:968
get_signal+0x886/0x1960 kernel/signal.c:2469
do_signal+0x98/0x2040 arch/x86/kernel/signal.c:810
exit_to_usermode_loop+0x28a/0x310 arch/x86/entry/common.c:162
prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
do_syscall_64+0x6ac/0x800 arch/x86/entry/common.c:290
entry_SYSCALL_64_after_hwframe+0x49/0xbe
other info that might help us debug this:
Chain exists of:
(console_sem).lock --> &rq->lock --> &obj_hash[i].lock
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&obj_hash[i].lock);
lock(&rq->lock);
lock(&obj_hash[i].lock);
lock((console_sem).lock);
*** DEADLOCK ***
1 lock held by syz-executor7/9815:
#0: (ptrval) (&obj_hash[i].lock){-.-.}, at:
__debug_check_no_obj_freed lib/debugobjects.c:774 [inline]
#0: (ptrval) (&obj_hash[i].lock){-.-.}, at:
debug_check_no_obj_freed+0x159/0x584 lib/debugobjects.c:815
stack backtrace:
CPU: 0 PID: 9815 Comm: syz-executor7 Not tainted 4.17.0-rc2+ #23
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1b9/0x294 lib/dump_stack.c:113
print_circular_bug.isra.36.cold.54+0x1bd/0x27d
kernel/locking/lockdep.c:1223
check_prev_add kernel/locking/lockdep.c:1863 [inline]
check_prevs_add kernel/locking/lockdep.c:1976 [inline]
validate_chain kernel/locking/lockdep.c:2417 [inline]
__lock_acquire+0x343e/0x5140 kernel/locking/lockdep.c:3431
lock_acquire+0x1dc/0x520 kernel/locking/lockdep.c:3920
__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
_raw_spin_lock_irqsave+0x96/0xc0 kernel/locking/spinlock.c:152
down_trylock+0x13/0x70 kernel/locking/semaphore.c:136
__down_trylock_console_sem+0xae/0x200 kernel/printk/printk.c:225
console_trylock+0x15/0xa0 kernel/printk/printk.c:2229
console_trylock_spinning kernel/printk/printk.c:1643 [inline]
vprintk_emit+0x694/0xdd0 kernel/printk/printk.c:1906
vprintk_default+0x28/0x30 kernel/printk/printk.c:1947
vprintk_func+0x7a/0xe7 kernel/printk/printk_safe.c:379
printk+0x9e/0xba kernel/printk/printk.c:1980
__warn_printk+0x83/0xd0 kernel/panic.c:590
debug_print_object+0x16a/0x210 lib/debugobjects.c:326
__debug_check_no_obj_freed lib/debugobjects.c:783 [inline]
debug_check_no_obj_freed+0x3a6/0x584 lib/debugobjects.c:815
kmem_cache_free+0x216/0x2d0 mm/slab.c:3755
sk_prot_free net/core/sock.c:1512 [inline]
__sk_destruct+0x6fe/0xa40 net/core/sock.c:1596
sk_destruct+0x78/0x90 net/core/sock.c:1604
__sk_free+0x22e/0x340 net/core/sock.c:1615
sk_free+0x42/0x50 net/core/sock.c:1626
sock_put include/net/sock.h:1664 [inline]
smc_release+0x459/0x610 net/smc/af_smc.c:162
sock_release+0x96/0x1b0 net/socket.c:594
sock_close+0x16/0x20 net/socket.c:1149
__fput+0x34d/0x890 fs/file_table.c:209
____fput+0x15/0x20 fs/file_table.c:243
task_work_run+0x1e4/0x290 kernel/task_work.c:113
exit_task_work include/linux/task_work.h:22 [inline]
do_exit+0x1aee/0x2730 kernel/exit.c:865
do_group_exit+0x16f/0x430 kernel/exit.c:968
get_signal+0x886/0x1960 kernel/signal.c:2469
do_signal+0x98/0x2040 arch/x86/kernel/signal.c:810
? lock_dow
Lost 26 message(s)!
Dumping ftrace buffer:
(ftrace buffer empty)
Kernel Offset: disabled
Rebooting in 86400 seconds..
---
This bug is generated by a dumb bot. It may contain errors.
See https://goo.gl/tpsmEJ for details.
Direct all questions to syzkaller@googlegroups.com.
syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is
merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug
report.
Note: all commands must start from beginning of the line in the email body.
^ permalink raw reply
* Re: [PATCH] can: cc770: fix spelling mistake: "comptibility" -> "compatibility"
From: Marc Kleine-Budde @ 2018-04-29 16:26 UTC (permalink / raw)
To: Colin King, Wolfgang Grandegger, Andri Yngvason, linux-can,
netdev
Cc: kernel-janitors, linux-kernel
In-Reply-To: <20180428221600.20039-1-colin.king@canonical.com>
[-- Attachment #1.1: Type: text/plain, Size: 554 bytes --]
On 04/29/2018 12:16 AM, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
>
> Trivial fix to spelling mistake in module parameter description text
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>
Added to linux-can-next.
Thnx,
Marc
--
Pengutronix e.K. | Marc Kleine-Budde |
Industrial Linux Solutions | Phone: +49-231-2826-924 |
Vertretung West/Dortmund | Fax: +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686 | http://www.pengutronix.de |
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
* Re: [PATCH net-next] can: dev: use skb_put_zero to simplfy code
From: Marc Kleine-Budde @ 2018-04-29 16:24 UTC (permalink / raw)
To: YueHaibing, wg; +Cc: netdev, linux-can, davem, linux-kernel
In-Reply-To: <20180428074957.17084-1-yuehaibing@huawei.com>
[-- Attachment #1.1: Type: text/plain, Size: 496 bytes --]
On 04/28/2018 09:49 AM, YueHaibing wrote:
> use helper skb_put_zero to replace the pattern of skb_put() && memset()
>
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Applied to linux-can-next.
Tnx,
Marc
--
Pengutronix e.K. | Marc Kleine-Budde |
Industrial Linux Solutions | Phone: +49-231-2826-924 |
Vertretung West/Dortmund | Fax: +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686 | http://www.pengutronix.de |
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
* Re: [PATCH bpf-next v2] bpf: Allow bpf_current_task_under_cgroup in interrupt
From: Alexei Starovoitov @ 2018-04-29 16:20 UTC (permalink / raw)
To: Teng Qin; +Cc: netdev, ast, daniel, yhs, kernel-team
In-Reply-To: <20180429063929.206386-1-qinteng@fb.com>
On Sat, Apr 28, 2018 at 11:39:29PM -0700, Teng Qin wrote:
> Currently, the bpf_current_task_under_cgroup helper has a check where if
> the BPF program is running in_interrupt(), it will return -EINVAL. This
> prevents the helper to be used in many useful scenarios, particularly
> BPF programs attached to Perf Events.
>
> This commit removes the check. Tested a few NMI (Perf Event) and some
> softirq context, the helper returns the correct result.
>
> Signed-off-by: Teng Qin <qinteng@fb.com>
Applied, Thanks Teng.
^ permalink raw reply
* Re: [PATCH bpf-next v3 2/4] bpf: sockmap, add hash map support
From: Alexei Starovoitov @ 2018-04-29 16:16 UTC (permalink / raw)
To: John Fastabend; +Cc: daniel, ast, netdev
In-Reply-To: <1524973290-10016-3-git-send-email-john.fastabend@gmail.com>
On Sat, Apr 28, 2018 at 08:41:28PM -0700, John Fastabend wrote:
> Sockmap is currently backed by an array and enforces keys to be
> four bytes. This works well for many use cases and was originally
> modeled after devmap which also uses four bytes keys. However,
> this has become limiting in larger use cases where a hash would
> be more appropriate. For example users may want to use the 5-tuple
> of the socket as the lookup key.
>
> To support this add hash support.
>
> Signed-off-by: John Fastabend <john.fastabend@gmail.com>
...
> enum bpf_prog_type {
> @@ -1835,7 +1836,10 @@ struct bpf_stack_build_id {
> FN(msg_pull_data), \
> FN(bind), \
> FN(xdp_adjust_tail), \
> - FN(skb_get_xfrm_state),
> + FN(skb_get_xfrm_state), \
> + FN(sock_hash_update), \
> + FN(msg_redirect_hash), \
> + FN(sk_redirect_hash),
Documentation for new helpers is missing. Please add it in this commit.
Also running test_sockmap in the latest bpf-next I see:
[TEST 81]: (10, 1024, 256, sendpage, pass,apply,): [ 14.227128] WARNING: CPU: 1 PID: 202 at ../net/core/stream.c:206 sk_stream_kill_queues+0x3ca/0x540
[ 14.228209] Modules linked in:
[ 14.228583] CPU: 1 PID: 202 Comm: test_sockmap Not tainted 4.17.0-rc2-00438-gfcf85729d8ef #941
[ 14.229595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
[ 14.230649] RIP: 0010:sk_stream_kill_queues+0x3ca/0x540
[ 14.231251] RSP: 0018:ffff880111717d00 EFLAGS: 00010202
[ 14.231855] RAX: 0000000000000000 RBX: ffff8801127f8978 RCX: 1ffff100224ff138
[ 14.232664] RDX: 0000000000000000 RSI: 0000000000000f01 RDI: ffff8801127f89c0
[ 14.233521] RBP: ffff8801127f8948 R08: ffffed00224ff122 R09: ffffed00224ff121
[ 14.234342] R10: ffff8801127f890b R11: ffffed00224ff122 R12: ffff8801127f8998
[ 14.235169] R13: ffff8801127f88a8 R14: ffff8801127f8880 R15: ffff8801127f8930
[ 14.235987] FS: 00007f3d46898700(0000) GS:ffff88011b080000(0000) knlGS:0000000000000000
[ 14.236946] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 14.237607] CR2: 00007f3d45e3caf0 CR3: 0000000112da0001 CR4: 00000000000606e0
[ 14.238433] Call Trace:
[ 14.238741] ? tcp_v4_destroy_sock+0x273/0x4d0
[ 14.239259] inet_csk_destroy_sock+0x157/0x390
[ 14.239815] tcp_close+0x683/0xf60
[ 14.240214] inet_release+0xd6/0x1b0
[ 14.240636] sock_release+0x7e/0x190
[ 14.241090] sock_close+0xe/0x20
[ 14.241593] __fput+0x22f/0x720
[ 14.241981] task_work_run+0x10b/0x190
[ 14.242425] exit_to_usermode_loop+0xd2/0xf0
[ 14.242963] do_syscall_64+0x1d5/0x270
[ 14.243410] ? page_fault+0x8/0x30
[ 14.243830] entry_SYSCALL_64_after_hwframe+0x44/0xa9
and
[TEST 84]: (1, 1, 1, sendmsg, pass,cork,): [ 14.317100] WARNING: CPU: 1 PID: 66 at ../net/ipv4/af_inet.c:156 inet_sock_destruct+0x570/0x7b0
[ 14.318147] Modules linked in:
[ 14.318522] CPU: 1 PID: 66 Comm: kworker/1:1 Tainted: G W 4.17.0-rc2-00438-gfcf85729d8ef #941
[ 14.319674] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
[ 14.320764] Workqueue: events smap_gc_work
[ 14.321262] RIP: 0010:inet_sock_destruct+0x570/0x7b0
[ 14.321829] RSP: 0018:ffff880118617d08 EFLAGS: 00010206
[ 14.322415] RAX: 0000000000000ffc RBX: ffff8801127f9980 RCX: ffffffff82c7ac64
[ 14.323221] RDX: 1ffff100224ff34f RSI: 0000000000000004 RDI: ffff8801127f9a78
[ 14.324054] RBP: ffff8801127f9ac4 R08: ffffed00224ff359 R09: ffffed00224ff358
[ 14.324860] R10: ffff8801127f9ac7 R11: ffffed00224ff359 R12: ffff880111107688
[ 14.325661] R13: dffffc0000000000 R14: ffff880111107688 R15: dead000000000100
[ 14.326503] FS: 0000000000000000(0000) GS:ffff88011b080000(0000) knlGS:0000000000000000
[ 14.327413] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 14.328091] CR2: 00007f3d45e3caf0 CR3: 0000000003a0e004 CR4: 00000000000606e0
[ 14.328936] Call Trace:
[ 14.329225] __sk_destruct+0x45/0x460
[ 14.329646] smap_gc_work+0x69f/0x950
[ 14.330102] process_one_work+0x7be/0x1230
[ 14.330600] worker_thread+0xd9/0x1080
[ 14.331059] ? rescuer_thread+0xd70/0xd70
[ 14.331543] kthread+0x29e/0x390
[ 14.331945] ? kthread_create_worker_on_cpu+0xb0/0xb0
[ 14.332555] ret_from_fork+0x1f/0x30
Please send a fix for this issue first.
I'd like to see test_sockmap being clean before adding hash support to sockmap.
^ permalink raw reply
* WARNING: ODEBUG bug in del_timer
From: syzbot @ 2018-04-29 16:10 UTC (permalink / raw)
To: davem, linux-kernel, linux-s390, netdev, syzkaller-bugs, ubraun
Hello,
syzbot hit the following crash on net-next commit
af201bab50a89aa6cf4df952b2c3bf55895c8eee (Fri Apr 27 15:12:10 2018 +0000)
udp: remove stray export symbol
syzbot dashboard link:
https://syzkaller.appspot.com/bug?extid=03faa2dc16b8b64be396
So far this crash happened 26 times on net-next.
C reproducer: https://syzkaller.appspot.com/x/repro.c?id=5925539139289088
syzkaller reproducer:
https://syzkaller.appspot.com/x/repro.syz?id=4983245594689536
Raw console output:
https://syzkaller.appspot.com/x/log.txt?id=5476181675606016
Kernel config:
https://syzkaller.appspot.com/x/.config?id=4410550353033654931
compiler: gcc (GCC) 8.0.1 20180413 (experimental)
IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+03faa2dc16b8b64be396@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for
details.
If you forward the report, please keep this part and the footer.
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
random: sshd: uninitialized urandom read (32 bytes read)
------------[ cut here ]------------
ODEBUG: assert_init not available (active state 0) object type: timer_list
hint: (null)
WARNING: CPU: 1 PID: 4490 at lib/debugobjects.c:329
debug_print_object+0x16a/0x210 lib/debugobjects.c:326
Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 4490 Comm: syz-executor609 Not tainted 4.17.0-rc2+ #23
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x1b9/0x294 lib/dump_stack.c:113
panic+0x22f/0x4de kernel/panic.c:184
__warn.cold.8+0x163/0x1b3 kernel/panic.c:536
report_bug+0x252/0x2d0 lib/bug.c:186
fixup_bug arch/x86/kernel/traps.c:178 [inline]
do_error_trap+0x1de/0x490 arch/x86/kernel/traps.c:296
do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:315
invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:992
RIP: 0010:debug_print_object+0x16a/0x210 lib/debugobjects.c:326
RSP: 0018:ffff8801af1e7880 EFLAGS: 00010086
RAX: 0000000000000061 RBX: 0000000000000005 RCX: ffffffff818433e8
RDX: 0000000000000000 RSI: ffffffff8160f561 RDI: 0000000000000001
RBP: ffff8801af1e78c0 R08: ffff8801afa62100 R09: ffffed003b5e3eb2
R10: ffffed003b5e3eb2 R11: ffff8801daf1f597 R12: 0000000000000001
R13: ffffffff88d96cc0 R14: ffffffff87fa34e0 R15: ffffffff81666d30
debug_object_assert_init+0x309/0x500 lib/debugobjects.c:692
debug_timer_assert_init kernel/time/timer.c:724 [inline]
debug_assert_init kernel/time/timer.c:776 [inline]
del_timer+0x74/0x140 kernel/time/timer.c:1198
try_to_grab_pending+0x439/0x9a0 kernel/workqueue.c:1223
mod_delayed_work_on+0x91/0x250 kernel/workqueue.c:1592
mod_delayed_work include/linux/workqueue.h:541 [inline]
smc_setsockopt+0x33d/0x630 net/smc/af_smc.c:1353
__sys_setsockopt+0x1bd/0x390 net/socket.c:1903
__do_sys_setsockopt net/socket.c:1914 [inline]
__se_sys_setsockopt net/socket.c:1911 [inline]
__x64_sys_setsockopt+0xbe/0x150 net/socket.c:1911
do_syscall_64+0x1b1/0x800 arch/x86/entry/common.c:287
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x43fd09
RSP: 002b:00007ffe1f251c58 EFLAGS: 00000207 ORIG_RAX: 0000000000000036
RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fd09
RDX: 0000000000000001 RSI: 0000000000000006 RDI: 0000000000000003
RBP: 00000000006ca018 R08: 0000000000000004 R09: 00000000004002c8
R10: 0000000020000180 R11: 0000000000000207 R12: 0000000000401630
R13: 00000000004016c0 R14: 0000000000000000 R15: 0000000000000000
Dumping ftrace buffer:
(ftrace buffer empty)
Kernel Offset: disabled
Rebooting in 86400 seconds..
---
This bug is generated by a dumb bot. It may contain errors.
See https://goo.gl/tpsmEJ for details.
Direct all questions to syzkaller@googlegroups.com.
syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is
merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
If you want to test a patch for this bug, please reply with:
#syz test: git://repo/address.git branch
and provide the patch inline or as an attachment.
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug
report.
Note: all commands must start from beginning of the line in the email body.
^ permalink raw reply
* Re: [PATCH bpf-next 0/2] Fix BPF helpers documentation
From: Alexei Starovoitov @ 2018-04-29 16:02 UTC (permalink / raw)
To: Quentin Monnet
Cc: Andrey Ignatov, Alexei Starovoitov, Daniel Borkmann,
Linux Kernel Network Developers, kernel-team
In-Reply-To: <CAH3iqT5mbF_eyDi0B-kHu80k2SDpd5votX9=G4SK9V_pbM+FbA@mail.gmail.com>
On Sun, Apr 29, 2018 at 10:49:40AM +0100, Quentin Monnet wrote:
> On 29 April 2018 at 00:06, Andrey Ignatov <rdna@fb.com> wrote:
> > BPF helpers documentation in UAPI refers to kernel ctx structures when it
> > has to refer to user visible ones. Fix it.
> >
> > Andrey Ignatov (2):
> > bpf: Fix helpers ctx struct types in uapi doc
> > bpf: Sync bpf.h to tools/
> >
> > include/uapi/linux/bpf.h | 12 ++++++------
> > tools/include/uapi/linux/bpf.h | 12 ++++++------
> > 2 files changed, 12 insertions(+), 12 deletions(-)
>
> Correct. Thanks a lot for the fix, Andrey!
> Reviewed-by: Quentin Monnet <quentin.monnet@netronome.com>
Applied, Thanks Andrey.
^ permalink raw reply
* Re: [PATCH bpf-next v9 00/10] bpf: add bpf_get_stack helper
From: Alexei Starovoitov @ 2018-04-29 15:55 UTC (permalink / raw)
To: Yonghong Song; +Cc: ast, daniel, netdev, kernel-team
In-Reply-To: <20180429052816.2882032-1-yhs@fb.com>
On Sat, Apr 28, 2018 at 10:28:06PM -0700, Yonghong Song wrote:
> Currently, stackmap and bpf_get_stackid helper are provided
> for bpf program to get the stack trace. This approach has
> a limitation though. If two stack traces have the same hash,
> only one will get stored in the stackmap table regardless of
> whether BPF_F_REUSE_STACKID is specified or not,
> so some stack traces may be missing from user perspective.
>
> This patch implements a new helper, bpf_get_stack, will
> send stack traces directly to bpf program. The bpf program
> is able to see all stack traces, and then can do in-kernel
> processing or send stack traces to user space through
> shared map or bpf_perf_event_output.
>
> Patches #1 and #2 implemented the core kernel support.
> Patch #3 removes two never-hit branches in verifier.
> Patches #4 and #5 are two verifier improves to make
> bpf programming easier. Patch #6 synced the new helper
> to tools headers. Patch #7 moved perf_event polling code
> and ksym lookup code from samples/bpf to
> tools/testing/selftests/bpf. Patch #8 added a verifier
> test in tools/bpf for new verifier change.
> Patches #9 and #10 added tests for raw tracepoint prog
> and tracepoint prog respectively.
>
> Changelogs:
> v8 -> v9:
> . make function perf_event_mmap (in trace_helpers.c) extern
> to decouple perf_event_mmap and perf_event_poller.
> . add jit enabled handling for kernel stack verification
> in Patch #9. Since we did not have a good way to
> verify jit enabled kernel stack, just return true if
> the kernel stack is not empty.
> . In path #9, using raw_syscalls/sys_enter instead of
> sched/sched_switch, removed calling cmd
> "task 1 dd if=/dev/zero of=/dev/null" which is left
> with dangling process after the program exited.
>
> v7 -> v8:
> . rebase on top of latest bpf-next
> . simplify BPF_ARSH dst_reg->smin_val/smax_value tracking
> . rewrite the description of bpf_get_stack() in uapi bpf.h
> based on new format.
> v6 -> v7:
> . do perf callchain buffer allocation inside the
> verifier. so if the prog->has_callchain_buf is set,
> it is guaranteed that the buffer has been allocated.
> . change condition "trace_nr <= skip" to "trace_nr < skip"
> so that for zero size buffer, return 0 instead of -EFAULT
> v5 -> v6:
> . after refining return register smax_value and umax_value
> for helpers bpf_get_stack and bpf_probe_read_str,
> bounds and var_off of the return register are further refined.
> . added missing commit message for tools header sync commit.
> . removed one unnecessary empty line.
> v4 -> v5:
> . relied on dst_reg->var_off to refine umin_val/umax_val
> in verifier handling BPF_ARSH value range tracking,
> suggested by Edward.
> v3 -> v4:
> . fixed a bug when meta ptr is set to NULL in check_func_arg.
> . introduced tnum_arshift and added detailed comments for
> the underlying implementation
> . avoided using VLA in tools/bpf test_progs.
> v2 -> v3:
> . used meta to track helper memory size argument
> . implemented range checking for ARSH in verifier
> . moved perf event polling and ksym related functions
> from samples/bpf to tools/bpf
> . added test to compare build id's between bpf_get_stackid
> and bpf_get_stack
> v1 -> v2:
> . fixed compilation error when CONFIG_PERF_EVENTS is not enabled
Applied, Thanks Yonghong.
^ permalink raw reply
* Re: [PATCH net-next v9 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Jiri Pirko @ 2018-04-29 13:45 UTC (permalink / raw)
To: Siwei Liu
Cc: Sridhar Samudrala, Michael S. Tsirkin, Stephen Hemminger,
David Miller, Netdev, virtualization, virtio-dev,
Brandeburg, Jesse, Alexander Duyck, Jakub Kicinski, Jason Wang,
aaron.f.brown
In-Reply-To: <CADGSJ23A4w=MmnP7CneBwk=eouS07HfK4=1UWTk3Dz3gBX0yjg@mail.gmail.com>
Sun, Apr 29, 2018 at 10:56:30AM CEST, loseweigh@gmail.com wrote:
>On Sat, Apr 28, 2018 at 2:42 AM, Jiri Pirko <jiri@resnulli.us> wrote:
>> Fri, Apr 27, 2018 at 07:06:59PM CEST, sridhar.samudrala@intel.com wrote:
>>>This patch enables virtio_net to switch over to a VF datapath when a VF
>>>netdev is present with the same MAC address. It allows live migration
>>>of a VM with a direct attached VF without the need to setup a bond/team
>>>between a VF and virtio net device in the guest.
>>>
>>>The hypervisor needs to enable only one datapath at any time so that
>>>packets don't get looped back to the VM over the other datapath. When a VF
>>>is plugged, the virtio datapath link state can be marked as down. The
>>>hypervisor needs to unplug the VF device from the guest on the source host
>>>and reset the MAC filter of the VF to initiate failover of datapath to
>>>virtio before starting the migration. After the migration is completed,
>>>the destination hypervisor sets the MAC filter on the VF and plugs it back
>>>to the guest to switch over to VF datapath.
>>>
>>>It uses the generic failover framework that provides 2 functions to create
>>>and destroy a master failover netdev. When STANDBY feature is enabled, an
>>>additional netdev(failover netdev) is created that acts as a master device
>>>and tracks the state of the 2 lower netdevs. The original virtio_net netdev
>>>is marked as 'standby' netdev and a passthru device with the same MAC is
>>>registered as 'primary' netdev.
>>>
>>>This patch is based on the discussion initiated by Jesse on this thread.
>>>https://marc.info/?l=linux-virtualization&m=151189725224231&w=2
>>>
>>
>> When I enabled the standby feature (hardcoded), I have 2 netdevices now:
>> 4: ens3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
>> link/ether 52:54:00:b2:a7:f1 brd ff:ff:ff:ff:ff:ff
>> inet6 fe80::5054:ff:feb2:a7f1/64 scope link
>> valid_lft forever preferred_lft forever
>> 5: ens3n_sby: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000
>> link/ether 52:54:00:b2:a7:f1 brd ff:ff:ff:ff:ff:ff
>> inet6 fe80::5054:ff:feb2:a7f1/64 scope link
>> valid_lft forever preferred_lft forever
>>
>> However, it seems to confuse my initscripts on Fedora:
>> [root@test1 ~]# ifup ens3
>> ./network-functions: line 78: [: /etc/dhcp/dhclient-ens3: binary operator expected
>> ./network-functions: line 80: [: /etc/dhclient-ens3: binary operator expected
>> ./network-functions: line 69: [: /var/lib/dhclient/dhclient-ens3: binary operator expected
>>
>You should teach Fedora and all cloud vendors to upgrade their
>initscripts and other userspace tools, no?
I just wanted to point out that the conversion from "nostandby" to
"standby" isn't always that smooth as claimed. The claim was "no change
for the current user" iirc.
^ permalink raw reply
* [PATCH v5] bpf, x86_32: add eBPF JIT compiler for ia32
From: Wang YanQing @ 2018-04-29 12:37 UTC (permalink / raw)
To: daniel
Cc: ast, illusionist.neo, tglx, mingo, hpa, davem, x86, netdev,
linux-kernel
The JIT compiler emits ia32 bit instructions. Currently, It supports eBPF
only. Classic BPF is supported because of the conversion by BPF core.
Almost all instructions from eBPF ISA supported except the following:
BPF_ALU64 | BPF_DIV | BPF_K
BPF_ALU64 | BPF_DIV | BPF_X
BPF_ALU64 | BPF_MOD | BPF_K
BPF_ALU64 | BPF_MOD | BPF_X
BPF_STX | BPF_XADD | BPF_W
BPF_STX | BPF_XADD | BPF_DW
It doesn't support BPF_JMP|BPF_CALL with BPF_PSEUDO_CALL at the moment.
IA32 has few general purpose registers, EAX|EDX|ECX|EBX|ESI|EDI. I use
EAX|EDX|ECX|EBX as temporary registers to simulate instructions in eBPF
ISA, and allocate ESI|EDI to BPF_REG_AX for constant blinding, all others
eBPF registers, R0-R10, are simulated through scratch space on stack.
The reasons behind the hardware registers allocation policy are:
1:MUL need EAX:EDX, shift operation need ECX, so they aren't fit
for general eBPF 64bit register simulation.
2:We need at least 4 registers to simulate most eBPF ISA operations
on registers operands instead of on register&memory operands.
3:We need to put BPF_REG_AX on hardware registers, or constant blinding
will degrade jit performance heavily.
Tested on PC (Intel(R) Core(TM) i5-5200U CPU).
Testing results on i5-5200U:
1) test_bpf: Summary: 349 PASSED, 0 FAILED, [319/341 JIT'ed]
2) test_progs: Summary: 83 PASSED, 0 FAILED.
3) test_lpm: OK
4) test_lru_map: OK
5) test_verifier: Summary: 828 PASSED, 0 FAILED.
Above tests are all done in following two conditions separately:
1:bpf_jit_enable=1 and bpf_jit_harden=0
2:bpf_jit_enable=1 and bpf_jit_harden=2
Below are some numbers for this jit implementation:
Note:
I run test_progs in kselftest 100 times continuously for every condition,
the numbers are in format: total/times=avg.
The numbers that test_bpf reports show almost the same relation.
a:jit_enable=0 and jit_harden=0 b:jit_enable=1 and jit_harden=0
test_pkt_access:PASS:ipv4:15622/100=156 test_pkt_access:PASS:ipv4:10674/100=106
test_pkt_access:PASS:ipv6:9130/100=91 test_pkt_access:PASS:ipv6:4855/100=48
test_xdp:PASS:ipv4:240198/100=2401 test_xdp:PASS:ipv4:138912/100=1389
test_xdp:PASS:ipv6:137326/100=1373 test_xdp:PASS:ipv6:68542/100=685
test_l4lb:PASS:ipv4:61100/100=611 test_l4lb:PASS:ipv4:37302/100=373
test_l4lb:PASS:ipv6:101000/100=1010 test_l4lb:PASS:ipv6:55030/100=550
c:jit_enable=1 and jit_harden=2
test_pkt_access:PASS:ipv4:10558/100=105
test_pkt_access:PASS:ipv6:5092/100=50
test_xdp:PASS:ipv4:131902/100=1319
test_xdp:PASS:ipv6:77932/100=779
test_l4lb:PASS:ipv4:38924/100=389
test_l4lb:PASS:ipv6:57520/100=575
The numbers show we get 30%~50% improvement.
See Documentation/networking/filter.txt for more information.
Signed-off-by: Wang YanQing <udknight@gmail.com>
---
Changes v4-v5:
1:Delete is_on_stack, BPF_REG_AX is the only one
on real hardware registers, so just check with
it.
2:Apply commit 1612a981b766 ("bpf, x64: fix JIT emission
for dead code"), suggested by Daniel Borkmann.
Changes v3-v4:
1:Fix changelog in commit.
I install llvm-6.0, then test_progs willn't report errors.
I submit another patch:
"bpf: fix misaligned access for BPF_PROG_TYPE_PERF_EVENT program type on x86_32 platform"
to fix another problem, after that patch, test_verifier willn't report errors too.
2:Fix clear r0[1] twice unnecessarily in *BPF_IND|BPF_ABS* simulation.
Changes v2-v3:
1:Move BPF_REG_AX to real hardware registers for performance reason.
3:Using bpf_load_pointer instead of bpf_jit32.S, suggested by Daniel Borkmann.
4:Delete partial codes in 1c2a088a6626, suggested by Daniel Borkmann.
5:Some bug fixes and comments improvement.
Changes v1-v2:
1:Fix bug in emit_ia32_neg64.
2:Fix bug in emit_ia32_arsh_r64.
3:Delete filename in top level comment, suggested by Thomas Gleixner.
4:Delete unnecessary boiler plate text, suggested by Thomas Gleixner.
5:Rewrite some words in changelog.
6:CodingSytle improvement and a little more comments.
arch/x86/Kconfig | 2 +-
arch/x86/include/asm/nospec-branch.h | 26 +-
arch/x86/net/Makefile | 9 +-
arch/x86/net/bpf_jit_comp32.c | 2527 ++++++++++++++++++++++++++++++++++
4 files changed, 2559 insertions(+), 5 deletions(-)
create mode 100644 arch/x86/net/bpf_jit_comp32.c
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 00fcf81..1f5fa2f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -137,7 +137,7 @@ config X86
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
- select HAVE_EBPF_JIT if X86_64
+ select HAVE_EBPF_JIT
select HAVE_EFFICIENT_UNALIGNED_ACCESS
select HAVE_EXIT_THREAD
select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE
diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index f928ad9..a4c7ca4 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -291,14 +291,17 @@ static inline void indirect_branch_prediction_barrier(void)
* lfence
* jmp spec_trap
* do_rop:
- * mov %rax,(%rsp)
+ * mov %rax,(%rsp) for x86_64
+ * mov %edx,(%esp) for x86_32
* retq
*
* Without retpolines configured:
*
- * jmp *%rax
+ * jmp *%rax for x86_64
+ * jmp *%edx for x86_32
*/
#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_X86_64
# define RETPOLINE_RAX_BPF_JIT_SIZE 17
# define RETPOLINE_RAX_BPF_JIT() \
EMIT1_off32(0xE8, 7); /* callq do_rop */ \
@@ -310,9 +313,28 @@ static inline void indirect_branch_prediction_barrier(void)
EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \
EMIT1(0xC3); /* retq */
#else
+# define RETPOLINE_EDX_BPF_JIT() \
+do { \
+ EMIT1_off32(0xE8, 7); /* call do_rop */ \
+ /* spec_trap: */ \
+ EMIT2(0xF3, 0x90); /* pause */ \
+ EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \
+ EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \
+ /* do_rop: */ \
+ EMIT3(0x89, 0x14, 0x24); /* mov %edx,(%esp) */ \
+ EMIT1(0xC3); /* ret */ \
+} while (0)
+#endif
+#else /* !CONFIG_RETPOLINE */
+
+#ifdef CONFIG_X86_64
# define RETPOLINE_RAX_BPF_JIT_SIZE 2
# define RETPOLINE_RAX_BPF_JIT() \
EMIT2(0xFF, 0xE0); /* jmp *%rax */
+#else
+# define RETPOLINE_EDX_BPF_JIT() \
+ EMIT2(0xFF, 0xE2) /* jmp *%edx */
+#endif
#endif
#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index fefb4b6..f54c9d4 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -1,6 +1,11 @@
#
# Arch-specific network modules
#
-OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
-obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+
+ifeq ($(CONFIG_X86_32),y)
+ obj-$(CONFIG_BPF_JIT) += bpf_jit_comp32.o
+else
+ OBJECT_FILES_NON_STANDARD_bpf_jit.o += y
+ obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+endif
diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c
new file mode 100644
index 0000000..fb7292f
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp32.c
@@ -0,0 +1,2527 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Just-In-Time compiler for eBPF filters on IA32 (32bit x86)
+ *
+ * Author: Wang YanQing (udknight@gmail.com)
+ * The code based on code and ideas from:
+ * Eric Dumazet (eric.dumazet@gmail.com)
+ * and from:
+ * Shubham Bansal <illusionist.neo@gmail.com>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/filter.h>
+#include <linux/if_vlan.h>
+#include <asm/cacheflush.h>
+#include <asm/set_memory.h>
+#include <asm/nospec-branch.h>
+#include <linux/bpf.h>
+
+/*
+ * eBPF prog stack layout:
+ *
+ * high
+ * original ESP => +-----+
+ * | | callee saved registers
+ * +-----+
+ * | ... | eBPF JIT scratch space
+ * BPF_FP,IA32_EBP => +-----+
+ * | ... | eBPF prog stack
+ * +-----+
+ * |RSVD | JIT scratchpad
+ * current ESP => +-----+
+ * | |
+ * | ... | Function call stack
+ * | |
+ * +-----+
+ * low
+ *
+ * The callee saved registers:
+ *
+ * high
+ * original ESP => +------------------+ \
+ * | ebp | |
+ * current EBP => +------------------+ } callee saved registers
+ * | ebx,esi,edi | |
+ * +------------------+ /
+ * low
+ */
+
+static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+ if (len == 1)
+ *ptr = bytes;
+ else if (len == 2)
+ *(u16 *)ptr = bytes;
+ else {
+ *(u32 *)ptr = bytes;
+ barrier();
+ }
+ return ptr + len;
+}
+
+#define EMIT(bytes, len) \
+ do { prog = emit_code(prog, bytes, len); cnt += len; } while (0)
+
+#define EMIT1(b1) EMIT(b1, 1)
+#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4) \
+ EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
+
+#define EMIT1_off32(b1, off) \
+ do {EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+ do {EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+ do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+ do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+
+#define jmp_label(label, jmp_insn_len) (label - cnt - jmp_insn_len)
+
+static bool is_imm8(int value)
+{
+ return value <= 127 && value >= -128;
+}
+
+static bool is_simm32(s64 value)
+{
+ return value == (s64) (s32) value;
+}
+
+#define STACK_OFFSET(k) (k)
+#define TCALL_CNT (MAX_BPF_JIT_REG + 0) /* Tail Call Count */
+
+#define IA32_EAX (0x0)
+#define IA32_EBX (0x3)
+#define IA32_ECX (0x1)
+#define IA32_EDX (0x2)
+#define IA32_ESI (0x6)
+#define IA32_EDI (0x7)
+#define IA32_EBP (0x5)
+#define IA32_ESP (0x4)
+
+/* list of x86 cond jumps opcodes (. + s8)
+ * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
+ */
+#define IA32_JB 0x72
+#define IA32_JAE 0x73
+#define IA32_JE 0x74
+#define IA32_JNE 0x75
+#define IA32_JBE 0x76
+#define IA32_JA 0x77
+#define IA32_JL 0x7C
+#define IA32_JGE 0x7D
+#define IA32_JLE 0x7E
+#define IA32_JG 0x7F
+
+/*
+ * Map eBPF registers to x86_32 32bit registers or stack scratch space.
+ *
+ * 1. All the registers, R0-R10, are mapped to scratch space on stack.
+ * 2. We need two 64 bit temp registers to do complex operations on eBPF
+ * registers.
+ * 3. For performance reason, the BPF_REG_AX for blinding constant, is
+ * mapped to real hardware register pair, IA32_ESI and IA32_EDI.
+ *
+ * As the eBPF registers are all 64 bit registers and x86_32 has only 32 bit
+ * registers, we have to map each eBPF registers with two x86_32 32 bit regs
+ * or scratch memory space and we have to build eBPF 64 bit register from those.
+ *
+ * We use IA32_EAX, IA32_EDX, IA32_ECX, IA32_EBX as temporary registers.
+ */
+static const u8 bpf2ia32[][2] = {
+ /* return value from in-kernel function, and exit value from eBPF */
+ [BPF_REG_0] = {STACK_OFFSET(0), STACK_OFFSET(4)},
+
+ /* arguments from eBPF program to in-kernel function */
+ /* Stored on stack scratch space */
+ [BPF_REG_1] = {STACK_OFFSET(8), STACK_OFFSET(12)},
+ [BPF_REG_2] = {STACK_OFFSET(16), STACK_OFFSET(20)},
+ [BPF_REG_3] = {STACK_OFFSET(24), STACK_OFFSET(28)},
+ [BPF_REG_4] = {STACK_OFFSET(32), STACK_OFFSET(36)},
+ [BPF_REG_5] = {STACK_OFFSET(40), STACK_OFFSET(44)},
+
+ /* callee saved registers that in-kernel function will preserve */
+ /* Stored on stack scratch space */
+ [BPF_REG_6] = {STACK_OFFSET(48), STACK_OFFSET(52)},
+ [BPF_REG_7] = {STACK_OFFSET(56), STACK_OFFSET(60)},
+ [BPF_REG_8] = {STACK_OFFSET(64), STACK_OFFSET(68)},
+ [BPF_REG_9] = {STACK_OFFSET(72), STACK_OFFSET(76)},
+
+ /* Read only Frame Pointer to access Stack */
+ [BPF_REG_FP] = {STACK_OFFSET(80), STACK_OFFSET(84)},
+
+ /* temporary register for blinding constants. */
+ [BPF_REG_AX] = {IA32_ESI, IA32_EDI},
+
+ /* Tail call count. Stored on stack scratch space. */
+ [TCALL_CNT] = {STACK_OFFSET(88), STACK_OFFSET(92)},
+};
+
+#define dst_lo dst[0]
+#define dst_hi dst[1]
+#define src_lo src[0]
+#define src_hi src[1]
+
+#define STACK_ALIGNMENT 8
+/* Stack space for BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4,
+ * BPF_REG_5, BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9,
+ * BPF_REG_FP, BPF_REG_AX and Tail call counts.
+ */
+#define SCRATCH_SIZE 96
+
+/* total stack size used in JITed code */
+#define _STACK_SIZE \
+ (stack_depth + \
+ + SCRATCH_SIZE + \
+ + 4 /* extra for skb_copy_bits buffer */)
+
+#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
+
+/* Get the offset of eBPF REGISTERs stored on scratch space. */
+#define STACK_VAR(off) (off)
+
+/* Offset of skb_copy_bits buffer */
+#define SKB_BUFFER STACK_VAR(SCRATCH_SIZE)
+
+/* encode 'dst_reg' register into x86_32 opcode 'byte' */
+static u8 add_1reg(u8 byte, u32 dst_reg)
+{
+ return byte + dst_reg;
+}
+
+/* encode 'dst_reg' and 'src_reg' registers into x86_32 opcode 'byte' */
+static u8 add_2reg(u8 byte, u32 dst_reg, u32 src_reg)
+{
+ return byte + dst_reg + (src_reg << 3);
+}
+
+static void jit_fill_hole(void *area, unsigned int size)
+{
+ /* fill whole space with int3 instructions */
+ memset(area, 0xcc, size);
+}
+
+static inline void emit_ia32_mov_i(const u8 dst, const u32 val, bool dstk,
+ u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (dstk) {
+ if (val == 0) {
+ /* xor eax,eax */
+ EMIT2(0x33, add_2reg(0xC0, IA32_EAX, IA32_EAX));
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst));
+ } else {
+ EMIT3_off32(0xC7, add_1reg(0x40, IA32_EBP),
+ STACK_VAR(dst), val);
+ }
+ } else {
+ if (val == 0)
+ EMIT2(0x33, add_2reg(0xC0, dst, dst));
+ else
+ EMIT2_off32(0xC7, add_1reg(0xC0, dst),
+ val);
+ }
+ *pprog = prog;
+}
+
+/* dst = imm (4 bytes)*/
+static inline void emit_ia32_mov_r(const u8 dst, const u8 src, bool dstk,
+ bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 sreg = sstk ? IA32_EAX : src;
+
+ if (sstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
+ if (dstk)
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, sreg), STACK_VAR(dst));
+ else
+ /* mov dst,sreg */
+ EMIT2(0x89, add_2reg(0xC0, dst, sreg));
+
+ *pprog = prog;
+}
+
+/* dst = src */
+static inline void emit_ia32_mov_r64(const bool is64, const u8 dst[],
+ const u8 src[], bool dstk,
+ bool sstk, u8 **pprog)
+{
+ emit_ia32_mov_r(dst_lo, src_lo, dstk, sstk, pprog);
+ if (is64)
+ /* complete 8 byte move */
+ emit_ia32_mov_r(dst_hi, src_hi, dstk, sstk, pprog);
+ else
+ /* zero out high 4 bytes */
+ emit_ia32_mov_i(dst_hi, 0, dstk, pprog);
+}
+
+/* Sign extended move */
+static inline void emit_ia32_mov_i64(const bool is64, const u8 dst[],
+ const u32 val, bool dstk, u8 **pprog)
+{
+ u32 hi = 0;
+
+ if (is64 && (val & (1<<31)))
+ hi = (u32)~0;
+ emit_ia32_mov_i(dst_lo, val, dstk, pprog);
+ emit_ia32_mov_i(dst_hi, hi, dstk, pprog);
+}
+
+/* ALU operation (32 bit)
+ * dst = dst * src
+ */
+static inline void emit_ia32_mul_r(const u8 dst, const u8 src, bool dstk,
+ bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 sreg = sstk ? IA32_ECX : src;
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+ else
+ /* mov eax,dst */
+ EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
+
+
+ EMIT2(0xF7, add_1reg(0xE0, sreg));
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst));
+ else
+ /* mov dst,eax */
+ EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
+
+ *pprog = prog;
+}
+
+static inline void emit_ia32_to_le_r64(const u8 dst[], s32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk && val != 64) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+ switch (val) {
+ case 16:
+ /* emit 'movzwl eax,ax' to zero extend 16-bit
+ * into 64 bit
+ */
+ EMIT2(0x0F, 0xB7);
+ EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ break;
+ case 32:
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ break;
+ case 64:
+ /* nop */
+ break;
+ }
+
+ if (dstk && val != 64) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+static inline void emit_ia32_to_be_r64(const u8 dst[], s32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+ switch (val) {
+ case 16:
+ /* emit 'ror %ax, 8' to swap lower 2 bytes */
+ EMIT1(0x66);
+ EMIT3(0xC1, add_1reg(0xC8, dreg_lo), 8);
+
+ EMIT2(0x0F, 0xB7);
+ EMIT1(add_2reg(0xC0, dreg_lo, dreg_lo));
+
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ break;
+ case 32:
+ /* emit 'bswap eax' to swap lower 4 bytes */
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, dreg_lo));
+
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ break;
+ case 64:
+ /* emit 'bswap eax' to swap lower 4 bytes */
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, dreg_lo));
+
+ /* emit 'bswap edx' to swap lower 4 bytes */
+ EMIT1(0x0F);
+ EMIT1(add_1reg(0xC8, dreg_hi));
+
+ /* mov ecx,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, IA32_ECX, dreg_hi));
+ /* mov dreg_hi,dreg_lo */
+ EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+ /* mov dreg_lo,ecx */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX));
+
+ break;
+ }
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+/* ALU operation (32 bit)
+ * dst = dst (div|mod) src
+ */
+static inline void emit_ia32_div_mod_r(const u8 op, const u8 dst, const u8 src,
+ bool dstk, bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src));
+ else if (src != IA32_ECX)
+ /* mov ecx,src */
+ EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst));
+ else
+ /* mov eax,dst */
+ EMIT2(0x8B, add_2reg(0xC0, dst, IA32_EAX));
+
+ /* xor edx,edx */
+ EMIT2(0x31, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+ /* div ecx */
+ EMIT2(0xF7, add_1reg(0xF0, IA32_ECX));
+
+ if (op == BPF_MOD) {
+ if (dstk)
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst));
+ else
+ EMIT2(0x89, add_2reg(0xC0, dst, IA32_EDX));
+ } else {
+ if (dstk)
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst));
+ else
+ EMIT2(0x89, add_2reg(0xC0, dst, IA32_EAX));
+ }
+ *pprog = prog;
+}
+
+/* ALU operation (32 bit)
+ * dst = dst (shift) src
+ */
+static inline void emit_ia32_shift_r(const u8 op, const u8 dst, const u8 src,
+ bool dstk, bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg = dstk ? IA32_EAX : dst;
+ u8 b2;
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src));
+ else if (src != IA32_ECX)
+ /* mov ecx,src */
+ EMIT2(0x8B, add_2reg(0xC0, src, IA32_ECX));
+
+ switch (op) {
+ case BPF_LSH:
+ b2 = 0xE0; break;
+ case BPF_RSH:
+ b2 = 0xE8; break;
+ case BPF_ARSH:
+ b2 = 0xF8; break;
+ default:
+ return;
+ }
+ EMIT2(0xD3, add_1reg(b2, dreg));
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],dreg */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg), STACK_VAR(dst));
+ *pprog = prog;
+}
+
+/* ALU operation (32 bit)
+ * dst = dst (op) src
+ */
+static inline void emit_ia32_alu_r(const bool is64, const bool hi, const u8 op,
+ const u8 dst, const u8 src, bool dstk,
+ bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 sreg = sstk ? IA32_EAX : src;
+ u8 dreg = dstk ? IA32_EDX : dst;
+
+ if (sstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(src));
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(dst));
+
+ switch (BPF_OP(op)) {
+ /* dst = dst + src */
+ case BPF_ADD:
+ if (hi && is64)
+ EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
+ else
+ EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst - src */
+ case BPF_SUB:
+ if (hi && is64)
+ EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
+ else
+ EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst | src */
+ case BPF_OR:
+ EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst & src */
+ case BPF_AND:
+ EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst ^ src */
+ case BPF_XOR:
+ EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
+ break;
+ }
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],dreg */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
+ STACK_VAR(dst));
+ *pprog = prog;
+}
+
+/* ALU operation (64 bit) */
+static inline void emit_ia32_alu_r64(const bool is64, const u8 op,
+ const u8 dst[], const u8 src[],
+ bool dstk, bool sstk,
+ u8 **pprog)
+{
+ u8 *prog = *pprog;
+
+ emit_ia32_alu_r(is64, false, op, dst_lo, src_lo, dstk, sstk, &prog);
+ if (is64)
+ emit_ia32_alu_r(is64, true, op, dst_hi, src_hi, dstk, sstk,
+ &prog);
+ else
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ *pprog = prog;
+}
+
+/* ALU operation (32 bit)
+ * dst = dst (op) val
+ */
+static inline void emit_ia32_alu_i(const bool is64, const bool hi, const u8 op,
+ const u8 dst, const s32 val, bool dstk,
+ u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg = dstk ? IA32_EAX : dst;
+ u8 sreg = IA32_EDX;
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(dst));
+
+ if (!is_imm8(val))
+ /* mov edx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EDX), val);
+
+ switch (op) {
+ /* dst = dst + val */
+ case BPF_ADD:
+ if (hi && is64) {
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xD0, dreg), val);
+ else
+ EMIT2(0x11, add_2reg(0xC0, dreg, sreg));
+ } else {
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xC0, dreg), val);
+ else
+ EMIT2(0x01, add_2reg(0xC0, dreg, sreg));
+ }
+ break;
+ /* dst = dst - val */
+ case BPF_SUB:
+ if (hi && is64) {
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xD8, dreg), val);
+ else
+ EMIT2(0x19, add_2reg(0xC0, dreg, sreg));
+ } else {
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xE8, dreg), val);
+ else
+ EMIT2(0x29, add_2reg(0xC0, dreg, sreg));
+ }
+ break;
+ /* dst = dst | val */
+ case BPF_OR:
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xC8, dreg), val);
+ else
+ EMIT2(0x09, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst & val */
+ case BPF_AND:
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xE0, dreg), val);
+ else
+ EMIT2(0x21, add_2reg(0xC0, dreg, sreg));
+ break;
+ /* dst = dst ^ val */
+ case BPF_XOR:
+ if (is_imm8(val))
+ EMIT3(0x83, add_1reg(0xF0, dreg), val);
+ else
+ EMIT2(0x31, add_2reg(0xC0, dreg, sreg));
+ break;
+ case BPF_NEG:
+ EMIT2(0xF7, add_1reg(0xD8, dreg));
+ break;
+ }
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],dreg */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg),
+ STACK_VAR(dst));
+ *pprog = prog;
+}
+
+/* ALU operation (64 bit) */
+static inline void emit_ia32_alu_i64(const bool is64, const u8 op,
+ const u8 dst[], const u32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ u32 hi = 0;
+
+ if (is64 && (val & (1<<31)))
+ hi = (u32)~0;
+
+ emit_ia32_alu_i(is64, false, op, dst_lo, val, dstk, &prog);
+ if (is64)
+ emit_ia32_alu_i(is64, true, op, dst_hi, hi, dstk, &prog);
+ else
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+
+ *pprog = prog;
+}
+
+/* dst = ~dst (64 bit) */
+static inline void emit_ia32_neg64(const u8 dst[], bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ /* xor ecx,ecx */
+ EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+ /* sub dreg_lo,ecx */
+ EMIT2(0x2B, add_2reg(0xC0, dreg_lo, IA32_ECX));
+ /* mov dreg_lo,ecx */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, IA32_ECX));
+
+ /* xor ecx,ecx */
+ EMIT2(0x31, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+ /* sbb dreg_hi,ecx */
+ EMIT2(0x19, add_2reg(0xC0, dreg_hi, IA32_ECX));
+ /* mov dreg_hi,ecx */
+ EMIT2(0x89, add_2reg(0xC0, dreg_hi, IA32_ECX));
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+/* dst = dst << src */
+static inline void emit_ia32_lsh_r64(const u8 dst[], const u8 src[],
+ bool dstk, bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ static int jmp_label1 = -1;
+ static int jmp_label2 = -1;
+ static int jmp_label3 = -1;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ else
+ /* mov ecx,src_lo */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+ /* cmp ecx,32 */
+ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+ /* jumps when >= 32 */
+ if (is_imm8(jmp_label(jmp_label1, 2)))
+ EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+ else
+ EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+ /* < 32 */
+ /* shl dreg_hi,cl */
+ EMIT2(0xD3, add_1reg(0xE0, dreg_hi));
+ /* mov ebx,dreg_lo */
+ EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX));
+ /* shl dreg_lo,cl */
+ EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
+
+ /* IA32_ECX = -IA32_ECX + 32 */
+ /* neg ecx */
+ EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+ /* add ecx,32 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+ /* shr ebx,cl */
+ EMIT2(0xD3, add_1reg(0xE8, IA32_EBX));
+ /* or dreg_hi,ebx */
+ EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX));
+
+ /* goto out; */
+ if (is_imm8(jmp_label(jmp_label3, 2)))
+ EMIT2(0xEB, jmp_label(jmp_label3, 2));
+ else
+ EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+ /* >= 32 */
+ if (jmp_label1 == -1)
+ jmp_label1 = cnt;
+
+ /* cmp ecx,64 */
+ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+ /* jumps when >= 64 */
+ if (is_imm8(jmp_label(jmp_label2, 2)))
+ EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+ else
+ EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+ /* >= 32 && < 64 */
+ /* sub ecx,32 */
+ EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+ /* shl dreg_lo,cl */
+ EMIT2(0xD3, add_1reg(0xE0, dreg_lo));
+ /* mov dreg_hi,dreg_lo */
+ EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+
+ /* xor dreg_lo,dreg_lo */
+ EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+
+ /* goto out; */
+ if (is_imm8(jmp_label(jmp_label3, 2)))
+ EMIT2(0xEB, jmp_label(jmp_label3, 2));
+ else
+ EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+ /* >= 64 */
+ if (jmp_label2 == -1)
+ jmp_label2 = cnt;
+ /* xor dreg_lo,dreg_lo */
+ EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+ if (jmp_label3 == -1)
+ jmp_label3 = cnt;
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ /* out: */
+ *pprog = prog;
+}
+
+/* dst = dst >> src (signed)*/
+static inline void emit_ia32_arsh_r64(const u8 dst[], const u8 src[],
+ bool dstk, bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ static int jmp_label1 = -1;
+ static int jmp_label2 = -1;
+ static int jmp_label3 = -1;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ else
+ /* mov ecx,src_lo */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+ /* cmp ecx,32 */
+ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+ /* jumps when >= 32 */
+ if (is_imm8(jmp_label(jmp_label1, 2)))
+ EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+ else
+ EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+ /* < 32 */
+ /* lshr dreg_lo,cl */
+ EMIT2(0xD3, add_1reg(0xE8, dreg_lo));
+ /* mov ebx,dreg_hi */
+ EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+ /* ashr dreg_hi,cl */
+ EMIT2(0xD3, add_1reg(0xF8, dreg_hi));
+
+ /* IA32_ECX = -IA32_ECX + 32 */
+ /* neg ecx */
+ EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+ /* add ecx,32 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+ /* shl ebx,cl */
+ EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+ /* or dreg_lo,ebx */
+ EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+
+ /* goto out; */
+ if (is_imm8(jmp_label(jmp_label3, 2)))
+ EMIT2(0xEB, jmp_label(jmp_label3, 2));
+ else
+ EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+ /* >= 32 */
+ if (jmp_label1 == -1)
+ jmp_label1 = cnt;
+
+ /* cmp ecx,64 */
+ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+ /* jumps when >= 64 */
+ if (is_imm8(jmp_label(jmp_label2, 2)))
+ EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+ else
+ EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+ /* >= 32 && < 64 */
+ /* sub ecx,32 */
+ EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+ /* ashr dreg_hi,cl */
+ EMIT2(0xD3, add_1reg(0xF8, dreg_hi));
+ /* mov dreg_lo,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+ /* ashr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+
+ /* goto out; */
+ if (is_imm8(jmp_label(jmp_label3, 2)))
+ EMIT2(0xEB, jmp_label(jmp_label3, 2));
+ else
+ EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+ /* >= 64 */
+ if (jmp_label2 == -1)
+ jmp_label2 = cnt;
+ /* ashr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+ /* mov dreg_lo,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+ if (jmp_label3 == -1)
+ jmp_label3 = cnt;
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ /* out: */
+ *pprog = prog;
+}
+
+/* dst = dst >> src */
+static inline void emit_ia32_rsh_r64(const u8 dst[], const u8 src[], bool dstk,
+ bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ static int jmp_label1 = -1;
+ static int jmp_label2 = -1;
+ static int jmp_label3 = -1;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ if (sstk)
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ else
+ /* mov ecx,src_lo */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_ECX));
+
+ /* cmp ecx,32 */
+ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 32);
+ /* jumps when >= 32 */
+ if (is_imm8(jmp_label(jmp_label1, 2)))
+ EMIT2(IA32_JAE, jmp_label(jmp_label1, 2));
+ else
+ EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label1, 6));
+
+ /* < 32 */
+ /* lshr dreg_lo,cl */
+ EMIT2(0xD3, add_1reg(0xE8, dreg_lo));
+ /* mov ebx,dreg_hi */
+ EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+ /* shr dreg_hi,cl */
+ EMIT2(0xD3, add_1reg(0xE8, dreg_hi));
+
+ /* IA32_ECX = -IA32_ECX + 32 */
+ /* neg ecx */
+ EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+ /* add ecx,32 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+ /* shl ebx,cl */
+ EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+ /* or dreg_lo,ebx */
+ EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+
+ /* goto out; */
+ if (is_imm8(jmp_label(jmp_label3, 2)))
+ EMIT2(0xEB, jmp_label(jmp_label3, 2));
+ else
+ EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+ /* >= 32 */
+ if (jmp_label1 == -1)
+ jmp_label1 = cnt;
+ /* cmp ecx,64 */
+ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), 64);
+ /* jumps when >= 64 */
+ if (is_imm8(jmp_label(jmp_label2, 2)))
+ EMIT2(IA32_JAE, jmp_label(jmp_label2, 2));
+ else
+ EMIT2_off32(0x0F, IA32_JAE + 0x10, jmp_label(jmp_label2, 6));
+
+ /* >= 32 && < 64 */
+ /* sub ecx,32 */
+ EMIT3(0x83, add_1reg(0xE8, IA32_ECX), 32);
+ /* shr dreg_hi,cl */
+ EMIT2(0xD3, add_1reg(0xE8, dreg_hi));
+ /* mov dreg_lo,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+ /* goto out; */
+ if (is_imm8(jmp_label(jmp_label3, 2)))
+ EMIT2(0xEB, jmp_label(jmp_label3, 2));
+ else
+ EMIT1_off32(0xE9, jmp_label(jmp_label3, 5));
+
+ /* >= 64 */
+ if (jmp_label2 == -1)
+ jmp_label2 = cnt;
+ /* xor dreg_lo,dreg_lo */
+ EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+
+ if (jmp_label3 == -1)
+ jmp_label3 = cnt;
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ /* out: */
+ *pprog = prog;
+}
+
+/* dst = dst << val */
+static inline void emit_ia32_lsh_i64(const u8 dst[], const u32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+ /* Do LSH operation */
+ if (val < 32) {
+ /* shl dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xE0, dreg_hi), val);
+ /* mov ebx,dreg_lo */
+ EMIT2(0x8B, add_2reg(0xC0, dreg_lo, IA32_EBX));
+ /* shl dreg_lo,imm8 */
+ EMIT3(0xC1, add_1reg(0xE0, dreg_lo), val);
+
+ /* IA32_ECX = 32 - val */
+ /* mov ecx,val */
+ EMIT2(0xB1, val);
+ /* movzx ecx,ecx */
+ EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+ /* neg ecx */
+ EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+ /* add ecx,32 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+ /* shr ebx,cl */
+ EMIT2(0xD3, add_1reg(0xE8, IA32_EBX));
+ /* or dreg_hi,ebx */
+ EMIT2(0x09, add_2reg(0xC0, dreg_hi, IA32_EBX));
+ } else if (val >= 32 && val < 64) {
+ u32 value = val - 32;
+
+ /* shl dreg_lo,imm8 */
+ EMIT3(0xC1, add_1reg(0xE0, dreg_lo), value);
+ /* mov dreg_hi,dreg_lo */
+ EMIT2(0x89, add_2reg(0xC0, dreg_hi, dreg_lo));
+ /* xor dreg_lo,dreg_lo */
+ EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+ } else {
+ /* xor dreg_lo,dreg_lo */
+ EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ }
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+/* dst = dst >> val */
+static inline void emit_ia32_rsh_i64(const u8 dst[], const u32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ /* Do RSH operation */
+ if (val < 32) {
+ /* shr dreg_lo,imm8 */
+ EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
+ /* mov ebx,dreg_hi */
+ EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+ /* shr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xE8, dreg_hi), val);
+
+ /* IA32_ECX = 32 - val */
+ /* mov ecx,val */
+ EMIT2(0xB1, val);
+ /* movzx ecx,ecx */
+ EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+ /* neg ecx */
+ EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+ /* add ecx,32 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+ /* shl ebx,cl */
+ EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+ /* or dreg_lo,ebx */
+ EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+ } else if (val >= 32 && val < 64) {
+ u32 value = val - 32;
+
+ /* shr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xE8, dreg_hi), value);
+ /* mov dreg_lo,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ } else {
+ /* xor dreg_lo,dreg_lo */
+ EMIT2(0x33, add_2reg(0xC0, dreg_lo, dreg_lo));
+ /* xor dreg_hi,dreg_hi */
+ EMIT2(0x33, add_2reg(0xC0, dreg_hi, dreg_hi));
+ }
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+/* dst = dst >> val (signed) */
+static inline void emit_ia32_arsh_i64(const u8 dst[], const u32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+ /* Do RSH operation */
+ if (val < 32) {
+ /* shr dreg_lo,imm8 */
+ EMIT3(0xC1, add_1reg(0xE8, dreg_lo), val);
+ /* mov ebx,dreg_hi */
+ EMIT2(0x8B, add_2reg(0xC0, dreg_hi, IA32_EBX));
+ /* ashr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), val);
+
+ /* IA32_ECX = 32 - val */
+ /* mov ecx,val */
+ EMIT2(0xB1, val);
+ /* movzx ecx,ecx */
+ EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+ /* neg ecx */
+ EMIT2(0xF7, add_1reg(0xD8, IA32_ECX));
+ /* add ecx,32 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 32);
+
+ /* shl ebx,cl */
+ EMIT2(0xD3, add_1reg(0xE0, IA32_EBX));
+ /* or dreg_lo,ebx */
+ EMIT2(0x09, add_2reg(0xC0, dreg_lo, IA32_EBX));
+ } else if (val >= 32 && val < 64) {
+ u32 value = val - 32;
+
+ /* ashr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), value);
+ /* mov dreg_lo,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+
+ /* ashr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+ } else {
+ /* ashr dreg_hi,imm8 */
+ EMIT3(0xC1, add_1reg(0xF8, dreg_hi), 31);
+ /* mov dreg_lo,dreg_hi */
+ EMIT2(0x89, add_2reg(0xC0, dreg_lo, dreg_hi));
+ }
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],dreg_lo */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_lo),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],dreg_hi */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, dreg_hi),
+ STACK_VAR(dst_hi));
+ }
+ *pprog = prog;
+}
+
+static inline void emit_ia32_mul_r64(const u8 dst[], const u8 src[], bool dstk,
+ bool sstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_hi));
+ else
+ /* mov eax,dst_hi */
+ EMIT2(0x8B, add_2reg(0xC0, dst_hi, IA32_EAX));
+
+ if (sstk)
+ /* mul dword ptr [ebp+off] */
+ EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+ else
+ /* mul src_lo */
+ EMIT2(0xF7, add_1reg(0xE0, src_lo));
+
+ /* mov ecx,eax */
+ EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov eax,dst_lo */
+ EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+ if (sstk)
+ /* mul dword ptr [ebp+off] */
+ EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_hi));
+ else
+ /* mul src_hi */
+ EMIT2(0xF7, add_1reg(0xE0, src_hi));
+
+ /* add eax,eax */
+ EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov eax,dst_lo */
+ EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+ if (sstk)
+ /* mul dword ptr [ebp+off] */
+ EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(src_lo));
+ else
+ /* mul src_lo */
+ EMIT2(0xF7, add_1reg(0xE0, src_lo));
+
+ /* add ecx,edx */
+ EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],ecx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(dst_hi));
+ } else {
+ /* mov dst_lo,eax */
+ EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+ /* mov dst_hi,ecx */
+ EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+ }
+
+ *pprog = prog;
+}
+
+static inline void emit_ia32_mul_i64(const u8 dst[], const u32 val,
+ bool dstk, u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ u32 hi;
+
+ hi = val & (1<<31) ? (u32)~0 : 0;
+ /* movl eax,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+ if (dstk)
+ /* mul dword ptr [ebp+off] */
+ EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_hi));
+ else
+ /* mul dst_hi */
+ EMIT2(0xF7, add_1reg(0xE0, dst_hi));
+
+ /* mov ecx,eax */
+ EMIT2(0x89, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+ /* movl eax,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), hi);
+ if (dstk)
+ /* mul dword ptr [ebp+off] */
+ EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+ else
+ /* mul dst_lo */
+ EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+ /* add ecx,eax */
+ EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EAX));
+
+ /* movl eax,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EAX), val);
+ if (dstk)
+ /* mul dword ptr [ebp+off] */
+ EMIT3(0xF7, add_1reg(0x60, IA32_EBP), STACK_VAR(dst_lo));
+ else
+ /* mul dst_lo */
+ EMIT2(0xF7, add_1reg(0xE0, dst_lo));
+
+ /* add ecx,edx */
+ EMIT2(0x01, add_2reg(0xC0, IA32_ECX, IA32_EDX));
+
+ if (dstk) {
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ /* mov dword ptr [ebp+off],ecx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(dst_hi));
+ } else {
+ /* mov dword ptr [ebp+off],eax */
+ EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EAX));
+ /* mov dword ptr [ebp+off],ecx */
+ EMIT2(0x89, add_2reg(0xC0, dst_hi, IA32_ECX));
+ }
+
+ *pprog = prog;
+}
+
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 4; /* imm32 */
+ else
+ return 0;
+}
+
+struct jit_context {
+ int cleanup_addr; /* epilogue code offset */
+};
+
+/* maximum number of bytes emitted while JITing one eBPF insn */
+#define BPF_MAX_INSN_SIZE 128
+#define BPF_INSN_SAFETY 64
+
+#define PROLOGUE_SIZE 35
+
+/* emit prologue code for BPF program and check it's size.
+ * bpf_tail_call helper will skip it while jumping into another program
+ */
+static void emit_prologue(u8 **pprog, u32 stack_depth)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ const u8 *r1 = bpf2ia32[BPF_REG_1];
+ const u8 fplo = bpf2ia32[BPF_REG_FP][0];
+ const u8 fphi = bpf2ia32[BPF_REG_FP][1];
+ const u8 *tcc = bpf2ia32[TCALL_CNT];
+
+ /* push ebp */
+ EMIT1(0x55);
+ /* mov ebp,esp */
+ EMIT2(0x89, 0xE5);
+ /* push edi */
+ EMIT1(0x57);
+ /* push esi */
+ EMIT1(0x56);
+ /* push ebx */
+ EMIT1(0x53);
+
+ /* sub esp,STACK_SIZE */
+ EMIT2_off32(0x81, 0xEC, STACK_SIZE);
+ /* sub ebp,SCRATCH_SIZE+4+12*/
+ EMIT3(0x83, add_1reg(0xE8, IA32_EBP), SCRATCH_SIZE + 16);
+ /* xor ebx,ebx */
+ EMIT2(0x31, add_2reg(0xC0, IA32_EBX, IA32_EBX));
+
+ /* Set up BPF prog stack base register */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBP), STACK_VAR(fplo));
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(fphi));
+
+ /* Move BPF_CTX (EAX) to BPF_REG_R1 */
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(r1[1]));
+
+ /* Initialize Tail Count */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[0]));
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+ BUILD_BUG_ON(cnt != PROLOGUE_SIZE);
+ *pprog = prog;
+}
+
+/* Emit epilogue code for BPF program */
+static void emit_epilogue(u8 **pprog, u32 stack_depth)
+{
+ u8 *prog = *pprog;
+ const u8 *r0 = bpf2ia32[BPF_REG_0];
+ int cnt = 0;
+
+ /* mov eax,dword ptr [ebp+off]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r0[0]));
+ /* mov edx,dword ptr [ebp+off]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r0[1]));
+
+ /* add ebp,SCRATCH_SIZE+4+12*/
+ EMIT3(0x83, add_1reg(0xC0, IA32_EBP), SCRATCH_SIZE + 16);
+
+ /* mov ebx,dword ptr [ebp-12]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), -12);
+ /* mov esi,dword ptr [ebp-8]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ESI), -8);
+ /* mov edi,dword ptr [ebp-4]*/
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDI), -4);
+
+ EMIT1(0xC9); /* leave */
+ EMIT1(0xC3); /* ret */
+ *pprog = prog;
+}
+
+/* generate the following code:
+ * ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
+ * if (index >= array->map.max_entries)
+ * goto out;
+ * if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * goto out;
+ * prog = array->ptrs[index];
+ * if (prog == NULL)
+ * goto out;
+ * goto *(prog->bpf_func + prologue_size);
+ * out:
+ */
+static void emit_bpf_tail_call(u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+ const u8 *r1 = bpf2ia32[BPF_REG_1];
+ const u8 *r2 = bpf2ia32[BPF_REG_2];
+ const u8 *r3 = bpf2ia32[BPF_REG_3];
+ const u8 *tcc = bpf2ia32[TCALL_CNT];
+ u32 lo, hi;
+ static int jmp_label1 = -1;
+
+ /* if (index >= array->map.max_entries)
+ * goto out;
+ */
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r2[0]));
+ /* mov edx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX), STACK_VAR(r3[0]));
+
+ /* cmp dword ptr [eax+off],edx */
+ EMIT3(0x39, add_2reg(0x40, IA32_EAX, IA32_EDX),
+ offsetof(struct bpf_array, map.max_entries));
+ /* jbe out */
+ EMIT2(IA32_JBE, jmp_label(jmp_label1, 2));
+
+ /* if (tail_call_cnt > MAX_TAIL_CALL_CNT)
+ * goto out;
+ */
+ lo = (u32)MAX_TAIL_CALL_CNT;
+ hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32);
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+ EMIT3(0x83, add_1reg(0xF8, IA32_EBX), hi); /* cmp edx, hi */
+ EMIT2(IA32_JNE, 3);
+ EMIT3(0x83, add_1reg(0xF8, IA32_ECX), lo); /* cmp ecx, lo */
+
+ EMIT2(IA32_JAE, jmp_label(jmp_label1, 2)); /* ja out */
+
+ EMIT3(0x83, add_1reg(0xC0, IA32_ECX), 0x01); /* add eax, 0x1 */
+ EMIT3(0x83, add_1reg(0xD0, IA32_EBX), 0x00); /* adc ebx, 0x0 */
+
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(tcc[0]));
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EBX), STACK_VAR(tcc[1]));
+
+ /* prog = array->ptrs[index]; */
+ /* mov edx, [eax + edx * 4 + offsetof(...)] */
+ EMIT3_off32(0x8B, 0x94, 0x90, offsetof(struct bpf_array, ptrs));
+
+ /* if (prog == NULL)
+ * goto out;
+ */
+ EMIT2(0x85, add_2reg(0xC0, IA32_EDX, IA32_EDX)); /* test edx,edx */
+ EMIT2(IA32_JE, jmp_label(jmp_label1, 2)); /* je out */
+
+ /* goto *(prog->bpf_func + prologue_size); */
+ /* mov edx, dword ptr [edx + 32] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EDX, IA32_EDX),
+ offsetof(struct bpf_prog, bpf_func));
+ /* add edx, prologue_size */
+ EMIT3(0x83, add_1reg(0xC0, IA32_EDX), PROLOGUE_SIZE);
+
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX), STACK_VAR(r1[0]));
+
+ /* now we're ready to jump into next BPF program
+ * eax == ctx (1st arg)
+ * edx == prog->bpf_func + prologue_size
+ */
+ RETPOLINE_EDX_BPF_JIT();
+
+ if (jmp_label1 == -1)
+ jmp_label1 = cnt;
+
+ /* out: */
+ *pprog = prog;
+}
+
+// push the scratch stack register on top of the stack
+static inline void emit_push_r64(const u8 src[], u8 **pprog)
+{
+ u8 *prog = *pprog;
+ int cnt = 0;
+
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_hi));
+ /* push ecx */
+ EMIT1(0x51);
+
+ /* mov ecx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX), STACK_VAR(src_lo));
+ /* push ecx */
+ EMIT1(0x51);
+
+ *pprog = prog;
+}
+
+static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image,
+ int oldproglen, struct jit_context *ctx)
+{
+ struct bpf_insn *insn = bpf_prog->insnsi;
+ int insn_cnt = bpf_prog->len;
+ bool seen_exit = false;
+ u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+ int i, cnt = 0;
+ int proglen = 0;
+ u8 *prog = temp;
+
+ emit_prologue(&prog, bpf_prog->aux->stack_depth);
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ const s32 imm32 = insn->imm;
+ const bool is64 = BPF_CLASS(insn->code) == BPF_ALU64;
+ const bool dstk = insn->dst_reg == BPF_REG_AX ? false : true;
+ const bool sstk = insn->src_reg == BPF_REG_AX ? false : true;
+ const u8 code = insn->code;
+ const u8 *dst = bpf2ia32[insn->dst_reg];
+ const u8 *src = bpf2ia32[insn->src_reg];
+ const u8 *r0 = bpf2ia32[BPF_REG_0];
+ s64 jmp_offset;
+ u8 jmp_cond;
+ int ilen;
+ u8 *func;
+
+ switch (code) {
+ /* ALU operations */
+ /* dst = src */
+ case BPF_ALU | BPF_MOV | BPF_K:
+ case BPF_ALU | BPF_MOV | BPF_X:
+ case BPF_ALU64 | BPF_MOV | BPF_K:
+ case BPF_ALU64 | BPF_MOV | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_mov_r64(is64, dst, src, dstk,
+ sstk, &prog);
+ break;
+ case BPF_K:
+ /* Sign-extend immediate value to dst reg */
+ emit_ia32_mov_i64(is64, dst, imm32,
+ dstk, &prog);
+ break;
+ }
+ break;
+ /* dst = dst + src/imm */
+ /* dst = dst - src/imm */
+ /* dst = dst | src/imm */
+ /* dst = dst & src/imm */
+ /* dst = dst ^ src/imm */
+ /* dst = dst * src/imm */
+ /* dst = dst << src */
+ /* dst = dst >> src */
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU64 | BPF_ADD | BPF_K:
+ case BPF_ALU64 | BPF_ADD | BPF_X:
+ case BPF_ALU64 | BPF_SUB | BPF_K:
+ case BPF_ALU64 | BPF_SUB | BPF_X:
+ case BPF_ALU64 | BPF_OR | BPF_K:
+ case BPF_ALU64 | BPF_OR | BPF_X:
+ case BPF_ALU64 | BPF_AND | BPF_K:
+ case BPF_ALU64 | BPF_AND | BPF_X:
+ case BPF_ALU64 | BPF_XOR | BPF_K:
+ case BPF_ALU64 | BPF_XOR | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_alu_r64(is64, BPF_OP(code), dst,
+ src, dstk, sstk, &prog);
+ break;
+ case BPF_K:
+ emit_ia32_alu_i64(is64, BPF_OP(code), dst,
+ imm32, dstk, &prog);
+ break;
+ }
+ break;
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_mul_r(dst_lo, src_lo, dstk,
+ sstk, &prog);
+ break;
+ case BPF_K:
+ /* mov ecx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+ imm32);
+ emit_ia32_mul_r(dst_lo, IA32_ECX, dstk,
+ false, &prog);
+ break;
+ }
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_ARSH | BPF_K:
+ case BPF_ALU | BPF_ARSH | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_shift_r(BPF_OP(code), dst_lo, src_lo,
+ dstk, sstk, &prog);
+ break;
+ case BPF_K:
+ /* mov ecx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+ imm32);
+ emit_ia32_shift_r(BPF_OP(code), dst_lo,
+ IA32_ECX, dstk, false,
+ &prog);
+ break;
+ }
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ /* dst = dst / src(imm) */
+ /* dst = dst % src(imm) */
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_MOD | BPF_K:
+ case BPF_ALU | BPF_MOD | BPF_X:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
+ src_lo, dstk, sstk, &prog);
+ break;
+ case BPF_K:
+ /* mov ecx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX),
+ imm32);
+ emit_ia32_div_mod_r(BPF_OP(code), dst_lo,
+ IA32_ECX, dstk, false,
+ &prog);
+ break;
+ }
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ case BPF_ALU64 | BPF_DIV | BPF_K:
+ case BPF_ALU64 | BPF_DIV | BPF_X:
+ case BPF_ALU64 | BPF_MOD | BPF_K:
+ case BPF_ALU64 | BPF_MOD | BPF_X:
+ goto notyet;
+ /* dst = dst >> imm */
+ /* dst = dst << imm */
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ if (unlikely(imm32 > 31))
+ return -EINVAL;
+ /* mov ecx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+ emit_ia32_shift_r(BPF_OP(code), dst_lo, IA32_ECX, dstk,
+ false, &prog);
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ /* dst = dst << imm */
+ case BPF_ALU64 | BPF_LSH | BPF_K:
+ if (unlikely(imm32 > 63))
+ return -EINVAL;
+ emit_ia32_lsh_i64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = dst >> imm */
+ case BPF_ALU64 | BPF_RSH | BPF_K:
+ if (unlikely(imm32 > 63))
+ return -EINVAL;
+ emit_ia32_rsh_i64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = dst << src */
+ case BPF_ALU64 | BPF_LSH | BPF_X:
+ emit_ia32_lsh_r64(dst, src, dstk, sstk, &prog);
+ break;
+ /* dst = dst >> src */
+ case BPF_ALU64 | BPF_RSH | BPF_X:
+ emit_ia32_rsh_r64(dst, src, dstk, sstk, &prog);
+ break;
+ /* dst = dst >> src (signed) */
+ case BPF_ALU64 | BPF_ARSH | BPF_X:
+ emit_ia32_arsh_r64(dst, src, dstk, sstk, &prog);
+ break;
+ /* dst = dst >> imm (signed) */
+ case BPF_ALU64 | BPF_ARSH | BPF_K:
+ if (unlikely(imm32 > 63))
+ return -EINVAL;
+ emit_ia32_arsh_i64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = ~dst */
+ case BPF_ALU | BPF_NEG:
+ emit_ia32_alu_i(is64, false, BPF_OP(code),
+ dst_lo, 0, dstk, &prog);
+ emit_ia32_mov_i(dst_hi, 0, dstk, &prog);
+ break;
+ /* dst = ~dst (64 bit) */
+ case BPF_ALU64 | BPF_NEG:
+ emit_ia32_neg64(dst, dstk, &prog);
+ break;
+ /* dst = dst * src/imm */
+ case BPF_ALU64 | BPF_MUL | BPF_X:
+ case BPF_ALU64 | BPF_MUL | BPF_K:
+ switch (BPF_SRC(code)) {
+ case BPF_X:
+ emit_ia32_mul_r64(dst, src, dstk, sstk, &prog);
+ break;
+ case BPF_K:
+ emit_ia32_mul_i64(dst, imm32, dstk, &prog);
+ break;
+ }
+ break;
+ /* dst = htole(dst) */
+ case BPF_ALU | BPF_END | BPF_FROM_LE:
+ emit_ia32_to_le_r64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = htobe(dst) */
+ case BPF_ALU | BPF_END | BPF_FROM_BE:
+ emit_ia32_to_be_r64(dst, imm32, dstk, &prog);
+ break;
+ /* dst = imm64 */
+ case BPF_LD | BPF_IMM | BPF_DW: {
+ s32 hi, lo = imm32;
+
+ hi = insn[1].imm;
+ emit_ia32_mov_i(dst_lo, lo, dstk, &prog);
+ emit_ia32_mov_i(dst_hi, hi, dstk, &prog);
+ insn++;
+ i++;
+ break;
+ }
+ /* ST: *(u8*)(dst_reg + off) = imm */
+ case BPF_ST | BPF_MEM | BPF_H:
+ case BPF_ST | BPF_MEM | BPF_B:
+ case BPF_ST | BPF_MEM | BPF_W:
+ case BPF_ST | BPF_MEM | BPF_DW:
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov eax,dst_lo */
+ EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ EMIT(0xC6, 1); break;
+ case BPF_H:
+ EMIT2(0x66, 0xC7); break;
+ case BPF_W:
+ case BPF_DW:
+ EMIT(0xC7, 1); break;
+ }
+
+ if (is_imm8(insn->off))
+ EMIT2(add_1reg(0x40, IA32_EAX), insn->off);
+ else
+ EMIT1_off32(add_1reg(0x80, IA32_EAX),
+ insn->off);
+ EMIT(imm32, bpf_size_to_x86_bytes(BPF_SIZE(code)));
+
+ if (BPF_SIZE(code) == BPF_DW) {
+ u32 hi;
+
+ hi = imm32 & (1<<31) ? (u32)~0 : 0;
+ EMIT2_off32(0xC7, add_1reg(0x80, IA32_EAX),
+ insn->off + 4);
+ EMIT(hi, 4);
+ }
+ break;
+
+ /* STX: *(u8*)(dst_reg + off) = src_reg */
+ case BPF_STX | BPF_MEM | BPF_B:
+ case BPF_STX | BPF_MEM | BPF_H:
+ case BPF_STX | BPF_MEM | BPF_W:
+ case BPF_STX | BPF_MEM | BPF_DW:
+ if (dstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov eax,dst_lo */
+ EMIT2(0x8B, add_2reg(0xC0, dst_lo, IA32_EAX));
+
+ if (sstk)
+ /* mov edx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(src_lo));
+ else
+ /* mov edx,src_lo */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EDX));
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ EMIT(0x88, 1); break;
+ case BPF_H:
+ EMIT2(0x66, 0x89); break;
+ case BPF_W:
+ case BPF_DW:
+ EMIT(0x89, 1); break;
+ }
+
+ if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
+ insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
+ insn->off);
+
+ if (BPF_SIZE(code) == BPF_DW) {
+ if (sstk)
+ /* mov edi,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(src_hi));
+ else
+ /* mov edi,src_hi */
+ EMIT2(0x8B, add_2reg(0xC0, src_hi,
+ IA32_EDX));
+ EMIT1(0x89);
+ if (is_imm8(insn->off + 4)) {
+ EMIT2(add_2reg(0x40, IA32_EAX,
+ IA32_EDX),
+ insn->off + 4);
+ } else {
+ EMIT1(add_2reg(0x80, IA32_EAX,
+ IA32_EDX));
+ EMIT(insn->off + 4, 4);
+ }
+ }
+ break;
+
+ /* LDX: dst_reg = *(u8*)(src_reg + off) */
+ case BPF_LDX | BPF_MEM | BPF_B:
+ case BPF_LDX | BPF_MEM | BPF_H:
+ case BPF_LDX | BPF_MEM | BPF_W:
+ case BPF_LDX | BPF_MEM | BPF_DW:
+ if (sstk)
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(src_lo));
+ else
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo, IA32_EAX));
+
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ EMIT2(0x0F, 0xB6); break;
+ case BPF_H:
+ EMIT2(0x0F, 0xB7); break;
+ case BPF_W:
+ case BPF_DW:
+ EMIT(0x8B, 1); break;
+ }
+
+ if (is_imm8(insn->off))
+ EMIT2(add_2reg(0x40, IA32_EAX, IA32_EDX),
+ insn->off);
+ else
+ EMIT1_off32(add_2reg(0x80, IA32_EAX, IA32_EDX),
+ insn->off);
+
+ if (dstk)
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_lo));
+ else
+ /* mov dst_lo,edx */
+ EMIT2(0x89, add_2reg(0xC0, dst_lo, IA32_EDX));
+ switch (BPF_SIZE(code)) {
+ case BPF_B:
+ case BPF_H:
+ case BPF_W:
+ if (dstk) {
+ EMIT3(0xC7, add_1reg(0x40, IA32_EBP),
+ STACK_VAR(dst_hi));
+ EMIT(0x0, 4);
+ } else {
+ EMIT3(0xC7, add_1reg(0xC0, dst_hi), 0);
+ }
+ break;
+ case BPF_DW:
+ EMIT2_off32(0x8B,
+ add_2reg(0x80, IA32_EAX, IA32_EDX),
+ insn->off + 4);
+ if (dstk)
+ EMIT3(0x89,
+ add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(dst_hi));
+ else
+ EMIT2(0x89,
+ add_2reg(0xC0, dst_hi, IA32_EDX));
+ break;
+ default:
+ break;
+ }
+ break;
+ /* call */
+ case BPF_JMP | BPF_CALL:
+ {
+ const u8 *r1 = bpf2ia32[BPF_REG_1];
+ const u8 *r2 = bpf2ia32[BPF_REG_2];
+ const u8 *r3 = bpf2ia32[BPF_REG_3];
+ const u8 *r4 = bpf2ia32[BPF_REG_4];
+ const u8 *r5 = bpf2ia32[BPF_REG_5];
+
+ if (insn->src_reg == BPF_PSEUDO_CALL)
+ goto notyet;
+
+ func = (u8 *) __bpf_call_base + imm32;
+ jmp_offset = func - (image + addrs[i]);
+
+ if (!imm32 || !is_simm32(jmp_offset)) {
+ pr_err("unsupported bpf func %d addr %p image %p\n",
+ imm32, func, image);
+ return -EINVAL;
+ }
+
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(r1[0]));
+ /* mov edx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(r1[1]));
+
+ emit_push_r64(r5, &prog);
+ emit_push_r64(r4, &prog);
+ emit_push_r64(r3, &prog);
+ emit_push_r64(r2, &prog);
+
+ EMIT1_off32(0xE8, jmp_offset + 9);
+
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(r0[0]));
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(r0[1]));
+
+ /* add esp,32 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 32);
+ break;
+ }
+ case BPF_JMP | BPF_TAIL_CALL:
+ emit_bpf_tail_call(&prog);
+ break;
+
+ /* cond jump */
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JNE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JLT | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JLE | BPF_X:
+ case BPF_JMP | BPF_JSGT | BPF_X:
+ case BPF_JMP | BPF_JSLE | BPF_X:
+ case BPF_JMP | BPF_JSLT | BPF_X:
+ case BPF_JMP | BPF_JSGE | BPF_X: {
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+ u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ if (sstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
+ STACK_VAR(src_hi));
+ }
+
+ /* cmp dreg_hi,sreg_hi */
+ EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+ EMIT2(IA32_JNE, 2);
+ /* cmp dreg_lo,sreg_lo */
+ EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+ goto emit_cond_jmp;
+ }
+ case BPF_JMP | BPF_JSET | BPF_X: {
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 sreg_lo = sstk ? IA32_ECX : src_lo;
+ u8 sreg_hi = sstk ? IA32_EBX : src_hi;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ if (sstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_ECX),
+ STACK_VAR(src_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EBX),
+ STACK_VAR(src_hi));
+ }
+ /* and dreg_lo,sreg_lo */
+ EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
+ /* and dreg_hi,sreg_hi */
+ EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+ /* or dreg_lo,dreg_hi */
+ EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+ goto emit_cond_jmp;
+ }
+ case BPF_JMP | BPF_JSET | BPF_K: {
+ u32 hi;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 sreg_lo = IA32_ECX;
+ u8 sreg_hi = IA32_EBX;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+ hi = imm32 & (1<<31) ? (u32)~0 : 0;
+
+ /* mov ecx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+ /* mov ebx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+
+ /* and dreg_lo,sreg_lo */
+ EMIT2(0x23, add_2reg(0xC0, sreg_lo, dreg_lo));
+ /* and dreg_hi,sreg_hi */
+ EMIT2(0x23, add_2reg(0xC0, sreg_hi, dreg_hi));
+ /* or dreg_lo,dreg_hi */
+ EMIT2(0x09, add_2reg(0xC0, dreg_lo, dreg_hi));
+ goto emit_cond_jmp;
+ }
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JNE | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JLT | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JLE | BPF_K:
+ case BPF_JMP | BPF_JSGT | BPF_K:
+ case BPF_JMP | BPF_JSLE | BPF_K:
+ case BPF_JMP | BPF_JSLT | BPF_K:
+ case BPF_JMP | BPF_JSGE | BPF_K: {
+ u32 hi;
+ u8 dreg_lo = dstk ? IA32_EAX : dst_lo;
+ u8 dreg_hi = dstk ? IA32_EDX : dst_hi;
+ u8 sreg_lo = IA32_ECX;
+ u8 sreg_hi = IA32_EBX;
+
+ if (dstk) {
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(dst_lo));
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(dst_hi));
+ }
+
+ hi = imm32 & (1<<31) ? (u32)~0 : 0;
+ /* mov ecx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_ECX), imm32);
+ /* mov ebx,imm32 */
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX), hi);
+
+ /* cmp dreg_hi,sreg_hi */
+ EMIT2(0x39, add_2reg(0xC0, dreg_hi, sreg_hi));
+ EMIT2(IA32_JNE, 2);
+ /* cmp dreg_lo,sreg_lo */
+ EMIT2(0x39, add_2reg(0xC0, dreg_lo, sreg_lo));
+
+emit_cond_jmp: /* convert BPF opcode to x86 */
+ switch (BPF_OP(code)) {
+ case BPF_JEQ:
+ jmp_cond = IA32_JE;
+ break;
+ case BPF_JSET:
+ case BPF_JNE:
+ jmp_cond = IA32_JNE;
+ break;
+ case BPF_JGT:
+ /* GT is unsigned '>', JA in x86 */
+ jmp_cond = IA32_JA;
+ break;
+ case BPF_JLT:
+ /* LT is unsigned '<', JB in x86 */
+ jmp_cond = IA32_JB;
+ break;
+ case BPF_JGE:
+ /* GE is unsigned '>=', JAE in x86 */
+ jmp_cond = IA32_JAE;
+ break;
+ case BPF_JLE:
+ /* LE is unsigned '<=', JBE in x86 */
+ jmp_cond = IA32_JBE;
+ break;
+ case BPF_JSGT:
+ /* signed '>', GT in x86 */
+ jmp_cond = IA32_JG;
+ break;
+ case BPF_JSLT:
+ /* signed '<', LT in x86 */
+ jmp_cond = IA32_JL;
+ break;
+ case BPF_JSGE:
+ /* signed '>=', GE in x86 */
+ jmp_cond = IA32_JGE;
+ break;
+ case BPF_JSLE:
+ /* signed '<=', LE in x86 */
+ jmp_cond = IA32_JLE;
+ break;
+ default: /* to silence gcc warning */
+ return -EFAULT;
+ }
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+ if (is_imm8(jmp_offset)) {
+ EMIT2(jmp_cond, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+ } else {
+ pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+
+ break;
+ }
+ case BPF_JMP | BPF_JA:
+ if (insn->off == -1)
+ /* -1 jmp instructions will always jump
+ * backwards two bytes. Explicitly handling
+ * this case avoids wasting too many passes
+ * when there are long sequences of replaced
+ * dead code.
+ */
+ jmp_offset = -2;
+ else
+ jmp_offset = addrs[i + insn->off] - addrs[i];
+
+ if (!jmp_offset)
+ /* optimize out nop jumps */
+ break;
+emit_jmp:
+ if (is_imm8(jmp_offset)) {
+ EMIT2(0xEB, jmp_offset);
+ } else if (is_simm32(jmp_offset)) {
+ EMIT1_off32(0xE9, jmp_offset);
+ } else {
+ pr_err("jmp gen bug %llx\n", jmp_offset);
+ return -EFAULT;
+ }
+ break;
+
+ case BPF_LD | BPF_ABS | BPF_W:
+ case BPF_LD | BPF_ABS | BPF_H:
+ case BPF_LD | BPF_ABS | BPF_B:
+ case BPF_LD | BPF_IND | BPF_W:
+ case BPF_LD | BPF_IND | BPF_H:
+ case BPF_LD | BPF_IND | BPF_B:
+ {
+ int size;
+ const u8 *r6 = bpf2ia32[BPF_REG_6];
+
+ /* Setting up first argument */
+ /* mov eax,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(r6[0]));
+
+ /* Setting up second argument */
+ if (BPF_MODE(code) == BPF_ABS) {
+ /* mov %edx, imm32 */
+ EMIT1_off32(0xBA, imm32);
+ } else {
+ if (sstk)
+ /* mov edx,dword ptr [ebp+off] */
+ EMIT3(0x8B, add_2reg(0x40, IA32_EBP,
+ IA32_EDX),
+ STACK_VAR(src_lo));
+ else
+ /* mov edx,src_lo */
+ EMIT2(0x8B, add_2reg(0xC0, src_lo,
+ IA32_EDX));
+ if (imm32) {
+ if (is_imm8(imm32))
+ /* add %edx, imm8 */
+ EMIT3(0x83, 0xC2, imm32);
+ else
+ /* add %edx, imm32 */
+ EMIT2_off32(0x81, 0xC2, imm32);
+ }
+ }
+
+ /* Setting up third argument */
+ switch (BPF_SIZE(code)) {
+ case BPF_W:
+ size = 4;
+ break;
+ case BPF_H:
+ size = 2;
+ break;
+ case BPF_B:
+ size = 1;
+ break;
+ default:
+ return -EINVAL;
+ }
+ /* mov ecx,val */
+ EMIT2(0xB1, size);
+ /* movzx ecx,ecx */
+ EMIT3(0x0F, 0xB6, add_2reg(0xC0, IA32_ECX, IA32_ECX));
+
+ /* mov ebx,ebp */
+ EMIT2(0x8B, add_2reg(0xC0, IA32_EBP, IA32_EBX));
+ /* add %ebx,imm8 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_EBX), SKB_BUFFER);
+ /* push ebx */
+ EMIT1(0x53);
+
+ /* Setting up function pointer to call */
+ /* mov ebx,imm32*/
+ EMIT2_off32(0xC7, add_1reg(0xC0, IA32_EBX),
+ (unsigned int)bpf_load_pointer);
+
+ EMIT2(0xFF, add_1reg(0xD0, IA32_EBX));
+ /* add %esp,4 */
+ EMIT3(0x83, add_1reg(0xC0, IA32_ESP), 4);
+ /* xor edx,edx */
+ EMIT2(0x33, add_2reg(0xC0, IA32_EDX, IA32_EDX));
+
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(r0[0]));
+ /* mov dword ptr [ebp+off],edx */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EDX),
+ STACK_VAR(r0[1]));
+
+ /* Check if return address is NULL or not.
+ * if NULL then jump to epilogue
+ * else continue to load the value from retn address
+ */
+ EMIT3(0x83, add_1reg(0xF8, IA32_EAX), 0);
+ jmp_offset = ctx->cleanup_addr - addrs[i];
+
+ switch (BPF_SIZE(code)) {
+ case BPF_W:
+ jmp_offset += 7;
+ break;
+ case BPF_H:
+ jmp_offset += 10;
+ break;
+ case BPF_B:
+ jmp_offset += 6;
+ break;
+ }
+
+ EMIT2_off32(0x0F, IA32_JE + 0x10, jmp_offset);
+ /* Load value from the address */
+ switch (BPF_SIZE(code)) {
+ case BPF_W:
+ /* mov eax,[eax] */
+ EMIT2(0x8B, 0x0);
+ /* emit 'bswap eax' */
+ EMIT2(0x0F, add_1reg(0xC8, IA32_EAX));
+ break;
+ case BPF_H:
+ EMIT3(0x0F, 0xB7, 0x0);
+ EMIT1(0x66);
+ EMIT3(0xC1, add_1reg(0xC8, IA32_EAX), 8);
+ break;
+ case BPF_B:
+ EMIT3(0x0F, 0xB6, 0x0);
+ break;
+ }
+
+ /* mov dword ptr [ebp+off],eax */
+ EMIT3(0x89, add_2reg(0x40, IA32_EBP, IA32_EAX),
+ STACK_VAR(r0[0]));
+ break;
+ }
+ /* STX XADD: lock *(u32 *)(dst + off) += src */
+ case BPF_STX | BPF_XADD | BPF_W:
+ /* STX XADD: lock *(u64 *)(dst + off) += src */
+ case BPF_STX | BPF_XADD | BPF_DW:
+ goto notyet;
+ case BPF_JMP | BPF_EXIT:
+ if (seen_exit) {
+ jmp_offset = ctx->cleanup_addr - addrs[i];
+ goto emit_jmp;
+ }
+ seen_exit = true;
+ /* update cleanup_addr */
+ ctx->cleanup_addr = proglen;
+ emit_epilogue(&prog, bpf_prog->aux->stack_depth);
+ break;
+notyet:
+ pr_info_once("*** NOT YET: opcode %02x ***\n", code);
+ return -EFAULT;
+ default:
+ /* This error will be seen if new instruction was added
+ * to interpreter, but not to JIT
+ * or if there is junk in bpf_prog
+ */
+ pr_err("bpf_jit: unknown opcode %02x\n", code);
+ return -EINVAL;
+ }
+
+ ilen = prog - temp;
+ if (ilen > BPF_MAX_INSN_SIZE) {
+ pr_err("bpf_jit: fatal insn size error\n");
+ return -EFAULT;
+ }
+
+ if (image) {
+ if (unlikely(proglen + ilen > oldproglen)) {
+ pr_err("bpf_jit: fatal error\n");
+ return -EFAULT;
+ }
+ memcpy(image + proglen, temp, ilen);
+ }
+ proglen += ilen;
+ addrs[i] = proglen;
+ prog = temp;
+ }
+ return proglen;
+}
+
+struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
+{
+ struct bpf_binary_header *header = NULL;
+ struct bpf_prog *tmp, *orig_prog = prog;
+ int proglen, oldproglen = 0;
+ struct jit_context ctx = {};
+ bool tmp_blinded = false;
+ u8 *image = NULL;
+ int *addrs;
+ int pass;
+ int i;
+
+ if (!prog->jit_requested)
+ return orig_prog;
+
+ tmp = bpf_jit_blind_constants(prog);
+ /* If blinding was requested and we failed during blinding,
+ * we must fall back to the interpreter.
+ */
+ if (IS_ERR(tmp))
+ return orig_prog;
+ if (tmp != prog) {
+ tmp_blinded = true;
+ prog = tmp;
+ }
+
+ addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL);
+ if (!addrs) {
+ prog = orig_prog;
+ goto out;
+ }
+
+ /* Before first pass, make a rough estimation of addrs[]
+ * each bpf instruction is translated to less than 64 bytes
+ */
+ for (proglen = 0, i = 0; i < prog->len; i++) {
+ proglen += 64;
+ addrs[i] = proglen;
+ }
+ ctx.cleanup_addr = proglen;
+
+ /* JITed image shrinks with every pass and the loop iterates
+ * until the image stops shrinking. Very large bpf programs
+ * may converge on the last pass. In such case do one more
+ * pass to emit the final image
+ */
+ for (pass = 0; pass < 20 || image; pass++) {
+ proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
+ if (proglen <= 0) {
+ image = NULL;
+ if (header)
+ bpf_jit_binary_free(header);
+ prog = orig_prog;
+ goto out_addrs;
+ }
+ if (image) {
+ if (proglen != oldproglen) {
+ pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+ proglen, oldproglen);
+ prog = orig_prog;
+ goto out_addrs;
+ }
+ break;
+ }
+ if (proglen == oldproglen) {
+ header = bpf_jit_binary_alloc(proglen, &image,
+ 1, jit_fill_hole);
+ if (!header) {
+ prog = orig_prog;
+ goto out_addrs;
+ }
+ }
+ oldproglen = proglen;
+ cond_resched();
+ }
+
+ if (bpf_jit_enable > 1)
+ bpf_jit_dump(prog->len, proglen, pass + 1, image);
+
+ if (image) {
+ bpf_jit_binary_lock_ro(header);
+ prog->bpf_func = (void *)image;
+ prog->jited = 1;
+ prog->jited_len = proglen;
+ } else {
+ prog = orig_prog;
+ }
+
+out_addrs:
+ kfree(addrs);
+out:
+ if (tmp_blinded)
+ bpf_jit_prog_release_other(prog, prog == orig_prog ?
+ tmp : orig_prog);
+ return prog;
+}
--
1.8.5.6.2.g3d8a54e.dirty
^ permalink raw reply related
* Re: [llc_ui_release] BUG: unable to handle kernel NULL pointer dereference at 0000000000000004
From: Fengguang Wu @ 2018-04-29 12:16 UTC (permalink / raw)
To: Linus Torvalds
Cc: Network Development, David Miller, xiaolou4617,
Linux Kernel Mailing List, LKP
In-Reply-To: <CA+55aFzeCvwUw=yi5oDy2ouv7onkXDapgamJJ5AhjybyBH8UCw@mail.gmail.com>
On Sun, Apr 29, 2018 at 03:30:48AM +0000, Linus Torvalds wrote:
>On Sat, Apr 28, 2018 at 7:12 PM Fengguang Wu <fengguang.wu@intel.com> wrote:
>
>> FYI this happens in mainline kernel 4.17.0-rc2.
>> It looks like a new regression.
>
>> It occurs in 5 out of 5 boots.
>
>> [main] 375 sockets created based on info from socket cachefile.
>> [main] Generating file descriptors
>> [main] Added 83 filenames from /dev
>> udevd[507]: failed to execute '/sbin/modprobe' '/sbin/modprobe -bv
>platform:regulatory': No such file or directory
>> [ 372.057947] caif:caif_disconnect_client(): nothing to disconnect
>> [ 372.082415] BUG: unable to handle kernel NULL pointer dereference at
>0000000000000004
>
>I think this is fixed by commit 3a04ce7130a7 ("llc: fix NULL pointer deref
>for SOCK_ZAPPED")
Confirmed. Sorry for the late report!
Regards,
Fengguang
^ permalink raw reply
* [PATCH net-next 2/2 v5] netns: restrict uevents
From: Christian Brauner @ 2018-04-29 10:44 UTC (permalink / raw)
To: ebiederm, davem, netdev, linux-kernel
Cc: avagin, ktkhai, serge, gregkh, Christian Brauner
In-Reply-To: <20180429104412.22445-1-christian.brauner@ubuntu.com>
commit 07e98962fa77 ("kobject: Send hotplug events in all network namespaces")
enabled sending hotplug events into all network namespaces back in 2010.
Over time the set of uevents that get sent into all network namespaces has
shrunk. We have now reached the point where hotplug events for all devices
that carry a namespace tag are filtered according to that namespace.
Specifically, they are filtered whenever the namespace tag of the kobject
does not match the namespace tag of the netlink socket.
Currently, only network devices carry namespace tags (i.e. network
namespace tags). Hence, uevents for network devices only show up in the
network namespace such devices are created in or moved to.
However, any uevent for a kobject that does not have a namespace tag
associated with it will not be filtered and we will broadcast it into all
network namespaces. This behavior stopped making sense when user namespaces
were introduced.
This patch simplifies and fixes couple of things:
- Split codepath for sending uevents by kobject namespace tags:
1. Untagged kobjects - uevent_net_broadcast_untagged():
Untagged kobjects will be broadcast into all uevent sockets recorded
in uevent_sock_list, i.e. into all network namespacs owned by the
intial user namespace.
2. Tagged kobjects - uevent_net_broadcast_tagged():
Tagged kobjects will only be broadcast into the network namespace they
were tagged with.
Handling of tagged kobjects in 2. does not cause any semantic changes.
This is just splitting out the filtering logic that was handled by
kobj_bcast_filter() before.
Handling of untagged kobjects in 1. will cause a semantic change. The
reasons why this is needed and ok have been discussed in [1]. Here is a
short summary:
- Userspace ignores uevents from network namespaces that are not owned by
the intial user namespace:
Uevents are filtered by userspace in a user namespace because the
received uid != 0. Instead the uid associated with the event will be
65534 == "nobody" because the global root uid is not mapped.
This means we can safely and without introducing regressions modify the
kernel to not send uevents into all network namespaces whose owning
user namespace is not the initial user namespace because we know that
userspace will ignore the message because of the uid anyway.
I have a) verified that is is true for every udev implementation out
there b) that this behavior has been present in all udev
implementations from the very beginning.
- Thundering herd:
Broadcasting uevents into all network namespaces introduces significant
overhead.
All processes that listen to uevents running in non-initial user
namespaces will end up responding to uevents that will be meaningless
to them. Mainly, because non-initial user namespaces cannot easily
manage devices unless they have a privileged host-process helping them
out. This means that there will be a thundering herd of activity when
there shouldn't be any.
- Removing needless overhead/Increasing performance:
Currently, the uevent socket for each network namespace is added to the
global variable uevent_sock_list. The list itself needs to be protected
by a mutex. So everytime a uevent is generated the mutex is taken on
the list. The mutex is held *from the creation of the uevent (memory
allocation, string creation etc. until all uevent sockets have been
handled*. This is aggravated by the fact that for each uevent socket
that has listeners the mc_list must be walked as well which means we're
talking O(n^2) here. Given that a standard Linux workload usually has
quite a lot of network namespaces and - in the face of containers - a
lot of user namespaces this quickly becomes a performance problem (see
"Thundering herd" above). By just recording uevent sockets of network
namespaces that are owned by the initial user namespace we
significantly increase performance in this codepath.
- Injecting uevents:
There's a valid argument that containers might be interested in
receiving device events especially if they are delegated to them by a
privileged userspace process. One prime example are SR-IOV enabled
devices that are explicitly designed to be handed of to other users
such as VMs or containers.
This use-case can now be correctly handled since
commit 692ec06d7c92 ("netns: send uevent messages"). This commit
introduced the ability to send uevents from userspace. As such we can
let a sufficiently privileged (CAP_SYS_ADMIN in the owning user
namespace of the network namespace of the netlink socket) userspace
process make a decision what uevents should be sent. This removes the
need to blindly broadcast uevents into all user namespaces and provides
a performant and safe solution to this problem.
- Filtering logic:
This patch filters by *owning user namespace of the network namespace a
given task resides in* and not by user namespace of the task per se.
This means if the user namespace of a given task is unshared but the
network namespace is kept and is owned by the initial user namespace a
listener that is opening the uevent socket in that network namespace
can still listen to uevents.
- Fix permission for tagged kobjects:
Network devices that are created or moved into a network namespace that
is owned by a non-initial user namespace currently are send with
INVALID_{G,U}ID in their credentials. This means that all current udev
implementations in userspace will ignore the uevent they receive for
them. This has lead to weird bugs whereby new devices showing up in such
network namespaces were not recognized and did not get IPs assigned etc.
This patch adjusts the permission to the appropriate {g,u}id in the
respective user namespace. This way udevd is able to correctly handle
such devices.
- Simplify filtering logic:
do_one_broadcast() already ensures that only listeners in mc_list receive
uevents that have the same network namespace as the uevent socket itself.
So the filtering logic in kobj_bcast_filter is not needed (see [3]). This
patch therefore removes kobj_bcast_filter() and replaces
netlink_broadcast_filtered() with the simpler netlink_broadcast()
everywhere.
[1]: https://lkml.org/lkml/2018/4/4/739
[2]: https://lkml.org/lkml/2018/4/26/767
[3]: https://lkml.org/lkml/2018/4/26/738
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
---
v4->v5:
* non-functional changes
v3->v4:
* patch unchanged
v2->v3:
* new approach: patch added
v1->v2:
* old approach: different patchset
v0->v1:
* old approach: different patchset
---
lib/kobject_uevent.c | 137 ++++++++++++++++++++++++++++++-------------
1 file changed, 95 insertions(+), 42 deletions(-)
diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
index 649bf60a9440..63d0816ab23b 100644
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -232,30 +232,6 @@ int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count)
return r;
}
-#ifdef CONFIG_NET
-static int kobj_bcast_filter(struct sock *dsk, struct sk_buff *skb, void *data)
-{
- struct kobject *kobj = data, *ksobj;
- const struct kobj_ns_type_operations *ops;
-
- ops = kobj_ns_ops(kobj);
- if (!ops && kobj->kset) {
- ksobj = &kobj->kset->kobj;
- if (ksobj->parent != NULL)
- ops = kobj_ns_ops(ksobj->parent);
- }
-
- if (ops && ops->netlink_ns && kobj->ktype->namespace) {
- const void *sock_ns, *ns;
- ns = kobj->ktype->namespace(kobj);
- sock_ns = ops->netlink_ns(dsk);
- return sock_ns != ns;
- }
-
- return 0;
-}
-#endif
-
#ifdef CONFIG_UEVENT_HELPER
static int kobj_usermode_filter(struct kobject *kobj)
{
@@ -327,17 +303,14 @@ static struct sk_buff *alloc_uevent_skb(struct kobj_uevent_env *env,
return skb;
}
-#endif
-static int kobject_uevent_net_broadcast(struct kobject *kobj,
- struct kobj_uevent_env *env,
- const char *action_string,
- const char *devpath)
+static int uevent_net_broadcast_untagged(struct kobj_uevent_env *env,
+ const char *action_string,
+ const char *devpath)
{
- int retval = 0;
-#if defined(CONFIG_NET)
struct sk_buff *skb = NULL;
struct uevent_sock *ue_sk;
+ int retval = 0;
/* send netlink message */
list_for_each_entry(ue_sk, &uevent_sock_list, list) {
@@ -353,19 +326,93 @@ static int kobject_uevent_net_broadcast(struct kobject *kobj,
continue;
}
- retval = netlink_broadcast_filtered(uevent_sock, skb_get(skb),
- 0, 1, GFP_KERNEL,
- kobj_bcast_filter,
- kobj);
+ retval = netlink_broadcast(uevent_sock, skb_get(skb), 0, 1,
+ GFP_KERNEL);
/* ENOBUFS should be handled in userspace */
if (retval == -ENOBUFS || retval == -ESRCH)
retval = 0;
}
consume_skb(skb);
-#endif
+
return retval;
}
+static int uevent_net_broadcast_tagged(struct sock *usk,
+ struct kobj_uevent_env *env,
+ const char *action_string,
+ const char *devpath)
+{
+ struct user_namespace *owning_user_ns = sock_net(usk)->user_ns;
+ struct sk_buff *skb = NULL;
+ int ret = 0;
+
+ skb = alloc_uevent_skb(env, action_string, devpath);
+ if (!skb)
+ return -ENOMEM;
+
+ /* fix credentials */
+ if (owning_user_ns != &init_user_ns) {
+ struct netlink_skb_parms *parms = &NETLINK_CB(skb);
+ kuid_t root_uid;
+ kgid_t root_gid;
+
+ /* fix uid */
+ root_uid = make_kuid(owning_user_ns, 0);
+ if (uid_valid(root_uid))
+ parms->creds.uid = root_uid;
+
+ /* fix gid */
+ root_gid = make_kgid(owning_user_ns, 0);
+ if (gid_valid(root_gid))
+ parms->creds.gid = root_gid;
+ }
+
+ ret = netlink_broadcast(usk, skb, 0, 1, GFP_KERNEL);
+ /* ENOBUFS should be handled in userspace */
+ if (ret == -ENOBUFS || ret == -ESRCH)
+ ret = 0;
+
+ return ret;
+}
+#endif
+
+static int kobject_uevent_net_broadcast(struct kobject *kobj,
+ struct kobj_uevent_env *env,
+ const char *action_string,
+ const char *devpath)
+{
+ int ret = 0;
+
+#ifdef CONFIG_NET
+ const struct kobj_ns_type_operations *ops;
+ const struct net *net = NULL;
+
+ ops = kobj_ns_ops(kobj);
+ if (!ops && kobj->kset) {
+ struct kobject *ksobj = &kobj->kset->kobj;
+ if (ksobj->parent != NULL)
+ ops = kobj_ns_ops(ksobj->parent);
+ }
+
+ /* kobjects currently only carry network namespace tags and they
+ * are the only tag relevant here since we want to decide which
+ * network namespaces to broadcast the uevent into.
+ */
+ if (ops && ops->netlink_ns && kobj->ktype->namespace)
+ if (ops->type == KOBJ_NS_TYPE_NET)
+ net = kobj->ktype->namespace(kobj);
+
+ if (!net)
+ ret = uevent_net_broadcast_untagged(env, action_string,
+ devpath);
+ else
+ ret = uevent_net_broadcast_tagged(net->uevent_sock->sk, env,
+ action_string, devpath);
+#endif
+
+ return ret;
+}
+
static void zap_modalias_env(struct kobj_uevent_env *env)
{
static const char modalias_prefix[] = "MODALIAS=";
@@ -724,9 +771,13 @@ static int uevent_net_init(struct net *net)
net->uevent_sock = ue_sk;
- mutex_lock(&uevent_sock_mutex);
- list_add_tail(&ue_sk->list, &uevent_sock_list);
- mutex_unlock(&uevent_sock_mutex);
+ /* Restrict uevents to initial user namespace. */
+ if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) {
+ mutex_lock(&uevent_sock_mutex);
+ list_add_tail(&ue_sk->list, &uevent_sock_list);
+ mutex_unlock(&uevent_sock_mutex);
+ }
+
return 0;
}
@@ -734,9 +785,11 @@ static void uevent_net_exit(struct net *net)
{
struct uevent_sock *ue_sk = net->uevent_sock;
- mutex_lock(&uevent_sock_mutex);
- list_del(&ue_sk->list);
- mutex_unlock(&uevent_sock_mutex);
+ if (sock_net(ue_sk->sk)->user_ns == &init_user_ns) {
+ mutex_lock(&uevent_sock_mutex);
+ list_del(&ue_sk->list);
+ mutex_unlock(&uevent_sock_mutex);
+ }
netlink_kernel_release(ue_sk->sk);
kfree(ue_sk);
--
2.17.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox