* [PATCH RFC net-next 5/9] net/ipv6: Pass ifa6_config struct to inet6_addr_modify
From: dsahern @ 2018-05-23 22:57 UTC (permalink / raw)
To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180523225727.11386-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
Update inet6_addr_modify to take ifa6_config.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
net/ipv6/addrconf.c | 44 +++++++++++++++++++++++---------------------
1 file changed, 23 insertions(+), 21 deletions(-)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 2db1acf70610..1b1e63a4520b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4527,8 +4527,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
ifm->ifa_prefixlen);
}
-static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
- u32 prefered_lft, u32 valid_lft)
+static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
{
u32 flags;
clock_t expires;
@@ -4538,32 +4537,32 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
ASSERT_RTNL();
- if (!valid_lft || (prefered_lft > valid_lft))
+ if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
return -EINVAL;
- if (ifa_flags & IFA_F_MANAGETEMPADDR &&
+ if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR &&
(ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
return -EINVAL;
if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED)
- ifa_flags &= ~IFA_F_OPTIMISTIC;
+ cfg->ifa_flags &= ~IFA_F_OPTIMISTIC;
- timeout = addrconf_timeout_fixup(valid_lft, HZ);
+ timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
if (addrconf_finite_timeout(timeout)) {
expires = jiffies_to_clock_t(timeout * HZ);
- valid_lft = timeout;
+ cfg->valid_lft = timeout;
flags = RTF_EXPIRES;
} else {
expires = 0;
flags = 0;
- ifa_flags |= IFA_F_PERMANENT;
+ cfg->ifa_flags |= IFA_F_PERMANENT;
}
- timeout = addrconf_timeout_fixup(prefered_lft, HZ);
+ timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
if (addrconf_finite_timeout(timeout)) {
if (timeout == 0)
- ifa_flags |= IFA_F_DEPRECATED;
- prefered_lft = timeout;
+ cfg->ifa_flags |= IFA_F_DEPRECATED;
+ cfg->preferred_lft = timeout;
}
spin_lock_bh(&ifp->lock);
@@ -4573,16 +4572,16 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD |
IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
IFA_F_NOPREFIXROUTE);
- ifp->flags |= ifa_flags;
+ ifp->flags |= cfg->ifa_flags;
ifp->tstamp = jiffies;
- ifp->valid_lft = valid_lft;
- ifp->prefered_lft = prefered_lft;
+ ifp->valid_lft = cfg->valid_lft;
+ ifp->prefered_lft = cfg->preferred_lft;
spin_unlock_bh(&ifp->lock);
if (!(ifp->flags&IFA_F_TENTATIVE))
ipv6_ifa_notify(0, ifp);
- if (!(ifa_flags & IFA_F_NOPREFIXROUTE)) {
+ if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
ifp->idev->dev, expires, flags,
GFP_KERNEL);
@@ -4601,10 +4600,14 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags,
}
if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) {
- if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR))
- valid_lft = prefered_lft = 0;
- manage_tempaddrs(ifp->idev, ifp, valid_lft, prefered_lft,
- !was_managetempaddr, jiffies);
+ if (was_managetempaddr &&
+ !(ifp->flags & IFA_F_MANAGETEMPADDR)) {
+ cfg->valid_lft = 0;
+ cfg->preferred_lft = 0;
+ }
+ manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft,
+ cfg->preferred_lft, !was_managetempaddr,
+ jiffies);
}
addrconf_verify_rtnl();
@@ -4691,8 +4694,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
!(nlh->nlmsg_flags & NLM_F_REPLACE))
err = -EEXIST;
else
- err = inet6_addr_modify(ifa, cfg.ifa_flags, cfg.preferred_lft,
- cfg.valid_lft);
+ err = inet6_addr_modify(ifa, &cfg);
in6_ifa_put(ifa);
--
2.11.0
^ permalink raw reply related
* [PATCH RFC net-next 6/9] net: Add IFA_RT_PRIORITY address attribute
From: dsahern @ 2018-05-23 22:57 UTC (permalink / raw)
To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180523225727.11386-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
Currently, if two interfaces have addresses in the same connected route,
then the order of the prefix route entries is determined by the order in
which the addresses are assigned or the links brought up.
Add IFA_RT_PRIORITY to allow user to specify the metric of the prefix
route associated with an address giving interface managers better
control of the order of prefix routes.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/uapi/linux/if_addr.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h
index 2ef053d265de..ebaf5701c9db 100644
--- a/include/uapi/linux/if_addr.h
+++ b/include/uapi/linux/if_addr.h
@@ -33,6 +33,7 @@ enum {
IFA_CACHEINFO,
IFA_MULTICAST,
IFA_FLAGS,
+ IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */
__IFA_MAX,
};
--
2.11.0
^ permalink raw reply related
* [PATCH RFC net-next 7/9] net/ipv4: Add support for specifying metric of connected routes
From: dsahern @ 2018-05-23 22:57 UTC (permalink / raw)
To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180523225727.11386-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
Add support for IFA_RT_PRIORITY to ipv4 addresses.
If the metric is changed on an existing address then the new route
is inserted before removing the old one. Since the metric is one
of the route keys, the prefix route can not be replaced.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/linux/inetdevice.h | 1 +
include/net/route.h | 1 +
net/ipv4/devinet.c | 14 ++++++++++++
net/ipv4/fib_frontend.c | 56 +++++++++++++++++++++++++++++++++++++---------
4 files changed, 61 insertions(+), 11 deletions(-)
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index e16fe7d44a71..27650f1bff3d 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -139,6 +139,7 @@ struct in_ifaddr {
__be32 ifa_local;
__be32 ifa_address;
__be32 ifa_mask;
+ __u32 ifa_rt_priority;
__be32 ifa_broadcast;
unsigned char ifa_scope;
unsigned char ifa_prefixlen;
diff --git a/include/net/route.h b/include/net/route.h
index dbb032d5921b..bb53cdba38dc 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -225,6 +225,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
struct in_ifaddr;
void fib_add_ifaddr(struct in_ifaddr *);
void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *);
+void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric);
void rt_add_uncached_list(struct rtable *rt);
void rt_del_uncached_list(struct rtable *rt);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 40f001782c1b..c5f08c11ded0 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -99,6 +99,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .type = NLA_U32 },
+ [IFA_RT_PRIORITY] = { .type = NLA_U32 },
};
#define IN4_ADDR_HSIZE_SHIFT 8
@@ -835,6 +836,9 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+ if (tb[IFA_RT_PRIORITY])
+ ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
if (tb[IFA_CACHEINFO]) {
struct ifa_cacheinfo *ci;
@@ -906,12 +910,19 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid,
extack);
} else {
+ u32 new_metric = ifa->ifa_rt_priority;
+
inet_free_ifa(ifa);
if (nlh->nlmsg_flags & NLM_F_EXCL ||
!(nlh->nlmsg_flags & NLM_F_REPLACE))
return -EEXIST;
ifa = ifa_existing;
+
+ if (ifa->ifa_rt_priority != new_metric)
+ fib_modify_prefix_metric(ifa, new_metric);
+ ifa->ifa_rt_priority = new_metric;
+
set_ifa_lifetime(ifa, valid_lft, prefered_lft);
cancel_delayed_work(&check_lifetime_work);
queue_delayed_work(system_power_efficient_wq,
@@ -1549,6 +1560,7 @@ static size_t inet_nlmsg_size(void)
+ nla_total_size(4) /* IFA_BROADCAST */
+ nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(4) /* IFA_RT_PRIORITY */
+ nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
}
@@ -1618,6 +1630,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
(ifa->ifa_label[0] &&
nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
+ (ifa->ifa_rt_priority &&
+ nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
preferred, valid))
goto nla_put_failure;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 897ae92dff0f..76adbbe24173 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -848,7 +848,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
* to fib engine. It is legal, because all events occur
* only when netlink is already locked.
*/
-static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
+static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
+ struct in_ifaddr *ifa, u32 rt_priority)
{
struct net *net = dev_net(ifa->ifa_dev->dev);
u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
@@ -858,6 +859,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
.fc_type = type,
.fc_dst = dst,
.fc_dst_len = dst_len,
+ .fc_priority = rt_priority,
.fc_prefsrc = ifa->ifa_local,
.fc_oif = ifa->ifa_dev->dev->ifindex,
.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
@@ -903,31 +905,60 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
}
}
- fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
if (!(dev->flags & IFF_UP))
return;
/* Add broadcast address, if it is explicitly assigned. */
if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+ prim, 0);
if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
fib_magic(RTM_NEWROUTE,
dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- prefix, ifa->ifa_prefixlen, prim);
+ prefix, ifa->ifa_prefixlen, prim,
+ ifa->ifa_rt_priority);
/* Add network specific broadcasts, when it takes a sense */
if (ifa->ifa_prefixlen < 31) {
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
+ fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32,
+ prim, 0);
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
- 32, prim);
+ 32, prim, 0);
}
}
}
+void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
+{
+ struct in_device *in_dev = ifa->ifa_dev;
+ struct net_device *dev = in_dev->dev;
+ struct in_ifaddr *prim = ifa;
+ __be32 mask = ifa->ifa_mask;
+ __be32 addr = ifa->ifa_local;
+ __be32 prefix = ifa->ifa_address & mask;
+
+ if (!(dev->flags & IFF_UP) ||
+ ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
+ ipv4_is_zeronet(prefix) ||
+ prefix == addr || ifa->ifa_prefixlen == 32)
+ return;
+
+ /* add the new */
+ fib_magic(RTM_NEWROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim, new_metric);
+
+ /* delete the old */
+ fib_magic(RTM_DELROUTE,
+ dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+ prefix, ifa->ifa_prefixlen, prim, ifa->ifa_rt_priority);
+}
+
/* Delete primary or secondary address.
* Optionally, on secondary address promotion consider the addresses
* from subnet iprim as deleted, even if they are in device list.
@@ -969,7 +1000,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
fib_magic(RTM_DELROUTE,
dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
- any, ifa->ifa_prefixlen, prim);
+ any, ifa->ifa_prefixlen, prim, 0);
subnet = 1;
}
@@ -1053,17 +1084,20 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
no_promotions:
if (!(ok & BRD_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
+ prim, 0);
if (subnet && ifa->ifa_prefixlen < 31) {
if (!(ok & BRD1_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
+ prim, 0);
if (!(ok & BRD0_OK))
- fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
+ prim, 0);
}
if (!(ok & LOCAL_OK)) {
unsigned int addr_type;
- fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+ fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
/* Check, that this local address finally disappeared. */
addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
--
2.11.0
^ permalink raw reply related
* [PATCH RFC net-next 8/9] net/ipv6: Add support for specifying metric of connected routes
From: dsahern @ 2018-05-23 22:57 UTC (permalink / raw)
To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180523225727.11386-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
Add support for IFA_RT_PRIORITY to ipv6 addresses.
If the metric is changed on an existing address then the new route
is inserted before removing the old one. Since the metric is one
of the route keys, the prefix route can not be atomically replaced.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/net/addrconf.h | 1 +
include/net/if_inet6.h | 1 +
net/ipv6/addrconf.c | 94 ++++++++++++++++++++++++++++++++++++++++----------
3 files changed, 78 insertions(+), 18 deletions(-)
diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 1e3ef04176d6..aec0a32fb531 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -68,6 +68,7 @@ struct ifa6_config {
u32 ifa_flags;
u32 preferred_lft;
u32 valid_lft;
+ u32 rt_priority;
u16 scope;
};
diff --git a/include/net/if_inet6.h b/include/net/if_inet6.h
index db389253dc2a..d7578cf49c3a 100644
--- a/include/net/if_inet6.h
+++ b/include/net/if_inet6.h
@@ -42,6 +42,7 @@ enum {
struct inet6_ifaddr {
struct in6_addr addr;
__u32 prefix_len;
+ __u32 rt_priority;
/* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */
__u32 valid_lft;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1b1e63a4520b..7d726900c8c2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1056,6 +1056,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
INIT_HLIST_NODE(&ifa->addr_lst);
ifa->scope = cfg->scope;
ifa->prefix_len = cfg->plen;
+ ifa->rt_priority = cfg->rt_priority;
ifa->flags = cfg->ifa_flags;
/* No need to add the TENTATIVE flag for addresses with NODAD */
if (!(cfg->ifa_flags & IFA_F_NODAD))
@@ -1323,6 +1324,8 @@ static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp,
}
}
+ memset(&cfg, 0, sizeof(cfg));
+
cfg.valid_lft = min_t(__u32, ifp->valid_lft,
idev->cnf.temp_valid_lft + age);
cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor;
@@ -2314,12 +2317,13 @@ static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
*/
static void
-addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
- unsigned long expires, u32 flags, gfp_t gfp_flags)
+addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric,
+ struct net_device *dev, unsigned long expires,
+ u32 flags, gfp_t gfp_flags)
{
struct fib6_config cfg = {
.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
- .fc_metric = IP6_RT_PRIO_ADDRCONF,
+ .fc_metric = metric ? : IP6_RT_PRIO_ADDRCONF,
.fc_ifindex = dev->ifindex,
.fc_expires = expires,
.fc_dst_len = plen,
@@ -2683,7 +2687,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
expires = jiffies_to_clock_t(rt_expires);
}
addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
- dev, expires, flags, GFP_ATOMIC);
+ 0, dev, expires, flags,
+ GFP_ATOMIC);
}
fib6_info_release(rt);
}
@@ -2891,8 +2896,9 @@ static int inet6_addr_add(struct net *net, int ifindex,
ifp = ipv6_add_addr(idev, cfg, true, extack);
if (!IS_ERR(ifp)) {
if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
- addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev,
- expires, flags, GFP_KERNEL);
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ ifp->rt_priority, dev, expires,
+ flags, GFP_KERNEL);
}
/* Send a netlink notification if DAD is enabled and
@@ -3056,7 +3062,7 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
if (addr.s6_addr32[3]) {
add_addr(idev, &addr, plen, scope);
- addrconf_prefix_route(&addr, plen, idev->dev, 0, pflags,
+ addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
GFP_ATOMIC);
return;
}
@@ -3081,8 +3087,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
}
add_addr(idev, &addr, plen, flag);
- addrconf_prefix_route(&addr, plen, idev->dev, 0,
- pflags, GFP_ATOMIC);
+ addrconf_prefix_route(&addr, plen, 0, idev->dev,
+ 0, pflags, GFP_ATOMIC);
}
}
}
@@ -3128,7 +3134,7 @@ void addrconf_add_linklocal(struct inet6_dev *idev,
ifp = ipv6_add_addr(idev, &cfg, true, NULL);
if (!IS_ERR(ifp)) {
- addrconf_prefix_route(&ifp->addr, ifp->prefix_len, idev->dev,
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len, 0, idev->dev,
0, 0, GFP_ATOMIC);
addrconf_dad_start(ifp);
in6_ifa_put(ifp);
@@ -3244,7 +3250,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
addrconf_add_linklocal(idev, &addr,
IFA_F_STABLE_PRIVACY);
else if (prefix_route)
- addrconf_prefix_route(&addr, 64, idev->dev,
+ addrconf_prefix_route(&addr, 64, 0, idev->dev,
0, 0, GFP_KERNEL);
break;
case IN6_ADDR_GEN_MODE_EUI64:
@@ -3255,7 +3261,7 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
if (ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) == 0)
addrconf_add_linklocal(idev, &addr, 0);
else if (prefix_route)
- addrconf_prefix_route(&addr, 64, idev->dev,
+ addrconf_prefix_route(&addr, 64, 0, idev->dev,
0, 0, GFP_KERNEL);
break;
case IN6_ADDR_GEN_MODE_NONE:
@@ -3375,7 +3381,8 @@ static int fixup_permanent_addr(struct net *net,
if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
- idev->dev, 0, 0, GFP_ATOMIC);
+ ifp->rt_priority, idev->dev, 0, 0,
+ GFP_ATOMIC);
}
if (ifp->state == INET6_IFADDR_STATE_PREDAD)
@@ -4495,6 +4502,7 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
[IFA_LOCAL] = { .len = sizeof(struct in6_addr) },
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .len = sizeof(u32) },
+ [IFA_RT_PRIORITY] = { .len = sizeof(u32) },
};
static int
@@ -4527,6 +4535,37 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
ifm->ifa_prefixlen);
}
+static int modify_prefix_route(struct inet6_ifaddr *ifp,
+ unsigned long expires, u32 flags)
+{
+ struct fib6_info *f6i;
+
+ f6i = addrconf_get_prefix_route(&ifp->addr,
+ ifp->prefix_len,
+ ifp->idev->dev,
+ 0, RTF_GATEWAY | RTF_DEFAULT);
+ if (!f6i)
+ return -ENOENT;
+
+ if (f6i->fib6_metric != ifp->rt_priority) {
+ /* add new one */
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ ifp->rt_priority, ifp->idev->dev,
+ expires, flags, GFP_KERNEL);
+ /* delete old one */
+ ip6_del_rt(dev_net(ifp->idev->dev), f6i);
+ } else {
+ if (!expires)
+ fib6_clean_expires(f6i);
+ else
+ fib6_set_expires(f6i, expires);
+
+ fib6_info_release(f6i);
+ }
+
+ return 0;
+}
+
static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
{
u32 flags;
@@ -4577,14 +4616,25 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
ifp->valid_lft = cfg->valid_lft;
ifp->prefered_lft = cfg->preferred_lft;
+ if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority)
+ ifp->rt_priority = cfg->rt_priority;
+
spin_unlock_bh(&ifp->lock);
if (!(ifp->flags&IFA_F_TENTATIVE))
ipv6_ifa_notify(0, ifp);
if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
- addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
- ifp->idev->dev, expires, flags,
- GFP_KERNEL);
+ int rc = -ENOENT;
+
+ if (had_prefixroute)
+ rc = modify_prefix_route(ifp, expires, flags);
+
+ /* prefix route could have been deleted; if so restore it */
+ if (rc == -ENOENT) {
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ ifp->rt_priority, ifp->idev->dev,
+ expires, flags, GFP_KERNEL);
+ }
} else if (had_prefixroute) {
enum cleanup_prefix_rt_t action;
unsigned long rt_expires;
@@ -4643,6 +4693,9 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg.peer_pfx = peer_pfx;
cfg.plen = ifm->ifa_prefixlen;
+ if (tb[IFA_RT_PRIORITY])
+ cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+
cfg.valid_lft = INFINITY_LIFE_TIME;
cfg.preferred_lft = INFINITY_LIFE_TIME;
@@ -4745,7 +4798,8 @@ static inline int inet6_ifaddr_msgsize(void)
+ nla_total_size(16) /* IFA_LOCAL */
+ nla_total_size(16) /* IFA_ADDRESS */
+ nla_total_size(sizeof(struct ifa_cacheinfo))
- + nla_total_size(4) /* IFA_FLAGS */;
+ + nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(4) /* IFA_RT_PRIORITY */;
}
static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
@@ -4791,6 +4845,10 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0)
goto error;
+ if (ifa->rt_priority &&
+ nla_put_u32(skb, IFA_RT_PRIORITY, ifa->rt_priority))
+ goto error;
+
if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0)
goto error;
@@ -5635,7 +5693,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);
if (!ipv6_addr_any(&ifp->peer_addr))
- addrconf_prefix_route(&ifp->peer_addr, 128,
+ addrconf_prefix_route(&ifp->peer_addr, 128, 0,
ifp->idev->dev, 0, 0,
GFP_ATOMIC);
break;
--
2.11.0
^ permalink raw reply related
* [PATCH RFC net-next 9/9] selftests: fib_tests: Add prefix route tests with metric
From: dsahern @ 2018-05-23 22:57 UTC (permalink / raw)
To: netdev; +Cc: roopa, David Ahern
In-Reply-To: <20180523225727.11386-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
Add tests verifying prefix routes are inserted with expected metric.
IPv6 prefix route tests
TEST: Default metric [ OK ]
TEST: User specified metric on first device [ OK ]
TEST: User specified metric on second device [ OK ]
TEST: Delete of address on first device [ OK ]
TEST: Modify metric of address [ OK ]
TEST: Prefix route removed on link down [ OK ]
TEST: Prefix route with metric on link up [ OK ]
IPv4 prefix route tests
TEST: Default metric [ OK ]
TEST: User specified metric on first device [ OK ]
TEST: User specified metric on second device [ OK ]
TEST: Delete of address on first device [ OK ]
TEST: Modify metric of address [ OK ]
TEST: Prefix route removed on link down [ OK ]
TEST: Prefix route with metric on link up [ OK ]
Signed-off-by: David Ahern <dsahern@gmail.com>
---
tools/testing/selftests/net/fib_tests.sh | 181 ++++++++++++++++++++++++++++++-
1 file changed, 180 insertions(+), 1 deletion(-)
mode change 100755 => 100644 tools/testing/selftests/net/fib_tests.sh
diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh
old mode 100755
new mode 100644
index e7d76fbc36e9..9780c5a2d73f
--- a/tools/testing/selftests/net/fib_tests.sh
+++ b/tools/testing/selftests/net/fib_tests.sh
@@ -6,7 +6,8 @@
ret=0
-TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt"
+# all tests in this script. Can be overridden with -t option
+TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric"
VERBOSE=0
PAUSE_ON_FAIL=no
PAUSE=no
@@ -642,6 +643,8 @@ check_route6()
local rc=0
out=$($IP -6 ro ls match ${pfx} | sed -e 's/ pref medium//')
+ [ "${out}" = "${expected}" ] && return 0
+
if [ -z "${out}" ]; then
if [ "$VERBOSE" = "1" ]; then
printf "\nNo route entry found\n"
@@ -911,6 +914,98 @@ ipv6_route_test()
route_cleanup
}
+ip_addr_metric_check()
+{
+ ip addr help 2>&1 | grep -q metric
+ if [ $? -ne 0 ]; then
+ echo "iproute2 command does not support metric for addresses. Skipping test"
+ return 1
+ fi
+
+ return 0
+}
+
+ipv6_addr_metric_test()
+{
+ local rc
+
+ echo
+ echo "IPv6 prefix route tests"
+
+ ip_addr_metric_check || return 1
+
+ setup
+
+ set -e
+ $IP li add dummy1 type dummy
+ $IP li add dummy2 type dummy
+ $IP li set dummy1 up
+ $IP li set dummy2 up
+
+ # default entry is metric 256
+ run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64"
+ run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64"
+ set +e
+
+ check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 256 2001:db8:104::/64 dev dummy2 proto kernel metric 256"
+ log_test $? 0 "Default metric"
+
+ set -e
+ run_cmd "$IP -6 addr flush dev dummy1"
+ run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64 metric 257"
+ set +e
+
+ check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 256 2001:db8:104::/64 dev dummy1 proto kernel metric 257"
+ log_test $? 0 "User specified metric on first device"
+
+ set -e
+ run_cmd "$IP -6 addr flush dev dummy2"
+ run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64 metric 258"
+ set +e
+
+ check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 257 2001:db8:104::/64 dev dummy2 proto kernel metric 258"
+ log_test $? 0 "User specified metric on second device"
+
+ run_cmd "$IP -6 addr del dev dummy1 2001:db8:104::1/64 metric 257"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 258"
+ rc=$?
+ fi
+ log_test $rc 0 "Delete of address on first device"
+
+ run_cmd "$IP -6 addr change dev dummy2 2001:db8:104::2/64 metric 259"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259"
+ rc=$?
+ fi
+ log_test $rc 0 "Modify metric of address"
+
+ # verify prefix route removed on down
+ run_cmd "ip netns exec testns sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1"
+ run_cmd "$IP li set dev dummy2 down"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route6 ""
+ rc=$?
+ fi
+ log_test $rc 0 "Prefix route removed on link down"
+
+ # verify prefix route re-inserted with assigned metric
+ run_cmd "$IP li set dev dummy2 up"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259"
+ rc=$?
+ fi
+ log_test $rc 0 "Prefix route with metric on link up"
+
+ $IP li del dummy1
+ $IP li del dummy2
+ cleanup
+}
+
# add route for a prefix, flushing any existing routes first
# expected to be the first step of a test
add_route()
@@ -955,6 +1050,8 @@ check_route()
local rc=0
out=$($IP ro ls match ${pfx})
+ [ "${out}" = "${expected}" ] && return 0
+
if [ -z "${out}" ]; then
if [ "$VERBOSE" = "1" ]; then
printf "\nNo route entry found\n"
@@ -1181,6 +1278,86 @@ ipv4_route_test()
route_cleanup
}
+ipv4_addr_metric_test()
+{
+ local rc
+
+ echo
+ echo "IPv4 prefix route tests"
+
+ ip_addr_metric_check || return 1
+
+ setup
+
+ set -e
+ $IP li add dummy1 type dummy
+ $IP li add dummy2 type dummy
+ $IP li set dummy1 up
+ $IP li set dummy2 up
+
+ # default entry is metric 256
+ run_cmd "$IP addr add dev dummy1 172.16.104.1/24"
+ run_cmd "$IP addr add dev dummy2 172.16.104.2/24"
+ set +e
+
+ check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2"
+ log_test $? 0 "Default metric"
+
+ set -e
+ run_cmd "$IP addr flush dev dummy1"
+ run_cmd "$IP addr add dev dummy1 172.16.104.1/24 metric 257"
+ set +e
+
+ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257"
+ log_test $? 0 "User specified metric on first device"
+
+ set -e
+ run_cmd "$IP addr flush dev dummy2"
+ run_cmd "$IP addr add dev dummy2 172.16.104.2/24 metric 258"
+ set +e
+
+ check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258"
+ log_test $? 0 "User specified metric on second device"
+
+ run_cmd "$IP addr del dev dummy1 172.16.104.1/24 metric 257"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258"
+ rc=$?
+ fi
+ log_test $rc 0 "Delete of address on first device"
+
+ run_cmd "$IP addr change dev dummy2 172.16.104.2/24 metric 259"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259"
+ rc=$?
+ fi
+ log_test $rc 0 "Modify metric of address"
+
+ # verify prefix route removed on down
+ run_cmd "$IP li set dev dummy2 down"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route ""
+ rc=$?
+ fi
+ log_test $rc 0 "Prefix route removed on link down"
+
+ # verify prefix route re-inserted with assigned metric
+ run_cmd "$IP li set dev dummy2 up"
+ rc=$?
+ if [ $rc -eq 0 ]; then
+ check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259"
+ rc=$?
+ fi
+ log_test $rc 0 "Prefix route with metric on link up"
+
+ $IP li del dummy1
+ $IP li del dummy2
+ cleanup
+}
+
################################################################################
# usage
@@ -1245,6 +1422,8 @@ do
fib_nexthop_test|nexthop) fib_nexthop_test;;
ipv6_route_test|ipv6_rt) ipv6_route_test;;
ipv4_route_test|ipv4_rt) ipv4_route_test;;
+ ipv6_addr_metric) ipv6_addr_metric_test;;
+ ipv4_addr_metric) ipv4_addr_metric_test;;
help) echo "Test names: $TESTS"; exit 0;;
esac
--
2.11.0
^ permalink raw reply related
* Re: [PATCH net-next] net:sched: add action inheritdsfield to skbmod
From: Cong Wang @ 2018-05-23 23:01 UTC (permalink / raw)
To: Fu, Qiaobin
Cc: davem@davemloft.net, netdev@vger.kernel.org, jhs@mojatatu.com,
Michel Machado
In-Reply-To: <2F042100-2BAC-48E5-887C-5D426B1D5A5B@bu.edu>
On Thu, May 17, 2018 at 12:33 PM, Fu, Qiaobin <qiaobinf@bu.edu> wrote:
> net/sched: add action inheritdsfield to skbmod
>
> The new action inheritdsfield copies the field DS of
> IPv4 and IPv6 packets into skb->prioriry. This enables
> later classification of packets based on the DS field.
>
> Original idea by Jamal Hadi Salim <jhs@mojatatu.com>
>
> Signed-off-by: Qiaobin Fu <qiaobinf@bu.edu>
> Reviewed-by: Michel Machado <michel@digirati.com.br>
Hmm, but skbedit seems better than skbmod for this job,
given:
1) It already modifies skb->priority, although with a given value
2) skbmod doesn't change skb metadata, it only changes payload
I am _not_ saying there is strict rule for what skbmod can or can't
change, it calls itself "data modifier", so I am saying we probably
need to follow this existing practice.
^ permalink raw reply
* Re: [PATCH v2] ppp: remove the PPPIOCDETACH ioctl
From: Paul Mackerras @ 2018-05-23 23:04 UTC (permalink / raw)
To: Eric Biggers
Cc: linux-ppp, netdev, linux-fsdevel, linux-kernel, Guillaume Nault,
syzkaller-bugs, Eric Biggers
In-Reply-To: <20180523213738.146911-1-ebiggers3@gmail.com>
On Wed, May 23, 2018 at 02:37:38PM -0700, Eric Biggers wrote:
> From: Eric Biggers <ebiggers@google.com>
>
> The PPPIOCDETACH ioctl effectively tries to "close" the given ppp file
> before f_count has reached 0, which is fundamentally a bad idea. It
> does check 'f_count < 2', which excludes concurrent operations on the
> file since they would only be possible with a shared fd table, in which
> case each fdget() would take a file reference. However, it fails to
> account for the fact that even with 'f_count == 1' the file can still be
> linked into epoll instances. As reported by syzbot, this can trivially
> be used to cause a use-after-free.
>
> Yet, the only known user of PPPIOCDETACH is pppd versions older than
> ppp-2.4.2, which was released almost 15 years ago (November 2003).
> Also, PPPIOCDETACH apparently stopped working reliably at around the
> same time, when the f_count check was added to the kernel, e.g. see
> https://lkml.org/lkml/2002/12/31/83. Also, the current 'f_count < 2'
> check makes PPPIOCDETACH only work in single-threaded applications; it
> always fails if called from a multithreaded application.
>
> All pppd versions released in the last 15 years just close() the file
> descriptor instead.
>
> Therefore, instead of hacking around this bug by exporting epoll
> internals to modules, and probably missing other related bugs, just
> remove the PPPIOCDETACH ioctl and see if anyone actually notices. Leave
> a stub in place that prints a one-time warning and returns EINVAL.
>
> Reported-by: syzbot+16363c99d4134717c05b@syzkaller.appspotmail.com
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Cc: stable@vger.kernel.org
> Signed-off-by: Eric Biggers <ebiggers@google.com>
Acked-by: Paul Mackerras <paulus@ozlabs.org>
^ permalink raw reply
* Re: INFO: rcu detected stall in corrupted
From: Marcelo Ricardo Leitner @ 2018-05-23 23:13 UTC (permalink / raw)
To: Eric Dumazet
Cc: David Miller, syzbot+f116bc1994efe725d51b, kuznet, linux-kernel,
netdev, syzkaller-bugs, yoshfuji, dsahern, roopa, linux-sctp,
Xin Long
In-Reply-To: <f8d0d282-1e75-d86a-8872-e32b57a6ec14@gmail.com>
On Mon, May 21, 2018 at 11:13:46AM -0700, Eric Dumazet wrote:
>
>
> On 05/21/2018 11:09 AM, David Miller wrote:
> > From: syzbot <syzbot+f116bc1994efe725d51b@syzkaller.appspotmail.com>
> > Date: Mon, 21 May 2018 11:05:02 -0700
> >
> >> find_match+0x244/0x13a0 net/ipv6/route.c:691
> >> find_rr_leaf net/ipv6/route.c:729 [inline]
> >> rt6_select net/ipv6/route.c:779 [inline]
> >
> > Hmmm, endless loop in find_rr_leaf or similar?
> >
>
>
> I do not think so, this really looks like SCTP specific
> , we now have dozens of traces all sharing :
>
> sctp_transport_route+0xad/0x450 net/sctp/transport.c:293
> sctp_packet_config+0xb89/0xfd0 net/sctp/output.c:123
> sctp_outq_flush+0x79c/0x4370 net/sctp/outqueue.c:894
> sctp_outq_uncork+0x6a/0x80 net/sctp/outqueue.c:776
> sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
> sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
> sctp_do_sm+0x596/0x7160 net/sctp/sm_sideeffect.c:1191
> sctp_generate_heartbeat_event+0x218/0x450 net/sctp/sm_sideeffect.c:406
> call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
>
>
> Some kind of infinite loop.
>
> When the hrtimer fires, it can point to any code that sits below but does not necessarily have a bug.
Agreed. Xin Long identified the root cause. syzkaller is setting too
aggressive parameters to SCTP RTO, leading to issues with the
heartbeat timer.
^ permalink raw reply
* Re: [PATCH net-next v3] net: sched: don't disable bh when accessing action idr
From: Cong Wang @ 2018-05-23 23:14 UTC (permalink / raw)
To: Vlad Buslov
Cc: Jiri Pirko, Linux Kernel Network Developers, Jamal Hadi Salim,
David Miller, LKML
In-Reply-To: <1527065574-11299-1-git-send-email-vladbu@mellanox.com>
On Wed, May 23, 2018 at 1:52 AM, Vlad Buslov <vladbu@mellanox.com> wrote:
> Initial net_device implementation used ingress_lock spinlock to synchronize
> ingress path of device. This lock was used in both process and bh context.
> In some code paths action map lock was obtained while holding ingress_lock.
> Commit e1e992e52faa ("[NET_SCHED] protect action config/dump from irqs")
> modified actions to always disable bh, while using action map lock, in
> order to prevent deadlock on ingress_lock in softirq. This lock was removed
> in commit 555353cfa1ae ("netdev: The ingress_lock member is no longer
> needed.").
>
> Another reason to disable bh was filters delete code, that released actions
> in rcu callback. This code was changed to release actions from workqueue
> context in patch set "net_sched: close the race between call_rcu() and
> cleanup_net()".
>
> With these changes it is no longer necessary to continue disable bh while
> accessing action map.
>
> Replace all action idr spinlock usage with regular calls that do not
> disable bh.
Looks much better now!
I _guess_ we perhaps can even get rid of this spinlock since most of
the callers hold RTNL lock, not sure about the dump() path where
RTNL might be removed recently.
Anyway,
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
^ permalink raw reply
* [PATCH V4] mlx4_core: allocate ICM memory in page size chunks
From: Qing Huang @ 2018-05-23 23:22 UTC (permalink / raw)
To: tariqt, davem, haakon.bugge, yanjun.zhu
Cc: netdev, linux-rdma, linux-kernel, gi-oh.kim, Qing Huang
When a system is under memory presure (high usage with fragments),
the original 256KB ICM chunk allocations will likely trigger kernel
memory management to enter slow path doing memory compact/migration
ops in order to complete high order memory allocations.
When that happens, user processes calling uverb APIs may get stuck
for more than 120s easily even though there are a lot of free pages
in smaller chunks available in the system.
Syslog:
...
Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
oracle_205573_e:205573 blocked for more than 120 seconds.
...
With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
However in order to support smaller ICM chunk size, we need to fix
another issue in large size kcalloc allocations.
E.g.
Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
entry). So we need a 16MB allocation for a table->icm pointer array to
hold 2M pointers which can easily cause kcalloc to fail.
The solution is to use kvzalloc to replace kcalloc which will fall back
to vmalloc automatically if kmalloc fails.
Signed-off-by: Qing Huang <qing.huang@oracle.com>
Acked-by: Daniel Jurgens <danielj@mellanox.com>
Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
---
v4: use kvzalloc instead of vzalloc
add one err condition check
don't include vmalloc.h any more
v3: use PAGE_SIZE instead of PAGE_SHIFT
add comma to the end of enum variables
include vmalloc.h header file to avoid build issues on Sparc
v2: adjusted chunk size to reflect different architectures
drivers/net/ethernet/mellanox/mlx4/icm.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
index a822f7a..685337d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/icm.c
+++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
@@ -43,12 +43,12 @@
#include "fw.h"
/*
- * We allocate in as big chunks as we can, up to a maximum of 256 KB
- * per chunk.
+ * We allocate in page size (default 4KB on many archs) chunks to avoid high
+ * order memory allocations in fragmented/high usage memory situation.
*/
enum {
- MLX4_ICM_ALLOC_SIZE = 1 << 18,
- MLX4_TABLE_CHUNK_SIZE = 1 << 18
+ MLX4_ICM_ALLOC_SIZE = PAGE_SIZE,
+ MLX4_TABLE_CHUNK_SIZE = PAGE_SIZE,
};
static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
@@ -398,9 +398,11 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
u64 size;
obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
+ if (WARN_ON(!obj_per_chunk))
+ return -EINVAL;
num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
- table->icm = kcalloc(num_icm, sizeof(*table->icm), GFP_KERNEL);
+ table->icm = kvzalloc(num_icm * sizeof(*table->icm), GFP_KERNEL);
if (!table->icm)
return -ENOMEM;
table->virt = virt;
@@ -446,7 +448,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
mlx4_free_icm(dev, table->icm[i], use_coherent);
}
- kfree(table->icm);
+ kvfree(table->icm);
return -ENOMEM;
}
@@ -462,5 +464,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
mlx4_free_icm(dev, table->icm[i], table->coherent);
}
- kfree(table->icm);
+ kvfree(table->icm);
}
--
2.9.3
^ permalink raw reply related
* Re: [PATCH net-next v15 4/7] sch_cake: Add NAT awareness to packet classifier
From: Toke Høiland-Jørgensen @ 2018-05-23 23:25 UTC (permalink / raw)
To: Pablo Neira Ayuso; +Cc: netdev, cake, netfilter-devel
In-Reply-To: <20180523224653.mvxkibc4x37nbhha@salvia>
Pablo Neira Ayuso <pablo@netfilter.org> writes:
> On Tue, May 22, 2018 at 04:11:06PM +0200, Toke Høiland-Jørgensen wrote:
>> Pablo Neira Ayuso <pablo@netfilter.org> writes:
>>
>> > Hi Toke,
>> >
>> > On Tue, May 22, 2018 at 03:57:38PM +0200, Toke Høiland-Jørgensen wrote:
>> >> When CAKE is deployed on a gateway that also performs NAT (which is a
>> >> common deployment mode), the host fairness mechanism cannot distinguish
>> >> internal hosts from each other, and so fails to work correctly.
>> >>
>> >> To fix this, we add an optional NAT awareness mode, which will query the
>> >> kernel conntrack mechanism to obtain the pre-NAT addresses for each packet
>> >> and use that in the flow and host hashing.
>> >>
>> >> When the shaper is enabled and the host is already performing NAT, the cost
>> >> of this lookup is negligible. However, in unlimited mode with no NAT being
>> >> performed, there is a significant CPU cost at higher bandwidths. For this
>> >> reason, the feature is turned off by default.
>> >>
>> >> Cc: netfilter-devel@vger.kernel.org
>> >> Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
>> >> ---
>> >> net/sched/sch_cake.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++
>> >> 1 file changed, 79 insertions(+)
>> >>
>> >> diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
>> >> index 68ac908470f1..6f7cae705c84 100644
>> >> --- a/net/sched/sch_cake.c
>> >> +++ b/net/sched/sch_cake.c
>> >> @@ -71,6 +71,12 @@
>> >> #include <net/tcp.h>
>> >> #include <net/flow_dissector.h>
>> >>
>> >> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
>> >> +#include <net/netfilter/nf_conntrack_core.h>
>> >> +#include <net/netfilter/nf_conntrack_zones.h>
>> >> +#include <net/netfilter/nf_conntrack.h>
>> >> +#endif
>> >> +
>> >> #define CAKE_SET_WAYS (8)
>> >> #define CAKE_MAX_TINS (8)
>> >> #define CAKE_QUEUES (1024)
>> >> @@ -516,6 +522,60 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
>> >> return drop;
>> >> }
>> >>
>> >> +#if IS_REACHABLE(CONFIG_NF_CONNTRACK)
>> >> +
>> >> +static void cake_update_flowkeys(struct flow_keys *keys,
>> >> + const struct sk_buff *skb)
>> >> +{
>> >> + const struct nf_conntrack_tuple *tuple;
>> >> + enum ip_conntrack_info ctinfo;
>> >> + struct nf_conn *ct;
>> >> + bool rev = false;
>> >> +
>> >> + if (tc_skb_protocol(skb) != htons(ETH_P_IP))
>> >> + return;
>> >> +
>> >> + ct = nf_ct_get(skb, &ctinfo);
>> >> + if (ct) {
>> >> + tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
>> >> + } else {
>> >> + const struct nf_conntrack_tuple_hash *hash;
>> >> + struct nf_conntrack_tuple srctuple;
>> >> +
>> >> + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
>> >> + NFPROTO_IPV4, dev_net(skb->dev),
>> >> + &srctuple))
>> >> + return;
>> >> +
>> >> + hash = nf_conntrack_find_get(dev_net(skb->dev),
>> >> + &nf_ct_zone_dflt,
>> >> + &srctuple);
>> >> + if (!hash)
>> >> + return;
>> >> +
>> >> + rev = true;
>> >> + ct = nf_ct_tuplehash_to_ctrack(hash);
>> >> + tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
>> >> + }
>> >> +
>> >> + keys->addrs.v4addrs.src = rev ? tuple->dst.u3.ip : tuple->src.u3.ip;
>> >> + keys->addrs.v4addrs.dst = rev ? tuple->src.u3.ip : tuple->dst.u3.ip;
>> >> +
>> >> + if (keys->ports.ports) {
>> >> + keys->ports.src = rev ? tuple->dst.u.all : tuple->src.u.all;
>> >> + keys->ports.dst = rev ? tuple->src.u.all : tuple->dst.u.all;
>> >> + }
>> >> + if (rev)
>> >> + nf_ct_put(ct);
>> >> +}
>> >
>> > This is going to pull in the nf_conntrack module, even if you may not
>> > want it, as soon as cake is in place.
>>
>> Yeah, we are aware of that; we get a moddep on nf_conntrack. Our main
>> deployment scenario has been home routers where conntrack is used
>> anyway, so this has not been much of an issue. However, if there is a
>> way to avoid this, and instead detect at runtime if conntrack is
>> available, that would certainly be useful. Is there? :)
>
> Yes, there is.
>
> You place this function in net/netfilter/nf_conntrack_core.c, call it
> nf_conntrack_get_tuple() which internally uses a rcu hook for this.
> See nf_ct_attach() and ip_ct_attach() in net/netfilter/core.c for
> instance.
>
> This allows you to avoid the dependency with nf_conntrack (which would
> be only called if the module has been explicitly loaded), which is
> what you're searching for.
Ah, awesome! I'll look into that; thanks :)
-Toke
^ permalink raw reply
* Re: [PATCH RFC net-next 00/11] udp gso
From: Marcelo Ricardo Leitner @ 2018-05-24 0:02 UTC (permalink / raw)
To: Willem de Bruijn; +Cc: Paolo Abeni, Network Development, Willem de Bruijn
In-Reply-To: <CAF=yD-LaDvQdkE_BkZX7o1ukjyodWiwK=nJ5S=bTgJ-91KBhHg@mail.gmail.com>
On Wed, Apr 18, 2018 at 09:49:18AM -0400, Willem de Bruijn wrote:
> I just hacked up a sendmmsg extension to the benchmark to verify.
> Indeed that does not have nearly the same benefit as GSO:
>
> udp tx: 976 MB/s 695394 calls/s 16557 msg/s
>
> This matches the numbers seen from TCP without TSO and GSO.
> That also has few system calls, but observes per MTU stack traversal.
Reviving this old thread because it's the only place I saw sendmmsg
being mentioned.
sendmmsg shouldn't be considered as an alternative, but rather as a
complement. Then instead of the application building one large request
and request the stack to fragment it, it could simply build the
sendmmsg request and the stack would group the mmsg into a gso skb. It
seems more natural to the application. But well, both (sendmmsg and
the option to fragment) are Linux-specific..
For that we need sendmmsg to do something smarter than doing several
sendmsg calls, yes.
^ permalink raw reply
* [PATCH v2 net-next 1/3] net/ipv4: Udate fib_table_lookup tracepoint
From: dsahern @ 2018-05-24 0:08 UTC (permalink / raw)
To: netdev; +Cc: David Ahern
In-Reply-To: <20180524000849.6553-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
Commit 4a2d73a4fb36 ("ipv4: fib_rules: support match on sport, dport
and ip proto") added support for protocol and ports to FIB rules.
Update the FIB lookup tracepoint to dump the parameters.
In addition, make the IPv4 tracepoint similar to the IPv6 one where
the lookup parameters and result are dumped in 1 event. It is much
easier to use and understand the outcome of the lookup.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/trace/events/fib.h | 72 +++++++++++++++++++++++++++-------------------
net/ipv4/fib_trie.c | 14 +++++----
2 files changed, 52 insertions(+), 34 deletions(-)
diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index 81b7e985bb45..f5a1d4c518d8 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -12,12 +12,14 @@
TRACE_EVENT(fib_table_lookup,
- TP_PROTO(u32 tb_id, const struct flowi4 *flp),
+ TP_PROTO(u32 tb_id, const struct flowi4 *flp,
+ const struct fib_nh *nh, int err),
- TP_ARGS(tb_id, flp),
+ TP_ARGS(tb_id, flp, nh, err),
TP_STRUCT__entry(
__field( u32, tb_id )
+ __field( int, err )
__field( int, oif )
__field( int, iif )
__field( __u8, tos )
@@ -25,12 +27,19 @@ TRACE_EVENT(fib_table_lookup,
__field( __u8, flags )
__array( __u8, src, 4 )
__array( __u8, dst, 4 )
+ __array( __u8, gw, 4 )
+ __array( __u8, saddr, 4 )
+ __field( u16, sport )
+ __field( u16, dport )
+ __field( u8, proto )
+ __dynamic_array(char, name, IFNAMSIZ )
),
TP_fast_assign(
__be32 *p32;
__entry->tb_id = tb_id;
+ __entry->err = err;
__entry->oif = flp->flowi4_oif;
__entry->iif = flp->flowi4_iif;
__entry->tos = flp->flowi4_tos;
@@ -42,36 +51,41 @@ TRACE_EVENT(fib_table_lookup,
p32 = (__be32 *) __entry->dst;
*p32 = flp->daddr;
- ),
-
- TP_printk("table %u oif %d iif %d src %pI4 dst %pI4 tos %d scope %d flags %x",
- __entry->tb_id, __entry->oif, __entry->iif,
- __entry->src, __entry->dst, __entry->tos, __entry->scope,
- __entry->flags)
-);
-
-TRACE_EVENT(fib_table_lookup_nh,
-
- TP_PROTO(const struct fib_nh *nh),
-
- TP_ARGS(nh),
-
- TP_STRUCT__entry(
- __string( name, nh->nh_dev->name)
- __field( int, oif )
- __array( __u8, src, 4 )
- ),
-
- TP_fast_assign(
- __be32 *p32 = (__be32 *) __entry->src;
- __assign_str(name, nh->nh_dev ? nh->nh_dev->name : "not set");
- __entry->oif = nh->nh_oif;
- *p32 = nh->nh_saddr;
+ __entry->proto = flp->flowi4_proto;
+ if (__entry->proto == IPPROTO_TCP ||
+ __entry->proto == IPPROTO_UDP) {
+ __entry->sport = ntohs(flp->fl4_sport);
+ __entry->dport = ntohs(flp->fl4_dport);
+ } else {
+ __entry->sport = 0;
+ __entry->dport = 0;
+ }
+
+ if (nh) {
+ p32 = (__be32 *) __entry->saddr;
+ *p32 = nh->nh_saddr;
+
+ p32 = (__be32 *) __entry->gw;
+ *p32 = nh->nh_gw;
+
+ __assign_str(name, nh->nh_dev ? nh->nh_dev->name : "-");
+ } else {
+ p32 = (__be32 *) __entry->saddr;
+ *p32 = 0;
+
+ p32 = (__be32 *) __entry->gw;
+ *p32 = 0;
+
+ __assign_str(name, "-");
+ }
),
- TP_printk("nexthop dev %s oif %d src %pI4",
- __get_str(name), __entry->oif, __entry->src)
+ TP_printk("table %u oif %d iif %d proto %u %pI4/%u -> %pI4/%u tos %d scope %d flags %x ==> dev %s gw %pI4 src %pI4 err %d",
+ __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
+ __entry->src, __entry->sport, __entry->dst, __entry->dport,
+ __entry->tos, __entry->scope, __entry->flags,
+ __get_str(name), __entry->gw, __entry->saddr, __entry->err)
);
TRACE_EVENT(fib_validate_source,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3dcffd3ce98c..65c340f230ae 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1326,14 +1326,14 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
unsigned long index;
t_key cindex;
- trace_fib_table_lookup(tb->tb_id, flp);
-
pn = t->kv;
cindex = 0;
n = get_child_rcu(pn, cindex);
- if (!n)
+ if (!n) {
+ trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
return -EAGAIN;
+ }
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->gets);
@@ -1416,8 +1416,11 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
* nothing for us to do as we do not have any
* further nodes to parse.
*/
- if (IS_TRIE(pn))
+ if (IS_TRIE(pn)) {
+ trace_fib_table_lookup(tb->tb_id, flp,
+ NULL, -EAGAIN);
return -EAGAIN;
+ }
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->backtrack);
#endif
@@ -1459,6 +1462,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
+ trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
return err;
}
if (fi->fib_flags & RTNH_F_DEAD)
@@ -1494,7 +1498,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
- trace_fib_table_lookup_nh(nh);
+ trace_fib_table_lookup(tb->tb_id, flp, nh, err);
return err;
}
--
2.11.0
^ permalink raw reply related
* [PATCH v2 net-next 3/3] net/ipv4: Remove tracepoint in fib_validate_source
From: dsahern @ 2018-05-24 0:08 UTC (permalink / raw)
To: netdev; +Cc: David Ahern
In-Reply-To: <20180524000849.6553-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
Tracepoint does not add value and the call to fib_lookup follows
it which shows the same information and the fib lookup result.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/trace/events/fib.h | 35 -----------------------------------
net/ipv4/fib_frontend.c | 2 --
2 files changed, 37 deletions(-)
diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index f5a1d4c518d8..9763cddd0594 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -87,41 +87,6 @@ TRACE_EVENT(fib_table_lookup,
__entry->tos, __entry->scope, __entry->flags,
__get_str(name), __entry->gw, __entry->saddr, __entry->err)
);
-
-TRACE_EVENT(fib_validate_source,
-
- TP_PROTO(const struct net_device *dev, const struct flowi4 *flp),
-
- TP_ARGS(dev, flp),
-
- TP_STRUCT__entry(
- __string( name, dev->name )
- __field( int, oif )
- __field( int, iif )
- __field( __u8, tos )
- __array( __u8, src, 4 )
- __array( __u8, dst, 4 )
- ),
-
- TP_fast_assign(
- __be32 *p32;
-
- __assign_str(name, dev ? dev->name : "not set");
- __entry->oif = flp->flowi4_oif;
- __entry->iif = flp->flowi4_iif;
- __entry->tos = flp->flowi4_tos;
-
- p32 = (__be32 *) __entry->src;
- *p32 = flp->saddr;
-
- p32 = (__be32 *) __entry->dst;
- *p32 = flp->daddr;
- ),
-
- TP_printk("dev %s oif %d iif %d tos %d src %pI4 dst %pI4",
- __get_str(name), __entry->oif, __entry->iif, __entry->tos,
- __entry->src, __entry->dst)
-);
#endif /* _TRACE_FIB_H */
/* This part must be outside protection */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4d622112bf95..58696b829065 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -354,8 +354,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.fl4_dport = 0;
}
- trace_fib_validate_source(dev, &fl4);
-
if (fib_lookup(net, &fl4, &res, 0))
goto last_resort;
if (res.type != RTN_UNICAST &&
--
2.11.0
^ permalink raw reply related
* [PATCH v2 net-next 0/3] net: Update fib_table_lookup tracepoints
From: dsahern @ 2018-05-24 0:08 UTC (permalink / raw)
To: netdev; +Cc: David Ahern
From: David Ahern <dsahern@gmail.com>
Update the FIB lookup tracepoints to include ip proto and port fields
from the flow struct. In the process make the IPv4 tracepoint inline
with IPv6 which is much easier to use and follow the lookup and result.
Remove the tracepoint in fib_validate_source which does not provide
value above the fib_table_lookup which immediately follows it.
v2
- move CREATE_TRACE_POINTS for the v6 tracepoint to route.c to handle
its need for an internal function to convert route type to error and
handle IPv6 as a module or builtin. Reported by kbuild robot.
David Ahern (3):
net/ipv4: Udate fib_table_lookup tracepoint
net/ipv6: Udate fib6_table_lookup tracepoint
net/ipv4: Remove tracepoint in fib_validate_source
include/trace/events/fib.h | 107 ++++++++++++++++++--------------------------
include/trace/events/fib6.h | 29 +++++++++---
net/core/net-traces.c | 4 --
net/ipv4/fib_frontend.c | 2 -
net/ipv4/fib_trie.c | 14 +++---
net/ipv6/route.c | 9 +++-
6 files changed, 81 insertions(+), 84 deletions(-)
--
2.11.0
^ permalink raw reply
* [PATCH v2 net-next 2/3] net/ipv6: Udate fib6_table_lookup tracepoint
From: dsahern @ 2018-05-24 0:08 UTC (permalink / raw)
To: netdev; +Cc: David Ahern
In-Reply-To: <20180524000849.6553-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
Commit bb0ad1987e96 ("ipv6: fib6_rules: support for match on sport, dport
and ip proto") added support for protocol and ports to FIB rules.
Update the FIB lookup tracepoint to dump the parameters.
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/trace/events/fib6.h | 29 ++++++++++++++++++++++-------
net/core/net-traces.c | 4 ----
net/ipv6/route.c | 9 +++++++--
3 files changed, 29 insertions(+), 13 deletions(-)
diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 1b8d951e3c12..b088b54d699c 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -19,7 +19,7 @@ TRACE_EVENT(fib6_table_lookup,
TP_STRUCT__entry(
__field( u32, tb_id )
-
+ __field( int, err )
__field( int, oif )
__field( int, iif )
__field( __u8, tos )
@@ -27,7 +27,10 @@ TRACE_EVENT(fib6_table_lookup,
__field( __u8, flags )
__array( __u8, src, 16 )
__array( __u8, dst, 16 )
-
+ __field( u16, sport )
+ __field( u16, dport )
+ __field( u8, proto )
+ __field( u8, rt_type )
__dynamic_array( char, name, IFNAMSIZ )
__array( __u8, gw, 16 )
),
@@ -36,6 +39,7 @@ TRACE_EVENT(fib6_table_lookup,
struct in6_addr *in6;
__entry->tb_id = table->tb6_id;
+ __entry->err = ip6_rt_type_to_error(f6i->fib6_type);
__entry->oif = flp->flowi6_oif;
__entry->iif = flp->flowi6_iif;
__entry->tos = ip6_tclass(flp->flowlabel);
@@ -48,10 +52,20 @@ TRACE_EVENT(fib6_table_lookup,
in6 = (struct in6_addr *)__entry->dst;
*in6 = flp->daddr;
+ __entry->proto = flp->flowi6_proto;
+ if (__entry->proto == IPPROTO_TCP ||
+ __entry->proto == IPPROTO_UDP) {
+ __entry->sport = ntohs(flp->fl6_sport);
+ __entry->dport = ntohs(flp->fl6_dport);
+ } else {
+ __entry->sport = 0;
+ __entry->dport = 0;
+ }
+
if (f6i->fib6_nh.nh_dev) {
__assign_str(name, f6i->fib6_nh.nh_dev);
} else {
- __assign_str(name, "");
+ __assign_str(name, "-");
}
if (f6i == net->ipv6.fib6_null_entry) {
struct in6_addr in6_zero = {};
@@ -65,10 +79,11 @@ TRACE_EVENT(fib6_table_lookup,
}
),
- TP_printk("table %3u oif %d iif %d src %pI6c dst %pI6c tos %d scope %d flags %x ==> dev %s gw %pI6c",
- __entry->tb_id, __entry->oif, __entry->iif,
- __entry->src, __entry->dst, __entry->tos, __entry->scope,
- __entry->flags, __get_str(name), __entry->gw)
+ TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d",
+ __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
+ __entry->src, __entry->sport, __entry->dst, __entry->dport,
+ __entry->tos, __entry->scope, __entry->flags,
+ __get_str(name), __entry->gw, __entry->err)
);
#endif /* _TRACE_FIB6_H */
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 380934580fa1..419af6dfe29f 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -35,10 +35,6 @@
#include <trace/events/tcp.h>
#include <trace/events/fib.h>
#include <trace/events/qdisc.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <trace/events/fib6.h>
-EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
-#endif
#if IS_ENABLED(CONFIG_BRIDGE)
#include <trace/events/bridge.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bcb8785c0451..8d9e460ff88b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -63,14 +63,19 @@
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
-#include <trace/events/fib6.h>
-
#include <linux/uaccess.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
+static int ip6_rt_type_to_error(u8 fib6_type);
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/fib6.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
+#undef CREATE_TRACE_POINTS
+
enum rt6_nud_state {
RT6_NUD_FAIL_HARD = -3,
RT6_NUD_FAIL_PROBE = -2,
--
2.11.0
^ permalink raw reply related
* [PATCH bpf-next v4 4/7] tools/bpf: add ksym_get_addr() in trace_helpers
From: Yonghong Song @ 2018-05-24 0:18 UTC (permalink / raw)
To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524001844.1175727-1-yhs@fb.com>
Given a kernel function name, ksym_get_addr() will return the kernel
address for this function, or 0 if it cannot find this function name
in /proc/kallsyms. This function will be used later when a kernel
address is used to initiate a kprobe perf event.
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
tools/testing/selftests/bpf/trace_helpers.c | 12 ++++++++++++
tools/testing/selftests/bpf/trace_helpers.h | 1 +
2 files changed, 13 insertions(+)
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 8fb4fe8..3868dcb 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -72,6 +72,18 @@ struct ksym *ksym_search(long key)
return &syms[0];
}
+long ksym_get_addr(const char *name)
+{
+ int i;
+
+ for (i = 0; i < sym_cnt; i++) {
+ if (strcmp(syms[i].name, name) == 0)
+ return syms[i].addr;
+ }
+
+ return 0;
+}
+
static int page_size;
static int page_cnt = 8;
static struct perf_event_mmap_page *header;
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index 36d90e3..3b4bcf7 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -11,6 +11,7 @@ struct ksym {
int load_kallsyms(void);
struct ksym *ksym_search(long key);
+long ksym_get_addr(const char *name);
typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v4 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
From: Yonghong Song @ 2018-05-24 0:18 UTC (permalink / raw)
To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524001844.1175727-1-yhs@fb.com>
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.
There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.
This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
. prog_id
. tracepoint name, or
. k[ret]probe funcname + offset or kernel addr, or
. u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
include/linux/trace_events.h | 17 +++++++
include/uapi/linux/bpf.h | 26 ++++++++++
kernel/bpf/syscall.c | 115 +++++++++++++++++++++++++++++++++++++++++++
kernel/trace/bpf_trace.c | 48 ++++++++++++++++++
kernel/trace/trace_kprobe.c | 29 +++++++++++
kernel/trace/trace_uprobe.c | 22 +++++++++
6 files changed, 257 insertions(+)
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bde3ef..d34144a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+ u32 *fd_type, const char **buf,
+ u64 *probe_offset, u64 *probe_addr);
#else
static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
{
@@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
{
return NULL;
}
+static inline int bpf_get_perf_event_info(const struct perf_event *event,
+ u32 *prog_id, u32 *fd_type,
+ const char **buf, u64 *probe_offset,
+ u64 *probe_addr)
+{
+ return -EOPNOTSUPP;
+}
#endif
enum {
@@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
#ifdef CONFIG_KPROBE_EVENTS
extern int perf_kprobe_init(struct perf_event *event, bool is_retprobe);
extern void perf_kprobe_destroy(struct perf_event *event);
+extern int bpf_get_kprobe_info(const struct perf_event *event,
+ u32 *fd_type, const char **symbol,
+ u64 *probe_offset, u64 *probe_addr,
+ bool perf_type_tracepoint);
#endif
#ifdef CONFIG_UPROBE_EVENTS
extern int perf_uprobe_init(struct perf_event *event, bool is_retprobe);
extern void perf_uprobe_destroy(struct perf_event *event);
+extern int bpf_get_uprobe_info(const struct perf_event *event,
+ u32 *fd_type, const char **filename,
+ u64 *probe_offset, bool perf_type_tracepoint);
#endif
extern int ftrace_profile_set_filter(struct perf_event *event, int event_id,
char *filter_str);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c3e502d..0d51946 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+ BPF_TASK_FD_QUERY,
};
enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
__u32 btf_log_size;
__u32 btf_log_level;
};
+
+ struct {
+ __u32 pid; /* input: pid */
+ __u32 fd; /* input: fd */
+ __u32 flags; /* input: flags */
+ __u32 buf_len; /* input/output: buf len */
+ __aligned_u64 buf; /* input/output:
+ * tp_name for tracepoint
+ * symbol for kprobe
+ * filename for uprobe
+ */
+ __u32 prog_id; /* output: prod_id */
+ __u32 fd_type; /* output: BPF_FD_TYPE_* */
+ __u64 probe_offset; /* output: probe_offset */
+ __u64 probe_addr; /* output: probe_addr */
+ } task_fd_query;
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
@@ -2458,4 +2475,13 @@ struct bpf_fib_lookup {
__u8 dmac[6]; /* ETH_ALEN */
};
+enum bpf_task_fd_type {
+ BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */
+ BPF_FD_TYPE_TRACEPOINT, /* tp name */
+ BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */
+ BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */
+ BPF_FD_TYPE_UPROBE, /* filename + offset */
+ BPF_FD_TYPE_URETPROBE, /* filename + offset */
+};
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0b4c945..7dd8c86 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,7 +18,9 @@
#include <linux/vmalloc.h>
#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
+#include <linux/fdtable.h>
#include <linux/file.h>
+#include <linux/fs.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>
@@ -2102,6 +2104,116 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
return btf_get_fd_by_id(attr->btf_id);
}
+static int bpf_task_fd_query_copy(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ u32 prog_id, u32 fd_type,
+ const char *buf, u64 probe_offset,
+ u64 probe_addr)
+{
+ void __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
+ u32 len = buf ? strlen(buf) + 1 : 0, input_len;
+ int err = 0;
+
+ if (put_user(len, &uattr->task_fd_query.buf_len))
+ return -EFAULT;
+ input_len = attr->task_fd_query.buf_len;
+ if (input_len && len && ubuf) {
+ if (input_len < len) {
+ err = -ENOSPC;
+ len = input_len;
+ }
+ if (copy_to_user(ubuf, buf, len))
+ return -EFAULT;
+ }
+
+ if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
+ put_user(fd_type, &uattr->task_fd_query.fd_type) ||
+ put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
+ put_user(probe_addr, &uattr->task_fd_query.probe_addr))
+ return -EFAULT;
+
+ return err;
+}
+
+#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
+
+static int bpf_task_fd_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ pid_t pid = attr->task_fd_query.pid;
+ u32 fd = attr->task_fd_query.fd;
+ const struct perf_event *event;
+ struct files_struct *files;
+ struct task_struct *task;
+ struct file *file;
+ int err;
+
+ if (CHECK_ATTR(BPF_TASK_FD_QUERY))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (attr->task_fd_query.flags != 0)
+ return -EINVAL;
+
+ task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+ if (!task)
+ return -ENOENT;
+
+ files = get_files_struct(task);
+ put_task_struct(task);
+ if (!files)
+ return -ENOENT;
+
+ err = 0;
+ spin_lock(&files->file_lock);
+ file = fcheck_files(files, fd);
+ if (!file)
+ err = -EBADF;
+ else
+ get_file(file);
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+
+ if (err)
+ goto out;
+
+ if (file->f_op == &bpf_raw_tp_fops) {
+ struct bpf_raw_tracepoint *raw_tp = file->private_data;
+ struct bpf_raw_event_map *btp = raw_tp->btp;
+
+ err = bpf_task_fd_query_copy(attr, uattr,
+ raw_tp->prog->aux->id,
+ BPF_FD_TYPE_RAW_TRACEPOINT,
+ btp->tp->name, 0, 0);
+ goto put_file;
+ }
+
+ event = perf_get_event(file);
+ if (!IS_ERR(event)) {
+ u64 probe_offset, probe_addr;
+ u32 prog_id, fd_type;
+ const char *buf;
+
+ err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
+ &buf, &probe_offset,
+ &probe_addr);
+ if (!err)
+ err = bpf_task_fd_query_copy(attr, uattr, prog_id,
+ fd_type, buf,
+ probe_offset,
+ probe_addr);
+ goto put_file;
+ }
+
+ err = -ENOTSUPP;
+put_file:
+ fput(file);
+out:
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
@@ -2188,6 +2300,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_BTF_GET_FD_BY_ID:
err = bpf_btf_get_fd_by_id(&attr);
break;
+ case BPF_TASK_FD_QUERY:
+ err = bpf_task_fd_query(&attr, uattr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ce2cbbf..81fdf2f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,6 +14,7 @@
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/kprobes.h>
+#include <linux/syscalls.h>
#include <linux/error-injection.h>
#include "trace_probe.h"
@@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
mutex_unlock(&bpf_event_mutex);
return err;
}
+
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+ u32 *fd_type, const char **buf,
+ u64 *probe_offset, u64 *probe_addr)
+{
+ bool is_tracepoint, is_syscall_tp;
+ struct bpf_prog *prog;
+ int flags, err = 0;
+
+ prog = event->prog;
+ if (!prog)
+ return -ENOENT;
+
+ /* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
+ if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
+ return -EOPNOTSUPP;
+
+ *prog_id = prog->aux->id;
+ flags = event->tp_event->flags;
+ is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
+ is_syscall_tp = is_syscall_trace_event(event->tp_event);
+
+ if (is_tracepoint || is_syscall_tp) {
+ *buf = is_tracepoint ? event->tp_event->tp->name
+ : event->tp_event->name;
+ *fd_type = BPF_FD_TYPE_TRACEPOINT;
+ *probe_offset = 0x0;
+ *probe_addr = 0x0;
+ } else {
+ /* kprobe/uprobe */
+ err = -EOPNOTSUPP;
+#ifdef CONFIG_KPROBE_EVENTS
+ if (flags & TRACE_EVENT_FL_KPROBE)
+ err = bpf_get_kprobe_info(event, fd_type, buf,
+ probe_offset, probe_addr,
+ event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+ if (flags & TRACE_EVENT_FL_UPROBE)
+ err = bpf_get_uprobe_info(event, fd_type, buf,
+ probe_offset,
+ event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+ }
+
+ return err;
+}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 02aed76..daa8157 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
head, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
+
+int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
+ const char **symbol, u64 *probe_offset,
+ u64 *probe_addr, bool perf_type_tracepoint)
+{
+ const char *pevent = trace_event_name(event->tp_event);
+ const char *group = event->tp_event->class->system;
+ struct trace_kprobe *tk;
+
+ if (perf_type_tracepoint)
+ tk = find_trace_kprobe(pevent, group);
+ else
+ tk = event->tp_event->data;
+ if (!tk)
+ return -EINVAL;
+
+ *fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE
+ : BPF_FD_TYPE_KPROBE;
+ if (tk->symbol) {
+ *symbol = tk->symbol;
+ *probe_offset = tk->rp.kp.offset;
+ *probe_addr = 0;
+ } else {
+ *symbol = NULL;
+ *probe_offset = 0;
+ *probe_addr = (unsigned long)tk->rp.kp.addr;
+ }
+ return 0;
+}
#endif /* CONFIG_PERF_EVENTS */
/*
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index ac89287..bf89a51 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
{
__uprobe_perf_func(tu, func, regs, ucb, dsize);
}
+
+int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
+ const char **filename, u64 *probe_offset,
+ bool perf_type_tracepoint)
+{
+ const char *pevent = trace_event_name(event->tp_event);
+ const char *group = event->tp_event->class->system;
+ struct trace_uprobe *tu;
+
+ if (perf_type_tracepoint)
+ tu = find_probe_event(pevent, group);
+ else
+ tu = event->tp_event->data;
+ if (!tu)
+ return -EINVAL;
+
+ *fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE
+ : BPF_FD_TYPE_UPROBE;
+ *filename = tu->filename;
+ *probe_offset = tu->offset;
+ return 0;
+}
#endif /* CONFIG_PERF_EVENTS */
static int
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v4 3/7] tools/bpf: sync kernel header bpf.h and add bpf_task_fd_query in libbpf
From: Yonghong Song @ 2018-05-24 0:18 UTC (permalink / raw)
To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524001844.1175727-1-yhs@fb.com>
Sync kernel header bpf.h to tools/include/uapi/linux/bpf.h and
implement bpf_task_fd_query() in libbpf. The test programs
in samples/bpf and tools/testing/selftests/bpf, and later bpftool
will use this libbpf function to query kernel.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
tools/include/uapi/linux/bpf.h | 26 ++++++++++++++++++++++++++
tools/lib/bpf/bpf.c | 23 +++++++++++++++++++++++
tools/lib/bpf/bpf.h | 3 +++
3 files changed, 52 insertions(+)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c3e502d..0d51946 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
BPF_RAW_TRACEPOINT_OPEN,
BPF_BTF_LOAD,
BPF_BTF_GET_FD_BY_ID,
+ BPF_TASK_FD_QUERY,
};
enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
__u32 btf_log_size;
__u32 btf_log_level;
};
+
+ struct {
+ __u32 pid; /* input: pid */
+ __u32 fd; /* input: fd */
+ __u32 flags; /* input: flags */
+ __u32 buf_len; /* input/output: buf len */
+ __aligned_u64 buf; /* input/output:
+ * tp_name for tracepoint
+ * symbol for kprobe
+ * filename for uprobe
+ */
+ __u32 prog_id; /* output: prod_id */
+ __u32 fd_type; /* output: BPF_FD_TYPE_* */
+ __u64 probe_offset; /* output: probe_offset */
+ __u64 probe_addr; /* output: probe_addr */
+ } task_fd_query;
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
@@ -2458,4 +2475,13 @@ struct bpf_fib_lookup {
__u8 dmac[6]; /* ETH_ALEN */
};
+enum bpf_task_fd_type {
+ BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */
+ BPF_FD_TYPE_TRACEPOINT, /* tp name */
+ BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */
+ BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */
+ BPF_FD_TYPE_UPROBE, /* filename + offset */
+ BPF_FD_TYPE_URETPROBE, /* filename + offset */
+};
+
#endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 442b4cd..9ddc89d 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -643,3 +643,26 @@ int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
return fd;
}
+
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
+ __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset,
+ __u64 *probe_addr)
+{
+ union bpf_attr attr = {};
+ int err;
+
+ attr.task_fd_query.pid = pid;
+ attr.task_fd_query.fd = fd;
+ attr.task_fd_query.flags = flags;
+ attr.task_fd_query.buf = ptr_to_u64(buf);
+ attr.task_fd_query.buf_len = *buf_len;
+
+ err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr));
+ *buf_len = attr.task_fd_query.buf_len;
+ *prog_id = attr.task_fd_query.prog_id;
+ *fd_type = attr.task_fd_query.fd_type;
+ *probe_offset = attr.task_fd_query.probe_offset;
+ *probe_addr = attr.task_fd_query.probe_addr;
+
+ return err;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index d12344f..0639a30 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -107,4 +107,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
int bpf_raw_tracepoint_open(const char *name, int prog_fd);
int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
bool do_log);
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
+ __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset,
+ __u64 *probe_addr);
#endif
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v4 0/7] bpf: implement BPF_TASK_FD_QUERY
From: Yonghong Song @ 2018-05-24 0:18 UTC (permalink / raw)
To: peterz, ast, daniel, netdev; +Cc: kernel-team
Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.
There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.
This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, this command will return bpf related information
to user space. Right now it only supports tracepoint/kprobe/uprobe
perf event fd's. For such a fd, BPF_TASK_FD_QUERY will return
. prog_id
. tracepoint name, or
. k[ret]probe funcname + offset or kernel addr, or
. u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.
Patch #1 adds function perf_get_event() in kernel/events/core.c.
Patch #2 implements the bpf subcommand BPF_TASK_FD_QUERY.
Patch #3 syncs tools bpf.h header and also add bpf_task_fd_query()
in the libbpf library for samples/selftests/bpftool to use.
Patch #4 adds ksym_get_addr() utility function.
Patch #5 add a test in samples/bpf for querying k[ret]probes and
u[ret]probes.
Patch #6 add a test in tools/testing/selftests/bpf for querying
raw_tracepoint and tracepoint.
Patch #7 add a new subcommand "perf" to bpftool.
Changelogs:
v3 -> v4:
. made attr buf_len input/output. The length of
actual buffter is written to buf_len so user space knows
what is actually needed. If user provides a buffer
with length >= 1 but less than required, do partial
copy and return -ENOSPC.
. code simplification with put_user.
. changed query result attach_info to fd_type.
. add tests at selftests/bpf to test zero len, null buf and
insufficient buf.
v2 -> v3:
. made perf_get_event() return perf_event pointer const.
this was to ensure that event fields are not meddled.
. detect whether newly BPF_TASK_FD_QUERY is supported or
not in "bpftool perf" and warn users if it is not.
v1 -> v2:
. changed bpf subcommand name from BPF_PERF_EVENT_QUERY
to BPF_TASK_FD_QUERY.
. fixed various "bpftool perf" issues and added documentation
and auto-completion.
Yonghong Song (7):
perf/core: add perf_get_event() to return perf_event given a struct
file
bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
tools/bpf: sync kernel header bpf.h and add bpf_task_fd_query in
libbpf
tools/bpf: add ksym_get_addr() in trace_helpers
samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY
tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs
tools/bpftool: add perf subcommand
include/linux/perf_event.h | 5 +
include/linux/trace_events.h | 17 +
include/uapi/linux/bpf.h | 26 ++
kernel/bpf/syscall.c | 115 +++++++
kernel/events/core.c | 8 +
kernel/trace/bpf_trace.c | 48 +++
kernel/trace/trace_kprobe.c | 29 ++
kernel/trace/trace_uprobe.c | 22 ++
samples/bpf/Makefile | 4 +
samples/bpf/task_fd_query_kern.c | 19 ++
samples/bpf/task_fd_query_user.c | 382 +++++++++++++++++++++++
tools/bpf/bpftool/Documentation/bpftool-perf.rst | 81 +++++
tools/bpf/bpftool/Documentation/bpftool.rst | 5 +-
tools/bpf/bpftool/bash-completion/bpftool | 9 +
tools/bpf/bpftool/main.c | 3 +-
tools/bpf/bpftool/main.h | 1 +
tools/bpf/bpftool/perf.c | 246 +++++++++++++++
tools/include/uapi/linux/bpf.h | 26 ++
tools/lib/bpf/bpf.c | 23 ++
tools/lib/bpf/bpf.h | 3 +
tools/testing/selftests/bpf/test_progs.c | 158 ++++++++++
tools/testing/selftests/bpf/trace_helpers.c | 12 +
tools/testing/selftests/bpf/trace_helpers.h | 1 +
23 files changed, 1241 insertions(+), 2 deletions(-)
create mode 100644 samples/bpf/task_fd_query_kern.c
create mode 100644 samples/bpf/task_fd_query_user.c
create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
create mode 100644 tools/bpf/bpftool/perf.c
--
2.9.5
^ permalink raw reply
* [PATCH bpf-next v4 1/7] perf/core: add perf_get_event() to return perf_event given a struct file
From: Yonghong Song @ 2018-05-24 0:18 UTC (permalink / raw)
To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524001844.1175727-1-yhs@fb.com>
A new extern function, perf_get_event(), is added to return a perf event
given a struct file. This function will be used in later patches.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
include/linux/perf_event.h | 5 +++++
kernel/events/core.c | 8 ++++++++
2 files changed, 13 insertions(+)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99e..eec302b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
extern struct file *perf_event_get(unsigned int fd);
+extern const struct perf_event *perf_get_event(struct file *file);
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
@@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct task_struct *child) { }
static inline void perf_event_free_task(struct task_struct *task) { }
static inline void perf_event_delayed_put(struct task_struct *task) { }
static inline struct file *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); }
+static inline const struct perf_event *perf_get_event(struct file *file)
+{
+ return ERR_PTR(-EINVAL);
+}
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
return ERR_PTR(-EINVAL);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67612ce..6eeab86 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
return file;
}
+const struct perf_event *perf_get_event(struct file *file)
+{
+ if (file->f_op != &perf_fops)
+ return ERR_PTR(-EINVAL);
+
+ return file->private_data;
+}
+
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
if (!event)
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v4 7/7] tools/bpftool: add perf subcommand
From: Yonghong Song @ 2018-05-24 0:20 UTC (permalink / raw)
To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524002020.1182296-1-yhs@fb.com>
The new command "bpftool perf [show | list]" will traverse
all processes under /proc, and if any fd is associated
with a perf event, it will print out related perf event
information. Documentation is also added.
Below is an example to show the results using bcc commands.
Running the following 4 bcc commands:
kprobe: trace.py '__x64_sys_nanosleep'
kretprobe: trace.py 'r::__x64_sys_nanosleep'
tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
uprobe: trace.py 'p:/home/yhs/a.out:main'
The bpftool command line and result:
$ bpftool perf
pid 21711 fd 5: prog_id 5 kprobe func __x64_sys_write offset 0
pid 21765 fd 5: prog_id 7 kretprobe func __x64_sys_nanosleep offset 0
pid 21767 fd 5: prog_id 8 tracepoint sys_enter_nanosleep
pid 21800 fd 5: prog_id 9 uprobe filename /home/yhs/a.out offset 1159
$ bpftool -j perf
[{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0}, \
{"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \
{"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \
{"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}]
$ bpftool prog
5: kprobe name probe___x64_sys tag e495a0c82f2c7a8d gpl
loaded_at 2018-05-15T04:46:37-0700 uid 0
xlated 200B not jited memlock 4096B map_ids 4
7: kprobe name probe___x64_sys tag f2fdee479a503abf gpl
loaded_at 2018-05-15T04:48:32-0700 uid 0
xlated 200B not jited memlock 4096B map_ids 7
8: tracepoint name tracepoint__sys tag 5390badef2395fcf gpl
loaded_at 2018-05-15T04:48:48-0700 uid 0
xlated 200B not jited memlock 4096B map_ids 8
9: kprobe name probe_main_1 tag 0a87bdc2e2953b6d gpl
loaded_at 2018-05-15T04:49:52-0700 uid 0
xlated 200B not jited memlock 4096B map_ids 9
$ ps ax | grep "python ./trace.py"
21711 pts/0 T 0:03 python ./trace.py __x64_sys_write
21765 pts/0 S+ 0:00 python ./trace.py r::__x64_sys_nanosleep
21767 pts/2 S+ 0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
21800 pts/3 S+ 0:00 python ./trace.py p:/home/yhs/a.out:main
22374 pts/1 S+ 0:00 grep --color=auto python ./trace.py
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
tools/bpf/bpftool/Documentation/bpftool-perf.rst | 81 ++++++++
tools/bpf/bpftool/Documentation/bpftool.rst | 5 +-
tools/bpf/bpftool/bash-completion/bpftool | 9 +
tools/bpf/bpftool/main.c | 3 +-
tools/bpf/bpftool/main.h | 1 +
tools/bpf/bpftool/perf.c | 246 +++++++++++++++++++++++
6 files changed, 343 insertions(+), 2 deletions(-)
create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
create mode 100644 tools/bpf/bpftool/perf.c
diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
new file mode 100644
index 0000000..e3eb0ea
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -0,0 +1,81 @@
+================
+bpftool-perf
+================
+-------------------------------------------------------------------------------
+tool for inspection of perf related bpf prog attachments
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+ **bpftool** [*OPTIONS*] **perf** *COMMAND*
+
+ *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
+
+ *COMMANDS* :=
+ { **show** | **list** | **help** }
+
+PERF COMMANDS
+=============
+
+| **bpftool** **perf { show | list }**
+| **bpftool** **perf help**
+
+DESCRIPTION
+===========
+ **bpftool perf { show | list }**
+ List all raw_tracepoint, tracepoint, kprobe attachment in the system.
+
+ Output will start with process id and file descriptor in that process,
+ followed by bpf program id, attachment information, and attachment point.
+ The attachment point for raw_tracepoint/tracepoint is the trace probe name.
+ The attachment point for k[ret]probe is either symbol name and offset,
+ or a kernel virtual address.
+ The attachment point for u[ret]probe is the file name and the file offset.
+
+ **bpftool perf help**
+ Print short help message.
+
+OPTIONS
+=======
+ -h, --help
+ Print short generic help message (similar to **bpftool help**).
+
+ -v, --version
+ Print version number (similar to **bpftool version**).
+
+ -j, --json
+ Generate JSON output. For commands that cannot produce JSON, this
+ option has no effect.
+
+ -p, --pretty
+ Generate human-readable JSON output. Implies **-j**.
+
+EXAMPLES
+========
+
+| **# bpftool perf**
+
+::
+
+ pid 21711 fd 5: prog_id 5 kprobe func __x64_sys_write offset 0
+ pid 21765 fd 5: prog_id 7 kretprobe func __x64_sys_nanosleep offset 0
+ pid 21767 fd 5: prog_id 8 tracepoint sys_enter_nanosleep
+ pid 21800 fd 5: prog_id 9 uprobe filename /home/yhs/a.out offset 1159
+
+|
+| **# bpftool -j perf**
+
+::
+
+ [{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0}, \
+ {"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \
+ {"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \
+ {"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}]
+
+
+SEE ALSO
+========
+ **bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 564cb0d..b6f5d56 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -16,7 +16,7 @@ SYNOPSIS
**bpftool** **version**
- *OBJECT* := { **map** | **program** | **cgroup** }
+ *OBJECT* := { **map** | **program** | **cgroup** | **perf** }
*OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** }
| { **-j** | **--json** } [{ **-p** | **--pretty** }] }
@@ -30,6 +30,8 @@ SYNOPSIS
*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** }
+ *PERF-COMMANDS* := { **show** | **list** | **help** }
+
DESCRIPTION
===========
*bpftool* allows for inspection and simple modification of BPF objects
@@ -56,3 +58,4 @@ OPTIONS
SEE ALSO
========
**bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
+ **bpftool-perf**\ (8)
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index b301c9b..7bc198d 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -448,6 +448,15 @@ _bpftool()
;;
esac
;;
+ perf)
+ case $command in
+ *)
+ [[ $prev == $object ]] && \
+ COMPREPLY=( $( compgen -W 'help \
+ show list' -- "$cur" ) )
+ ;;
+ esac
+ ;;
esac
} &&
complete -F _bpftool bpftool
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 1ec852d..eea7f14 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -87,7 +87,7 @@ static int do_help(int argc, char **argv)
" %s batch file FILE\n"
" %s version\n"
"\n"
- " OBJECT := { prog | map | cgroup }\n"
+ " OBJECT := { prog | map | cgroup | perf }\n"
" " HELP_SPEC_OPTIONS "\n"
"",
bin_name, bin_name, bin_name);
@@ -216,6 +216,7 @@ static const struct cmd cmds[] = {
{ "prog", do_prog },
{ "map", do_map },
{ "cgroup", do_cgroup },
+ { "perf", do_perf },
{ "version", do_version },
{ 0 }
};
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 6173cd9..63fdb31 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -119,6 +119,7 @@ int do_prog(int argc, char **arg);
int do_map(int argc, char **arg);
int do_event_pipe(int argc, char **argv);
int do_cgroup(int argc, char **arg);
+int do_perf(int argc, char **arg);
int prog_parse_fd(int *argc, char ***argv);
int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c
new file mode 100644
index 0000000..ac6b1a1
--- /dev/null
+++ b/tools/bpf/bpftool/perf.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+// Author: Yonghong Song <yhs@fb.com>
+
+#define _GNU_SOURCE
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <ftw.h>
+
+#include <bpf.h>
+
+#include "main.h"
+
+/* 0: undecided, 1: supported, 2: not supported */
+static int perf_query_supported;
+static bool has_perf_query_support(void)
+{
+ __u64 probe_offset, probe_addr;
+ __u32 len, prog_id, fd_type;
+ char buf[256];
+ int fd;
+
+ if (perf_query_supported)
+ goto out;
+
+ fd = open(bin_name, O_RDONLY);
+ if (fd < 0) {
+ p_err("perf_query_support: %s", strerror(errno));
+ goto out;
+ }
+
+ /* the following query will fail as no bpf attachment,
+ * the expected errno is ENOTSUPP
+ */
+ errno = 0;
+ len = sizeof(buf);
+ bpf_task_fd_query(getpid(), fd, 0, buf, &len, &prog_id,
+ &fd_type, &probe_offset, &probe_addr);
+
+ if (errno == 524 /* ENOTSUPP */) {
+ perf_query_supported = 1;
+ goto close_fd;
+ }
+
+ perf_query_supported = 2;
+ p_err("perf_query_support: %s", strerror(errno));
+ fprintf(stderr,
+ "HINT: non root or kernel doesn't support TASK_FD_QUERY\n");
+
+close_fd:
+ close(fd);
+out:
+ return perf_query_supported == 1;
+}
+
+static void print_perf_json(int pid, int fd, __u32 prog_id, __u32 fd_type,
+ char *buf, __u64 probe_offset, __u64 probe_addr)
+{
+ jsonw_start_object(json_wtr);
+ jsonw_int_field(json_wtr, "pid", pid);
+ jsonw_int_field(json_wtr, "fd", fd);
+ jsonw_uint_field(json_wtr, "prog_id", prog_id);
+ switch (fd_type) {
+ case BPF_FD_TYPE_RAW_TRACEPOINT:
+ jsonw_string_field(json_wtr, "fd_type", "raw_tracepoint");
+ jsonw_string_field(json_wtr, "tracepoint", buf);
+ break;
+ case BPF_FD_TYPE_TRACEPOINT:
+ jsonw_string_field(json_wtr, "fd_type", "tracepoint");
+ jsonw_string_field(json_wtr, "tracepoint", buf);
+ break;
+ case BPF_FD_TYPE_KPROBE:
+ jsonw_string_field(json_wtr, "fd_type", "kprobe");
+ if (buf[0] != '\0') {
+ jsonw_string_field(json_wtr, "func", buf);
+ jsonw_lluint_field(json_wtr, "offset", probe_offset);
+ } else {
+ jsonw_lluint_field(json_wtr, "addr", probe_addr);
+ }
+ break;
+ case BPF_FD_TYPE_KRETPROBE:
+ jsonw_string_field(json_wtr, "fd_type", "kretprobe");
+ if (buf[0] != '\0') {
+ jsonw_string_field(json_wtr, "func", buf);
+ jsonw_lluint_field(json_wtr, "offset", probe_offset);
+ } else {
+ jsonw_lluint_field(json_wtr, "addr", probe_addr);
+ }
+ break;
+ case BPF_FD_TYPE_UPROBE:
+ jsonw_string_field(json_wtr, "fd_type", "uprobe");
+ jsonw_string_field(json_wtr, "filename", buf);
+ jsonw_lluint_field(json_wtr, "offset", probe_offset);
+ break;
+ case BPF_FD_TYPE_URETPROBE:
+ jsonw_string_field(json_wtr, "fd_type", "uretprobe");
+ jsonw_string_field(json_wtr, "filename", buf);
+ jsonw_lluint_field(json_wtr, "offset", probe_offset);
+ break;
+ }
+ jsonw_end_object(json_wtr);
+}
+
+static void print_perf_plain(int pid, int fd, __u32 prog_id, __u32 fd_type,
+ char *buf, __u64 probe_offset, __u64 probe_addr)
+{
+ printf("pid %d fd %d: prog_id %u ", pid, fd, prog_id);
+ switch (fd_type) {
+ case BPF_FD_TYPE_RAW_TRACEPOINT:
+ printf("raw_tracepoint %s\n", buf);
+ break;
+ case BPF_FD_TYPE_TRACEPOINT:
+ printf("tracepoint %s\n", buf);
+ break;
+ case BPF_FD_TYPE_KPROBE:
+ if (buf[0] != '\0')
+ printf("kprobe func %s offset %llu\n", buf,
+ probe_offset);
+ else
+ printf("kprobe addr %llu\n", probe_addr);
+ break;
+ case BPF_FD_TYPE_KRETPROBE:
+ if (buf[0] != '\0')
+ printf("kretprobe func %s offset %llu\n", buf,
+ probe_offset);
+ else
+ printf("kretprobe addr %llu\n", probe_addr);
+ break;
+ case BPF_FD_TYPE_UPROBE:
+ printf("uprobe filename %s offset %llu\n", buf, probe_offset);
+ break;
+ case BPF_FD_TYPE_URETPROBE:
+ printf("uretprobe filename %s offset %llu\n", buf,
+ probe_offset);
+ break;
+ }
+}
+
+static int show_proc(const char *fpath, const struct stat *sb,
+ int tflag, struct FTW *ftwbuf)
+{
+ __u64 probe_offset, probe_addr;
+ __u32 len, prog_id, fd_type;
+ int err, pid = 0, fd = 0;
+ const char *pch;
+ char buf[4096];
+
+ /* prefix always /proc */
+ pch = fpath + 5;
+ if (*pch == '\0')
+ return 0;
+
+ /* pid should be all numbers */
+ pch++;
+ while (isdigit(*pch)) {
+ pid = pid * 10 + *pch - '0';
+ pch++;
+ }
+ if (*pch == '\0')
+ return 0;
+ if (*pch != '/')
+ return FTW_SKIP_SUBTREE;
+
+ /* check /proc/<pid>/fd directory */
+ pch++;
+ if (strncmp(pch, "fd", 2))
+ return FTW_SKIP_SUBTREE;
+ pch += 2;
+ if (*pch == '\0')
+ return 0;
+ if (*pch != '/')
+ return FTW_SKIP_SUBTREE;
+
+ /* check /proc/<pid>/fd/<fd_num> */
+ pch++;
+ while (isdigit(*pch)) {
+ fd = fd * 10 + *pch - '0';
+ pch++;
+ }
+ if (*pch != '\0')
+ return FTW_SKIP_SUBTREE;
+
+ /* query (pid, fd) for potential perf events */
+ len = sizeof(buf);
+ err = bpf_task_fd_query(pid, fd, 0, buf, &len, &prog_id, &fd_type,
+ &probe_offset, &probe_addr);
+ if (err < 0)
+ return 0;
+
+ if (json_output)
+ print_perf_json(pid, fd, prog_id, fd_type, buf, probe_offset,
+ probe_addr);
+ else
+ print_perf_plain(pid, fd, prog_id, fd_type, buf, probe_offset,
+ probe_addr);
+
+ return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+ int flags = FTW_ACTIONRETVAL | FTW_PHYS;
+ int err = 0, nopenfd = 16;
+
+ if (!has_perf_query_support())
+ return -1;
+
+ if (json_output)
+ jsonw_start_array(json_wtr);
+ if (nftw("/proc", show_proc, nopenfd, flags) == -1) {
+ p_err("%s", strerror(errno));
+ err = -1;
+ }
+ if (json_output)
+ jsonw_end_array(json_wtr);
+
+ return err;
+}
+
+static int do_help(int argc, char **argv)
+{
+ fprintf(stderr,
+ "Usage: %s %s { show | list | help }\n"
+ "",
+ bin_name, argv[-2]);
+
+ return 0;
+}
+
+static const struct cmd cmds[] = {
+ { "show", do_show },
+ { "list", do_show },
+ { "help", do_help },
+ { 0 }
+};
+
+int do_perf(int argc, char **argv)
+{
+ return cmd_select(cmds, argc, argv, do_help);
+}
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v4 6/7] tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs
From: Yonghong Song @ 2018-05-24 0:20 UTC (permalink / raw)
To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524002020.1182296-1-yhs@fb.com>
The new tests are added to query perf_event information
for raw_tracepoint and tracepoint attachment. For tracepoint,
both syscalls and non-syscalls tracepoints are queries as
they are treated slightly differently inside the kernel.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
tools/testing/selftests/bpf/test_progs.c | 158 +++++++++++++++++++++++++++++++
1 file changed, 158 insertions(+)
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 3ecf733..c9fd351 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1542,6 +1542,162 @@ static void test_get_stack_raw_tp(void)
bpf_object__close(obj);
}
+static void test_task_fd_query_rawtp(void)
+{
+ const char *file = "./test_get_stack_rawtp.o";
+ __u64 probe_offset, probe_addr;
+ __u32 len, prog_id, fd_type;
+ struct bpf_object *obj;
+ int efd, err, prog_fd;
+ __u32 duration = 0;
+ char buf[256];
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+ if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+ return;
+
+ efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+ if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+ goto close_prog;
+
+ /* query (getpid(), efd) */
+ len = sizeof(buf);
+ err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+ &fd_type, &probe_offset, &probe_addr);
+ if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err,
+ errno))
+ goto close_prog;
+
+ err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ strcmp(buf, "sys_enter") == 0;
+ if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n",
+ fd_type, buf))
+ goto close_prog;
+
+ /* test zero len */
+ len = 0;
+ err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+ &fd_type, &probe_offset, &probe_addr);
+ if (CHECK(err < 0, "bpf_task_fd_query (len = 0)", "err %d errno %d\n",
+ err, errno))
+ goto close_prog;
+ err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ len == (strlen("sys_enter") + 1);
+ if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+ goto close_prog;
+
+ /* test empty buffer */
+ len = sizeof(buf);
+ err = bpf_task_fd_query(getpid(), efd, 0, 0, &len, &prog_id,
+ &fd_type, &probe_offset, &probe_addr);
+ if (CHECK(err < 0, "bpf_task_fd_query (buf = 0)", "err %d errno %d\n",
+ err, errno))
+ goto close_prog;
+ err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ len == (strlen("sys_enter") + 1);
+ if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+ goto close_prog;
+
+ /* test smaller buffer */
+ len = 2;
+ err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+ &fd_type, &probe_offset, &probe_addr);
+ if (CHECK(err >= 0 || errno != ENOSPC, "bpf_task_fd_query (len = 2)",
+ "err %d errno %d\n", err, errno))
+ goto close_prog;
+ err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+ len == (strlen("sys_enter") + 1) &&
+ strncmp(buf, "sys_enter", 2) == 0;
+ if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+ goto close_prog;
+
+ goto close_prog_noerr;
+close_prog:
+ error_cnt++;
+close_prog_noerr:
+ bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp_core(const char *probe_name,
+ const char *tp_name)
+{
+ const char *file = "./test_tracepoint.o";
+ int err, bytes, efd, prog_fd, pmu_fd;
+ struct perf_event_attr attr = {};
+ __u64 probe_offset, probe_addr;
+ __u32 len, prog_id, fd_type;
+ struct bpf_object *obj;
+ __u32 duration = 0;
+ char buf[256];
+
+ err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+ if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+ goto close_prog;
+
+ snprintf(buf, sizeof(buf),
+ "/sys/kernel/debug/tracing/events/%s/id", probe_name);
+ efd = open(buf, O_RDONLY, 0);
+ if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+ goto close_prog;
+ bytes = read(efd, buf, sizeof(buf));
+ close(efd);
+ if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+ "bytes %d errno %d\n", bytes, errno))
+ goto close_prog;
+
+ attr.config = strtol(buf, NULL, 0);
+ attr.type = PERF_TYPE_TRACEPOINT;
+ attr.sample_type = PERF_SAMPLE_RAW;
+ attr.sample_period = 1;
+ attr.wakeup_events = 1;
+ pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+ 0 /* cpu 0 */, -1 /* group id */,
+ 0 /* flags */);
+ if (CHECK(err, "perf_event_open", "err %d errno %d\n", err, errno))
+ goto close_pmu;
+
+ err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+ if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+ errno))
+ goto close_pmu;
+
+ err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+ if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+ errno))
+ goto close_pmu;
+
+ /* query (getpid(), pmu_fd) */
+ len = sizeof(buf);
+ err = bpf_task_fd_query(getpid(), pmu_fd, 0, buf, &len, &prog_id,
+ &fd_type, &probe_offset, &probe_addr);
+ if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err,
+ errno))
+ goto close_pmu;
+
+ err = (fd_type == BPF_FD_TYPE_TRACEPOINT) && !strcmp(buf, tp_name);
+ if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n",
+ fd_type, buf))
+ goto close_pmu;
+
+ close(pmu_fd);
+ goto close_prog_noerr;
+
+close_pmu:
+ close(pmu_fd);
+close_prog:
+ error_cnt++;
+close_prog_noerr:
+ bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp(void)
+{
+ test_task_fd_query_tp_core("sched/sched_switch",
+ "sched_switch");
+ test_task_fd_query_tp_core("syscalls/sys_enter_read",
+ "sys_enter_read");
+}
+
int main(void)
{
jit_enabled = is_jit_enabled();
@@ -1561,6 +1717,8 @@ int main(void)
test_stacktrace_build_id_nmi();
test_stacktrace_map_raw_tp();
test_get_stack_raw_tp();
+ test_task_fd_query_rawtp();
+ test_task_fd_query_tp();
printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
--
2.9.5
^ permalink raw reply related
* [PATCH bpf-next v4 5/7] samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY
From: Yonghong Song @ 2018-05-24 0:20 UTC (permalink / raw)
To: peterz, ast, daniel, netdev; +Cc: kernel-team
This is mostly to test kprobe/uprobe which needs kernel headers.
Signed-off-by: Yonghong Song <yhs@fb.com>
---
samples/bpf/Makefile | 4 +
samples/bpf/task_fd_query_kern.c | 19 ++
samples/bpf/task_fd_query_user.c | 382 +++++++++++++++++++++++++++++++++++++++
3 files changed, 405 insertions(+)
create mode 100644 samples/bpf/task_fd_query_kern.c
create mode 100644 samples/bpf/task_fd_query_user.c
diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 62d1aa1..7dc85ed 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -51,6 +51,7 @@ hostprogs-y += cpustat
hostprogs-y += xdp_adjust_tail
hostprogs-y += xdpsock
hostprogs-y += xdp_fwd
+hostprogs-y += task_fd_query
# Libbpf dependencies
LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -105,6 +106,7 @@ cpustat-objs := bpf_load.o cpustat_user.o
xdp_adjust_tail-objs := xdp_adjust_tail_user.o
xdpsock-objs := bpf_load.o xdpsock_user.o
xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
+task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
# Tell kbuild to always build the programs
always := $(hostprogs-y)
@@ -160,6 +162,7 @@ always += cpustat_kern.o
always += xdp_adjust_tail_kern.o
always += xdpsock_kern.o
always += xdp_fwd_kern.o
+always += task_fd_query_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include
HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -175,6 +178,7 @@ HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
HOST_LOADLIBES += $(LIBBPF) -lelf
HOSTLOADLIBES_tracex4 += -lrt
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 0000000..f4b0a9e
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+ return 0;
+}
+
+SEC("kretprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+ return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 0000000..8381d79
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define CHECK_PERROR_RET(condition) ({ \
+ int __ret = !!(condition); \
+ if (__ret) { \
+ printf("FAIL: %s:\n", __func__); \
+ perror(" "); \
+ return -1; \
+ } \
+})
+
+#define CHECK_AND_RET(condition) ({ \
+ int __ret = !!(condition); \
+ if (__ret) \
+ return -1; \
+})
+
+static __u64 ptr_to_u64(void *ptr)
+{
+ return (__u64) (unsigned long) ptr;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+ char buf[256];
+ int fd, ret;
+
+ ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+ CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+ fd = open(buf, O_RDONLY);
+ CHECK_PERROR_RET(fd < 0);
+
+ ret = read(fd, buf, sizeof(buf));
+ close(fd);
+ CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+ errno = 0;
+ ret = (int)strtol(buf, NULL, 10);
+ CHECK_PERROR_RET(errno);
+ return ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+ char buf[256];
+ int fd, ret;
+
+ ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+ CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+ fd = open(buf, O_RDONLY);
+ CHECK_PERROR_RET(fd < 0);
+
+ ret = read(fd, buf, sizeof(buf));
+ close(fd);
+ CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+ CHECK_PERROR_RET(strlen(buf) < strlen("config:"));
+
+ errno = 0;
+ ret = (int)strtol(buf + strlen("config:"), NULL, 10);
+ CHECK_PERROR_RET(errno);
+ return ret;
+}
+
+static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name,
+ __u32 expected_fd_type)
+{
+ __u64 probe_offset, probe_addr;
+ __u32 len, prog_id, fd_type;
+ char buf[256];
+ int err;
+
+ len = sizeof(buf);
+ err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len,
+ &prog_id, &fd_type, &probe_offset,
+ &probe_addr);
+ if (err < 0) {
+ printf("FAIL: %s, for event_fd idx %d, fn_name %s\n",
+ __func__, prog_fd_idx, fn_name);
+ perror(" :");
+ return -1;
+ }
+ if (strcmp(buf, fn_name) != 0 ||
+ fd_type != expected_fd_type ||
+ probe_offset != 0x0 || probe_addr != 0x0) {
+ printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n",
+ prog_fd_idx);
+ printf("buf: %s, fd_type: %u, probe_offset: 0x%llx,"
+ " probe_addr: 0x%llx\n",
+ buf, fd_type, probe_offset, probe_addr);
+ return -1;
+ }
+ return 0;
+}
+
+static int test_nondebug_fs_kuprobe_common(const char *event_type,
+ const char *name, __u64 offset, __u64 addr, bool is_return,
+ char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
+ __u64 *probe_offset, __u64 *probe_addr)
+{
+ int is_return_bit = bpf_get_retprobe_bit(event_type);
+ int type = bpf_find_probe_type(event_type);
+ struct perf_event_attr attr = {};
+ int fd;
+
+ if (type < 0 || is_return_bit < 0) {
+ printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n",
+ __func__, type, is_return_bit);
+ return -1;
+ }
+
+ attr.sample_period = 1;
+ attr.wakeup_events = 1;
+ if (is_return)
+ attr.config |= 1 << is_return_bit;
+
+ if (name) {
+ attr.config1 = ptr_to_u64((void *)name);
+ attr.config2 = offset;
+ } else {
+ attr.config1 = 0;
+ attr.config2 = addr;
+ }
+ attr.size = sizeof(attr);
+ attr.type = type;
+
+ fd = sys_perf_event_open(&attr, -1, 0, -1, 0);
+ CHECK_PERROR_RET(fd < 0);
+
+ CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0);
+ CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
+ CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len,
+ prog_id, fd_type, probe_offset, probe_addr) < 0);
+
+ return 0;
+}
+
+static int test_nondebug_fs_probe(const char *event_type, const char *name,
+ __u64 offset, __u64 addr, bool is_return,
+ __u32 expected_fd_type,
+ __u32 expected_ret_fd_type,
+ char *buf, __u32 buf_len)
+{
+ __u64 probe_offset, probe_addr;
+ __u32 prog_id, fd_type;
+ int err;
+
+ err = test_nondebug_fs_kuprobe_common(event_type, name,
+ offset, addr, is_return,
+ buf, &buf_len, &prog_id,
+ &fd_type, &probe_offset,
+ &probe_addr);
+ if (err < 0) {
+ printf("FAIL: %s, "
+ "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n",
+ __func__, name ? name : "", offset, addr, is_return);
+ perror(" :");
+ return -1;
+ }
+ if ((is_return && fd_type != expected_ret_fd_type) ||
+ (!is_return && fd_type != expected_fd_type)) {
+ printf("FAIL: %s, incorrect fd_type %u\n",
+ __func__, fd_type);
+ return -1;
+ }
+ if (name) {
+ if (strcmp(name, buf) != 0) {
+ printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+ return -1;
+ }
+ if (probe_offset != offset) {
+ printf("FAIL: %s, incorrect probe_offset 0x%llx\n",
+ __func__, probe_offset);
+ return -1;
+ }
+ } else {
+ if (buf_len != 0) {
+ printf("FAIL: %s, incorrect buf %p\n",
+ __func__, buf);
+ return -1;
+ }
+
+ if (probe_addr != addr) {
+ printf("FAIL: %s, incorrect probe_addr 0x%llx\n",
+ __func__, probe_addr);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
+{
+ const char *event_type = "uprobe";
+ struct perf_event_attr attr = {};
+ char buf[256], event_alias[256];
+ __u64 probe_offset, probe_addr;
+ __u32 len, prog_id, fd_type;
+ int err, res, kfd, efd;
+ ssize_t bytes;
+
+ snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events",
+ event_type);
+ kfd = open(buf, O_WRONLY | O_APPEND, 0);
+ CHECK_PERROR_RET(kfd < 0);
+
+ res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid());
+ CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias));
+
+ res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx",
+ is_return ? 'r' : 'p', event_type, event_alias,
+ binary_path, offset);
+ CHECK_PERROR_RET(res < 0 || res >= sizeof(buf));
+ CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0);
+
+ close(kfd);
+ kfd = -1;
+
+ snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id",
+ event_type, event_alias);
+ efd = open(buf, O_RDONLY, 0);
+ CHECK_PERROR_RET(efd < 0);
+
+ bytes = read(efd, buf, sizeof(buf));
+ CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf));
+ close(efd);
+ buf[bytes] = '\0';
+
+ attr.config = strtol(buf, NULL, 0);
+ attr.type = PERF_TYPE_TRACEPOINT;
+ attr.sample_period = 1;
+ attr.wakeup_events = 1;
+ kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+ CHECK_PERROR_RET(kfd < 0);
+ CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
+ CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0);
+
+ len = sizeof(buf);
+ err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len,
+ &prog_id, &fd_type, &probe_offset,
+ &probe_addr);
+ if (err < 0) {
+ printf("FAIL: %s, binary_path %s\n", __func__, binary_path);
+ perror(" :");
+ return -1;
+ }
+ if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) ||
+ (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) {
+ printf("FAIL: %s, incorrect fd_type %u\n", __func__,
+ fd_type);
+ return -1;
+ }
+ if (strcmp(binary_path, buf) != 0) {
+ printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+ return -1;
+ }
+ if (probe_offset != offset) {
+ printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__,
+ probe_offset);
+ return -1;
+ }
+
+ close(kfd);
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ struct rlimit r = {1024*1024, RLIM_INFINITY};
+ extern char __executable_start;
+ char filename[256], buf[256];
+ __u64 uprobe_file_offset;
+
+ snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ perror("setrlimit(RLIMIT_MEMLOCK)");
+ return 1;
+ }
+
+ if (load_kallsyms()) {
+ printf("failed to process /proc/kallsyms\n");
+ return 1;
+ }
+
+ if (load_bpf_file(filename)) {
+ printf("%s", bpf_log_buf);
+ return 1;
+ }
+
+ /* test two functions in the corresponding *_kern.c file */
+ CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_start_request",
+ BPF_FD_TYPE_KPROBE));
+ CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion",
+ BPF_FD_TYPE_KRETPROBE));
+
+ /* test nondebug fs kprobe */
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+ false, BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ buf, sizeof(buf)));
+#ifdef __x86_64__
+ /* set a kprobe on "bpf_check + 0x5", which is x64 specific */
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0,
+ false, BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ buf, sizeof(buf)));
+#endif
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+ true, BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ buf, sizeof(buf)));
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+ ksym_get_addr("bpf_check"), false,
+ BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ buf, sizeof(buf)));
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+ ksym_get_addr("bpf_check"), false,
+ BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ NULL, 0));
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+ ksym_get_addr("bpf_check"), true,
+ BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ buf, sizeof(buf)));
+ CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+ ksym_get_addr("bpf_check"), true,
+ BPF_FD_TYPE_KPROBE,
+ BPF_FD_TYPE_KRETPROBE,
+ 0, 0));
+
+ /* test nondebug fs uprobe */
+ /* the calculation of uprobe file offset is based on gcc 7.3.1 on x64
+ * and the default linker script, which defines __executable_start as
+ * the start of the .text section. The calculation could be different
+ * on different systems with different compilers. The right way is
+ * to parse the ELF file. We took a shortcut here.
+ */
+ uprobe_file_offset = (__u64)main - (__u64)&__executable_start;
+ CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
+ uprobe_file_offset, 0x0, false,
+ BPF_FD_TYPE_UPROBE,
+ BPF_FD_TYPE_URETPROBE,
+ buf, sizeof(buf)));
+ CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
+ uprobe_file_offset, 0x0, true,
+ BPF_FD_TYPE_UPROBE,
+ BPF_FD_TYPE_URETPROBE,
+ buf, sizeof(buf)));
+
+ /* test debug fs uprobe */
+ CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+ false));
+ CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+ true));
+
+ return 0;
+}
--
2.9.5
^ permalink raw reply related
* [PATCH bpf] bpf: properly enforce index mask to prevent out-of-bounds speculation
From: Daniel Borkmann @ 2018-05-24 0:32 UTC (permalink / raw)
To: alexei.starovoitov; +Cc: netdev, Daniel Borkmann
While reviewing the verifier code, I recently noticed that the
following two program variants in relation to tail calls can be
loaded.
Variant 1:
# bpftool p d x i 15
0: (15) if r1 == 0x0 goto pc+3
1: (18) r2 = map[id:5]
3: (05) goto pc+2
4: (18) r2 = map[id:6]
6: (b7) r3 = 7
7: (35) if r3 >= 0xa0 goto pc+2
8: (54) (u32) r3 &= (u32) 255
9: (85) call bpf_tail_call#12
10: (b7) r0 = 1
11: (95) exit
# bpftool m s i 5
5: prog_array flags 0x0
key 4B value 4B max_entries 4 memlock 4096B
# bpftool m s i 6
6: prog_array flags 0x0
key 4B value 4B max_entries 160 memlock 4096B
Variant 2:
# bpftool p d x i 20
0: (15) if r1 == 0x0 goto pc+3
1: (18) r2 = map[id:8]
3: (05) goto pc+2
4: (18) r2 = map[id:7]
6: (b7) r3 = 7
7: (35) if r3 >= 0x4 goto pc+2
8: (54) (u32) r3 &= (u32) 3
9: (85) call bpf_tail_call#12
10: (b7) r0 = 1
11: (95) exit
# bpftool m s i 8
8: prog_array flags 0x0
key 4B value 4B max_entries 160 memlock 4096B
# bpftool m s i 7
7: prog_array flags 0x0
key 4B value 4B max_entries 4 memlock 4096B
In both cases the index masking inserted by the verifier in order
to control out of bounds speculation from a CPU via b2157399cc98
("bpf: prevent out-of-bounds speculation") seems to be incorrect
in what it is enforcing. In the 1st variant, the mask is applied
from the map with the significantly larger number of entries where
we would allow to a certain degree out of bounds speculation for
the smaller map, and in the 2nd variant where the mask is applied
from the map with the smaller number of entries, we get buggy
behavior since we truncate the index of the larger map.
The original intent from commit b2157399cc98 is to reject such
occasions where two or more different tail call maps are used
in the same tail call helper invocation. However, the check on
the BPF_MAP_PTR_POISON is never hit since we never poisoned the
saved pointer in the first place! We do this explicitly for map
lookups but in case of tail calls we basically used the tail
call map in insn_aux_data that was processed in the most recent
path which the verifier walked. Thus any prior path that stored
a pointer in insn_aux_data at the helper location was always
overridden.
Fix it by moving the map pointer poison logic into a small helper
that covers both BPF helpers with the same logic. After that in
fixup_bpf_calls() the poison check is then hit for tail calls
and the program rejected. Latter only happens in unprivileged
case since this is the *only* occasion where a rewrite needs to
happen, and where such rewrite is specific to the map (max_entries,
index_mask). In the privileged case the rewrite is generic for
the insn->imm / insn->code update so multiple maps from different
paths can be handled just fine since all the remaining logic
happens in the instruction processing itself. This is similar
to the case of map lookups: in case there is a collision of
maps in fixup_bpf_calls() we must skip the inlined rewrite since
this will turn the generic instruction sequence into a non-
generic one. Thus the patch_call_imm will simply update the
insn->imm location where the bpf_map_lookup_elem() will later
take care of the dispatch. Given we need this 'poison' state
as a check, the information of whether a map is an unpriv_array
gets lost, so enforcing it prior to that needs an additional
state. In general this check is needed since there are some
complex and tail call intensive BPF programs out there where
LLVM tends to generate such code occasionally. We therefore
convert the map_ptr rather into map_state to store all this
w/o extra memory overhead, and the bit whether one of the maps
involved in the collision was from an unpriv_array thus needs
to be retained as well there.
Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
[ Test cases coming via bpf-next batch of patches to avoid
merge conflicts in test_verifier with bpf. ]
include/linux/bpf_verifier.h | 2 +-
kernel/bpf/verifier.c | 86 ++++++++++++++++++++++++++++++++------------
2 files changed, 65 insertions(+), 23 deletions(-)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7e61c39..52fb077 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -142,7 +142,7 @@ struct bpf_verifier_state_list {
struct bpf_insn_aux_data {
union {
enum bpf_reg_type ptr_type; /* pointer type for load/store insns */
- struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */
+ unsigned long map_state; /* pointer/poison value for maps */
s32 call_imm; /* saved imm field of call insn */
};
int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5dd1dcb..dcebf3f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -156,7 +156,29 @@ struct bpf_verifier_stack_elem {
#define BPF_COMPLEXITY_LIMIT_INSNS 131072
#define BPF_COMPLEXITY_LIMIT_STACK 1024
-#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
+#define BPF_MAP_PTR_UNPRIV 1UL
+#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \
+ POISON_POINTER_DELTA))
+#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
+
+static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
+{
+ return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON;
+}
+
+static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
+{
+ return aux->map_state & BPF_MAP_PTR_UNPRIV;
+}
+
+static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
+ const struct bpf_map *map, bool unpriv)
+{
+ BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
+ unpriv |= bpf_map_ptr_unpriv(aux);
+ aux->map_state = (unsigned long)map |
+ (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+}
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
@@ -2333,6 +2355,29 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return 0;
}
+static int
+record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
+ int func_id, int insn_idx)
+{
+ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+
+ if (func_id != BPF_FUNC_tail_call &&
+ func_id != BPF_FUNC_map_lookup_elem)
+ return 0;
+ if (meta->map_ptr == NULL) {
+ verbose(env, "kernel subsystem misconfigured verifier\n");
+ return -EINVAL;
+ }
+
+ if (!BPF_MAP_PTR(aux->map_state))
+ bpf_map_ptr_store(aux, meta->map_ptr,
+ meta->map_ptr->unpriv_array);
+ else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr)
+ bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
+ meta->map_ptr->unpriv_array);
+ return 0;
+}
+
static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
const struct bpf_func_proto *fn = NULL;
@@ -2387,13 +2432,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
if (err)
return err;
- if (func_id == BPF_FUNC_tail_call) {
- if (meta.map_ptr == NULL) {
- verbose(env, "verifier bug\n");
- return -EINVAL;
- }
- env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
- }
err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
if (err)
return err;
@@ -2404,6 +2442,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
if (err)
return err;
+ err = record_func_map(env, &meta, func_id, insn_idx);
+ if (err)
+ return err;
+
/* Mark slots with STACK_MISC in case of raw mode, stack offset
* is inferred from register state.
*/
@@ -2428,8 +2470,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
} else if (fn->ret_type == RET_VOID) {
regs[BPF_REG_0].type = NOT_INIT;
} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
- struct bpf_insn_aux_data *insn_aux;
-
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
/* There is no offset yet applied, variable or fixed */
mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -2445,11 +2485,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].id = ++env->id_gen;
- insn_aux = &env->insn_aux_data[insn_idx];
- if (!insn_aux->map_ptr)
- insn_aux->map_ptr = meta.map_ptr;
- else if (insn_aux->map_ptr != meta.map_ptr)
- insn_aux->map_ptr = BPF_MAP_PTR_POISON;
} else {
verbose(env, "unknown return type %d of func %s#%d\n",
fn->ret_type, func_id_name(func_id), func_id);
@@ -5417,6 +5452,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
struct bpf_insn *insn = prog->insnsi;
const struct bpf_func_proto *fn;
const int insn_cnt = prog->len;
+ struct bpf_insn_aux_data *aux;
struct bpf_insn insn_buf[16];
struct bpf_prog *new_prog;
struct bpf_map *map_ptr;
@@ -5491,19 +5527,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
insn->imm = 0;
insn->code = BPF_JMP | BPF_TAIL_CALL;
+ aux = &env->insn_aux_data[i + delta];
+ if (!bpf_map_ptr_unpriv(aux))
+ continue;
+
/* instead of changing every JIT dealing with tail_call
* emit two extra insns:
* if (index >= max_entries) goto out;
* index &= array->index_mask;
* to avoid out-of-bounds cpu speculation
*/
- map_ptr = env->insn_aux_data[i + delta].map_ptr;
- if (map_ptr == BPF_MAP_PTR_POISON) {
+ if (bpf_map_ptr_poisoned(aux)) {
verbose(env, "tail_call abusing map_ptr\n");
return -EINVAL;
}
- if (!map_ptr->unpriv_array)
- continue;
+
+ map_ptr = BPF_MAP_PTR(aux->map_state);
insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
map_ptr->max_entries, 2);
insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -5527,9 +5566,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
*/
if (prog->jit_requested && BITS_PER_LONG == 64 &&
insn->imm == BPF_FUNC_map_lookup_elem) {
- map_ptr = env->insn_aux_data[i + delta].map_ptr;
- if (map_ptr == BPF_MAP_PTR_POISON ||
- !map_ptr->ops->map_gen_lookup)
+ aux = &env->insn_aux_data[i + delta];
+ if (bpf_map_ptr_poisoned(aux))
+ goto patch_call_imm;
+
+ map_ptr = BPF_MAP_PTR(aux->map_state);
+ if (!map_ptr->ops->map_gen_lookup)
goto patch_call_imm;
cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
--
2.9.5
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox