Linux userland API discussions
 help / color / mirror / Atom feed
* [PATCH net-next v5 3/4] tunnels: advertise link netns via netlink
From: Nicolas Dichtel @ 2015-01-15 14:11 UTC (permalink / raw)
  To: netdev, containers, linux-kernel, linux-api
  Cc: davem, ebiederm, stephen, akpm, luto, cwang, Nicolas Dichtel
In-Reply-To: <1421331078-21622-1-git-send-email-nicolas.dichtel@6wind.com>

Implement rtnl_link_ops->get_link_net() callback so that IFLA_LINK_NETNSID is
added to rtnetlink messages.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 drivers/net/vxlan.c      | 8 ++++++++
 include/net/ip6_tunnel.h | 1 +
 include/net/ip_tunnels.h | 1 +
 net/ipv4/ip_gre.c        | 2 ++
 net/ipv4/ip_tunnel.c     | 8 ++++++++
 net/ipv4/ip_vti.c        | 1 +
 net/ipv4/ipip.c          | 1 +
 net/ipv6/ip6_gre.c       | 1 +
 net/ipv6/ip6_tunnel.c    | 9 +++++++++
 net/ipv6/ip6_vti.c       | 1 +
 net/ipv6/sit.c           | 1 +
 11 files changed, 34 insertions(+)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 6b6b45622a0a..88dbb1edea6e 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2922,6 +2922,13 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+static struct net *vxlan_get_link_net(const struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+
+	return vxlan->net;
+}
+
 static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
 	.kind		= "vxlan",
 	.maxtype	= IFLA_VXLAN_MAX,
@@ -2933,6 +2940,7 @@ static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
 	.dellink	= vxlan_dellink,
 	.get_size	= vxlan_get_size,
 	.fill_info	= vxlan_fill_info,
+	.get_link_net	= vxlan_get_link_net,
 };
 
 static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
index 9326c41c2d7f..76c091b53dae 100644
--- a/include/net/ip6_tunnel.h
+++ b/include/net/ip6_tunnel.h
@@ -70,6 +70,7 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr,
 __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw);
 __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr,
 			     const struct in6_addr *raddr);
+struct net *ip6_tnl_get_link_net(const struct net_device *dev);
 
 static inline void ip6tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index ce4db3cc5647..2c47061a6954 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -141,6 +141,7 @@ int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op,
 int ip_tunnel_init(struct net_device *dev);
 void ip_tunnel_uninit(struct net_device *dev);
 void  ip_tunnel_dellink(struct net_device *dev, struct list_head *head);
+struct net *ip_tunnel_get_link_net(const struct net_device *dev);
 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 		       struct rtnl_link_ops *ops, char *devname);
 
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 942576e27df1..6e7727f27393 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -829,6 +829,7 @@ static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
 	.dellink	= ip_tunnel_dellink,
 	.get_size	= ipgre_get_size,
 	.fill_info	= ipgre_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
@@ -843,6 +844,7 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
 	.dellink	= ip_tunnel_dellink,
 	.get_size	= ipgre_get_size,
 	.fill_info	= ipgre_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static int __net_init ipgre_tap_init_net(struct net *net)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index d3e447936720..2cd08280c77b 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -972,6 +972,14 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
 
+struct net *ip_tunnel_get_link_net(const struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = netdev_priv(dev);
+
+	return tunnel->net;
+}
+EXPORT_SYMBOL(ip_tunnel_get_link_net);
+
 int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
 				  struct rtnl_link_ops *ops, char *devname)
 {
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 1a7e979e80ba..94efe148181c 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -531,6 +531,7 @@ static struct rtnl_link_ops vti_link_ops __read_mostly = {
 	.dellink        = ip_tunnel_dellink,
 	.get_size	= vti_get_size,
 	.fill_info	= vti_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static int __init vti_init(void)
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 40403114f00a..b58d6689874c 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -498,6 +498,7 @@ static struct rtnl_link_ops ipip_link_ops __read_mostly = {
 	.dellink	= ip_tunnel_dellink,
 	.get_size	= ipip_get_size,
 	.fill_info	= ipip_fill_info,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static struct xfrm_tunnel ipip_handler __read_mostly = {
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 13cda4c6313b..9306a5ff9149 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1662,6 +1662,7 @@ static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
 	.dellink	= ip6gre_dellink,
 	.get_size	= ip6gre_get_size,
 	.fill_info	= ip6gre_fill_info,
+	.get_link_net	= ip6_tnl_get_link_net,
 };
 
 static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = {
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 92b3da571980..266a264ec212 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1760,6 +1760,14 @@ nla_put_failure:
 	return -EMSGSIZE;
 }
 
+struct net *ip6_tnl_get_link_net(const struct net_device *dev)
+{
+	struct ip6_tnl *tunnel = netdev_priv(dev);
+
+	return tunnel->net;
+}
+EXPORT_SYMBOL(ip6_tnl_get_link_net);
+
 static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = {
 	[IFLA_IPTUN_LINK]		= { .type = NLA_U32 },
 	[IFLA_IPTUN_LOCAL]		= { .len = sizeof(struct in6_addr) },
@@ -1783,6 +1791,7 @@ static struct rtnl_link_ops ip6_link_ops __read_mostly = {
 	.dellink	= ip6_tnl_dellink,
 	.get_size	= ip6_tnl_get_size,
 	.fill_info	= ip6_tnl_fill_info,
+	.get_link_net	= ip6_tnl_get_link_net,
 };
 
 static struct xfrm6_tunnel ip4ip6_handler __read_mostly = {
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index ace10d0b3aac..5fb9e212eca8 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -1016,6 +1016,7 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = {
 	.changelink	= vti6_changelink,
 	.get_size	= vti6_get_size,
 	.fill_info	= vti6_fill_info,
+	.get_link_net	= ip6_tnl_get_link_net,
 };
 
 static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n)
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 213546bd6d5d..3cc197c72b59 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1763,6 +1763,7 @@ static struct rtnl_link_ops sit_link_ops __read_mostly = {
 	.get_size	= ipip6_get_size,
 	.fill_info	= ipip6_fill_info,
 	.dellink	= ipip6_dellink,
+	.get_link_net	= ip_tunnel_get_link_net,
 };
 
 static struct xfrm_tunnel sit_handler __read_mostly = {
-- 
2.2.2

^ permalink raw reply related

* [PATCH net-next v5 4/4] rtnl: allow to create device with IFLA_LINK_NETNSID set
From: Nicolas Dichtel @ 2015-01-15 14:11 UTC (permalink / raw)
  To: netdev, containers, linux-kernel, linux-api
  Cc: davem, ebiederm, stephen, akpm, luto, cwang, Nicolas Dichtel
In-Reply-To: <1421331078-21622-1-git-send-email-nicolas.dichtel@6wind.com>

This patch adds the ability to create a netdevice in a specified netns and
then move it into the final netns. In fact, it allows having symmetry between
get and set rtnl messages.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
---
 net/core/rtnetlink.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ab78ba9a34e8..b2f6d8285a24 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1247,6 +1247,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PHYS_PORT_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
 	[IFLA_CARRIER_CHANGES]	= { .type = NLA_U32 },  /* ignored */
 	[IFLA_PHYS_SWITCH_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
+	[IFLA_LINK_NETNSID]	= { .type = NLA_S32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2020,7 +2021,7 @@ replay:
 		struct nlattr *slave_attr[m_ops ? m_ops->slave_maxtype + 1 : 0];
 		struct nlattr **data = NULL;
 		struct nlattr **slave_data = NULL;
-		struct net *dest_net;
+		struct net *dest_net, *link_net = NULL;
 
 		if (ops) {
 			if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
@@ -2126,7 +2127,18 @@ replay:
 		if (IS_ERR(dest_net))
 			return PTR_ERR(dest_net);
 
-		dev = rtnl_create_link(dest_net, ifname, name_assign_type, ops, tb);
+		if (tb[IFLA_LINK_NETNSID]) {
+			int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
+
+			link_net = get_net_ns_by_id(dest_net, id);
+			if (!link_net) {
+				err =  -EINVAL;
+				goto out;
+			}
+		}
+
+		dev = rtnl_create_link(link_net ? : dest_net, ifname,
+				       name_assign_type, ops, tb);
 		if (IS_ERR(dev)) {
 			err = PTR_ERR(dev);
 			goto out;
@@ -2154,9 +2166,16 @@ replay:
 			}
 		}
 		err = rtnl_configure_link(dev, ifm);
-		if (err < 0)
+		if (err < 0) {
 			unregister_netdevice(dev);
+			goto out;
+		}
+
+		if (link_net)
+			err = dev_change_net_namespace(dev, dest_net, ifname);
 out:
+		if (link_net)
+			put_net(link_net);
 		put_net(dest_net);
 		return err;
 	}
-- 
2.2.2

^ permalink raw reply related

* Re: [PATCH] virtio_balloon: coding style fixes
From: Michal Hocko @ 2015-01-15 14:13 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA, Rusty Russell,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150115134412.GA23874-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

On Thu 15-01-15 15:44:12, Michael S. Tsirkin wrote:
> On Thu, Jan 15, 2015 at 02:06:42PM +0100, Michal Hocko wrote:
> > On Thu 15-01-15 13:39:06, Michael S. Tsirkin wrote:
> > > Most of our code has
> > > struct foo {
> > > }
> > > 
> > > Fix two instances where balloon is inconsistent.
> > 
> > I hate to complain but is it really necessary to post such patches to
> > linux-api?
> 
> Well it's human to err, so it seems wise to copy parties
> interested in the ABI/API whenever we are changing files under include/uapi.
> Whitespace changes should mostly be safe, but it's not unknown
> e.g. to include unrelated changes in the same commit by mistake.
> 
> > I thought the list was primarily for API related discussions.
> 
> Basically this line in MAINTAINERS
> 
> ABI/API
> L:      linux-api-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> F:      Documentation/ABI/
> F:      include/linux/syscalls.h
> F:      include/uapi/
> F:      kernel/sys_ni.c
> 
> normally means "send all patches affecting files under include/uapi/ to
> this list", does it not?

Well, this should always be taken as a hint not a hard rule. So if there
is a change which is adding/removing or changing signature then sure but
not everything falls into that category.
 
> Wasn't this the intent?
> 
> > This is not the only mail sent here which doesn't fall into that
> > category IMO. It is far from low volume list for quite some time.
> > 
> > Please let's get back low volume and API only discussion!
> 
> Maybe send patch dropping include/uapi/ from there,
> should help drive the volumes down?

This would be an overkill IMO. It would be much more preferable if
people actually think about who from the suggested list (either from
MAINTAINERS or ./scripts/get_maintainer.pl) should be really added.

[...]
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: futex(2) man page update help request
From: Michael Kerrisk (man-pages) @ 2015-01-15 15:10 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w, Carlos O'Donell,
	Darren Hart, Ingo Molnar, Jakub Jelinek,
	linux-man-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, lkml,
	Davidlohr Bueso, Arnd Bergmann, Steven Rostedt, Peter Zijlstra,
	Linux API, Torvald Riegel, Roland McGrath, Darren Hart,
	Anton Blanchard
In-Reply-To: <alpine.DEB.2.02.1405151144390.6261-3cz04HxQygjZikZi3RtOZ1XZhhPuCNm+@public.gmane.org>

[Adding a few people to CC that have expressed interest in the 
progress of the updates of this page, or who may be able to
provide review feedback. Eventually, you'll all get CCed on
the new draft of the page.]

Hello Thomas,

On 05/15/2014 04:14 PM, Thomas Gleixner wrote:
> On Thu, 15 May 2014, Michael Kerrisk (man-pages) wrote:
>> And that universe would love to have your documentation of 
>> FUTEX_WAKE_BITSET and FUTEX_WAIT_BITSET ;-),
> 
> I give you almost the full treatment, but I leave REQUEUE_PI to
> Darren and FUTEX_WAKE_OP to Jakub. :)

Thank you for the great effort you put into compiling the
text below, and apologies for my long delay in following up.

I've integrated almost all of your suggestions into the 
manual page. I will shortly send out a new draft of the
page that contains various FIXMEs for points that remain 
unclear.

Most of the rest of this mail is just a checklist noting
what I did with your comments. No response is needed 
in most cases, but there are a very few open questions in 
this mail that, to help you find them, I have marked with
"???". If you (or someone else) could reply to those, I 
would be grateful.

In the next day or two, I hope to send out the new version
of the futex(2) page for review. The new draft is a bit
bigger (okay -- 4 x bigger) than the current page. And there 
are quite a number of FIXMEs that I've placed in the page 
for various points--some minor, but a few major--that need
to be checked or fixed. Would you have some time to review
that page? 

For that matter, if anyone else would have time for
reviewing the page, could they shout out now. It's perhaps
unlikely, but I am worried about getting a thundering herd
of comments, and bringing the page to the state where I have 
it now has already been a fairly demanding task.

==========

> FUTEX_WAIT
> 
> < Existing blurb seems ok >
> 
> Related return values
> 
> [EFAULT] Kernel was unable to access the futex value at uaddr.

Added/reworked.

> [EINVAL] The supplied uaddr argument does not point to a valid 
> object, i.e. pointer is not 4 byte aligned

Added.

> [EINVAL] The supplied timeout argument is not normalized.

Added, but with more detail.

> [EWOULDBLOCK] The atomic enqueueing failed. 

Added.

Note, however, that for consistency, I'll use EAGAIN throughout 
the page.

>  User space value at uaddr
> is not equal val argument.

Was already present.

> [ETIMEDOUT] timeout expired

Was present, but I have now added more detail.

==========

> FUTEX_WAKE
> 
> < Existing blurb seems ok >
> 
> Related return values
> 
> [EFAULT] Kernel was unable to access the futex value at uaddr.

Added/reworked.

> [EINVAL] The supplied uaddr argument does not point to a valid 
> object, i.e. pointer is not 4 byte aligned

Added.

> [EINVAL] The kernel detected inconsistent state between the user
> space state at uaddr and the kernel state, i.e. it detected a waiter
> which waits in FUTEX_LOCK_PI

Added.

==========

> FUTEX_REQUEUE
> 
> Existing blurb seems ok , except for this:
> 
> The argument val contains the number of waiters on uaddr which are
> immediately woken up.
> The timeout argument is abused to transport the number of waiters
> which are requeued to the futex at uaddr2. The pointer is typecasted
> to u32.

What I've actually done with the main text for FUTEX_REQUEUE is defer 
to the (now-expanded) discussion of FUTEX_CMP_REQUEUE. 

> [EFAULT] Kernel was unable to access the futex value at uaddr or
> uaddr2

Added/reworked.

> [EINVAL] The supplied uaddr/uaddr2 arguments do not point to a valid
> object, i.e. pointer is not 4 byte aligned

Added.

> [EINVAL] The kernel detected inconsistent state between the user
> space state at uaddr and the kernel state, i.e. it detected a waiter
> which waits in FUTEX_LOCK_PI on uaddr

Added.

> [EINVAL] uaddr equal uaddr2. Requeue to same futex.

??? I added this, but does this error not occur only for PI requeues?

==========

> FUTEX_CMP_REQUEUE
> 
> Existing blurb seems ok , except for this:

[[
> The argument val contains the number of waiters on uaddr which are
> immediately woken up.
> 
> The timeout argument is abused to transport the number of waiters
> which are requeued to the futex at uaddr2. The pointer is typecasted
> to u32.
]]

Covered now (in more detail).

> Related return values
> 
> [EFAULT] Kernel was unable to access the futex value at uaddr or
> uaddr2

Added/reworked.

> [EINVAL] The supplied uaddr/uaddr2 arguments do not point to a valid
> object, i.e. pointer is not 4 byte aligned

Added.

> [EINVAL] uaddr equal uaddr2. Requeue to same futex.

Added.

> [EINVAL] The kernel detected inconsistent state between the user
> space state at uaddr and the kernel state, i.e. it detected a waiter
> which waits in FUTEX_LOCK_PI on uaddr

Added

> [EAGAIN] uaddr1 readout is not equal the compare value in argument
> val3

Was already present.

==========

> FUTEX_WAKE_OP
> 
> 
> Jakub, can you please explain it? I'm lost :)

I had a read of Ulrich Drepper's "Futexes are Tricky", and the source 
code, and took a shot at it. I'd like to have someone check what 
I wrote though. See the draft that I will soon send out.

> The argument val contains the maximum number of waiters on uaddr
> which are immediately woken up.

Covered in my new text.

> The timeout argument is abused to transport the maximum number of
> waiters on uaddr2 which are woken up. The pointer is typecasted to
> u32.

Covered in my new text.

> Related return values
> 
> [EFAULT] Kernel was unable to access the futex values at uaddr or
> uaddr2

This point was covered already in ERRORS.

> [EINVAL] The supplied uaddr or uaddr2 argument does not point to a
> valid object, i.e. pointer is not 4 byte aligned

This point was covered already in ERRORS.

> [EINVAL] The kernel detected inconsistent state between the user
> space state at uaddr and the kernel state, i.e. it detected a waiter
> which waits in FUTEX_LOCK_PI on uaddr

I added this point.

==========

> FUTEX_WAIT_BITSET
> 
> The same as FUTEX_WAIT except that val3 is used to provide a 32bit
> bitset to the kernel. This bitset is stored in the kernel internal
> state of the waiter.

Added.

> This futex op also allows to have the option bit FUTEX_CLOCK_REALTIME
> set.

Added.

> Related return values
> 
> [EFAULT] Kernel was unable to access the futex value at uaddr.

Already covered.

> [EINVAL] The supplied uaddr argument does not point to a valid 
> object, i.e. pointer is not 4 byte aligned

Already covered.

> [EINVAL] The supplied bitset is zero.

Added.

> [EINVAL] The supplied timeout argument is not normalized.

Already covered.

> [ETIMEDOUT] timeout expired

Already covered.

==========

> FUTEX_WAKE_BITSET
> 
> The same as FUTEX_WAKE except that val3 is used to provide a 32bit
> bitset to the kernel. This bitset is used to select waiters on the
> futex. The selection is done by a bitwise AND of the wake side
> supplied bitset and the bitset which is stored in the kernel internal
> state of the waiters. If the result is non zero, the waiter is woken,
> otherwise left waiting.

Added (along with quite a bit of other detail).

> [EFAULT] Kernel was unable to access the futex value at uaddr.

Covered already.

> [EINVAL] The supplied uaddr argument does not point to a valid 
> object, i.e. pointer is not 4 byte aligned

Covered already.

> [EINVAL] The supplied bitset is zero.

Added.

> [EINVAL] The kernel detected inconsistent state between the user
> space state at uaddr and the kernel state, i.e. it detected a waiter
> which waits in FUTEX_LOCK_PI

Added.

==========

> FUTEX_LOCK_PI
> 
> This operation reads from the futex address provided by the uaddr
> argument, which contains the namespace specific TID of the lock
> owner. If the TID is 0, then the kernel tries to set the waiters TID
> atomically. If the TID is nonzero or the take over fails the kernel
> sets atomically the FUTEX_WAITERS bit which signals the owner, that
> it cannot unlock the futex in user space atomically by transitioning
> from TID to 0. After that the kernel tries to find the task which is
> associated to the owner TID, creates or reuses kernel state on behalf
> of the owner and attaches the waiter to it. The enqueueing of the 
> waiter is in descending priority order if more than one waiter 
> exists. The owner inherits either the priority or the bandwidth of
> the waiter. This inheritance follows the lock chain in the case of
> nested locking and performs deadlock detection.

Added.

> The timeout argument is handled as described in FUTEX_WAIT. The
> arguments uaddr2, val, and val3 are ignored.

Added. Note, though, that some crufty code gives the impression
that FUTEX_LOCK_PI uses 'val'. I'll send a patch separately.

> Related return values
> 
> [EFAULT] Kernel was unable to access the futex value at uaddr.

Already covered.

> [ENOMEM] Kernel could not allocate state

Added

> [EINVAL] The supplied uaddr argument does not point to a valid 
> object, i.e. pointer is not 4 byte aligned

Already covered.

> [EINVAL] The supplied timeout argument is not normalized.

Already covered.

> [EINVAL]
> The kernel detected inconsistent state between the user space state
> at uaddr and the kernel state. That's either state corruption or it
> found a waiter on uaddr which is waiting on FUTEX_WAIT[_BITSET]

Added.

> [EPERM]  Caller is not allowed to attach itself to the futex. Can be
> a legitimate issue or a hint for state corruption in user space

Added.

> [ESRCH]	 The TID in the user space value does not exist

Added.

> [EAGAIN] The futex owner TID is about to exit, but has not yet 
> handled the internal state cleanup. Try again.

Added.

> [ETIMEDOUT] timeout expired

Already covered.

> [EDEADLOCK] The futex is already locked by the caller or the kernel 
> detected a deadlock scenario in a nested lock chain

Added.

> [EOWNERDIED] The owner of the futex died and the kernel made the 
> caller the new owner. The kernel sets the FUTEX_OWNER_DIED bit in the
> futex userspace value. Caller is responsible for cleanup

There is no such thing as an EOWNERDIED error. I had a look
through the kernel source for the FUTEX_OWNER_DIED cases and didn't 
see an obvious error associated with them. Can you clarify? (I think 
the point is that this condition, which is described in
Documentation/robust-futexes.txt, is not an error as such. However, I'm
not yet sure of how to describe it in the man page.)
I will add this point as a FIXME in the new draft man page.

> [ENOSYS] Not implemented on all architectures and not supported on
> some CPU variants  (runtime detection)  

Added.

==========

> FUTEX_TRYLOCK_PI
> 
> This operation tries to acquire the futex at uaddr. It deals with the
> situation where the TID value at uaddr is 0, but the FUTEX_HAS_WAITER
> bit is set. User space cannot handle this race free.

Added.

> The arguments uaddr2, val, timeout and val3 are ignored.

??? But the code reads:

        case FUTEX_TRYLOCK_PI:
                return futex_lock_pi(uaddr, flags, 0, timeout, 1);
 
which momentarily misleads one into thinking that 'timeout' is used.
And: it's not quite ignored, since in futex_lock_pi() a non-NULL
'timeout' is unconditionally dereferenced (meaning you could get
an EFAULT error for a bad 'timeout' pointer).
I'm confused....

Maybe the above code should be

        case FUTEX_TRYLOCK_PI:
                return futex_lock_pi(uaddr, flags, 0, NULL, 1);
?

> Return values:
> 
> [EFAULT] Kernel was unable to access the futex value at uaddr.

Already covered.

> [ENOMEM] Kernel could not allocate state

Added.

> [EINVAL] The supplied uaddr argument does not point to a valid 
> object, i.e. pointer is not 4 byte aligned

Already covered.

> [EINVAL] The kernel detected inconsistent state between the user 
> space state at uaddr and the kernel state

Added, but with the same text as for FUTEX_LOCK_PI above. I.e., the text
"Thats either state corruption or it found a waiter on uaddr which is
waiting on FUTEX_WAIT[_BITSET]" is also included.

> [EPERM]  Caller is not allowed to attach itself to the futex. Can be
> a legitimate issue or a hint for state corruption in user space

Added.

> [ESRCH]	 The TID in the user space value does not exist

Added.

> [EAGAIN] The futex owner TID is about to exit, but has not yet 
> handled the internal state cleanup. Try again.

Added.

> [EDEADLOCK] The futex is already locked by the caller.

Added.

> [EOWNERDIED] The owner of the futex died and the kernel made the 
> caller the new owner. The kernel sets the FUTEX_OWNER_DIED bit in the
> futex userspace value. Caller is responsible for cleanup

See comment above concerning EOWNERDIED for FUTEX_LOCK_PI

> [ENOSYS] Not implemented on all architectures and not supported on
> some CPU variants (runtime detection)

Added.

==========

> FUTEX_UNLOCK_PI
> 
> This operation wakes the top priority waiter which is waiting in
> FUTEX_LOCK_PI on the futex address provided by the uaddr argument.
> 
> This is called when the user space value at uaddr cannot be changed
> atomically from TID (of the owner) to 0.
> 
> The arguments uaddr2, val, timeout and val3 are ignored.

Added.

> Related return values:  
> [EINVAL] The kernel detected inconsistent
> state between the user space state at uaddr and the kernel state, 
> i.e. it detected a waiter which waits in FUTEX_WAIT[_BITSET].

Added (but with a question in a FIXME).

> [EPERM]  Caller does not own the futex.

Added.

> [ENOSYS] Not implemented on all architectures and not supported on
> some CPU variants (runtime detection)

Added.

==========

> FUTEX_WAIT_REQUEUE_PI
> 
> Wait operation to wait on a non pi futex at uaddr and potentially be
> requeued on a pi futex at uaddr2. The wait operation on uaddr is the
> same as FUTEX_WAIT. The waiter can be removed from the wait on uaddr
> via FUTEX_WAKE without requeuing on uaddr2.

Added.

> The timeout argument is handled as described in FUTEX_WAIT.

The above seems not to be correct. I've written the discussion of
'timeout' up as I understand it, and added a FIXME to the draft page.

> Darren, can you fill in the missing details?

> Return values:
> 
> [EFAULT] Kernel was unable to access the futex value at uaddr or
> uaddr2

Already covered.

> [EINVAL] The supplied uaddr or uaddr2 argument does not point to a
> valid object, i.e. pointer is not 4 byte aligned

Already covered.

> [EINVAL] The supplied timeout argument is not normalized.

Already covered.

> [EINVAL] The supplied bitset is zero.

??? I don't believe this can happen. 'val3' is internally set to
FUTEX_BITSET_MATCH_ANY. Can you confirm?

> [EWOULDBLOCK] The atomic enqueueing failed. User space value at uaddr
> is not equal val argument.

Added using the same text as FUTEX_WAIT:

       EAGAIN (FUTEX_WAIT, FUTEX_WAIT_REQUEUE_PI) The value pointed to
              by  uaddr was not equal to the expected value val at the
              time of the call.

> [ETIMEDOUT] timeout expired

Already covered.

> [EOWNERDIED] The owner of the PI futex at uaddr2 died and the kernel
> made the caller the new owner. The kernel sets the FUTEX_OWNER_DIED
> bit in the uaddr2 futex userspace value.  Caller is responsible for 
> cleanup

See comment above concerning EOWNERDIED for FUTEX_LOCK_PI

> [ENOSYS] Not implemented on all architectures and not supported on
> some CPU variants (runtime detection)

Added.

==========

> FUTEX_CMP_REQUEUE_PI
> 
> PI aware variant of FUTEX_CMP_REQUEUE. Inner futex at uaddr is a non
> PI futex. Outer futex to which is requeued is a PI futex at uaddr2.

I instead used Darren's proposed text:
 
# PI aware variant for FUTEX_CMP_REQUEUE. Requeue tasks blocked on uaddr via
# FUTEX_WAIT_REQUEUE_PI from a non-PI source futex (uaddr) to a PI target
# futex (uaddr2).

> The waiters on uaddr must wait in FUTEX_WAIT_REQUEUE_PI.

Covered above.

> The argument val contains the number of waiters on uaddr which are
> immediately woken up. Must be 1 for this opcode.

Added.

> The timeout argument is abused to transport the number of waiters
> which are requeued on to the futex at uaddr2. The pointer is
> typecasted to u32.

Added.

> Darren, can you fill in the missing details?
> 
> [EFAULT] Kernel was unable to access the futex value at uaddr or
> uaddr2

Already covered.

> [ENOMEM] Kernel could not allocate state

Added.

> [EINVAL] The supplied uaddr/uaddr2 arguments do not point to a valid
> object, i.e. pointer is not 4 byte aligned

Already covered.

> [EINVAL] uaddr equal uaddr2. Requeue to same futex.

Added.

> [EINVAL] The kernel detected inconsistent state between the user
> space state at uaddr and the kernel state, i.e. it detected a waiter
> which waits in FUTEX_LOCK_PI on uaddr

Added

> [EINVAL] The kernel detected inconsistent state between the user
> space state at uaddr and the kernel state, i.e. it detected a waiter
> which waits in FUTEX_WAIT[_BITSET] on uaddr

Added.

> [EINVAL] The kernel detected inconsistent state between the user
> space state at uaddr2 and the kernel state, i.e. it detected a waiter
> which waits in FUTEX_WAIT on uaddr2.

Added.

> [EINVAL] The supplied bitset is zero.

Darren Hart noted: Bitset doesn't apply to FUTEX_CMP_REQUEUE_PI.

> [EAGAIN] uaddr1 readout is not equal the compare value in argument
> val3

Added.

> [EAGAIN] The futex owner TID of uaddr2 is about to exit, but has not
> yet handled the internal state cleanup. Try again.

Added.

> [EPERM]  Caller is not allowed to attach the waiter to the futex at
> uaddr2 Can be a legitimate issue or a hint for state corruption in
> user space

Added.

> [ESRCH]	 The TID in the user space value at uaddr2 does not exist

Added.

> [EDEADLOCK] The requeuing of a waiter to the kernel representation of
> the PI futex at uaddr2 detected a deadlock scenario.

Added.

> [ENOSYS] Not implemented on all architectures and not supported on
> some CPU variants (runtime detection)

Added.

==========

> The various option bits seem to be undocumented as well

Yes, thanks for that.

> FUTEX_PRIVATE_FLAG

I've added this one, along with the detail "(since Linux 2.6.22)"

> This option bit can be ored on all futex ops.
> 
> It tells the kernel, that the futex is process private and not shared
> with another process. That allows the kernel to choose the fast path
> for validating the user space address and avoids expensive VMA
> lookup, taking refcounts on file backing store etc.
> 
> FUTEX_CLOCK_REALTIME

I've added this one, along with the detail "(since Linux 2.6.28)"

> This option bit can be ored on the futex ops FUTEX_WAIT_BITSET and
> FUTEX_WAIT_REQUEUE_PI
> 
> If set the kernel treats the user space supplied timeout as absolute
> time based on CLOCK_REALTIME.
> 
> If not set the kernel treats the user space supplied timeout as
> relative time.
> 
> If this is set on any other op than the supported ones, kernel 
> returns ENOSYS!

The details in the preceding 4 paragraphs have been integrated.

Thanks,

Michael



-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply

* Re: futex(2) man page update help request
From: Michael Kerrisk (man-pages) @ 2015-01-15 15:12 UTC (permalink / raw)
  To: Darren Hart, Thomas Gleixner
  Cc: mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w, Carlos O'Donell,
	Ingo Molnar, Jakub Jelinek,
	linux-man-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, lkml,
	Davidlohr Bueso, Arnd Bergmann, Steven Rostedt, Peter Zijlstra,
	Linux API
In-Reply-To: <CF9A731D.913E6%dvhart-VuQAYsv1563Yd54FQh9/CA@public.gmane.org>

Hello Darren,

I give you the same apology as to Thomas for the 
long-delayed response to your mail.

And I repeat my note to Thomas:
In the next day or two, I hope to send out the new version
of the futex(2) page for review. The new draft is a bit
bigger (okay -- 4 x bigger) than the current page. And there 
are quite a number of FIXMEs that I've placed in the page 
for various points--some minor, but a few major--that need
to be checked or fixed. Would you have some time to review
that page? 

In the meantime, I have a couple of questions, which, if 
you could answer them, I would work some changes into the 
page before sending.

1. In various places, distinction is made between non-PI 
   futexes and PI futexes. But what determines that distinction?
   From the kernel's perspective, what makes a futex one type
   or another? I presume it is to do with the types of blocking
   waiters on the futex, but it would be good to have a formal
   definition.

2. Can you say something about the pairing requirements of
   FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI?
   What is the requirement and why do we need it?

Most of the rest of this mail is just a checklist noting
what I did with your comments. No response is needed 
in most cases, but there is one that I have marked with
"???". If you could reply to that, I'd be grateful.

On 05/15/2014 10:35 PM, Darren Hart wrote:
> On 5/15/14, 7:14, "Thomas Gleixner" <tglx-hfZtesqFncYOwBW4kG4KsQ@public.gmane.org> wrote:
> 
> Wow Thomas, I planned to do exactly this and you beat me to it. Again.
> Thanks for getting this started.
> 
> Michael, I imagine you want something more condensed, and I'll add to what
> tglx posted (inline below) to try and get you that, but if you have
> questions and need to fill in the gap, the paper I presented at RTLWS11 in
> '09 covers this particularly nasty OPCODE in detail:
> 
> http://lwn.net/images/conf/rtlws11/papers/proc/p10.pdf
> 
> I believe Michael is looking for some higher level documentation, like how
> to use these and what they are intended for. 

Yes, that would be good.

> Probably something more like
> Ulrich's Futexes are Tricky paper - but let's start with getting the op
> codes, arguments, and return codes fleshed out.

Okay.

> For all the PI opcodes, we should probably mention something about the
> futex value scheme (TID), whereas the other opcodes do not require any
> specific value scheme.
> 
> No Owner:	0
> Owner:		TID
> Waiters:	TID | FUTEX_WAITERS
> 
> This is the relevant section from the referenced paper:
> 				
> The PI futex operations diverge from the oth-
> ers in that they impose a policy describing how
> the futex value is to be used. If the lock is un-
> owned, the futex value shall be 0. If owned, it
> shall be the thread id (tid) of the owning thread.
> If there are threads contending for the lock, then
> the FUTEX_WAITERS flag is set. With this policy in
> place, userspace can atomically acquire an unowned
> lock or release an uncontended lock using an atomic
> instruction and their own tid. A non-zero futex
> value will force waiters into the kernel to lock. The
> FUTEX_WAITERS flag forces the owner into the kernel
> to unlock. If the callers are forced into the kernel,
> they then deal directly with an underlying rt_mutex
> which implements the priority inheritance semantics.
> After the rt_mutex is acquired, the futex value is up-
> dated accordingly, before the calling thread returns
> to userspace.
>
> It is important to note that the kernel will update the futex value prior
> to returning to userspace. Unlike other futex op codes,
> FUTEX_CMP_REUQUE_PI (and FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI are designed
> for the implementation of very specific IPC mechanisms).

??? Great text. May I presume that I can take this text 
and freely adapt it for the man page? (Actually, this is a 
request for forgiveness, rather than permission :-).)

>> FUTEX_CMP_REQUEUE_PI
>>
>> 	PI aware variant of FUTEX_CMP_REQUEUE. Inner futex at uaddr is
>> 	a non PI futex. Outer futex to which is requeued is a PI futex
>> 	at uaddr2.
> 
> Inner/outer terminology applies specifically to the glibc pthread
> condition variable and mutex use case, but is overly specific for the man
> page. Consider:
> 
> PI aware variant for FUTEX_CMP_REQUEUE. Requeue tasks blocked on uaddr via
> FUTEX_WAIT_REQUEUE_PI from a non-PI source futex (uaddr) to a PI target
> futex (uaddr2).

Thanks for that text. It is easier to grasp.

>>
>> 	The waiters on uaddr must wait in FUTEX_WAIT_REQUEUE_PI.
>>
>> 	The argument val is contains the number of waiters on uaddr
>> 	which are immediately woken up. Must be 1 for this opcode.
> 
> Because the point is to avoid the thundering herd in the first place, and
> other nasty little races and faulting corner cases...

I added the piece about "thundering herd".

>> 	The timeout argument is abused to transport the number of
>> 	waiters which are requeued on to the futex at uaddr2. The
>> 	pointer is typecasted to u32.
> 
> 
>           val3 contains the expected value of uaddr (same as
> FUTEX_CMP_REQUEUE)

Yes. (The text now says that 'val3' has the same purpose as 
for FUTEX_CMP_REQUEUE.)

>> Darren, can you fill in the missing details?
> 
> Yup...
> 
>>
>> 	[EFAULT] Kernel was unable to access the futex value at uaddr
>> 		 or uaddr2
>>
>> 	[ENOMEM] Kernel could not allocate state
>>
>> 	[EINVAL] The supplied uaddr/uaddr2 arguments do not point to a
>> 		 valid object, i.e. pointer is not 4 byte aligned
>>
>> 	[EINVAL] uaddr equal uaddr2. Requeue to same futex.
>>
>> 	[EINVAL] The kernel detected inconsistent state between the
>> 		 user space state at uaddr and the kernel state,
>> 		 i.e. it detected a waiter which waits in
>> 		 FUTEX_LOCK_PI on uaddr
> 
>                    instead of FUTEX_WAIT_REQUEUE_PI.

Thanks. I added that detail.

>> 	[EINVAL] The kernel detected inconsistent state between the
>> 		 user space state at uaddr and the kernel state,
>> 		 i.e. it detected a waiter which waits in
>> 		 FUTEX_WAIT[_BITSET] on uaddr
>>
>> 	[EINVAL] The kernel detected inconsistent state between the
>> 		 user space state at uaddr2 and the kernel state,
>> 		 i.e. it detected a waiter which waits in
>> 		 FUTEX_WAIT on uaddr2.
> 
>           [EINVAL] The kernel detected the FUTEX_CMP_REQUEUE_PI call is
>                    attempting to requeue a task to a futex other than that
>                    specified by the matching FUTEX_WAIT_REQUEUE_PI call for
>                    that task.

Thanks. Added.

> A number of these EINVALs can probably be combined into "Kernel detected
> bad state" as far as the C library is concerned, but we can consolidate
> later. But basically, EINVAL is returned if the non-pi to pi or op pairing
> semantics are violated.

I think the page probably needs some text to cover that point. I'll add
a FIXME for review.

>>  	[EINVAL] The supplied bitset is zero.
> 
> Bitset doesn't apply to FUTEX_CMP_REQUEUE_PI.

Thanks.

>           [EINVAL] nr_wake != 1

Thanks, I'd already spotted this, but it's good to have confirmation.

> EAGAIN == EWOULDBLOCK. We use each in the kernel, but will just refer to
> them here as EAGAIN.

Yes. And I've followed that convention now in the man page.

>> 	[EAGAIN] uaddr1 readout is not equal the compare value in
>> 		 argument val3
>>
>> 	[EAGAIN] The futex owner TID of uaddr2 is about to exit, but
>> 		 has not yet handled the internal state cleanup. Try
>> 		 again.
>>
>> 	[EPERM]  Caller is not allowed to attach the waiter to the
>> 		 futex at uaddr2 Can be a legitimate issue or a hint
>> 		 for state corruption in user space
>>
>> 	[ESRCH]	 The TID in the user space value at uaddr2 does not exist
> 
> Hrm, I'm missing ESRCH and EPERM in my state diagrams.... put yes, we can
> get ESRCH when looking up PI state, and we can return that from
> futex_requeue.... That needs some time to review...
> 
> I'm not seeing the EPERM path, where is that coming from?

Any further insight on the above?

>> 	[EDEADLOCK] The requeuing of a waiter to the kernel representation
>> 		    of the PI futex at uaddr2 detected a deadlock scenario.
>>
>>        [ENOSYS] Not implemented on all architectures and not supported
>> 		 on some CPU variants (runtime detection)
> 
> Return value >= 0 is successful, indicating the number of of tasks
> requeued or woken (3 requeued and 1 woken would return 4).

Yes. Already noted.

Cheers,

Michael


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [LSF/MM TOPIC] userfaultfd
From: Austin S Hemmelgarn @ 2015-01-15 16:08 UTC (permalink / raw)
  To: Andrea Arcangeli,
	lsf-pc-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
  Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <20150114230130.GR6103-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

[-- Attachment #1: Type: text/plain, Size: 876 bytes --]

On 2015-01-14 18:01, Andrea Arcangeli wrote:
> 7) distributed shared memory that could allow simultaneous mapping of
>     regions marked readonly and collapse them on the first exclusive
>     write. I'm mentioning it as a corollary, because I'm not aware of
>     anybody who is planning to use it that way (still I'd like that
>     this will be possible too just in case it finds its way later on).
While I haven't actually written any code for it yet, I've been thinking 
about the possibility to use this to allow qemu to do distributed 
emulation of a NUMA system (ie, you could run qemu on a Beowulf cluster 
and make it look to the guest OS like it's running on a big NUMA system, 
essentially SSI clustering for people who don't have a multi-million 
dollar budget).  Having userfaultfd to work with would make this 
exponentially easier to implement.


[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 2455 bytes --]

^ permalink raw reply

* Re: [PATCH RESEND v3] gpio: lib-sysfs: Add 'wakeup' attribute
From: Linus Walleij @ 2015-01-15 16:34 UTC (permalink / raw)
  To: Soren Brinkmann, Johan Hovold
  Cc: Alexandre Courbot, linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-gpio-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	linux-doc-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
In-Reply-To: <1420481779-22841-1-git-send-email-soren.brinkmann-gjFFaj9aHVfQT0dZR+AlfA@public.gmane.org>

On Mon, Jan 5, 2015 at 7:16 PM, Soren Brinkmann
<soren.brinkmann-gjFFaj9aHVfQT0dZR+AlfA@public.gmane.org> wrote:

> Add an attribute 'wakeup' to the GPIO sysfs interface which allows
> marking/unmarking a GPIO as wake IRQ.
> The file 'wakeup' is created in each exported GPIOs directory, if an IRQ
> is associated with that GPIO and the irqchip implements set_wake().
> Writing 'enabled' to that file will enable wake for that GPIO, while
> writing 'disabled' will disable wake.
> Reading that file will return either 'disabled' or 'enabled' depening on
> the currently set flag for the GPIO's IRQ.
>
> Signed-off-by: Soren Brinkmann <soren.brinkmann-gjFFaj9aHVfQT0dZR+AlfA@public.gmane.org>
> Reviewed-by: Alexandre Courbot <acourbot-DDmLM1+adcrQT0dZR+AlfA@public.gmane.org>
> ---
> v3:
>  - add documentation

As per discussion with Johan Hovold I have had to remove this patch
from the development tree.

Please rebase as indicated on the "fixes" branch, fix any resource
leaks, include Johan Hovold on CC for review and resend.

Yours,
Linus Walleij

^ permalink raw reply

* Re: [PATCH RESEND v3] gpio: lib-sysfs: Add 'wakeup' attribute
From: Sören Brinkmann @ 2015-01-15 16:37 UTC (permalink / raw)
  To: Linus Walleij
  Cc: Johan Hovold, Alexandre Courbot, linux-api,
	linux-kernel@vger.kernel.org, linux-gpio@vger.kernel.org,
	linux-doc@vger.kernel.org
In-Reply-To: <CACRpkdYJ_kZw3UfYc5vGd0cxwDJD5WR69c6Q5xTvmFnb7NOfkA@mail.gmail.com>

On Thu, 2015-01-15 at 05:34PM +0100, Linus Walleij wrote:
> On Mon, Jan 5, 2015 at 7:16 PM, Soren Brinkmann
> <soren.brinkmann@xilinx.com> wrote:
> 
> > Add an attribute 'wakeup' to the GPIO sysfs interface which allows
> > marking/unmarking a GPIO as wake IRQ.
> > The file 'wakeup' is created in each exported GPIOs directory, if an IRQ
> > is associated with that GPIO and the irqchip implements set_wake().
> > Writing 'enabled' to that file will enable wake for that GPIO, while
> > writing 'disabled' will disable wake.
> > Reading that file will return either 'disabled' or 'enabled' depening on
> > the currently set flag for the GPIO's IRQ.
> >
> > Signed-off-by: Soren Brinkmann <soren.brinkmann@xilinx.com>
> > Reviewed-by: Alexandre Courbot <acourbot@nvidia.com>
> > ---
> > v3:
> >  - add documentation
> 
> As per discussion with Johan Hovold I have had to remove this patch
> from the development tree.
> 
> Please rebase as indicated on the "fixes" branch, fix any resource
> leaks, include Johan Hovold on CC for review and resend.

Is that public somewhere? What are the issues?

	Sören

^ permalink raw reply

* Re: [PATCH] virtio_balloon: coding style fixes
From: Michael S. Tsirkin @ 2015-01-15 18:50 UTC (permalink / raw)
  To: Michal Hocko; +Cc: linux-api, linux-kernel, virtualization
In-Reply-To: <20150115141308.GH7000@dhcp22.suse.cz>

On Thu, Jan 15, 2015 at 03:13:08PM +0100, Michal Hocko wrote:
> On Thu 15-01-15 15:44:12, Michael S. Tsirkin wrote:
> > On Thu, Jan 15, 2015 at 02:06:42PM +0100, Michal Hocko wrote:
> > > On Thu 15-01-15 13:39:06, Michael S. Tsirkin wrote:
> > > > Most of our code has
> > > > struct foo {
> > > > }
> > > > 
> > > > Fix two instances where balloon is inconsistent.
> > > 
> > > I hate to complain but is it really necessary to post such patches to
> > > linux-api?
> > 
> > Well it's human to err, so it seems wise to copy parties
> > interested in the ABI/API whenever we are changing files under include/uapi.
> > Whitespace changes should mostly be safe, but it's not unknown
> > e.g. to include unrelated changes in the same commit by mistake.
> > 
> > > I thought the list was primarily for API related discussions.
> > 
> > Basically this line in MAINTAINERS
> > 
> > ABI/API
> > L:      linux-api@vger.kernel.org
> > F:      Documentation/ABI/
> > F:      include/linux/syscalls.h
> > F:      include/uapi/
> > F:      kernel/sys_ni.c
> > 
> > normally means "send all patches affecting files under include/uapi/ to
> > this list", does it not?
> 
> Well, this should always be taken as a hint not a hard rule. So if there
> is a change which is adding/removing or changing signature then sure but
> not everything falls into that category.

At least for code I maintain, I really wish people would just Cc me in
any case.  There's been a bunch of cases where people don't Cc me, and
then another maintainer assumes my silence implies agreement, and
applies.  Not nice. OTOH it's easy to ignore an irrelevant patch.

> > Wasn't this the intent?
> > 
> > > This is not the only mail sent here which doesn't fall into that
> > > category IMO. It is far from low volume list for quite some time.
> > > 
> > > Please let's get back low volume and API only discussion!
> > 
> > Maybe send patch dropping include/uapi/ from there,
> > should help drive the volumes down?
> 
> This would be an overkill IMO. It would be much more preferable if
> people actually think about who from the suggested list (either from
> MAINTAINERS or ./scripts/get_maintainer.pl) should be really added.
> 
> [...]

Yea, think about it, then what?  I've no idea what is linux-abi for, and
what people subscribed there are interested in. How should I? All I know
is what's in MAINTAINERS, which say "ABI/API". So I copy all ABI/API
patches there.

> -- 
> Michal Hocko
> SUSE Labs
> _______________________________________________
> Virtualization mailing list
> Virtualization@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* [PATCH v4] gpio: lib-sysfs: Add 'wakeup' attribute
From: Soren Brinkmann @ 2015-01-15 19:49 UTC (permalink / raw)
  To: Linus Walleij, Johan Hovold
  Cc: Alexandre Courbot, linux-api, linux-kernel, linux-gpio, linux-doc,
	Soren Brinkmann

Add an attribute 'wakeup' to the GPIO sysfs interface which allows
marking/unmarking a GPIO as wake IRQ.
The file 'wakeup' is created in each exported GPIOs directory, if an IRQ
is associated with that GPIO and the irqchip implements set_wake().
Writing 'enabled' to that file will enable wake for that GPIO, while
writing 'disabled' will disable wake.
Reading that file will return either 'disabled' or 'enabled' depending on
the currently set flag for the GPIO's IRQ.

Signed-off-by: Soren Brinkmann <soren.brinkmann@xilinx.com>
Reviewed-by: Alexandre Courbot <acourbot@nvidia.com>
---
Hi Linus, Johan,

I rebased my patch. And things look good. But the 'is_visible' thing does not
behave the way I expected it to. It seems to be only triggered on an export but
not when attributes change. Hence, in my case, everything was visible since the
initial state matches that, but even when changing the direction or things like
that, attributes don't disappear. Is that something still worked on? Expected
behavior?

	Thanks,
	Sören

v4:
 - rebased onto gpio/fixes
   - fit into the new attribute framework
   - extend is_visible to limit the wakeup attributes visibility
     according to usability
v3:
 - add documentation
v2:
 - fix error path to unlock mutex before return
---
 Documentation/ABI/testing/sysfs-gpio |  1 +
 Documentation/gpio/sysfs.txt         |  8 +++++
 drivers/gpio/gpiolib-sysfs.c         | 62 ++++++++++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-gpio b/Documentation/ABI/testing/sysfs-gpio
index 80f4c94c7bef..4cc7f4b3f724 100644
--- a/Documentation/ABI/testing/sysfs-gpio
+++ b/Documentation/ABI/testing/sysfs-gpio
@@ -20,6 +20,7 @@ Description:
 	    /value ... always readable, writes fail for input GPIOs
 	    /direction ... r/w as: in, out (default low); write: high, low
 	    /edge ... r/w as: none, falling, rising, both
+	    /wakeup ... r/w as: enabled, disabled
 	/gpiochipN ... for each gpiochip; #N is its first GPIO
 	    /base ... (r/o) same as N
 	    /label ... (r/o) descriptive, not necessarily unique
diff --git a/Documentation/gpio/sysfs.txt b/Documentation/gpio/sysfs.txt
index c2c3a97f8ff7..f703377d528f 100644
--- a/Documentation/gpio/sysfs.txt
+++ b/Documentation/gpio/sysfs.txt
@@ -97,6 +97,14 @@ and have the following read/write attributes:
 		for "rising" and "falling" edges will follow this
 		setting.
 
+	"wakeup" ... reads as either "enabled" or "disabled". Write these
+		strings to set/clear the 'wakeup' flag of the IRQ associated
+		with this GPIO. If the IRQ has the 'wakeup' flag set, it can
+		wake the system from sleep states.
+
+		This file exists only if the pin can generate interrupts and
+		the driver implements the required infrastructure.
+
 GPIO controllers have paths like /sys/class/gpio/gpiochip42/ (for the
 controller implementing GPIOs starting at #42) and have the following
 read-only attributes:
diff --git a/drivers/gpio/gpiolib-sysfs.c b/drivers/gpio/gpiolib-sysfs.c
index f62aa115d79a..14b34d15e61c 100644
--- a/drivers/gpio/gpiolib-sysfs.c
+++ b/drivers/gpio/gpiolib-sysfs.c
@@ -286,6 +286,58 @@ found:
 
 static DEVICE_ATTR(edge, 0644, gpio_edge_show, gpio_edge_store);
 
+static ssize_t gpio_wakeup_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	ssize_t			status;
+	const struct gpio_desc	*desc = dev_get_drvdata(dev);
+	int			irq = gpiod_to_irq(desc);
+	struct irq_desc		*irq_desc = irq_to_desc(irq);
+
+	mutex_lock(&sysfs_lock);
+
+	if (irqd_is_wakeup_set(&irq_desc->irq_data))
+		status = sprintf(buf, "enabled\n");
+	else
+		status = sprintf(buf, "disabled\n");
+
+	mutex_unlock(&sysfs_lock);
+
+	return status;
+}
+
+static ssize_t gpio_wakeup_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t size)
+{
+	int			ret;
+	unsigned int		on;
+	struct gpio_desc	*desc = dev_get_drvdata(dev);
+	int			irq = gpiod_to_irq(desc);
+
+	mutex_lock(&sysfs_lock);
+
+	if (sysfs_streq("enabled", buf)) {
+		on = true;
+	} else if (sysfs_streq("disabled", buf)) {
+		on = false;
+	} else {
+		mutex_unlock(&sysfs_lock);
+		return -EINVAL;
+	}
+
+	ret = irq_set_irq_wake(irq, on);
+
+	mutex_unlock(&sysfs_lock);
+
+	if (ret)
+		pr_warn("%s: failed to %s wake\n", __func__,
+				on ? "enable" : "disable");
+
+	return size;
+}
+
+static DEVICE_ATTR(wakeup, 0644, gpio_wakeup_show, gpio_wakeup_store);
+
 static int sysfs_set_active_low(struct gpio_desc *desc, struct device *dev,
 				int value)
 {
@@ -361,6 +413,7 @@ static umode_t gpio_is_visible(struct kobject *kobj, struct attribute *attr,
 {
 	struct device *dev = container_of(kobj, struct device, kobj);
 	struct gpio_desc *desc = dev_get_drvdata(dev);
+	struct irq_chip *irqchip = desc->chip->irqchip;
 	umode_t mode = attr->mode;
 	bool show_direction = test_bit(FLAG_SYSFS_DIR, &desc->flags);
 
@@ -372,6 +425,14 @@ static umode_t gpio_is_visible(struct kobject *kobj, struct attribute *attr,
 			mode = 0;
 		if (!show_direction && test_bit(FLAG_IS_OUT, &desc->flags))
 			mode = 0;
+	} else if (attr == &dev_attr_wakeup.attr) {
+		if (gpiod_to_irq(desc) < 0)
+			mode = 0;
+		if (!show_direction && test_bit(FLAG_IS_OUT, &desc->flags))
+			mode = 0;
+		if (!test_bit(IRQCHIP_SKIP_SET_WAKE, &irqchip->flags) &&
+				!irqchip->irq_set_wake)
+			mode = 0;
 	}
 
 	return mode;
@@ -382,6 +443,7 @@ static struct attribute *gpio_attrs[] = {
 	&dev_attr_edge.attr,
 	&dev_attr_value.attr,
 	&dev_attr_active_low.attr,
+	&dev_attr_wakeup.attr,
 	NULL,
 };
 
-- 
2.2.1.1.gb42cc81


^ permalink raw reply related

* Re: [PATCH] virtio_balloon: coding style fixes
From: Michal Hocko @ 2015-01-15 20:16 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-api-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA
In-Reply-To: <20150115185018.GA31068-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

On Thu 15-01-15 20:50:18, Michael S. Tsirkin wrote:
> On Thu, Jan 15, 2015 at 03:13:08PM +0100, Michal Hocko wrote:
> > On Thu 15-01-15 15:44:12, Michael S. Tsirkin wrote:
[...]
> > > Maybe send patch dropping include/uapi/ from there,
> > > should help drive the volumes down?
> > 
> > This would be an overkill IMO. It would be much more preferable if
> > people actually think about who from the suggested list (either from
> > MAINTAINERS or ./scripts/get_maintainer.pl) should be really added.
> > 
> > [...]
> 
> Yea, think about it, then what?  I've no idea what is linux-abi for, and
> what people subscribed there are interested in. How should I?

https://www.kernel.org/doc/man-pages/linux-api-ml.html

And by no means did I intend to direct the critique just at you or only
at this particular patch. I have just grown frustrated because there are so many
patches which are no longer API/ABI related flying here. This was just
an example.

High volume of unrelated emails will turn this into another lkml, I am
afraid.
-- 
Michal Hocko
SUSE Labs

^ permalink raw reply

* Re: [PATCH] virtio_balloon: coding style fixes
From: Michael Kerrisk (man-pages) @ 2015-01-15 21:09 UTC (permalink / raw)
  To: Michael S. Tsirkin, Michal Hocko
  Cc: linux-api, virtualization, mtk.manpages, linux-kernel
In-Reply-To: <20150115134412.GA23874@redhat.com>

On 01/15/2015 02:44 PM, Michael S. Tsirkin wrote:
> On Thu, Jan 15, 2015 at 02:06:42PM +0100, Michal Hocko wrote:
>> On Thu 15-01-15 13:39:06, Michael S. Tsirkin wrote:
>>> Most of our code has
>>> struct foo {
>>> }
>>>
>>> Fix two instances where balloon is inconsistent.
>>
>> I hate to complain but is it really necessary to post such patches to
>> linux-api?
> 
> Well it's human to err, so it seems wise to copy parties
> interested in the ABI/API whenever we are changing files under include/uapi.
> Whitespace changes should mostly be safe, but it's not unknown
> e.g. to include unrelated changes in the same commit by mistake.
> 
>> I thought the list was primarily for API related discussions.
> 
> Basically this line in MAINTAINERS
> 
> ABI/API
> L:      linux-api@vger.kernel.org
> F:      Documentation/ABI/
> F:      include/linux/syscalls.h
> F:      include/uapi/
> F:      kernel/sys_ni.c
> 
> normally means "send all patches affecting files under include/uapi/ to
> this list", does it not?
> 
> Wasn't this the intent?
> 
>> This is not the only mail sent here which doesn't fall into that
>> category IMO. It is far from low volume list for quite some time.
>>
>> Please let's get back low volume and API only discussion!
> 
> Maybe send patch dropping include/uapi/ from there,
> should help drive the volumes down?

Well, regardless of what it technically means, there's always going
to be scope for ambiguity, and that's where we differ from computers:
we can ask ourselves the question: will other human beings interested 
in the API/ABI care about this patch?

Thanks,

Michael (also saddened about increasing noise on linux-api)

-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply

* Re: futex(2) man page update help request
From: Thomas Gleixner @ 2015-01-15 22:23 UTC (permalink / raw)
  To: Michael Kerrisk (man-pages)
  Cc: Carlos O'Donell, Darren Hart, Ingo Molnar, Jakub Jelinek,
	linux-man-u79uwXL29TY76Z2rM5mHXA@public.gmane.org, lkml,
	Davidlohr Bueso, Arnd Bergmann, Steven Rostedt, Peter Zijlstra,
	Linux API, Torvald Riegel, Roland McGrath, Darren Hart,
	Anton Blanchard
In-Reply-To: <54B7D87C.3090901-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>

On Thu, 15 Jan 2015, Michael Kerrisk (man-pages) wrote:
> > [EINVAL] uaddr equal uaddr2. Requeue to same futex.
> 
> ??? I added this, but does this error not occur only for PI requeues?

It's equally wrong for normal futexes. And it's actually the same code
checking for this for all variants.

> > [EDEADLOCK] The futex is already locked by the caller or the kernel 
> > detected a deadlock scenario in a nested lock chain
>
> Added.

It's actually EDEADLK

> 
> > [EOWNERDIED] The owner of the futex died and the kernel made the 
> > caller the new owner. The kernel sets the FUTEX_OWNER_DIED bit in the
> > futex userspace value. Caller is responsible for cleanup
> 
> There is no such thing as an EOWNERDIED error. I had a look
> through the kernel source for the FUTEX_OWNER_DIED cases and didn't 
> see an obvious error associated with them. Can you clarify? (I think 
> the point is that this condition, which is described in
> Documentation/robust-futexes.txt, is not an error as such. However, I'm
> not yet sure of how to describe it in the man page.)
> I will add this point as a FIXME in the new draft man page.

Oops. My bad. That's not what the kernel does. The kernel merely
marks it in the futex itself with FUTEX_OWNER_DIED. User space needs
to deal with that and the posix users return EOWNERDEAD (not
EOWNERDIED), so it's not part of the futex call itself.

We had discussions about returning EOWNERDEAD in that case, but then
glibc with its sophisticated error handling prevented that ....
 
> > FUTEX_TRYLOCK_PI
> > 
> > This operation tries to acquire the futex at uaddr. It deals with the
> > situation where the TID value at uaddr is 0, but the FUTEX_HAS_WAITER
> > bit is set. User space cannot handle this race free.
> 
> Added.
> 
> > The arguments uaddr2, val, timeout and val3 are ignored.
> 
> ??? But the code reads:
> 
>         case FUTEX_TRYLOCK_PI:
>                 return futex_lock_pi(uaddr, flags, 0, timeout, 1);
>  
> which momentarily misleads one into thinking that 'timeout' is used.
> And: it's not quite ignored, since in futex_lock_pi() a non-NULL
> 'timeout' is unconditionally dereferenced (meaning you could get
> an EFAULT error for a bad 'timeout' pointer).
> I'm confused....

Indeed. That's just wrong.
 
> Maybe the above code should be
> 
>         case FUTEX_TRYLOCK_PI:
>                 return futex_lock_pi(uaddr, flags, 0, NULL, 1);
> ?

Care to send a patch?
 
> > FUTEX_WAIT_REQUEUE_PI
> > 
> > Wait operation to wait on a non pi futex at uaddr and potentially be
> > requeued on a pi futex at uaddr2. The wait operation on uaddr is the
> > same as FUTEX_WAIT. The waiter can be removed from the wait on uaddr
> > via FUTEX_WAKE without requeuing on uaddr2.
> 
> Added.
> 
> > The timeout argument is handled as described in FUTEX_WAIT.
> 
> The above seems not to be correct. I've written the discussion of
> 'timeout' up as I understand it, and added a FIXME to the draft page.
> 
> > Darren, can you fill in the missing details?
> 
> > Return values:
> > 
> > [EFAULT] Kernel was unable to access the futex value at uaddr or
> > uaddr2
> 
> Already covered.
> 
> > [EINVAL] The supplied uaddr or uaddr2 argument does not point to a
> > valid object, i.e. pointer is not 4 byte aligned
> 
> Already covered.
> 
> > [EINVAL] The supplied timeout argument is not normalized.
> 
> Already covered.
> 
> > [EINVAL] The supplied bitset is zero.
> 
> ??? I don't believe this can happen. 'val3' is internally set to
> FUTEX_BITSET_MATCH_ANY. Can you confirm?

Right. We don't support that bitset stuff in requeue_pi ATM.
 
Thanks,

	tglx

^ permalink raw reply

* Re: [PATCH v4 20/20] kbuild: add a new kselftest_install make target to install selftests
From: Michael Ellerman @ 2015-01-15 22:58 UTC (permalink / raw)
  To: Shuah Khan
  Cc: mmarek-AlSwsSmVLrQ, masami.hiramatsu.pt-FCd8Q96Dh0JBDgjK7y7TUQ,
	gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	rostedt-nx8X9YLhiw1AfugRpC6u6w, mingo-H+wXaHxf7aLQT0dZR+AlfA,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q, keescook-F7+t8E8rja9g9hUCZPvPmw,
	tranmanphong-Re5JQEeQqe8AvxtiuMwx3w, cov-sgV2jX0FEOL9JmXXK+q4OQ,
	dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
	bobby.prani-Re5JQEeQqe8AvxtiuMwx3w,
	serge.hallyn-GeWIH/nMZzLQT0dZR+AlfA,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w, tim.bird-/MT0OVThwyLZJqsBc5GL+g,
	josh-iaAMLnmF4UmaiuxdJuQwMA, koct9i-Re5JQEeQqe8AvxtiuMwx3w,
	linux-kbuild-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <54B69A1B.80006-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org>

On Wed, 2015-01-14 at 09:32 -0700, Shuah Khan wrote:
> On 01/06/2015 12:43 PM, Shuah Khan wrote:
> > Add a new make target to install to install kernel selftests.
> > This new target will build and install selftests. kselftest
> > target now depends on kselftest_install and runs the generated
> > kselftest script to reduce duplicate work and for common look
> > and feel when running tests.
> > 
> > make kselftest_target:
> > -- exports kselftest INSTALL_KSFT_PATH
> >    default $(INSTALL_MOD_PATH)/lib/kselftest/$(KERNELRELEASE)
> > -- exports INSTALL_KSFT_PATH
> > -- runs selftests make install target
> > 
> > Signed-off-by: Shuah Khan <shuahkh-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org>
> 
> Hi Marek,
> 
> Could you please Ack this patch, if this version looks good,
> so I can take this through ksefltest tree.

Hi Shuah,

I posted what I think is a better approach last week. It's less code and less
repetition, and installs the majority of tests this series missed - ie. the
powerpc ones.

Please at least respond to it before you put this series into next.

https://lkml.org/lkml/2015/1/9/45

cheers

^ permalink raw reply

* Re: [PATCH v4 20/20] kbuild: add a new kselftest_install make target to install selftests
From: Shuah Khan @ 2015-01-15 23:41 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: mmarek-AlSwsSmVLrQ, masami.hiramatsu.pt-FCd8Q96Dh0JBDgjK7y7TUQ,
	gregkh-hQyY1W1yCW8ekmWlsbkhG0B+6BGkLq7r,
	akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b,
	rostedt-nx8X9YLhiw1AfugRpC6u6w, mingo-H+wXaHxf7aLQT0dZR+AlfA,
	davem-fT/PcQaiUtIeIZ0/mPfg9Q, keescook-F7+t8E8rja9g9hUCZPvPmw,
	tranmanphong-Re5JQEeQqe8AvxtiuMwx3w, cov-sgV2jX0FEOL9JmXXK+q4OQ,
	dh.herrmann-Re5JQEeQqe8AvxtiuMwx3w, hughd-hpIqsD4AKlfQT0dZR+AlfA,
	bobby.prani-Re5JQEeQqe8AvxtiuMwx3w,
	serge.hallyn-GeWIH/nMZzLQT0dZR+AlfA,
	ebiederm-aS9lmoZGLiVWk0Htik3J/w, tim.bird-/MT0OVThwyLZJqsBc5GL+g,
	josh-iaAMLnmF4UmaiuxdJuQwMA, koct9i-Re5JQEeQqe8AvxtiuMwx3w,
	linux-kbuild-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1421362698.23332.6.camel-Gsx/Oe8HsFggBc27wqDAHg@public.gmane.org>

On 01/15/2015 03:58 PM, Michael Ellerman wrote:
> On Wed, 2015-01-14 at 09:32 -0700, Shuah Khan wrote:
>> On 01/06/2015 12:43 PM, Shuah Khan wrote:
>>> Add a new make target to install kernel selftests.
>>> This new target will build and install selftests. kselftest
>>> target now depends on kselftest_install and runs the generated
>>> kselftest script to reduce duplicate work and for common look
>>> and feel when running tests.
>>>
>>> make kselftest_target:
>>> -- exports kselftest INSTALL_KSFT_PATH
>>>    default $(INSTALL_MOD_PATH)/lib/kselftest/$(KERNELRELEASE)
>>> -- exports INSTALL_KSFT_PATH
>>> -- runs selftests make install target
>>>
>>> Signed-off-by: Shuah Khan <shuahkh-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org>
>>
>> Hi Marek,
>>
>> Could you please Ack this patch, if this version looks good,
>> so I can take this through kselftest tree.
> 
> Hi Shuah,
> 
> I posted what I think is a better approach last week. It's less code and less
> repetition, and installs the majority of tests this series missed - ie. the
> powerpc ones.
> 
> Please at least respond to it before you put this series into next.
> 
> https://lkml.org/lkml/2015/1/9/45
> 

Michael,

I didn't get a chance to review your patches yet. I plan to
review and give you feedback by end of this week. This patch
is needed in any case for either approach.

thanks,
-- Shuah


-- 
Shuah Khan
Sr. Linux Kernel Developer
Open Source Innovation Group
Samsung Research America (Silicon Valley)
shuahkh-JPH+aEBZ4P+UEJcrhfAQsw@public.gmane.org | (970) 217-8978

^ permalink raw reply

* Re: [PATCH] crypto: algif - change algif_skcipher to be asynchronous
From: Herbert Xu @ 2015-01-16  2:00 UTC (permalink / raw)
  To: Tadeusz Struk; +Cc: davem, linux-crypto, qat-linux, linux-api
In-Reply-To: <54B6A939.4070207@intel.com>

On Wed, Jan 14, 2015 at 09:36:57AM -0800, Tadeusz Struk wrote:
> 
> But then would you like to extend AIO interface to take the IV and
> something that would indicate the encrypt/decrypt operation on
> aio_write()? Also as far as I can see AIO doesn't support splice()

Any metadata such as the IV can still go through the existing
sendmsg interface, just as you would do a sendmsg before a sendfile
to set things up.

> operation for zero copy, which is the main thing here.

The AIO interface itself can accommodate zero-copy.  It's just that
we currently don't have any support for it in the network socket
API.

> On the other hand it shouldn't be a problem to add crypto specific
> stuff to include/uapi/linux/if_alg.h, because it is all about crypto
> anyway, is it not?

Yes but you're violating the meaning of sendpage().  The latter
is not crypto-specific so you shouldn't be adding things that
prevent future optimisations to it.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: [PATCH v4 20/20] kbuild: add a new kselftest_install make target to install selftests
From: Michael Ellerman @ 2015-01-16  2:59 UTC (permalink / raw)
  To: Shuah Khan
  Cc: mmarek, masami.hiramatsu.pt, gregkh, akpm, rostedt, mingo, davem,
	keescook, tranmanphong, cov, dh.herrmann, hughd, bobby.prani,
	serge.hallyn, ebiederm, tim.bird, josh, koct9i, linux-kbuild,
	linux-kernel, linux-api, netdev
In-Reply-To: <54B8502C.2040708@osg.samsung.com>

On Thu, 2015-01-15 at 16:41 -0700, Shuah Khan wrote:
> On 01/15/2015 03:58 PM, Michael Ellerman wrote:
> > On Wed, 2015-01-14 at 09:32 -0700, Shuah Khan wrote:
> >> On 01/06/2015 12:43 PM, Shuah Khan wrote:
> >>> Add a new make target to install kernel selftests.
> >>> This new target will build and install selftests. kselftest
> >>> target now depends on kselftest_install and runs the generated
> >>> kselftest script to reduce duplicate work and for common look
> >>> and feel when running tests.
> >>>
> >>> make kselftest_target:
> >>> -- exports kselftest INSTALL_KSFT_PATH
> >>>    default $(INSTALL_MOD_PATH)/lib/kselftest/$(KERNELRELEASE)
> >>> -- exports INSTALL_KSFT_PATH
> >>> -- runs selftests make install target
> >>>
> >>> Signed-off-by: Shuah Khan <shuahkh@osg.samsung.com>
> >>
> >> Hi Marek,
> >>
> >> Could you please Ack this patch, if this version looks good,
> >> so I can take this through kselftest tree.
> > 
> > Hi Shuah,
> > 
> > I posted what I think is a better approach last week. It's less code and less
> > repetition, and installs the majority of tests this series missed - ie. the
> > powerpc ones.
> > 
> > Please at least respond to it before you put this series into next.
> > 
> > https://lkml.org/lkml/2015/1/9/45
> 
> Michael,
> 
> I didn't get a chance to review your patches yet. I plan to
> review and give you feedback by end of this week. 

Thanks. My week ends quite soon (.au time zone), so early next week is fine.

> This patch is needed in any case for either approach.

The version in my series is slightly different, but probably similar enough. I
CC'ed Michal on my series also.

cheers



^ permalink raw reply

* [PATCH tip 0/9] tracing: attach eBPF programs to tracepoints/syscalls/kprobe
From: Alexei Starovoitov @ 2015-01-16  4:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
	Brendan Gregg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA

Hi Ingo, Steven,

This patch set is based on tip/master.
It adds ability to attach eBPF programs to tracepoints, syscalls and kprobes.

Mechanism of attaching:
- load program via bpf() syscall and receive program_fd
- event_fd = open("/sys/kernel/debug/tracing/events/.../filter")
- write 'bpf-123' to event_fd where 123 is program_fd
- program will be attached to particular event and event automatically enabled
- close(event_fd) will detach bpf program from event and event disabled

Program attach point and input arguments:
- programs attached to kprobes receive 'struct pt_regs *' as an input.
  See tracex4_kern.c that demonstrates how users can write a C program like:
  SEC("events/kprobes/sys_write")
  int bpf_prog4(struct pt_regs *regs)
  {
     long write_size = regs->dx; 
     // here the user needs to know the prototype of sys_write() from kernel
     // sources and x64 calling convention to know that register $rdx
     // contains 3rd argument to sys_write() which is 'size_t count'

  it's obviously architecture dependent, but allows building sophisticated
  user tools on top, that can see from debug info of vmlinux which variables
  are in which registers or stack locations and fetch it from there.
  'perf probe' can potentially use this hook to generate programs in user space
  and insert them instead of letting kernel parse string during kprobe creation.

- programs attached to tracepoints and syscalls receive 'struct bpf_context *':
  u64 arg1, arg2, ..., arg6;
  for syscalls they match syscall arguments.
  for tracepoints these args match arguments passed to tracepoint.
  For example:
  trace_sched_migrate_task(p, new_cpu); from sched/core.c
  arg1 <- p        which is 'struct task_struct *'
  arg2 <- new_cpu  which is 'unsigned int'
  arg3..arg6 = 0
  the program can use bpf_fetch_u8/16/32/64/ptr() helpers to walk 'task_struct'
  or any other kernel data structures.
  These helpers are using probe_kernel_read() similar to 'perf probe' which is
  not 100% safe in both cases, but good enough.
  To access task_struct's pid inside 'sched_migrate_task' tracepoint
  the program can do:
  struct task_struct *task = (struct task_struct *)ctx->arg1;
  u32 pid = bpf_fetch_u32(&task->pid);
  Since struct layout is kernel configuration specific such programs are not
  portable and require access to kernel headers to be compiled,
  but in this case we don't need debug info.
  llvm with bpf backend will statically compute task->pid offset as a constant
  based on kernel headers only.
  The example of this arbitrary pointer walking is tracex1_kern.c
  which does skb->dev->name == "lo" filtering.

In all cases the programs are called before trace buffer is allocated to
minimize the overhead, since we want to filter huge number of events, but
buffer alloc/free and argument copy for every event is too costly.
Theoretically we can invoke programs after buffer is allocated, but it
doesn't seem needed, since above approach is faster and achieves the same.

Note, tracepoint/syscall and kprobe programs are two different types:
BPF_PROG_TYPE_TRACING_FILTER and BPF_PROG_TYPE_KPROBE_FILTER,
since they expect different input.
Both use the same set of helper functions:
- map access (lookup/update/delete)
- fetch (probe_kernel_read wrappers)
- memcmp (probe_kernel_read + memcmp)
- dump_stack
- trace_printk
The last two are mainly to debug the programs and to print data for user
space consumption.

Portability:
- kprobe programs are architecture dependent and need user scripting
  language like ktap/stap/dtrace/perf that will dynamically generate
  them based on debug info in vmlinux
- tracepoint programs are architecture independent, but if arbitrary pointer
  walking (with fetch() helpers) is used, they need data struct layout to match.
  Debug info is not necessary
- for networking use case we need to access 'struct sk_buff' fields in portable
  way (user space needs to fetch packet length without knowing skb->len offset),
  so for some frequently used data structures we will add helper functions
  or pseudo instructions to access them. I've hacked few ways specifically
  for skb, but abandoned them in favor of more generic type/field infra.
  That work is still wip. Not part of this set.
  Once it's ready tracepoint programs that access common data structs
  will be kernel independent.

Program return value:
- programs return 0 to discard an event
- and return non-zero to proceed with event (allocate trace buffer, copy
  arguments there and print it eventually in trace_pipe in traditional way)

Examples:
- dropmon.c - simple kfree_skb() accounting in eBPF assembler, similar
  to dropmon tool
- tracex1_kern.c - does net/netif_receive_skb event filtering
  for skb->dev->name == "lo" condition
- tracex2_kern.c - same kfree_skb() accounting like dropmon, but now in C
  plus computes histogram of all write sizes from sys_write syscall
  and prints the histogram in userspace
- tracex3_kern.c - most sophisticated example that computes IO latency
  between block/block_rq_issue and block/block_rq_complete events
  and prints 'heatmap' using gray shades of text terminal.
  Useful to analyze disk performance.
- tracex4_kern.c - computes histogram of write sizes from sys_write syscall
  using kprobe mechanism instead of syscall. Since kprobe is optimized into
  ftrace the overhead of instrumentation is smaller than in example 2.

The user space tools like ktap/dtrace/systemtap/perf that have access
to debug info would probably want to use kprobe attachment point, since kprobe
can be inserted anywhere and all registers are available in the program.
tracepoint attachments are useful without debug info, so standalone tools
like iosnoop will use them.

The main difference vs existing perf_probe/ftrace infra is in kernel aggregation
and conditional walking of arbitrary data structures.

Thanks!

Alexei Starovoitov (9):
  tracing: attach eBPF programs to tracepoints and syscalls
  tracing: allow eBPF programs to call bpf_printk()
  tracing: allow eBPF programs to call ktime_get_ns()
  samples: bpf: simple tracing example in eBPF assembler
  samples: bpf: simple tracing example in C
  samples: bpf: counting example for kfree_skb tracepoint and write
    syscall
  samples: bpf: IO latency analysis (iosnoop/heatmap)
  tracing: attach eBPF programs to kprobe/kretprobe
  samples: bpf: simple kprobe example

 include/linux/ftrace_event.h       |    6 +
 include/trace/bpf_trace.h          |   25 ++++
 include/trace/ftrace.h             |   30 +++++
 include/uapi/linux/bpf.h           |   11 ++
 kernel/trace/Kconfig               |    1 +
 kernel/trace/Makefile              |    1 +
 kernel/trace/bpf_trace.c           |  250 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h               |    3 +
 kernel/trace/trace_events.c        |   41 +++++-
 kernel/trace/trace_events_filter.c |   80 +++++++++++-
 kernel/trace/trace_kprobe.c        |   11 +-
 kernel/trace/trace_syscalls.c      |   31 +++++
 samples/bpf/Makefile               |   18 +++
 samples/bpf/bpf_helpers.h          |   18 +++
 samples/bpf/bpf_load.c             |   62 ++++++++-
 samples/bpf/bpf_load.h             |    3 +
 samples/bpf/dropmon.c              |  129 +++++++++++++++++++
 samples/bpf/tracex1_kern.c         |   28 ++++
 samples/bpf/tracex1_user.c         |   24 ++++
 samples/bpf/tracex2_kern.c         |   71 ++++++++++
 samples/bpf/tracex2_user.c         |   95 ++++++++++++++
 samples/bpf/tracex3_kern.c         |   96 ++++++++++++++
 samples/bpf/tracex3_user.c         |  146 +++++++++++++++++++++
 samples/bpf/tracex4_kern.c         |   36 ++++++
 samples/bpf/tracex4_user.c         |   83 ++++++++++++
 25 files changed, 1290 insertions(+), 9 deletions(-)
 create mode 100644 include/trace/bpf_trace.h
 create mode 100644 kernel/trace/bpf_trace.c
 create mode 100644 samples/bpf/dropmon.c
 create mode 100644 samples/bpf/tracex1_kern.c
 create mode 100644 samples/bpf/tracex1_user.c
 create mode 100644 samples/bpf/tracex2_kern.c
 create mode 100644 samples/bpf/tracex2_user.c
 create mode 100644 samples/bpf/tracex3_kern.c
 create mode 100644 samples/bpf/tracex3_user.c
 create mode 100644 samples/bpf/tracex4_kern.c
 create mode 100644 samples/bpf/tracex4_user.c

-- 
1.7.9.5

^ permalink raw reply

* [PATCH tip 1/9] tracing: attach eBPF programs to tracepoints and syscalls
From: Alexei Starovoitov @ 2015-01-16  4:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
	Brendan Gregg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1421381770-4866-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

User interface:
fd = open("/sys/kernel/debug/tracing/__event__/filter")

write(fd, "bpf_123")

where 123 is process local FD associated with eBPF program previously loaded.
__event__ is static tracepoint event or syscall.
(kprobe support is in next patch)
Once program is successfully attached to tracepoint event, the tracepoint
will be auto-enabled

close(fd)
auto-disables tracepoint event and detaches eBPF program from it

eBPF programs can call in-kernel helper functions to:
- lookup/update/delete elements in maps
- memcmp
- dump_stack
- fetch_ptr/u64/u32/u16/u8 values from unsafe address via probe_kernel_read(),
  so that eBPF program can walk any kernel data structures

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 include/linux/ftrace_event.h       |    4 ++
 include/trace/bpf_trace.h          |   25 +++++++
 include/trace/ftrace.h             |   30 ++++++++
 include/uapi/linux/bpf.h           |    8 +++
 kernel/trace/Kconfig               |    1 +
 kernel/trace/Makefile              |    1 +
 kernel/trace/bpf_trace.c           |  140 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h               |    3 +
 kernel/trace/trace_events.c        |   33 ++++++++-
 kernel/trace/trace_events_filter.c |   76 +++++++++++++++++++-
 kernel/trace/trace_syscalls.c      |   31 ++++++++
 11 files changed, 350 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/bpf_trace.h
 create mode 100644 kernel/trace/bpf_trace.c

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index d36f68b08acc..a3897f5e43ca 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -248,6 +248,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED_BIT,
 	TRACE_EVENT_FL_USE_CALL_FILTER_BIT,
 	TRACE_EVENT_FL_TRACEPOINT_BIT,
+	TRACE_EVENT_FL_BPF_BIT,
 };
 
 /*
@@ -270,6 +271,7 @@ enum {
 	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
 	TRACE_EVENT_FL_USE_CALL_FILTER	= (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT),
 	TRACE_EVENT_FL_TRACEPOINT	= (1 << TRACE_EVENT_FL_TRACEPOINT_BIT),
+	TRACE_EVENT_FL_BPF		= (1 << TRACE_EVENT_FL_BPF_BIT),
 };
 
 struct ftrace_event_call {
@@ -544,6 +546,8 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
 		event_triggers_post_call(file, tt);
 }
 
+unsigned int trace_filter_call_bpf(struct event_filter *filter, void *ctx);
+
 enum {
 	FILTER_OTHER = 0,
 	FILTER_STATIC_STRING,
diff --git a/include/trace/bpf_trace.h b/include/trace/bpf_trace.h
new file mode 100644
index 000000000000..4e64f61f484d
--- /dev/null
+++ b/include/trace/bpf_trace.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef _LINUX_KERNEL_BPF_TRACE_H
+#define _LINUX_KERNEL_BPF_TRACE_H
+
+/* For tracepoint filters argN fields match one to one to arguments
+ * passed to tracepoint events
+ *
+ * For syscall entry filters argN fields match syscall arguments
+ * For syscall exit filters arg1 is a return value
+ */
+struct bpf_context {
+	u64 arg1;
+	u64 arg2;
+	u64 arg3;
+	u64 arg4;
+	u64 arg5;
+	u64 arg6;
+};
+
+#endif /* _LINUX_KERNEL_BPF_TRACE_H */
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 27609dfcce25..7b2cf74a9b08 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -17,6 +17,7 @@
  */
 
 #include <linux/ftrace_event.h>
+#include <trace/bpf_trace.h>
 
 /*
  * DECLARE_EVENT_CLASS can be used to add a generic function
@@ -617,6 +618,24 @@ static inline notrace int ftrace_get_offsets_##call(			\
 #undef __perf_task
 #define __perf_task(t)	(t)
 
+/* zero extend integer, pointer or aggregate type to u64 without warnings */
+#define __CAST_TO_U64(expr) ({ \
+	u64 ret = 0; \
+	switch (sizeof(expr)) { \
+	case 8: ret = *(u64 *) &expr; break; \
+	case 4: ret = *(u32 *) &expr; break; \
+	case 2: ret = *(u16 *) &expr; break; \
+	case 1: ret = *(u8 *) &expr; break; \
+	} \
+	ret; })
+
+#define __BPF_CAST1(a,...) __CAST_TO_U64(a)
+#define __BPF_CAST2(a,...) __CAST_TO_U64(a), __BPF_CAST1(__VA_ARGS__)
+#define __BPF_CAST3(a,...) __CAST_TO_U64(a), __BPF_CAST2(__VA_ARGS__)
+#define __BPF_CAST4(a,...) __CAST_TO_U64(a), __BPF_CAST3(__VA_ARGS__)
+#define __BPF_CAST5(a,...) __CAST_TO_U64(a), __BPF_CAST4(__VA_ARGS__)
+#define __BPF_CAST6(a,...) __CAST_TO_U64(a), __BPF_CAST5(__VA_ARGS__)
+
 #undef DECLARE_EVENT_CLASS
 #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print)	\
 									\
@@ -632,6 +651,17 @@ ftrace_raw_event_##call(void *__data, proto)				\
 	if (ftrace_trigger_soft_disabled(ftrace_file))			\
 		return;							\
 									\
+	if (unlikely(ftrace_file->flags & FTRACE_EVENT_FL_FILTERED) &&	\
+	    unlikely(ftrace_file->flags & TRACE_EVENT_FL_BPF)) {	\
+		__maybe_unused const u64 z = 0;				\
+		struct bpf_context __ctx = ((struct bpf_context) {	\
+				__BPF_CAST6(args, z, z, z, z, z)	\
+			});						\
+									\
+		if (trace_filter_call_bpf(ftrace_file->filter, &__ctx) == 0) \
+			return;						\
+	}								\
+									\
 	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 									\
 	entry = ftrace_event_buffer_reserve(&fbuffer, ftrace_file,	\
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 45da7ec7d274..959538c50117 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -118,6 +118,7 @@ enum bpf_map_type {
 enum bpf_prog_type {
 	BPF_PROG_TYPE_UNSPEC,
 	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_TRACING_FILTER,
 };
 
 /* flags for BPF_MAP_UPDATE_ELEM command */
@@ -162,6 +163,13 @@ enum bpf_func_id {
 	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
 	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
 	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+	BPF_FUNC_fetch_ptr,       /* void *bpf_fetch_ptr(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u64,       /* u64 bpf_fetch_u64(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u32,       /* u32 bpf_fetch_u32(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u16,       /* u16 bpf_fetch_u16(void *unsafe_ptr) */
+	BPF_FUNC_fetch_u8,        /* u8 bpf_fetch_u8(void *unsafe_ptr) */
+	BPF_FUNC_memcmp,          /* int bpf_memcmp(void *unsafe_ptr, void *safe_ptr, int size) */
+	BPF_FUNC_dump_stack,      /* void bpf_dump_stack(void) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5da09c899dd..eb60b234b824 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -75,6 +75,7 @@ config FTRACE_NMI_ENTER
 
 config EVENT_TRACING
 	select CONTEXT_SWITCH_TRACER
+	select BPF_SYSCALL
 	bool
 
 config CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 979ccde26720..ef821d90f3f5 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
+obj-$(CONFIG_EVENT_TRACING) += bpf_trace.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_PM),y)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
new file mode 100644
index 000000000000..639d3c25dead
--- /dev/null
+++ b/kernel/trace/bpf_trace.c
@@ -0,0 +1,140 @@
+/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/uaccess.h>
+#include <trace/bpf_trace.h>
+#include "trace.h"
+
+static u64 bpf_fetch_ptr(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *unsafe_ptr = (void *) (long) r1;
+	void *ptr = NULL;
+
+	probe_kernel_read(&ptr, unsafe_ptr, sizeof(ptr));
+	return (u64) (unsigned long) ptr;
+}
+
+#define FETCH(SIZE) \
+static u64 bpf_fetch_##SIZE(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)	\
+{									\
+	void *unsafe_ptr = (void *) (long) r1;				\
+	SIZE val = 0;							\
+									\
+	probe_kernel_read(&val, unsafe_ptr, sizeof(val));		\
+	return (u64) (SIZE) val;					\
+}
+FETCH(u64)
+FETCH(u32)
+FETCH(u16)
+FETCH(u8)
+#undef FETCH
+
+static u64 bpf_memcmp(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	void *unsafe_ptr = (void *) (long) r1;
+	void *safe_ptr = (void *) (long) r2;
+	u32 size = (u32) r3;
+	char buf[64];
+	int err;
+
+	if (size < 64) {
+		err = probe_kernel_read(buf, unsafe_ptr, size);
+		if (err)
+			return err;
+		return memcmp(buf, safe_ptr, size);
+	}
+	return -1;
+}
+
+static u64 bpf_dump_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	trace_dump_stack(0);
+	return 0;
+}
+
+static struct bpf_func_proto tracing_filter_funcs[] = {
+#define FETCH(SIZE)				\
+	[BPF_FUNC_fetch_##SIZE] = {		\
+		.func = bpf_fetch_##SIZE,	\
+		.gpl_only = true,		\
+		.ret_type = RET_INTEGER,	\
+	},
+	FETCH(ptr)
+	FETCH(u64)
+	FETCH(u32)
+	FETCH(u16)
+	FETCH(u8)
+#undef FETCH
+	[BPF_FUNC_memcmp] = {
+		.func = bpf_memcmp,
+		.gpl_only = false,
+		.ret_type = RET_INTEGER,
+		.arg1_type = ARG_ANYTHING,
+		.arg2_type = ARG_PTR_TO_STACK,
+		.arg3_type = ARG_CONST_STACK_SIZE,
+	},
+	[BPF_FUNC_dump_stack] = {
+		.func = bpf_dump_stack,
+		.gpl_only = false,
+		.ret_type = RET_VOID,
+	},
+};
+
+static const struct bpf_func_proto *tracing_filter_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
+		if (func_id < 0 || func_id >= ARRAY_SIZE(tracing_filter_funcs))
+			return NULL;
+		return &tracing_filter_funcs[func_id];
+	}
+}
+
+/* check access to argN fields of 'struct bpf_context' from program */
+static bool tracing_filter_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+	/* check bounds */
+	if (off < 0 || off >= sizeof(struct bpf_context))
+		return false;
+
+	/* only read is allowed */
+	if (type != BPF_READ)
+		return false;
+
+	/* disallow misaligned access */
+	if (off % size != 0)
+		return false;
+
+	return true;
+}
+
+static struct bpf_verifier_ops tracing_filter_ops = {
+	.get_func_proto = tracing_filter_func_proto,
+	.is_valid_access = tracing_filter_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl = {
+	.ops = &tracing_filter_ops,
+	.type = BPF_PROG_TYPE_TRACING_FILTER,
+};
+
+static int __init register_tracing_filter_ops(void)
+{
+	bpf_register_prog_type(&tl);
+	return 0;
+}
+late_initcall(register_tracing_filter_ops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8de48bac1ce2..d667547c6f0e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -977,12 +977,15 @@ struct ftrace_event_field {
 	int			is_signed;
 };
 
+struct bpf_prog;
+
 struct event_filter {
 	int			n_preds;	/* Number assigned */
 	int			a_preds;	/* allocated */
 	struct filter_pred	*preds;
 	struct filter_pred	*root;
 	char			*filter_string;
+	struct bpf_prog		*prog;
 };
 
 struct event_subsystem {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 366a78a3e61e..189cc4d697b5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1084,6 +1084,26 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 	return r;
 }
 
+static int event_filter_release(struct inode *inode, struct file *filp)
+{
+	struct ftrace_event_file *file;
+	char buf[2] = "0";
+
+	mutex_lock(&event_mutex);
+	file = event_file_data(filp);
+	if (file) {
+		if (file->flags & TRACE_EVENT_FL_BPF) {
+			/* auto-disable the filter */
+			ftrace_event_enable_disable(file, 0);
+
+			/* if BPF filter was used, clear it on fd close */
+			apply_event_filter(file, buf);
+		}
+	}
+	mutex_unlock(&event_mutex);
+	return 0;
+}
+
 static ssize_t
 event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
@@ -1107,8 +1127,18 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	mutex_lock(&event_mutex);
 	file = event_file_data(filp);
-	if (file)
+	if (file) {
+		/*
+		 * note to user space tools:
+		 * write() into debugfs/tracing/events/xxx/filter file
+		 * must be done with the same privilege level as open()
+		 */
 		err = apply_event_filter(file, buf);
+		if (!err && file->flags & TRACE_EVENT_FL_BPF)
+			/* once filter is applied, auto-enable it */
+			ftrace_event_enable_disable(file, 1);
+	}
+
 	mutex_unlock(&event_mutex);
 
 	free_page((unsigned long) buf);
@@ -1363,6 +1393,7 @@ static const struct file_operations ftrace_event_filter_fops = {
 	.open = tracing_open_generic,
 	.read = event_filter_read,
 	.write = event_filter_write,
+	.release = event_filter_release,
 	.llseek = default_llseek,
 };
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index ced69da0ff55..bb0140414238 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -23,6 +23,9 @@
 #include <linux/mutex.h>
 #include <linux/perf_event.h>
 #include <linux/slab.h>
+#include <linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include <linux/filter.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -541,6 +544,18 @@ static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
 	return WALK_PRED_DEFAULT;
 }
 
+unsigned int trace_filter_call_bpf(struct event_filter *filter, void *ctx)
+{
+	unsigned int ret;
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(filter->prog, ctx);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(trace_filter_call_bpf);
+
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct event_filter *filter, void *rec)
 {
@@ -795,6 +810,8 @@ static void __free_filter(struct event_filter *filter)
 	if (!filter)
 		return;
 
+	if (filter->prog)
+		bpf_prog_put(filter->prog);
 	__free_preds(filter);
 	kfree(filter->filter_string);
 	kfree(filter);
@@ -1874,6 +1891,50 @@ static int create_filter_start(char *filter_str, bool set_str,
 	return err;
 }
 
+static int create_filter_bpf(char *filter_str, struct event_filter **filterp)
+{
+	struct event_filter *filter;
+	struct bpf_prog *prog;
+	long ufd;
+	int err = 0;
+
+	*filterp = NULL;
+
+	filter = __alloc_filter();
+	if (!filter)
+		return -ENOMEM;
+
+	err = replace_filter_string(filter, filter_str);
+	if (err)
+		goto free_filter;
+
+	err = kstrtol(filter_str + 4, 0, &ufd);
+	if (err)
+		goto free_filter;
+
+	prog = bpf_prog_get(ufd);
+	if (IS_ERR(prog)) {
+		err = PTR_ERR(prog);
+		goto free_filter;
+	}
+
+	filter->prog = prog;
+
+	if (prog->aux->prog_type != BPF_PROG_TYPE_TRACING_FILTER) {
+		/* valid fd, but invalid bpf program type */
+		err = -EINVAL;
+		goto free_filter;
+	}
+
+	*filterp = filter;
+
+	return 0;
+
+free_filter:
+	__free_filter(filter);
+	return err;
+}
+
 static void create_filter_finish(struct filter_parse_state *ps)
 {
 	if (ps) {
@@ -1971,6 +2032,7 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
 		filter_disable(file);
 		filter = event_filter(file);
 
+		file->flags &= ~TRACE_EVENT_FL_BPF;
 		if (!filter)
 			return 0;
 
@@ -1983,7 +2045,19 @@ int apply_event_filter(struct ftrace_event_file *file, char *filter_string)
 		return 0;
 	}
 
-	err = create_filter(call, filter_string, true, &filter);
+	/*
+	 * 'bpf_123' string is a request to attach eBPF program with id == 123
+	 * also accept 'bpf 123', 'bpf.123', 'bpf-123' variants
+	 */
+	if (memcmp(filter_string, "bpf", 3) == 0 && filter_string[3] != 0 &&
+	    filter_string[4] != 0) {
+		err = create_filter_bpf(filter_string, &filter);
+		if (!err)
+			file->flags |= TRACE_EVENT_FL_BPF;
+	} else {
+		err = create_filter(call, filter_string, true, &filter);
+		file->flags &= ~TRACE_EVENT_FL_BPF;
+	}
 
 	/*
 	 * Always swap the call filter with the new filter
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index f97f6e3a676c..94242d2a9d76 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -7,6 +7,7 @@
 #include <linux/ftrace.h>
 #include <linux/perf_event.h>
 #include <asm/syscall.h>
+#include <trace/bpf_trace.h>
 
 #include "trace_output.h"
 #include "trace.h"
@@ -290,6 +291,20 @@ static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
 	return ret;
 }
 
+static void populate_bpf_ctx(struct bpf_context *ctx, struct pt_regs *regs)
+{
+	struct task_struct *task = current;
+	unsigned long args[6];
+
+	syscall_get_arguments(task, regs, 0, 6, args);
+	ctx->arg1 = args[0];
+	ctx->arg2 = args[1];
+	ctx->arg3 = args[2];
+	ctx->arg4 = args[3];
+	ctx->arg5 = args[4];
+	ctx->arg6 = args[5];
+}
+
 static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 {
 	struct trace_array *tr = data;
@@ -319,6 +334,14 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
 	if (!sys_data)
 		return;
 
+	if (ftrace_file->flags & TRACE_EVENT_FL_BPF) {
+		struct bpf_context ctx;
+
+		populate_bpf_ctx(&ctx, regs);
+		if (trace_filter_call_bpf(ftrace_file->filter, &ctx) == 0)
+			return;
+	}
+
 	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
 
 	local_save_flags(irq_flags);
@@ -366,6 +389,14 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
+	if (ftrace_file->flags & TRACE_EVENT_FL_BPF) {
+		struct bpf_context ctx = {};
+
+		ctx.arg1 = syscall_get_return_value(current, regs);
+		if (trace_filter_call_bpf(ftrace_file->filter, &ctx) == 0)
+			return;
+	}
+
 	local_save_flags(irq_flags);
 	pc = preempt_count();
 
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH tip 2/9] tracing: allow eBPF programs to call bpf_printk()
From: Alexei Starovoitov @ 2015-01-16  4:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
	Brendan Gregg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1421381770-4866-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

Debugging of eBPF programs needs some form of printk from the program,
so let programs call a limited trace_printk() that accepts only the %d %u %x
conversions (optionally with an 'l' or 'll' length modifier) and %p.

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 include/uapi/linux/bpf.h    |    1 +
 kernel/trace/bpf_trace.c    |   61 +++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events.c |    8 ++++++
 3 files changed, 70 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 959538c50117..ef88e3f45b85 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -170,6 +170,7 @@ enum bpf_func_id {
 	BPF_FUNC_fetch_u8,        /* u8 bpf_fetch_u8(void *unsafe_ptr) */
 	BPF_FUNC_memcmp,          /* int bpf_memcmp(void *unsafe_ptr, void *safe_ptr, int size) */
 	BPF_FUNC_dump_stack,      /* void bpf_dump_stack(void) */
+	BPF_FUNC_printk,          /* int bpf_printk(const char *fmt, int fmt_size, ...) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 639d3c25dead..3825d7a3cbd1 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -60,6 +60,60 @@ static u64 bpf_dump_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 	return 0;
 }
 
+/* limited printk()
+ * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed
+ */
+static u64 bpf_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
+{
+	char *fmt = (char *) (long) r1;
+	int fmt_cnt = 0;
+	bool mod_l[3] = {};
+	int i;
+
+	/* bpf_check() guarantees that fmt points to bpf program stack and
+	 * fmt_size bytes of it were initialized by bpf program
+	 */
+	if (fmt[fmt_size - 1] != 0)
+		return -EINVAL;
+
+	/* check format string for allowed specifiers */
+	for (i = 0; i < fmt_size; i++)
+		if (fmt[i] == '%') {
+			if (fmt_cnt >= 3)
+				return -EINVAL;
+			i++;
+			if (i >= fmt_size)
+				return -EINVAL;
+
+			if (fmt[i] == 'l') {
+				mod_l[fmt_cnt] = true;
+				i++;
+				if (i >= fmt_size)
+					return -EINVAL;
+			} else if (fmt[i] == 'p') {
+				mod_l[fmt_cnt] = true;
+				fmt_cnt++;
+				continue;
+			}
+
+			if (fmt[i] == 'l') {
+				mod_l[fmt_cnt] = true;
+				i++;
+				if (i >= fmt_size)
+					return -EINVAL;
+			}
+
+			if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+				return -EINVAL;
+			fmt_cnt++;
+		}
+
+	return __trace_printk((unsigned long) __builtin_return_address(3), fmt,
+			      mod_l[0] ? r3 : (u32) r3,
+			      mod_l[1] ? r4 : (u32) r4,
+			      mod_l[2] ? r5 : (u32) r5);
+}
+
 static struct bpf_func_proto tracing_filter_funcs[] = {
 #define FETCH(SIZE)				\
 	[BPF_FUNC_fetch_##SIZE] = {		\
@@ -86,6 +140,13 @@ static struct bpf_func_proto tracing_filter_funcs[] = {
 		.gpl_only = false,
 		.ret_type = RET_VOID,
 	},
+	[BPF_FUNC_printk] = {
+		.func = bpf_printk,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+		.arg1_type = ARG_PTR_TO_STACK,
+		.arg2_type = ARG_CONST_STACK_SIZE,
+	},
 };
 
 static const struct bpf_func_proto *tracing_filter_func_proto(enum bpf_func_id func_id)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 189cc4d697b5..282ea5822480 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1141,6 +1141,14 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	mutex_unlock(&event_mutex);
 
+	if (file && file->flags & TRACE_EVENT_FL_BPF) {
+		/*
+		 * allocate per-cpu printk buffers, since programs
+		 * might be calling bpf_printk
+		 */
+		trace_printk_init_buffers();
+	}
+
 	free_page((unsigned long) buf);
 	if (err < 0)
 		return err;
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH tip 3/9] tracing: allow eBPF programs to call ktime_get_ns()
From: Alexei Starovoitov @ 2015-01-16  4:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
	Brendan Gregg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1421381770-4866-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

bpf_ktime_get_ns() is used by programs to compute the time delta between events
or as a timestamp.

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 include/uapi/linux/bpf.h |    1 +
 kernel/trace/bpf_trace.c |   10 ++++++++++
 2 files changed, 11 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index ef88e3f45b85..6075c4f4b67e 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -171,6 +171,7 @@ enum bpf_func_id {
 	BPF_FUNC_memcmp,          /* int bpf_memcmp(void *unsafe_ptr, void *safe_ptr, int size) */
 	BPF_FUNC_dump_stack,      /* void bpf_dump_stack(void) */
 	BPF_FUNC_printk,          /* int bpf_printk(const char *fmt, int fmt_size, ...) */
+	BPF_FUNC_ktime_get_ns,    /* u64 bpf_ktime_get_ns(void) */
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3825d7a3cbd1..14cfbbcec32e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -114,6 +114,11 @@ static u64 bpf_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
 			      mod_l[2] ? r5 : (u32) r5);
 }
 
+static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	return ktime_get_ns();
+}
+
 static struct bpf_func_proto tracing_filter_funcs[] = {
 #define FETCH(SIZE)				\
 	[BPF_FUNC_fetch_##SIZE] = {		\
@@ -147,6 +152,11 @@ static struct bpf_func_proto tracing_filter_funcs[] = {
 		.arg1_type = ARG_PTR_TO_STACK,
 		.arg2_type = ARG_CONST_STACK_SIZE,
 	},
+	[BPF_FUNC_ktime_get_ns] = {
+		.func = bpf_ktime_get_ns,
+		.gpl_only = true,
+		.ret_type = RET_INTEGER,
+	},
 };
 
 static const struct bpf_func_proto *tracing_filter_func_proto(enum bpf_func_id func_id)
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH tip 4/9] samples: bpf: simple tracing example in eBPF assembler
From: Alexei Starovoitov @ 2015-01-16  4:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
	Brendan Gregg, linux-api-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1421381770-4866-1-git-send-email-ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>

simple packet drop monitor:
- in-kernel eBPF program attaches to kfree_skb() event and records number
  of packet drops at given location
- userspace iterates over the map every second and prints stats

Usage:
$ sudo dropmon
location 0xffffffff81695995 count 1
location 0xffffffff816d0da9 count 2

location 0xffffffff81695995 count 2
location 0xffffffff816d0da9 count 2

location 0xffffffff81695995 count 3
location 0xffffffff816d0da9 count 2

$ addr2line -ape ./bld_x64/vmlinux 0xffffffff81695995 0xffffffff816d0da9
0xffffffff81695995: ./bld_x64/../net/ipv4/icmp.c:1038
0xffffffff816d0da9: ./bld_x64/../net/unix/af_unix.c:1231

Signed-off-by: Alexei Starovoitov <ast-uqk4Ao+rVK5Wk0Htik3J/w@public.gmane.org>
---
 samples/bpf/Makefile  |    2 +
 samples/bpf/dropmon.c |  129 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 131 insertions(+)
 create mode 100644 samples/bpf/dropmon.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index b5b3600dcdf5..789691374562 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -6,7 +6,9 @@ hostprogs-y := test_verifier test_maps
 hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
+hostprogs-y += dropmon
 
+dropmon-objs := dropmon.o libbpf.o
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
diff --git a/samples/bpf/dropmon.c b/samples/bpf/dropmon.c
new file mode 100644
index 000000000000..9a2cd3344d69
--- /dev/null
+++ b/samples/bpf/dropmon.c
@@ -0,0 +1,129 @@
+/* simple packet drop monitor:
+ * - in-kernel eBPF program attaches to kfree_skb() event and records number
+ *   of packet drops at given location
+ * - userspace iterates over the map every second and prints stats
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <linux/unistd.h>
+#include <string.h>
+#include <linux/filter.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include "libbpf.h"
+
+#define TRACEPOINT "/sys/kernel/debug/tracing/events/skb/kfree_skb/"
+
+static int write_to_file(const char *file, const char *str, bool keep_open)
+{
+	int fd, err;
+
+	fd = open(file, O_WRONLY);
+	err = write(fd, str, strlen(str));
+	(void) err;
+
+	if (keep_open) {
+		return fd;
+	} else {
+		close(fd);
+		return -1;
+	}
+}
+
+static int dropmon(void)
+{
+	long long key, next_key, value = 0;
+	int prog_fd, map_fd, i;
+	char fmt[32];
+
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 1024);
+	if (map_fd < 0) {
+		printf("failed to create map '%s'\n", strerror(errno));
+		goto cleanup;
+	}
+
+	/* the following eBPF program is equivalent to C:
+	 * int filter(struct bpf_context *ctx)
+	 * {
+	 *   long loc = ctx->arg2;
+	 *   long init_val = 1;
+	 *   long *value;
+	 *
+	 *   value = bpf_map_lookup_elem(MAP_ID, &loc);
+	 *   if (value) {
+	 *      __sync_fetch_and_add(value, 1);
+	 *   } else {
+	 *      bpf_map_update_elem(MAP_ID, &loc, &init_val, BPF_ANY);
+	 *   }
+	 *   return 0;
+	 * }
+	 */
+	struct bpf_insn prog[] = {
+		BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_1, 8), /* r2 = *(u64 *)(r1 + 8) */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_2, -8), /* *(u64 *)(fp - 8) = r2 */
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 4),
+		BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
+		BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
+		BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+		BPF_EXIT_INSN(),
+		BPF_ST_MEM(BPF_DW, BPF_REG_10, -16, 1), /* *(u64 *)(fp - 16) = 1 */
+		BPF_MOV64_IMM(BPF_REG_4, BPF_ANY),
+		BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -16), /* r3 = fp - 16 */
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), /* r2 = fp - 8 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_update_elem),
+		BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */
+		BPF_EXIT_INSN(),
+	};
+
+	prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACING_FILTER, prog,
+				sizeof(prog), "GPL");
+	if (prog_fd < 0) {
+		printf("failed to load prog '%s'\n%s", strerror(errno), bpf_log_buf);
+		return -1;
+	}
+
+	sprintf(fmt, "bpf_%d", prog_fd);
+
+	write_to_file(TRACEPOINT "filter", fmt, true);
+
+	for (i = 0; i < 10; i++) {
+		key = 0;
+		while (bpf_get_next_key(map_fd, &key, &next_key) == 0) {
+			bpf_lookup_elem(map_fd, &next_key, &value);
+			printf("location 0x%llx count %lld\n", next_key, value);
+			key = next_key;
+		}
+		if (key)
+			printf("\n");
+		sleep(1);
+	}
+
+cleanup:
+	/* maps, programs, tracepoint filters will auto cleanup on process exit */
+
+	return 0;
+}
+
+int main(void)
+{
+	FILE *f;
+
+	/* start ping in the background to get some kfree_skb events */
+	f = popen("ping -c5 localhost", "r");
+	(void) f;
+
+	dropmon();
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH tip 5/9] samples: bpf: simple tracing example in C
From: Alexei Starovoitov @ 2015-01-16  4:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
	Brendan Gregg, linux-api, netdev, linux-kernel
In-Reply-To: <1421381770-4866-1-git-send-email-ast@plumgrid.com>

tracex1_kern.c - C program which will be compiled into eBPF
to filter netif_receive_skb events on skb->dev->name == "lo"
The program returns 1 to continue storing an event into the trace buffer
and returns 0 to discard an event.

tracex1_user.c - corresponding user space component that
forever reads /sys/.../trace_pipe

Usage:
$ sudo tracex1

should see:
writing bpf-4 -> /sys/kernel/debug/tracing/events/net/netif_receive_skb/filter
  ping-364   [000] ..s2     8.089771: netif_receive_skb: dev=lo skbaddr=ffff88000dfcc100 len=84
  ping-364   [000] ..s2     8.089889: netif_receive_skb: dev=lo skbaddr=ffff88000dfcc900 len=84

Ctrl-C at any time, kernel will auto cleanup

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 +++
 samples/bpf/bpf_helpers.h  |   18 ++++++++++++++
 samples/bpf/bpf_load.c     |   59 +++++++++++++++++++++++++++++++++++++++-----
 samples/bpf/bpf_load.h     |    3 +++
 samples/bpf/tracex1_kern.c |   28 +++++++++++++++++++++
 samples/bpf/tracex1_user.c |   24 ++++++++++++++++++
 6 files changed, 130 insertions(+), 6 deletions(-)
 create mode 100644 samples/bpf/tracex1_kern.c
 create mode 100644 samples/bpf/tracex1_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 789691374562..da28e1b6d3a6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -7,6 +7,7 @@ hostprogs-y += sock_example
 hostprogs-y += sockex1
 hostprogs-y += sockex2
 hostprogs-y += dropmon
+hostprogs-y += tracex1
 
 dropmon-objs := dropmon.o libbpf.o
 test_verifier-objs := test_verifier.o libbpf.o
@@ -14,17 +15,20 @@ test_maps-objs := test_maps.o libbpf.o
 sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
+tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
+always += tracex1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
 HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
+HOSTLOADLIBES_tracex1 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h
index ca0333146006..81388e821eb3 100644
--- a/samples/bpf/bpf_helpers.h
+++ b/samples/bpf/bpf_helpers.h
@@ -15,6 +15,24 @@ static int (*bpf_map_update_elem)(void *map, void *key, void *value,
 	(void *) BPF_FUNC_map_update_elem;
 static int (*bpf_map_delete_elem)(void *map, void *key) =
 	(void *) BPF_FUNC_map_delete_elem;
+static void *(*bpf_fetch_ptr)(void *unsafe_ptr) =
+	(void *) BPF_FUNC_fetch_ptr;
+static unsigned long long (*bpf_fetch_u64)(void *unsafe_ptr) =
+	(void *) BPF_FUNC_fetch_u64;
+static unsigned int (*bpf_fetch_u32)(void *unsafe_ptr) =
+	(void *) BPF_FUNC_fetch_u32;
+static unsigned short (*bpf_fetch_u16)(void *unsafe_ptr) =
+	(void *) BPF_FUNC_fetch_u16;
+static unsigned char (*bpf_fetch_u8)(void *unsafe_ptr) =
+	(void *) BPF_FUNC_fetch_u8;
+static int (*bpf_printk)(const char *fmt, int fmt_size, ...) =
+	(void *) BPF_FUNC_printk;
+static int (*bpf_memcmp)(void *unsafe_ptr, void *safe_ptr, int size) =
+	(void *) BPF_FUNC_memcmp;
+static void (*bpf_dump_stack)(void) =
+	(void *) BPF_FUNC_dump_stack;
+static unsigned long long (*bpf_ktime_get_ns)(void) =
+	(void *) BPF_FUNC_ktime_get_ns;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 1831d236382b..788ac51c1024 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -14,6 +14,8 @@
 #include "bpf_helpers.h"
 #include "bpf_load.h"
 
+#define DEBUGFS "/sys/kernel/debug/tracing/"
+
 static char license[128];
 static bool processed_sec[128];
 int map_fd[MAX_MAPS];
@@ -22,15 +24,18 @@ int prog_cnt;
 
 static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 {
-	int fd;
 	bool is_socket = strncmp(event, "socket", 6) == 0;
+	enum bpf_prog_type prog_type;
+	char path[256] = DEBUGFS;
+	char fmt[32];
+	int fd, event_fd, err;
 
-	if (!is_socket)
-		/* tracing events tbd */
-		return -1;
+	if (is_socket)
+		prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
+	else
+		prog_type = BPF_PROG_TYPE_TRACING_FILTER;
 
-	fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER,
-			   prog, size, license);
+	fd = bpf_prog_load(prog_type, prog, size, license);
 
 	if (fd < 0) {
 		printf("bpf_prog_load() err=%d\n%s", errno, bpf_log_buf);
@@ -39,6 +44,28 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	prog_fd[prog_cnt++] = fd;
 
+	if (is_socket)
+		return 0;
+
+	snprintf(fmt, sizeof(fmt), "bpf-%d", fd);
+
+	strcat(path, event);
+	strcat(path, "/filter");
+
+	printf("writing %s -> %s\n", fmt, path);
+
+	event_fd = open(path, O_WRONLY, 0);
+	if (event_fd < 0) {
+		printf("failed to open event %s\n", event);
+		return -1;
+	}
+
+	err = write(event_fd, fmt, strlen(fmt));
+	if (err < 0) {
+		printf("write to '%s' failed '%s'\n", event, strerror(errno));
+		return -1;
+	}
+
 	return 0;
 }
 
@@ -201,3 +228,23 @@ int load_bpf_file(char *path)
 	close(fd);
 	return 0;
 }
+
+void read_trace_pipe(void)
+{
+	int trace_fd;
+
+	trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0);
+	if (trace_fd < 0)
+		return;
+
+	while (1) {
+		static char buf[4096];
+		ssize_t sz;
+
+		sz = read(trace_fd, buf, sizeof(buf));
+		if (sz) {
+			buf[sz] = 0;
+			puts(buf);
+		}
+	}
+}
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 27789a34f5e6..d154fc2b0535 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -21,4 +21,7 @@ extern int prog_fd[MAX_PROGS];
  */
 int load_bpf_file(char *path);
 
+/* forever reads /sys/.../trace_pipe */
+void read_trace_pipe(void);
+
 #endif
diff --git a/samples/bpf/tracex1_kern.c b/samples/bpf/tracex1_kern.c
new file mode 100644
index 000000000000..7849ceb4bce6
--- /dev/null
+++ b/samples/bpf/tracex1_kern.c
@@ -0,0 +1,28 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+SEC("events/net/netif_receive_skb")
+int bpf_prog1(struct bpf_context *ctx)
+{
+	/*
+	 * attaches to /sys/kernel/debug/tracing/events/net/netif_receive_skb
+	 * prints events for loobpack device only
+	 */
+	char devname[] = "lo";
+	struct net_device *dev;
+	struct sk_buff *skb = 0;
+
+	skb = (struct sk_buff *) ctx->arg1;
+	dev = bpf_fetch_ptr(&skb->dev);
+	if (bpf_memcmp(dev->name, devname, 2) == 0)
+		/* print event using default tracepoint format */
+		return 1;
+
+	/* drop event */
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex1_user.c b/samples/bpf/tracex1_user.c
new file mode 100644
index 000000000000..e85c1b483f57
--- /dev/null
+++ b/samples/bpf/tracex1_user.c
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+int main(int ac, char **argv)
+{
+	FILE *f;
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	f = popen("ping -c5 localhost", "r");
+	(void) f;
+
+	read_trace_pipe();
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH tip 6/9] samples: bpf: counting example for kfree_skb tracepoint and write syscall
From: Alexei Starovoitov @ 2015-01-16  4:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
	Brendan Gregg, linux-api, netdev, linux-kernel
In-Reply-To: <1421381770-4866-1-git-send-email-ast@plumgrid.com>

this example has two probes in one C file that attach to different tracepoints
and use two different maps.

1st probe is similar to dropmon.c. It attaches to the kfree_skb tracepoint and
counts the number of packet drops at different locations

2nd probe attaches to syscalls/sys_enter_write and computes a histogram of different
write sizes

Usage:
$ sudo tracex2
writing bpf-8 -> /sys/kernel/debug/tracing/events/skb/kfree_skb/filter
writing bpf-10 -> /sys/kernel/debug/tracing/events/syscalls/sys_enter_write/filter
location 0xffffffff816959a5 count 1

location 0xffffffff816959a5 count 2

557145+0 records in
557145+0 records out
285258240 bytes (285 MB) copied, 1.02379 s, 279 MB/s
           syscall write() stats
     byte_size       : count     distribution
       1 -> 1        : 3        |                                      |
       2 -> 3        : 0        |                                      |
       4 -> 7        : 0        |                                      |
       8 -> 15       : 0        |                                      |
      16 -> 31       : 2        |                                      |
      32 -> 63       : 3        |                                      |
      64 -> 127      : 1        |                                      |
     128 -> 255      : 1        |                                      |
     256 -> 511      : 0        |                                      |
     512 -> 1023     : 1118968  |************************************* |

Ctrl-C at any time. Kernel will auto cleanup maps and programs

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 ++
 samples/bpf/tracex2_kern.c |   71 +++++++++++++++++++++++++++++++++
 samples/bpf/tracex2_user.c |   95 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 170 insertions(+)
 create mode 100644 samples/bpf/tracex2_kern.c
 create mode 100644 samples/bpf/tracex2_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index da28e1b6d3a6..416af24b01fd 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -8,6 +8,7 @@ hostprogs-y += sockex1
 hostprogs-y += sockex2
 hostprogs-y += dropmon
 hostprogs-y += tracex1
+hostprogs-y += tracex2
 
 dropmon-objs := dropmon.o libbpf.o
 test_verifier-objs := test_verifier.o libbpf.o
@@ -16,12 +17,14 @@ sock_example-objs := sock_example.o libbpf.o
 sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
+tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
 always += sockex1_kern.o
 always += sockex2_kern.o
 always += tracex1_kern.o
+always += tracex2_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -29,6 +32,7 @@ HOSTCFLAGS_bpf_load.o += -I$(objtree)/usr/include -Wno-unused-variable
 HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
+HOSTLOADLIBES_tracex2 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex2_kern.c b/samples/bpf/tracex2_kern.c
new file mode 100644
index 000000000000..a789c456c1b4
--- /dev/null
+++ b/samples/bpf/tracex2_kern.c
@@ -0,0 +1,71 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),
+	.value_size = sizeof(long),
+	.max_entries = 1024,
+};
+
+SEC("events/skb/kfree_skb")
+int bpf_prog2(struct bpf_context *ctx)
+{
+	long loc = ctx->arg2;
+	long init_val = 1;
+	long *value;
+
+	value = bpf_map_lookup_elem(&my_map, &loc);
+	if (value)
+		*value += 1;
+	else
+		bpf_map_update_elem(&my_map, &loc, &init_val, BPF_ANY);
+	return 0;
+}
+
+static unsigned int log2(unsigned int v)
+{
+	unsigned int r;
+	unsigned int shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+
+static unsigned int log2l(unsigned long v)
+{
+	unsigned int hi = v >> 32;
+	if (hi)
+		return log2(hi) + 32;
+	else
+		return log2(v);
+}
+
+struct bpf_map_def SEC("maps") my_hist_map = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = 64,
+};
+
+SEC("events/syscalls/sys_enter_write")
+int bpf_prog3(struct bpf_context *ctx)
+{
+	long write_size = ctx->arg3;
+	long init_val = 1;
+	long *value;
+	u32 index = log2l(write_size);
+
+	value = bpf_map_lookup_elem(&my_hist_map, &index);
+	if (value)
+		__sync_fetch_and_add(value, 1);
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
new file mode 100644
index 000000000000..016a76e97cd7
--- /dev/null
+++ b/samples/bpf/tracex2_user.c
@@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define MAX_INDEX	64
+#define MAX_STARS	38
+
+static void stars(char *str, long val, long max, int width)
+{
+	int i;
+
+	for (i = 0; i < (width * val / max) - 1 && i < width - 1; i++)
+		str[i] = '*';
+	if (val > max)
+		str[i - 1] = '+';
+	str[i] = '\0';
+}
+
+static void print_hist(int fd)
+{
+	int key;
+	long value;
+	long data[MAX_INDEX] = {};
+	char starstr[MAX_STARS];
+	int i;
+	int max_ind = -1;
+	long max_value = 0;
+
+	for (key = 0; key < MAX_INDEX; key++) {
+		bpf_lookup_elem(fd, &key, &value);
+		data[key] = value;
+		if (value && key > max_ind)
+			max_ind = key;
+		if (value > max_value)
+			max_value = value;
+	}
+
+	printf("           syscall write() stats\n");
+	printf("     byte_size       : count     distribution\n");
+	for (i = 1; i <= max_ind + 1; i++) {
+		stars(starstr, data[i - 1], max_value, MAX_STARS);
+		printf("%8ld -> %-8ld : %-8ld |%-*s|\n",
+		       (1l << i) >> 1, (1l << i) - 1, data[i - 1],
+		       MAX_STARS, starstr);
+	}
+}
+static void int_exit(int sig)
+{
+	print_hist(map_fd[1]);
+	exit(0);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	long key, next_key, value;
+	FILE *f;
+	int i;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	signal(SIGINT, int_exit);
+
+	/* start 'ping' in the background to have some kfree_skb events */
+	f = popen("ping -c5 localhost", "r");
+	(void) f;
+
+	/* start 'dd' in the background to have plenty of 'write' syscalls */
+	f = popen("dd if=/dev/zero of=/dev/null", "r");
+	(void) f;
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	for (i = 0; i < 5; i++) {
+		key = 0;
+		while (bpf_get_next_key(map_fd[0], &key, &next_key) == 0) {
+			bpf_lookup_elem(map_fd[0], &next_key, &value);
+			printf("location 0x%lx count %ld\n", next_key, value);
+			key = next_key;
+		}
+		if (key)
+			printf("\n");
+		sleep(1);
+	}
+	print_hist(map_fd[1]);
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH tip 7/9] samples: bpf: IO latency analysis (iosnoop/heatmap)
From: Alexei Starovoitov @ 2015-01-16  4:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, Namhyung Kim, Arnaldo Carvalho de Melo, Jiri Olsa,
	David S. Miller, Daniel Borkmann, Hannes Frederic Sowa,
	Brendan Gregg, linux-api, netdev, linux-kernel
In-Reply-To: <1421381770-4866-1-git-send-email-ast@plumgrid.com>

eBPF C program attaches to block_rq_issue/block_rq_complete events to calculate
IO latency. Then it waits for the first 100 events to compute average latency
and uses the range [0 .. lat_ave * 2] to record a histogram of events in this latency
range.
User space reads this histogram map every 2 seconds and prints it as a 'heatmap'
using gray shades of text terminal. Black spaces have many events and white
spaces have very few events. Left most space is the smallest latency, right most
space is the largest latency in the range.
If kernel sees too many events that fall out of histogram range, user space
adjusts the range up, so heatmap for next 2 seconds will be more accurate.

Usage:
$ sudo ./tracex3
and do 'sudo dd if=/dev/sda of=/dev/null' in other terminal.
Observe IO latencies and how different activity (like 'make kernel') affects it.

Similar experiments can be done for network transmit latencies, syscalls, etc

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
---
 samples/bpf/Makefile       |    4 ++
 samples/bpf/tracex3_kern.c |   96 +++++++++++++++++++++++++++++
 samples/bpf/tracex3_user.c |  146 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 246 insertions(+)
 create mode 100644 samples/bpf/tracex3_kern.c
 create mode 100644 samples/bpf/tracex3_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 416af24b01fd..da0efd8032ab 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -9,6 +9,7 @@ hostprogs-y += sockex2
 hostprogs-y += dropmon
 hostprogs-y += tracex1
 hostprogs-y += tracex2
+hostprogs-y += tracex3
 
 dropmon-objs := dropmon.o libbpf.o
 test_verifier-objs := test_verifier.o libbpf.o
@@ -18,6 +19,7 @@ sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
 sockex2-objs := bpf_load.o libbpf.o sockex2_user.o
 tracex1-objs := bpf_load.o libbpf.o tracex1_user.o
 tracex2-objs := bpf_load.o libbpf.o tracex2_user.o
+tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -25,6 +27,7 @@ always += sockex1_kern.o
 always += sockex2_kern.o
 always += tracex1_kern.o
 always += tracex2_kern.o
+always += tracex3_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -33,6 +36,7 @@ HOSTLOADLIBES_sockex1 += -lelf
 HOSTLOADLIBES_sockex2 += -lelf
 HOSTLOADLIBES_tracex1 += -lelf
 HOSTLOADLIBES_tracex2 += -lelf
+HOSTLOADLIBES_tracex3 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/tracex3_kern.c b/samples/bpf/tracex3_kern.c
new file mode 100644
index 000000000000..fa04603b80b8
--- /dev/null
+++ b/samples/bpf/tracex3_kern.c
@@ -0,0 +1,96 @@
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <uapi/linux/bpf.h>
+#include <trace/bpf_trace.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = {	/* in-flight requests: rq -> issue time */
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(long),	/* key: ctx->arg2 of the block events */
+	.value_size = sizeof(u64),	/* value: bpf_ktime_get_ns() taken at issue */
+	.max_entries = 4096,
+};
+
+SEC("events/block/block_rq_issue")
+int bpf_prog1(struct bpf_context *ctx)	/* remember when a request was issued */
+{
+	long rq = ctx->arg2;	/* map key; presumably the request pointer - TODO confirm */
+	u64 val = bpf_ktime_get_ns();	/* issue timestamp */
+
+	bpf_map_update_elem(&my_map, &rq, &val, BPF_ANY);
+	return 0;
+}
+
+struct globals {
+	u64 lat_ave;	/* average latency in usec; 0 while still calibrating */
+	u64 lat_sum;	/* sum of latencies collected while calibrating */
+	u64 missed;	/* samples with latency above 2 * lat_ave */
+	u64 max_lat;	/* largest latency among the missed samples */
+	int num_samples;	/* samples counted while lat_ave == 0 */
+};
+
+struct bpf_map_def SEC("maps") global_map = {	/* one struct globals at index 0 */
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(struct globals),
+	.max_entries = 1,
+};
+
+#define MAX_SLOT 32	/* number of latency histogram slots */
+
+struct bpf_map_def SEC("maps") lat_map = {	/* histogram: slot index -> event count */
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(u64),
+	.max_entries = MAX_SLOT,
+};
+
+SEC("events/block/block_rq_complete")
+int bpf_prog2(struct bpf_context *ctx)	/* account the latency of a completed request */
+{
+	long rq = ctx->arg2;	/* same kind of key bpf_prog1 stores at issue time */
+	void *value;
+
+	value = bpf_map_lookup_elem(&my_map, &rq);
+	if (!value)
+		return 0;	/* no issue timestamp stored for this key */
+
+	u64 cur_time = bpf_ktime_get_ns();
+	u64 delta = (cur_time - *(u64 *)value) / 1000;	/* ns -> usec */
+
+	bpf_map_delete_elem(&my_map, &rq);
+
+	int ind = 0;
+	struct globals *g = bpf_map_lookup_elem(&global_map, &ind);
+	if (!g)
+		return 0;
+	if (g->lat_ave == 0) {	/* calibrating: no average latency yet */
+		g->num_samples++;
+		g->lat_sum += delta;
+		if (g->num_samples >= 100) {
+			g->lat_ave = g->lat_sum / g->num_samples;
+			if (0/* debug */) {
+				char fmt[] = "after %d samples average latency %ld usec\n";
+				bpf_printk(fmt, sizeof(fmt), g->num_samples,
+					   g->lat_ave);
+			}
+		}
+	} else {
+		u64 max_lat = g->lat_ave * 2;	/* top of the histogram range */
+		if (delta > max_lat) {	/* out of range: only count it as missed */
+			g->missed++;
+			if (delta > g->max_lat)
+				g->max_lat = delta;
+			return 0;
+		}
+
+		ind = delta * MAX_SLOT / max_lat;	/* equals MAX_SLOT when delta == max_lat */
+		value = bpf_map_lookup_elem(&lat_map, &ind);
+		if (!value)
+			return 0;	/* NOTE(review): sample is dropped, not counted anywhere */
+		(*(u64 *)value) ++;
+	}
+
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c
new file mode 100644
index 000000000000..1945147925b5
--- /dev/null
+++ b/samples/bpf/tracex3_user.c
@@ -0,0 +1,146 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <linux/bpf.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+
+struct globals {	/* same field layout as struct globals in tracex3_kern.c */
+	__u64 lat_ave;	/* half of the displayed latency range, in usec */
+	__u64 lat_sum;
+	__u64 missed;	/* reset to 0 after every printed row */
+	__u64 max_lat;	/* reset to 0 after every printed row */
+	int num_samples;
+};
+
+static void clear_stats(int fd)	/* write 0 to keys 0..31 of the map behind fd */
+{
+	int key;
+	__u64 value = 0;
+	for (key = 0; key < 32; key++)	/* 32 matches MAX_SLOT in tracex3_kern.c */
+		bpf_update_elem(fd, &key, &value, BPF_ANY);
+}
+
+const char *color[] = {	/* background color escapes; [0] = few events, last = many */
+	"\033[48;5;255m",
+	"\033[48;5;252m",
+	"\033[48;5;250m",
+	"\033[48;5;248m",
+	"\033[48;5;246m",
+	"\033[48;5;244m",
+	"\033[48;5;242m",
+	"\033[48;5;240m",
+	"\033[48;5;238m",
+	"\033[48;5;236m",
+	"\033[48;5;234m",
+	"\033[48;5;232m",
+};
+const int num_colors = ARRAY_SIZE(color);	/* number of shades in color[] */
+
+const char nocolor[] = "\033[00m";	/* printed after every colored cell */
+
+static void print_banner(__u64 max_lat)	/* print the latency range of the heatmap */
+{
+	printf("0 usec     ...          %llu usec\n", (unsigned long long)max_lat);
+}
+
+static void print_hist(int fd)	/* print one heatmap row, then retune the range */
+{
+	int key;
+	__u64 value;
+	__u64 cnt[32];
+	__u64 max_cnt = 0;
+	__u64 total_events = 0;
+	int max_bucket = 0;
+
+	for (key = 0; key < 32; key++) {
+		value = 0;	/* stays 0 if the lookup does not fill it */
+		bpf_lookup_elem(fd, &key, &value);
+		if (value > 0)
+			max_bucket = key;	/* highest non-empty slot so far */
+		cnt[key] = value;
+		total_events += value;
+		if (value > max_cnt)
+			max_cnt = value;
+	}
+	clear_stats(fd);
+	for (key = 0; key < 32; key++) {
+		int c = num_colors * cnt[key] / (max_cnt + 1);	/* always < num_colors */
+		printf("%s %s", color[c], nocolor);
+	}
+	printf(" captured=%llu", (unsigned long long)total_events);
+
+	key = 0;
+	struct globals g = {};
+	bpf_lookup_elem(map_fd[1], &key, &g);
+
+	printf(" missed=%llu max_lat=%llu usec\n",
+	       (unsigned long long)g.missed, (unsigned long long)g.max_lat);
+
+	if (g.missed > 10 && g.missed > total_events / 10) {
+		printf("adjusting range UP...\n");
+		g.lat_ave = g.max_lat / 2;	/* new range top == largest missed latency */
+		print_banner(g.lat_ave * 2);
+	} else if (max_bucket < 4 && total_events > 100) {
+		printf("adjusting range DOWN...\n");
+		g.lat_ave = g.lat_ave / 4;	/* only the lowest 4 slots were used */
+		print_banner(g.lat_ave * 2);
+	}
+	/* clear some globals */
+	g.missed = 0;
+	g.max_lat = 0;
+	bpf_update_elem(map_fd[1], &key, &g, BPF_ANY);
+}
+
+static void int_exit(int sig)	/* SIGINT handler: print a final row and exit */
+{
+	print_hist(map_fd[2]);
+	exit(0);
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);	/* <argv[0]>_kern.o */
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	clear_stats(map_fd[2]);
+
+	signal(SIGINT, int_exit);
+
+	if (fork() == 0) {
+		read_trace_pipe();
+	} else {
+		struct globals g = {};	/* zeroed: a failed lookup then reads lat_ave == 0 */
+
+		printf("waiting for events to determine average latency...\n");
+		for (;;) {	/* poll until lat_ave becomes non-zero */
+			int key = 0;
+			bpf_lookup_elem(map_fd[1], &key, &g);
+			if (g.lat_ave)
+				break;
+			sleep(1);
+		}
+
+		printf("  IO latency in usec\n"
+		       "  %s %s - many events with this latency\n"
+		       "  %s %s - few events\n",
+		       color[num_colors - 1], nocolor,
+		       color[0], nocolor);
+		print_banner(g.lat_ave * 2);
+		for (;;) {	/* one heatmap row every 2 seconds */
+			print_hist(map_fd[2]);
+			sleep(2);
+		}
+	}
+
+	return 0;
+}
-- 
1.7.9.5

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox