Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 5/8] net: include hash policy in LAG changeupper info
From: Jakub Kicinski @ 2018-05-24  2:22 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, John Hurley, Jiri Pirko, Jay Vosburgh,
	Veaceslav Falico, Andy Gospodarek
In-Reply-To: <20180524022255.18548-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

LAG upper event notifiers contain the tx type used by the LAG device.
Extend this to also include the hash policy used for tx types that
utilize hashing.

Signed-off-by: John Hurley <john.hurley@netronome.com>
---
CC: Jiri Pirko <jiri@resnulli.us>
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>

 drivers/net/bonding/bond_main.c | 27 ++++++++++++++++++++++++++-
 drivers/net/team/team.c         |  1 +
 include/linux/netdevice.h       | 11 +++++++++++
 3 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index fea17b92b1ae..bd53a71f6b00 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1218,12 +1218,37 @@ static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond)
 	}
 }
 
+static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
+					       enum netdev_lag_tx_type type)
+{
+	if (type != NETDEV_LAG_TX_TYPE_HASH)
+		return NETDEV_LAG_HASH_NONE;
+
+	switch (bond->params.xmit_policy) {
+	case BOND_XMIT_POLICY_LAYER2:
+		return NETDEV_LAG_HASH_L2;
+	case BOND_XMIT_POLICY_LAYER34:
+		return NETDEV_LAG_HASH_L34;
+	case BOND_XMIT_POLICY_LAYER23:
+		return NETDEV_LAG_HASH_L23;
+	case BOND_XMIT_POLICY_ENCAP23:
+		return NETDEV_LAG_HASH_E23;
+	case BOND_XMIT_POLICY_ENCAP34:
+		return NETDEV_LAG_HASH_E34;
+	default:
+		return NETDEV_LAG_HASH_UNKNOWN;
+	}
+}
+
 static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave,
 				      struct netlink_ext_ack *extack)
 {
 	struct netdev_lag_upper_info lag_upper_info;
+	enum netdev_lag_tx_type type;
 
-	lag_upper_info.tx_type = bond_lag_tx_type(bond);
+	type = bond_lag_tx_type(bond);
+	lag_upper_info.tx_type = type;
+	lag_upper_info.hash_type = bond_lag_hash_type(bond, type);
 
 	return netdev_master_upper_dev_link(slave->dev, bond->dev, slave,
 					    &lag_upper_info, extack);
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index d6ff881165d0..e6730a01d130 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1129,6 +1129,7 @@ static int team_upper_dev_link(struct team *team, struct team_port *port,
 	int err;
 
 	lag_upper_info.tx_type = team->mode->lag_tx_type;
+	lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN;
 	err = netdev_master_upper_dev_link(port->dev, team->dev, NULL,
 					   &lag_upper_info, extack);
 	if (err)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 03ed492c4e14..e97ba5e885a0 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2328,8 +2328,19 @@ enum netdev_lag_tx_type {
 	NETDEV_LAG_TX_TYPE_HASH,
 };
 
+enum netdev_lag_hash {
+	NETDEV_LAG_HASH_NONE,
+	NETDEV_LAG_HASH_L2,
+	NETDEV_LAG_HASH_L34,
+	NETDEV_LAG_HASH_L23,
+	NETDEV_LAG_HASH_E23,
+	NETDEV_LAG_HASH_E34,
+	NETDEV_LAG_HASH_UNKNOWN,
+};
+
 struct netdev_lag_upper_info {
 	enum netdev_lag_tx_type tx_type;
+	enum netdev_lag_hash hash_type;
 };
 
 struct netdev_lag_lower_state_info {
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 4/8] nfp: flower: add per repr private data for LAG offload
From: Jakub Kicinski @ 2018-05-24  2:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, John Hurley
In-Reply-To: <20180524022255.18548-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Add a bitmap to each flower repr to track its state if it is enslaved by a
bond. This LAG state may be different to the port state - for example, the
port may be up but LAG state may be down due to the selection in an
active/backup bond.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../net/ethernet/netronome/nfp/flower/main.c  | 26 +++++++++++++++++++
 .../net/ethernet/netronome/nfp/flower/main.h  |  8 ++++++
 2 files changed, 34 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 1910c3e2b3e5..202284b42fd9 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -185,6 +185,10 @@ nfp_flower_repr_netdev_init(struct nfp_app *app, struct net_device *netdev)
 static void
 nfp_flower_repr_netdev_clean(struct nfp_app *app, struct net_device *netdev)
 {
+	struct nfp_repr *repr = netdev_priv(netdev);
+
+	kfree(repr->app_priv);
+
 	tc_setup_cb_egdev_unregister(netdev, nfp_flower_setup_tc_egress_cb,
 				     netdev_priv(netdev));
 }
@@ -225,7 +229,9 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app,
 	u8 nfp_pcie = nfp_cppcore_pcie_unit(app->pf->cpp);
 	struct nfp_flower_priv *priv = app->priv;
 	atomic_t *replies = &priv->reify_replies;
+	struct nfp_flower_repr_priv *repr_priv;
 	enum nfp_port_type port_type;
+	struct nfp_repr *nfp_repr;
 	struct nfp_reprs *reprs;
 	int i, err, reify_cnt;
 	const u8 queue = 0;
@@ -248,6 +254,15 @@ nfp_flower_spawn_vnic_reprs(struct nfp_app *app,
 			goto err_reprs_clean;
 		}
 
+		repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL);
+		if (!repr_priv) {
+			err = -ENOMEM;
+			goto err_reprs_clean;
+		}
+
+		nfp_repr = netdev_priv(repr);
+		nfp_repr->app_priv = repr_priv;
+
 		/* For now we only support 1 PF */
 		WARN_ON(repr_type == NFP_REPR_TYPE_PF && i);
 
@@ -324,6 +339,8 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv)
 {
 	struct nfp_eth_table *eth_tbl = app->pf->eth_tbl;
 	atomic_t *replies = &priv->reify_replies;
+	struct nfp_flower_repr_priv *repr_priv;
+	struct nfp_repr *nfp_repr;
 	struct sk_buff *ctrl_skb;
 	struct nfp_reprs *reprs;
 	int err, reify_cnt;
@@ -351,6 +368,15 @@ nfp_flower_spawn_phy_reprs(struct nfp_app *app, struct nfp_flower_priv *priv)
 			goto err_reprs_clean;
 		}
 
+		repr_priv = kzalloc(sizeof(*repr_priv), GFP_KERNEL);
+		if (!repr_priv) {
+			err = -ENOMEM;
+			goto err_reprs_clean;
+		}
+
+		nfp_repr = netdev_priv(repr);
+		nfp_repr->app_priv = repr_priv;
+
 		port = nfp_port_alloc(app, NFP_PORT_PHYS_PORT, repr);
 		if (IS_ERR(port)) {
 			err = PTR_ERR(port);
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 6e82aa4ed84b..7ce255705446 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -160,6 +160,14 @@ struct nfp_flower_priv {
 	struct nfp_mtu_conf mtu_conf;
 };
 
+/**
+ * struct nfp_flower_repr_priv - Flower APP per-repr priv data
+ * @lag_port_flags:	Extended port flags to record lag state of repr
+ */
+struct nfp_flower_repr_priv {
+	unsigned long lag_port_flags;
+};
+
 struct nfp_fl_key_ls {
 	u32 key_layer_two;
 	u8 key_layer;
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 3/8] nfp: flower: check for/turn on LAG support in firmware
From: Jakub Kicinski @ 2018-05-24  2:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, John Hurley
In-Reply-To: <20180524022255.18548-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Check if the fw contains the _abi_flower_balance_sync_enable symbol. If it
does then write a 1 to this indicating that the driver is willing to
receive NIC to kernel LAG related control messages.

If the write is successful, update the list of extra features supported by
the fw and add a stub to accept LAG cmsgs.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/flower/cmsg.c |  5 +++++
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h |  1 +
 drivers/net/ethernet/netronome/nfp/flower/main.c | 12 ++++++++++++
 drivers/net/ethernet/netronome/nfp/flower/main.h |  1 +
 4 files changed, 19 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
index 577659f332e4..03aae2ed9983 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.c
@@ -239,6 +239,7 @@ nfp_flower_cmsg_portreify_rx(struct nfp_app *app, struct sk_buff *skb)
 static void
 nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 {
+	struct nfp_flower_priv *app_priv = app->priv;
 	struct nfp_flower_cmsg_hdr *cmsg_hdr;
 	enum nfp_flower_cmsg_type_port type;
 
@@ -258,6 +259,10 @@ nfp_flower_cmsg_process_one_rx(struct nfp_app *app, struct sk_buff *skb)
 	case NFP_FLOWER_CMSG_TYPE_ACTIVE_TUNS:
 		nfp_tunnel_keep_alive(app, skb);
 		break;
+	case NFP_FLOWER_CMSG_TYPE_LAG_CONFIG:
+		if (app_priv->flower_ext_feats & NFP_FL_FEATS_LAG)
+			break;
+		/* fall through */
 	default:
 		nfp_flower_cmsg_warn(app, "Cannot handle invalid repr control type %u\n",
 				     type);
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index bee4367a2c38..3a42a1fc55cb 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -366,6 +366,7 @@ struct nfp_flower_cmsg_hdr {
 enum nfp_flower_cmsg_type_port {
 	NFP_FLOWER_CMSG_TYPE_FLOW_ADD =		0,
 	NFP_FLOWER_CMSG_TYPE_FLOW_DEL =		2,
+	NFP_FLOWER_CMSG_TYPE_LAG_CONFIG =	4,
 	NFP_FLOWER_CMSG_TYPE_PORT_REIFY =	6,
 	NFP_FLOWER_CMSG_TYPE_MAC_REPR =		7,
 	NFP_FLOWER_CMSG_TYPE_PORT_MOD =		8,
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.c b/drivers/net/ethernet/netronome/nfp/flower/main.c
index 4e67c0cbf9f0..1910c3e2b3e5 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.c
@@ -546,8 +546,20 @@ static int nfp_flower_init(struct nfp_app *app)
 	else
 		app_priv->flower_ext_feats = features;
 
+	/* Tell the firmware that the driver supports lag. */
+	err = nfp_rtsym_write_le(app->pf->rtbl,
+				 "_abi_flower_balance_sync_enable", 1);
+	if (!err)
+		app_priv->flower_ext_feats |= NFP_FL_FEATS_LAG;
+	else if (err == -ENOENT)
+		nfp_warn(app->cpp, "LAG not supported by FW.\n");
+	else
+		goto err_cleanup_metadata;
+
 	return 0;
 
+err_cleanup_metadata:
+	nfp_flower_metadata_cleanup(app);
 err_free_app_priv:
 	vfree(app->priv);
 	return err;
diff --git a/drivers/net/ethernet/netronome/nfp/flower/main.h b/drivers/net/ethernet/netronome/nfp/flower/main.h
index 733ff53cc601..6e82aa4ed84b 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/main.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/main.h
@@ -67,6 +67,7 @@ struct nfp_app;
 /* Extra features bitmap. */
 #define NFP_FL_FEATS_GENEVE		BIT(0)
 #define NFP_FL_NBI_MTU_SETTING		BIT(1)
+#define NFP_FL_FEATS_LAG		BIT(31)
 
 struct nfp_fl_mask_id {
 	struct circ_buf mask_id_free_list;
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 2/8] nfp: nfpcore: add rtsym writing function
From: Jakub Kicinski @ 2018-05-24  2:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, John Hurley
In-Reply-To: <20180524022255.18548-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Add an rtsym API function that combines the lookup of a symbol and the
writing of a value to it. Values can be written as unsigned 32 or 64 bits.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 .../ethernet/netronome/nfp/nfpcore/nfp_nffw.h |  2 +
 .../netronome/nfp/nfpcore/nfp_rtsym.c         | 43 +++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h
index c9724fb7ea4b..df599d5b6bb3 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_nffw.h
@@ -100,6 +100,8 @@ nfp_rtsym_lookup(struct nfp_rtsym_table *rtbl, const char *name);
 
 u64 nfp_rtsym_read_le(struct nfp_rtsym_table *rtbl, const char *name,
 		      int *error);
+int nfp_rtsym_write_le(struct nfp_rtsym_table *rtbl, const char *name,
+		       u64 value);
 u8 __iomem *
 nfp_rtsym_map(struct nfp_rtsym_table *rtbl, const char *name, const char *id,
 	      unsigned int min_size, struct nfp_cpp_area **area);
diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c
index 46107aefad1c..9e34216578da 100644
--- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c
+++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_rtsym.c
@@ -286,6 +286,49 @@ u64 nfp_rtsym_read_le(struct nfp_rtsym_table *rtbl, const char *name,
 	return val;
 }
 
+/**
+ * nfp_rtsym_write_le() - Write an unsigned scalar value to a symbol
+ * @rtbl:	NFP RTsym table
+ * @name:	Symbol name
+ * @value:	Value to write
+ *
+ * Lookup a symbol and write a value to it. Symbol can be 4 or 8 bytes in size.
+ * If 4 bytes then the lower 32-bits of 'value' are used. Value will be
+ * written as simple little-endian unsigned value.
+ *
+ * Return: 0 on success or error code.
+ */
+int nfp_rtsym_write_le(struct nfp_rtsym_table *rtbl, const char *name,
+		       u64 value)
+{
+	const struct nfp_rtsym *sym;
+	int err;
+	u32 id;
+
+	sym = nfp_rtsym_lookup(rtbl, name);
+	if (!sym)
+		return -ENOENT;
+
+	id = NFP_CPP_ISLAND_ID(sym->target, NFP_CPP_ACTION_RW, 0, sym->domain);
+
+	switch (sym->size) {
+	case 4:
+		err = nfp_cpp_writel(rtbl->cpp, id, sym->addr, value);
+		break;
+	case 8:
+		err = nfp_cpp_writeq(rtbl->cpp, id, sym->addr, value);
+		break;
+	default:
+		nfp_err(rtbl->cpp,
+			"rtsym '%s' unsupported or non-scalar size: %lld\n",
+			name, sym->size);
+		err = -EINVAL;
+		break;
+	}
+
+	return err;
+}
+
 u8 __iomem *
 nfp_rtsym_map(struct nfp_rtsym_table *rtbl, const char *name, const char *id,
 	      unsigned int min_size, struct nfp_cpp_area **area)
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 1/8] nfp: add ndo_set_mac_address for representors
From: Jakub Kicinski @ 2018-05-24  2:22 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, John Hurley
In-Reply-To: <20180524022255.18548-1-jakub.kicinski@netronome.com>

From: John Hurley <john.hurley@netronome.com>

Adding a netdev to a bond requires that its mac address can be modified.
The default eth_mac_addr is sufficient to satisfy this requirement.

Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Pieter Jansen van Vuuren <pieter.jansenvanvuuren@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
index 09e87d5f4f72..117eca6819de 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c
@@ -277,6 +277,7 @@ const struct net_device_ops nfp_repr_netdev_ops = {
 	.ndo_get_vf_config	= nfp_app_get_vf_config,
 	.ndo_set_vf_link_state	= nfp_app_set_vf_link_state,
 	.ndo_set_features	= nfp_port_set_features,
+	.ndo_set_mac_address    = eth_mac_addr,
 };
 
 static void nfp_repr_clean(struct nfp_repr *repr)
-- 
2.17.0

^ permalink raw reply related

* [PATCH net-next 0/8] nfp: offload LAG for tc flower egress
From: Jakub Kicinski @ 2018-05-24  2:22 UTC (permalink / raw)
  To: davem
  Cc: netdev, oss-drivers, Jakub Kicinski, Jiri Pirko, Jay Vosburgh,
	Veaceslav Falico, Andy Gospodarek

Hi!

This series from John adds bond offload to the nfp driver.  Patch 5
exposes the hash type for NETDEV_LAG_TX_TYPE_HASH to make sure nfp
hashing matches that of the software LAG.  This may be unnecessarily
conservative, let's see what LAG maintainers think :)

John says:

This patchset sets up the infrastructure and offloads output actions for
when a TC flower rule attempts to egress a packet to a LAG port.

Firstly it adds some of the infrastructure required to the flower app and
to the nfp core. This includes the ability to change the MAC address of a
repr, a function for combining lookup and write to a FW symbol, and the
addition of private data to a repr on a per app basis.

Patch 6 continues by implementing notifiers that track Linux bonds and
communicates to the FW those which enslave reprs, along with the current
state of reprs within the bond.

Patch 7 ensures bonds are synchronised with FW by receiving and acting
upon cmsgs sent to the kernel. These may request that a bond message is
retransmitted when FW can process it, or may request a full sync of the
bonds defined in the kernel.

Patch 8 offloads a flower action when that action requires egressing to a
pre-defined Linux bond.

John Hurley (8):
  nfp: add ndo_set_mac_address for representors
  nfp: nfpcore: add rtsym writing function
  nfp: flower: check for/turn on LAG support in firmware
  nfp: flower: add per repr private data for LAG offload
  net: include hash policy in LAG changeupper info
  nfp: flower: monitor and offload LAG groups
  nfp: flower: implement host cmsg handler for LAG
  nfp: flower: compute link aggregation action

 drivers/net/bonding/bond_main.c               |  27 +-
 drivers/net/ethernet/netronome/nfp/Makefile   |   1 +
 .../ethernet/netronome/nfp/flower/action.c    | 131 +++-
 .../net/ethernet/netronome/nfp/flower/cmsg.c  |  11 +-
 .../net/ethernet/netronome/nfp/flower/cmsg.h  |  14 +
 .../ethernet/netronome/nfp/flower/lag_conf.c  | 726 ++++++++++++++++++
 .../net/ethernet/netronome/nfp/flower/main.c  |  61 ++
 .../net/ethernet/netronome/nfp/flower/main.h  |  52 +-
 .../ethernet/netronome/nfp/flower/offload.c   |   2 +-
 .../net/ethernet/netronome/nfp/nfp_net_repr.c |   1 +
 .../ethernet/netronome/nfp/nfpcore/nfp_nffw.h |   2 +
 .../netronome/nfp/nfpcore/nfp_rtsym.c         |  43 ++
 drivers/net/team/team.c                       |   1 +
 include/linux/netdevice.h                     |  11 +
 14 files changed, 1053 insertions(+), 30 deletions(-)
 create mode 100644 drivers/net/ethernet/netronome/nfp/flower/lag_conf.c

--- 
CC: Jiri Pirko <jiri@resnulli.us>
CC: Jay Vosburgh <j.vosburgh@gmail.com>
CC: Veaceslav Falico <vfalico@gmail.com>
CC: Andy Gospodarek <andy@greyhouse.net>

^ permalink raw reply

* Re: [PATCH v3 net-next 0/2] bpfilter
From: Alexei Starovoitov @ 2018-05-24  2:09 UTC (permalink / raw)
  To: Jakub Kicinski, Alexei Starovoitov
  Cc: David S . Miller, daniel, torvalds, gregkh, luto, mcgrof,
	keescook, netdev, linux-kernel, kernel-team
In-Reply-To: <20180523185010.0490e7ec@cakuba>

On 5/23/18 6:50 PM, Jakub Kicinski wrote:
> On Wed, 23 May 2018 18:33:52 -0700, Jakub Kicinski wrote:
>> Minor glitch with Ubuntu 18.04:
>>
>> $ gcc --version
>> gcc (Ubuntu 7.3.0-16ubuntu3) 7.3.0
>>
>> In file included from /usr/include/fcntl.h:290:0,
>>                  from ../net/bpfilter/main.c:7:
>> In function ‘open’,
>>     inlined from ‘main’ at ../net/bpfilter/main.c:58:13:
>> /usr/include/x86_64-linux-gnu/bits/fcntl2.h:50:4: error: call to ‘__open_missing_mode’ declared with attribute error: open with O_CREAT or O_TMPFILE in second argument needs 3 arguments
>>     __open_missing_mode ();
>>     ^~~~~~~~~~~~~~~~~~~~~~
>> scripts/Makefile.host:107: recipe for target 'net/bpfilter/main.o' failed
>> make[3]: *** [net/bpfilter/main.o] Error 1
>>
>> I can't repro on Fedora 27 gcc (GCC) 7.3.1 20180303 (Red Hat 7.3.1-5),
>> perhaps the GCC is broken on that Ubuntu 18.04 box of mine.  The warning/
>> /error, however, looks potentially legit?
>
> More?
>
> Kernel: arch/x86/boot/bzImage is ready  (#9)
>   Building modules, stage 2.
>   MODPOST 1620 modules
> ERROR: "bpfilter_process_sockopt" [net/bpfilter/bpfilter.ko] undefined!
> ../scripts/Makefile.modpost:92: recipe for target '__modpost' failed

hmm. how come buildbot didn't yell at me for any of these things.
will take a look soon.

^ permalink raw reply

* Re: [PATCH v3 net-next 0/2] bpfilter
From: Jakub Kicinski @ 2018-05-24  1:50 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David S . Miller, daniel, torvalds, gregkh, luto, mcgrof,
	keescook, netdev, linux-kernel, kernel-team
In-Reply-To: <20180523183352.7ccc3f5d@cakuba>

[-- Attachment #1: Type: text/plain, Size: 1488 bytes --]

On Wed, 23 May 2018 18:33:52 -0700, Jakub Kicinski wrote:
> Minor glitch with Ubuntu 18.04:
> 
> $ gcc --version
> gcc (Ubuntu 7.3.0-16ubuntu3) 7.3.0
> 
> In file included from /usr/include/fcntl.h:290:0,
>                  from ../net/bpfilter/main.c:7:
> In function ‘open’,
>     inlined from ‘main’ at ../net/bpfilter/main.c:58:13:
> /usr/include/x86_64-linux-gnu/bits/fcntl2.h:50:4: error: call to ‘__open_missing_mode’ declared with attribute error: open with O_CREAT or O_TMPFILE in second argument needs 3 arguments
>     __open_missing_mode ();
>     ^~~~~~~~~~~~~~~~~~~~~~
> scripts/Makefile.host:107: recipe for target 'net/bpfilter/main.o' failed
> make[3]: *** [net/bpfilter/main.o] Error 1
> 
> I can't repro on Fedora 27 gcc (GCC) 7.3.1 20180303 (Red Hat 7.3.1-5),
> perhaps the GCC is broken on that Ubuntu 18.04 box of mine.  The warning/
> /error, however, looks potentially legit?

More?

Kernel: arch/x86/boot/bzImage is ready  (#9)
  Building modules, stage 2.
  MODPOST 1620 modules
ERROR: "bpfilter_process_sockopt" [net/bpfilter/bpfilter.ko] undefined!
../scripts/Makefile.modpost:92: recipe for target '__modpost' failed
make[2]: *** [__modpost] Error 1
/home/jkicinski/devel/linux/Makefile:1274: recipe for target 'modules' failed
make[1]: *** [modules] Error 2
make[1]: Leaving directory '/home/jkicinski/devel/linux/build_randconfig'
Makefile:146: recipe for target 'sub-make' failed
make: *** [sub-make] Error 2


[-- Attachment #2: bpfitlter_config.bz2 --]
[-- Type: application/x-bzip, Size: 30187 bytes --]

^ permalink raw reply

* Re: [PATCH v3 net-next 0/2] bpfilter
From: Jakub Kicinski @ 2018-05-24  1:33 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David S . Miller, daniel, torvalds, gregkh, luto, mcgrof,
	keescook, netdev, linux-kernel, kernel-team
In-Reply-To: <20180522022230.2492505-1-ast@kernel.org>

Minor glitch with Ubuntu 18.04:

$ gcc --version
gcc (Ubuntu 7.3.0-16ubuntu3) 7.3.0

In file included from /usr/include/fcntl.h:290:0,
                 from ../net/bpfilter/main.c:7:
In function ‘open’,
    inlined from ‘main’ at ../net/bpfilter/main.c:58:13:
/usr/include/x86_64-linux-gnu/bits/fcntl2.h:50:4: error: call to ‘__open_missing_mode’ declared with attribute error: open with O_CREAT or O_TMPFILE in second argument needs 3 arguments
    __open_missing_mode ();
    ^~~~~~~~~~~~~~~~~~~~~~
scripts/Makefile.host:107: recipe for target 'net/bpfilter/main.o' failed
make[3]: *** [net/bpfilter/main.o] Error 1

I can't repro on Fedora 27 gcc (GCC) 7.3.1 20180303 (Red Hat 7.3.1-5),
perhaps the GCC is broken on that Ubuntu 18.04 box of mine.  The warning/
/error, however, looks potentially legit?

^ permalink raw reply

* [PATCH net v2] enic: set DMA mask to 47 bit
From: Govindarajulu Varadarajan @ 2018-05-23 18:17 UTC (permalink / raw)
  To: davem, netdev; +Cc: benve, Govindarajulu Varadarajan

In commit 624dbf55a359b ("driver/net: enic: Try DMA 64 first, then
failover to DMA") DMA mask was changed from 40 bits to 64 bits.
Hardware actually supports only 47 bits.

Fixes: 624dbf55a359b ("driver/net: enic: Try DMA 64 first, then failover to DMA")
Signed-off-by: Govindarajulu Varadarajan <gvaradar@cisco.com>
---
v2:
* rebase to net
* Fix space and single line for "Fixes:"

 drivers/net/ethernet/cisco/enic/enic_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/cisco/enic/enic_main.c b/drivers/net/ethernet/cisco/enic/enic_main.c
index 81684acf52af..8a8b12b720ef 100644
--- a/drivers/net/ethernet/cisco/enic/enic_main.c
+++ b/drivers/net/ethernet/cisco/enic/enic_main.c
@@ -2747,11 +2747,11 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	pci_set_master(pdev);
 
 	/* Query PCI controller on system for DMA addressing
-	 * limitation for the device.  Try 64-bit first, and
+	 * limitation for the device.  Try 47-bit first, and
 	 * fail to 32-bit.
 	 */
 
-	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+	err = pci_set_dma_mask(pdev, DMA_BIT_MASK(47));
 	if (err) {
 		err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
 		if (err) {
@@ -2765,10 +2765,10 @@ static int enic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 			goto err_out_release_regions;
 		}
 	} else {
-		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+		err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(47));
 		if (err) {
 			dev_err(dev, "Unable to obtain %u-bit DMA "
-				"for consistent allocations, aborting\n", 64);
+				"for consistent allocations, aborting\n", 47);
 			goto err_out_release_regions;
 		}
 		using_dac = 1;
-- 
2.17.0

^ permalink raw reply related

* Re: [PATCH RFC net-next 00/11] udp gso
From: Willem de Bruijn @ 2018-05-24  1:15 UTC (permalink / raw)
  To: Marcelo Ricardo Leitner
  Cc: Paolo Abeni, Network Development, Willem de Bruijn
In-Reply-To: <20180524000230.GP5488@localhost.localdomain>

On Wed, May 23, 2018 at 8:02 PM, Marcelo Ricardo Leitner
<marcelo.leitner@gmail.com> wrote:
> On Wed, Apr 18, 2018 at 09:49:18AM -0400, Willem de Bruijn wrote:
>> I just hacked up a sendmmsg extension to the benchmark to verify.
>> Indeed that does not have nearly the same benefit as GSO:
>>
>> udp tx:    976 MB/s   695394 calls/s  16557 msg/s
>>
>> This matches the numbers seen from TCP without TSO and GSO.
>> That also has few system calls, but observes per MTU stack traversal.
>
> Reviving this old thread because it's the only place I saw sendmmsg
> being mentioned.
>
> sendmmsg shouldn't be considered as an alternative, but rather as a
> complement. Then instead of the application building one large request
> and request the stack to fragment it, it could simply build the
> sendmmsg request and the stack would group the mmsg into a gso skb. It
> seems more natural to the application. But well, both (sendmmsg and
> the option to fragment) are Linux-specific..
>
> For that we need sendmmsg to do something smarter than doing several
> sendmsg calls, yes.

I agree. See also my original point:

"An alternative implementation that would allow non-uniform
  segment length is to use GSO_BY_FRAGS like SCTP. This would
  likely require MSG_MORE to build the list using multiple
  send calls (or one sendmmsg). The two approaches are not
  mutually-exclusive, so that could be a follow-up."

Clear advantages of GSO_BY_FRAGS are that segments do
not have to be of equal length and that converting existing users
of sendmmsg is trivial.

On the other hand, this is less likely to be offloaded to hardware,
as it requires non-constant metadata in the descriptor.

Both cases also potentially apply to the GRO path to allow for
efficient forwarding. And to the udp socket rx layer to allow for
queuing batches of datagrams at a time, then carving off one
gso_size per recvmsg.

^ permalink raw reply

* [PATCH net-next] hv_netvsc: fix bogus ifalias on network device
From: Stephen Hemminger @ 2018-05-24  1:02 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger

If the guest network adapter is not configured with DeviceNaming
enabled on the host, then the query for friendly name will return
success but with a zero length name. Which then leads to a garbage value
(stack contents) for ifalias.

Fix is simple, just don't set name if  host doesn't return it.

Fixes: 0fe554a46a0f ("hv_netvsc: propogate Hyper-V friendly name into interface alias")
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 drivers/net/hyperv/rndis_filter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c
index 7f3dab4b4cbc..5428bb261102 100644
--- a/drivers/net/hyperv/rndis_filter.c
+++ b/drivers/net/hyperv/rndis_filter.c
@@ -1237,7 +1237,10 @@ static void rndis_get_friendly_name(struct net_device *net,
 	if (rndis_filter_query_device(rndis_device, net_device,
 				      RNDIS_OID_GEN_FRIENDLY_NAME,
 				      wname, &size) != 0)
-		return;
+		return;	/* ignore if host does not support */
+
+	if (size == 0)
+		return;	/* name not set */
 
 	/* Convert Windows Unicode string to UTF-8 */
 	len = ucs2_as_utf8(ifalias, wname, sizeof(ifalias));
-- 
2.17.0

^ permalink raw reply related

* [PATCH bpf] bpf: properly enforce index mask to prevent out-of-bounds speculation
From: Daniel Borkmann @ 2018-05-24  0:32 UTC (permalink / raw)
  To: alexei.starovoitov; +Cc: netdev, Daniel Borkmann

While reviewing the verifier code, I recently noticed that the
following two program variants in relation to tail calls can be
loaded.

Variant 1:

  # bpftool p d x i 15
    0: (15) if r1 == 0x0 goto pc+3
    1: (18) r2 = map[id:5]
    3: (05) goto pc+2
    4: (18) r2 = map[id:6]
    6: (b7) r3 = 7
    7: (35) if r3 >= 0xa0 goto pc+2
    8: (54) (u32) r3 &= (u32) 255
    9: (85) call bpf_tail_call#12
   10: (b7) r0 = 1
   11: (95) exit

  # bpftool m s i 5
    5: prog_array  flags 0x0
        key 4B  value 4B  max_entries 4  memlock 4096B
  # bpftool m s i 6
    6: prog_array  flags 0x0
        key 4B  value 4B  max_entries 160  memlock 4096B

Variant 2:

  # bpftool p d x i 20
    0: (15) if r1 == 0x0 goto pc+3
    1: (18) r2 = map[id:8]
    3: (05) goto pc+2
    4: (18) r2 = map[id:7]
    6: (b7) r3 = 7
    7: (35) if r3 >= 0x4 goto pc+2
    8: (54) (u32) r3 &= (u32) 3
    9: (85) call bpf_tail_call#12
   10: (b7) r0 = 1
   11: (95) exit

  # bpftool m s i 8
    8: prog_array  flags 0x0
        key 4B  value 4B  max_entries 160  memlock 4096B
  # bpftool m s i 7
    7: prog_array  flags 0x0
        key 4B  value 4B  max_entries 4  memlock 4096B

In both cases the index masking inserted by the verifier in order
to control out of bounds speculation from a CPU via b2157399cc98
("bpf: prevent out-of-bounds speculation") seems to be incorrect
in what it is enforcing. In the 1st variant, the mask is applied
from the map with the significantly larger number of entries where
we would allow to a certain degree out of bounds speculation for
the smaller map, and in the 2nd variant where the mask is applied
from the map with the smaller number of entries, we get buggy
behavior since we truncate the index of the larger map.

The original intent from commit b2157399cc98 is to reject such
occasions where two or more different tail call maps are used
in the same tail call helper invocation. However, the check on
the BPF_MAP_PTR_POISON is never hit since we never poisoned the
saved pointer in the first place! We do this explicitly for map
lookups but in case of tail calls we basically used the tail
call map in insn_aux_data that was processed in the most recent
path which the verifier walked. Thus any prior path that stored
a pointer in insn_aux_data at the helper location was always
overridden.

Fix it by moving the map pointer poison logic into a small helper
that covers both BPF helpers with the same logic. After that in
fixup_bpf_calls() the poison check is then hit for tail calls
and the program rejected. Latter only happens in unprivileged
case since this is the *only* occasion where a rewrite needs to
happen, and where such rewrite is specific to the map (max_entries,
index_mask). In the privileged case the rewrite is generic for
the insn->imm / insn->code update so multiple maps from different
paths can be handled just fine since all the remaining logic
happens in the instruction processing itself. This is similar
to the case of map lookups: in case there is a collision of
maps in fixup_bpf_calls() we must skip the inlined rewrite since
this will turn the generic instruction sequence into a non-
generic one. Thus the patch_call_imm will simply update the
insn->imm location where the bpf_map_lookup_elem() will later
take care of the dispatch. Given we need this 'poison' state
as a check, the information of whether a map is an unpriv_array
gets lost, so enforcing it prior to that needs an additional
state. In general this check is needed since there are some
complex and tail call intensive BPF programs out there where
LLVM tends to generate such code occasionally. We therefore
convert the map_ptr rather into map_state to store all this
w/o extra memory overhead, and the bit whether one of the maps
involved in the collision was from an unpriv_array thus needs
to be retained as well there.

Fixes: b2157399cc98 ("bpf: prevent out-of-bounds speculation")
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 [ Test cases coming via bpf-next batch of patches to avoid
   merge conflicts in test_verifier with bpf. ]

 include/linux/bpf_verifier.h |  2 +-
 kernel/bpf/verifier.c        | 86 ++++++++++++++++++++++++++++++++------------
 2 files changed, 65 insertions(+), 23 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 7e61c39..52fb077 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -142,7 +142,7 @@ struct bpf_verifier_state_list {
 struct bpf_insn_aux_data {
 	union {
 		enum bpf_reg_type ptr_type;	/* pointer type for load/store insns */
-		struct bpf_map *map_ptr;	/* pointer for call insn into lookup_elem */
+		unsigned long map_state;	/* pointer/poison value for maps */
 		s32 call_imm;			/* saved imm field of call insn */
 	};
 	int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5dd1dcb..dcebf3f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -156,7 +156,29 @@ struct bpf_verifier_stack_elem {
 #define BPF_COMPLEXITY_LIMIT_INSNS	131072
 #define BPF_COMPLEXITY_LIMIT_STACK	1024
 
-#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA)
+#define BPF_MAP_PTR_UNPRIV	1UL
+#define BPF_MAP_PTR_POISON	((void *)((0xeB9FUL << 1) +	\
+					  POISON_POINTER_DELTA))
+#define BPF_MAP_PTR(X)		((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
+
+static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux)
+{
+	return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON;
+}
+
+static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux)
+{
+	return aux->map_state & BPF_MAP_PTR_UNPRIV;
+}
+
+static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux,
+			      const struct bpf_map *map, bool unpriv)
+{
+	BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV);
+	unpriv |= bpf_map_ptr_unpriv(aux);
+	aux->map_state = (unsigned long)map |
+			 (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL);
+}
 
 struct bpf_call_arg_meta {
 	struct bpf_map *map_ptr;
@@ -2333,6 +2355,29 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	return 0;
 }
 
+static int
+record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
+		int func_id, int insn_idx)
+{
+	struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx];
+
+	if (func_id != BPF_FUNC_tail_call &&
+	    func_id != BPF_FUNC_map_lookup_elem)
+		return 0;
+	if (meta->map_ptr == NULL) {
+		verbose(env, "kernel subsystem misconfigured verifier\n");
+		return -EINVAL;
+	}
+
+	if (!BPF_MAP_PTR(aux->map_state))
+		bpf_map_ptr_store(aux, meta->map_ptr,
+				  meta->map_ptr->unpriv_array);
+	else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr)
+		bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON,
+				  meta->map_ptr->unpriv_array);
+	return 0;
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
 	const struct bpf_func_proto *fn = NULL;
@@ -2387,13 +2432,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
 	if (err)
 		return err;
-	if (func_id == BPF_FUNC_tail_call) {
-		if (meta.map_ptr == NULL) {
-			verbose(env, "verifier bug\n");
-			return -EINVAL;
-		}
-		env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
-	}
 	err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
 	if (err)
 		return err;
@@ -2404,6 +2442,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
+	err = record_func_map(env, &meta, func_id, insn_idx);
+	if (err)
+		return err;
+
 	/* Mark slots with STACK_MISC in case of raw mode, stack offset
 	 * is inferred from register state.
 	 */
@@ -2428,8 +2470,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	} else if (fn->ret_type == RET_VOID) {
 		regs[BPF_REG_0].type = NOT_INIT;
 	} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
-		struct bpf_insn_aux_data *insn_aux;
-
 		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
 		/* There is no offset yet applied, variable or fixed */
 		mark_reg_known_zero(env, regs, BPF_REG_0);
@@ -2445,11 +2485,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		}
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
 		regs[BPF_REG_0].id = ++env->id_gen;
-		insn_aux = &env->insn_aux_data[insn_idx];
-		if (!insn_aux->map_ptr)
-			insn_aux->map_ptr = meta.map_ptr;
-		else if (insn_aux->map_ptr != meta.map_ptr)
-			insn_aux->map_ptr = BPF_MAP_PTR_POISON;
 	} else {
 		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
@@ -5417,6 +5452,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 	struct bpf_insn *insn = prog->insnsi;
 	const struct bpf_func_proto *fn;
 	const int insn_cnt = prog->len;
+	struct bpf_insn_aux_data *aux;
 	struct bpf_insn insn_buf[16];
 	struct bpf_prog *new_prog;
 	struct bpf_map *map_ptr;
@@ -5491,19 +5527,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			insn->imm = 0;
 			insn->code = BPF_JMP | BPF_TAIL_CALL;
 
+			aux = &env->insn_aux_data[i + delta];
+			if (!bpf_map_ptr_unpriv(aux))
+				continue;
+
 			/* instead of changing every JIT dealing with tail_call
 			 * emit two extra insns:
 			 * if (index >= max_entries) goto out;
 			 * index &= array->index_mask;
 			 * to avoid out-of-bounds cpu speculation
 			 */
-			map_ptr = env->insn_aux_data[i + delta].map_ptr;
-			if (map_ptr == BPF_MAP_PTR_POISON) {
+			if (bpf_map_ptr_poisoned(aux)) {
 				verbose(env, "tail_call abusing map_ptr\n");
 				return -EINVAL;
 			}
-			if (!map_ptr->unpriv_array)
-				continue;
+
+			map_ptr = BPF_MAP_PTR(aux->map_state);
 			insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
 						  map_ptr->max_entries, 2);
 			insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
@@ -5527,9 +5566,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 		 */
 		if (prog->jit_requested && BITS_PER_LONG == 64 &&
 		    insn->imm == BPF_FUNC_map_lookup_elem) {
-			map_ptr = env->insn_aux_data[i + delta].map_ptr;
-			if (map_ptr == BPF_MAP_PTR_POISON ||
-			    !map_ptr->ops->map_gen_lookup)
+			aux = &env->insn_aux_data[i + delta];
+			if (bpf_map_ptr_poisoned(aux))
+				goto patch_call_imm;
+
+			map_ptr = BPF_MAP_PTR(aux->map_state);
+			if (!map_ptr->ops->map_gen_lookup)
 				goto patch_call_imm;
 
 			cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 5/7] samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY
From: Yonghong Song @ 2018-05-24  0:20 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team

This is mostly to test kprobe/uprobe which needs kernel headers.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 samples/bpf/Makefile             |   4 +
 samples/bpf/task_fd_query_kern.c |  19 ++
 samples/bpf/task_fd_query_user.c | 382 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 405 insertions(+)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 62d1aa1..7dc85ed 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -51,6 +51,7 @@ hostprogs-y += cpustat
 hostprogs-y += xdp_adjust_tail
 hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
+hostprogs-y += task_fd_query
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -105,6 +106,7 @@ cpustat-objs := bpf_load.o cpustat_user.o
 xdp_adjust_tail-objs := xdp_adjust_tail_user.o
 xdpsock-objs := bpf_load.o xdpsock_user.o
 xdp_fwd-objs := bpf_load.o xdp_fwd_user.o
+task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -160,6 +162,7 @@ always += cpustat_kern.o
 always += xdp_adjust_tail_kern.o
 always += xdpsock_kern.o
 always += xdp_fwd_kern.o
+always += task_fd_query_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -175,6 +178,7 @@ HOSTCFLAGS_offwaketime_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_spintest_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_trace_event_user.o += -I$(srctree)/tools/lib/bpf/
 HOSTCFLAGS_sampleip_user.o += -I$(srctree)/tools/lib/bpf/
+HOSTCFLAGS_task_fd_query_user.o += -I$(srctree)/tools/lib/bpf/
 
 HOST_LOADLIBES		+= $(LIBBPF) -lelf
 HOSTLOADLIBES_tracex4		+= -lrt
diff --git a/samples/bpf/task_fd_query_kern.c b/samples/bpf/task_fd_query_kern.c
new file mode 100644
index 0000000..f4b0a9e
--- /dev/null
+++ b/samples/bpf/task_fd_query_kern.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/version.h>
+#include <linux/ptrace.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+SEC("kprobe/blk_start_request")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	return 0;
+}
+
+SEC("kretprobe/blk_account_io_completion")
+int bpf_prog2(struct pt_regs *ctx)
+{
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c
new file mode 100644
index 0000000..8381d79
--- /dev/null
+++ b/samples/bpf/task_fd_query_user.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <linux/bpf.h>
+#include <sys/ioctl.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+#include "trace_helpers.h"
+
+#define CHECK_PERROR_RET(condition) ({			\
+	int __ret = !!(condition);			\
+	if (__ret) {					\
+		printf("FAIL: %s:\n", __func__);	\
+		perror("    ");			\
+		return -1;				\
+	}						\
+})
+
+#define CHECK_AND_RET(condition) ({			\
+	int __ret = !!(condition);			\
+	if (__ret)					\
+		return -1;				\
+})
+
+static __u64 ptr_to_u64(void *ptr)
+{
+	return (__u64) (unsigned long) ptr;
+}
+
+#define PMU_TYPE_FILE "/sys/bus/event_source/devices/%s/type"
+static int bpf_find_probe_type(const char *event_type)
+{
+	char buf[256];
+	int fd, ret;
+
+	ret = snprintf(buf, sizeof(buf), PMU_TYPE_FILE, event_type);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	fd = open(buf, O_RDONLY);
+	CHECK_PERROR_RET(fd < 0);
+
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	errno = 0;
+	ret = (int)strtol(buf, NULL, 10);
+	CHECK_PERROR_RET(errno);
+	return ret;
+}
+
+#define PMU_RETPROBE_FILE "/sys/bus/event_source/devices/%s/format/retprobe"
+static int bpf_get_retprobe_bit(const char *event_type)
+{
+	char buf[256];
+	int fd, ret;
+
+	ret = snprintf(buf, sizeof(buf), PMU_RETPROBE_FILE, event_type);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+
+	fd = open(buf, O_RDONLY);
+	CHECK_PERROR_RET(fd < 0);
+
+	ret = read(fd, buf, sizeof(buf));
+	close(fd);
+	CHECK_PERROR_RET(ret < 0 || ret >= sizeof(buf));
+	CHECK_PERROR_RET(strlen(buf) < strlen("config:"));
+
+	errno = 0;
+	ret = (int)strtol(buf + strlen("config:"), NULL, 10);
+	CHECK_PERROR_RET(errno);
+	return ret;
+}
+
+static int test_debug_fs_kprobe(int prog_fd_idx, const char *fn_name,
+				__u32 expected_fd_type)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	char buf[256];
+	int err;
+
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), event_fd[prog_fd_idx], 0, buf, &len,
+				&prog_id, &fd_type, &probe_offset,
+				&probe_addr);
+	if (err < 0) {
+		printf("FAIL: %s, for event_fd idx %d, fn_name %s\n",
+		       __func__, prog_fd_idx, fn_name);
+		perror("    :");
+		return -1;
+	}
+	if (strcmp(buf, fn_name) != 0 ||
+	    fd_type != expected_fd_type ||
+	    probe_offset != 0x0 || probe_addr != 0x0) {
+		printf("FAIL: bpf_trace_event_query(event_fd[%d]):\n",
+		       prog_fd_idx);
+		printf("buf: %s, fd_type: %u, probe_offset: 0x%llx,"
+		       " probe_addr: 0x%llx\n",
+		       buf, fd_type, probe_offset, probe_addr);
+		return -1;
+	}
+	return 0;
+}
+
+static int test_nondebug_fs_kuprobe_common(const char *event_type,
+	const char *name, __u64 offset, __u64 addr, bool is_return,
+	char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type,
+	__u64 *probe_offset, __u64 *probe_addr)
+{
+	int is_return_bit = bpf_get_retprobe_bit(event_type);
+	int type = bpf_find_probe_type(event_type);
+	struct perf_event_attr attr = {};
+	int fd;
+
+	if (type < 0 || is_return_bit < 0) {
+		printf("FAIL: %s incorrect type (%d) or is_return_bit (%d)\n",
+			__func__, type, is_return_bit);
+		return -1;
+	}
+
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	if (is_return)
+		attr.config |= 1 << is_return_bit;
+
+	if (name) {
+		attr.config1 = ptr_to_u64((void *)name);
+		attr.config2 = offset;
+	} else {
+		attr.config1 = 0;
+		attr.config2 = addr;
+	}
+	attr.size = sizeof(attr);
+	attr.type = type;
+
+	fd = sys_perf_event_open(&attr, -1, 0, -1, 0);
+	CHECK_PERROR_RET(fd < 0);
+
+	CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_ENABLE, 0) < 0);
+	CHECK_PERROR_RET(ioctl(fd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
+	CHECK_PERROR_RET(bpf_task_fd_query(getpid(), fd, 0, buf, buf_len,
+			 prog_id, fd_type, probe_offset, probe_addr) < 0);
+
+	return 0;
+}
+
+static int test_nondebug_fs_probe(const char *event_type, const char *name,
+				  __u64 offset, __u64 addr, bool is_return,
+				  __u32 expected_fd_type,
+				  __u32 expected_ret_fd_type,
+				  char *buf, __u32 buf_len)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 prog_id, fd_type;
+	int err;
+
+	err = test_nondebug_fs_kuprobe_common(event_type, name,
+					      offset, addr, is_return,
+					      buf, &buf_len, &prog_id,
+					      &fd_type, &probe_offset,
+					      &probe_addr);
+	if (err < 0) {
+		printf("FAIL: %s, "
+		       "for name %s, offset 0x%llx, addr 0x%llx, is_return %d\n",
+		       __func__, name ? name : "", offset, addr, is_return);
+		perror("    :");
+		return -1;
+	}
+	if ((is_return && fd_type != expected_ret_fd_type) ||
+	    (!is_return && fd_type != expected_fd_type)) {
+		printf("FAIL: %s, incorrect fd_type %u\n",
+		       __func__, fd_type);
+		return -1;
+	}
+	if (name) {
+		if (strcmp(name, buf) != 0) {
+			printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+			return -1;
+		}
+		if (probe_offset != offset) {
+			printf("FAIL: %s, incorrect probe_offset 0x%llx\n",
+			       __func__, probe_offset);
+			return -1;
+		}
+	} else {
+		if (buf_len != 0) {
+			printf("FAIL: %s, incorrect buf %p\n",
+			       __func__, buf);
+			return -1;
+		}
+
+		if (probe_addr != addr) {
+			printf("FAIL: %s, incorrect probe_addr 0x%llx\n",
+			       __func__, probe_addr);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return)
+{
+	const char *event_type = "uprobe";
+	struct perf_event_attr attr = {};
+	char buf[256], event_alias[256];
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	int err, res, kfd, efd;
+	ssize_t bytes;
+
+	snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/%s_events",
+		 event_type);
+	kfd = open(buf, O_WRONLY | O_APPEND, 0);
+	CHECK_PERROR_RET(kfd < 0);
+
+	res = snprintf(event_alias, sizeof(event_alias), "test_%d", getpid());
+	CHECK_PERROR_RET(res < 0 || res >= sizeof(event_alias));
+
+	res = snprintf(buf, sizeof(buf), "%c:%ss/%s %s:0x%lx",
+		       is_return ? 'r' : 'p', event_type, event_alias,
+		       binary_path, offset);
+	CHECK_PERROR_RET(res < 0 || res >= sizeof(buf));
+	CHECK_PERROR_RET(write(kfd, buf, strlen(buf)) < 0);
+
+	close(kfd);
+	kfd = -1;
+
+	snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/%ss/%s/id",
+		 event_type, event_alias);
+	efd = open(buf, O_RDONLY, 0);
+	CHECK_PERROR_RET(efd < 0);
+
+	bytes = read(efd, buf, sizeof(buf));
+	CHECK_PERROR_RET(bytes <= 0 || bytes >= sizeof(buf));
+	close(efd);
+	buf[bytes] = '\0';
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	kfd = sys_perf_event_open(&attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
+	CHECK_PERROR_RET(kfd < 0);
+	CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_SET_BPF, prog_fd[0]) < 0);
+	CHECK_PERROR_RET(ioctl(kfd, PERF_EVENT_IOC_ENABLE, 0) < 0);
+
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), kfd, 0, buf, &len,
+				&prog_id, &fd_type, &probe_offset,
+				&probe_addr);
+	if (err < 0) {
+		printf("FAIL: %s, binary_path %s\n", __func__, binary_path);
+		perror("    :");
+		return -1;
+	}
+	if ((is_return && fd_type != BPF_FD_TYPE_URETPROBE) ||
+	    (!is_return && fd_type != BPF_FD_TYPE_UPROBE)) {
+		printf("FAIL: %s, incorrect fd_type %u\n", __func__,
+		       fd_type);
+		return -1;
+	}
+	if (strcmp(binary_path, buf) != 0) {
+		printf("FAIL: %s, incorrect buf %s\n", __func__, buf);
+		return -1;
+	}
+	if (probe_offset != offset) {
+		printf("FAIL: %s, incorrect probe_offset 0x%llx\n", __func__,
+		       probe_offset);
+		return -1;
+	}
+
+	close(kfd);
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {1024*1024, RLIM_INFINITY};
+	extern char __executable_start;
+	char filename[256], buf[256];
+	__u64 uprobe_file_offset;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return 1;
+	}
+
+	if (load_kallsyms()) {
+		printf("failed to process /proc/kallsyms\n");
+		return 1;
+	}
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	/* test two functions in the corresponding *_kern.c file */
+	CHECK_AND_RET(test_debug_fs_kprobe(0, "blk_start_request",
+					   BPF_FD_TYPE_KPROBE));
+	CHECK_AND_RET(test_debug_fs_kprobe(1, "blk_account_io_completion",
+					   BPF_FD_TYPE_KRETPROBE));
+
+	/* test nondebug fs kprobe */
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+					     false, BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+#ifdef __x86_64__
+	/* set a kprobe on "bpf_check + 0x5", which is x64 specific */
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x5, 0x0,
+					     false, BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+#endif
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", "bpf_check", 0x0, 0x0,
+					     true, BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), false,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), false,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     NULL, 0));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), true,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("kprobe", NULL, 0x0,
+					     ksym_get_addr("bpf_check"), true,
+					     BPF_FD_TYPE_KPROBE,
+					     BPF_FD_TYPE_KRETPROBE,
+					     0, 0));
+
+	/* test nondebug fs uprobe */
+	/* the calculation of uprobe file offset is based on gcc 7.3.1 on x64
+	 * and the default linker script, which defines __executable_start as
+	 * the start of the .text section. The calculation could be different
+	 * on different systems with different compilers. The right way is
+	 * to parse the ELF file. We took a shortcut here.
+	 */
+	uprobe_file_offset = (__u64)main - (__u64)&__executable_start;
+	CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
+					     uprobe_file_offset, 0x0, false,
+					     BPF_FD_TYPE_UPROBE,
+					     BPF_FD_TYPE_URETPROBE,
+					     buf, sizeof(buf)));
+	CHECK_AND_RET(test_nondebug_fs_probe("uprobe", (char *)argv[0],
+					     uprobe_file_offset, 0x0, true,
+					     BPF_FD_TYPE_UPROBE,
+					     BPF_FD_TYPE_URETPROBE,
+					     buf, sizeof(buf)));
+
+	/* test debug fs uprobe */
+	CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+					   false));
+	CHECK_AND_RET(test_debug_fs_uprobe((char *)argv[0], uprobe_file_offset,
+					   true));
+
+	return 0;
+}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 6/7] tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs
From: Yonghong Song @ 2018-05-24  0:20 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524002020.1182296-1-yhs@fb.com>

The new tests are added to query perf_event information
for raw_tracepoint and tracepoint attachment. For tracepoint,
both syscalls and non-syscalls tracepoints are queries as
they are treated slightly differently inside the kernel.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c | 158 +++++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index 3ecf733..c9fd351 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1542,6 +1542,162 @@ static void test_get_stack_raw_tp(void)
 	bpf_object__close(obj);
 }
 
+static void test_task_fd_query_rawtp(void)
+{
+	const char *file = "./test_get_stack_rawtp.o";
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	struct bpf_object *obj;
+	int efd, err, prog_fd;
+	__u32 duration = 0;
+	char buf[256];
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+		return;
+
+	efd = bpf_raw_tracepoint_open("sys_enter", prog_fd);
+	if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+
+	/* query (getpid(), efd) */
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err,
+		  errno))
+		goto close_prog;
+
+	err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+	      strcmp(buf, "sys_enter") == 0;
+	if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n",
+		  fd_type, buf))
+		goto close_prog;
+
+	/* test zero len */
+	len = 0;
+	err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_task_fd_query (len = 0)", "err %d errno %d\n",
+		  err, errno))
+		goto close_prog;
+	err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+	      len == (strlen("sys_enter") + 1);
+	if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+		goto close_prog;
+
+	/* test empty buffer */
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), efd, 0, 0, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_task_fd_query (buf = 0)", "err %d errno %d\n",
+		  err, errno))
+		goto close_prog;
+	err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+	      len == (strlen("sys_enter") + 1);
+	if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+		goto close_prog;
+
+	/* test smaller buffer */
+	len = 2;
+	err = bpf_task_fd_query(getpid(), efd, 0, buf, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err >= 0 || errno != ENOSPC, "bpf_task_fd_query (len = 2)",
+		  "err %d errno %d\n", err, errno))
+		goto close_prog;
+	err = fd_type == BPF_FD_TYPE_RAW_TRACEPOINT &&
+	      len == (strlen("sys_enter") + 1) &&
+	      strncmp(buf, "sys_enter", 2) == 0;
+	if (CHECK(!err, "check_results", "fd_type %d len %u\n", fd_type, len))
+		goto close_prog;
+
+	goto close_prog_noerr;
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp_core(const char *probe_name,
+				       const char *tp_name)
+{
+	const char *file = "./test_tracepoint.o";
+	int err, bytes, efd, prog_fd, pmu_fd;
+	struct perf_event_attr attr = {};
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	struct bpf_object *obj;
+	__u32 duration = 0;
+	char buf[256];
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	snprintf(buf, sizeof(buf),
+		 "/sys/kernel/debug/tracing/events/%s/id", probe_name);
+	efd = open(buf, O_RDONLY, 0);
+	if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+	bytes = read(efd, buf, sizeof(buf));
+	close(efd);
+	if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+		  "bytes %d errno %d\n", bytes, errno))
+		goto close_prog;
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (CHECK(err, "perf_event_open", "err %d errno %d\n", err, errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	/* query (getpid(), pmu_fd) */
+	len = sizeof(buf);
+	err = bpf_task_fd_query(getpid(), pmu_fd, 0, buf, &len, &prog_id,
+				&fd_type, &probe_offset, &probe_addr);
+	if (CHECK(err < 0, "bpf_task_fd_query", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	err = (fd_type == BPF_FD_TYPE_TRACEPOINT) && !strcmp(buf, tp_name);
+	if (CHECK(!err, "check_results", "fd_type %d tp_name %s\n",
+		  fd_type, buf))
+		goto close_pmu;
+
+	close(pmu_fd);
+	goto close_prog_noerr;
+
+close_pmu:
+	close(pmu_fd);
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
+static void test_task_fd_query_tp(void)
+{
+	test_task_fd_query_tp_core("sched/sched_switch",
+				   "sched_switch");
+	test_task_fd_query_tp_core("syscalls/sys_enter_read",
+				   "sys_enter_read");
+}
+
 int main(void)
 {
 	jit_enabled = is_jit_enabled();
@@ -1561,6 +1717,8 @@ int main(void)
 	test_stacktrace_build_id_nmi();
 	test_stacktrace_map_raw_tp();
 	test_get_stack_raw_tp();
+	test_task_fd_query_rawtp();
+	test_task_fd_query_tp();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 7/7] tools/bpftool: add perf subcommand
From: Yonghong Song @ 2018-05-24  0:20 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524002020.1182296-1-yhs@fb.com>

The new command "bpftool perf [show | list]" will traverse
all processes under /proc, and if any fd is associated
with a perf event, it will print out related perf event
information. Documentation is also added.

Below is an example to show the results using bcc commands.
Running the following 4 bcc commands:
  kprobe:     trace.py '__x64_sys_nanosleep'
  kretprobe:  trace.py 'r::__x64_sys_nanosleep'
  tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
  uprobe:     trace.py 'p:/home/yhs/a.out:main'

The bpftool command line and result:

  $ bpftool perf
  pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
  pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
  pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
  pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159

  $ bpftool -j perf
  [{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0}, \
   {"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \
   {"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \
   {"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}]

  $ bpftool prog
  5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
	  loaded_at 2018-05-15T04:46:37-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 4
  7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
	  loaded_at 2018-05-15T04:48:32-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 7
  8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
	  loaded_at 2018-05-15T04:48:48-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 8
  9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
	  loaded_at 2018-05-15T04:49:52-0700  uid 0
	  xlated 200B  not jited  memlock 4096B  map_ids 9

  $ ps ax | grep "python ./trace.py"
  21711 pts/0    T      0:03 python ./trace.py __x64_sys_write
  21765 pts/0    S+     0:00 python ./trace.py r::__x64_sys_nanosleep
  21767 pts/2    S+     0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
  21800 pts/3    S+     0:00 python ./trace.py p:/home/yhs/a.out:main
  22374 pts/1    S+     0:00 grep --color=auto python ./trace.py

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 ++++++++
 tools/bpf/bpftool/Documentation/bpftool.rst      |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool        |   9 +
 tools/bpf/bpftool/main.c                         |   3 +-
 tools/bpf/bpftool/main.h                         |   1 +
 tools/bpf/bpftool/perf.c                         | 246 +++++++++++++++++++++++
 6 files changed, 343 insertions(+), 2 deletions(-)
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

diff --git a/tools/bpf/bpftool/Documentation/bpftool-perf.rst b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
new file mode 100644
index 0000000..e3eb0ea
--- /dev/null
+++ b/tools/bpf/bpftool/Documentation/bpftool-perf.rst
@@ -0,0 +1,81 @@
+================
+bpftool-perf
+================
+-------------------------------------------------------------------------------
+tool for inspection of perf related bpf prog attachments
+-------------------------------------------------------------------------------
+
+:Manual section: 8
+
+SYNOPSIS
+========
+
+	**bpftool** [*OPTIONS*] **perf** *COMMAND*
+
+	*OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
+
+	*COMMANDS* :=
+	{ **show** | **list** | **help** }
+
+PERF COMMANDS
+=============
+
+|	**bpftool** **perf { show | list }**
+|	**bpftool** **perf help**
+
+DESCRIPTION
+===========
+	**bpftool perf { show | list }**
+		  List all raw_tracepoint, tracepoint, kprobe attachment in the system.
+
+		  Output will start with process id and file descriptor in that process,
+		  followed by bpf program id, attachment information, and attachment point.
+		  The attachment point for raw_tracepoint/tracepoint is the trace probe name.
+		  The attachment point for k[ret]probe is either symbol name and offset,
+		  or a kernel virtual address.
+		  The attachment point for u[ret]probe is the file name and the file offset.
+
+	**bpftool perf help**
+		  Print short help message.
+
+OPTIONS
+=======
+	-h, --help
+		  Print short generic help message (similar to **bpftool help**).
+
+	-v, --version
+		  Print version number (similar to **bpftool version**).
+
+	-j, --json
+		  Generate JSON output. For commands that cannot produce JSON, this
+		  option has no effect.
+
+	-p, --pretty
+		  Generate human-readable JSON output. Implies **-j**.
+
+EXAMPLES
+========
+
+| **# bpftool perf**
+
+::
+
+      pid 21711  fd 5: prog_id 5  kprobe  func __x64_sys_write  offset 0
+      pid 21765  fd 5: prog_id 7  kretprobe  func __x64_sys_nanosleep  offset 0
+      pid 21767  fd 5: prog_id 8  tracepoint  sys_enter_nanosleep
+      pid 21800  fd 5: prog_id 9  uprobe  filename /home/yhs/a.out  offset 1159
+
+|
+| **# bpftool -j perf**
+
+::
+
+    [{"pid":21711,"fd":5,"prog_id":5,"fd_type":"kprobe","func":"__x64_sys_write","offset":0}, \
+     {"pid":21765,"fd":5,"prog_id":7,"fd_type":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \
+     {"pid":21767,"fd":5,"prog_id":8,"fd_type":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \
+     {"pid":21800,"fd":5,"prog_id":9,"fd_type":"uprobe","filename":"/home/yhs/a.out","offset":1159}]
+
+
+SEE ALSO
+========
+	**bpftool**\ (8), **bpftool-prog**\ (8), **bpftool-map**\ (8)
diff --git a/tools/bpf/bpftool/Documentation/bpftool.rst b/tools/bpf/bpftool/Documentation/bpftool.rst
index 564cb0d..b6f5d56 100644
--- a/tools/bpf/bpftool/Documentation/bpftool.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool.rst
@@ -16,7 +16,7 @@ SYNOPSIS
 
 	**bpftool** **version**
 
-	*OBJECT* := { **map** | **program** | **cgroup** }
+	*OBJECT* := { **map** | **program** | **cgroup** | **perf** }
 
 	*OPTIONS* := { { **-V** | **--version** } | { **-h** | **--help** }
 	| { **-j** | **--json** } [{ **-p** | **--pretty** }] }
@@ -30,6 +30,8 @@ SYNOPSIS
 
 	*CGROUP-COMMANDS* := { **show** | **list** | **attach** | **detach** | **help** }
 
+	*PERF-COMMANDS* := { **show** | **list** | **help** }
+
 DESCRIPTION
 ===========
 	*bpftool* allows for inspection and simple modification of BPF objects
@@ -56,3 +58,4 @@ OPTIONS
 SEE ALSO
 ========
 	**bpftool-map**\ (8), **bpftool-prog**\ (8), **bpftool-cgroup**\ (8)
+        **bpftool-perf**\ (8)
diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool
index b301c9b..7bc198d 100644
--- a/tools/bpf/bpftool/bash-completion/bpftool
+++ b/tools/bpf/bpftool/bash-completion/bpftool
@@ -448,6 +448,15 @@ _bpftool()
                     ;;
             esac
             ;;
+        perf)
+            case $command in
+                *)
+                    [[ $prev == $object ]] && \
+                        COMPREPLY=( $( compgen -W 'help \
+                            show list' -- "$cur" ) )
+                    ;;
+            esac
+            ;;
     esac
 } &&
 complete -F _bpftool bpftool
diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
index 1ec852d..eea7f14 100644
--- a/tools/bpf/bpftool/main.c
+++ b/tools/bpf/bpftool/main.c
@@ -87,7 +87,7 @@ static int do_help(int argc, char **argv)
 		"       %s batch file FILE\n"
 		"       %s version\n"
 		"\n"
-		"       OBJECT := { prog | map | cgroup }\n"
+		"       OBJECT := { prog | map | cgroup | perf }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, bin_name, bin_name);
@@ -216,6 +216,7 @@ static const struct cmd cmds[] = {
 	{ "prog",	do_prog },
 	{ "map",	do_map },
 	{ "cgroup",	do_cgroup },
+	{ "perf",	do_perf },
 	{ "version",	do_version },
 	{ 0 }
 };
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 6173cd9..63fdb31 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -119,6 +119,7 @@ int do_prog(int argc, char **arg);
 int do_map(int argc, char **arg);
 int do_event_pipe(int argc, char **argv);
 int do_cgroup(int argc, char **arg);
+int do_perf(int argc, char **arg);
 
 int prog_parse_fd(int *argc, char ***argv);
 int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c
new file mode 100644
index 0000000..ac6b1a1
--- /dev/null
+++ b/tools/bpf/bpftool/perf.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0+
+// Copyright (C) 2018 Facebook
+// Author: Yonghong Song <yhs@fb.com>
+
+#define _GNU_SOURCE
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <ftw.h>
+
+#include <bpf.h>
+
+#include "main.h"
+
+/* 0: undecided, 1: supported, 2: not supported */
+static int perf_query_supported;
+static bool has_perf_query_support(void)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	char buf[256];
+	int fd;
+
+	if (perf_query_supported)
+		goto out;
+
+	fd = open(bin_name, O_RDONLY);
+	if (fd < 0) {
+		p_err("perf_query_support: %s", strerror(errno));
+		goto out;
+	}
+
+	/* the following query will fail as no bpf attachment,
+	 * the expected errno is ENOTSUPP
+	 */
+	errno = 0;
+	len = sizeof(buf);
+	bpf_task_fd_query(getpid(), fd, 0, buf, &len, &prog_id,
+			  &fd_type, &probe_offset, &probe_addr);
+
+	if (errno == 524 /* ENOTSUPP */) {
+		perf_query_supported = 1;
+		goto close_fd;
+	}
+
+	perf_query_supported = 2;
+	p_err("perf_query_support: %s", strerror(errno));
+	fprintf(stderr,
+		"HINT: non root or kernel doesn't support TASK_FD_QUERY\n");
+
+close_fd:
+	close(fd);
+out:
+	return perf_query_supported == 1;
+}
+
+static void print_perf_json(int pid, int fd, __u32 prog_id, __u32 fd_type,
+			    char *buf, __u64 probe_offset, __u64 probe_addr)
+{
+	jsonw_start_object(json_wtr);
+	jsonw_int_field(json_wtr, "pid", pid);
+	jsonw_int_field(json_wtr, "fd", fd);
+	jsonw_uint_field(json_wtr, "prog_id", prog_id);
+	switch (fd_type) {
+	case BPF_FD_TYPE_RAW_TRACEPOINT:
+		jsonw_string_field(json_wtr, "fd_type", "raw_tracepoint");
+		jsonw_string_field(json_wtr, "tracepoint", buf);
+		break;
+	case BPF_FD_TYPE_TRACEPOINT:
+		jsonw_string_field(json_wtr, "fd_type", "tracepoint");
+		jsonw_string_field(json_wtr, "tracepoint", buf);
+		break;
+	case BPF_FD_TYPE_KPROBE:
+		jsonw_string_field(json_wtr, "fd_type", "kprobe");
+		if (buf[0] != '\0') {
+			jsonw_string_field(json_wtr, "func", buf);
+			jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		} else {
+			jsonw_lluint_field(json_wtr, "addr", probe_addr);
+		}
+		break;
+	case BPF_FD_TYPE_KRETPROBE:
+		jsonw_string_field(json_wtr, "fd_type", "kretprobe");
+		if (buf[0] != '\0') {
+			jsonw_string_field(json_wtr, "func", buf);
+			jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		} else {
+			jsonw_lluint_field(json_wtr, "addr", probe_addr);
+		}
+		break;
+	case BPF_FD_TYPE_UPROBE:
+		jsonw_string_field(json_wtr, "fd_type", "uprobe");
+		jsonw_string_field(json_wtr, "filename", buf);
+		jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		break;
+	case BPF_FD_TYPE_URETPROBE:
+		jsonw_string_field(json_wtr, "fd_type", "uretprobe");
+		jsonw_string_field(json_wtr, "filename", buf);
+		jsonw_lluint_field(json_wtr, "offset", probe_offset);
+		break;
+	}
+	jsonw_end_object(json_wtr);
+}
+
+static void print_perf_plain(int pid, int fd, __u32 prog_id, __u32 fd_type,
+			     char *buf, __u64 probe_offset, __u64 probe_addr)
+{
+	printf("pid %d  fd %d: prog_id %u  ", pid, fd, prog_id);
+	switch (fd_type) {
+	case BPF_FD_TYPE_RAW_TRACEPOINT:
+		printf("raw_tracepoint  %s\n", buf);
+		break;
+	case BPF_FD_TYPE_TRACEPOINT:
+		printf("tracepoint  %s\n", buf);
+		break;
+	case BPF_FD_TYPE_KPROBE:
+		if (buf[0] != '\0')
+			printf("kprobe  func %s  offset %llu\n", buf,
+			       probe_offset);
+		else
+			printf("kprobe  addr %llu\n", probe_addr);
+		break;
+	case BPF_FD_TYPE_KRETPROBE:
+		if (buf[0] != '\0')
+			printf("kretprobe  func %s  offset %llu\n", buf,
+			       probe_offset);
+		else
+			printf("kretprobe  addr %llu\n", probe_addr);
+		break;
+	case BPF_FD_TYPE_UPROBE:
+		printf("uprobe  filename %s  offset %llu\n", buf, probe_offset);
+		break;
+	case BPF_FD_TYPE_URETPROBE:
+		printf("uretprobe  filename %s  offset %llu\n", buf,
+		       probe_offset);
+		break;
+	}
+}
+
+static int show_proc(const char *fpath, const struct stat *sb,
+		     int tflag, struct FTW *ftwbuf)
+{
+	__u64 probe_offset, probe_addr;
+	__u32 len, prog_id, fd_type;
+	int err, pid = 0, fd = 0;
+	const char *pch;
+	char buf[4096];
+
+	/* prefix always /proc */
+	pch = fpath + 5;
+	if (*pch == '\0')
+		return 0;
+
+	/* pid should be all numbers */
+	pch++;
+	while (isdigit(*pch)) {
+		pid = pid * 10 + *pch - '0';
+		pch++;
+	}
+	if (*pch == '\0')
+		return 0;
+	if (*pch != '/')
+		return FTW_SKIP_SUBTREE;
+
+	/* check /proc/<pid>/fd directory */
+	pch++;
+	if (strncmp(pch, "fd", 2))
+		return FTW_SKIP_SUBTREE;
+	pch += 2;
+	if (*pch == '\0')
+		return 0;
+	if (*pch != '/')
+		return FTW_SKIP_SUBTREE;
+
+	/* check /proc/<pid>/fd/<fd_num> */
+	pch++;
+	while (isdigit(*pch)) {
+		fd = fd * 10 + *pch - '0';
+		pch++;
+	}
+	if (*pch != '\0')
+		return FTW_SKIP_SUBTREE;
+
+	/* query (pid, fd) for potential perf events */
+	len = sizeof(buf);
+	err = bpf_task_fd_query(pid, fd, 0, buf, &len, &prog_id, &fd_type,
+				&probe_offset, &probe_addr);
+	if (err < 0)
+		return 0;
+
+	if (json_output)
+		print_perf_json(pid, fd, prog_id, fd_type, buf, probe_offset,
+				probe_addr);
+	else
+		print_perf_plain(pid, fd, prog_id, fd_type, buf, probe_offset,
+				 probe_addr);
+
+	return 0;
+}
+
+static int do_show(int argc, char **argv)
+{
+	int flags = FTW_ACTIONRETVAL | FTW_PHYS;
+	int err = 0, nopenfd = 16;
+
+	if (!has_perf_query_support())
+		return -1;
+
+	if (json_output)
+		jsonw_start_array(json_wtr);
+	if (nftw("/proc", show_proc, nopenfd, flags) == -1) {
+		p_err("%s", strerror(errno));
+		err = -1;
+	}
+	if (json_output)
+		jsonw_end_array(json_wtr);
+
+	return err;
+}
+
+static int do_help(int argc, char **argv)
+{
+	fprintf(stderr,
+		"Usage: %s %s { show | list | help }\n"
+		"",
+		bin_name, argv[-2]);
+
+	return 0;
+}
+
+static const struct cmd cmds[] = {
+	{ "show",	do_show },
+	{ "list",	do_show },
+	{ "help",	do_help },
+	{ 0 }
+};
+
+int do_perf(int argc, char **argv)
+{
+	return cmd_select(cmds, argc, argv, do_help);
+}
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 1/7] perf/core: add perf_get_event() to return perf_event given a struct file
From: Yonghong Song @ 2018-05-24  0:18 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524001844.1175727-1-yhs@fb.com>

A new extern function, perf_get_event(), is added to return a perf event
given a struct file. This function will be used in later patches.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/perf_event.h | 5 +++++
 kernel/events/core.c       | 8 ++++++++
 2 files changed, 13 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99e..eec302b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -868,6 +868,7 @@ extern void perf_event_exit_task(struct task_struct *child);
 extern void perf_event_free_task(struct task_struct *task);
 extern void perf_event_delayed_put(struct task_struct *task);
 extern struct file *perf_event_get(unsigned int fd);
+extern const struct perf_event *perf_get_event(struct file *file);
 extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
@@ -1289,6 +1290,10 @@ static inline void perf_event_exit_task(struct task_struct *child)	{ }
 static inline void perf_event_free_task(struct task_struct *task)	{ }
 static inline void perf_event_delayed_put(struct task_struct *task)	{ }
 static inline struct file *perf_event_get(unsigned int fd)	{ return ERR_PTR(-EINVAL); }
+static inline const struct perf_event *perf_get_event(struct file *file)
+{
+	return ERR_PTR(-EINVAL);
+}
 static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
 	return ERR_PTR(-EINVAL);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 67612ce..6eeab86 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11212,6 +11212,14 @@ struct file *perf_event_get(unsigned int fd)
 	return file;
 }
 
+const struct perf_event *perf_get_event(struct file *file)
+{
+	if (file->f_op != &perf_fops)
+		return ERR_PTR(-EINVAL);
+
+	return file->private_data;
+}
+
 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
 {
 	if (!event)
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 0/7] bpf: implement BPF_TASK_FD_QUERY
From: Yonghong Song @ 2018-05-24  0:18 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team

Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, this command will return bpf related information
to user space. Right now it only supports tracepoint/kprobe/uprobe
perf event fd's. For such a fd, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Patch #1 adds function perf_get_event() in kernel/events/core.c.
Patch #2 implements the bpf subcommand BPF_TASK_FD_QUERY.
Patch #3 syncs tools bpf.h header and also add bpf_task_fd_query()
in the libbpf library for samples/selftests/bpftool to use.
Patch #4 adds ksym_get_addr() utility function.
Patch #5 add a test in samples/bpf for querying k[ret]probes and
u[ret]probes.
Patch #6 add a test in tools/testing/selftests/bpf for querying
raw_tracepoint and tracepoint.
Patch #7 add a new subcommand "perf" to bpftool.

Changelogs:
  v3 -> v4:
     . made attr buf_len input/output. The length of
       actual buffter is written to buf_len so user space knows
       what is actually needed. If user provides a buffer
       with length >= 1 but less than required, do partial
       copy and return -ENOSPC.
     . code simplification with put_user.
     . changed query result attach_info to fd_type.
     . add tests at selftests/bpf to test zero len, null buf and
       insufficient buf.
  v2 -> v3:
     . made perf_get_event() return perf_event pointer const.
       this was to ensure that event fields are not meddled.
     . detect whether newly BPF_TASK_FD_QUERY is supported or
       not in "bpftool perf" and warn users if it is not.
  v1 -> v2:
     . changed bpf subcommand name from BPF_PERF_EVENT_QUERY
       to BPF_TASK_FD_QUERY.
     . fixed various "bpftool perf" issues and added documentation
       and auto-completion.

Yonghong Song (7):
  perf/core: add perf_get_event() to return perf_event given a struct
    file
  bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
  tools/bpf: sync kernel header bpf.h and add bpf_task_fd_query in
    libbpf
  tools/bpf: add ksym_get_addr() in trace_helpers
  samples/bpf: add a samples/bpf test for BPF_TASK_FD_QUERY
  tools/bpf: add two BPF_TASK_FD_QUERY tests in test_progs
  tools/bpftool: add perf subcommand

 include/linux/perf_event.h                       |   5 +
 include/linux/trace_events.h                     |  17 +
 include/uapi/linux/bpf.h                         |  26 ++
 kernel/bpf/syscall.c                             | 115 +++++++
 kernel/events/core.c                             |   8 +
 kernel/trace/bpf_trace.c                         |  48 +++
 kernel/trace/trace_kprobe.c                      |  29 ++
 kernel/trace/trace_uprobe.c                      |  22 ++
 samples/bpf/Makefile                             |   4 +
 samples/bpf/task_fd_query_kern.c                 |  19 ++
 samples/bpf/task_fd_query_user.c                 | 382 +++++++++++++++++++++++
 tools/bpf/bpftool/Documentation/bpftool-perf.rst |  81 +++++
 tools/bpf/bpftool/Documentation/bpftool.rst      |   5 +-
 tools/bpf/bpftool/bash-completion/bpftool        |   9 +
 tools/bpf/bpftool/main.c                         |   3 +-
 tools/bpf/bpftool/main.h                         |   1 +
 tools/bpf/bpftool/perf.c                         | 246 +++++++++++++++
 tools/include/uapi/linux/bpf.h                   |  26 ++
 tools/lib/bpf/bpf.c                              |  23 ++
 tools/lib/bpf/bpf.h                              |   3 +
 tools/testing/selftests/bpf/test_progs.c         | 158 ++++++++++
 tools/testing/selftests/bpf/trace_helpers.c      |  12 +
 tools/testing/selftests/bpf/trace_helpers.h      |   1 +
 23 files changed, 1241 insertions(+), 2 deletions(-)
 create mode 100644 samples/bpf/task_fd_query_kern.c
 create mode 100644 samples/bpf/task_fd_query_user.c
 create mode 100644 tools/bpf/bpftool/Documentation/bpftool-perf.rst
 create mode 100644 tools/bpf/bpftool/perf.c

-- 
2.9.5

^ permalink raw reply

* [PATCH bpf-next v4 3/7] tools/bpf: sync kernel header bpf.h and add bpf_task_fd_query in libbpf
From: Yonghong Song @ 2018-05-24  0:18 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524001844.1175727-1-yhs@fb.com>

Sync kernel header bpf.h to tools/include/uapi/linux/bpf.h and
implement bpf_task_fd_query() in libbpf. The test programs
in samples/bpf and tools/testing/selftests/bpf, and later bpftool
will use this libbpf function to query kernel.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/include/uapi/linux/bpf.h | 26 ++++++++++++++++++++++++++
 tools/lib/bpf/bpf.c            | 23 +++++++++++++++++++++++
 tools/lib/bpf/bpf.h            |  3 +++
 3 files changed, 52 insertions(+)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index c3e502d..0d51946 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
 	BPF_RAW_TRACEPOINT_OPEN,
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
+	BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
 		__u32		btf_log_size;
 		__u32		btf_log_level;
 	};
+
+	struct {
+		__u32		pid;		/* input: pid */
+		__u32		fd;		/* input: fd */
+		__u32		flags;		/* input: flags */
+		__u32		buf_len;	/* input/output: buf len */
+		__aligned_u64	buf;		/* input/output:
+						 *   tp_name for tracepoint
+						 *   symbol for kprobe
+						 *   filename for uprobe
+						 */
+		__u32		prog_id;	/* output: prod_id */
+		__u32		fd_type;	/* output: BPF_FD_TYPE_* */
+		__u64		probe_offset;	/* output: probe_offset */
+		__u64		probe_addr;	/* output: probe_addr */
+	} task_fd_query;
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -2458,4 +2475,13 @@ struct bpf_fib_lookup {
 	__u8	dmac[6];     /* ETH_ALEN */
 };
 
+enum bpf_task_fd_type {
+	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
+	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
+	BPF_FD_TYPE_KPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_KRETPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_UPROBE,		/* filename + offset */
+	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index 442b4cd..9ddc89d 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -643,3 +643,26 @@ int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 
 	return fd;
 }
+
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
+		      __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset,
+		      __u64 *probe_addr)
+{
+	union bpf_attr attr = {};
+	int err;
+
+	attr.task_fd_query.pid = pid;
+	attr.task_fd_query.fd = fd;
+	attr.task_fd_query.flags = flags;
+	attr.task_fd_query.buf = ptr_to_u64(buf);
+	attr.task_fd_query.buf_len = *buf_len;
+
+	err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr));
+	*buf_len = attr.task_fd_query.buf_len;
+	*prog_id = attr.task_fd_query.prog_id;
+	*fd_type = attr.task_fd_query.fd_type;
+	*probe_offset = attr.task_fd_query.probe_offset;
+	*probe_addr = attr.task_fd_query.probe_addr;
+
+	return err;
+}
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index d12344f..0639a30 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -107,4 +107,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
 int bpf_raw_tracepoint_open(const char *name, int prog_fd);
 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 		 bool do_log);
+int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len,
+		      __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset,
+		      __u64 *probe_addr);
 #endif
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 2/7] bpf: introduce bpf subcommand BPF_TASK_FD_QUERY
From: Yonghong Song @ 2018-05-24  0:18 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524001844.1175727-1-yhs@fb.com>

Currently, suppose a userspace application has loaded a bpf program
and attached it to a tracepoint/kprobe/uprobe, and a bpf
introspection tool, e.g., bpftool, wants to show which bpf program
is attached to which tracepoint/kprobe/uprobe. Such attachment
information will be really useful to understand the overall bpf
deployment in the system.

There is a name field (16 bytes) for each program, which could
be used to encode the attachment point. There are some drawbacks
for this approaches. First, bpftool user (e.g., an admin) may not
really understand the association between the name and the
attachment point. Second, if one program is attached to multiple
places, encoding a proper name which can imply all these
attachments becomes difficult.

This patch introduces a new bpf subcommand BPF_TASK_FD_QUERY.
Given a pid and fd, if the <pid, fd> is associated with a
tracepoint/kprobe/uprobe perf event, BPF_TASK_FD_QUERY will return
   . prog_id
   . tracepoint name, or
   . k[ret]probe funcname + offset or kernel addr, or
   . u[ret]probe filename + offset
to the userspace.
The user can use "bpftool prog" to find more information about
bpf program itself with prog_id.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/trace_events.h |  17 +++++++
 include/uapi/linux/bpf.h     |  26 ++++++++++
 kernel/bpf/syscall.c         | 115 +++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c     |  48 ++++++++++++++++++
 kernel/trace/trace_kprobe.c  |  29 +++++++++++
 kernel/trace/trace_uprobe.c  |  22 +++++++++
 6 files changed, 257 insertions(+)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 2bde3ef..d34144a 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -473,6 +473,9 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info);
 int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog);
 struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name);
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+			    u32 *fd_type, const char **buf,
+			    u64 *probe_offset, u64 *probe_addr);
 #else
 static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
@@ -504,6 +507,13 @@ static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name
 {
 	return NULL;
 }
+static inline int bpf_get_perf_event_info(const struct perf_event *event,
+					  u32 *prog_id, u32 *fd_type,
+					  const char **buf, u64 *probe_offset,
+					  u64 *probe_addr)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 enum {
@@ -560,10 +570,17 @@ extern void perf_trace_del(struct perf_event *event, int flags);
 #ifdef CONFIG_KPROBE_EVENTS
 extern int  perf_kprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_kprobe_destroy(struct perf_event *event);
+extern int bpf_get_kprobe_info(const struct perf_event *event,
+			       u32 *fd_type, const char **symbol,
+			       u64 *probe_offset, u64 *probe_addr,
+			       bool perf_type_tracepoint);
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
 extern int  perf_uprobe_init(struct perf_event *event, bool is_retprobe);
 extern void perf_uprobe_destroy(struct perf_event *event);
+extern int bpf_get_uprobe_info(const struct perf_event *event,
+			       u32 *fd_type, const char **filename,
+			       u64 *probe_offset, bool perf_type_tracepoint);
 #endif
 extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 				     char *filter_str);
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c3e502d..0d51946 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -97,6 +97,7 @@ enum bpf_cmd {
 	BPF_RAW_TRACEPOINT_OPEN,
 	BPF_BTF_LOAD,
 	BPF_BTF_GET_FD_BY_ID,
+	BPF_TASK_FD_QUERY,
 };
 
 enum bpf_map_type {
@@ -379,6 +380,22 @@ union bpf_attr {
 		__u32		btf_log_size;
 		__u32		btf_log_level;
 	};
+
+	struct {
+		__u32		pid;		/* input: pid */
+		__u32		fd;		/* input: fd */
+		__u32		flags;		/* input: flags */
+		__u32		buf_len;	/* input/output: buf len */
+		__aligned_u64	buf;		/* input/output:
+						 *   tp_name for tracepoint
+						 *   symbol for kprobe
+						 *   filename for uprobe
+						 */
+		__u32		prog_id;	/* output: prod_id */
+		__u32		fd_type;	/* output: BPF_FD_TYPE_* */
+		__u64		probe_offset;	/* output: probe_offset */
+		__u64		probe_addr;	/* output: probe_addr */
+	} task_fd_query;
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
@@ -2458,4 +2475,13 @@ struct bpf_fib_lookup {
 	__u8	dmac[6];     /* ETH_ALEN */
 };
 
+enum bpf_task_fd_type {
+	BPF_FD_TYPE_RAW_TRACEPOINT,	/* tp name */
+	BPF_FD_TYPE_TRACEPOINT,		/* tp name */
+	BPF_FD_TYPE_KPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_KRETPROBE,		/* (symbol + offset) or addr */
+	BPF_FD_TYPE_UPROBE,		/* filename + offset */
+	BPF_FD_TYPE_URETPROBE,		/* filename + offset */
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0b4c945..7dd8c86 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,7 +18,9 @@
 #include <linux/vmalloc.h>
 #include <linux/mmzone.h>
 #include <linux/anon_inodes.h>
+#include <linux/fdtable.h>
 #include <linux/file.h>
+#include <linux/fs.h>
 #include <linux/license.h>
 #include <linux/filter.h>
 #include <linux/version.h>
@@ -2102,6 +2104,116 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
 	return btf_get_fd_by_id(attr->btf_id);
 }
 
+static int bpf_task_fd_query_copy(const union bpf_attr *attr,
+				    union bpf_attr __user *uattr,
+				    u32 prog_id, u32 fd_type,
+				    const char *buf, u64 probe_offset,
+				    u64 probe_addr)
+{
+	void __user *ubuf = u64_to_user_ptr(attr->task_fd_query.buf);
+	u32 len = buf ? strlen(buf) + 1 : 0, input_len;
+	int err = 0;
+
+	if (put_user(len, &uattr->task_fd_query.buf_len))
+		return -EFAULT;
+	input_len = attr->task_fd_query.buf_len;
+	if (input_len && len && ubuf) {
+		if (input_len < len) {
+			err = -ENOSPC;
+			len = input_len;
+		}
+		if (copy_to_user(ubuf, buf, len))
+			return -EFAULT;
+	}
+
+	if (put_user(prog_id, &uattr->task_fd_query.prog_id) ||
+	    put_user(fd_type, &uattr->task_fd_query.fd_type) ||
+	    put_user(probe_offset, &uattr->task_fd_query.probe_offset) ||
+	    put_user(probe_addr, &uattr->task_fd_query.probe_addr))
+		return -EFAULT;
+
+	return err;
+}
+
+#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr
+
+static int bpf_task_fd_query(const union bpf_attr *attr,
+			     union bpf_attr __user *uattr)
+{
+	pid_t pid = attr->task_fd_query.pid;
+	u32 fd = attr->task_fd_query.fd;
+	const struct perf_event *event;
+	struct files_struct *files;
+	struct task_struct *task;
+	struct file *file;
+	int err;
+
+	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (attr->task_fd_query.flags != 0)
+		return -EINVAL;
+
+	task = get_pid_task(find_vpid(pid), PIDTYPE_PID);
+	if (!task)
+		return -ENOENT;
+
+	files = get_files_struct(task);
+	put_task_struct(task);
+	if (!files)
+		return -ENOENT;
+
+	err = 0;
+	spin_lock(&files->file_lock);
+	file = fcheck_files(files, fd);
+	if (!file)
+		err = -EBADF;
+	else
+		get_file(file);
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+
+	if (err)
+		goto out;
+
+	if (file->f_op == &bpf_raw_tp_fops) {
+		struct bpf_raw_tracepoint *raw_tp = file->private_data;
+		struct bpf_raw_event_map *btp = raw_tp->btp;
+
+		err = bpf_task_fd_query_copy(attr, uattr,
+					     raw_tp->prog->aux->id,
+					     BPF_FD_TYPE_RAW_TRACEPOINT,
+					     btp->tp->name, 0, 0);
+		goto put_file;
+	}
+
+	event = perf_get_event(file);
+	if (!IS_ERR(event)) {
+		u64 probe_offset, probe_addr;
+		u32 prog_id, fd_type;
+		const char *buf;
+
+		err = bpf_get_perf_event_info(event, &prog_id, &fd_type,
+					      &buf, &probe_offset,
+					      &probe_addr);
+		if (!err)
+			err = bpf_task_fd_query_copy(attr, uattr, prog_id,
+						     fd_type, buf,
+						     probe_offset,
+						     probe_addr);
+		goto put_file;
+	}
+
+	err = -ENOTSUPP;
+put_file:
+	fput(file);
+out:
+	return err;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr = {};
@@ -2188,6 +2300,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_BTF_GET_FD_BY_ID:
 		err = bpf_btf_get_fd_by_id(&attr);
 		break;
+	case BPF_TASK_FD_QUERY:
+		err = bpf_task_fd_query(&attr, uattr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ce2cbbf..81fdf2f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,6 +14,7 @@
 #include <linux/uaccess.h>
 #include <linux/ctype.h>
 #include <linux/kprobes.h>
+#include <linux/syscalls.h>
 #include <linux/error-injection.h>
 
 #include "trace_probe.h"
@@ -1163,3 +1164,50 @@ int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog)
 	mutex_unlock(&bpf_event_mutex);
 	return err;
 }
+
+int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id,
+			    u32 *fd_type, const char **buf,
+			    u64 *probe_offset, u64 *probe_addr)
+{
+	bool is_tracepoint, is_syscall_tp;
+	struct bpf_prog *prog;
+	int flags, err = 0;
+
+	prog = event->prog;
+	if (!prog)
+		return -ENOENT;
+
+	/* not supporting BPF_PROG_TYPE_PERF_EVENT yet */
+	if (prog->type == BPF_PROG_TYPE_PERF_EVENT)
+		return -EOPNOTSUPP;
+
+	*prog_id = prog->aux->id;
+	flags = event->tp_event->flags;
+	is_tracepoint = flags & TRACE_EVENT_FL_TRACEPOINT;
+	is_syscall_tp = is_syscall_trace_event(event->tp_event);
+
+	if (is_tracepoint || is_syscall_tp) {
+		*buf = is_tracepoint ? event->tp_event->tp->name
+				     : event->tp_event->name;
+		*fd_type = BPF_FD_TYPE_TRACEPOINT;
+		*probe_offset = 0x0;
+		*probe_addr = 0x0;
+	} else {
+		/* kprobe/uprobe */
+		err = -EOPNOTSUPP;
+#ifdef CONFIG_KPROBE_EVENTS
+		if (flags & TRACE_EVENT_FL_KPROBE)
+			err = bpf_get_kprobe_info(event, fd_type, buf,
+						  probe_offset, probe_addr,
+						  event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+#ifdef CONFIG_UPROBE_EVENTS
+		if (flags & TRACE_EVENT_FL_UPROBE)
+			err = bpf_get_uprobe_info(event, fd_type, buf,
+						  probe_offset,
+						  event->attr.type == PERF_TYPE_TRACEPOINT);
+#endif
+	}
+
+	return err;
+}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 02aed76..daa8157 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1287,6 +1287,35 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
+
+int bpf_get_kprobe_info(const struct perf_event *event, u32 *fd_type,
+			const char **symbol, u64 *probe_offset,
+			u64 *probe_addr, bool perf_type_tracepoint)
+{
+	const char *pevent = trace_event_name(event->tp_event);
+	const char *group = event->tp_event->class->system;
+	struct trace_kprobe *tk;
+
+	if (perf_type_tracepoint)
+		tk = find_trace_kprobe(pevent, group);
+	else
+		tk = event->tp_event->data;
+	if (!tk)
+		return -EINVAL;
+
+	*fd_type = trace_kprobe_is_return(tk) ? BPF_FD_TYPE_KRETPROBE
+					      : BPF_FD_TYPE_KPROBE;
+	if (tk->symbol) {
+		*symbol = tk->symbol;
+		*probe_offset = tk->rp.kp.offset;
+		*probe_addr = 0;
+	} else {
+		*symbol = NULL;
+		*probe_offset = 0;
+		*probe_addr = (unsigned long)tk->rp.kp.addr;
+	}
+	return 0;
+}
 #endif	/* CONFIG_PERF_EVENTS */
 
 /*
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index ac89287..bf89a51 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1161,6 +1161,28 @@ static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
 {
 	__uprobe_perf_func(tu, func, regs, ucb, dsize);
 }
+
+int bpf_get_uprobe_info(const struct perf_event *event, u32 *fd_type,
+			const char **filename, u64 *probe_offset,
+			bool perf_type_tracepoint)
+{
+	const char *pevent = trace_event_name(event->tp_event);
+	const char *group = event->tp_event->class->system;
+	struct trace_uprobe *tu;
+
+	if (perf_type_tracepoint)
+		tu = find_probe_event(pevent, group);
+	else
+		tu = event->tp_event->data;
+	if (!tu)
+		return -EINVAL;
+
+	*fd_type = is_ret_probe(tu) ? BPF_FD_TYPE_URETPROBE
+				    : BPF_FD_TYPE_UPROBE;
+	*filename = tu->filename;
+	*probe_offset = tu->offset;
+	return 0;
+}
 #endif	/* CONFIG_PERF_EVENTS */
 
 static int
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v4 4/7] tools/bpf: add ksym_get_addr() in trace_helpers
From: Yonghong Song @ 2018-05-24  0:18 UTC (permalink / raw)
  To: peterz, ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180524001844.1175727-1-yhs@fb.com>

Given a kernel function name, ksym_get_addr() will return the kernel
address for this function, or 0 if it cannot find this function name
in /proc/kallsyms. This function will be used later when a kernel
address is used to initiate a kprobe perf event.

Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/trace_helpers.c | 12 ++++++++++++
 tools/testing/selftests/bpf/trace_helpers.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
index 8fb4fe8..3868dcb 100644
--- a/tools/testing/selftests/bpf/trace_helpers.c
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -72,6 +72,18 @@ struct ksym *ksym_search(long key)
 	return &syms[0];
 }
 
+long ksym_get_addr(const char *name)
+{
+	int i;
+
+	for (i = 0; i < sym_cnt; i++) {
+		if (strcmp(syms[i].name, name) == 0)
+			return syms[i].addr;
+	}
+
+	return 0;
+}
+
 static int page_size;
 static int page_cnt = 8;
 static struct perf_event_mmap_page *header;
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
index 36d90e3..3b4bcf7 100644
--- a/tools/testing/selftests/bpf/trace_helpers.h
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -11,6 +11,7 @@ struct ksym {
 
 int load_kallsyms(void);
 struct ksym *ksym_search(long key);
+long ksym_get_addr(const char *name);
 
 typedef enum bpf_perf_event_ret (*perf_event_print_fn)(void *data, int size);
 
-- 
2.9.5

^ permalink raw reply related

* [PATCH v2 net-next 2/3] net/ipv6: Udate fib6_table_lookup tracepoint
From: dsahern @ 2018-05-24  0:08 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern
In-Reply-To: <20180524000849.6553-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Commit bb0ad1987e96 ("ipv6: fib6_rules: support for match on sport, dport
and ip proto") added support for protocol and ports to FIB rules.
Update the FIB lookup tracepoint to dump the parameters.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/trace/events/fib6.h | 29 ++++++++++++++++++++++-------
 net/core/net-traces.c       |  4 ----
 net/ipv6/route.c            |  9 +++++++--
 3 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/include/trace/events/fib6.h b/include/trace/events/fib6.h
index 1b8d951e3c12..b088b54d699c 100644
--- a/include/trace/events/fib6.h
+++ b/include/trace/events/fib6.h
@@ -19,7 +19,7 @@ TRACE_EVENT(fib6_table_lookup,
 
 	TP_STRUCT__entry(
 		__field(	u32,	tb_id		)
-
+		__field(	int,	err		)
 		__field(	int,	oif		)
 		__field(	int,	iif		)
 		__field(	__u8,	tos		)
@@ -27,7 +27,10 @@ TRACE_EVENT(fib6_table_lookup,
 		__field(	__u8,	flags		)
 		__array(	__u8,	src,	16	)
 		__array(	__u8,	dst,	16	)
-
+		__field(        u16,	sport		)
+		__field(        u16,	dport		)
+		__field(        u8,	proto		)
+		__field(        u8,	rt_type		)
 		__dynamic_array(	char,	name,	IFNAMSIZ )
 		__array(		__u8,	gw,	16	 )
 	),
@@ -36,6 +39,7 @@ TRACE_EVENT(fib6_table_lookup,
 		struct in6_addr *in6;
 
 		__entry->tb_id = table->tb6_id;
+		__entry->err = ip6_rt_type_to_error(f6i->fib6_type);
 		__entry->oif = flp->flowi6_oif;
 		__entry->iif = flp->flowi6_iif;
 		__entry->tos = ip6_tclass(flp->flowlabel);
@@ -48,10 +52,20 @@ TRACE_EVENT(fib6_table_lookup,
 		in6 = (struct in6_addr *)__entry->dst;
 		*in6 = flp->daddr;
 
+		__entry->proto = flp->flowi6_proto;
+		if (__entry->proto == IPPROTO_TCP ||
+		    __entry->proto == IPPROTO_UDP) {
+			__entry->sport = ntohs(flp->fl6_sport);
+			__entry->dport = ntohs(flp->fl6_dport);
+		} else {
+			__entry->sport = 0;
+			__entry->dport = 0;
+		}
+
 		if (f6i->fib6_nh.nh_dev) {
 			__assign_str(name, f6i->fib6_nh.nh_dev);
 		} else {
-			__assign_str(name, "");
+			__assign_str(name, "-");
 		}
 		if (f6i == net->ipv6.fib6_null_entry) {
 			struct in6_addr in6_zero = {};
@@ -65,10 +79,11 @@ TRACE_EVENT(fib6_table_lookup,
 		}
 	),
 
-	TP_printk("table %3u oif %d iif %d src %pI6c dst %pI6c tos %d scope %d flags %x ==> dev %s gw %pI6c",
-		  __entry->tb_id, __entry->oif, __entry->iif,
-		  __entry->src, __entry->dst, __entry->tos, __entry->scope,
-		  __entry->flags, __get_str(name), __entry->gw)
+	TP_printk("table %3u oif %d iif %d proto %u %pI6c/%u -> %pI6c/%u tos %d scope %d flags %x ==> dev %s gw %pI6c err %d",
+		  __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
+		  __entry->src, __entry->sport, __entry->dst, __entry->dport,
+		  __entry->tos, __entry->scope, __entry->flags,
+		  __get_str(name), __entry->gw, __entry->err)
 );
 
 #endif /* _TRACE_FIB6_H */
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 380934580fa1..419af6dfe29f 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -35,10 +35,6 @@
 #include <trace/events/tcp.h>
 #include <trace/events/fib.h>
 #include <trace/events/qdisc.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <trace/events/fib6.h>
-EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
-#endif
 #if IS_ENABLED(CONFIG_BRIDGE)
 #include <trace/events/bridge.h>
 EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index bcb8785c0451..8d9e460ff88b 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -63,14 +63,19 @@
 #include <net/lwtunnel.h>
 #include <net/ip_tunnels.h>
 #include <net/l3mdev.h>
-#include <trace/events/fib6.h>
-
 #include <linux/uaccess.h>
 
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
 
+static int ip6_rt_type_to_error(u8 fib6_type);
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/fib6.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
+#undef CREATE_TRACE_POINTS
+
 enum rt6_nud_state {
 	RT6_NUD_FAIL_HARD = -3,
 	RT6_NUD_FAIL_PROBE = -2,
-- 
2.11.0

^ permalink raw reply related

* [PATCH v2 net-next 0/3] net: Update fib_table_lookup tracepoints
From: dsahern @ 2018-05-24  0:08 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern

From: David Ahern <dsahern@gmail.com>

Update the FIB lookup tracepoints to include ip proto and port fields
from the flow struct. In the process make the IPv4 tracepoint inline
with IPv6 which is much easier to use and follow the lookup and result.

Remove the tracepoint in fib_validate_source which does not provide
value above the fib_table_lookup which immediately follows it.

v2
- move CREATE_TRACE_POINTS for the v6 tracepoint to route.c to handle
  its need for an internal function to convert route type to error and
  handle IPv6 as a module or builtin. Reported by kbuild robot.

David Ahern (3):
  net/ipv4: Udate fib_table_lookup tracepoint
  net/ipv6: Udate fib6_table_lookup tracepoint
  net/ipv4: Remove tracepoint in fib_validate_source

 include/trace/events/fib.h  | 107 ++++++++++++++++++--------------------------
 include/trace/events/fib6.h |  29 +++++++++---
 net/core/net-traces.c       |   4 --
 net/ipv4/fib_frontend.c     |   2 -
 net/ipv4/fib_trie.c         |  14 +++---
 net/ipv6/route.c            |   9 +++-
 6 files changed, 81 insertions(+), 84 deletions(-)

-- 
2.11.0

^ permalink raw reply

* [PATCH v2 net-next 3/3] net/ipv4: Remove tracepoint in fib_validate_source
From: dsahern @ 2018-05-24  0:08 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern
In-Reply-To: <20180524000849.6553-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Tracepoint does not add value and the call to fib_lookup follows
it which shows the same information and the fib lookup result.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/trace/events/fib.h | 35 -----------------------------------
 net/ipv4/fib_frontend.c    |  2 --
 2 files changed, 37 deletions(-)

diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index f5a1d4c518d8..9763cddd0594 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -87,41 +87,6 @@ TRACE_EVENT(fib_table_lookup,
 		  __entry->tos, __entry->scope, __entry->flags,
 		  __get_str(name), __entry->gw, __entry->saddr, __entry->err)
 );
-
-TRACE_EVENT(fib_validate_source,
-
-	TP_PROTO(const struct net_device *dev, const struct flowi4 *flp),
-
-	TP_ARGS(dev, flp),
-
-	TP_STRUCT__entry(
-		__string(	name,	dev->name	)
-		__field(	int,	oif		)
-		__field(	int,	iif		)
-		__field(	__u8,	tos		)
-		__array(	__u8,	src,	4	)
-		__array(	__u8,	dst,	4	)
-	),
-
-	TP_fast_assign(
-		__be32 *p32;
-
-		__assign_str(name, dev ? dev->name : "not set");
-		__entry->oif = flp->flowi4_oif;
-		__entry->iif = flp->flowi4_iif;
-		__entry->tos = flp->flowi4_tos;
-
-		p32 = (__be32 *) __entry->src;
-		*p32 = flp->saddr;
-
-		p32 = (__be32 *) __entry->dst;
-		*p32 = flp->daddr;
-	),
-
-	TP_printk("dev %s oif %d iif %d tos %d src %pI4 dst %pI4",
-		  __get_str(name), __entry->oif, __entry->iif, __entry->tos,
-		  __entry->src, __entry->dst)
-);
 #endif /* _TRACE_FIB_H */
 
 /* This part must be outside protection */
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4d622112bf95..58696b829065 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -354,8 +354,6 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 		fl4.fl4_dport = 0;
 	}
 
-	trace_fib_validate_source(dev, &fl4);
-
 	if (fib_lookup(net, &fl4, &res, 0))
 		goto last_resort;
 	if (res.type != RTN_UNICAST &&
-- 
2.11.0

^ permalink raw reply related

* [PATCH v2 net-next 1/3] net/ipv4: Udate fib_table_lookup tracepoint
From: dsahern @ 2018-05-24  0:08 UTC (permalink / raw)
  To: netdev; +Cc: David Ahern
In-Reply-To: <20180524000849.6553-1-dsahern@kernel.org>

From: David Ahern <dsahern@gmail.com>

Commit 4a2d73a4fb36 ("ipv4: fib_rules: support match on sport, dport
and ip proto") added support for protocol and ports to FIB rules.
Update the FIB lookup tracepoint to dump the parameters.

In addition, make the IPv4 tracepoint similar to the IPv6 one where
the lookup parameters and result are dumped in 1 event. It is much
easier to use and understand the outcome of the lookup.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 include/trace/events/fib.h | 72 +++++++++++++++++++++++++++-------------------
 net/ipv4/fib_trie.c        | 14 +++++----
 2 files changed, 52 insertions(+), 34 deletions(-)

diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
index 81b7e985bb45..f5a1d4c518d8 100644
--- a/include/trace/events/fib.h
+++ b/include/trace/events/fib.h
@@ -12,12 +12,14 @@
 
 TRACE_EVENT(fib_table_lookup,
 
-	TP_PROTO(u32 tb_id, const struct flowi4 *flp),
+	TP_PROTO(u32 tb_id, const struct flowi4 *flp,
+		 const struct fib_nh *nh, int err),
 
-	TP_ARGS(tb_id, flp),
+	TP_ARGS(tb_id, flp, nh, err),
 
 	TP_STRUCT__entry(
 		__field(	u32,	tb_id		)
+		__field(	int,	err		)
 		__field(	int,	oif		)
 		__field(	int,	iif		)
 		__field(	__u8,	tos		)
@@ -25,12 +27,19 @@ TRACE_EVENT(fib_table_lookup,
 		__field(	__u8,	flags		)
 		__array(	__u8,	src,	4	)
 		__array(	__u8,	dst,	4	)
+		__array(	__u8,	gw,	4	)
+		__array(	__u8,	saddr,	4	)
+		__field(	u16,	sport		)
+		__field(	u16,	dport		)
+		__field(	u8,	proto		)
+		__dynamic_array(char,  name,   IFNAMSIZ )
 	),
 
 	TP_fast_assign(
 		__be32 *p32;
 
 		__entry->tb_id = tb_id;
+		__entry->err = err;
 		__entry->oif = flp->flowi4_oif;
 		__entry->iif = flp->flowi4_iif;
 		__entry->tos = flp->flowi4_tos;
@@ -42,36 +51,41 @@ TRACE_EVENT(fib_table_lookup,
 
 		p32 = (__be32 *) __entry->dst;
 		*p32 = flp->daddr;
-	),
-
-	TP_printk("table %u oif %d iif %d src %pI4 dst %pI4 tos %d scope %d flags %x",
-		  __entry->tb_id, __entry->oif, __entry->iif,
-		  __entry->src, __entry->dst, __entry->tos, __entry->scope,
-		  __entry->flags)
-);
-
-TRACE_EVENT(fib_table_lookup_nh,
-
-	TP_PROTO(const struct fib_nh *nh),
-
-	TP_ARGS(nh),
-
-	TP_STRUCT__entry(
-		__string(	name,	nh->nh_dev->name)
-		__field(	int,	oif		)
-		__array(	__u8,	src,	4	)
-	),
-
-	TP_fast_assign(
-		__be32 *p32 = (__be32 *) __entry->src;
 
-		__assign_str(name, nh->nh_dev ? nh->nh_dev->name : "not set");
-		__entry->oif = nh->nh_oif;
-		*p32 = nh->nh_saddr;
+		__entry->proto = flp->flowi4_proto;
+		if (__entry->proto == IPPROTO_TCP ||
+		    __entry->proto == IPPROTO_UDP) {
+			__entry->sport = ntohs(flp->fl4_sport);
+			__entry->dport = ntohs(flp->fl4_dport);
+		} else {
+			__entry->sport = 0;
+			__entry->dport = 0;
+		}
+
+		if (nh) {
+			p32 = (__be32 *) __entry->saddr;
+			*p32 = nh->nh_saddr;
+
+			p32 = (__be32 *) __entry->gw;
+			*p32 = nh->nh_gw;
+
+			__assign_str(name, nh->nh_dev ? nh->nh_dev->name : "-");
+		} else {
+			p32 = (__be32 *) __entry->saddr;
+			*p32 = 0;
+
+			p32 = (__be32 *) __entry->gw;
+			*p32 = 0;
+
+			__assign_str(name, "-");
+		}
 	),
 
-	TP_printk("nexthop dev %s oif %d src %pI4",
-		  __get_str(name), __entry->oif, __entry->src)
+	TP_printk("table %u oif %d iif %d proto %u %pI4/%u -> %pI4/%u tos %d scope %d flags %x ==> dev %s gw %pI4 src %pI4 err %d",
+		  __entry->tb_id, __entry->oif, __entry->iif, __entry->proto,
+		  __entry->src, __entry->sport, __entry->dst, __entry->dport,
+		  __entry->tos, __entry->scope, __entry->flags,
+		  __get_str(name), __entry->gw, __entry->saddr, __entry->err)
 );
 
 TRACE_EVENT(fib_validate_source,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 3dcffd3ce98c..65c340f230ae 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1326,14 +1326,14 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 	unsigned long index;
 	t_key cindex;
 
-	trace_fib_table_lookup(tb->tb_id, flp);
-
 	pn = t->kv;
 	cindex = 0;
 
 	n = get_child_rcu(pn, cindex);
-	if (!n)
+	if (!n) {
+		trace_fib_table_lookup(tb->tb_id, flp, NULL, -EAGAIN);
 		return -EAGAIN;
+	}
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 	this_cpu_inc(stats->gets);
@@ -1416,8 +1416,11 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 				 * nothing for us to do as we do not have any
 				 * further nodes to parse.
 				 */
-				if (IS_TRIE(pn))
+				if (IS_TRIE(pn)) {
+					trace_fib_table_lookup(tb->tb_id, flp,
+							       NULL, -EAGAIN);
 					return -EAGAIN;
+				}
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 				this_cpu_inc(stats->backtrack);
 #endif
@@ -1459,6 +1462,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(stats->semantic_match_passed);
 #endif
+			trace_fib_table_lookup(tb->tb_id, flp, NULL, err);
 			return err;
 		}
 		if (fi->fib_flags & RTNH_F_DEAD)
@@ -1494,7 +1498,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 			this_cpu_inc(stats->semantic_match_passed);
 #endif
-			trace_fib_table_lookup_nh(nh);
+			trace_fib_table_lookup(tb->tb_id, flp, nh, err);
 
 			return err;
 		}
-- 
2.11.0

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox