Netdev List
 help / color / mirror / Atom feed
* [PATCH net 2/3] amd-xgbe: Improve KR auto-negotiation and training
From: Tom Lendacky @ 2018-04-23 16:43 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20180423164258.18740.98574.stgit@tlendack-t1.amdoffice.net>

Update xgbe-phy-v2.c to make use of the auto-negotiation (AN) phy hooks
to improve the ability to successfully complete Clause 73 AN when running
at 10gbps.  Hardware can sometimes have issues with CDR lock when the
AN DME page exchange is being performed.

The AN and KR training hooks are used as follows:
- The pre AN hook is used to disable CDR tracking in the PHY so that the
  DME page exchange can be successfully and consistently completed.
- The post KR training hook is used to re-enable the CDR tracking so that
  KR training can successfully complete.
- The post AN hook is used to check for an unsuccessful AN which will
  increase a CDR tracking enablement delay (up to a maximum value).

Add two debugfs entries to allow control over use of the CDR tracking
workaround.  The debugfs entries allow the CDR tracking workaround to
be disabled and determine whether to re-enable CDR tracking before or
after link training has been initiated.

Also, with these changes the receiver reset cycle that is performed during
the link status check can be performed less often.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-common.h  |    8 ++
 drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c |   16 +++
 drivers/net/ethernet/amd/xgbe/xgbe-main.c    |    1 
 drivers/net/ethernet/amd/xgbe/xgbe-mdio.c    |    8 +-
 drivers/net/ethernet/amd/xgbe/xgbe-pci.c     |    2 
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c  |  125 ++++++++++++++++++++++++++
 drivers/net/ethernet/amd/xgbe/xgbe.h         |    4 +
 7 files changed, 160 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-common.h b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
index 7ea72ef..d272dc6 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-common.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-common.h
@@ -1321,6 +1321,10 @@
 #define MDIO_VEND2_AN_STAT		0x8002
 #endif
 
+#ifndef MDIO_VEND2_PMA_CDR_CONTROL
+#define MDIO_VEND2_PMA_CDR_CONTROL	0x8056
+#endif
+
 #ifndef MDIO_CTRL1_SPEED1G
 #define MDIO_CTRL1_SPEED1G		(MDIO_CTRL1_SPEED10G & ~BMCR_SPEED100)
 #endif
@@ -1369,6 +1373,10 @@
 #define XGBE_AN_CL37_TX_CONFIG_MASK	0x08
 #define XGBE_AN_CL37_MII_CTRL_8BIT	0x0100
 
+#define XGBE_PMA_CDR_TRACK_EN_MASK	0x01
+#define XGBE_PMA_CDR_TRACK_EN_OFF	0x00
+#define XGBE_PMA_CDR_TRACK_EN_ON	0x01
+
 /* Bit setting and getting macros
  *  The get macro will extract the current bit field value from within
  *  the variable
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
index 7d128be..b911439 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-debugfs.c
@@ -519,6 +519,22 @@ void xgbe_debugfs_init(struct xgbe_prv_data *pdata)
 				   "debugfs_create_file failed\n");
 	}
 
+	if (pdata->vdata->an_cdr_workaround) {
+		pfile = debugfs_create_bool("an_cdr_workaround", 0600,
+					    pdata->xgbe_debugfs,
+					    &pdata->debugfs_an_cdr_workaround);
+		if (!pfile)
+			netdev_err(pdata->netdev,
+				   "debugfs_create_bool failed\n");
+
+		pfile = debugfs_create_bool("an_cdr_track_early", 0600,
+					    pdata->xgbe_debugfs,
+					    &pdata->debugfs_an_cdr_track_early);
+		if (!pfile)
+			netdev_err(pdata->netdev,
+				   "debugfs_create_bool failed\n");
+	}
+
 	kfree(buf);
 }
 
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-main.c b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
index 795e556..441d0973 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-main.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-main.c
@@ -349,6 +349,7 @@ int xgbe_config_netdev(struct xgbe_prv_data *pdata)
 	XGMAC_SET_BITS(pdata->rss_options, MAC_RSSCR, UDP4TE, 1);
 
 	/* Call MDIO/PHY initialization routine */
+	pdata->debugfs_an_cdr_workaround = pdata->vdata->an_cdr_workaround;
 	ret = pdata->phy_if.phy_init(pdata);
 	if (ret)
 		return ret;
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
index e3d361e..1b45cd7 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-mdio.c
@@ -432,6 +432,8 @@ static void xgbe_an73_disable(struct xgbe_prv_data *pdata)
 	xgbe_an73_set(pdata, false, false);
 	xgbe_an73_disable_interrupts(pdata);
 
+	pdata->an_start = 0;
+
 	netif_dbg(pdata, link, pdata->netdev, "CL73 AN disabled\n");
 }
 
@@ -511,11 +513,11 @@ static enum xgbe_an xgbe_an73_tx_training(struct xgbe_prv_data *pdata,
 		XMDIO_WRITE(pdata, MDIO_MMD_PMAPMD, MDIO_PMA_10GBR_PMD_CTRL,
 			    reg);
 
-		if (pdata->phy_if.phy_impl.kr_training_post)
-			pdata->phy_if.phy_impl.kr_training_post(pdata);
-
 		netif_dbg(pdata, link, pdata->netdev,
 			  "KR training initiated\n");
+
+		if (pdata->phy_if.phy_impl.kr_training_post)
+			pdata->phy_if.phy_impl.kr_training_post(pdata);
 	}
 
 	return XGBE_AN_PAGE_RECEIVED;
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
index eb23f9b..82d1f41 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
@@ -456,6 +456,7 @@ static int xgbe_pci_resume(struct pci_dev *pdev)
 	.irq_reissue_support		= 1,
 	.tx_desc_prefetch		= 5,
 	.rx_desc_prefetch		= 5,
+	.an_cdr_workaround		= 1,
 };
 
 static const struct xgbe_version_data xgbe_v2b = {
@@ -470,6 +471,7 @@ static int xgbe_pci_resume(struct pci_dev *pdev)
 	.irq_reissue_support		= 1,
 	.tx_desc_prefetch		= 5,
 	.rx_desc_prefetch		= 5,
+	.an_cdr_workaround		= 1,
 };
 
 static const struct pci_device_id xgbe_pci_table[] = {
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 3304a29..b48efc0 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -147,6 +147,14 @@
 /* Rate-change complete wait/retry count */
 #define XGBE_RATECHANGE_COUNT		500
 
+/* CDR delay values for KR support (in usec) */
+#define XGBE_CDR_DELAY_INIT		10000
+#define XGBE_CDR_DELAY_INC		10000
+#define XGBE_CDR_DELAY_MAX		100000
+
+/* RRC frequency during link status check */
+#define XGBE_RRC_FREQUENCY		10
+
 enum xgbe_port_mode {
 	XGBE_PORT_MODE_RSVD = 0,
 	XGBE_PORT_MODE_BACKPLANE,
@@ -355,6 +363,10 @@ struct xgbe_phy_data {
 	unsigned int redrv_addr;
 	unsigned int redrv_lane;
 	unsigned int redrv_model;
+
+	/* KR AN support */
+	unsigned int phy_cdr_notrack;
+	unsigned int phy_cdr_delay;
 };
 
 /* I2C, MDIO and GPIO lines are muxed, so only one device at a time */
@@ -2361,7 +2373,7 @@ static int xgbe_phy_link_status(struct xgbe_prv_data *pdata, int *an_restart)
 		return 1;
 
 	/* No link, attempt a receiver reset cycle */
-	if (phy_data->rrc_count++) {
+	if (phy_data->rrc_count++ > XGBE_RRC_FREQUENCY) {
 		phy_data->rrc_count = 0;
 		xgbe_phy_rrc(pdata);
 	}
@@ -2669,6 +2681,103 @@ static bool xgbe_phy_port_enabled(struct xgbe_prv_data *pdata)
 	return true;
 }
 
+static void xgbe_phy_cdr_track(struct xgbe_prv_data *pdata)
+{
+	struct xgbe_phy_data *phy_data = pdata->phy_data;
+
+	if (!pdata->debugfs_an_cdr_workaround)
+		return;
+
+	if (!phy_data->phy_cdr_notrack)
+		return;
+
+	usleep_range(phy_data->phy_cdr_delay,
+		     phy_data->phy_cdr_delay + 500);
+
+	XMDIO_WRITE_BITS(pdata, MDIO_MMD_PMAPMD, MDIO_VEND2_PMA_CDR_CONTROL,
+			 XGBE_PMA_CDR_TRACK_EN_MASK,
+			 XGBE_PMA_CDR_TRACK_EN_ON);
+
+	phy_data->phy_cdr_notrack = 0;
+}
+
+static void xgbe_phy_cdr_notrack(struct xgbe_prv_data *pdata)
+{
+	struct xgbe_phy_data *phy_data = pdata->phy_data;
+
+	if (!pdata->debugfs_an_cdr_workaround)
+		return;
+
+	if (phy_data->phy_cdr_notrack)
+		return;
+
+	XMDIO_WRITE_BITS(pdata, MDIO_MMD_PMAPMD, MDIO_VEND2_PMA_CDR_CONTROL,
+			 XGBE_PMA_CDR_TRACK_EN_MASK,
+			 XGBE_PMA_CDR_TRACK_EN_OFF);
+
+	xgbe_phy_rrc(pdata);
+
+	phy_data->phy_cdr_notrack = 1;
+}
+
+static void xgbe_phy_kr_training_post(struct xgbe_prv_data *pdata)
+{
+	if (!pdata->debugfs_an_cdr_track_early)
+		xgbe_phy_cdr_track(pdata);
+}
+
+static void xgbe_phy_kr_training_pre(struct xgbe_prv_data *pdata)
+{
+	if (pdata->debugfs_an_cdr_track_early)
+		xgbe_phy_cdr_track(pdata);
+}
+
+static void xgbe_phy_an_post(struct xgbe_prv_data *pdata)
+{
+	struct xgbe_phy_data *phy_data = pdata->phy_data;
+
+	switch (pdata->an_mode) {
+	case XGBE_AN_MODE_CL73:
+	case XGBE_AN_MODE_CL73_REDRV:
+		if (phy_data->cur_mode != XGBE_MODE_KR)
+			break;
+
+		xgbe_phy_cdr_track(pdata);
+
+		switch (pdata->an_result) {
+		case XGBE_AN_READY:
+		case XGBE_AN_COMPLETE:
+			break;
+		default:
+			if (phy_data->phy_cdr_delay < XGBE_CDR_DELAY_MAX)
+				phy_data->phy_cdr_delay += XGBE_CDR_DELAY_INC;
+			else
+				phy_data->phy_cdr_delay = XGBE_CDR_DELAY_INIT;
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+}
+
+static void xgbe_phy_an_pre(struct xgbe_prv_data *pdata)
+{
+	struct xgbe_phy_data *phy_data = pdata->phy_data;
+
+	switch (pdata->an_mode) {
+	case XGBE_AN_MODE_CL73:
+	case XGBE_AN_MODE_CL73_REDRV:
+		if (phy_data->cur_mode != XGBE_MODE_KR)
+			break;
+
+		xgbe_phy_cdr_notrack(pdata);
+		break;
+	default:
+		break;
+	}
+}
+
 static void xgbe_phy_stop(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
@@ -2680,6 +2789,9 @@ static void xgbe_phy_stop(struct xgbe_prv_data *pdata)
 	xgbe_phy_sfp_reset(phy_data);
 	xgbe_phy_sfp_mod_absent(pdata);
 
+	/* Reset CDR support */
+	xgbe_phy_cdr_track(pdata);
+
 	/* Power off the PHY */
 	xgbe_phy_power_off(pdata);
 
@@ -2712,6 +2824,9 @@ static int xgbe_phy_start(struct xgbe_prv_data *pdata)
 	/* Start in highest supported mode */
 	xgbe_phy_set_mode(pdata, phy_data->start_mode);
 
+	/* Reset CDR support */
+	xgbe_phy_cdr_track(pdata);
+
 	/* After starting the I2C controller, we can check for an SFP */
 	switch (phy_data->port_mode) {
 	case XGBE_PORT_MODE_SFP:
@@ -3019,6 +3134,8 @@ static int xgbe_phy_init(struct xgbe_prv_data *pdata)
 		}
 	}
 
+	phy_data->phy_cdr_delay = XGBE_CDR_DELAY_INIT;
+
 	/* Register for driving external PHYs */
 	mii = devm_mdiobus_alloc(pdata->dev);
 	if (!mii) {
@@ -3071,4 +3188,10 @@ void xgbe_init_function_ptrs_phy_v2(struct xgbe_phy_if *phy_if)
 	phy_impl->an_advertising	= xgbe_phy_an_advertising;
 
 	phy_impl->an_outcome		= xgbe_phy_an_outcome;
+
+	phy_impl->an_pre		= xgbe_phy_an_pre;
+	phy_impl->an_post		= xgbe_phy_an_post;
+
+	phy_impl->kr_training_pre	= xgbe_phy_kr_training_pre;
+	phy_impl->kr_training_post	= xgbe_phy_kr_training_post;
 }
diff --git a/drivers/net/ethernet/amd/xgbe/xgbe.h b/drivers/net/ethernet/amd/xgbe/xgbe.h
index fa0b51e..95d4b56 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe.h
+++ b/drivers/net/ethernet/amd/xgbe/xgbe.h
@@ -994,6 +994,7 @@ struct xgbe_version_data {
 	unsigned int irq_reissue_support;
 	unsigned int tx_desc_prefetch;
 	unsigned int rx_desc_prefetch;
+	unsigned int an_cdr_workaround;
 };
 
 struct xgbe_vxlan_data {
@@ -1262,6 +1263,9 @@ struct xgbe_prv_data {
 	unsigned int debugfs_xprop_reg;
 
 	unsigned int debugfs_xi2c_reg;
+
+	bool debugfs_an_cdr_workaround;
+	bool debugfs_an_cdr_track_early;
 };
 
 /* Function prototypes*/

^ permalink raw reply related

* [PATCH net 3/3] amd-xgbe: Only use the SFP supported transceiver signals
From: Tom Lendacky @ 2018-04-23 16:43 UTC (permalink / raw)
  To: netdev; +Cc: David Miller
In-Reply-To: <20180423164258.18740.98574.stgit@tlendack-t1.amdoffice.net>

The SFP eeprom indicates the transceiver signals (Rx LOS, Tx Fault, etc.)
that it supports.  Update the driver to include checking the eeprom data
when deciding whether to use a transceiver signal.

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |   71 +++++++++++++++++++++------
 1 file changed, 54 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index b48efc0..aac8843 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -253,6 +253,10 @@ enum xgbe_sfp_speed {
 #define XGBE_SFP_BASE_VENDOR_SN			4
 #define XGBE_SFP_BASE_VENDOR_SN_LEN		16
 
+#define XGBE_SFP_EXTD_OPT1			1
+#define XGBE_SFP_EXTD_OPT1_RX_LOS		BIT(1)
+#define XGBE_SFP_EXTD_OPT1_TX_FAULT		BIT(3)
+
 #define XGBE_SFP_EXTD_DIAG			28
 #define XGBE_SFP_EXTD_DIAG_ADDR_CHANGE		BIT(2)
 
@@ -332,6 +336,7 @@ struct xgbe_phy_data {
 
 	unsigned int sfp_gpio_address;
 	unsigned int sfp_gpio_mask;
+	unsigned int sfp_gpio_inputs;
 	unsigned int sfp_gpio_rx_los;
 	unsigned int sfp_gpio_tx_fault;
 	unsigned int sfp_gpio_mod_absent;
@@ -986,6 +991,49 @@ static void xgbe_phy_sfp_external_phy(struct xgbe_prv_data *pdata)
 	phy_data->sfp_phy_avail = 1;
 }
 
+static bool xgbe_phy_check_sfp_rx_los(struct xgbe_phy_data *phy_data)
+{
+	u8 *sfp_extd = phy_data->sfp_eeprom.extd;
+
+	if (!(sfp_extd[XGBE_SFP_EXTD_OPT1] & XGBE_SFP_EXTD_OPT1_RX_LOS))
+		return false;
+
+	if (phy_data->sfp_gpio_mask & XGBE_GPIO_NO_RX_LOS)
+		return false;
+
+	if (phy_data->sfp_gpio_inputs & (1 << phy_data->sfp_gpio_rx_los))
+		return true;
+
+	return false;
+}
+
+static bool xgbe_phy_check_sfp_tx_fault(struct xgbe_phy_data *phy_data)
+{
+	u8 *sfp_extd = phy_data->sfp_eeprom.extd;
+
+	if (!(sfp_extd[XGBE_SFP_EXTD_OPT1] & XGBE_SFP_EXTD_OPT1_TX_FAULT))
+		return false;
+
+	if (phy_data->sfp_gpio_mask & XGBE_GPIO_NO_TX_FAULT)
+		return false;
+
+	if (phy_data->sfp_gpio_inputs & (1 << phy_data->sfp_gpio_tx_fault))
+		return true;
+
+	return false;
+}
+
+static bool xgbe_phy_check_sfp_mod_absent(struct xgbe_phy_data *phy_data)
+{
+	if (phy_data->sfp_gpio_mask & XGBE_GPIO_NO_MOD_ABSENT)
+		return false;
+
+	if (phy_data->sfp_gpio_inputs & (1 << phy_data->sfp_gpio_mod_absent))
+		return true;
+
+	return false;
+}
+
 static bool xgbe_phy_belfuse_parse_quirks(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
@@ -1031,6 +1079,10 @@ static void xgbe_phy_sfp_parse_eeprom(struct xgbe_prv_data *pdata)
 	if (sfp_base[XGBE_SFP_BASE_EXT_ID] != XGBE_SFP_EXT_ID_SFP)
 		return;
 
+	/* Update transceiver signals (eeprom extd/options) */
+	phy_data->sfp_tx_fault = xgbe_phy_check_sfp_tx_fault(phy_data);
+	phy_data->sfp_rx_los = xgbe_phy_check_sfp_rx_los(phy_data);
+
 	if (xgbe_phy_sfp_parse_quirks(pdata))
 		return;
 
@@ -1196,7 +1248,6 @@ static int xgbe_phy_sfp_read_eeprom(struct xgbe_prv_data *pdata)
 static void xgbe_phy_sfp_signals(struct xgbe_prv_data *pdata)
 {
 	struct xgbe_phy_data *phy_data = pdata->phy_data;
-	unsigned int gpio_input;
 	u8 gpio_reg, gpio_ports[2];
 	int ret;
 
@@ -1211,23 +1262,9 @@ static void xgbe_phy_sfp_signals(struct xgbe_prv_data *pdata)
 		return;
 	}
 
-	gpio_input = (gpio_ports[1] << 8) | gpio_ports[0];
-
-	if (phy_data->sfp_gpio_mask & XGBE_GPIO_NO_MOD_ABSENT) {
-		/* No GPIO, just assume the module is present for now */
-		phy_data->sfp_mod_absent = 0;
-	} else {
-		if (!(gpio_input & (1 << phy_data->sfp_gpio_mod_absent)))
-			phy_data->sfp_mod_absent = 0;
-	}
-
-	if (!(phy_data->sfp_gpio_mask & XGBE_GPIO_NO_RX_LOS) &&
-	    (gpio_input & (1 << phy_data->sfp_gpio_rx_los)))
-		phy_data->sfp_rx_los = 1;
+	phy_data->sfp_gpio_inputs = (gpio_ports[1] << 8) | gpio_ports[0];
 
-	if (!(phy_data->sfp_gpio_mask & XGBE_GPIO_NO_TX_FAULT) &&
-	    (gpio_input & (1 << phy_data->sfp_gpio_tx_fault)))
-		phy_data->sfp_tx_fault = 1;
+	phy_data->sfp_mod_absent = xgbe_phy_check_sfp_mod_absent(phy_data);
 }
 
 static void xgbe_phy_sfp_mod_absent(struct xgbe_prv_data *pdata)

^ permalink raw reply related

* Re: [PATCH 3/3] selinux: provide unix_stream_socketpair callback
From: Stephen Smalley @ 2018-04-23 16:48 UTC (permalink / raw)
  To: David Herrmann, linux-kernel
  Cc: James Morris, Paul Moore, teg, selinux, linux-security-module,
	Eric Paris, serge, davem, netdev
In-Reply-To: <20180423133015.5455-4-dh.herrmann@gmail.com>

On 04/23/2018 09:30 AM, David Herrmann wrote:
> Make sure to implement the new unix_stream_socketpair callback so the
> SO_PEERSEC call on socketpair(2)s will return correct information.
> 
> Signed-off-by: David Herrmann <dh.herrmann@gmail.com>

Acked-by: Stephen Smalley <sds@tycho.nsa.gov>

> ---
>  security/selinux/hooks.c | 14 ++++++++++++++
>  1 file changed, 14 insertions(+)
> 
> diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
> index 4cafe6a19167..828881d9a41d 100644
> --- a/security/selinux/hooks.c
> +++ b/security/selinux/hooks.c
> @@ -4905,6 +4905,18 @@ static int selinux_socket_unix_stream_connect(struct sock *sock,
>  	return 0;
>  }
>  
> +static int selinux_socket_unix_stream_socketpair(struct sock *socka,
> +						 struct sock *sockb)
> +{
> +	struct sk_security_struct *sksec_a = socka->sk_security;
> +	struct sk_security_struct *sksec_b = sockb->sk_security;
> +
> +	sksec_a->peer_sid = sksec_b->sid;
> +	sksec_b->peer_sid = sksec_a->sid;
> +
> +	return 0;
> +}
> +
>  static int selinux_socket_unix_may_send(struct socket *sock,
>  					struct socket *other)
>  {
> @@ -6995,6 +7007,8 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
>  	LSM_HOOK_INIT(inode_getsecctx, selinux_inode_getsecctx),
>  
>  	LSM_HOOK_INIT(unix_stream_connect, selinux_socket_unix_stream_connect),
> +	LSM_HOOK_INIT(unix_stream_socketpair,
> +			selinux_socket_unix_stream_socketpair),
>  	LSM_HOOK_INIT(unix_may_send, selinux_socket_unix_may_send),
>  
>  	LSM_HOOK_INIT(socket_create, selinux_socket_create),
> 

^ permalink raw reply

* KMSAN: uninit-value in _copy_to_iter (2)
From: syzbot @ 2018-04-23 16:56 UTC (permalink / raw)
  To: avagin, davem, dingtianhong, edumazet, elena.reshetova,
	linux-kernel, matthew, mingo, netdev, pabeni, syzkaller-bugs,
	viro, willemb

Hello,

syzbot hit the following crash on  
https://github.com/google/kmsan.git/master commit
d2d741e5d1898dfde1a75ea3d29a9a3e2edf0617 (Sun Apr 22 15:05:22 2018 +0000)
kmsan: add initialization for shmem pages
syzbot dashboard link:  
https://syzkaller.appspot.com/bug?extid=87cfa083e727a224754b

Unfortunately, I don't have any reproducer for this crash yet.
Raw console output:  
https://syzkaller.appspot.com/x/log.txt?id=6616554548494336
Kernel config: https://syzkaller.appspot.com/x/.config?id=328654897048964367
compiler: clang version 7.0.0 (trunk 329391)

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+87cfa083e727a224754b@syzkaller.appspotmail.com
It will help syzbot understand when the bug is fixed. See footer for  
details.
If you forward the report, please keep this part and the footer.

==================================================================
BUG: KMSAN: uninit-value in copyout lib/iov_iter.c:140 [inline]
BUG: KMSAN: uninit-value in _copy_to_iter+0x1bb3/0x28f0 lib/iov_iter.c:571
CPU: 0 PID: 7670 Comm: syz-executor7 Not tainted 4.16.0+ #86
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:17 [inline]
  dump_stack+0x185/0x1d0 lib/dump_stack.c:53
  kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
  kmsan_internal_check_memory+0x135/0x1e0 mm/kmsan/kmsan.c:1157
  kmsan_copy_to_user+0x69/0x160 mm/kmsan/kmsan.c:1199
  copyout lib/iov_iter.c:140 [inline]
  _copy_to_iter+0x1bb3/0x28f0 lib/iov_iter.c:571
  copy_to_iter include/linux/uio.h:106 [inline]
  skb_copy_datagram_iter+0x443/0xf70 net/core/datagram.c:431
  skb_copy_datagram_msg include/linux/skbuff.h:3264 [inline]
  netlink_recvmsg+0x6f1/0x1900 net/netlink/af_netlink.c:1958
  sock_recvmsg_nosec net/socket.c:803 [inline]
  sock_recvmsg+0x1d0/0x230 net/socket.c:810
  ___sys_recvmsg+0x3fb/0x810 net/socket.c:2205
  __sys_recvmmsg+0x54e/0xdb0 net/socket.c:2313
  SYSC_recvmmsg+0x29b/0x3e0 net/socket.c:2394
  SyS_recvmmsg+0x76/0xa0 net/socket.c:2378
  do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x455389
RSP: 002b:00007f0281d3dc68 EFLAGS: 00000246 ORIG_RAX: 000000000000012b
RAX: ffffffffffffffda RBX: 00007f0281d3e6d4 RCX: 0000000000455389
RDX: 0000000000000003 RSI: 0000000020001f80 RDI: 0000000000000014
RBP: 000000000072bea0 R08: 0000000020002040 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00000000ffffffff
R13: 000000000000049e R14: 00000000006f9f70 R15: 0000000000000000

Uninit was stored to memory at:
  kmsan_save_stack_with_flags mm/kmsan/kmsan.c:278 [inline]
  kmsan_save_stack mm/kmsan/kmsan.c:293 [inline]
  kmsan_internal_chain_origin+0x12b/0x210 mm/kmsan/kmsan.c:684
  kmsan_memcpy_origins+0x11d/0x170 mm/kmsan/kmsan.c:526
  __msan_memcpy+0x109/0x160 mm/kmsan/kmsan_instr.c:477
  __nla_put lib/nlattr.c:569 [inline]
  nla_put+0x276/0x340 lib/nlattr.c:627
  copy_to_user_policy_type net/xfrm/xfrm_user.c:1678 [inline]
  build_acquire net/xfrm/xfrm_user.c:2850 [inline]
  xfrm_send_acquire+0x1068/0x1690 net/xfrm/xfrm_user.c:2873
  km_query net/xfrm/xfrm_state.c:1953 [inline]
  xfrm_state_find+0x3ad8/0x4f40 net/xfrm/xfrm_state.c:1021
  xfrm_tmpl_resolve_one net/xfrm/xfrm_policy.c:1393 [inline]
  xfrm_tmpl_resolve net/xfrm/xfrm_policy.c:1437 [inline]
  xfrm_resolve_and_create_bundle+0xc31/0x5270 net/xfrm/xfrm_policy.c:1833
  xfrm_lookup+0x606/0x39d0 net/xfrm/xfrm_policy.c:2163
  xfrm_lookup_route+0xfa/0x360 net/xfrm/xfrm_policy.c:2283
  ip6_dst_lookup_flow+0x221/0x270 net/ipv6/ip6_output.c:1099
  ip6_datagram_dst_update+0x93a/0x1470 net/ipv6/datagram.c:91
  __ip6_datagram_connect+0x14f6/0x1a20 net/ipv6/datagram.c:257
  ip6_datagram_connect net/ipv6/datagram.c:280 [inline]
  ip6_datagram_connect_v6_only+0x104/0x180 net/ipv6/datagram.c:292
  inet_dgram_connect+0x2e8/0x4d0 net/ipv4/af_inet.c:542
  SYSC_connect+0x41a/0x510 net/socket.c:1639
  SyS_connect+0x54/0x80 net/socket.c:1620
  do_syscall_64+0x309/0x430 arch/x86/entry/common.c:287
  entry_SYSCALL_64_after_hwframe+0x3d/0xa2
Local variable description: ----upt.i.i@xfrm_send_acquire
Variable was created at:
  xfrm_send_acquire+0x73/0x1690 net/xfrm/xfrm_user.c:2864
  km_query net/xfrm/xfrm_state.c:1953 [inline]
  xfrm_state_find+0x3ad8/0x4f40 net/xfrm/xfrm_state.c:1021

Byte 200 of 207 is uninitialized
FAULT_INJECTION: forcing a failure.
name failslab, interval 1, probability 0, space 0, times 0
==================================================================
CPU: 1 PID: 7675 Comm: syz-executor3 Not tainted 4.16.0+ #86


---
This bug is generated by a dumb bot. It may contain errors.
See https://goo.gl/tpsmEJ for details.
Direct all questions to syzkaller@googlegroups.com.

syzbot will keep track of this bug report.
If you forgot to add the Reported-by tag, once the fix for this bug is  
merged
into any tree, please reply to this email with:
#syz fix: exact-commit-title
To mark this as a duplicate of another syzbot report, please reply with:
#syz dup: exact-subject-of-another-report
If it's a one-off invalid bug report, please reply with:
#syz invalid
Note: if the crash happens again, it will cause creation of a new bug  
report.
Note: all commands must start from beginning of the line in the email body.

^ permalink raw reply

* Re: Bluetooth/lock_sock: false positive "WARNING: possible recursive locking detected"
From: Marcel Holtmann @ 2018-04-23 17:01 UTC (permalink / raw)
  To: Jiri Slaby; +Cc: Johan Hedberg, BlueZ development, ML netdev
In-Reply-To: <695d5172-e7f8-9e44-b6ab-0ce1a9473f34@suse.cz>

Hi Jiri,

>> [ 2891.586061] ============================================
>> [ 2891.586063] WARNING: possible recursive locking detected
>> [ 2891.586065] 4.16.2-10.ge881e16-default #1 Not tainted
>> [ 2891.586067] --------------------------------------------
>> [ 2891.586068] kworker/u9:3/873 is trying to acquire lock:
>> [ 2891.586070]  (sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP){+.+.}, at: [<000000007b85e829>] bt_accept_enqueue+0x29/0x90 [bluetooth]
>> [ 2891.586086]
>>               but task is already holding lock:
>> [ 2891.586088]  (sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP){+.+.}, at: [<0000000042f0b4a5>] l2cap_sock_new_connection_cb+0x18/0xa0 [bluetooth]
>> [ 2891.586109]
>>               other info that might help us debug this:
>> [ 2891.586111]  Possible unsafe locking scenario:
>> 
>> [ 2891.586115]        CPU0
>> [ 2891.586116]        ----
>> [ 2891.586117]   lock(sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP);
>> [ 2891.586120]   lock(sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP);
>> [ 2891.586122]
>>                *** DEADLOCK ***
>> 
>> [ 2891.586125]  May be due to missing lock nesting notation
>> 
>> [ 2891.586127] 5 locks held by kworker/u9:3/873:
>> [ 2891.586128]  #0:  ((wq_completion)"%s"hdev->name#2){+.+.}, at: [<000000004aa1a273>] process_one_work+0x1e3/0x6a0
>> [ 2891.586135]  #1:  ((work_completion)(&hdev->rx_work)){+.+.}, at: [<000000004aa1a273>] process_one_work+0x1e3/0x6a0
>> [ 2891.586140]  #2:  (&conn->chan_lock){+.+.}, at: [<00000000fbad6c82>] l2cap_connect+0x88/0x540 [bluetooth]
>> [ 2891.586155]  #3:  (&chan->lock/2){+.+.}, at: [<000000007c38e27e>] l2cap_connect+0xa0/0x540 [bluetooth]
>> [ 2891.586170]  #4:  (sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP){+.+.}, at: [<0000000042f0b4a5>] l2cap_sock_new_connection_cb+0x18/0xa0 [bluetooth]
>> [ 2891.586183]
>>               stack backtrace:
>> [ 2891.586187] CPU: 2 PID: 873 Comm: kworker/u9:3 Not tainted 4.16.2-10.ge881e16-default #1 openSUSE Tumbleweed (unreleased)
>> [ 2891.586189] Hardware name: Dell Inc. Latitude 7280/0KK5D1, BIOS 1.9.3 03/09/2018
>> [ 2891.586200] Workqueue: hci0 hci_rx_work [bluetooth]
>> [ 2891.586202] Call Trace:
>> [ 2891.586207]  dump_stack+0x85/0xc5
>> [ 2891.586211]  __lock_acquire+0x6b4/0x1370
>> [ 2891.586221]  lock_acquire+0x9f/0x210
>> [ 2891.586237]  lock_sock_nested+0x5a/0x80
>> [ 2891.586256]  bt_accept_enqueue+0x29/0x90 [bluetooth]
>> [ 2891.586268]  l2cap_sock_new_connection_cb+0x5d/0xa0 [bluetooth]
>> [ 2891.586280]  l2cap_connect+0x126/0x540 [bluetooth]
>> [ 2891.586315]  l2cap_sig_channel+0x443/0x13b0 [bluetooth]
>> [ 2891.586330]  l2cap_recv_frame+0x1a4/0x300 [bluetooth]
>> [ 2891.586341]  hci_rx_work+0x1c8/0x5c0 [bluetooth]
>> [ 2891.586345]  process_one_work+0x269/0x6a0
>> [ 2891.586350]  worker_thread+0x2b/0x3d0
>> [ 2891.586356]  kthread+0x113/0x130
>> [ 2891.586363]  ret_from_fork+0x24/0x50
>> [ 4954.622809] e1000e: eth0 NIC Link is Down
>> [ 4955.299532] PM: suspend entry (deep)
>> [ 4955.299538] PM: Syncing filesystems ... done.
> 
> This is:
>  lock_sock(sk);       in bt_accept_enqueue
> nested in
>  lock_sock(parent);   in l2cap_sock_new_connection_cb
> 
> So this looks like a false positive to me. So I believe this is a fix:
> 
> --- a/net/bluetooth/l2cap_sock.c
> +++ b/net/bluetooth/l2cap_sock.c
> @@ -1232,7 +1232,7 @@ static struct l2cap_chan
> *l2cap_sock_new_connection_cb(struct l2cap_chan *chan)
> {
>        struct sock *sk, *parent = chan->data;
> 
> -       lock_sock(parent);
> +       lock_sock_nested(parent, L2CAP_NESTING_PARENT);
> 
>        /* Check for backlog size */
>        if (sk_acceptq_is_full(parent)) {
> 

can you send a patch for it?

Regards

Marcel

^ permalink raw reply

* Re: [PATCH v7 net-next 4/4] netvsc: refactor notifier/event handling code to use the failover framework
From: Stephen Hemminger @ 2018-04-23 17:04 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Sridhar Samudrala, mst, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh
In-Reply-To: <20180420160058.GB2150@nanopsycho.orion>

On Fri, 20 Apr 2018 18:00:58 +0200
Jiri Pirko <jiri@resnulli.us> wrote:

> Fri, Apr 20, 2018 at 05:28:02PM CEST, stephen@networkplumber.org wrote:
> >On Thu, 19 Apr 2018 18:42:04 -0700
> >Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:
> >  
> >> Use the registration/notification framework supported by the generic
> >> failover infrastructure.
> >> 
> >> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>  
> >
> >Do what you want to other devices but leave netvsc alone.
> >Adding these failover ops does not reduce the code size, and really is
> >no benefit.  The netvsc device driver needs to be backported to several
> >other distributions and doing this makes that harder.  
> 
> We should not care about the backport burden when we are trying to make
> things right. And things are not right. The current netvsc approach is
> just plain wrong shortcut. It should have been done in a generic way
> from the very beginning. We are just trying to fix this situation.
> 
> Moreover, I believe that part of the fix is to convert netvsc to 3
> netdev solution too. 2 netdev model is wrong.
> 
> 
> >
> >I will NAK patches to change to common code for netvsc especially the
> >three device model.  MS worked hard with distro vendors to support transparent
> >mode, ans we really can't have a new model; or do backport.
> >
> >Plus, DPDK is now dependent on existing model.  
> 
> Sorry, but nobody here cares about dpdk or other similar oddities.

The network device model is a userspace API, and DPDK is a userspace application.
You can't go breaking userspace even if you don't like the application.

^ permalink raw reply

* Re: [PATCH 0/3] Introduce LSM-hook for socketpair(2)
From: Casey Schaufler @ 2018-04-23 17:04 UTC (permalink / raw)
  To: David Herrmann, linux-kernel
  Cc: James Morris, Paul Moore, teg, Stephen Smalley, selinux,
	linux-security-module, Eric Paris, serge, davem, netdev,
	Casey Schaufler
In-Reply-To: <20180423133015.5455-1-dh.herrmann@gmail.com>

On 4/23/2018 6:30 AM, David Herrmann wrote:
> Hi
>
> This series adds a new LSM hook for the socketpair(2) syscall. The idea
> is to allow SO_PEERSEC to be called on AF_UNIX sockets created via
> socketpair(2), and return the same information as if you emulated
> socketpair(2) via a temporary listener socket. Right now SO_PEERSEC
> will return the unlabeled credentials for a socketpair, rather than the
> actual credentials of the creating process.
>
> ...
>
> This series only adds SELinux backends, since that is what we need for
> RHEL. I will gladly extend the other LSMs if needed.

I would be very happy to see a proposed patch for Smack. It shouldn't
be much different from the SELinux version, with the exception that it
will use pointers to smk_known structures instead of secids. It would be
a big help, as someone just threw a whole new species of scorpion into
this pit.

>
> Thanks
> David
>
> [1] https://github.com/bus1/dbus-broker/blob/master/src/util/test-peersec.c
> [2] https://www.spinics.net/lists/selinux/msg22674.html
>
> David Herrmann (3):
>   security: add hook for socketpair(AF_UNIX, ...)
>   net/unix: hook unix_socketpair() into LSM
>   selinux: provide unix_stream_socketpair callback
>
>  include/linux/lsm_hooks.h |  8 ++++++++
>  include/linux/security.h  |  7 +++++++
>  net/unix/af_unix.c        |  5 +++++
>  security/security.c       |  6 ++++++
>  security/selinux/hooks.c  | 14 ++++++++++++++
>  5 files changed, 40 insertions(+)
>

^ permalink raw reply

* [PATCH bpf] bpf: disable and restore preemption in __BPF_PROG_RUN_ARRAY
From: Roman Gushchin @ 2018-04-23 17:09 UTC (permalink / raw)
  To: netdev; +Cc: kernel-team, Roman Gushchin, Alexei Starovoitov, Daniel Borkmann

Running bpf programs requires disabled preemption,
however at least some* of the BPF_PROG_RUN_ARRAY users
do not follow this rule.

To fix this bug, and also to make it not happen in the future,
let's add explicit preemption disabling/re-enabling
to the __BPF_PROG_RUN_ARRAY code.

* for example:
 [   17.624472] RIP: 0010:__cgroup_bpf_run_filter_sk+0x1c4/0x1d0
 ...
 [   17.640890]  inet6_create+0x3eb/0x520
 [   17.641405]  __sock_create+0x242/0x340
 [   17.641939]  __sys_socket+0x57/0xe0
 [   17.642370]  ? trace_hardirqs_off_thunk+0x1a/0x1c
 [   17.642944]  SyS_socket+0xa/0x10
 [   17.643357]  do_syscall_64+0x79/0x220
 [   17.643879]  entry_SYSCALL_64_after_hwframe+0x42/0xb7

Signed-off-by: Roman Gushchin <guro@fb.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
---
 include/linux/bpf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 486e65e3db26..dc586cc64bc2 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -351,6 +351,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 		struct bpf_prog **_prog, *__prog;	\
 		struct bpf_prog_array *_array;		\
 		u32 _ret = 1;				\
+		preempt_disable();			\
 		rcu_read_lock();			\
 		_array = rcu_dereference(array);	\
 		if (unlikely(check_non_null && !_array))\
@@ -362,6 +363,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
 		}					\
 _out:							\
 		rcu_read_unlock();			\
+		preempt_enable_no_resched();		\
 		_ret;					\
 	 })
 
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH v7 net-next 2/4] net: Introduce generic failover module
From: Samudrala, Sridhar @ 2018-04-23 17:21 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: alexander.h.duyck, virtio-dev, jiri, kubakici, netdev,
	virtualization, loseweigh, davem
In-Reply-To: <20180422200259-mutt-send-email-mst@kernel.org>

On 4/22/2018 10:06 AM, Michael S. Tsirkin wrote:
> On Thu, Apr 19, 2018 at 06:42:02PM -0700, Sridhar Samudrala wrote:
>> +#if IS_ENABLED(CONFIG_NET_FAILOVER)
>> +
>> +int failover_create(struct net_device *standby_dev,
>> +		    struct failover **pfailover);
> Should we rename all these structs net_failover?
> It's possible to extend the concept to storage I think.

We could, the only downside is that the names become longer. i think we need
to change the filenames and the function names also to be consistent.


>
>> +void failover_destroy(struct failover *failover);
>> +
>> +int failover_register(struct net_device *standby_dev, struct failover_ops *ops,
>> +		      struct failover **pfailover);
>> +void failover_unregister(struct failover *failover);
>> +
>> +int failover_slave_unregister(struct net_device *slave_dev);
>> +
>> +#else
>> +
>> +static inline
>> +int failover_create(struct net_device *standby_dev,
>> +		    struct failover **pfailover);
>> +{
>> +	return 0;
> Does this make callers do something sane?
> Shouldn't these return an error?

Yes. i think i should return -EOPNOTSUPP here, so that we fail
when CONFIG_NET_FAILOVER is not enabled and the virtio-net driver is trying
to create a failover device.


>> +}
>> +
>> +static inline
>> +void failover_destroy(struct failover *failover)
>> +{
>> +}
>> +
>> +static inline
>> +int failover_register(struct net_device *standby_dev, struct failover_ops *ops,
>> +		      struct pfailover **pfailover);
>> +{
>> +	return 0;
>> +}
> struct pfailover seems like a typo.

yes. will also change the return to -EOPNOTSUPP

>
>> +
>> +static inline
>> +void failover_unregister(struct failover *failover)
>> +{
>> +}
>> +
>> +static inline
>> +int failover_slave_unregister(struct net_device *slave_dev)
>> +{
>> +	return 0;
>> +}
> Does anyone test return value of unregister?
> should this be void?

yes. can be changed to void.


>
>> +
>> +#endif
>> +
>> +#endif /* _NET_FAILOVER_H */

^ permalink raw reply

* Re: [PATCH v7 net-next 4/4] netvsc: refactor notifier/event handling code to use the failover framework
From: Michael S. Tsirkin @ 2018-04-23 17:24 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jiri Pirko, Sridhar Samudrala, davem, netdev, virtualization,
	virtio-dev, jesse.brandeburg, alexander.h.duyck, kubakici,
	jasowang, loseweigh
In-Reply-To: <20180423100406.71b95f74@xeon-e3>

On Mon, Apr 23, 2018 at 10:04:06AM -0700, Stephen Hemminger wrote:
> > >
> > >I will NAK patches to change to common code for netvsc especially the
> > >three device model.  MS worked hard with distro vendors to support transparent
> > >mode, ans we really can't have a new model; or do backport.
> > >
> > >Plus, DPDK is now dependent on existing model.  
> > 
> > Sorry, but nobody here cares about dpdk or other similar oddities.
> 
> The network device model is a userspace API, and DPDK is a userspace application.

It is userspace but are you sure dpdk is actually poking at netdevs?
AFAIK it's normally banging device registers directly.

> You can't go breaking userspace even if you don't like the application.

Could you please explain how is the proposed patchset breaking
userspace? Ignoring DPDK for now, I don't think it changes the userspace
API at all.

-- 
MST

^ permalink raw reply

* Re: [PATCH v7 net-next 4/4] netvsc: refactor notifier/event handling code to use the failover framework
From: Jiri Pirko @ 2018-04-23 17:25 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Sridhar Samudrala, mst, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh
In-Reply-To: <20180423100406.71b95f74@xeon-e3>

Mon, Apr 23, 2018 at 07:04:06PM CEST, stephen@networkplumber.org wrote:
>On Fri, 20 Apr 2018 18:00:58 +0200
>Jiri Pirko <jiri@resnulli.us> wrote:
>
>> Fri, Apr 20, 2018 at 05:28:02PM CEST, stephen@networkplumber.org wrote:
>> >On Thu, 19 Apr 2018 18:42:04 -0700
>> >Sridhar Samudrala <sridhar.samudrala@intel.com> wrote:
>> >  
>> >> Use the registration/notification framework supported by the generic
>> >> failover infrastructure.
>> >> 
>> >> Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>  
>> >
>> >Do what you want to other devices but leave netvsc alone.
>> >Adding these failover ops does not reduce the code size, and really is
>> >no benefit.  The netvsc device driver needs to be backported to several
>> >other distributions and doing this makes that harder.  
>> 
>> We should not care about the backport burden when we are trying to make
>> things right. And things are not right. The current netvsc approach is
>> just plain wrong shortcut. It should have been done in a generic way
>> from the very beginning. We are just trying to fix this situation.
>> 
>> Moreover, I believe that part of the fix is to convert netvsc to 3
>> netdev solution too. 2 netdev model is wrong.
>> 
>> 
>> >
>> >I will NAK patches to change to common code for netvsc especially the
>> >three device model.  MS worked hard with distro vendors to support transparent
>> >mode, ans we really can't have a new model; or do backport.
>> >
>> >Plus, DPDK is now dependent on existing model.  
>> 
>> Sorry, but nobody here cares about dpdk or other similar oddities.
>
>The network device model is a userspace API, and DPDK is a userspace application.
>You can't go breaking userspace even if you don't like the application.

I don't understand how you can break anything by exposing
just-another-netdevice. If you break it, it is already broken...

And how you can break anything in userspace by doing refactoring inside
the kernel is puzzling me even more...

^ permalink raw reply

* Re: [nf-next] netfilter: extend SRH match to support matching previous, next and last SID
From: Pablo Neira Ayuso @ 2018-04-23 17:30 UTC (permalink / raw)
  To: Ahmed Abdelsalam
  Cc: fw, davem, dav.lebrun, linux-kernel, netfilter-devel, coreteam,
	netdev
In-Reply-To: <1524480503-1883-2-git-send-email-amsalam20@gmail.com>

On Mon, Apr 23, 2018 at 05:48:22AM -0500, Ahmed Abdelsalam wrote:
> IPv6 Segment Routing Header (SRH) contains a list of SIDs to be crossed by
> SR encapsulated packet. Each SID is encoded as an IPv6 prefix.
> 
> When a Firewall receives an SR encapsulated packet, it should be able to
> identify which node previously processed the packet (previous SID), which
> node is going to process the packet next (next SID), and which node is the
> last to process the packet (last SID) which represent the final destination
> of the packet in case of inline SR mode.
> 
> An example use-case of using these features could be SID list that includes
> two firewalls. When the second firewall receives a packet, it can check
> whether the packet has been processed by the first firewall or not. Based on
> that check, it decides to apply all rules, apply just subset of the rules,
> or totally skip all rules and forward the packet to the next SID.
> 
> This patch extends SRH match to support matching previous SID, next SID, and
> last SID.
> 
> Signed-off-by: Ahmed Abdelsalam <amsalam20@gmail.com>
> ---
>  include/uapi/linux/netfilter_ipv6/ip6t_srh.h | 22 +++++++++++++--
>  net/ipv6/netfilter/ip6t_srh.c                | 41 +++++++++++++++++++++++++++-
>  2 files changed, 60 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
> index f3cc0ef..9808382 100644
> --- a/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
> +++ b/include/uapi/linux/netfilter_ipv6/ip6t_srh.h
> @@ -17,7 +17,10 @@
>  #define IP6T_SRH_LAST_GT        0x0100
>  #define IP6T_SRH_LAST_LT        0x0200
>  #define IP6T_SRH_TAG            0x0400
> -#define IP6T_SRH_MASK           0x07FF
> +#define IP6T_SRH_PSID           0x0800
> +#define IP6T_SRH_NSID           0x1000
> +#define IP6T_SRH_LSID           0x2000
> +#define IP6T_SRH_MASK           0x3FFF
>  
>  /* Values for "mt_invflags" field in struct ip6t_srh */
>  #define IP6T_SRH_INV_NEXTHDR    0x0001
> @@ -31,7 +34,10 @@
>  #define IP6T_SRH_INV_LAST_GT    0x0100
>  #define IP6T_SRH_INV_LAST_LT    0x0200
>  #define IP6T_SRH_INV_TAG        0x0400
> -#define IP6T_SRH_INV_MASK       0x07FF
> +#define IP6T_SRH_INV_PSID       0x0800
> +#define IP6T_SRH_INV_NSID       0x1000
> +#define IP6T_SRH_INV_LSID       0x2000
> +#define IP6T_SRH_INV_MASK       0x3FFF
>  
>  /**
>   *      struct ip6t_srh - SRH match options
> @@ -40,6 +46,12 @@
>   *      @ segs_left: Segments left field of SRH
>   *      @ last_entry: Last entry field of SRH
>   *      @ tag: Tag field of SRH
> + *      @ psid_addr: Address of previous SID in SRH SID list
> + *      @ nsid_addr: Address of NEXT SID in SRH SID list
> + *      @ lsid_addr: Address of LAST SID in SRH SID list
> + *      @ psid_msk: Mask of previous SID in SRH SID list
> + *      @ nsid_msk: Mask of next SID in SRH SID list
> + *      @ lsid_msk: MAsk of last SID in SRH SID list
>   *      @ mt_flags: match options
>   *      @ mt_invflags: Invert the sense of match options
>   */
> @@ -50,6 +62,12 @@ struct ip6t_srh {
>  	__u8                    segs_left;
>  	__u8                    last_entry;
>  	__u16                   tag;
> +	struct in6_addr		psid_addr;
> +	struct in6_addr		nsid_addr;
> +	struct in6_addr		lsid_addr;
> +	struct in6_addr		psid_msk;
> +	struct in6_addr		nsid_msk;
> +	struct in6_addr		lsid_msk;

This is changing something exposed through UAPI, so you will need a
new revision for this.

>  	__u16                   mt_flags;
>  	__u16                   mt_invflags;
>  };
> diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c
> index 33719d5..2b5cc73 100644
> --- a/net/ipv6/netfilter/ip6t_srh.c
> +++ b/net/ipv6/netfilter/ip6t_srh.c
> @@ -30,7 +30,9 @@ static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
>  	const struct ip6t_srh *srhinfo = par->matchinfo;
>  	struct ipv6_sr_hdr *srh;
>  	struct ipv6_sr_hdr _srh;
> -	int hdrlen, srhoff = 0;
> +	int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0;
> +	struct in6_addr *psid, *nsid, *lsid;
> +	struct in6_addr _psid, _nsid, _lsid;

Could you rearrange variable definitions? ie. longest line first, eg.

	int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0;
  	const struct ip6t_srh *srhinfo = par->matchinfo;
	struct in6_addr *psid, *nsid, *lsid;
  	struct ipv6_sr_hdr *srh;
  	struct ipv6_sr_hdr _srh;

Thanks.

^ permalink raw reply

* Re: [PATCH v7 net-next 4/4] netvsc: refactor notifier/event handling code to use the failover framework
From: Stephen Hemminger @ 2018-04-23 17:44 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jiri Pirko, Sridhar Samudrala, davem, netdev, virtualization,
	virtio-dev, jesse.brandeburg, alexander.h.duyck, kubakici,
	jasowang, loseweigh
In-Reply-To: <20180423202204-mutt-send-email-mst@kernel.org>

On Mon, 23 Apr 2018 20:24:56 +0300
"Michael S. Tsirkin" <mst@redhat.com> wrote:

> On Mon, Apr 23, 2018 at 10:04:06AM -0700, Stephen Hemminger wrote:
> > > >
> > > >I will NAK patches to change to common code for netvsc especially the
> > > >three device model.  MS worked hard with distro vendors to support transparent
> > > >mode, ans we really can't have a new model; or do backport.
> > > >
> > > >Plus, DPDK is now dependent on existing model.    
> > > 
> > > Sorry, but nobody here cares about dpdk or other similar oddities.  
> > 
> > The network device model is a userspace API, and DPDK is a userspace application.  
> 
> It is userspace but are you sure dpdk is actually poking at netdevs?
> AFAIK it's normally banging device registers directly.
> 
> > You can't go breaking userspace even if you don't like the application.  
> 
> Could you please explain how is the proposed patchset breaking
> userspace? Ignoring DPDK for now, I don't think it changes the userspace
> API at all.
> 

The DPDK has a device driver vdev_netvsc which scans the Linux network devices
to look for Linux netvsc device and the paired VF device and setup the
DPDK environment.  This setup creates a DPDK failsafe (bondingish) instance
and sets up TAP support over the Linux netvsc device as well as the Mellanox
VF device.

So it depends on existing 2 device model. You can't go to a 3 device model
or start hiding devices from userspace.

Also, I am working on associating netvsc and VF device based on serial number
rather than MAC address. The serial number is how Windows works now, and it makes
sense for Linux and Windows to use the same mechanism if possible.

^ permalink raw reply

* Re: [PATCH net-next 0/6] net: Extend availability of PHY statistics
From: Florian Fainelli @ 2018-04-23 17:46 UTC (permalink / raw)
  To: netdev, David S. Miller; +Cc: Andrew Lunn, Vivien Didelot, cphealy
In-Reply-To: <1524336953-26108-1-git-send-email-f.fainelli@gmail.com>

On 04/21/2018 11:55 AM, Florian Fainelli wrote:
> Hi all,
> 
> This patch series adds support for retrieving PHY statistics with DSA switches
> when the CPU port uses a PHY to PHY connection (as opposed to MAC to MAC).
> To get there a number of things are done:
> 
> - first we move the code dealing with PHY statistics outside of net/core/ethtool.c
>   and create helper functions since the same code will be reused
> - then we allow network device drivers to provide an ethtool_get_phy_stats callback
>   when the standard PHY library helpers are not suitable
> - we update the DSA functions dealing with ethtool operations to get passed a
>   stringset instead of assuming ETH_SS_STATS like they currently do
> - then we provide a set of standard helpers within DSA as a framework and add
>   the plumbing to allow retrieving the PHY statistics of the CPU port(s)
> - finally plug support for retrieving such PHY statistics with the b53 driver

David, there will be a v2 coming not just to fix the kbuild reported
issues, but also another one reported privately, thank you.

> 
> Florian Fainelli (6):
>   net: Move PHY statistics code into PHY library helpers
>   net: Allow network devices to have PHY statistics
>   net: dsa: Pass stringset to ethtool operations
>   net: dsa: Add helper function to obtain PHY device of a given port
>   net: dsa: Allow providing PHY statistics from CPU port
>   net: dsa: b53: Add support for reading PHY statistics
> 
>  drivers/net/dsa/b53/b53_common.c       | 66 ++++++++++++++++++++++---
>  drivers/net/dsa/b53/b53_priv.h         |  6 ++-
>  drivers/net/dsa/bcm_sf2.c              |  1 +
>  drivers/net/dsa/dsa_loop.c             | 11 ++++-
>  drivers/net/dsa/lan9303-core.c         | 11 ++++-
>  drivers/net/dsa/microchip/ksz_common.c | 11 ++++-
>  drivers/net/dsa/mt7530.c               | 11 ++++-
>  drivers/net/dsa/mv88e6xxx/chip.c       | 10 +++-
>  drivers/net/dsa/qca8k.c                | 10 +++-
>  drivers/net/phy/phy.c                  | 48 ++++++++++++++++++
>  include/linux/ethtool.h                |  5 ++
>  include/linux/phy.h                    |  4 ++
>  include/net/dsa.h                      | 12 ++++-
>  net/core/ethtool.c                     | 61 ++++++++---------------
>  net/dsa/master.c                       | 41 +++++++++++++---
>  net/dsa/port.c                         | 90 +++++++++++++++++++++++++++++-----
>  net/dsa/slave.c                        |  5 +-
>  17 files changed, 320 insertions(+), 83 deletions(-)
> 


-- 
Florian

^ permalink raw reply

* Re: [PATCH 0/3] Introduce LSM-hook for socketpair(2)
From: Serge E. Hallyn @ 2018-04-23 17:52 UTC (permalink / raw)
  To: David Herrmann
  Cc: linux-kernel, James Morris, Paul Moore, teg, Stephen Smalley,
	selinux, linux-security-module, Eric Paris, serge, davem, netdev
In-Reply-To: <20180423133015.5455-1-dh.herrmann@gmail.com>

Quoting David Herrmann (dh.herrmann@gmail.com):
> Hi
> 
> This series adds a new LSM hook for the socketpair(2) syscall. The idea
> is to allow SO_PEERSEC to be called on AF_UNIX sockets created via
> socketpair(2), and return the same information as if you emulated
> socketpair(2) via a temporary listener socket. Right now SO_PEERSEC
> will return the unlabeled credentials for a socketpair, rather than the
> actual credentials of the creating process.
> 
> A simple call to:
> 
>     socketpair(AF_UNIX, SOCK_STREAM, 0, out);
> 
> can be emulated via a temporary listener socket bound to a unique,
> random name in the abstract namespace. By connecting to this listener
> socket, accept(2) will return the second part of the pair. If
> SO_PEERSEC is queried on these, the correct credentials of the creating
> process are returned. A simple comparison between the behavior of
> SO_PEERSEC on socketpair(2) and an emulated socketpair is included in
> the dbus-broker test-suite [1].
> 
> This patch series tries to close this gap and makes both behave the
> same. A new LSM-hook is added which allows LSMs to cache the correct
> peer information on newly created socket-pairs.
> 
> Apart from fixing this behavioral difference, the dbus-broker project
> actually needs to query the credentials of socketpairs, and currently
> must resort to racy procfs(2) queries to get the LSM credentials of its
> controller socket. Several parts of the dbus-broker project allow you
> to pass in a socket during execve(2), which will be used by the child
> process to accept control-commands from its parent. One natural way to
> create this communication channel is to use socketpair(2). However,
> right now SO_PEERSEC does not return any useful information, hence, the
> child-process would need other means to retrieve this information. By
> avoiding socketpair(2) and using the hacky-emulated version, this is not
> an issue.
> 
> There was a previous discussion on this matter [2] roughly a year ago.
> Back then there was the suspicion that proper SO_PEERSEC would confuse
> applications. However, we could not find any evidence backing this
> suspicion. Furthermore, we now actually see the contrary. Lack of
> SO_PEERSEC makes it a hassle to use socketpairs with LSM credentials.
> Hence, we propose to implement full SO_PEERSEC for socketpairs.
> 
> This series only adds SELinux backends, since that is what we need for
> RHEL. I will gladly extend the other LSMs if needed.
> 
> Thanks
> David
> 
> [1] https://github.com/bus1/dbus-broker/blob/master/src/util/test-peersec.c
> [2] https://www.spinics.net/lists/selinux/msg22674.html
> 
> David Herrmann (3):
>   security: add hook for socketpair(AF_UNIX, ...)
>   net/unix: hook unix_socketpair() into LSM
>   selinux: provide unix_stream_socketpair callback
> 
>  include/linux/lsm_hooks.h |  8 ++++++++
>  include/linux/security.h  |  7 +++++++
>  net/unix/af_unix.c        |  5 +++++
>  security/security.c       |  6 ++++++
>  security/selinux/hooks.c  | 14 ++++++++++++++
>  5 files changed, 40 insertions(+)

Makes sense to me, thanks.

Acked-by: Serge Hallyn <serge@hallyn.com>

^ permalink raw reply

* [PATCH bpf-next v5 01/10] bpf: change prototype for stack_map_get_build_id_offset
From: Yonghong Song @ 2018-04-23 17:54 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423175417.4104464-1-yhs@fb.com>

This patch didn't incur functionality change. The function prototype
got changed so that the same function can be reused later.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/stackmap.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 57eeb12..04f6ec1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -262,16 +262,11 @@ static int stack_map_get_build_id(struct vm_area_struct *vma,
 	return ret;
 }
 
-static void stack_map_get_build_id_offset(struct bpf_map *map,
-					  struct stack_map_bucket *bucket,
+static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u64 *ips, u32 trace_nr, bool user)
 {
 	int i;
 	struct vm_area_struct *vma;
-	struct bpf_stack_build_id *id_offs;
-
-	bucket->nr = trace_nr;
-	id_offs = (struct bpf_stack_build_id *)bucket->data;
 
 	/*
 	 * We cannot do up_read() in nmi context, so build_id lookup is
@@ -361,8 +356,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 			pcpu_freelist_pop(&smap->freelist);
 		if (unlikely(!new_bucket))
 			return -ENOMEM;
-		stack_map_get_build_id_offset(map, new_bucket, ips,
-					      trace_nr, user);
+		new_bucket->nr = trace_nr;
+		stack_map_get_build_id_offset(
+			(struct bpf_stack_build_id *)new_bucket->data,
+			ips, trace_nr, user);
 		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
 		if (hash_matches && bucket->nr == trace_nr &&
 		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v5 04/10] bpf: remove never-hit branches in verifier adjust_scalar_min_max_vals
From: Yonghong Song @ 2018-04-23 17:54 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423175417.4104464-1-yhs@fb.com>

In verifier function adjust_scalar_min_max_vals,
when src_known is false and the opcode is BPF_LSH/BPF_RSH,
early return will happen in the function. So remove
the branch in handling BPF_LSH/BPF_RSH when src_known is false.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/verifier.c | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d00bf53..1bbb43d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2932,10 +2932,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			dst_reg->umin_value <<= umin_val;
 			dst_reg->umax_value <<= umax_val;
 		}
-		if (src_known)
-			dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
-		else
-			dst_reg->var_off = tnum_lshift(tnum_unknown, umin_val);
+		dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val);
 		/* We may learn something more from the var_off */
 		__update_reg_bounds(dst_reg);
 		break;
@@ -2963,11 +2960,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		 */
 		dst_reg->smin_value = S64_MIN;
 		dst_reg->smax_value = S64_MAX;
-		if (src_known)
-			dst_reg->var_off = tnum_rshift(dst_reg->var_off,
-						       umin_val);
-		else
-			dst_reg->var_off = tnum_rshift(tnum_unknown, umin_val);
+		dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val);
 		dst_reg->umin_value >>= umax_val;
 		dst_reg->umax_value >>= umin_val;
 		/* We may learn something more from the var_off */
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v5 03/10] bpf/verifier: refine retval R0 state for bpf_get_stack helper
From: Yonghong Song @ 2018-04-23 17:54 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423175417.4104464-1-yhs@fb.com>

The special property of return values for helpers bpf_get_stack
and bpf_probe_read_str are captured in verifier.
Both helpers return a negative error code or
a length, which is equal to or smaller than the buffer
size argument. This additional information in the
verifier can avoid the condition such as "retval > bufsize"
in the bpf program. For example, for the code blow,
    usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
    if (usize < 0 || usize > max_len)
        return 0;
The verifier may have the following errors:
    52: (85) call bpf_get_stack#65
     R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
     R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R9_w=inv800 R10=fp0,call_-1
    53: (bf) r8 = r0
    54: (bf) r1 = r8
    55: (67) r1 <<= 32
    56: (bf) r2 = r1
    57: (77) r2 >>= 32
    58: (25) if r2 > 0x31f goto pc+33
     R0=inv(id=0) R1=inv(id=0,smax_value=9223372032559808512,
                         umax_value=18446744069414584320,
                         var_off=(0x0; 0xffffffff00000000))
     R2=inv(id=0,umax_value=799,var_off=(0x0; 0x3ff))
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R8=inv(id=0) R9=inv800 R10=fp0,call_-1
    59: (1f) r9 -= r8
    60: (c7) r1 s>>= 32
    61: (bf) r2 = r7
    62: (0f) r2 += r1
    math between map_value pointer and register with unbounded
    min value is not allowed
The failure is due to llvm compiler optimization where register "r2",
which is a copy of "r1", is tested for condition while later on "r1"
is used for map_ptr operation. The verifier is not able to track such
inst sequence effectively.

Without the "usize > max_len" condition, there is no llvm optimization
and the below generated code passed verifier:
    52: (85) call bpf_get_stack#65
     R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
     R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R9_w=inv800 R10=fp0,call_-1
    53: (b7) r1 = 0
    54: (bf) r8 = r0
    55: (67) r8 <<= 32
    56: (c7) r8 s>>= 32
    57: (6d) if r1 s> r8 goto pc+24
     R0=inv(id=0,umax_value=800) R1=inv0 R6=ctx(id=0,off=0,imm=0)
     R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R8=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R9=inv800
     R10=fp0,call_-1
    58: (bf) r2 = r7
    59: (0f) r2 += r8
    60: (1f) r9 -= r8
    61: (bf) r1 = r6

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/verifier.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index aba9425..d00bf53 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -164,6 +164,8 @@ struct bpf_call_arg_meta {
 	bool pkt_access;
 	int regno;
 	int access_size;
+	s64 msize_smax_value;
+	u64 msize_umax_value;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -1994,6 +1996,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 	} else if (arg_type_is_mem_size(arg_type)) {
 		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
+		/* remember the mem_size which may be used later
+		 * to refine return values.
+		 */
+		meta->msize_smax_value = reg->smax_value;
+		meta->msize_umax_value = reg->umax_value;
+
 		/* The register is SCALAR_VALUE; the access check
 		 * happens using its boundaries.
 		 */
@@ -2333,6 +2341,21 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	return 0;
 }
 
+static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
+				   int func_id,
+				   struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *ret_reg = &regs[BPF_REG_0];
+
+	if (ret_type != RET_INTEGER ||
+	    (func_id != BPF_FUNC_get_stack &&
+	     func_id != BPF_FUNC_probe_read_str))
+		return;
+
+	ret_reg->smax_value = meta->msize_smax_value;
+	ret_reg->umax_value = meta->msize_umax_value;
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
 	const struct bpf_func_proto *fn = NULL;
@@ -2456,6 +2479,8 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return -EINVAL;
 	}
 
+	do_refine_retval_range(regs, fn->ret_type, func_id, &meta);
+
 	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
 	if (err)
 		return err;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v5 07/10] samples/bpf: move common-purpose trace functions to selftests
From: Yonghong Song @ 2018-04-23 17:54 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423175417.4104464-1-yhs@fb.com>

There is no functionality change in this patch. The common-purpose
trace functions, including perf_event polling and ksym lookup,
are moved from trace_output_user.c and bpf_load.c to
selftests/bpf/trace_helpers.c so that these function can
be reused later in selftests.

Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 samples/bpf/Makefile                        |  11 +-
 samples/bpf/bpf_load.c                      |  63 ----------
 samples/bpf/bpf_load.h                      |   7 --
 samples/bpf/offwaketime_user.c              |   1 +
 samples/bpf/sampleip_user.c                 |   1 +
 samples/bpf/spintest_user.c                 |   1 +
 samples/bpf/trace_event_user.c              |   1 +
 samples/bpf/trace_output_user.c             | 125 +++----------------
 tools/testing/selftests/bpf/trace_helpers.c | 186 ++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/trace_helpers.h |  24 ++++
 10 files changed, 238 insertions(+), 182 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/trace_helpers.c
 create mode 100644 tools/testing/selftests/bpf/trace_helpers.h

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index aa8c392..d36444c 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -49,6 +49,7 @@ hostprogs-y += xdp_adjust_tail
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
 CGROUP_HELPERS := ../../tools/testing/selftests/bpf/cgroup_helpers.o
+TRACE_HELPERS := ../../tools/testing/selftests/bpf/trace_helpers.o
 
 test_lru_dist-objs := test_lru_dist.o $(LIBBPF)
 sock_example-objs := sock_example.o $(LIBBPF)
@@ -65,10 +66,10 @@ tracex6-objs := bpf_load.o $(LIBBPF) tracex6_user.o
 tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
-trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
+trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o $(TRACE_HELPERS)
 lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
-offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o
-spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o
+offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o $(TRACE_HELPERS)
+spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o $(TRACE_HELPERS)
 map_perf_test-objs := bpf_load.o $(LIBBPF) map_perf_test_user.o
 test_overhead-objs := bpf_load.o $(LIBBPF) test_overhead_user.o
 test_cgrp2_array_pin-objs := $(LIBBPF) test_cgrp2_array_pin.o
@@ -82,8 +83,8 @@ xdp2-objs := bpf_load.o $(LIBBPF) xdp1_user.o
 xdp_router_ipv4-objs := bpf_load.o $(LIBBPF) xdp_router_ipv4_user.o
 test_current_task_under_cgroup-objs := bpf_load.o $(LIBBPF) $(CGROUP_HELPERS) \
 				       test_current_task_under_cgroup_user.o
-trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o
-sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o
+trace_event-objs := bpf_load.o $(LIBBPF) trace_event_user.o $(TRACE_HELPERS)
+sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o $(TRACE_HELPERS)
 tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o
 lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o
 xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index bebe418..529972e 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -650,66 +650,3 @@ void read_trace_pipe(void)
 		}
 	}
 }
-
-#define MAX_SYMS 300000
-static struct ksym syms[MAX_SYMS];
-static int sym_cnt;
-
-static int ksym_cmp(const void *p1, const void *p2)
-{
-	return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
-}
-
-int load_kallsyms(void)
-{
-	FILE *f = fopen("/proc/kallsyms", "r");
-	char func[256], buf[256];
-	char symbol;
-	void *addr;
-	int i = 0;
-
-	if (!f)
-		return -ENOENT;
-
-	while (!feof(f)) {
-		if (!fgets(buf, sizeof(buf), f))
-			break;
-		if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
-			break;
-		if (!addr)
-			continue;
-		syms[i].addr = (long) addr;
-		syms[i].name = strdup(func);
-		i++;
-	}
-	sym_cnt = i;
-	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
-	return 0;
-}
-
-struct ksym *ksym_search(long key)
-{
-	int start = 0, end = sym_cnt;
-	int result;
-
-	while (start < end) {
-		size_t mid = start + (end - start) / 2;
-
-		result = key - syms[mid].addr;
-		if (result < 0)
-			end = mid;
-		else if (result > 0)
-			start = mid + 1;
-		else
-			return &syms[mid];
-	}
-
-	if (start >= 1 && syms[start - 1].addr < key &&
-	    key < syms[start].addr)
-		/* valid ksym */
-		return &syms[start - 1];
-
-	/* out of range. return _stext */
-	return &syms[0];
-}
-
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index 453c200..2c3d0b4 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -54,12 +54,5 @@ int load_bpf_file(char *path);
 int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);
 
 void read_trace_pipe(void);
-struct ksym {
-	long addr;
-	char *name;
-};
-
-int load_kallsyms(void);
-struct ksym *ksym_search(long key);
 int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags);
 #endif
diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c
index 512f87a..f06063a 100644
--- a/samples/bpf/offwaketime_user.c
+++ b/samples/bpf/offwaketime_user.c
@@ -17,6 +17,7 @@
 #include <sys/resource.h>
 #include "libbpf.h"
 #include "bpf_load.h"
+#include "trace_helpers.h"
 
 #define PRINT_RAW_ADDR 0
 
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
index 4ed690b..60c2b73 100644
--- a/samples/bpf/sampleip_user.c
+++ b/samples/bpf/sampleip_user.c
@@ -22,6 +22,7 @@
 #include "libbpf.h"
 #include "bpf_load.h"
 #include "perf-sys.h"
+#include "trace_helpers.h"
 
 #define DEFAULT_FREQ	99
 #define DEFAULT_SECS	5
diff --git a/samples/bpf/spintest_user.c b/samples/bpf/spintest_user.c
index 3d73621..8d3e9cf 100644
--- a/samples/bpf/spintest_user.c
+++ b/samples/bpf/spintest_user.c
@@ -7,6 +7,7 @@
 #include <sys/resource.h>
 #include "libbpf.h"
 #include "bpf_load.h"
+#include "trace_helpers.h"
 
 int main(int ac, char **argv)
 {
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
index 56f7a25..1fa1bec 100644
--- a/samples/bpf/trace_event_user.c
+++ b/samples/bpf/trace_event_user.c
@@ -21,6 +21,7 @@
 #include "libbpf.h"
 #include "bpf_load.h"
 #include "perf-sys.h"
+#include "trace_helpers.h"
 
 #define SAMPLE_FREQ 50
 
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
index ccca1e3..cc4b383 100644
--- a/samples/bpf/trace_output_user.c
+++ b/samples/bpf/trace_output_user.c
@@ -21,100 +21,10 @@
 #include "libbpf.h"
 #include "bpf_load.h"
 #include "perf-sys.h"
+#include "trace_helpers.h"
 
 static int pmu_fd;
 
-int page_size;
-int page_cnt = 8;
-volatile struct perf_event_mmap_page *header;
-
-typedef void (*print_fn)(void *data, int size);
-
-static int perf_event_mmap(int fd)
-{
-	void *base;
-	int mmap_size;
-
-	page_size = getpagesize();
-	mmap_size = page_size * (page_cnt + 1);
-
-	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-	if (base == MAP_FAILED) {
-		printf("mmap err\n");
-		return -1;
-	}
-
-	header = base;
-	return 0;
-}
-
-static int perf_event_poll(int fd)
-{
-	struct pollfd pfd = { .fd = fd, .events = POLLIN };
-
-	return poll(&pfd, 1, 1000);
-}
-
-struct perf_event_sample {
-	struct perf_event_header header;
-	__u32 size;
-	char data[];
-};
-
-static void perf_event_read(print_fn fn)
-{
-	__u64 data_tail = header->data_tail;
-	__u64 data_head = header->data_head;
-	__u64 buffer_size = page_cnt * page_size;
-	void *base, *begin, *end;
-	char buf[256];
-
-	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
-	if (data_head == data_tail)
-		return;
-
-	base = ((char *)header) + page_size;
-
-	begin = base + data_tail % buffer_size;
-	end = base + data_head % buffer_size;
-
-	while (begin != end) {
-		struct perf_event_sample *e;
-
-		e = begin;
-		if (begin + e->header.size > base + buffer_size) {
-			long len = base + buffer_size - begin;
-
-			assert(len < e->header.size);
-			memcpy(buf, begin, len);
-			memcpy(buf + len, base, e->header.size - len);
-			e = (void *) buf;
-			begin = base + e->header.size - len;
-		} else if (begin + e->header.size == base + buffer_size) {
-			begin = base;
-		} else {
-			begin += e->header.size;
-		}
-
-		if (e->header.type == PERF_RECORD_SAMPLE) {
-			fn(e->data, e->size);
-		} else if (e->header.type == PERF_RECORD_LOST) {
-			struct {
-				struct perf_event_header header;
-				__u64 id;
-				__u64 lost;
-			} *lost = (void *) e;
-			printf("lost %lld events\n", lost->lost);
-		} else {
-			printf("unknown event type=%d size=%d\n",
-			       e->header.type, e->header.size);
-		}
-	}
-
-	__sync_synchronize(); /* smp_mb() */
-	header->data_tail = data_head;
-}
-
 static __u64 time_get_ns(void)
 {
 	struct timespec ts;
@@ -127,7 +37,7 @@ static __u64 start_time;
 
 #define MAX_CNT 100000ll
 
-static void print_bpf_output(void *data, int size)
+static int print_bpf_output(void *data, int size)
 {
 	static __u64 cnt;
 	struct {
@@ -138,7 +48,7 @@ static void print_bpf_output(void *data, int size)
 	if (e->cookie != 0x12345678) {
 		printf("BUG pid %llx cookie %llx sized %d\n",
 		       e->pid, e->cookie, size);
-		kill(0, SIGINT);
+		return PERF_EVENT_ERROR;
 	}
 
 	cnt++;
@@ -146,8 +56,10 @@ static void print_bpf_output(void *data, int size)
 	if (cnt == MAX_CNT) {
 		printf("recv %lld events per sec\n",
 		       MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
-		kill(0, SIGINT);
+		return PERF_EVENT_DONE;
 	}
+
+	return PERF_EVENT_CONT;
 }
 
 static void test_bpf_perf_event(void)
@@ -166,10 +78,18 @@ static void test_bpf_perf_event(void)
 	ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
 }
 
+static void exec_action(void)
+{
+	FILE *f;
+
+	f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
+	(void) f;
+}
+
 int main(int argc, char **argv)
 {
 	char filename[256];
-	FILE *f;
+	int ret;
 
 	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
 
@@ -180,17 +100,8 @@ int main(int argc, char **argv)
 
 	test_bpf_perf_event();
 
-	if (perf_event_mmap(pmu_fd) < 0)
-		return 1;
-
-	f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
-	(void) f;
-
 	start_time = time_get_ns();
-	for (;;) {
-		perf_event_poll(pmu_fd);
-		perf_event_read(print_bpf_output);
-	}
-
-	return 0;
+	ret = perf_event_poller(pmu_fd, exec_action, print_bpf_output);
+	kill(0, SIGINT);
+	return ret;
 }
diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c
new file mode 100644
index 0000000..00954e3
--- /dev/null
+++ b/tools/testing/selftests/bpf/trace_helpers.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <errno.h>
+#include <poll.h>
+#include <unistd.h>
+#include <linux/perf_event.h>
+#include <sys/mman.h>
+#include "trace_helpers.h"
+
+#define MAX_SYMS 300000
+static struct ksym syms[MAX_SYMS];
+static int sym_cnt;
+
+static int ksym_cmp(const void *p1, const void *p2)
+{
+	return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
+}
+
+int load_kallsyms(void)
+{
+	FILE *f = fopen("/proc/kallsyms", "r");
+	char func[256], buf[256];
+	char symbol;
+	void *addr;
+	int i = 0;
+
+	if (!f)
+		return -ENOENT;
+
+	while (!feof(f)) {
+		if (!fgets(buf, sizeof(buf), f))
+			break;
+		if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
+			break;
+		if (!addr)
+			continue;
+		syms[i].addr = (long) addr;
+		syms[i].name = strdup(func);
+		i++;
+	}
+	sym_cnt = i;
+	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
+	return 0;
+}
+
+struct ksym *ksym_search(long key)
+{
+	int start = 0, end = sym_cnt;
+	int result;
+
+	while (start < end) {
+		size_t mid = start + (end - start) / 2;
+
+		result = key - syms[mid].addr;
+		if (result < 0)
+			end = mid;
+		else if (result > 0)
+			start = mid + 1;
+		else
+			return &syms[mid];
+	}
+
+	if (start >= 1 && syms[start - 1].addr < key &&
+	    key < syms[start].addr)
+		/* valid ksym */
+		return &syms[start - 1];
+
+	/* out of range. return _stext */
+	return &syms[0];
+}
+
+static int page_size;
+static int page_cnt = 8;
+static volatile struct perf_event_mmap_page *header;
+
+static int perf_event_mmap(int fd)
+{
+	void *base;
+	int mmap_size;
+
+	page_size = getpagesize();
+	mmap_size = page_size * (page_cnt + 1);
+
+	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (base == MAP_FAILED) {
+		printf("mmap err\n");
+		return -1;
+	}
+
+	header = base;
+	return 0;
+}
+
+static int perf_event_poll(int fd)
+{
+	struct pollfd pfd = { .fd = fd, .events = POLLIN };
+
+	return poll(&pfd, 1, 1000);
+}
+
+struct perf_event_sample {
+	struct perf_event_header header;
+	__u32 size;
+	char data[];
+};
+
+static int perf_event_read(perf_event_print_fn fn)
+{
+	__u64 data_tail = header->data_tail;
+	__u64 data_head = header->data_head;
+	__u64 buffer_size = page_cnt * page_size;
+	void *base, *begin, *end;
+	char buf[256];
+	int ret;
+
+	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+	if (data_head == data_tail)
+		return PERF_EVENT_CONT;
+
+	base = ((char *)header) + page_size;
+
+	begin = base + data_tail % buffer_size;
+	end = base + data_head % buffer_size;
+
+	while (begin != end) {
+		struct perf_event_sample *e;
+
+		e = begin;
+		if (begin + e->header.size > base + buffer_size) {
+			long len = base + buffer_size - begin;
+
+			assert(len < e->header.size);
+			memcpy(buf, begin, len);
+			memcpy(buf + len, base, e->header.size - len);
+			e = (void *) buf;
+			begin = base + e->header.size - len;
+		} else if (begin + e->header.size == base + buffer_size) {
+			begin = base;
+		} else {
+			begin += e->header.size;
+		}
+
+		if (e->header.type == PERF_RECORD_SAMPLE) {
+			ret = fn(e->data, e->size);
+			if (ret != PERF_EVENT_CONT)
+				return ret;
+		} else if (e->header.type == PERF_RECORD_LOST) {
+			struct {
+				struct perf_event_header header;
+				__u64 id;
+				__u64 lost;
+			} *lost = (void *) e;
+			printf("lost %lld events\n", lost->lost);
+		} else {
+			printf("unknown event type=%d size=%d\n",
+			       e->header.type, e->header.size);
+		}
+	}
+
+	__sync_synchronize(); /* smp_mb() */
+	header->data_tail = data_head;
+	return PERF_EVENT_CONT;
+}
+
+int perf_event_poller(int fd, perf_event_exec_fn exec_fn,
+		      perf_event_print_fn output_fn)
+{
+	int ret;
+
+	if (perf_event_mmap(fd) < 0)
+		return PERF_EVENT_ERROR;
+
+	exec_fn();
+
+	for (;;) {
+		perf_event_poll(fd);
+		ret = perf_event_read(output_fn);
+		if (ret != PERF_EVENT_CONT)
+			return ret;
+	}
+
+	return PERF_EVENT_DONE;
+}
diff --git a/tools/testing/selftests/bpf/trace_helpers.h b/tools/testing/selftests/bpf/trace_helpers.h
new file mode 100644
index 0000000..8750778
--- /dev/null
+++ b/tools/testing/selftests/bpf/trace_helpers.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __TRACE_HELPER_H
+#define __TRACE_HELPER_H
+
+struct ksym {
+	long addr;
+	char *name;
+};
+
+int load_kallsyms(void);
+struct ksym *ksym_search(long key);
+
+typedef void (*perf_event_exec_fn)(void);
+typedef int (*perf_event_print_fn)(void *data, int size);
+
+/* return code for perf_event_print_fn */
+#define PERF_EVENT_DONE		0
+#define PERF_EVENT_ERROR	1
+#define PERF_EVENT_CONT		2
+
+/* return PERF_EVENT_DONE or PERF_EVENT_ERROR */
+int perf_event_poller(int fd, perf_event_exec_fn exec_fn,
+		      perf_event_print_fn output_fn);
+#endif
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v5 00/10] bpf: add bpf_get_stack helper
From: Yonghong Song @ 2018-04-23 17:54 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team

Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table regardless of
whether BPF_F_REUSE_STACKID is specified or not,
so some stack traces may be missing from user perspective.

This patch implements a new helper, bpf_get_stack, will
send stack traces directly to bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
shared map or bpf_perf_event_output.

Patches #1 and #2 implemented the core kernel support.
Patch #3 removes two never-hit branches in verifier.
Patches #4 and #5 are two verifier improves to make
bpf programming easier. Patch #6 synced the new helper
to tools headers. Patch #7 moved perf_event polling code
and ksym lookup code from samples/bpf to
tools/testing/selftests/bpf. Patch #8 added a verifier
test in tools/bpf for new verifier change.
Patches #9 and #10 added tests for raw tracepoint prog
and tracepoint prog respectively.

Changelogs:
  v4 -> v5:
    . relied on dst_reg->var_off to refine umin_val/umax_val
      in verifier handling BPF_ARSH value range tracking,
      suggested by Edward.
  v3 -> v4:
    . fixed a bug when meta ptr is set to NULL in check_func_arg.
    . introduced tnum_arshift and added detailed comments for
      the underlying implementation
    . avoided using VLA in tools/bpf test_progs.
  v2 -> v3:
    . used meta to track helper memory size argument
    . implemented range checking for ARSH in verifier
    . moved perf event polling and ksym related functions
      from samples/bpf to tools/bpf
    . added test to compare build id's between bpf_get_stackid
      and bpf_get_stack
  v1 -> v2:
    . fixed compilation error when CONFIG_PERF_EVENTS is not enabled

Yonghong Song (10):
  bpf: change prototype for stack_map_get_build_id_offset
  bpf: add bpf_get_stack helper
  bpf/verifier: refine retval R0 state for bpf_get_stack helper
  bpf: remove never-hit branches in verifier adjust_scalar_min_max_vals
  bpf/verifier: improve register value range tracking with ARSH
  tools/bpf: add bpf_get_stack helper to tools headers
  samples/bpf: move common-purpose trace functions to selftests
  tools/bpf: add a verifier test case for bpf_get_stack helper and ARSH
  tools/bpf: add a test for bpf_get_stack with raw tracepoint prog
  tools/bpf: add a test for bpf_get_stack with tracepoint prog

 include/linux/bpf.h                                |   1 +
 include/linux/filter.h                             |   3 +-
 include/linux/tnum.h                               |   4 +-
 include/uapi/linux/bpf.h                           |  19 +-
 kernel/bpf/core.c                                  |   5 +
 kernel/bpf/stackmap.c                              |  80 ++++++++-
 kernel/bpf/syscall.c                               |  10 ++
 kernel/bpf/tnum.c                                  |  10 ++
 kernel/bpf/verifier.c                              |  80 ++++++++-
 kernel/trace/bpf_trace.c                           |  50 +++++-
 samples/bpf/Makefile                               |  11 +-
 samples/bpf/bpf_load.c                             |  63 -------
 samples/bpf/bpf_load.h                             |   7 -
 samples/bpf/offwaketime_user.c                     |   1 +
 samples/bpf/sampleip_user.c                        |   1 +
 samples/bpf/spintest_user.c                        |   1 +
 samples/bpf/trace_event_user.c                     |   1 +
 samples/bpf/trace_output_user.c                    | 125 ++------------
 tools/include/uapi/linux/bpf.h                     |  19 +-
 tools/testing/selftests/bpf/Makefile               |   3 +-
 tools/testing/selftests/bpf/bpf_helpers.h          |   3 +-
 tools/testing/selftests/bpf/test_get_stack_rawtp.c | 102 +++++++++++
 tools/testing/selftests/bpf/test_progs.c           | 192 ++++++++++++++++++++-
 .../selftests/bpf/test_stacktrace_build_id.c       |  20 ++-
 tools/testing/selftests/bpf/test_stacktrace_map.c  |  20 ++-
 tools/testing/selftests/bpf/test_verifier.c        |  45 +++++
 tools/testing/selftests/bpf/trace_helpers.c        | 186 ++++++++++++++++++++
 tools/testing/selftests/bpf/trace_helpers.h        |  24 +++
 28 files changed, 867 insertions(+), 219 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/test_get_stack_rawtp.c
 create mode 100644 tools/testing/selftests/bpf/trace_helpers.c
 create mode 100644 tools/testing/selftests/bpf/trace_helpers.h

-- 
2.9.5

^ permalink raw reply

* [PATCH bpf-next v5 08/10] tools/bpf: add a verifier test case for bpf_get_stack helper and ARSH
From: Yonghong Song @ 2018-04-23 17:54 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423175417.4104464-1-yhs@fb.com>

The test_verifier already has a few ARSH test cases.
This patch adds a new test case which takes advantage of newly
improved verifier behavior for bpf_get_stack and ARSH.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/test_verifier.c | 45 +++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 3e7718b..cd595ba 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -11423,6 +11423,51 @@ static struct bpf_test tests[] = {
 		.errstr = "BPF_XADD stores into R2 packet",
 		.prog_type = BPF_PROG_TYPE_XDP,
 	},
+	{
+		"bpf_get_stack return R0 within range",
+		.insns = {
+			BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+			BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
+			BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
+			BPF_LD_MAP_FD(BPF_REG_1, 0),
+			BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				     BPF_FUNC_map_lookup_elem),
+			BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 28),
+			BPF_MOV64_REG(BPF_REG_7, BPF_REG_0),
+			BPF_MOV64_IMM(BPF_REG_9, sizeof(struct test_val)),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_3, sizeof(struct test_val)),
+			BPF_MOV64_IMM(BPF_REG_4, 256),
+			BPF_EMIT_CALL(BPF_FUNC_get_stack),
+			BPF_MOV64_IMM(BPF_REG_1, 0),
+			BPF_MOV64_REG(BPF_REG_8, BPF_REG_0),
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_8, 32),
+			BPF_ALU64_IMM(BPF_ARSH, BPF_REG_8, 32),
+			BPF_JMP_REG(BPF_JSLT, BPF_REG_1, BPF_REG_8, 16),
+			BPF_ALU64_REG(BPF_SUB, BPF_REG_9, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_8),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_9),
+			BPF_ALU64_IMM(BPF_LSH, BPF_REG_1, 32),
+			BPF_ALU64_IMM(BPF_ARSH, BPF_REG_1, 32),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_2),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_7),
+			BPF_MOV64_IMM(BPF_REG_5, sizeof(struct test_val)),
+			BPF_ALU64_REG(BPF_ADD, BPF_REG_1, BPF_REG_5),
+			BPF_JMP_REG(BPF_JGE, BPF_REG_3, BPF_REG_1, 4),
+			BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+			BPF_MOV64_REG(BPF_REG_3, BPF_REG_9),
+			BPF_MOV64_IMM(BPF_REG_4, 0),
+			BPF_EMIT_CALL(BPF_FUNC_get_stack),
+			BPF_EXIT_INSN(),
+		},
+		.fixup_map2 = { 4 },
+		.result = ACCEPT,
+		.prog_type = BPF_PROG_TYPE_TRACEPOINT,
+	},
 };
 
 static int probe_filter_length(const struct bpf_insn *fp)
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v5 09/10] tools/bpf: add a test for bpf_get_stack with raw tracepoint prog
From: Yonghong Song @ 2018-04-23 17:54 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423175417.4104464-1-yhs@fb.com>

The test attached a raw_tracepoint program to sched/sched_switch.
It tested to get stack for user space, kernel space and user
space with build_id request. It also tested to get user
and kernel stack into the same buffer with back-to-back
bpf_get_stack helper calls.

Whenever the kernel stack is available, the user space
application will check to ensure that the kernel function
for raw_tracepoint ___bpf_prog_run is part of the stack.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/Makefile               |   3 +-
 tools/testing/selftests/bpf/test_get_stack_rawtp.c | 102 +++++++++++++++++
 tools/testing/selftests/bpf/test_progs.c           | 122 +++++++++++++++++++++
 3 files changed, 226 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/test_get_stack_rawtp.c

diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 0b72cc7..54e9e74 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -32,7 +32,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
 	test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \
 	sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \
 	sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o test_adjust_tail.o \
-	test_btf_haskv.o test_btf_nokv.o
+	test_btf_haskv.o test_btf_nokv.o test_get_stack_rawtp.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
@@ -56,6 +56,7 @@ $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a
 $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c
 $(OUTPUT)/test_sock: cgroup_helpers.c
 $(OUTPUT)/test_sock_addr: cgroup_helpers.c
+$(OUTPUT)/test_progs: trace_helpers.c
 
 .PHONY: force
 
diff --git a/tools/testing/selftests/bpf/test_get_stack_rawtp.c b/tools/testing/selftests/bpf/test_get_stack_rawtp.c
new file mode 100644
index 0000000..ba1dcf9
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_get_stack_rawtp.c
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* Permit pretty deep stack traces */
+#define MAX_STACK_RAWTP 100
+struct stack_trace_t {
+	int pid;
+	int kern_stack_size;
+	int user_stack_size;
+	int user_stack_buildid_size;
+	__u64 kern_stack[MAX_STACK_RAWTP];
+	__u64 user_stack[MAX_STACK_RAWTP];
+	struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP];
+};
+
+struct bpf_map_def SEC("maps") perfmap = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(__u32),
+	.max_entries = 2,
+};
+
+struct bpf_map_def SEC("maps") stackdata_map = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct stack_trace_t),
+	.max_entries = 1,
+};
+
+/* Allocate per-cpu space twice the needed. For the code below
+ *   usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
+ *   if (usize < 0)
+ *     return 0;
+ *   ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
+ *
+ * If we have value_size = MAX_STACK_RAWTP * sizeof(__u64),
+ * verifier will complain that access "raw_data + usize"
+ * with size "max_len - usize" may be out of bound.
+ * The maximum "raw_data + usize" is "raw_data + max_len"
+ * and the maximum "max_len - usize" is "max_len", verifier
+ * concludes that the maximum buffer access range is
+ * "raw_data[0...max_len * 2 - 1]" and hence reject the program.
+ *
+ * Doubling the to-be-used max buffer size can fix this verifier
+ * issue and avoid complicated C programming massaging.
+ * This is an acceptable workaround since there is one entry here.
+ */
+struct bpf_map_def SEC("maps") rawdata_map = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = MAX_STACK_RAWTP * sizeof(__u64) * 2,
+	.max_entries = 1,
+};
+
+SEC("tracepoint/sched/sched_switch")
+int bpf_prog1(void *ctx)
+{
+	int max_len, max_buildid_len, usize, ksize, total_size;
+	struct stack_trace_t *data;
+	void *raw_data;
+	__u32 key = 0;
+
+	data = bpf_map_lookup_elem(&stackdata_map, &key);
+	if (!data)
+		return 0;
+
+	max_len = MAX_STACK_RAWTP * sizeof(__u64);
+	max_buildid_len = MAX_STACK_RAWTP * sizeof(struct bpf_stack_build_id);
+	data->pid = bpf_get_current_pid_tgid();
+	data->kern_stack_size = bpf_get_stack(ctx, data->kern_stack,
+					      max_len, 0);
+	data->user_stack_size = bpf_get_stack(ctx, data->user_stack, max_len,
+					    BPF_F_USER_STACK);
+	data->user_stack_buildid_size = bpf_get_stack(
+		ctx, data->user_stack_buildid, max_buildid_len,
+		BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+	bpf_perf_event_output(ctx, &perfmap, 0, data, sizeof(*data));
+
+	/* write both kernel and user stacks to the same buffer */
+	raw_data = bpf_map_lookup_elem(&rawdata_map, &key);
+	if (!raw_data)
+		return 0;
+
+	usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
+	if (usize < 0)
+		return 0;
+
+	ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
+	if (ksize < 0)
+		return 0;
+
+	total_size = usize + ksize;
+	if (total_size > 0 && total_size <= max_len)
+		bpf_perf_event_output(ctx, &perfmap, 0, raw_data, total_size);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */
diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index eedda98..c148a55 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -38,6 +38,7 @@ typedef __u16 __sum16;
 #include "bpf_util.h"
 #include "bpf_endian.h"
 #include "bpf_rlimit.h"
+#include "trace_helpers.h"
 
 static int error_cnt, pass_cnt;
 
@@ -1204,6 +1205,126 @@ static void test_stacktrace_build_id(void)
 	return;
 }
 
+#define MAX_CNT_RAWTP	10ull
+#define MAX_STACK_RAWTP	100
+struct get_stack_trace_t {
+	int pid;
+	int kern_stack_size;
+	int user_stack_size;
+	int user_stack_buildid_size;
+	__u64 kern_stack[MAX_STACK_RAWTP];
+	__u64 user_stack[MAX_STACK_RAWTP];
+	struct bpf_stack_build_id user_stack_buildid[MAX_STACK_RAWTP];
+};
+
+static void get_stack_raw_tp_action(void)
+{
+	FILE *f;
+
+	f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
+	(void) f;
+}
+
+static int get_stack_print_output(void *data, int size)
+{
+	bool good_kern_stack = false, good_user_stack = false;
+	const char *expected_func = "___bpf_prog_run";
+	struct get_stack_trace_t *e = data;
+	int i, num_stack;
+	static __u64 cnt;
+	struct ksym *ks;
+
+	cnt++;
+
+	if (size < sizeof(struct get_stack_trace_t)) {
+		__u64 *raw_data = data;
+
+		num_stack = size / sizeof(__u64);
+		for (i = 0; i < num_stack; i++) {
+			ks = ksym_search(raw_data[i]);
+			if (ks && (strcmp(ks->name, expected_func) == 0)) {
+				good_kern_stack = true;
+				good_user_stack = (i > 0);
+			}
+		}
+	} else {
+		if (e->kern_stack_size > 0) {
+			num_stack = e->kern_stack_size / sizeof(__u64);
+			for (i = 0; i < num_stack; i++) {
+				ks = ksym_search(e->kern_stack[i]);
+				if (ks && (strcmp(ks->name, expected_func) == 0))
+					good_kern_stack = true;
+			}
+		}
+		if (e->user_stack_size > 0 && e->user_stack_buildid_size > 0)
+			good_user_stack = true;
+	}
+	if (!good_kern_stack || !good_user_stack)
+		return PERF_EVENT_ERROR;
+
+	if (cnt == MAX_CNT_RAWTP)
+		return PERF_EVENT_DONE;
+
+	return PERF_EVENT_CONT;
+}
+
+static void test_get_stack_raw_tp(void)
+{
+	const char *file = "./test_get_stack_rawtp.o";
+	int efd, err, prog_fd, pmu_fd, perfmap_fd;
+	struct perf_event_attr attr = {};
+	__u32 key = 0, duration = 0;
+	struct bpf_object *obj;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno))
+		return;
+
+	efd = bpf_raw_tracepoint_open("sched_switch", prog_fd);
+	if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+
+	perfmap_fd = bpf_find_map(__func__, obj, "perfmap");
+	if (CHECK(perfmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
+		  perfmap_fd, errno))
+		goto close_prog;
+
+	err = load_kallsyms();
+	if (CHECK(err < 0, "load_kallsyms", "err %d errno %d\n", err, errno))
+		goto close_prog;
+
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.type = PERF_TYPE_SOFTWARE;
+	attr.config = PERF_COUNT_SW_BPF_OUTPUT;
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1/*pid*/, 0/*cpu*/,
+			 -1/*group_fd*/, 0);
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
+		  errno))
+		goto close_prog;
+
+	err = bpf_map_update_elem(perfmap_fd, &key, &pmu_fd, BPF_ANY);
+	if (CHECK(err < 0, "bpf_map_update_elem", "err %d errno %d\n", err,
+		  errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err < 0, "ioctl PERF_EVENT_IOC_ENABLE", "err %d errno %d\n",
+		  err, errno))
+		goto close_prog;
+
+	err = perf_event_poller(pmu_fd, get_stack_raw_tp_action,
+				get_stack_print_output);
+	if (CHECK(err < 0, "perf_event_poller", "err %d errno %d\n", err,
+		  errno))
+		goto close_prog;
+
+	goto close_prog_noerr;
+close_prog:
+	error_cnt++;
+close_prog_noerr:
+	bpf_object__close(obj);
+}
+
 int main(void)
 {
 	test_pkt_access();
@@ -1219,6 +1340,7 @@ int main(void)
 	test_stacktrace_map();
 	test_stacktrace_build_id();
 	test_stacktrace_map_raw_tp();
+	test_get_stack_raw_tp();
 
 	printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt);
 	return error_cnt ? EXIT_FAILURE : EXIT_SUCCESS;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v5 06/10] tools/bpf: add bpf_get_stack helper to tools headers
From: Yonghong Song @ 2018-04-23 17:54 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423175417.4104464-1-yhs@fb.com>

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/include/uapi/linux/bpf.h            | 19 +++++++++++++++++--
 tools/testing/selftests/bpf/bpf_helpers.h |  3 ++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 7f7fbb9..116eb5f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -529,6 +529,17 @@ union bpf_attr {
  *             other bits - reserved
  *     Return: >= 0 stackid on success or negative error
  *
+ * int bpf_get_stack(ctx, buf, size, flags)
+ *     walk user or kernel stack and store the ips in buf
+ *     @ctx: struct pt_regs*
+ *     @buf: user buffer to fill stack
+ *     @size: the buf size
+ *     @flags: bits 0-7 - numer of stack frames to skip
+ *             bit 8 - collect user stack instead of kernel
+ *             bit 11 - get build-id as well if user stack
+ *             other bits - reserved
+ *     Return: >= 0 size copied on success or negative error
+ *
  * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
  *     calculate csum diff
  *     @from: raw from buffer
@@ -841,7 +852,8 @@ union bpf_attr {
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
 	FN(bind),			\
-	FN(xdp_adjust_tail),
+	FN(xdp_adjust_tail),		\
+	FN(get_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -875,11 +887,14 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
-/* BPF_FUNC_get_stackid flags. */
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
 #define BPF_F_SKIP_FIELD_MASK		0xffULL
 #define BPF_F_USER_STACK		(1ULL << 8)
+/* flags used by BPF_FUNC_get_stackid only. */
 #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
 #define BPF_F_REUSE_STACKID		(1ULL << 10)
+/* flags used by BPF_FUNC_get_stack only. */
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 9271576..2d9d650 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -98,7 +98,8 @@ static int (*bpf_bind)(void *ctx, void *addr, int addr_len) =
 	(void *) BPF_FUNC_bind;
 static int (*bpf_xdp_adjust_tail)(void *ctx, int offset) =
 	(void *) BPF_FUNC_xdp_adjust_tail;
-
+static int (*bpf_get_stack)(void *ctx, void *buf, int size, int flags) =
+	(void *) BPF_FUNC_get_stack;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v5 10/10] tools/bpf: add a test for bpf_get_stack with tracepoint prog
From: Yonghong Song @ 2018-04-23 17:54 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423175417.4104464-1-yhs@fb.com>

The test_stacktrace_map and test_stacktrace_build_id are
enhanced to call bpf_get_stack in the helper to get the
stack trace as well.  The stack traces from bpf_get_stack
and bpf_get_stackid are compared to ensure that for the
same stack as represented as the same hash, their ip addresses
or build id's must be the same.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c           | 70 ++++++++++++++++++++--
 .../selftests/bpf/test_stacktrace_build_id.c       | 20 ++++++-
 tools/testing/selftests/bpf/test_stacktrace_map.c  | 20 ++++++-
 3 files changed, 99 insertions(+), 11 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index c148a55..664db67 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -897,11 +897,47 @@ static int compare_map_keys(int map1_fd, int map2_fd)
 	return 0;
 }
 
+static int compare_stack_ips(int smap_fd, int amap_fd, int stack_trace_len)
+{
+	__u32 key, next_key, *cur_key_p, *next_key_p;
+	char *val_buf1, *val_buf2;
+	int i, err = 0;
+
+	val_buf1 = malloc(stack_trace_len);
+	val_buf2 = malloc(stack_trace_len);
+	cur_key_p = NULL;
+	next_key_p = &key;
+	while (bpf_map_get_next_key(smap_fd, cur_key_p, next_key_p) == 0) {
+		err = bpf_map_lookup_elem(smap_fd, next_key_p, val_buf1);
+		if (err)
+			goto out;
+		err = bpf_map_lookup_elem(amap_fd, next_key_p, val_buf2);
+		if (err)
+			goto out;
+		for (i = 0; i < stack_trace_len; i++) {
+			if (val_buf1[i] != val_buf2[i]) {
+				err = -1;
+				goto out;
+			}
+		}
+		key = *next_key_p;
+		cur_key_p = &key;
+		next_key_p = &next_key;
+	}
+	if (errno != ENOENT)
+		err = -1;
+
+out:
+	free(val_buf1);
+	free(val_buf2);
+	return err;
+}
+
 static void test_stacktrace_map()
 {
-	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
 	const char *file = "./test_stacktrace_map.o";
-	int bytes, efd, err, pmu_fd, prog_fd;
+	int bytes, efd, err, pmu_fd, prog_fd, stack_trace_len;
 	struct perf_event_attr attr = {};
 	__u32 key, val, duration = 0;
 	struct bpf_object *obj;
@@ -957,6 +993,10 @@ static void test_stacktrace_map()
 	if (stackmap_fd < 0)
 		goto disable_pmu;
 
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (stack_amap_fd < 0)
+		goto disable_pmu;
+
 	/* give some time for bpf program run */
 	sleep(1);
 
@@ -978,6 +1018,12 @@ static void test_stacktrace_map()
 		  "err %d errno %d\n", err, errno))
 		goto disable_pmu_noerr;
 
+	stack_trace_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
+	err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
+	if (CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu_noerr;
+
 	goto disable_pmu_noerr;
 disable_pmu:
 	error_cnt++;
@@ -1071,9 +1117,9 @@ static int extract_build_id(char *build_id, size_t size)
 
 static void test_stacktrace_build_id(void)
 {
-	int control_map_fd, stackid_hmap_fd, stackmap_fd;
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
 	const char *file = "./test_stacktrace_build_id.o";
-	int bytes, efd, err, pmu_fd, prog_fd;
+	int bytes, efd, err, pmu_fd, prog_fd, stack_trace_len;
 	struct perf_event_attr attr = {};
 	__u32 key, previous_key, val, duration = 0;
 	struct bpf_object *obj;
@@ -1138,6 +1184,11 @@ static void test_stacktrace_build_id(void)
 		  err, errno))
 		goto disable_pmu;
 
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
 	assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
 	       == 0);
 	assert(system("./urandom_read if=/dev/urandom of=/dev/zero count=4 2> /dev/null") == 0);
@@ -1189,8 +1240,15 @@ static void test_stacktrace_build_id(void)
 		previous_key = key;
 	} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
 
-	CHECK(build_id_matches < 1, "build id match",
-	      "Didn't find expected build ID from the map");
+	if (CHECK(build_id_matches < 1, "build id match",
+		  "Didn't find expected build ID from the map"))
+		goto disable_pmu;
+
+	stack_trace_len = PERF_MAX_STACK_DEPTH
+		* sizeof(struct bpf_stack_build_id);
+	err = compare_stack_ips(stackmap_fd, stack_amap_fd, stack_trace_len);
+	CHECK(err, "compare_stack_ips stackmap vs. stack_amap",
+	      "err %d errno %d\n", err, errno);
 
 disable_pmu:
 	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
diff --git a/tools/testing/selftests/bpf/test_stacktrace_build_id.c b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
index b755bd7..d86c281 100644
--- a/tools/testing/selftests/bpf/test_stacktrace_build_id.c
+++ b/tools/testing/selftests/bpf/test_stacktrace_build_id.c
@@ -19,7 +19,7 @@ struct bpf_map_def SEC("maps") stackid_hmap = {
 	.type = BPF_MAP_TYPE_HASH,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u32),
-	.max_entries = 10000,
+	.max_entries = 16384,
 };
 
 struct bpf_map_def SEC("maps") stackmap = {
@@ -31,6 +31,14 @@ struct bpf_map_def SEC("maps") stackmap = {
 	.map_flags = BPF_F_STACK_BUILD_ID,
 };
 
+struct bpf_map_def SEC("maps") stack_amap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(struct bpf_stack_build_id)
+		* PERF_MAX_STACK_DEPTH,
+	.max_entries = 128,
+};
+
 /* taken from /sys/kernel/debug/tracing/events/random/urandom_read/format */
 struct random_urandom_args {
 	unsigned long long pad;
@@ -42,7 +50,10 @@ struct random_urandom_args {
 SEC("tracepoint/random/urandom_read")
 int oncpu(struct random_urandom_args *args)
 {
+	__u32 max_len = sizeof(struct bpf_stack_build_id)
+			* PERF_MAX_STACK_DEPTH;
 	__u32 key = 0, val = 0, *value_p;
+	void *stack_p;
 
 	value_p = bpf_map_lookup_elem(&control_map, &key);
 	if (value_p && *value_p)
@@ -50,8 +61,13 @@ int oncpu(struct random_urandom_args *args)
 
 	/* The size of stackmap and stackid_hmap should be the same */
 	key = bpf_get_stackid(args, &stackmap, BPF_F_USER_STACK);
-	if ((int)key >= 0)
+	if ((int)key >= 0) {
 		bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
+		stack_p = bpf_map_lookup_elem(&stack_amap, &key);
+		if (stack_p)
+			bpf_get_stack(args, stack_p, max_len,
+				      BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+	}
 
 	return 0;
 }
diff --git a/tools/testing/selftests/bpf/test_stacktrace_map.c b/tools/testing/selftests/bpf/test_stacktrace_map.c
index 76d85c5d..f83c7b6 100644
--- a/tools/testing/selftests/bpf/test_stacktrace_map.c
+++ b/tools/testing/selftests/bpf/test_stacktrace_map.c
@@ -19,14 +19,21 @@ struct bpf_map_def SEC("maps") stackid_hmap = {
 	.type = BPF_MAP_TYPE_HASH,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u32),
-	.max_entries = 10000,
+	.max_entries = 16384,
 };
 
 struct bpf_map_def SEC("maps") stackmap = {
 	.type = BPF_MAP_TYPE_STACK_TRACE,
 	.key_size = sizeof(__u32),
 	.value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
-	.max_entries = 10000,
+	.max_entries = 16384,
+};
+
+struct bpf_map_def SEC("maps") stack_amap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u64) * PERF_MAX_STACK_DEPTH,
+	.max_entries = 16384,
 };
 
 /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */
@@ -44,7 +51,10 @@ struct sched_switch_args {
 SEC("tracepoint/sched/sched_switch")
 int oncpu(struct sched_switch_args *ctx)
 {
+	__u32 max_len = PERF_MAX_STACK_DEPTH * sizeof(__u64);
 	__u32 key = 0, val = 0, *value_p;
+	void *stack_p;
+
 
 	value_p = bpf_map_lookup_elem(&control_map, &key);
 	if (value_p && *value_p)
@@ -52,8 +62,12 @@ int oncpu(struct sched_switch_args *ctx)
 
 	/* The size of stackmap and stackid_hmap should be the same */
 	key = bpf_get_stackid(ctx, &stackmap, 0);
-	if ((int)key >= 0)
+	if ((int)key >= 0) {
 		bpf_map_update_elem(&stackid_hmap, &key, &val, 0);
+		stack_p = bpf_map_lookup_elem(&stack_amap, &key);
+		if (stack_p)
+			bpf_get_stack(ctx, stack_p, max_len, 0);
+	}
 
 	return 0;
 }
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v5 02/10] bpf: add bpf_get_stack helper
From: Yonghong Song @ 2018-04-23 17:54 UTC (permalink / raw)
  To: ast, daniel, netdev, ecree; +Cc: kernel-team
In-Reply-To: <20180423175417.4104464-1-yhs@fb.com>

Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table,
so some stack traces are missing from user perspective.

This patch implements a new helper, bpf_get_stack, will
send stack traces directly to bpf program. The bpf program
is able to see all stack traces, and then can do in-kernel
processing or send stack traces to user space through
shared map or bpf_perf_event_output.

Acked-by: Alexei Starovoitov <ast@fb.com>
Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/linux/bpf.h      |  1 +
 include/linux/filter.h   |  3 ++-
 include/uapi/linux/bpf.h | 19 ++++++++++++--
 kernel/bpf/core.c        |  5 ++++
 kernel/bpf/stackmap.c    | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c     | 10 ++++++++
 kernel/bpf/verifier.c    |  3 +++
 kernel/trace/bpf_trace.c | 50 +++++++++++++++++++++++++++++++++++-
 8 files changed, 154 insertions(+), 4 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ee5275e..2c520b4 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -690,6 +690,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_push_proto;
 extern const struct bpf_func_proto bpf_skb_vlan_pop_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
+extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 
 /* Shared helpers among cBPF and eBPF. */
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4da8b23..044d30e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -468,7 +468,8 @@ struct bpf_prog {
 				dst_needed:1,	/* Do we need dst entry? */
 				blinded:1,	/* Was blinded */
 				is_func:1,	/* program is a bpf function */
-				kprobe_override:1; /* Do we override a kprobe? */
+				kprobe_override:1, /* Do we override a kprobe? */
+				need_callchain_buf:1; /* Needs callchain buffer? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index c8383a2..470f3a2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -529,6 +529,17 @@ union bpf_attr {
  *             other bits - reserved
  *     Return: >= 0 stackid on success or negative error
  *
+ * int bpf_get_stack(ctx, buf, size, flags)
+ *     walk user or kernel stack and store the ips in buf
+ *     @ctx: struct pt_regs*
+ *     @buf: user buffer to fill stack
+ *     @size: the buf size
+ *     @flags: bits 0-7 - numer of stack frames to skip
+ *             bit 8 - collect user stack instead of kernel
+ *             bit 11 - get build-id as well if user stack
+ *             other bits - reserved
+ *     Return: >= 0 size copied on success or negative error
+ *
  * s64 bpf_csum_diff(from, from_size, to, to_size, seed)
  *     calculate csum diff
  *     @from: raw from buffer
@@ -841,7 +852,8 @@ union bpf_attr {
 	FN(msg_cork_bytes),		\
 	FN(msg_pull_data),		\
 	FN(bind),			\
-	FN(xdp_adjust_tail),
+	FN(xdp_adjust_tail),		\
+	FN(get_stack),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -875,11 +887,14 @@ enum bpf_func_id {
 /* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */
 #define BPF_F_TUNINFO_IPV6		(1ULL << 0)
 
-/* BPF_FUNC_get_stackid flags. */
+/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */
 #define BPF_F_SKIP_FIELD_MASK		0xffULL
 #define BPF_F_USER_STACK		(1ULL << 8)
+/* flags used by BPF_FUNC_get_stackid only. */
 #define BPF_F_FAST_STACK_CMP		(1ULL << 9)
 #define BPF_F_REUSE_STACKID		(1ULL << 10)
+/* flags used by BPF_FUNC_get_stack only. */
+#define BPF_F_USER_BUILD_ID		(1ULL << 11)
 
 /* BPF_FUNC_skb_set_tunnel_key flags. */
 #define BPF_F_ZERO_CSUM_TX		(1ULL << 1)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d315b39..bf22eca 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -31,6 +31,7 @@
 #include <linux/rbtree_latch.h>
 #include <linux/kallsyms.h>
 #include <linux/rcupdate.h>
+#include <linux/perf_event.h>
 
 #include <asm/unaligned.h>
 
@@ -1709,6 +1710,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
 	aux = container_of(work, struct bpf_prog_aux, work);
 	if (bpf_prog_is_dev_bound(aux))
 		bpf_prog_offload_destroy(aux->prog);
+#ifdef CONFIG_PERF_EVENTS
+	if (aux->prog->need_callchain_buf)
+		put_callchain_buffers();
+#endif
 	for (i = 0; i < aux->func_cnt; i++)
 		bpf_jit_free(aux->func[i]);
 	if (aux->func_cnt) {
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 04f6ec1..4477cf6 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -402,6 +402,73 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size,
+	   u64, flags)
+{
+	u32 init_nr, trace_nr, copy_len, elem_size, num_elem;
+	bool user_build_id = flags & BPF_F_USER_BUILD_ID;
+	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
+	bool user = flags & BPF_F_USER_STACK;
+	struct perf_callchain_entry *trace;
+	bool kernel = !user;
+	int err = -EINVAL;
+	u64 *ips;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_USER_BUILD_ID)))
+		goto clear;
+	if (kernel && user_build_id)
+		goto clear;
+
+	elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id)
+					    : sizeof(u64);
+	if (unlikely(size % elem_size))
+		goto clear;
+
+	num_elem = size / elem_size;
+	if (sysctl_perf_event_max_stack < num_elem)
+		init_nr = 0;
+	else
+		init_nr = sysctl_perf_event_max_stack - num_elem;
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
+	if (unlikely(!trace))
+		goto err_fault;
+
+	trace_nr = trace->nr - init_nr;
+	if (trace_nr <= skip)
+		goto err_fault;
+
+	trace_nr -= skip;
+	trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem;
+	copy_len = trace_nr * elem_size;
+	ips = trace->ip + skip + init_nr;
+	if (user && user_build_id)
+		stack_map_get_build_id_offset(buf, ips, trace_nr, user);
+	else
+		memcpy(buf, ips, copy_len);
+
+	if (size > copy_len)
+		memset(buf + copy_len, 0, size - copy_len);
+	return copy_len;
+
+err_fault:
+	err = -EFAULT;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+const struct bpf_func_proto bpf_get_stack_proto = {
+	.func		= bpf_get_stack,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 /* Called from eBPF program */
 static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
 {
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fe23dc5a..1ee71f6 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1360,6 +1360,16 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (err)
 		goto free_used_maps;
 
+	if (prog->need_callchain_buf) {
+#ifdef CONFIG_PERF_EVENTS
+		err = get_callchain_buffers(sysctl_perf_event_max_stack);
+#else
+		err = -ENOTSUPP;
+#endif
+		if (err)
+			goto free_used_maps;
+	}
+
 	err = bpf_prog_new_fd(prog);
 	if (err < 0) {
 		/* failed to allocate fd.
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 5dd1dcb..aba9425 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2460,6 +2460,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	if (err)
 		return err;
 
+	if (func_id == BPF_FUNC_get_stack)
+		env->prog->need_callchain_buf = true;
+
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d88e96d..fe8476f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -20,6 +20,7 @@
 #include "trace.h"
 
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 /**
  * trace_call_bpf - invoke BPF program
@@ -577,6 +578,8 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto;
 	case BPF_FUNC_perf_event_read_value:
 		return &bpf_perf_event_read_value_proto;
 #ifdef CONFIG_BPF_KPROBE_OVERRIDE
@@ -664,6 +667,25 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
+	   u64, flags)
+{
+	struct pt_regs *regs = *(struct pt_regs **)tp_buff;
+
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_tp = {
+	.func		= bpf_get_stack_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -672,6 +694,8 @@ tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
@@ -734,6 +758,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_tp;
 	case BPF_FUNC_perf_prog_read_value:
 		return &bpf_perf_prog_read_value_proto;
 	default:
@@ -744,7 +770,7 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 /*
  * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
  * to avoid potential recursive reuse issue when/if tracepoints are added
- * inside bpf_*_event_output and/or bpf_get_stack_id
+ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
  */
 static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
 BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
@@ -787,6 +813,26 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
+	   void *, buf, u32, size, u64, flags)
+{
+	struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
+
+	perf_fetch_caller_regs(regs);
+	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+			     (unsigned long) size, flags, 0);
+}
+
+static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+	.func		= bpf_get_stack_raw_tp,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
+	.arg4_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -795,6 +841,8 @@ raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_event_output_proto_raw_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_raw_tp;
+	case BPF_FUNC_get_stack:
+		return &bpf_get_stack_proto_raw_tp;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}
-- 
2.9.5

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox