Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v2 2/3] lan78xx: Read LED states from Device Tree
From: Phil Elwell @ 2018-04-18 15:45 UTC (permalink / raw)
  To: Woojung Huh, Microchip Linux Driver Support, Rob Herring,
	Mark Rutland, David S. Miller, Mauro Carvalho Chehab,
	Greg Kroah-Hartman, Linus Walleij, Andrew Morton, Randy Dunlap,
	netdev, devicetree, linux-kernel, linux-usb
  Cc: Phil Elwell
In-Reply-To: <1524066323-109628-1-git-send-email-phil@raspberrypi.org>

Add support for DT property "microchip,led-modes", a vector of zero
to four cells (u32s) in the range 0-15, each of which sets the mode
for one of the LEDs. Some possible values are:

    0=link/activity          1=link1000/activity
    2=link100/activity       3=link10/activity
    4=link100/1000/activity  5=link10/1000/activity
    6=link10/100/activity    14=off    15=on

These values are given symbolic constants in a dt-bindings header.

Also use the presence of the DT property to indicate that the
LEDs should be enabled - necessary in the event that no valid OTP
or EEPROM is available.

Signed-off-by: Phil Elwell <phil@raspberrypi.org>
---
 MAINTAINERS                              |  1 +
 drivers/net/usb/lan78xx.c                | 35 ++++++++++++++++++++++++++++++++
 include/dt-bindings/net/microchip-78xx.h | 21 +++++++++++++++++++
 3 files changed, 57 insertions(+)
 create mode 100644 include/dt-bindings/net/microchip-78xx.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b60179d..9c9bc63 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14573,6 +14573,7 @@ M:	Microchip Linux Driver Support <UNGLinuxDriver@microchip.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/usb/lan78xx.*
+F:	include/dt-bindings/net/microchip-78xx.h
 
 USB MASS STORAGE DRIVER
 M:	Alan Stern <stern@rowland.harvard.edu>
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index a823f01..f47ffea 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -38,6 +38,7 @@
 #include <linux/microchipphy.h>
 #include <linux/phy.h>
 #include <linux/of_net.h>
+#include <dt-bindings/net/microchip-78xx.h>
 #include "lan78xx.h"
 
 #define DRIVER_AUTHOR	"WOOJUNG HUH <woojung.huh@microchip.com>"
@@ -74,6 +75,9 @@
 #define LAN78XX_EEPROM_MAGIC		(0x78A5)
 #define LAN78XX_OTP_MAGIC		(0x78F3)
 
+/* This register is specific to the LAN7800 and LAN7850 embedded PHYs */
+#define LAN78XX_PHY_LED_MODE_SELECT	29
+
 #define	MII_READ			1
 #define	MII_WRITE			0
 
@@ -2005,6 +2009,8 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
 {
 	int ret;
 	u32 mii_adv;
+	u32 led_modes[4];
+	int len;
 	struct phy_device *phydev;
 
 	phydev = phy_find_first(dev->mdiobus);
@@ -2077,6 +2083,35 @@ static int lan78xx_phy_init(struct lan78xx_net *dev)
 	mii_adv = (u32)mii_advertise_flowctrl(dev->fc_request_control);
 	phydev->advertising |= mii_adv_to_ethtool_adv_t(mii_adv);
 
+	len = of_property_read_variable_u32_array(dev->udev->dev.of_node,
+						  "microchip,led-modes",
+						  led_modes,
+						  0,
+						  ARRAY_SIZE(led_modes));
+	if (len >= 0) {
+		u32 reg = 0;
+		int i;
+
+		for (i = 0; i < len; i++) {
+			if (led_modes[i] > 15) {
+				ret = -EINVAL;
+				goto error;
+			}
+			reg |= led_modes[i] << (i * 4);
+		}
+		for (; i < ARRAY_SIZE(led_modes); i++)
+			reg |= LAN78XX_FORCE_LED_OFF << (i * 4);
+		(void)phy_write(phydev, LAN78XX_PHY_LED_MODE_SELECT, reg);
+
+		/* Ensure the LEDs are enabled */
+		lan78xx_read_reg(dev, HW_CFG, &reg);
+		reg |= HW_CFG_LED0_EN_ | HW_CFG_LED1_EN_;
+		lan78xx_write_reg(dev, HW_CFG, reg);
+	} else if (len == -EOVERFLOW) {
+		ret = -EINVAL;
+		goto error;
+	}
+
 	genphy_config_aneg(phydev);
 
 	dev->fc_autoneg = phydev->autoneg;
diff --git a/include/dt-bindings/net/microchip-78xx.h b/include/dt-bindings/net/microchip-78xx.h
new file mode 100644
index 0000000..dcf4a43
--- /dev/null
+++ b/include/dt-bindings/net/microchip-78xx.h
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _DT_BINDINGS_MICROCHIP_LAN78XX_H
+#define _DT_BINDINGS_MICROCHIP_LAN78XX_H
+
+/* LED modes */
+
+#define LAN78XX_LINK_ACTIVITY           0
+#define LAN78XX_LINK_1000_ACTIVITY      1
+#define LAN78XX_LINK_100_ACTIVITY       2
+#define LAN78XX_LINK_10_ACTIVITY        3
+#define LAN78XX_LINK_100_1000_ACTIVITY  4
+#define LAN78XX_LINK_10_1000_ACTIVITY   5
+#define LAN78XX_LINK_10_100_ACTIVITY    6
+#define LAN78XX_DUPLEX_COLLISION        8
+#define LAN78XX_COLLISION               9
+#define LAN78XX_ACTIVITY                10
+#define LAN78XX_AUTONEG_FAULT           12
+#define LAN78XX_FORCE_LED_OFF           14
+#define LAN78XX_FORCE_LED_ON            15
+
+#endif
-- 
2.7.4

^ permalink raw reply related

* [PATCH v2 3/3] dt-bindings: Document the DT bindings for lan78xx
From: Phil Elwell @ 2018-04-18 15:45 UTC (permalink / raw)
  To: Woojung Huh, Microchip Linux Driver Support, Rob Herring,
	Mark Rutland, David S. Miller, Mauro Carvalho Chehab,
	Greg Kroah-Hartman, Linus Walleij, Andrew Morton, Randy Dunlap,
	netdev, devicetree, linux-kernel, linux-usb
  Cc: Phil Elwell
In-Reply-To: <1524066323-109628-1-git-send-email-phil@raspberrypi.org>

The Microchip LAN78XX family of devices are Ethernet controllers with
a USB interface. Despite being discoverable devices it can be useful to
be able to configure them from Device Tree, particularly in low-cost
applications without an EEPROM or programmed OTP.

Document the supported properties in a bindings file.

Signed-off-by: Phil Elwell <phil@raspberrypi.org>
---
 .../devicetree/bindings/net/microchip,lan78xx.txt  | 44 ++++++++++++++++++++++
 MAINTAINERS                                        |  1 +
 2 files changed, 45 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/microchip,lan78xx.txt

diff --git a/Documentation/devicetree/bindings/net/microchip,lan78xx.txt b/Documentation/devicetree/bindings/net/microchip,lan78xx.txt
new file mode 100644
index 0000000..fa68f9b
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/microchip,lan78xx.txt
@@ -0,0 +1,44 @@
+Microchip LAN78xx Gigabit Ethernet controller
+
+The LAN78XX devices are usually configured by programming their OTP or with
+an external EEPROM, but some platforms (e.g. Raspberry Pi 3 B+) have neither.
+The Device Tree properties, if present, override the OTP and EEPROM.
+
+Required properties:
+- compatible: Should be one of "usb424,7800", "usb424,7801" or "usb424,7850".
+
+Optional properties:
+- local-mac-address:   see ethernet.txt
+- mac-address:         see ethernet.txt
+- microchip,led-modes: a 0..4 element vector, with each element configuring
+  the operating mode of an LED. Omitted LEDs are turned off. Allowed values
+  are defined in "include/dt-bindings/net/microchip-78xx.h".
+
+Example:
+
+/* Standard configuration for a Raspberry Pi 3 B+ */
+&usb {
+	usb1@1 {
+		compatible = "usb424,2514";
+		reg = <1>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		usb1_1@1 {
+			compatible = "usb424,2514";
+			reg = <1>;
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			ethernet: usbether@1 {
+				compatible = "usb424,7800";
+				reg = <1>;
+				local-mac-address = [ 00 11 22 33 44 55 ];
+				microchip,led-modes = <
+					LAN78XX_LINK_1000_ACTIVITY
+					LAN78XX_LINK_10_100_ACTIVITY
+				>;
+			};
+		};
+	};
+};
diff --git a/MAINTAINERS b/MAINTAINERS
index 9c9bc63..5352bbb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14572,6 +14572,7 @@ M:	Woojung Huh <woojung.huh@microchip.com>
 M:	Microchip Linux Driver Support <UNGLinuxDriver@microchip.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
+F:	Documentation/devicetree/bindings/net/microchip,lan78xx.txt
 F:	drivers/net/usb/lan78xx.*
 F:	include/dt-bindings/net/microchip-78xx.h
 
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH iproute2 net-next] vxlan: fix ttl inherit behavior
From: Stephen Hemminger @ 2018-04-18 15:50 UTC (permalink / raw)
  To: Hangbin Liu; +Cc: network dev, Jiri Benc
In-Reply-To: <CAPwn2JT_e5UZg52Nmutv97DM408853i1HSEZRv+2e2Geqsma0g@mail.gmail.com>

On Wed, 18 Apr 2018 13:10:49 +0800
Hangbin Liu <liuhangbin@gmail.com> wrote:

> Hi Stephen,
> 
> The patch's subject contains fix. But the kernel feature is applied on net-next.
> So I'm not sure if iproute2 net-next is suitable. If you are OK with the patch,
> please feel free to apply it on the branch which you think is suitable.
> 
> Thanks
> Hangbin
> 
> On 18 April 2018 at 13:05, Hangbin Liu <liuhangbin@gmail.com> wrote:
> > Like kernel net-next commit 72f6d71e491e6 ("vxlan: add ttl inherit support"),
> > vxlan ttl inherit should mean inheriting the inner protocol's ttl value.
> >
> > But currently when we add vxlan with "ttl inherit", we only set ttl 0,
> > which actually means using whatever default value instead of inheriting
> > the inner protocol's ttl value.
> >
> > To distinguish between ttl inherit and ttl == 0, we add an attribute
> > IFLA_VXLAN_TTL_INHERIT when "ttl inherit" is specified, and use "ttl auto"
> > to mean "use whatever default value", the same behavior as ttl == 0.
> >
> > Reported-by: Jianlin Shi <jishi@redhat.com>
> > Suggested-by: Jiri Benc <jbenc@redhat.com>
> > Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>  

When davem merges the feature into net-next, dsa will merge this into iproute2-next.
We hold off merging into iproute2 because kernel review feedback often causes
API changes.

^ permalink raw reply

* [PATCH net-next] lan78xx: Add support to dump lan78xx registers
From: Raghuram Chary J @ 2018-04-18 15:57 UTC (permalink / raw)
  To: davem; +Cc: netdev, unglinuxdriver, woojung.huh, raghuramchary.jallipalli

In order to dump lan78xx family registers using ethtool, add
support at lan78xx driver level.

Signed-off-by: Raghuram Chary J <raghuramchary.jallipalli@microchip.com>
---
 drivers/net/usb/lan78xx.c | 54 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index 0867f7275852..e846698fa32a 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -278,6 +278,30 @@ struct lan78xx_statstage64 {
 	u64 eee_tx_lpi_time;
 };
 
+static u32 lan78xx_regs[] = {
+	ID_REV,
+	INT_STS,
+	HW_CFG,
+	PMT_CTL,
+	E2P_CMD,
+	E2P_DATA,
+	USB_STATUS,
+	VLAN_TYPE,
+	MAC_CR,
+	MAC_RX,
+	MAC_TX,
+	FLOW,
+	ERR_STS,
+	MII_ACC,
+	MII_DATA,
+	EEE_TX_LPI_REQ_DLY,
+	EEE_TW_TX_SYS,
+	EEE_TX_LPI_REM_DLY,
+	WUCSR
+};
+
+#define PHY_REG_SIZE (32 * sizeof(u32))
+
 struct lan78xx_net;
 
 struct lan78xx_priv {
@@ -1605,6 +1629,34 @@ static int lan78xx_set_pause(struct net_device *net,
 	return ret;
 }
 
+static int lan78xx_get_regs_len(struct net_device *netdev)
+{
+	if (!netdev->phydev)
+		return (sizeof(lan78xx_regs));
+	else
+		return (sizeof(lan78xx_regs) + PHY_REG_SIZE);
+}
+
+static void
+lan78xx_get_regs(struct net_device *netdev, struct ethtool_regs *regs,
+		 void *buf)
+{
+	u32 *data = buf;
+	int i, j;
+	struct lan78xx_net *dev = netdev_priv(netdev);
+
+	/* Read Device/MAC registers */
+	for (i = 0, j = 0; i < (sizeof(lan78xx_regs) / sizeof(u32)); i++, j++)
+		lan78xx_read_reg(dev, lan78xx_regs[i], &data[j]);
+
+	if (!netdev->phydev)
+		return;
+
+	/* Read PHY registers */
+	for (i = 0; i < 32; i++, j++)
+		data[j] = phy_read(netdev->phydev, i);
+}
+
 static const struct ethtool_ops lan78xx_ethtool_ops = {
 	.get_link	= lan78xx_get_link,
 	.nway_reset	= phy_ethtool_nway_reset,
@@ -1625,6 +1677,8 @@ static const struct ethtool_ops lan78xx_ethtool_ops = {
 	.set_pauseparam	= lan78xx_set_pause,
 	.get_link_ksettings = lan78xx_get_link_ksettings,
 	.set_link_ksettings = lan78xx_set_link_ksettings,
+	.get_regs_len	= lan78xx_get_regs_len,
+	.get_regs	= lan78xx_get_regs,
 };
 
 static int lan78xx_ioctl(struct net_device *netdev, struct ifreq *rq, int cmd)
-- 
2.16.2

^ permalink raw reply related

* Re: [RFC PATCH] net: bridge: multicast querier per VLAN support
From: Stephen Hemminger @ 2018-04-18 15:54 UTC (permalink / raw)
  To: Nikolay Aleksandrov; +Cc: Joachim Nilsson, netdev, roopa
In-Reply-To: <da36ee2f-d39b-d6c0-15b2-50bde81482ab@cumulusnetworks.com>

On Wed, 18 Apr 2018 16:14:26 +0300
Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:

> On 18/04/18 16:07, Joachim Nilsson wrote:
> > On Wed, Apr 18, 2018 at 03:31:57PM +0300, Nikolay Aleksandrov wrote:  
> >> On 18/04/18 15:07, Joachim Nilsson wrote:  
> >>> - First of all, is this patch useful to anyone  
> >> Obviously to us as it's based on our patch. :-)
> >> We actually recently discussed what will be needed to make it acceptable to upstream.  
> > 
> > Great! :)
> >   
> >>> - The current br_multicast.c is very complex.  The support for both IPv4
> >>>    and IPv6 is a no-brainer, but it also has #ifdef VLAN_FILTERING and
> >>>    'br->vlan_enabled' ... this has likely been discussed before, but if
> >>>    we could remove those code paths I believe what's left would be quite
> >>>    a bit easier to read and maintain.  
> >> br->vlan_enabled has a wrapper that can be used without ifdefs, as does br_vlan_find()
> >> so in short - you can remove the ifdefs and use the wrappers,  they'll degrade to always
> >> false/null when vlans are disabled.  
> > 
> > Thanks, I'll have a look at that and prepare an RFC v2!
> >   
> >>> - Many per-bridge specific multicast sysfs settings may need to have a
> >>>    corresponding per-VLAN setting, e.g. snooping, query_interval, etc.
> >>>    How should we go about that? (For status reporting I have a proposal)  
> >> We'll have to add more to the per-vlan context, but yes it has to happen.
> >> It will be only netlink interface for config/retrieval, no sysfs.  
> > 
> > Some settings are possible to do with sysfs, like multicast_query_interval
> > and ...  
> 
> We want to avoid sysfs in general; all of networking config and stats
> are moving to netlink. It is better controlled and structured for such
> changes, and also provides nice interfaces for automatic type checks etc.
> 
> Also (but a minor reason) there is no tree/entity in sysfs for the vlans
> where to add this. It will either have to be a file which does some
> format string hack (as we do currently) or we will need to add a new tree
> for them, which I'd really like to avoid for the bridge.

In general, all bridge attributes need to show in netlink and sysfs.
Sysfs is easier for scripting from languages.

^ permalink raw reply

* [PATCH] atm: iphase: fix spelling mistake: "Tansmit" -> "Transmit"
From: Colin King @ 2018-04-18 15:55 UTC (permalink / raw)
  To: Chas Williams, linux-atm-general, netdev; +Cc: kernel-janitors, linux-kernel

From: Colin Ian King <colin.king@canonical.com>

Trivial fix to spelling mistake in message text.

Signed-off-by: Colin Ian King <colin.king@canonical.com>
---
 drivers/atm/iphase.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c
index 44abb8a0a5e5..be076606d30e 100644
--- a/drivers/atm/iphase.c
+++ b/drivers/atm/iphase.c
@@ -671,7 +671,7 @@ static void ia_tx_poll (IADEV *iadev) {
           if ((vcc->pop) && (skb1->len != 0))
           {
              vcc->pop(vcc, skb1);
-             IF_EVENT(printk("Tansmit Done - skb 0x%lx return\n",
+             IF_EVENT(printk("Transmit Done - skb 0x%lx return\n",
                                                           (long)skb1);)
           }
           else 
@@ -1665,7 +1665,7 @@ static void tx_intr(struct atm_dev *dev)
 	status = readl(iadev->seg_reg+SEG_INTR_STATUS_REG);  
         if (status & TRANSMIT_DONE){
 
-           IF_EVENT(printk("Tansmit Done Intr logic run\n");)
+           IF_EVENT(printk("Transmit Done Intr logic run\n");)
            spin_lock_irqsave(&iadev->tx_lock, flags);
            ia_tx_poll(iadev);
            spin_unlock_irqrestore(&iadev->tx_lock, flags);
-- 
2.17.0

^ permalink raw reply related

* Re: [Bug 199429] New: smc_shutdown(net/smc/af_smc.c) has a UAF causing null pointer vulnerability.
From: Stephen Hemminger @ 2018-04-18 15:55 UTC (permalink / raw)
  To: Ursula Braun; +Cc: Ursula Braun, netdev
In-Reply-To: <49ed2fa7-cace-12c9-eb57-539cac783cb2@linux.ibm.com>

On Wed, 18 Apr 2018 13:46:20 +0200
Ursula Braun <ubraun@linux.ibm.com> wrote:

> On 04/18/2018 04:56 AM, Stephen Hemminger wrote:
> > This may already be fixed.
> > 
> > Begin forwarded message:
> > 
> > Date: Wed, 18 Apr 2018 01:52:59 +0000
> > From: bugzilla-daemon@bugzilla.kernel.org
> > To: stephen@networkplumber.org
> > Subject: [Bug 199429] New: smc_shutdown(net/smc/af_smc.c) has a UAF causing null pointer vulnerability.
> > 
> > 
> > https://bugzilla.kernel.org/show_bug.cgi?id=199429
> > 
> >             Bug ID: 199429
> >            Summary: smc_shutdown(net/smc/af_smc.c) has a UAF causing null
> >                     pointer vulnerability.
> >            Product: Networking
> >            Version: 2.5
> >     Kernel Version: 4.16.0-rc7
> >           Hardware: All
> >                 OS: Linux
> >               Tree: Mainline
> >             Status: NEW
> >           Severity: normal
> >           Priority: P1
> >          Component: Other
> >           Assignee: stephen@networkplumber.org
> >           Reporter: 1773876454@qq.com
> >         Regression: No
> > 
> > Created attachment 275431  
> >   --> https://bugzilla.kernel.org/attachment.cgi?id=275431&action=edit    
> > POC
> > 
> > Syzkaller hit 'general protection fault in kernel_sock_shutdown' bug.
> > 
> > NET: Registered protocol family 43  
> 
> Thanks for reporting. This fix is needed here:
> 
>  net/smc/af_smc.c |    2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> --- a/net/smc/af_smc.c
> +++ b/net/smc/af_smc.c
> @@ -1314,7 +1314,7 @@ static int smc_shutdown(struct socket *s
>  	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
>  	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
>  		goto out;
> -	if (smc->use_fallback) {
> +	if (smc->use_fallback || sk->sk_state == SMC_LISTEN) {
>  		rc = kernel_sock_shutdown(smc->clcsock, how);
>  		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
>  		if (sk->sk_shutdown == SHUTDOWN_MASK)
> 
> Kind regards, Ursula
> 

Please submit patch to linux net with proper signed-off-by and Fixes tags.
The maintainer (davem) will take care of getting this into upstream and stable.

^ permalink raw reply

* [bpf-next PATCH 0/3] Add ID to bpf_map/prog tracepoints
From: Sebastiano Miano @ 2018-04-18 15:30 UTC (permalink / raw)
  To: netdev, ast, daniel; +Cc: mingo, rostedt, brouer, fulvio.risso

The following series:
1) Add ID to both map and prog related tracepoints
2) Add a sample program that shows how to monitor and 
   filter map related events using their IDs.

---

Sebastiano Miano (3):
      bpf: add id to map tracepoint
      bpf: add id to prog tracepoint
      bpf: add sample program to trace map events


 include/trace/events/bpf.h          |   44 ++++-
 samples/bpf/Makefile                |    4 
 samples/bpf/trace_map_events_kern.c |  217 ++++++++++++++++++++++++
 samples/bpf/trace_map_events_user.c |  314 +++++++++++++++++++++++++++++++++++
 4 files changed, 569 insertions(+), 10 deletions(-)
 create mode 100644 samples/bpf/trace_map_events_kern.c
 create mode 100644 samples/bpf/trace_map_events_user.c

^ permalink raw reply

* [bpf-next PATCH 1/3] bpf: add id to map tracepoint
From: Sebastiano Miano @ 2018-04-18 15:30 UTC (permalink / raw)
  To: netdev, ast, daniel; +Cc: mingo, rostedt, brouer, fulvio.risso
In-Reply-To: <152406544226.3465.948692097697975172.stgit@localhost.localdomain>

This patch adds the map id to the bpf tracepoints
that can be used when monitoring or inspecting map
related functions.

Signed-off-by: Sebastiano Miano <sebastiano.miano@polito.it>
Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/trace/events/bpf.h |   29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h
index 1501856..d7c9726 100644
--- a/include/trace/events/bpf.h
+++ b/include/trace/events/bpf.h
@@ -130,6 +130,7 @@ TRACE_EVENT(bpf_map_create,
 		__field(u32, max_entries)
 		__field(u32, flags)
 		__field(int, ufd)
+		__field(u32, id)
 	),
 
 	TP_fast_assign(
@@ -139,9 +140,11 @@ TRACE_EVENT(bpf_map_create,
 		__entry->max_entries = map->max_entries;
 		__entry->flags       = map->map_flags;
 		__entry->ufd         = ufd;
+		__entry->id          = map->id;
 	),
 
-	TP_printk("map type=%s ufd=%d key=%u val=%u max=%u flags=%x",
+	TP_printk("id=%u type=%s ufd=%d key=%u val=%u max=%u flags=%x",
+		  __entry->id,
 		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
 		  __entry->ufd, __entry->size_key, __entry->size_value,
 		  __entry->max_entries, __entry->flags)
@@ -199,17 +202,20 @@ DECLARE_EVENT_CLASS(bpf_obj_map,
 		__field(u32, type)
 		__field(int, ufd)
 		__string(path, pname->name)
+		__field(u32, id)
 	),
 
 	TP_fast_assign(
 		__assign_str(path, pname->name);
 		__entry->type = map->map_type;
 		__entry->ufd  = ufd;
+		__entry->id   = map->id;
 	),
 
-	TP_printk("map type=%s ufd=%d path=%s",
-		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
-		  __entry->ufd, __get_str(path))
+	TP_printk("map id=%u type=%s ufd=%d path=%s",
+		__entry->id,
+		__print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
+		__entry->ufd, __get_str(path))
 );
 
 DEFINE_EVENT(bpf_obj_map, bpf_obj_pin_map,
@@ -244,6 +250,7 @@ DECLARE_EVENT_CLASS(bpf_map_keyval,
 		__dynamic_array(u8, val, map->value_size)
 		__field(bool, val_trunc)
 		__field(int, ufd)
+		__field(u32, id)
 	),
 
 	TP_fast_assign(
@@ -255,9 +262,11 @@ DECLARE_EVENT_CLASS(bpf_map_keyval,
 		__entry->val_len   = min(map->value_size, 16U);
 		__entry->val_trunc = map->value_size != __entry->val_len;
 		__entry->ufd       = ufd;
+		__entry->id        = map->id;
 	),
 
-	TP_printk("map type=%s ufd=%d key=[%s%s] val=[%s%s]",
+	TP_printk("map id=%d type=%s ufd=%d key=[%s%s] val=[%s%s]",
+		  __entry->id,
 		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
 		  __entry->ufd,
 		  __print_hex(__get_dynamic_array(key), __entry->key_len),
@@ -295,6 +304,7 @@ TRACE_EVENT(bpf_map_delete_elem,
 		__dynamic_array(u8, key, map->key_size)
 		__field(bool, key_trunc)
 		__field(int, ufd)
+		__field(u32, id)
 	),
 
 	TP_fast_assign(
@@ -303,9 +313,11 @@ TRACE_EVENT(bpf_map_delete_elem,
 		__entry->key_len   = min(map->key_size, 16U);
 		__entry->key_trunc = map->key_size != __entry->key_len;
 		__entry->ufd       = ufd;
+		__entry->id        = map->id;
 	),
 
-	TP_printk("map type=%s ufd=%d key=[%s%s]",
+	TP_printk("map id=%d type=%s ufd=%d key=[%s%s]",
+		  __entry->id,
 		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
 		  __entry->ufd,
 		  __print_hex(__get_dynamic_array(key), __entry->key_len),
@@ -327,6 +339,7 @@ TRACE_EVENT(bpf_map_next_key,
 		__field(bool, key_trunc)
 		__field(bool, key_null)
 		__field(int, ufd)
+		__field(u32, id)
 	),
 
 	TP_fast_assign(
@@ -338,9 +351,11 @@ TRACE_EVENT(bpf_map_next_key,
 		__entry->key_len   = min(map->key_size, 16U);
 		__entry->key_trunc = map->key_size != __entry->key_len;
 		__entry->ufd       = ufd;
+		__entry->id        = map->id;
 	),
 
-	TP_printk("map type=%s ufd=%d key=[%s%s] next=[%s%s]",
+	TP_printk("map id=%d type=%s ufd=%d key=[%s%s] next=[%s%s]",
+		  __entry->id,
 		  __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB),
 		  __entry->ufd,
 		  __entry->key_null ? "NULL" : __print_hex(__get_dynamic_array(key),

^ permalink raw reply related

* [bpf-next PATCH 2/3] bpf: add id to prog tracepoint
From: Sebastiano Miano @ 2018-04-18 15:30 UTC (permalink / raw)
  To: netdev, ast, daniel; +Cc: mingo, rostedt, brouer, fulvio.risso
In-Reply-To: <152406544226.3465.948692097697975172.stgit@localhost.localdomain>

This patch adds the prog id to the bpf tracepoints
that can be used when monitoring or inspecting prog
related functions.

Signed-off-by: Sebastiano Miano <sebastiano.miano@polito.it>
Suggested-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/trace/events/bpf.h |   15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h
index d7c9726..4ec19c5 100644
--- a/include/trace/events/bpf.h
+++ b/include/trace/events/bpf.h
@@ -65,16 +65,19 @@ DECLARE_EVENT_CLASS(bpf_prog_event,
 	TP_STRUCT__entry(
 		__array(u8, prog_tag, 8)
 		__field(u32, type)
+		__field(u32, id)
 	),
 
 	TP_fast_assign(
 		BUILD_BUG_ON(sizeof(__entry->prog_tag) != sizeof(prg->tag));
 		memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag));
 		__entry->type = prg->type;
+		__entry->id   = prg->aux->id;
 	),
 
-	TP_printk("prog=%s type=%s",
+	TP_printk("prog=%s id=%u type=%s",
 		  __print_hex_str(__entry->prog_tag, 8),
+		  __entry->id,
 		  __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB))
 );
 
@@ -102,6 +105,7 @@ TRACE_EVENT(bpf_prog_load,
 		__array(u8, prog_tag, 8)
 		__field(u32, type)
 		__field(int, ufd)
+		__field(u32, id)
 	),
 
 	TP_fast_assign(
@@ -109,10 +113,12 @@ TRACE_EVENT(bpf_prog_load,
 		memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag));
 		__entry->type = prg->type;
 		__entry->ufd  = ufd;
+		__entry->id   = prg->aux->id;
 	),
 
-	TP_printk("prog=%s type=%s ufd=%d",
+	TP_printk("prog=%s id=%u type=%s ufd=%d",
 		  __print_hex_str(__entry->prog_tag, 8),
+		  __entry->id,
 		  __print_symbolic(__entry->type, __PROG_TYPE_SYM_TAB),
 		  __entry->ufd)
 );
@@ -161,6 +167,7 @@ DECLARE_EVENT_CLASS(bpf_obj_prog,
 		__array(u8, prog_tag, 8)
 		__field(int, ufd)
 		__string(path, pname->name)
+		__field(u32, id)
 	),
 
 	TP_fast_assign(
@@ -168,10 +175,12 @@ DECLARE_EVENT_CLASS(bpf_obj_prog,
 		memcpy(__entry->prog_tag, prg->tag, sizeof(prg->tag));
 		__assign_str(path, pname->name);
 		__entry->ufd = ufd;
+		__entry->id  = prg->aux->id;
 	),
 
-	TP_printk("prog=%s path=%s ufd=%d",
+	TP_printk("prog=%s id=%u path=%s ufd=%d",
 		  __print_hex_str(__entry->prog_tag, 8),
+		  __entry->id,
 		  __get_str(path), __entry->ufd)
 );
 

^ permalink raw reply related

* [bpf-next PATCH 3/3] bpf: add sample program to trace map events
From: Sebastiano Miano @ 2018-04-18 15:30 UTC (permalink / raw)
  To: netdev, ast, daniel; +Cc: mingo, rostedt, brouer, fulvio.risso
In-Reply-To: <152406544226.3465.948692097697975172.stgit@localhost.localdomain>

This patch adds a sample program, called trace_map_events,
that shows how to capture map events and filter them based on
the map id.

The program accepts a list of map IDs, via the -i command line
option, and filters all the map events related to those IDs (i.e.,
map_create/update/lookup/next_key).
If no IDs are specified, all map events are listed and no filtering
is performed.

Sample usage:

 # trace_map_events -i <map_id1> -i <map_id2> -i <map_id3> ...

Signed-off-by: Sebastiano Miano <sebastiano.miano@polito.it>
---
 samples/bpf/Makefile                |    4 
 samples/bpf/trace_map_events_kern.c |  225 +++++++++++++++++++++++++
 samples/bpf/trace_map_events_user.c |  314 +++++++++++++++++++++++++++++++++++
 3 files changed, 543 insertions(+)
 create mode 100644 samples/bpf/trace_map_events_kern.c
 create mode 100644 samples/bpf/trace_map_events_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4d6a6ed..a7d52b6 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -15,6 +15,7 @@ hostprogs-y += tracex6
 hostprogs-y += tracex7
 hostprogs-y += test_probe_write_user
 hostprogs-y += trace_output
+hostprogs-y += trace_map_events
 hostprogs-y += lathist
 hostprogs-y += offwaketime
 hostprogs-y += spintest
@@ -65,6 +66,7 @@ tracex7-objs := bpf_load.o $(LIBBPF) tracex7_user.o
 load_sock_ops-objs := bpf_load.o $(LIBBPF) load_sock_ops.o
 test_probe_write_user-objs := bpf_load.o $(LIBBPF) test_probe_write_user_user.o
 trace_output-objs := bpf_load.o $(LIBBPF) trace_output_user.o
+trace_map_events-objs := bpf_load.o $(LIBBPF) trace_map_events_user.o
 lathist-objs := bpf_load.o $(LIBBPF) lathist_user.o
 offwaketime-objs := bpf_load.o $(LIBBPF) offwaketime_user.o
 spintest-objs := bpf_load.o $(LIBBPF) spintest_user.o
@@ -111,6 +113,7 @@ always += tracex7_kern.o
 always += sock_flags_kern.o
 always += test_probe_write_user_kern.o
 always += trace_output_kern.o
+always += trace_map_events_kern.o
 always += tcbpf1_kern.o
 always += tcbpf2_kern.o
 always += tc_l2_redirect_kern.o
@@ -171,6 +174,7 @@ HOSTLOADLIBES_test_cgrp2_sock2 += -lelf
 HOSTLOADLIBES_load_sock_ops += -lelf
 HOSTLOADLIBES_test_probe_write_user += -lelf
 HOSTLOADLIBES_trace_output += -lelf -lrt
+HOSTLOADLIBES_trace_map_events += -lelf -lrt
 HOSTLOADLIBES_lathist += -lelf
 HOSTLOADLIBES_offwaketime += -lelf
 HOSTLOADLIBES_spintest += -lelf
diff --git a/samples/bpf/trace_map_events_kern.c b/samples/bpf/trace_map_events_kern.c
new file mode 100644
index 0000000..f887b5b
--- /dev/null
+++ b/samples/bpf/trace_map_events_kern.c
@@ -0,0 +1,225 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2018 Politecnico di Torino, Italy
+ *
+ * Author: Sebastiano Miano <sebastiano.miano@polito.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+#include <uapi/linux/bpf.h>
+#include <linux/version.h>
+#include "bpf_helpers.h"
+
+enum map_event_type {
+	MAP_CREATE = 0,
+	MAP_UPDATE = 1,
+	MAP_LOOKUP = 2,
+	MAP_NEXT_KEY = 3
+};
+
+struct map_event_data {
+	u32 map_id;
+	enum map_event_type evnt_type;
+	u32 map_type;
+};
+
+struct bpf_map_def SEC("maps") map_event_trace = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(u32),
+	.max_entries = 64,
+};
+
+struct bpf_map_def SEC("maps") filtered_ids = {
+	.type = BPF_MAP_TYPE_HASH,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(u32),
+	.max_entries = 64,
+};
+
+struct bpf_map_def SEC("maps") filter_events = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(bool),
+	.max_entries = 1,
+};
+
+/*
+ * Tracepoint format: /sys/kernel/debug/tracing/events/bpf/bpf_map_create/format
+ * Code in:                kernel/include/trace/events/bpf.h
+ */
+struct bpf_map_create_ctx {
+	u64 pad;		// First 8 bytes are not accessible by bpf code
+	u32 type;		// offset:8;	size:4;	signed:0;
+	u32 size_key;		// offset:12;	size:4;	signed:0;
+	u32 size_value;		// offset:16;	size:4;	signed:0;
+	u32 max_entries;	// offset:20;	size:4;	signed:0;
+	u32 flags;		// offset:24;	size:4;	signed:0;
+	int ufd;		// offset:28;	size:4;	signed:1;
+	u32 id;			// offset:32;	size:4;	signed:0;
+};
+
+SEC("tracepoint/bpf/bpf_map_create")
+int trace_bpf_map_create(struct bpf_map_create_ctx *ctx)
+{
+	struct map_event_data data;
+	int cpu = bpf_get_smp_processor_id();
+	bool *filter;
+	u32 key = 0, map_id = ctx->id;
+
+	filter = bpf_map_lookup_elem(&filter_events, &key);
+	if (!filter)
+		return 1;
+
+	if (!*filter)
+		goto send_event;
+
+	/*
+	 * If the map_id is not in the list of filtered
+	 * ids we immediately return
+	 */
+	if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+		return 0;
+
+send_event:
+	data.map_id = map_id;
+	data.evnt_type = MAP_CREATE;
+	data.map_type = ctx->type;
+
+	bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+	return 0;
+}
+
+/*
+ * Tracepoint:	/sys/kernel/debug/tracing/events/bpf/bpf_map_lookup_elem/format
+ * Tracepoint:	/sys/kernel/debug/tracing/events/bpf/bpf_map_update_elem/format
+ * Code in:          kernel/include/trace/events/bpf.h
+ */
+struct bpf_map_keyval_ctx {
+	u64 pad;		// First 8 bytes are not accessible by bpf code
+	u32 type;		// offset:8;	size:4;	signed:0;
+	u32 key_len;		// offset:12;	size:4;	signed:0;
+	u32 key;		// offset:16;	size:4;	signed:0;
+	bool key_trunc;		// offset:20;	size:1;	signed:0;
+	u32 val_len;		// offset:24;	size:4;	signed:0;
+	u32 val;		// offset:28;	size:4;	signed:0;
+	bool val_trunc;		// offset:32;	size:1;	signed:0;
+	int ufd;		// offset:36;	size:4;	signed:1;
+	u32 id;			// offset:40;	size:4;	signed:0;
+};
+
+SEC("tracepoint/bpf/bpf_map_lookup_elem")
+int trace_bpf_map_lookup_elem(struct bpf_map_keyval_ctx *ctx)
+{
+	struct map_event_data data;
+	int cpu = bpf_get_smp_processor_id();
+	bool *filter;
+	u32 key = 0, map_id = ctx->id;
+
+	filter = bpf_map_lookup_elem(&filter_events, &key);
+	if (!filter)
+		return 1;
+
+	if (!*filter)
+		goto send_event;
+
+	/*
+	 * If the map_id is not in the list of filtered
+	 * ids we immediately return
+	 */
+	if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+		return 0;
+
+send_event:
+	data.map_id = map_id;
+	data.evnt_type = MAP_LOOKUP;
+	data.map_type = ctx->type;
+
+	bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+	return 0;
+}
+
+SEC("tracepoint/bpf/bpf_map_update_elem")
+int trace_bpf_map_update_elem(struct bpf_map_keyval_ctx *ctx)
+{
+	struct map_event_data data;
+	int cpu = bpf_get_smp_processor_id();
+	bool *filter;
+	u32 key = 0, map_id = ctx->id;
+
+	filter = bpf_map_lookup_elem(&filter_events, &key);
+	if (!filter)
+		return 1;
+
+	if (!*filter)
+		goto send_event;
+
+	/*
+	 * If the map_id is not in the list of filtered
+	 * ids we immediately return
+	 */
+	if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+		return 0;
+
+send_event:
+	data.map_id = map_id;
+	data.evnt_type = MAP_UPDATE;
+	data.map_type = ctx->type;
+
+	bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+	return 0;
+}
+
+/*
+ * Tracepoint:	/sys/kernel/debug/tracing/events/bpf/bpf_map_next_key/format
+ * Code in:          kernel/include/trace/events/bpf.h
+ */
+struct bpf_map_next_key_ctx {
+	u64 pad;		// First 8 bytes are not accessible by bpf code
+	u32 type;		// offset:8;	size:4;	signed:0;
+	u32 key_len;		// offset:12;	size:4;	signed:0;
+	u32 key;		// offset:16;	size:4;	signed:0;
+	u32 nxt;		// offset:20;	size:4;	signed:0;
+	bool key_trunc;		// offset:24;	size:1;	signed:0;
+	bool key_null;		// offset:25;	size:1;	signed:0;
+	int ufd;		// offset:28;	size:4;	signed:1;
+	u32 id;			// offset:32;	size:4;	signed:0;
+};
+
+SEC("tracepoint/bpf/bpf_map_next_key")
+int trace_bpf_map_next_key(struct bpf_map_next_key_ctx *ctx)
+{
+	struct map_event_data data;
+	int cpu = bpf_get_smp_processor_id();
+	bool *filter;
+	u32 key = 0, map_id = ctx->id;
+
+	filter = bpf_map_lookup_elem(&filter_events, &key);
+	if (!filter)
+		return 1;
+
+	if (!*filter)
+		goto send_event;
+
+	/*
+	 * If the map_id is not in the list of filtered
+	 * ids we immediately return
+	 */
+	if (!bpf_map_lookup_elem(&filtered_ids, &map_id))
+		return 0;
+
+send_event:
+	data.map_id = map_id;
+	data.evnt_type = MAP_NEXT_KEY;
+	data.map_type = ctx->type;
+
+	bpf_perf_event_output(ctx, &map_event_trace, cpu, &data, sizeof(data));
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/trace_map_events_user.c b/samples/bpf/trace_map_events_user.c
new file mode 100644
index 0000000..bc7447e
--- /dev/null
+++ b/samples/bpf/trace_map_events_user.c
@@ -0,0 +1,314 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright (C) 2018 Politecnico di Torino, Italy
+ *
+ * Author: Sebastiano Miano <sebastiano.miano@polito.it>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ */
+
+static const char *__desc__ =
+"Sample program to trace map related events\n"
+"The -i option allows to set the id(s) of the map you are interested in.\n"
+"If no ID is specified, all map events are listed.\n";
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/resource.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/epoll.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <signal.h>
+#include <getopt.h>
+
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "perf-sys.h"
+
+#define MAX_FILTERED_IDS 64
+
+static int *perf_fd;
+
+int epoll_fd;
+int page_size;
+int page_cnt = 8;
+volatile struct perf_event_mmap_page **readers;
+
+typedef void (*event_cb)(void *data, int size);
+
+enum map_event_type {
+	MAP_CREATE = 0,
+	MAP_UPDATE = 1,
+	MAP_LOOKUP = 2,
+	MAP_NEXT_KEY = 3
+};
+
+static void usage(char *argv[])
+{
+	printf("\nDESCRIPTION:\n%s", __desc__);
+	printf("\n");
+	printf(" Usage: %s [-i map_id1] [-i map_id2] ...\n", argv[0]);
+	printf("\n");
+}
+
+static int perf_event_mmap(int fd, int cpu)
+{
+	void *base;
+	int mmap_size;
+
+	page_size = getpagesize();
+	mmap_size = page_size * (page_cnt + 1);
+
+	base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (base == MAP_FAILED) {
+		printf("mmap err\n");
+		return -1;
+	}
+
+	readers[cpu] = base;
+	return 0;
+}
+
+static void init_bpf_perf_event_on_cpu(int cpu)
+{
+	struct perf_event_attr attr = {
+		.sample_type = PERF_SAMPLE_RAW,
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+		.sample_period = 1,
+		.wakeup_events = 1,
+	};
+	int key = cpu;
+
+	perf_fd[cpu] = sys_perf_event_open(&attr, -1, cpu, -1, 0);
+
+	assert(perf_fd[cpu] >= 0);
+	assert(perf_event_mmap(perf_fd[cpu], cpu) >= 0);
+	assert(ioctl(perf_fd[cpu], PERF_EVENT_IOC_ENABLE, 0) >= 0);
+	assert(bpf_map_update_elem(map_fd[0], &key, &perf_fd[cpu], 0) == 0);
+
+	struct epoll_event e = { .events = EPOLLIN, .data.u32 = cpu };
+
+	assert(epoll_ctl(epoll_fd, EPOLL_CTL_ADD, perf_fd[cpu], &e) == 0);
+}
+
+static int perf_event_poll(int fd, int num_cpus, struct epoll_event *events)
+{
+	return epoll_wait(fd, events, num_cpus, -1);
+}
+
+struct perf_event_sample {
+	struct perf_event_header header;
+	__u32 size;
+	char data[];
+};
+
+static void perf_event_read(event_cb fn, __u32 index)
+{
+	__u64 data_tail = readers[index]->data_tail;
+	__u64 data_head = readers[index]->data_head;
+	__u64 buffer_size = page_cnt * page_size;
+	void *base, *begin, *end;
+	char buf[256];
+
+	asm volatile("" ::: "memory"); /* in real code it should be smp_rmb() */
+	if (data_head == data_tail)
+		return;
+
+	base = ((char *)readers[index]) + page_size;
+
+	begin = base + data_tail % buffer_size;
+	end = base + data_head % buffer_size;
+
+	while (begin != end) {
+		struct perf_event_sample *e;
+
+		e = begin;
+		if (begin + e->header.size > base + buffer_size) {
+			long len = base + buffer_size - begin;
+
+			assert(len < e->header.size);
+			memcpy(buf, begin, len);
+			memcpy(buf + len, base, e->header.size - len);
+			e = (void *) buf;
+			begin = base + e->header.size - len;
+		} else if (begin + e->header.size == base + buffer_size) {
+			begin = base;
+		} else {
+			begin += e->header.size;
+		}
+
+		if (e->header.type == PERF_RECORD_SAMPLE) {
+			fn(e->data, e->size);
+		} else if (e->header.type == PERF_RECORD_LOST) {
+			struct {
+				struct perf_event_header header;
+				__u64 id;
+				__u64 lost;
+			} *lost = (void *) e;
+			printf("lost %lld events\n", lost->lost);
+		} else {
+			printf("unknown event type=%d size=%d\n",
+			       e->header.type, e->header.size);
+		}
+	}
+
+	__sync_synchronize(); /* smp_mb() */
+	readers[index]->data_tail = data_head;
+}
+
+static const char *get_event_type(enum map_event_type event)
+{
+	switch (event) {
+	case MAP_CREATE:
+		return "CREATE";
+	case MAP_LOOKUP:
+		return "LOOKUP";
+	case MAP_UPDATE:
+		return "UPDATE";
+	case MAP_NEXT_KEY:
+		return "NEXT_KEY";
+	}
+
+	return "UNKNOWN";
+}
+
+
+static void map_event_callback(void *data, int size)
+{
+	struct {
+		__u32 map_id;
+		enum map_event_type event_type;
+		__u32 map_type;
+	} *e = data;
+
+	printf("%s event for map id: %d and type: %d\n",
+	       get_event_type(e->event_type), e->map_id, e->map_type);
+}
+
+static bool init_filtered_ids_map(int num_ids, int *filtered_ids)
+{
+	int i, key, value;
+	bool filtering = false;
+	/*
+	 * I am going to put the IDs in the map. Only event related to those IDs
+	 * will be shown. The key indicates the ID of the map while the value
+	 * is not used and then is set to 0.
+	 */
+	for (i = 0; i < num_ids; i++) {
+		key = filtered_ids[i];
+		value = 0;
+		if (bpf_map_update_elem(map_fd[1], &key, &value, 0) != 0) {
+			fprintf(stderr,
+			"ERR: bpf_map_update_elem failed key:0x%X\n", key);
+		return false;
+		}
+	}
+
+	if (num_ids > 0)
+		filtering = true;
+
+	key = 0;
+	assert(bpf_map_update_elem(map_fd[2], &key, &filtering, BPF_ANY) == 0);
+	return true;
+}
+
+static bool init_perf_buffer_data_structures(int nr_cpus)
+{
+	int i;
+
+	perf_fd = malloc(sizeof(int) * nr_cpus);
+	assert(perf_fd);
+	readers = malloc(sizeof(*readers) * nr_cpus);
+	assert(readers);
+
+	epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+
+	for (i = 0; i < nr_cpus; i++) {
+		printf("Init bpf_perf_event for cpu:%d\n", i);
+		init_bpf_perf_event_on_cpu(i);
+	}
+
+	return true;
+}
+
+int main(int argc, char **argv)
+{
+	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+	int i, cnt, opt, ret = EXIT_SUCCESS;
+	char bpf_obj_file[256];
+	int num_ids = 0, nr_cpus = bpf_num_possible_cpus();
+	int filtered_ids[MAX_FILTERED_IDS];
+
+	snprintf(bpf_obj_file, sizeof(bpf_obj_file), "%s_kern.o", argv[0]);
+
+	/* Parse commands line args */
+	while ((opt = getopt(argc, argv, "hi:")) != -1) {
+		switch (opt) {
+		case 'i':
+			if (num_ids == MAX_FILTERED_IDS) {
+				printf("Reached maximum number of IDs");
+				return EXIT_FAILURE;
+			}
+			i = atoi(optarg);
+			if (!i)
+				printf("ERROR - Invalid id %s", optarg);
+			else
+				filtered_ids[num_ids++] = i;
+			break;
+		case 'h':
+		default:
+			usage(argv);
+			return EXIT_FAILURE;
+		}
+	}
+
+	if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+		perror("setrlimit(RLIMIT_MEMLOCK)");
+		return EXIT_FAILURE;
+	}
+
+	if (load_bpf_file(bpf_obj_file)) {
+		printf("ERROR - bpf_log_buf: %s", bpf_log_buf);
+		return EXIT_FAILURE;
+	}
+
+	if (!prog_fd[0]) {
+		printf("ERROR - load_bpf_file: %s\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	init_filtered_ids_map(num_ids, filtered_ids);
+	init_perf_buffer_data_structures(nr_cpus);
+
+	struct epoll_event *events = calloc(nr_cpus, sizeof(*events));
+
+	while (true) {
+		printf("Waiting for map events...\n");
+		cnt = perf_event_poll(epoll_fd, nr_cpus, events);
+		for (i = 0; i < cnt; i++)
+			perf_event_read(map_event_callback, events[i].data.u32);
+	}
+
+	free(perf_fd);
+	free(readers);
+	free(events);
+
+	return ret;
+}

^ permalink raw reply related

* Re: [PATCH] net: don't use kvzalloc for DMA memory
From: Eric Dumazet @ 2018-04-18 16:05 UTC (permalink / raw)
  To: Mikulas Patocka, David S. Miller, Eric Dumazet
  Cc: Joby Poriyath, Ben Hutchings, netdev, linux-kernel
In-Reply-To: <alpine.LRH.2.02.1804181029270.19294@file01.intranet.prod.int.rdu2.redhat.com>



On 04/18/2018 07:34 AM, Mikulas Patocka wrote:
> The patch 74d332c13b21 changes alloc_netdev_mqs to use vzalloc if kzalloc
> fails (later patches change it to kvzalloc).
> 
> The problem with this is that if the vzalloc function is actually used, 
> virtio_net doesn't work (because it expects that the extra memory should 
> be accessible with DMA-API and memory allocated with vzalloc isn't).
> 
> This patch changes it back to kzalloc and adds a warning if the allocated
> size is too large (the allocation is unreliable in this case).
> 
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> Fixes: 74d332c13b21 ("net: extend net_device allocation to vmalloc()")
> 
> ---
>  net/core/dev.c |    3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> Index: linux-2.6/net/core/dev.c
> ===================================================================
> --- linux-2.6.orig/net/core/dev.c	2018-04-16 21:08:36.000000000 +0200
> +++ linux-2.6/net/core/dev.c	2018-04-18 16:24:43.000000000 +0200
> @@ -8366,7 +8366,8 @@ struct net_device *alloc_netdev_mqs(int
>  	/* ensure 32-byte alignment of whole construct */
>  	alloc_size += NETDEV_ALIGN - 1;
>  
> -	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
> +	WARN_ON(alloc_size > PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
> +	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
>  	if (!p)
>  		return NULL;
>  
> 

Since when a net_device needs to be in DMA zone ???

I would rather fix virtio_net, this looks very suspect to me.

Each virtio_net should probably allocate the exact amount of DMA-memory it wants,
instead of expecting core networking stack to have a huge chunk of DMA-memory for everything.

^ permalink raw reply

* Re: [PATCH v2 2/3] lan78xx: Read LED states from Device Tree
From: Andrew Lunn @ 2018-04-18 16:11 UTC (permalink / raw)
  To: Phil Elwell
  Cc: Woojung Huh, Microchip Linux Driver Support, Rob Herring,
	Mark Rutland, David S. Miller, Mauro Carvalho Chehab,
	Greg Kroah-Hartman, Linus Walleij, Andrew Morton, Randy Dunlap,
	netdev, devicetree, linux-kernel, linux-usb
In-Reply-To: <1524066323-109628-3-git-send-email-phil@raspberrypi.org>

On Wed, Apr 18, 2018 at 04:45:22PM +0100, Phil Elwell wrote:
> Add support for DT property "microchip,led-modes", a vector of zero
> to four cells (u32s) in the range 0-15, each of which sets the mode
> for one of the LEDs. Some possible values are:
> 
>     0=link/activity          1=link1000/activity
>     2=link100/activity       3=link10/activity
>     4=link100/1000/activity  5=link10/1000/activity
>     6=link10/100/activity    14=off    15=on
> 
> These values are given symbolic constants in a dt-bindings header.
> 
> Also use the presence of the DT property to indicate that the
> LEDs should be enabled - necessary in the event that no valid OTP
> or EEPROM is available.

Hi Phil

As I said last week, these are PHY properties, so they should be in the PHY
node in the device tree. It should be the PHY driver which parses these
properties and configures the LEDs, not the MAC.

	   Andrew

^ permalink raw reply

* [PATCH net-next] MAINTAINERS: Direct networking documentation changes to netdev
From: Jonathan Corbet @ 2018-04-18 16:14 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Networking docs changes go through the networking tree, so patch the
MAINTAINERS file to direct authors to the right place.

Signed-off-by: Jonathan Corbet <corbet@lwn.net>
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 0a1410d5a621..bf4be491d4d9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9765,6 +9765,7 @@ F:	include/uapi/linux/net_namespace.h
 F:	tools/testing/selftests/net/
 F:	lib/net_utils.c
 F:	lib/random32.c
+F:	Documentation/networking/
 
 NETWORKING [IPSEC]
 M:	Steffen Klassert <steffen.klassert@secunet.com>
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH bpf-next v3 07/11] bpf: make tun compatible w/ bpf_xdp_adjust_tail
From: Michael S. Tsirkin @ 2018-04-18 16:16 UTC (permalink / raw)
  To: Nikita V. Shirokov
  Cc: Alexei Starovoitov, Daniel Borkmann, Jason Wang, David S. Miller,
	netdev
In-Reply-To: <20180418044223.17685-8-tehnerd@tehnerd.com>

On Tue, Apr 17, 2018 at 09:42:19PM -0700, Nikita V. Shirokov wrote:
> w/ bpf_xdp_adjust_tail helper xdp's data_end pointer could be changed as
> well (only "decrease" of pointer's location is going to be supported).
> changing of this pointer will change packet's size.
> for tun driver we need to adjust XDP_PASS handling by recalculating
> length of the packet if it was passed to the TCP/IP stack
> (in case if after xdp's prog run data_end pointer was adjusted)
> 
> Reviewed-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Nikita V. Shirokov <tehnerd@tehnerd.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>

> ---
>  drivers/net/tun.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 1e58be152d5c..901351a6ed21 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1696,6 +1696,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  			return NULL;
>  		case XDP_PASS:
>  			delta = orig_data - xdp.data;
> +			len = xdp.data_end - xdp.data;
>  			break;
>  		default:
>  			bpf_warn_invalid_xdp_action(act);
> @@ -1716,7 +1717,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  	}
>  
>  	skb_reserve(skb, pad - delta);
> -	skb_put(skb, len + delta);
> +	skb_put(skb, len);
>  	get_page(alloc_frag->page);
>  	alloc_frag->offset += buflen;
>  
> -- 
> 2.15.1

^ permalink raw reply

* Re: [RFC net-next PATCH 2/2] bpf: disallow XDP data_meta to overlap with xdp_frame area
From: Daniel Borkmann @ 2018-04-18 16:21 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov; +Cc: netdev
In-Reply-To: <152405341684.30730.9208803786283211244.stgit@firesoul>

On 04/18/2018 02:10 PM, Jesper Dangaard Brouer wrote:
> If combining xdp_adjust_head and xdp_adjust_meta, then it is possible
> to make data_meta overlap with area used by xdp_frame.  And another
> invocation of xdp_adjust_head can then clear that area, due to
> clearing of xdp_frame area.
> 
> The easiest solution I found was to simply not allow
> xdp_buff->data_meta to overlap with area used by xdp_frame.

Thanks Jesper! Trying to answer both emails in one. :) More below.

> Fixes: 6dfb970d3dbd ("xdp: avoid leaking info stored in frame data on page reuse")
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  net/core/filter.c |   11 +++++++++++
>  1 file changed, 11 insertions(+)
> 
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 15e9b5477360..e3623e741181 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2701,6 +2701,11 @@ BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
>  		     data > xdp->data_end - ETH_HLEN))
>  		return -EINVAL;
>  
> +	/* Disallow data_meta to use xdp_frame area */
> +	if (metalen > 0 &&
> +	    unlikely((data - metalen) < xdp_frame_end))
> +		return -EINVAL;
> +
>  	/* Avoid info leak, when reusing area prev used by xdp_frame */
>  	if (data < xdp_frame_end) {

Effectively, when metalen > 0, then data_meta < data pointer, so above test
on new data_meta might be better, but feels like a bit of a workaround to
handle moving data pointer but disallowing moving data_meta pointer whereas
both could be handled if we wanted to go that path.

>  		unsigned long clearlen = xdp_frame_end - data;
> @@ -2734,6 +2739,7 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
>  
>  BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
>  {
> +	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
>  	void *meta = xdp->data_meta + offset;
>  	unsigned long metalen = xdp->data - meta;
>  
> @@ -2742,6 +2748,11 @@ BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
>  	if (unlikely(meta < xdp->data_hard_start ||
>  		     meta > xdp->data))
>  		return -EINVAL;
> +
> +	/* Disallow data_meta to use xdp_frame area */
> +	if (unlikely(meta < xdp_frame_end))
> +		return -EINVAL;

(Ditto.)

>  	if (unlikely((metalen & (sizeof(__u32) - 1)) ||
>  		     (metalen > 32)))
>  		return -EACCES;

The other, perhaps less invasive/complex option would be to just disallow
moving anything into previous sizeof(struct xdp_frame) area. My original
concern was that not all drivers use 256 bytes of headroom, e.g. afaik the
i40e and ixgbe have around 192 bytes of headroom available, but that should
actually still be plenty of space for encap + meta data, and potentially
with meta data use I would expect that at best custom decap would be
happening when pushing the packet up the stack. So might as well disallow
going into that region and not worry about it. Thus, reverting e9e9545e10d3
("xdp: avoid leaking info stored in frame data on page reuse") and adding
something like the below (uncompiled), should just do it:

diff --git a/net/core/filter.c b/net/core/filter.c
index 3bb0cb9..ad98ddd 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2692,8 +2692,9 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)

 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 {
+	void *frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
 	unsigned long metalen = xdp_get_metalen(xdp);
-	void *data_start = xdp->data_hard_start + metalen;
+	void *data_start = frame_end + metalen;
 	void *data = xdp->data + offset;

 	if (unlikely(data < data_start ||
@@ -2719,13 +2720,13 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {

 BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
 {
+	void *frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
 	void *meta = xdp->data_meta + offset;
 	unsigned long metalen = xdp->data - meta;

 	if (xdp_data_meta_unsupported(xdp))
 		return -ENOTSUPP;
-	if (unlikely(meta < xdp->data_hard_start ||
-		     meta > xdp->data))
+	if (unlikely(meta < frame_end || meta > xdp->data))
 		return -EINVAL;
 	if (unlikely((metalen & (sizeof(__u32) - 1)) ||
 		     (metalen > 32)))

On top of that, we could even store a bool in struct xdp_rxq_info whether
the driver actually is able to participate in resp. has the XDP_REDIRECT
support and if not do something like:

void *frame_end = xdp->data_hard_start + (xdp->rxq->has_redir ? sizeof(struct xdp_frame) : 0);

But the latter is merely a small optimization. Eventually we want all native XDP
drivers to support it. Thoughts?

Thanks,
Daniel

^ permalink raw reply related

* Re: SRIOV switchdev mode BoF minutes
From: Jakub Kicinski @ 2018-04-18 16:26 UTC (permalink / raw)
  To: Andy Gospodarek
  Cc: Or Gerlitz, Samudrala, Sridhar, David Miller, Anjali Singhai Jain,
	Michael Chan, Simon Horman, John Fastabend, Saeed Mahameed,
	Jiri Pirko, Rony Efraim, Linux Netdev List
In-Reply-To: <20180418151529.GL33938@C02RW35GFVH8.dhcp.broadcom.net>

On Wed, 18 Apr 2018 11:15:29 -0400, Andy Gospodarek wrote:
> > A similar issue exists on multi-host for PFs, right?  If one of the
> > hosts is down do we still show their PF repr?  IMHO yes.  
> 
> I would agree with that as well.  With today's model the VF reps are
> created once a PF is put into switchdev mode, but I'm still working out
> how we want to consider whether or not a PF rep for the other domains is
> created locally or not and also how one can determine which domain is in
> control.
> 
> Permanent config options (like NVRAM settings) could easily handle which
> domain is in control, but that still does not mean that PF reps must be
> created automatically, does it?

The control domain is tricky.  I'm not sure I understand how you could
not have a PF rep for remote domains, though.  How do you configure
switching to the PF netdev if there is no rep?

^ permalink raw reply

* Re: [RFC PATCH] net: bridge: multicast querier per VLAN support
From: Nikolay Aleksandrov @ 2018-04-18 16:27 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Joachim Nilsson, netdev, roopa
In-Reply-To: <20180418085407.4f5723de@xeon-e3>

On April 18, 2018 6:54:07 PM GMT+03:00, Stephen Hemminger <stephen@networkplumber.org> wrote:
>On Wed, 18 Apr 2018 16:14:26 +0300
>Nikolay Aleksandrov <nikolay@cumulusnetworks.com> wrote:
>
>> On 18/04/18 16:07, Joachim Nilsson wrote:
>> > On Wed, Apr 18, 2018 at 03:31:57PM +0300, Nikolay Aleksandrov
>wrote:  
>> >> On 18/04/18 15:07, Joachim Nilsson wrote:  
>> >>> - First of all, is this patch useful to anyone  
>> >> Obviously to us as it's based on our patch. :-)
>> >> We actually recently discussed what will be needed to make it
>acceptable to upstream.  
>> > 
>> > Great! :)
>> >   
>> >>> - The current br_multicast.c is very complex.  The support for
>both IPv4
>> >>>    and IPv6 is a no-brainer, but it also has #ifdef
>VLAN_FILTERING and
>> >>>    'br->vlan_enabled' ... this has likely been discussed before,
>but if
>> >>>    we could remove those code paths I believe what's left would
>be quite
>> >>>    a bit easier to read and maintain.  
>> >> br->vlan_enabled has a wrapper that can be used without ifdefs, as
>does br_vlan_find()
>> >> so in short - you can remove the ifdefs and use the wrappers, 
>they'll degrade to always
>> >> false/null when vlans are disabled.  
>> > 
>> > Thanks, I'll have a look at that and prepare an RFC v2!
>> >   
>> >>> - Many per-bridge specific multicast sysfs settings may need to
>have a
>> >>>    corresponding per-VLAN setting, e.g. snooping, query_interval,
>etc.
>> >>>    How should we go about that? (For status reporting I have a
>proposal)  
>> >> We'll have to add more to the per-vlan context, but yes it has to
>happen.
>> >> It will be only netlink interface for config/retrieval, no sysfs. 
>
>> > 
>> > Some settings are possible to do with sysfs, like
>multicast_query_interval
>> > and ...  
>> 
>> We want to avoid sysfs in general, all of networking config and stats
>> are moving to netlink. It is better controlled and structured for
>such
>> changes, also provides nice interfaces for automatic  type checks
>etc.
>> 
>> Also (but a minor reason) there is no tree/entity in sysfs for the
>vlans
>> where to add this. It will either have to be a file which does some
>> format string hack (like us currently) or will need to add new tree
>for
>> them which I'd really like to avoid for the bridge.
>
>In general, all bridge attributes need to show in netlink and sysfs.
>Sysfs is easier for scripting from languages.

True, but vlans and per-vlan settings have never been exposed via sysfs, only through netlink.
I'd like to avoid adding a directory that could hold up to 4k vlans multiplied by the number of per-vlan attributes in entries.

There is already vlan config infrastructure via netlink.

^ permalink raw reply

* Re: [PATCH] net: don't use kvzalloc for DMA memory
From: Mikulas Patocka @ 2018-04-18 16:44 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Eric Dumazet, Joby Poriyath, Ben Hutchings,
	netdev, linux-kernel, Michael S. Tsirkin, Jason Wang,
	virtualization
In-Reply-To: <3e65977e-53cd-bf09-bc4b-0ce40e9091fe@gmail.com>



On Wed, 18 Apr 2018, Eric Dumazet wrote:

> 
> 
> On 04/18/2018 07:34 AM, Mikulas Patocka wrote:
> > The patch 74d332c13b21 changes alloc_netdev_mqs to use vzalloc if kzalloc
> > fails (later patches change it to kvzalloc).
> > 
> > The problem with this is that if the vzalloc function is actually used, 
> > virtio_net doesn't work (because it expects that the extra memory should 
> > be accessible with DMA-API and memory allocated with vzalloc isn't).
> > 
> > This patch changes it back to kzalloc and adds a warning if the allocated
> > size is too large (the allocation is unreliable in this case).
> > 
> > Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> > Fixes: 74d332c13b21 ("net: extend net_device allocation to vmalloc()")
> > 
> > ---
> >  net/core/dev.c |    3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> > 
> > Index: linux-2.6/net/core/dev.c
> > ===================================================================
> > --- linux-2.6.orig/net/core/dev.c	2018-04-16 21:08:36.000000000 +0200
> > +++ linux-2.6/net/core/dev.c	2018-04-18 16:24:43.000000000 +0200
> > @@ -8366,7 +8366,8 @@ struct net_device *alloc_netdev_mqs(int
> >  	/* ensure 32-byte alignment of whole construct */
> >  	alloc_size += NETDEV_ALIGN - 1;
> >  
> > -	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
> > +	WARN_ON(alloc_size > PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
> > +	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
> >  	if (!p)
> >  		return NULL;
> >  
> > 
> 
> Since when a net_device needs to be in DMA zone ???
> 
> I would rather fix virtio_net, this looks very suspect to me.
> 
> Each virtio_net should probably allocate the exact amount of DMA-memory it wants,
> instead of expecting core networking stack to have a huge chunk of DMA-memory for everything.

The structure net_device is followed by arbitrary driver-specific data 
(accessible with the function netdev_priv). And for virtio-net, these 
driver-specific data must be in DMA memory.

Mikulas

^ permalink raw reply

* Re: [net PATCH v2] net: sched, fix OOO packets with pfifo_fast
From: John Fastabend @ 2018-04-18 16:44 UTC (permalink / raw)
  To: Paolo Abeni, Cong Wang
  Cc: Eric Dumazet, Jiri Pirko, David Miller,
	Linux Kernel Network Developers
In-Reply-To: <1524036512.2599.4.camel@redhat.com>

On 04/18/2018 12:28 AM, Paolo Abeni wrote:
> Hi,
> 
> let me revive this old thread...
> 
> On Mon, 2018-03-26 at 11:16 -0700, John Fastabend wrote:
>> On 03/26/2018 10:30 AM, Cong Wang wrote:
>>> On Sat, Mar 24, 2018 at 10:25 PM, John Fastabend
>>> <john.fastabend@gmail.com> wrote:
>>>> After the qdisc lock was dropped in pfifo_fast we allow multiple
>>>> enqueue threads and dequeue threads to run in parallel. On the
>>>> enqueue side the skb bit ooo_okay is used to ensure all related
>>>> skbs are enqueued in-order. On the dequeue side though there is
>>>> no similar logic. What we observe is with fewer queues than CPUs
>>>> it is possible to re-order packets when two instances of
>>>> __qdisc_run() are running in parallel. Each thread will dequeue
>>>> a skb and then whichever thread calls the ndo op first will
>>>> be sent on the wire. This doesn't typically happen because
>>>> qdisc_run() is usually triggered by the same core that did the
>>>> enqueue. However, drivers will trigger __netif_schedule()
>>>> when queues are transitioning from stopped to awake using the
>>>> netif_tx_wake_* APIs. When this happens netif_schedule() calls
>>>> qdisc_run() on the same CPU that did the netif_tx_wake_* which
>>>> is usually done in the interrupt completion context. This CPU
>>>> is selected with the irq affinity which is unrelated to the
>>>> enqueue operations.
>>>
>>> Interesting. Why this is unique to pfifo_fast? For me it could
>>> happen to other qdisc's too, when we release the qdisc root
>>> lock in sch_direct_xmit(), another CPU could dequeue from
>>> the same qdisc and transmit the skb in parallel too?
>>>
>>
>> Agreed, my guess is it never happens because the timing is
>> tighter in the lock case. Or if it is happening its infrequent
>> enough that no one noticed the OOO packets.
> 
> I think the above could not happend due to the qdisc seqlock - which is
> not acquired by NOLOCK qdiscs.
> 

Yep, seems to be the case.

>> For net-next we probably could clean this up. I was just
>> going for something simple in net that didn't penalize all
>> qdiscs as Eric noted. This patch doesn't make it any worse
>> at least. And we have been living with the above race for
>> years.
> 
> I've benchmarked this patch is some different scenario, and in my
> testing it introduces a measurable regression in uncontended/lightly
> contended scenarios. The measured peak negative delta is with a pktgen
> thread using "xmit_mode queue_xmit":
> 
> before: 27674032 pps
> after: 23809052 pps

Yeah more atomic ops :/

> 
> I spend some time searching a way to improve this, without success.
> 
> John, did you had any chance to look at this again?
> 

If we have multiple cores pulling from the same skb list and
feeding the same txq, this happens. One problem is that even if the
normal dev_queue_xmit path is aligned, drivers call netif_schedule
from interrupt context, and that happens on an arbitrary cpu. When
the arbitrary cpu runs the netif_schedule logic, it will dequeue
from the skb list using the cpu it was scheduled on.

The lockless case is not _really_ lockless after this patch; we
have managed to pull apart the enqueue and dequeue serialization,
though.

Thanks for bringing this up. I'll think about it for a bit; maybe
there is something we can do here. There is a set of conditions
that if met we can run without the lock. Possibly ONETXQUEUE and
aligned cpu_map is sufficient. We could detect this case and drop
the locking. For existing systems and high Gbps NICs I think (feel
free to correct me) assuming a core per cpu is OK. At some point
though we probably need to revisit this assumption.

.John

> Thanks,
> 
> Paolo
> 

^ permalink raw reply

* Re: [PATCH] net: don't use kvzalloc for DMA memory
From: Eric Dumazet @ 2018-04-18 16:51 UTC (permalink / raw)
  To: Mikulas Patocka, Eric Dumazet
  Cc: David S. Miller, Eric Dumazet, Joby Poriyath, Ben Hutchings,
	netdev, linux-kernel, Michael S. Tsirkin, Jason Wang,
	virtualization
In-Reply-To: <alpine.LRH.2.02.1804181218270.19136@file01.intranet.prod.int.rdu2.redhat.com>



On 04/18/2018 09:44 AM, Mikulas Patocka wrote:
> 
> 
> On Wed, 18 Apr 2018, Eric Dumazet wrote:
> 
>>
>>
>> On 04/18/2018 07:34 AM, Mikulas Patocka wrote:
>>> The patch 74d332c13b21 changes alloc_netdev_mqs to use vzalloc if kzalloc
>>> fails (later patches change it to kvzalloc).
>>>
>>> The problem with this is that if the vzalloc function is actually used, 
>>> virtio_net doesn't work (because it expects that the extra memory should 
>>> be accessible with DMA-API and memory allocated with vzalloc isn't).
>>>
>>> This patch changes it back to kzalloc and adds a warning if the allocated
>>> size is too large (the allocation is unreliable in this case).
>>>
>>> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
>>> Fixes: 74d332c13b21 ("net: extend net_device allocation to vmalloc()")
>>>
>>> ---
>>>  net/core/dev.c |    3 ++-
>>>  1 file changed, 2 insertions(+), 1 deletion(-)
>>>
>>> Index: linux-2.6/net/core/dev.c
>>> ===================================================================
>>> --- linux-2.6.orig/net/core/dev.c	2018-04-16 21:08:36.000000000 +0200
>>> +++ linux-2.6/net/core/dev.c	2018-04-18 16:24:43.000000000 +0200
>>> @@ -8366,7 +8366,8 @@ struct net_device *alloc_netdev_mqs(int
>>>  	/* ensure 32-byte alignment of whole construct */
>>>  	alloc_size += NETDEV_ALIGN - 1;
>>>  
>>> -	p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
>>> +	WARN_ON(alloc_size > PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
>>> +	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
>>>  	if (!p)
>>>  		return NULL;
>>>  
>>>
>>
>> Since when a net_device needs to be in DMA zone ???
>>
>> I would rather fix virtio_net, this looks very suspect to me.
>>
>> Each virtio_net should probably allocate the exact amount of DMA-memory it wants,
>> instead of expecting core networking stack to have a huge chunk of DMA-memory for everything.
> 
> The structure net_device is followed by arbitrary driver-specific data 
> (accessible with the function netdev_priv). And for virtio-net, these 
> driver-specific data must be in DMA memory.

I get that, but how will the original xenvif problem be solved?

Your patch would add a bug in some other driver(s)

I suggest that virtio_net clearly identifies which part needs a specific allocation
and does it itself, instead of abusing the netdev_priv storage.

I.e., use a pointer to a block of memory, allocated by virtio_net, for virtio_net.

^ permalink raw reply

* [PATCH bpf-next v2 1/9] bpf: change prototype for stack_map_get_build_id_offset
From: Yonghong Song @ 2018-04-18 16:54 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180418165444.2263237-1-yhs@fb.com>

This patch does not introduce any functional change. The function prototype
is changed so that the same function can be reused later.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/stackmap.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 57eeb12..04f6ec1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -262,16 +262,11 @@ static int stack_map_get_build_id(struct vm_area_struct *vma,
 	return ret;
 }
 
-static void stack_map_get_build_id_offset(struct bpf_map *map,
-					  struct stack_map_bucket *bucket,
+static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 					  u64 *ips, u32 trace_nr, bool user)
 {
 	int i;
 	struct vm_area_struct *vma;
-	struct bpf_stack_build_id *id_offs;
-
-	bucket->nr = trace_nr;
-	id_offs = (struct bpf_stack_build_id *)bucket->data;
 
 	/*
 	 * We cannot do up_read() in nmi context, so build_id lookup is
@@ -361,8 +356,10 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 			pcpu_freelist_pop(&smap->freelist);
 		if (unlikely(!new_bucket))
 			return -ENOMEM;
-		stack_map_get_build_id_offset(map, new_bucket, ips,
-					      trace_nr, user);
+		new_bucket->nr = trace_nr;
+		stack_map_get_build_id_offset(
+			(struct bpf_stack_build_id *)new_bucket->data,
+			ips, trace_nr, user);
 		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
 		if (hash_matches && bucket->nr == trace_nr &&
 		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v2 0/9] bpf: add bpf_get_stack helper
From: Yonghong Song @ 2018-04-18 16:54 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team

Currently, stackmap and bpf_get_stackid helper are provided
for bpf program to get the stack trace. This approach has
a limitation though. If two stack traces have the same hash,
only one will get stored in the stackmap table regardless of
whether BPF_F_REUSE_STACKID is specified or not,
so some stack traces may be missing from user perspective.

This patch set implements a new helper, bpf_get_stack, which
sends stack traces directly to the bpf program. The bpf program
is able to see all stack traces, and can then do in-kernel
processing or send stack traces to user space through
a shared map or bpf_perf_event_output.

Patches #1 and #2 implemented the core kernel support.
Patches #3 and #4 are two verifier improvements to make
bpf programming easier. Patch #5 synced the new helper
to tools headers. Patches #6 and #7 added a test in
samples/bpf by attaching to a kprobe. Patch #8 added
a verifier test in tools/bpf for new verifier change
and Patch #9 added a test by attaching to a tracepoint.

Changelogs:
  v1 -> v2:
    . fix compilation error when CONFIG_PERF_EVENTS is not enabled

Yonghong Song (9):
  bpf: change prototype for stack_map_get_build_id_offset
  bpf: add bpf_get_stack helper
  bpf/verifier: refine retval R0 state for bpf_get_stack helper
  bpf/verifier: improve register value range tracking with ARSH
  tools/bpf: add bpf_get_stack helper to tools headers
  samples/bpf: move common-purpose perf_event functions to bpf_load.c
  samples/bpf: add a test for bpf_get_stack helper
  tools/bpf: add a verifier test case for bpf_get_stack helper and ARSH
  tools/bpf: add a test_progs test case for bpf_get_stack helper

 include/linux/bpf.h                               |   1 +
 include/linux/filter.h                            |   3 +-
 include/uapi/linux/bpf.h                          |  19 ++-
 kernel/bpf/core.c                                 |   5 +
 kernel/bpf/stackmap.c                             |  80 ++++++++++--
 kernel/bpf/syscall.c                              |  10 ++
 kernel/bpf/verifier.c                             |  35 ++++-
 kernel/trace/bpf_trace.c                          |  50 +++++++-
 samples/bpf/Makefile                              |   4 +
 samples/bpf/bpf_load.c                            | 104 +++++++++++++++
 samples/bpf/bpf_load.h                            |   5 +
 samples/bpf/trace_get_stack_kern.c                |  86 +++++++++++++
 samples/bpf/trace_get_stack_user.c                | 150 ++++++++++++++++++++++
 samples/bpf/trace_output_user.c                   | 113 ++--------------
 tools/include/uapi/linux/bpf.h                    |  19 ++-
 tools/testing/selftests/bpf/bpf_helpers.h         |   2 +
 tools/testing/selftests/bpf/test_progs.c          |  41 +++++-
 tools/testing/selftests/bpf/test_stacktrace_map.c |  20 ++-
 tools/testing/selftests/bpf/test_verifier.c       |  45 +++++++
 19 files changed, 669 insertions(+), 123 deletions(-)
 create mode 100644 samples/bpf/trace_get_stack_kern.c
 create mode 100644 samples/bpf/trace_get_stack_user.c

-- 
2.9.5

^ permalink raw reply

* [PATCH bpf-next v2 3/9] bpf/verifier: refine retval R0 state for bpf_get_stack helper
From: Yonghong Song @ 2018-04-18 16:54 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180418165444.2263237-1-yhs@fb.com>

The special properties of the return values of the helpers bpf_get_stack
and bpf_probe_read_str are captured in the verifier.
Both helpers return a negative error code or
a length, which is equal to or smaller than the buffer
size argument. This additional information in the
verifier makes a condition such as "retval > bufsize"
unnecessary in the bpf program. For example, for the code below,
    usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
    if (usize < 0 || usize > max_len)
        return 0;
The verifier may have the following errors:
    52: (85) call bpf_get_stack#65
     R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
     R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R9_w=inv800 R10=fp0,call_-1
    53: (bf) r8 = r0
    54: (bf) r1 = r8
    55: (67) r1 <<= 32
    56: (bf) r2 = r1
    57: (77) r2 >>= 32
    58: (25) if r2 > 0x31f goto pc+33
     R0=inv(id=0) R1=inv(id=0,smax_value=9223372032559808512,
                         umax_value=18446744069414584320,
                         var_off=(0x0; 0xffffffff00000000))
     R2=inv(id=0,umax_value=799,var_off=(0x0; 0x3ff))
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R8=inv(id=0) R9=inv800 R10=fp0,call_-1
    59: (1f) r9 -= r8
    60: (c7) r1 s>>= 32
    61: (bf) r2 = r7
    62: (0f) r2 += r1
    math between map_value pointer and register with unbounded
    min value is not allowed
The failure is due to llvm compiler optimization where register "r2",
which is a copy of "r1", is tested for condition while later on "r1"
is used for map_ptr operation. The verifier is not able to track such
an instruction sequence effectively.

Without the "usize > max_len" condition, there is no llvm optimization
and the generated code below passes the verifier:
    52: (85) call bpf_get_stack#65
     R0=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R1_w=ctx(id=0,off=0,imm=0)
     R2_w=map_value(id=0,off=0,ks=4,vs=1600,imm=0) R3_w=inv800 R4_w=inv256
     R6=ctx(id=0,off=0,imm=0) R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R9_w=inv800 R10=fp0,call_-1
    53: (b7) r1 = 0
    54: (bf) r8 = r0
    55: (67) r8 <<= 32
    56: (c7) r8 s>>= 32
    57: (6d) if r1 s> r8 goto pc+24
     R0=inv(id=0,umax_value=800) R1=inv0 R6=ctx(id=0,off=0,imm=0)
     R7=map_value(id=0,off=0,ks=4,vs=1600,imm=0)
     R8=inv(id=0,umax_value=800,var_off=(0x0; 0x3ff)) R9=inv800
     R10=fp0,call_-1
    58: (bf) r2 = r7
    59: (0f) r2 += r8
    60: (1f) r9 -= r8
    61: (bf) r1 = r6

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 kernel/bpf/verifier.c | 31 ++++++++++++++++++++++++++++++-
 1 file changed, 30 insertions(+), 1 deletion(-)

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index aba9425..a8302c3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2333,10 +2333,32 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
 	return 0;
 }
 
+static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type,
+				   int func_id,
+				   struct bpf_reg_state *retval_state,
+				   bool is_check)
+{
+	struct bpf_reg_state *src_reg, *dst_reg;
+
+	if (ret_type != RET_INTEGER ||
+	    (func_id != BPF_FUNC_get_stack &&
+	     func_id != BPF_FUNC_probe_read_str))
+		return;
+
+	dst_reg = is_check ? retval_state : &regs[BPF_REG_0];
+	if (func_id == BPF_FUNC_get_stack)
+		src_reg = is_check ? &regs[BPF_REG_3] : retval_state;
+	else
+		src_reg = is_check ? &regs[BPF_REG_2] : retval_state;
+
+	dst_reg->smax_value = src_reg->smax_value;
+	dst_reg->umax_value = src_reg->umax_value;
+}
+
 static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
 	const struct bpf_func_proto *fn = NULL;
-	struct bpf_reg_state *regs;
+	struct bpf_reg_state *regs, retval_state;
 	struct bpf_call_arg_meta meta;
 	bool changes_data;
 	int i, err;
@@ -2415,6 +2437,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 	}
 
 	regs = cur_regs(env);
+
+	/* before reset caller saved regs, check special ret value */
+	do_refine_retval_range(regs, fn->ret_type, func_id, &retval_state, 1);
+
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
 		mark_reg_not_init(env, regs, caller_saved[i]);
@@ -2456,6 +2482,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return -EINVAL;
 	}
 
+	/* apply additional constraints to ret value */
+	do_refine_retval_range(regs, fn->ret_type, func_id, &retval_state, 0);
+
 	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
 	if (err)
 		return err;
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next v2 7/9] samples/bpf: add a test for bpf_get_stack helper
From: Yonghong Song @ 2018-04-18 16:54 UTC (permalink / raw)
  To: ast, daniel, netdev; +Cc: kernel-team
In-Reply-To: <20180418165444.2263237-1-yhs@fb.com>

The test attaches a kprobe program to the kernel function sys_write.
It tests getting the stack for user space, kernel space and user
space with a build_id request. It also tests getting the user
and kernel stacks into the same buffer with back-to-back
bpf_get_stack helper calls.

Whenever the kernel stack is available, the user space
application will check to ensure that sys_write/SyS_write
is part of the stack.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 samples/bpf/Makefile               |   4 +
 samples/bpf/trace_get_stack_kern.c |  86 +++++++++++++++++++++
 samples/bpf/trace_get_stack_user.c | 150 +++++++++++++++++++++++++++++++++++++
 3 files changed, 240 insertions(+)
 create mode 100644 samples/bpf/trace_get_stack_kern.c
 create mode 100644 samples/bpf/trace_get_stack_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 4d6a6ed..94e7b10 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -44,6 +44,7 @@ hostprogs-y += xdp_monitor
 hostprogs-y += xdp_rxq_info
 hostprogs-y += syscall_tp
 hostprogs-y += cpustat
+hostprogs-y += trace_get_stack
 
 # Libbpf dependencies
 LIBBPF := ../../tools/lib/bpf/bpf.o ../../tools/lib/bpf/nlattr.o
@@ -95,6 +96,7 @@ xdp_monitor-objs := bpf_load.o $(LIBBPF) xdp_monitor_user.o
 xdp_rxq_info-objs := bpf_load.o $(LIBBPF) xdp_rxq_info_user.o
 syscall_tp-objs := bpf_load.o $(LIBBPF) syscall_tp_user.o
 cpustat-objs := bpf_load.o $(LIBBPF) cpustat_user.o
+trace_get_stack-objs := bpf_load.o $(LIBBPF) trace_get_stack_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -148,6 +150,7 @@ always += xdp_rxq_info_kern.o
 always += xdp2skb_meta_kern.o
 always += syscall_tp_kern.o
 always += cpustat_kern.o
+always += trace_get_stack_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 HOSTCFLAGS += -I$(srctree)/tools/lib/
@@ -193,6 +196,7 @@ HOSTLOADLIBES_xdp_monitor += -lelf
 HOSTLOADLIBES_xdp_rxq_info += -lelf
 HOSTLOADLIBES_syscall_tp += -lelf
 HOSTLOADLIBES_cpustat += -lelf
+HOSTLOADLIBES_trace_get_stack += -lelf
 
 # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline:
 #  make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang
diff --git a/samples/bpf/trace_get_stack_kern.c b/samples/bpf/trace_get_stack_kern.c
new file mode 100644
index 0000000..665e4ad
--- /dev/null
+++ b/samples/bpf/trace_get_stack_kern.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+/* Permit pretty deep stack traces */
+#define MAX_STACK 100
+struct stack_trace_t {
+	int pid;
+	int kern_stack_size;
+	int user_stack_size;
+	int user_stack_buildid_size;
+	u64 kern_stack[MAX_STACK];
+	u64 user_stack[MAX_STACK];
+	struct bpf_stack_build_id user_stack_buildid[MAX_STACK];
+};
+
+struct bpf_map_def SEC("maps") perfmap = {
+	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+	.key_size = sizeof(int),
+	.value_size = sizeof(u32),
+	.max_entries = 2,
+};
+
+struct bpf_map_def SEC("maps") stackdata_map = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(struct stack_trace_t),
+	.max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") rawdata_map = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = MAX_STACK * sizeof(u64) * 2,
+	.max_entries = 1,
+};
+
+SEC("kprobe/sys_write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	int max_len, max_buildid_len, usize, ksize, total_size;
+	struct stack_trace_t *data;
+	void *raw_data;
+	u32 key = 0;
+
+	data = bpf_map_lookup_elem(&stackdata_map, &key);
+	if (!data)
+		return 0;
+
+	max_len = MAX_STACK * sizeof(u64);
+	max_buildid_len = MAX_STACK * sizeof(struct bpf_stack_build_id);
+	data->pid = bpf_get_current_pid_tgid();
+	data->kern_stack_size = bpf_get_stack(ctx, data->kern_stack,
+					      max_len, 0);
+	data->user_stack_size = bpf_get_stack(ctx, data->user_stack, max_len,
+					    BPF_F_USER_STACK);
+	data->user_stack_buildid_size = bpf_get_stack(
+		ctx, data->user_stack_buildid, max_buildid_len,
+		BPF_F_USER_STACK | BPF_F_USER_BUILD_ID);
+	bpf_perf_event_output(ctx, &perfmap, 0, data, sizeof(*data));
+
+	/* write both kernel and user stacks to the same buffer */
+	raw_data = bpf_map_lookup_elem(&rawdata_map, &key);
+	if (!raw_data)
+		return 0;
+
+	usize = bpf_get_stack(ctx, raw_data, max_len, BPF_F_USER_STACK);
+	if (usize < 0)
+		return 0;
+
+	ksize = bpf_get_stack(ctx, raw_data + usize, max_len - usize, 0);
+	if (ksize < 0)
+		return 0;
+
+	total_size = usize + ksize;
+	if (total_size > 0 && total_size <= max_len)
+		bpf_perf_event_output(ctx, &perfmap, 0, raw_data, total_size);
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/samples/bpf/trace_get_stack_user.c b/samples/bpf/trace_get_stack_user.c
new file mode 100644
index 0000000..f64f5a5
--- /dev/null
+++ b/samples/bpf/trace_get_stack_user.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <signal.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+#include "perf-sys.h"
+
+static int pmu_fd;
+
+#define MAX_CNT 10ull
+#define MAX_STACK 100
+struct stack_trace_t {
+	int pid;
+	int kern_stack_size;
+	int user_stack_size;
+	int user_stack_buildid_size;
+	__u64 kern_stack[MAX_STACK];
+	__u64 user_stack[MAX_STACK];
+	struct bpf_stack_build_id user_stack_buildid[MAX_STACK];
+};
+
+static void print_bpf_output(void *data, int size)
+{
+	struct stack_trace_t *e = data;
+	int i, num_stack;
+	static __u64 cnt;
+	bool found = false;
+
+	cnt++;
+
+	if (size < sizeof(struct stack_trace_t)) {
+		__u64 *raw_data = data;
+
+		num_stack = size / sizeof(__u64);
+		printf("sample size = %d, raw stack\n\t", size);
+		for (i = 0; i < num_stack; i++) {
+			struct ksym *ks = ksym_search(raw_data[i]);
+
+			printf("0x%llx ", raw_data[i]);
+			if (ks && (strcmp(ks->name, "sys_write") == 0 ||
+				   strcmp(ks->name, "SyS_write") == 0))
+				found = true;
+		}
+		printf("\n");
+	} else {
+		printf("sample size = %d, pid %d\n", size, e->pid);
+		if (e->kern_stack_size > 0) {
+			num_stack = e->kern_stack_size / sizeof(__u64);
+			printf("\tkernel_stack(%d): ", num_stack);
+			for (i = 0; i < num_stack; i++) {
+				struct ksym *ks = ksym_search(e->kern_stack[i]);
+
+				printf("0x%llx ", e->kern_stack[i]);
+				if (ks && (strcmp(ks->name, "sys_write") == 0 ||
+					   strcmp(ks->name, "SyS_write") == 0))
+					found = true;
+			}
+			printf("\n");
+		}
+		if (e->user_stack_size > 0) {
+			num_stack = e->user_stack_size / sizeof(__u64);
+			printf("\tuser_stack(%d): ", num_stack);
+			for (i = 0; i < num_stack; i++)
+				printf("0x%llx ", e->user_stack[i]);
+			printf("\n");
+		}
+		if (e->user_stack_buildid_size > 0) {
+			num_stack = e->user_stack_buildid_size /
+				    sizeof(struct bpf_stack_build_id);
+			printf("\tuser_stack_buildid(%d): ", num_stack);
+			for (i = 0; i < num_stack; i++) {
+				int j;
+
+				printf("(%d, 0x", e->user_stack_buildid[i].status);
+				for (j = 0; j < BPF_BUILD_ID_SIZE; j++)
+					printf("%02x", e->user_stack_buildid[i].build_id[i]);
+				printf(", %llx) ", e->user_stack_buildid[i].offset);
+			}
+			printf("\n");
+		}
+	}
+	if (!found) {
+		printf("received %lld events, kern symbol not found, exiting ...\n", cnt);
+		kill(0, SIGINT);
+	}
+
+	if (cnt == MAX_CNT) {
+		printf("received max %lld events, exiting ...\n", cnt);
+		kill(0, SIGINT);
+	}
+}
+
+static void test_bpf_perf_event(void)
+{
+	struct perf_event_attr attr = {
+		.sample_type = PERF_SAMPLE_RAW,
+		.type = PERF_TYPE_SOFTWARE,
+		.config = PERF_COUNT_SW_BPF_OUTPUT,
+	};
+	int key = 0;
+
+	pmu_fd = sys_perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+
+	assert(pmu_fd >= 0);
+	assert(bpf_map_update_elem(map_fd[0], &key, &pmu_fd, BPF_ANY) == 0);
+	ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+static void action(void)
+{
+	FILE *f;
+
+	f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
+	(void) f;
+}
+
+int main(int argc, char **argv)
+{
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (load_kallsyms()) {
+		printf("failed to process /proc/kallsyms\n");
+		return 2;
+	}
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	test_bpf_perf_event();
+	return perf_event_poller(pmu_fd, action, print_bpf_output);
+}
-- 
2.9.5

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox