Netdev List
 help / color / mirror / Atom feed
* [PATCH v2 net-next 2/2] net: mvpp2: support multiple comphy lanes
From: Matt Pelland @ 2019-08-08 23:06 UTC (permalink / raw)
  To: netdev; +Cc: Matt Pelland, davem, maxime.chevallier, antoine.tenart
In-Reply-To: <20190808230606.7900-1-mpelland@starry.com>

mvpp 2.2 supports RXAUI, which requires two serdes lanes, and XAUI which
requires four serdes lanes instead of the usual single lane required by other
interface modes. This patch expands the number of lanes that can be associated
to a port so that all relevant serdes lanes are correctly configured at the
appropriate times when either RXAUI or XAUI is in use.

Signed-off-by: Matt Pelland <mpelland@starry.com>
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h    |  7 +-
 .../net/ethernet/marvell/mvpp2/mvpp2_main.c   | 97 ++++++++++++++-----
 2 files changed, 77 insertions(+), 27 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
index 256e7c796631..d74f458ca099 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
@@ -655,6 +655,11 @@
 #define MVPP2_F_LOOPBACK		BIT(0)
 #define MVPP2_F_DT_COMPAT		BIT(1)
 
+/* MVPP22 supports RXAUI which requires two comphy lanes and XAUI which
+ * requires four comphy lanes. All other modes require one.
+ */
+#define MVPP22_MAX_COMPHYS		4
+
 /* Marvell tag types */
 enum mvpp2_tag_type {
 	MVPP2_TAG_TYPE_NONE = 0,
@@ -935,7 +940,7 @@ struct mvpp2_port {
 	phy_interface_t phy_interface;
 	struct phylink *phylink;
 	struct phylink_config phylink_config;
-	struct phy *comphy;
+	struct phy *comphys[MVPP22_MAX_COMPHYS];
 
 	struct mvpp2_bm_pool *pool_long;
 	struct mvpp2_bm_pool *pool_short;
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 1a5037a398fc..100972703f60 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -1200,17 +1200,40 @@ static void mvpp22_gop_setup_irq(struct mvpp2_port *port)
  */
 static int mvpp22_comphy_init(struct mvpp2_port *port)
 {
-	int ret;
+	int i, ret;
 
-	if (!port->comphy)
-		return 0;
+	for (i = 0; i < ARRAY_SIZE(port->comphys); i++) {
+		if (!port->comphys[i])
+			return 0;
 
-	ret = phy_set_mode_ext(port->comphy, PHY_MODE_ETHERNET,
-			       port->phy_interface);
-	if (ret)
-		return ret;
+		ret = phy_set_mode_ext(port->comphys[i],
+				       PHY_MODE_ETHERNET,
+				       port->phy_interface);
+		if (ret)
+			return ret;
+
+		ret = phy_power_on(port->comphys[i]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int mvpp22_comphy_deinit(struct mvpp2_port *port)
+{
+	int i, ret;
+
+	for (i = 0; i < ARRAY_SIZE(port->comphys); i++) {
+		if (!port->comphys[i])
+			return 0;
+
+		ret = phy_power_off(port->comphys[i]);
+		if (ret)
+			return ret;
+	}
 
-	return phy_power_on(port->comphy);
+	return 0;
 }
 
 static void mvpp2_port_enable(struct mvpp2_port *port)
@@ -3389,7 +3412,9 @@ static void mvpp2_stop_dev(struct mvpp2_port *port)
 
 	if (port->phylink)
 		phylink_stop(port->phylink);
-	phy_power_off(port->comphy);
+
+	if (port->priv->hw_version == MVPP22)
+		mvpp22_comphy_deinit(port);
 }
 
 static int mvpp2_check_ringparam_valid(struct net_device *dev,
@@ -4946,7 +4971,7 @@ static void mvpp2_mac_config(struct phylink_config *config, unsigned int mode,
 		port->phy_interface = state->interface;
 
 		/* Reconfigure the serdes lanes */
-		phy_power_off(port->comphy);
+		mvpp22_comphy_deinit(port);
 		mvpp22_mode_reconfigure(port);
 	}
 
@@ -5037,20 +5062,18 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 			    struct fwnode_handle *port_fwnode,
 			    struct mvpp2 *priv)
 {
-	struct phy *comphy = NULL;
-	struct mvpp2_port *port;
-	struct mvpp2_port_pcpu *port_pcpu;
+	unsigned int ntxqs, nrxqs, ncomphys, nrequired_comphys, thread;
 	struct device_node *port_node = to_of_node(port_fwnode);
+	struct mvpp2_port_pcpu *port_pcpu;
 	netdev_features_t features;
-	struct net_device *dev;
 	struct phylink *phylink;
-	char *mac_from = "";
-	unsigned int ntxqs, nrxqs, thread;
+	struct mvpp2_port *port;
 	unsigned long flags = 0;
+	struct net_device *dev;
+	int err, i, phy_mode;
+	char *mac_from = "";
 	bool has_tx_irqs;
 	u32 id;
-	int phy_mode;
-	int err, i;
 
 	has_tx_irqs = mvpp2_port_has_irqs(priv, port_node, &flags);
 	if (!has_tx_irqs && queue_mode == MVPP2_QDIST_MULTI_MODE) {
@@ -5084,14 +5107,38 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 		goto err_free_netdev;
 	}
 
+	port = netdev_priv(dev);
+
 	if (port_node) {
-		comphy = devm_of_phy_get(&pdev->dev, port_node, NULL);
-		if (IS_ERR(comphy)) {
-			if (PTR_ERR(comphy) == -EPROBE_DEFER) {
-				err = -EPROBE_DEFER;
-				goto err_free_netdev;
+		for (i = 0, ncomphys = 0; i < ARRAY_SIZE(port->comphys); i++) {
+			port->comphys[i] = devm_of_phy_get_by_index(&pdev->dev,
+								    port_node,
+								    i);
+			if (IS_ERR(port->comphys[i])) {
+				err = PTR_ERR(port->comphys[i]);
+				port->comphys[i] = NULL;
+				if (err == -EPROBE_DEFER)
+					goto err_free_netdev;
+				err = 0;
+				break;
 			}
-			comphy = NULL;
+
+			++ncomphys;
+		}
+
+		if (phy_mode == PHY_INTERFACE_MODE_XAUI)
+			nrequired_comphys = 4;
+		else if (phy_mode == PHY_INTERFACE_MODE_RXAUI)
+			nrequired_comphys = 2;
+		else
+			nrequired_comphys = 1;
+
+		if (ncomphys < nrequired_comphys) {
+			dev_err(&pdev->dev,
+				"not enough comphys to support %s\n",
+				phy_modes(phy_mode));
+			err = -EINVAL;
+			goto err_free_netdev;
 		}
 	}
 
@@ -5106,7 +5153,6 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 	dev->netdev_ops = &mvpp2_netdev_ops;
 	dev->ethtool_ops = &mvpp2_eth_tool_ops;
 
-	port = netdev_priv(dev);
 	port->dev = dev;
 	port->fwnode = port_fwnode;
 	port->has_phy = !!of_find_property(port_node, "phy", NULL);
@@ -5143,7 +5189,6 @@ static int mvpp2_port_probe(struct platform_device *pdev,
 
 	port->of_node = port_node;
 	port->phy_interface = phy_mode;
-	port->comphy = comphy;
 
 	if (priv->hw_version == MVPP21) {
 		port->base = devm_platform_ioremap_resource(pdev, 2 + id);
-- 
2.21.0


^ permalink raw reply related

* [PATCH v2 net-next 1/2] net: mvpp2: implement RXAUI support
From: Matt Pelland @ 2019-08-08 23:06 UTC (permalink / raw)
  To: netdev; +Cc: Matt Pelland, davem, maxime.chevallier, antoine.tenart
In-Reply-To: <20190808230606.7900-1-mpelland@starry.com>

Marvell's mvpp2 packet processor supports RXAUI on port zero in a
similar manner to the existing 10G protocols that have already been
implemented. This patch implements the miscellaneous extra configuration
steps required for RXAUI operation.

Signed-off-by: Matt Pelland <mpelland@starry.com>
---
 drivers/net/ethernet/marvell/mvpp2/mvpp2.h    |  1 +
 .../net/ethernet/marvell/mvpp2/mvpp2_main.c   | 32 +++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
index 4d9564ba68f6..256e7c796631 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h
@@ -481,6 +481,7 @@
 #define MVPP22_XLG_CTRL4_REG			0x184
 #define     MVPP22_XLG_CTRL4_FWD_FC		BIT(5)
 #define     MVPP22_XLG_CTRL4_FWD_PFC		BIT(6)
+#define     MVPP22_XLG_CTRL4_USE_XPCS		BIT(8)
 #define     MVPP22_XLG_CTRL4_MACMODSELECT_GMAC	BIT(12)
 #define     MVPP22_XLG_CTRL4_EN_IDLE_CHECK	BIT(14)
 
diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
index 74fd9e171865..1a5037a398fc 100644
--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
@@ -980,6 +980,7 @@ mvpp2_shared_interrupt_mask_unmask(struct mvpp2_port *port, bool mask)
 static bool mvpp2_is_xlg(phy_interface_t interface)
 {
 	return interface == PHY_INTERFACE_MODE_10GKR ||
+	       interface == PHY_INTERFACE_MODE_RXAUI ||
 	       interface == PHY_INTERFACE_MODE_XAUI;
 }
 
@@ -1020,6 +1021,29 @@ static void mvpp22_gop_init_sgmii(struct mvpp2_port *port)
 	}
 }
 
+static void mvpp22_gop_init_rxaui(struct mvpp2_port *port)
+{
+	struct mvpp2 *priv = port->priv;
+	void __iomem *xpcs;
+	u32 val;
+
+	xpcs = priv->iface_base + MVPP22_XPCS_BASE(port->gop_id);
+
+	val = readl(xpcs + MVPP22_XPCS_CFG0);
+	val &= ~MVPP22_XPCS_CFG0_RESET_DIS;
+	writel(val, xpcs + MVPP22_XPCS_CFG0);
+
+	val = readl(xpcs + MVPP22_XPCS_CFG0);
+	val &= ~(MVPP22_XPCS_CFG0_PCS_MODE(0x3) |
+		 MVPP22_XPCS_CFG0_ACTIVE_LANE(0x3));
+	val |= MVPP22_XPCS_CFG0_ACTIVE_LANE(2);
+	writel(val, xpcs + MVPP22_XPCS_CFG0);
+
+	val = readl(xpcs + MVPP22_XPCS_CFG0);
+	val |= MVPP22_XPCS_CFG0_RESET_DIS;
+	writel(val, xpcs + MVPP22_XPCS_CFG0);
+}
+
 static void mvpp22_gop_init_10gkr(struct mvpp2_port *port)
 {
 	struct mvpp2 *priv = port->priv;
@@ -1065,6 +1089,9 @@ static int mvpp22_gop_init(struct mvpp2_port *port)
 	case PHY_INTERFACE_MODE_2500BASEX:
 		mvpp22_gop_init_sgmii(port);
 		break;
+	case PHY_INTERFACE_MODE_RXAUI:
+		mvpp22_gop_init_rxaui(port);
+		break;
 	case PHY_INTERFACE_MODE_10GKR:
 		if (port->gop_id != 0)
 			goto invalid_conf;
@@ -4567,6 +4594,7 @@ static void mvpp2_phylink_validate(struct phylink_config *config,
 	switch (state->interface) {
 	case PHY_INTERFACE_MODE_10GKR:
 	case PHY_INTERFACE_MODE_XAUI:
+	case PHY_INTERFACE_MODE_RXAUI:
 		if (port->gop_id != 0)
 			goto empty_set;
 		break;
@@ -4589,6 +4617,7 @@ static void mvpp2_phylink_validate(struct phylink_config *config,
 	switch (state->interface) {
 	case PHY_INTERFACE_MODE_10GKR:
 	case PHY_INTERFACE_MODE_XAUI:
+	case PHY_INTERFACE_MODE_RXAUI:
 	case PHY_INTERFACE_MODE_NA:
 		if (port->gop_id == 0) {
 			phylink_set(mask, 10000baseT_Full);
@@ -4741,6 +4770,9 @@ static void mvpp2_xlg_config(struct mvpp2_port *port, unsigned int mode,
 		   MVPP22_XLG_CTRL4_EN_IDLE_CHECK);
 	ctrl4 |= MVPP22_XLG_CTRL4_FWD_FC | MVPP22_XLG_CTRL4_FWD_PFC;
 
+	if (state->interface == PHY_INTERFACE_MODE_RXAUI)
+		ctrl4 |= MVPP22_XLG_CTRL4_USE_XPCS;
+
 	if (old_ctrl0 != ctrl0)
 		writel(ctrl0, port->base + MVPP22_XLG_CTRL0_REG);
 	if (old_ctrl4 != ctrl4)
-- 
2.21.0


^ permalink raw reply related

* [PATCH v2 net-next 0/2] net: mvpp2: Implement RXAUI Support
From: Matt Pelland @ 2019-08-08 23:06 UTC (permalink / raw)
  To: netdev; +Cc: Matt Pelland, davem, maxime.chevallier, antoine.tenart

This patch set implements support for configuring Marvell's mvpp2 hardware for
RXAUI operation. There are two other patches necessary for this to work
correctly that concern Marvell's cp110 comphy that were emailed to the general
linux-kernel mailing list earlier on. I can post them here if need be. This
patch set was successfully tested on both a Marvell Armada 7040 based platform
as well as an Armada 8040 based platform.

Changes since v1:

- Use reverse christmas tree formatting for all modified declaration blocks.
- Bump MVPP22_MAX_COMPHYS to 4 to allow for XAUI operation.
- Implement comphy sanity checking.

Matt Pelland (2):
  net: mvpp2: implement RXAUI support
  net: mvpp2: support multiple comphy lanes

 drivers/net/ethernet/marvell/mvpp2/mvpp2.h    |   8 +-
 .../net/ethernet/marvell/mvpp2/mvpp2_main.c   | 129 ++++++++++++++----
 2 files changed, 110 insertions(+), 27 deletions(-)

-- 
2.21.0


^ permalink raw reply

* Re: [PATCH v2 15/15] dt-bindings: net: add bindings for ADIN PHY driver
From: Rob Herring @ 2019-08-08 23:03 UTC (permalink / raw)
  To: Alexandru Ardelean
  Cc: netdev, devicetree, linux-kernel@vger.kernel.org, David Miller,
	Mark Rutland, Florian Fainelli, Heiner Kallweit, Andrew Lunn
In-Reply-To: <20190808123026.17382-16-alexandru.ardelean@analog.com>

On Thu, Aug 8, 2019 at 6:31 AM Alexandru Ardelean
<alexandru.ardelean@analog.com> wrote:
>
> This change adds bindings for the Analog Devices ADIN PHY driver, detailing
> all the properties implemented by the driver.
>
> Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>
> ---
>  .../devicetree/bindings/net/adi,adin.yaml     | 76 +++++++++++++++++++
>  MAINTAINERS                                   |  1 +
>  2 files changed, 77 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/net/adi,adin.yaml
>
> diff --git a/Documentation/devicetree/bindings/net/adi,adin.yaml b/Documentation/devicetree/bindings/net/adi,adin.yaml
> new file mode 100644
> index 000000000000..86177c8fe23a
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/adi,adin.yaml
> @@ -0,0 +1,76 @@
> +# SPDX-License-Identifier: GPL-2.0+
> +%YAML 1.2
> +---
> +$id: http://devicetree.org/schemas/net/adi,adin.yaml#
> +$schema: http://devicetree.org/meta-schemas/core.yaml#
> +
> +title: Analog Devices ADIN1200/ADIN1300 PHY
> +
> +maintainers:
> +  - Alexandru Ardelean <alexandru.ardelean@analog.com>
> +
> +description: |
> +  Bindings for Analog Devices Industrial Ethernet PHYs
> +
> +allOf:
> +  - $ref: ethernet-phy.yaml#
> +
> +properties:
> +  adi,rx-internal-delay-ps:
> +    $ref: /schemas/types.yaml#/definitions/uint32
> +    description: |
> +      RGMII RX Clock Delay used only when PHY operates in RGMII mode with
> +      internal delay (phy-mode is 'rgmii-id' or 'rgmii-rxid') in pico-seconds.
> +    enum: [ 1600, 1800, 2000, 2200, 2400 ]
> +    default: 2000

This doesn't actually do what you think. The '$ref' has to be under an
'allOf' to work. It's an oddity of json-schema. However, anything with
a standard unit suffix already has a schema to define the type, so you
don't need to here and can just drop $ref.

> +
> +  adi,tx-internal-delay-ps:
> +    $ref: /schemas/types.yaml#/definitions/uint32
> +    description: |
> +      RGMII TX Clock Delay used only when PHY operates in RGMII mode with
> +      internal delay (phy-mode is 'rgmii-id' or 'rgmii-txid') in pico-seconds.
> +    enum: [ 1600, 1800, 2000, 2200, 2400 ]
> +    default: 2000
> +
> +  adi,fifo-depth-bits:
> +    $ref: /schemas/types.yaml#/definitions/uint32
> +    description: |
> +      When operating in RMII mode, this option configures the FIFO depth.
> +    enum: [ 4, 8, 12, 16, 20, 24 ]
> +    default: 8
> +
> +  adi,disable-energy-detect:
> +    description: |
> +      Disables Energy Detect Powerdown Mode (default disabled, i.e energy detect
> +      is enabled if this property is unspecified)
> +    type: boolean
> +
> +examples:
> +  - |
> +    ethernet {
> +        #address-cells = <1>;
> +        #size-cells = <0>;
> +
> +        phy-mode = "rgmii-id";
> +
> +        ethernet-phy@0 {
> +            reg = <0>;
> +
> +            adi,rx-internal-delay-ps = <1800>;
> +            adi,tx-internal-delay-ps = <2200>;
> +        };
> +    };
> +  - |
> +    ethernet {
> +        #address-cells = <1>;
> +        #size-cells = <0>;
> +
> +        phy-mode = "rmii";
> +
> +        ethernet-phy@1 {
> +            reg = <1>;
> +
> +            adi,fifo-depth-bits = <16>;
> +            adi,disable-energy-detect;
> +        };
> +    };
> diff --git a/MAINTAINERS b/MAINTAINERS
> index e8aa8a667864..fd9ab61c2670 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -944,6 +944,7 @@ L:  netdev@vger.kernel.org
>  W:     http://ez.analog.com/community/linux-device-drivers
>  S:     Supported
>  F:     drivers/net/phy/adin.c
> +F:     Documentation/devicetree/bindings/net/adi,adin.yaml
>
>  ANALOG DEVICES INC ADIS DRIVER LIBRARY
>  M:     Alexandru Ardelean <alexandru.ardelean@analog.com>
> --
> 2.20.1
>

^ permalink raw reply

* Re: [PATCH net-next] net/ncsi: allow to customize BMC MAC Address offset
From: Andrew Lunn @ 2019-08-08 23:03 UTC (permalink / raw)
  To: Tao Ren
  Cc: Jakub Kicinski, netdev@vger.kernel.org, openbmc@lists.ozlabs.org,
	linux-kernel@vger.kernel.org, Samuel Mendoza-Jonas,
	David S . Miller, William Kennington
In-Reply-To: <ac22bbe0-36ca-b4b9-7ea7-7b1741c2070d@fb.com>

> After giving it more thought, I'm thinking about adding ncsi dt node
> with following structure (mac/ncsi similar to mac/mdio/phy):
> 
> &mac0 {
>     /* MAC properties... */
> 
>     use-ncsi;

This property seems to be specific to Faraday FTGMAC100. Are you going
to make it more generic? 

>     ncsi {
>         /* ncsi level properties if any */
> 
>         package@0 {

You should get Rob Herring involved. This is not really describing
hardware, so it might get rejected by the device tree maintainer.

> 1) mac driver doesn't need to parse "mac-offset" stuff: these
> ncsi-network-controller specific settings should be parsed in ncsi
> stack.

> 2) get_bmc_mac_address command is a channel specific command, and
> technically people can configure different offset/formula for
> different channels.

Does that mean the NCSI code puts the interface into promiscuous mode?
Or at least adds these unicast MAC addresses to the MAC receive
filter? Humm, ftgmac100 only seems to support multicast address
filtering, not unicast filters, so it must be using promisc mode, if
you expect to receive frames using this MAC address.

	   Andrew

^ permalink raw reply

* Re: [PATCH net] inet: frags: re-introduce skb coalescing for local delivery
From: David Miller @ 2019-08-08 22:55 UTC (permalink / raw)
  To: gnault; +Cc: netdev, fw, edumazet, posk, alex.aring
In-Reply-To: <22d8da10c97214edd0677e6478093ad9376180ef.1564758715.git.gnault@redhat.com>

From: Guillaume Nault <gnault@redhat.com>
Date: Fri, 2 Aug 2019 17:15:03 +0200

> Before commit d4289fcc9b16 ("net: IP6 defrag: use rbtrees for IPv6
> defrag"), a netperf UDP_STREAM test[0] using big IPv6 datagrams (thus
> generating many fragments) and running over an IPsec tunnel, reported
> more than 6Gbps throughput. After that patch, the same test gets only
> 9Mbps when receiving on a be2net nic (driver can make a big difference
> here, for example, ixgbe doesn't seem to be affected).
> 
> By reusing the IPv4 defragmentation code, IPv6 lost fragment coalescing
> (IPv4 fragment coalescing was dropped by commit 14fe22e33462 ("Revert
> "ipv4: use skb coalescing in defragmentation"")).
> 
> Without fragment coalescing, be2net runs out of Rx ring entries and
> starts to drop frames (ethtool reports rx_drops_no_frags errors). Since
> the netperf traffic is only composed of UDP fragments, any lost packet
> prevents reassembly of the full datagram. Therefore, fragments which
> have no possibility to ever get reassembled pile up in the reassembly
> queue, until the memory accounting exceeds the threshold. At that point
> no fragment is accepted anymore, which effectively discards all
> netperf traffic.
> 
> When reassembly timeout expires, some stale fragments are removed from
> the reassembly queue, so a few packets can be received, reassembled
> and delivered to the netperf receiver. But the nic still drops frames
> and soon the reassembly queue gets filled again with stale fragments.
> These long time frames where no datagram can be received explain why
> the performance drop is so significant.
> 
> Re-introducing fragment coalescing is enough to get the initial
> performances again (6.6Gbps with be2net): driver doesn't drop frames
> anymore (no more rx_drops_no_frags errors) and the reassembly engine
> works at full speed.
> 
> This patch is quite conservative and only coalesces skbs for local
> IPv4 and IPv6 delivery (in order to avoid changing skb geometry when
> forwarding). Coalescing could be extended in the future if need be, as
> more scenarios would probably benefit from it.
 ...
> Signed-off-by: Guillaume Nault <gnault@redhat.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH net-next v4 2/2] net: phy: broadcom: add 1000Base-X support for BCM54616S
From: Tao Ren @ 2019-08-08 22:31 UTC (permalink / raw)
  To: Heiner Kallweit, Andrew Lunn, Florian Fainelli, David S . Miller,
	Arun Parameswaran, Justin Chen, Vladimir Oltean,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	openbmc@lists.ozlabs.org
In-Reply-To: <6d080f3e-48b9-a65d-b73e-576296e98738@gmail.com>

On 8/8/19 3:11 PM, Heiner Kallweit wrote:
> On 08.08.2019 23:47, Tao Ren wrote:
>> Hi Heiner,
>>
>> On 8/7/19 9:24 PM, Tao Ren wrote:
>>> Hi Heiner,
>>>
>>> On 8/7/19 12:18 PM, Heiner Kallweit wrote:
>>>> On 06.08.2019 23:42, Tao Ren wrote:
>>>>> Hi Andrew / Heiner / Vladimir,
>>>>>
>>>>> On 8/6/19 2:09 PM, Tao Ren wrote:
>>>>>> The BCM54616S PHY cannot work properly in RGMII->1000Base-KX mode (for
>>>>>> example, on Facebook CMM BMC platform), mainly because genphy functions
>>>>>> are designed for copper links, and 1000Base-X (clause 37) auto negotiation
>>>>>> needs to be handled differently.
>>>>>>
>>>>>> This patch enables 1000Base-X support for BCM54616S by customizing 3
>>>>>> driver callbacks:
>>>>>>
>>>>>>   - probe: probe callback detects PHY's operation mode based on
>>>>>>     INTERF_SEL[1:0] pins and 1000X/100FX selection bit in SerDES 100-FX
>>>>>>     Control register.
>>>>>>
>>>>>>   - config_aneg: bcm54616s_config_aneg_1000bx function is added for auto
>>>>>>     negotiation in 1000Base-X mode.
>>>>>>
>>>>>>   - read_status: BCM54616S and BCM5482 PHY share the same read_status
>>>>>>     callback which manually set link speed and duplex mode in 1000Base-X
>>>>>>     mode.
>>>>>>
>>>>>> Signed-off-by: Tao Ren <taoren@fb.com>
>>>>>
>>>>> I customized config_aneg function for BCM54616S 1000Base-X mode and link-down issue is also fixed: the patch is tested on Facebook CMM and Minipack BMC and everything looks normal. Please kindly review when you have bandwidth and let me know if you have further suggestions.
>>>>>
>>>>> BTW, I would be happy to help if we decide to add a set of genphy functions for clause 37, although that may mean I need more help/guidance from you :-)
>>>>
>>>> You want to have standard clause 37 aneg and this should be generic in phylib.
>>>> I hacked together a first version that is compile-tested only:
>>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__patchwork.ozlabs.org_patch_1143631_&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=iYElT7HC77pRZ3byVvW8ng&m=ZJArOJvHqNkqvs1x8l9HjfxjCN8e5xJpPz2YViBuKRA&s=EskpfBQtu9IBVeb96dv-sz76xIz4tJK5-lD4-qdIyWI&e= 
>>>> It supports fixed mode too.
>>>>
>>>> It doesn't support half duplex mode because phylib doesn't know 1000BaseX HD yet.
>>>> Not sure whether half duplex mode is used at all in reality.
>>>>
>>>> You could test the new core functions in your own config_aneg and read_status
>>>> callback implementations.
>>>
>>> Thank you very much for the help! I'm planning to add these functions but I haven't started yet because I'm still going through clause 37 :-)
>>>
>>> Let me apply your patch and run some tests on my platform. Will share the results with you tomorrow.
>>
>> The patch "net: phy: add support for clause 37 auto-negotiation" works on my CMM platform, with just 1 minor change in phy.h (I guess it's typo?). Thanks again for the help!
>>
>> -int genphy_c37_aneg_done(struct phy_device *phydev);
>> +int genphy_c37_config_aneg(struct phy_device *phydev);
>>
> Indeed, this was a typo. Thanks.
> 
>> BTW, shall I send out my patch v5 now (based on your patch)? Or I should wait till your patch is included in net-next and then send out my patch?
>>
> Adding new functions to the core is typically only acceptable if in the
> same patch series a user of the new functions is added. Therefore it's
> best if you include my patch in your series (just remove the RFC tag and
> set the From: properly).

Got it. Let me play with it (especially "From:" property) and will send out patch series soon.


Cheers,

Tao

^ permalink raw reply

* Re: [PATCH net-next] net/ncsi: allow to customize BMC MAC Address offset
From: Tao Ren @ 2019-08-08 22:26 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Jakub Kicinski, netdev@vger.kernel.org, openbmc@lists.ozlabs.org,
	linux-kernel@vger.kernel.org, Samuel Mendoza-Jonas,
	David S . Miller, William Kennington
In-Reply-To: <20190808211629.GQ27917@lunn.ch>

On 8/8/19 2:16 PM, Andrew Lunn wrote:
> On Thu, Aug 08, 2019 at 07:02:54PM +0000, Tao Ren wrote:
>> Hi Andrew,
>>
>> On 8/8/19 6:32 AM, Andrew Lunn wrote:
>>>> Let me prepare patch v2 using device tree. I'm not sure if standard
>>>> "mac-address" fits this situation because all we need is an offset
>>>> (integer) and BMC MAC is calculated by adding the offset to NIC's
>>>> MAC address. Anyways, let me work out v2 patch we can discuss more
>>>> then.
>>>
>>> Hi Tao
>>>
>>> I don't know BMC terminology. By NICs MAC address, you are referring
>>> to the hosts MAC address? The MAC address the big CPU is using for its
>>> interface?  Where does this NIC get its MAC address from? If the BMCs
>>> bootloader has access to it, it can set the mac-address property in
>>> the device tree.
>>
>> Sorry for the confusion and let me clarify more:
>>
> 
>> The NIC here refers to the Network controller which provide network
>> connectivity for both BMC (via NC-SI) and Host (for example, via
>> PCIe).
>>
> 
>> On Facebook Yamp BMC, BMC sends NCSI_OEM_GET_MAC command (as an
>> ethernet packet) to the Network Controller while bringing up eth0,
>> and the (Broadcom) Network Controller replies with the Base MAC
>> Address reserved for the platform. As for Yamp, Base-MAC and
>> Base-MAC+1 are used by Host (big CPU) and Base-MAC+2 is assigned to
>> BMC. In my opinion, Base MAC and MAC address assignments are
>> controlled by Network Controller, which is transparent to both BMC
>> and Host.
> 
> Hi Tao
> 
> I've not done any work in the BMC field, so thanks for explaining
> this.
> 
> In a typical embedded system, each network interface is assigned a MAC
> address by the vendor. But here, things are different. The BMC SoC
> network interface has not been assigned a MAC address, it needs to ask
> the network controller for its MAC address, and then do some magical
> transformation on the answer to derive a MAC address for
> itself. Correct?

Yes. It's correct.

> It seems like a better design would have been: the BMC sends a
> NCSI_OEM_GET_BMC_MAC and the answer it gets back is the MAC address
> the BMC should use. No magic involved. But i guess it is too late to
> do that now.

Some NCSI Network Controllers support such OEM command (Get Provisioned BMC MAC Address), but unfortunately it's not supported on Yamp.

>> I'm not sure if I understand your suggestion correctly: do you mean
>> we should move the logic (GET_MAC from Network Controller, adding
>> offset and configuring BMC MAC) from kernel to boot loader?
> 
> In general, the kernel is generic. It probably boots on any ARM system
> which it has the needed modules for. The bootloader is often much more
> specific. It might not be fully platform specific, but it will be at
> least specific to the general family of BMC SoCs. If you consider the
> combination of the BMC bootloader and the device tree blob, you have
> something specific to the platform. This magical transformation of
> adding 2 seems to be very platform specific. So having this magic in
> the bootloader+DT seems like the best place to put it.

I understand your concern now. Thank you for the explanation.

> However, how you pass the resulting MAC address to the kernel should
> be as generic as possible. The DT "mac-address" property is very
> generic, many MAC drivers understand it. Using it also allows for
> vendors which actually assign a MAC address to the BMC to pass it to
> the BMC, avoiding all this NCSI_OEM_GET_MAC handshake. Having an API
> which just passing '2' is not generic at all.

After giving it more thought, I'm thinking about adding ncsi dt node with following structure (mac/ncsi similar to mac/mdio/phy):

&mac0 {
    /* MAC properties... */

    use-ncsi;
    ncsi {
        /* ncsi level properties if any */

        package@0 {
            /* package level properties if any */

            channel@0 {
                /* channel level properties if any */

                bmc-mac-offset = <2>;
            };

            channel@1 {
                /* channel #1 properties */
            };
        };

        /* package #1 properties start here.. */
    };
};

The reasons behind this are:

1) mac driver doesn't need to parse "mac-offset" stuff: these ncsi-network-controller specific settings should be parsed in ncsi stack.

2) get_bmc_mac_address command is a channel specific command, and technically people can configure different offset/formula for different channels.

Any concerns or suggestions?


Thanks,

Tao

^ permalink raw reply

* Re: [PATCH net-next v4 2/2] net: phy: broadcom: add 1000Base-X support for BCM54616S
From: Heiner Kallweit @ 2019-08-08 22:11 UTC (permalink / raw)
  To: Tao Ren, Andrew Lunn, Florian Fainelli, David S . Miller,
	Arun Parameswaran, Justin Chen, Vladimir Oltean,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	openbmc@lists.ozlabs.org
In-Reply-To: <14c1591b-26e1-3a2f-f6c4-beb2c8978e41@fb.com>

On 08.08.2019 23:47, Tao Ren wrote:
> Hi Heiner,
> 
> On 8/7/19 9:24 PM, Tao Ren wrote:
>> Hi Heiner,
>>
>> On 8/7/19 12:18 PM, Heiner Kallweit wrote:
>>> On 06.08.2019 23:42, Tao Ren wrote:
>>>> Hi Andrew / Heiner / Vladimir,
>>>>
>>>> On 8/6/19 2:09 PM, Tao Ren wrote:
>>>>> The BCM54616S PHY cannot work properly in RGMII->1000Base-KX mode (for
>>>>> example, on Facebook CMM BMC platform), mainly because genphy functions
>>>>> are designed for copper links, and 1000Base-X (clause 37) auto negotiation
>>>>> needs to be handled differently.
>>>>>
>>>>> This patch enables 1000Base-X support for BCM54616S by customizing 3
>>>>> driver callbacks:
>>>>>
>>>>>   - probe: probe callback detects PHY's operation mode based on
>>>>>     INTERF_SEL[1:0] pins and 1000X/100FX selection bit in SerDES 100-FX
>>>>>     Control register.
>>>>>
>>>>>   - config_aneg: bcm54616s_config_aneg_1000bx function is added for auto
>>>>>     negotiation in 1000Base-X mode.
>>>>>
>>>>>   - read_status: BCM54616S and BCM5482 PHY share the same read_status
>>>>>     callback which manually set link speed and duplex mode in 1000Base-X
>>>>>     mode.
>>>>>
>>>>> Signed-off-by: Tao Ren <taoren@fb.com>
>>>>
>>>> I customized config_aneg function for BCM54616S 1000Base-X mode and link-down issue is also fixed: the patch is tested on Facebook CMM and Minipack BMC and everything looks normal. Please kindly review when you have bandwidth and let me know if you have further suggestions.
>>>>
>>>> BTW, I would be happy to help if we decide to add a set of genphy functions for clause 37, although that may mean I need more help/guidance from you :-)
>>>
>>> You want to have standard clause 37 aneg and this should be generic in phylib.
>>> I hacked together a first version that is compile-tested only:
>>> https://urldefense.proofpoint.com/v2/url?u=https-3A__patchwork.ozlabs.org_patch_1143631_&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=iYElT7HC77pRZ3byVvW8ng&m=ZJArOJvHqNkqvs1x8l9HjfxjCN8e5xJpPz2YViBuKRA&s=EskpfBQtu9IBVeb96dv-sz76xIz4tJK5-lD4-qdIyWI&e= 
>>> It supports fixed mode too.
>>>
>>> It doesn't support half duplex mode because phylib doesn't know 1000BaseX HD yet.
>>> Not sure whether half duplex mode is used at all in reality.
>>>
>>> You could test the new core functions in your own config_aneg and read_status
>>> callback implementations.
>>
>> Thank you very much for the help! I'm planning to add these functions but I haven't started yet because I'm still going through clause 37 :-)
>>
>> Let me apply your patch and run some tests on my platform. Will share the results with you tomorrow.
> 
> The patch "net: phy: add support for clause 37 auto-negotiation" works on my CMM platform, with just 1 minor change in phy.h (I guess it's typo?). Thanks again for the help!
> 
> -int genphy_c37_aneg_done(struct phy_device *phydev);
> +int genphy_c37_config_aneg(struct phy_device *phydev);
> 
Indeed, this was a typo. Thanks.

> BTW, shall I send out my patch v5 now (based on your patch)? Or I should wait till your patch is included in net-next and then send out my patch?
> 
Adding new functions to the core is typically only acceptable if in the
same patch series a user of the new functions is added. Therefore it's
best if you include my patch in your series (just remove the RFC tag and
set the From: properly).

> 
> Cheers,
> 
> Tao
> 
Heiner

^ permalink raw reply

* Re: KASAN: use-after-free Read in tomoyo_socket_sendmsg_permission
From: Tetsuo Handa @ 2019-08-08 22:07 UTC (permalink / raw)
  To: syzbot, syzkaller-bugs, Ralf Baechle, linux-hams; +Cc: linux-kernel, netdev
In-Reply-To: <000000000000a244b3058f9dc7d6@google.com>

On 2019/08/09 1:45, syzbot wrote:
> Hello,
> 
> syzbot found the following crash on:
> 
> HEAD commit:    107e47cc vrf: make sure skb->data contains ip header to ma..
> git tree:       net
> console output: https://syzkaller.appspot.com/x/log.txt?x=139506d8600000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=4dba67bf8b8c9ad7
> dashboard link: https://syzkaller.appspot.com/bug?extid=b91501546ab4037f685f
> compiler:       gcc (GCC) 9.0.0 20181231 (experimental)

This is not TOMOYO's bug. LSM modules expect that "struct sock" does not go away.

Also, another use-after-free (presumably on the same "struct sock") was concurrently
inflight at nr_insert_socket() in net/netrom/af_netrom.c . Thus, suspecting netrom's bug.

[  625.441058][    C0] ------------[ cut here ]------------
[  625.446837][    C0] refcount_t: increment on 0; use-after-free.
[  625.461518][    C0] WARNING: CPU: 0 PID: 0 at lib/refcount.c:156 refcount_inc_checked+0x61/0x70
[  625.479173][    C0] Kernel panic - not syncing: panic_on_warn set ...
[  625.746558][    C0] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.2.0+ #97
[  625.746575][    C0] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
[  625.755731][    C0] Call Trace:
[  625.770091][    C0]  <IRQ>
[  625.777543][    C0]  dump_stack+0x172/0x1f0
[  625.786005][    C0]  ? refcount_inc_not_zero_checked+0x1f0/0x200
[  625.794831][    C0]  panic+0x2dc/0x755
[  625.805217][    C0]  ? add_taint.cold+0x16/0x16
[  625.813697][    C0]  ? __kasan_check_write+0x14/0x20
[  625.822433][    C0]  ? __warn.cold+0x5/0x4c
[  625.832388][    C0]  ? __warn+0xe7/0x1e0
[  625.841820][    C0]  ? refcount_inc_checked+0x61/0x70
[  625.851148][    C0]  __warn.cold+0x20/0x4c
[  625.859701][    C0]  ? vprintk_emit+0x1ea/0x700
[  625.867208][    C0]  ? refcount_inc_checked+0x61/0x70
[  625.875413][    C0]  report_bug+0x263/0x2b0
[  625.884580][    C0]  do_error_trap+0x11b/0x200
[  625.893730][    C0]  do_invalid_op+0x37/0x50
[  625.902936][    C0]  ? refcount_inc_checked+0x61/0x70
[  625.911858][    C0]  invalid_op+0x14/0x20
[  625.920825][    C0] RIP: 0010:refcount_inc_checked+0x61/0x70
[  625.929407][    C0] Code: 1d 3f 6e 64 06 31 ff 89 de e8 cb d2 35 fe 84 db 75 dd e8 82 d1 35 fe 48 c7 c7 40 09 c6 87 c6 05 1f 6e 64 06 01 e8 77 39 07 fe <0f> 0b eb c1 90 90 90 90 90 90 90 90 90 90 90 55 48 89 e5 41 57 41
[  625.937608][    C0] RSP: 0018:ffff8880ae809bf0 EFLAGS: 00010282
[  625.948510][    C0] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
[  625.957237][    C0] RDX: 0000000000000100 RSI: ffffffff815c3a26 RDI: ffffed1015d01370
[  625.967249][    C0] RBP: ffff8880ae809c00 R08: ffffffff88c7a1c0 R09: fffffbfff14a775b
[  625.991542][    C0] R10: fffffbfff14a775a R11: ffffffff8a53bad7 R12: ffff8880a066f480
[  626.002193][    C0] R13: ffff8880a066f468 R14: ffff88808d69ef48 R15: ffff88808d69ef20
[  626.014844][    C0]  ? vprintk_func+0x86/0x189
[  626.027298][    C0]  nr_insert_socket+0x2d/0xe0
[  626.041237][    C0]  nr_rx_frame+0x1605/0x1e73
[  626.051737][    C0]  nr_loopback_timer+0x7b/0x170
[  626.073842][    C0]  call_timer_fn+0x1ac/0x780
[  626.092970][    C0]  ? nr_process_rx_frame+0x1540/0x1540
[  626.108552][    C0]  ? msleep_interruptible+0x150/0x150
[  626.118574][    C0]  ? run_timer_softirq+0x685/0x17a0
[  626.131811][    C0]  ? trace_hardirqs_on+0x67/0x240
[  626.145424][    C0]  ? __kasan_check_read+0x11/0x20
[  626.156592][    C0]  ? nr_process_rx_frame+0x1540/0x1540
[  626.164362][    C0]  ? nr_process_rx_frame+0x1540/0x1540
[  626.175423][    C0]  run_timer_softirq+0x697/0x17a0
[  626.188804][    C0]  ? add_timer+0x930/0x930
[  626.202652][    C0]  ? kvm_clock_read+0x18/0x30
[  626.215813][    C0]  ? kvm_sched_clock_read+0x9/0x20
[  626.231378][    C0]  ? sched_clock+0x2e/0x50
[  626.231395][    C0]  ? __sanitizer_cov_trace_const_cmp4+0x16/0x20
[  626.231408][    C0]  ? __sanitizer_cov_trace_const_cmp4+0x16/0x20
[  626.231432][    C0]  __do_softirq+0x262/0x98c
[  626.244512][    C0]  ? sched_clock_cpu+0x1b/0x1b0
[  626.244531][    C0]  irq_exit+0x19b/0x1e0
[  626.244545][    C0]  smp_apic_timer_interrupt+0x1a3/0x610
[  626.244558][    C0]  apic_timer_interrupt+0xf/0x20
[  626.244563][    C0]  </IRQ>
[  626.244579][    C0] RIP: 0010:native_safe_halt+0xe/0x10
[  626.244606][    C0] Code: b8 94 73 fa eb 8a 90 90 90 90 90 90 e9 07 00 00 00 0f 00 2d 34 25 4f 00 f4 c3 66 90 e9 07 00 00 00 0f 00 2d 24 25 4f 00 fb f4 <c3> 90 55 48 89 e5 41 57 41 56 41 55 41 54 53 e8 0e 56 27 fa e8 c9
[  626.257081][    C0] RSP: 0018:ffffffff88c07ce8 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff13
[  626.269812][    C0] RAX: 1ffffffff11a5e05 RBX: ffffffff88c7a1c0 RCX: 0000000000000000
[  626.281053][    C0] RDX: dffffc0000000000 RSI: 0000000000000006 RDI: ffffffff88c7aa4c
[  626.290913][    C0] RBP: ffffffff88c07d18 R08: ffffffff88c7a1c0 R09: 0000000000000000
[  626.303361][    C0] R10: 0000000000000000 R11: 0000000000000000 R12: dffffc0000000000
[  626.314081][    C0] R13: ffffffff89a4f778 R14: 0000000000000000 R15: 0000000000000000
[  626.314116][    C0]  ? default_idle+0x4e/0x360
[  626.323075][    C0]  arch_cpu_idle+0xa/0x10
[  626.333543][    C0]  default_idle_call+0x84/0xb0
[  626.341839][    C0]  do_idle+0x413/0x760
[  626.370736][    C0]  ? retint_kernel+0x2b/0x2b
[  626.383044][    C0]  ? arch_cpu_idle_exit+0x80/0x80
[  626.400071][    C0]  ? do_idle+0x387/0x760
[  626.418085][    C0]  cpu_startup_entry+0x1b/0x20
[  626.431835][    C0]  rest_init+0x245/0x37b
[  626.459420][    C0]  arch_call_rest_init+0xe/0x1b
[  626.471993][    C0]  start_kernel+0x912/0x951
[  626.482387][    C0]  ? mem_encrypt_init+0xb/0xb
[  626.495105][    C0]  ? __sanitizer_cov_trace_const_cmp4+0x16/0x20
[  626.507125][    C0]  ? x86_family+0x41/0x50
[  626.519773][    C0]  ? __sanitizer_cov_trace_const_cmp1+0x1a/0x20
[  626.532837][    C0]  x86_64_start_reservations+0x29/0x2b
[  626.545019][    C0]  x86_64_start_kernel+0x77/0x7b
[  626.558711][    C0]  secondary_startup_64+0xa4/0xb0
[  626.897092][    C0] Kernel Offset: disabled
[  626.901428][    C0] Rebooting in 86400 seconds..

^ permalink raw reply

* [PATCH net-next] r8169: fix performance issue on RTL8168evl
From: Heiner Kallweit @ 2019-08-08 22:02 UTC (permalink / raw)
  To: Realtek linux nic maintainers, David Miller,
	Holger Hoffstätte
  Cc: netdev@vger.kernel.org

From: Holger Hoffstätte <holger@applied-asynchrony.com>
Disabling TSO but leaving SG active results in a significant
performance drop. Therefore disable also SG on RTL8168evl.
This restores the original performance.

Fixes: 93681cd7d94f ("r8169: enable HW csum and TSO")
Signed-off-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/ethernet/realtek/r8169_main.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index b2a275d85..912bd41ea 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -6898,9 +6898,9 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 
 	/* RTL8168e-vl has a HW issue with TSO */
 	if (tp->mac_version == RTL_GIGA_MAC_VER_34) {
-		dev->vlan_features &= ~NETIF_F_ALL_TSO;
-		dev->hw_features &= ~NETIF_F_ALL_TSO;
-		dev->features &= ~NETIF_F_ALL_TSO;
+		dev->vlan_features &= ~(NETIF_F_ALL_TSO | NETIF_F_SG);
+		dev->hw_features &= ~(NETIF_F_ALL_TSO | NETIF_F_SG);
+		dev->features &= ~(NETIF_F_ALL_TSO | NETIF_F_SG);
 	}
 
 	dev->hw_features |= NETIF_F_RXALL;
-- 
2.22.0


^ permalink raw reply related

* Re: [PATCH net-next v4 2/2] net: phy: broadcom: add 1000Base-X support for BCM54616S
From: Tao Ren @ 2019-08-08 21:47 UTC (permalink / raw)
  To: Heiner Kallweit, Andrew Lunn, Florian Fainelli, David S . Miller,
	Arun Parameswaran, Justin Chen, Vladimir Oltean,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	openbmc@lists.ozlabs.org
In-Reply-To: <a827c44c-3946-8f6f-e515-b476fd375cf6@fb.com>

Hi Heiner,

On 8/7/19 9:24 PM, Tao Ren wrote:
> Hi Heiner,
> 
> On 8/7/19 12:18 PM, Heiner Kallweit wrote:
>> On 06.08.2019 23:42, Tao Ren wrote:
>>> Hi Andrew / Heiner / Vladimir,
>>>
>>> On 8/6/19 2:09 PM, Tao Ren wrote:
>>>> The BCM54616S PHY cannot work properly in RGMII->1000Base-KX mode (for
>>>> example, on Facebook CMM BMC platform), mainly because genphy functions
>>>> are designed for copper links, and 1000Base-X (clause 37) auto negotiation
>>>> needs to be handled differently.
>>>>
>>>> This patch enables 1000Base-X support for BCM54616S by customizing 3
>>>> driver callbacks:
>>>>
>>>>   - probe: probe callback detects PHY's operation mode based on
>>>>     INTERF_SEL[1:0] pins and 1000X/100FX selection bit in SerDES 100-FX
>>>>     Control register.
>>>>
>>>>   - config_aneg: bcm54616s_config_aneg_1000bx function is added for auto
>>>>     negotiation in 1000Base-X mode.
>>>>
>>>>   - read_status: BCM54616S and BCM5482 PHY share the same read_status
>>>>     callback which manually set link speed and duplex mode in 1000Base-X
>>>>     mode.
>>>>
>>>> Signed-off-by: Tao Ren <taoren@fb.com>
>>>
>>> I customized config_aneg function for BCM54616S 1000Base-X mode and link-down issue is also fixed: the patch is tested on Facebook CMM and Minipack BMC and everything looks normal. Please kindly review when you have bandwidth and let me know if you have further suggestions.
>>>
>>> BTW, I would be happy to help if we decide to add a set of genphy functions for clause 37, although that may mean I need more help/guidance from you :-)
>>
>> You want to have standard clause 37 aneg and this should be generic in phylib.
>> I hacked together a first version that is compile-tested only:
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__patchwork.ozlabs.org_patch_1143631_&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=iYElT7HC77pRZ3byVvW8ng&m=ZJArOJvHqNkqvs1x8l9HjfxjCN8e5xJpPz2YViBuKRA&s=EskpfBQtu9IBVeb96dv-sz76xIz4tJK5-lD4-qdIyWI&e= 
>> It supports fixed mode too.
>>
>> It doesn't support half duplex mode because phylib doesn't know 1000BaseX HD yet.
>> Not sure whether half duplex mode is used at all in reality.
>>
>> You could test the new core functions in your own config_aneg and read_status
>> callback implementations.
> 
> Thank you very much for the help! I'm planning to add these functions but I haven't started yet because I'm still going through clause 37 :-)
> 
> Let me apply your patch and run some tests on my platform. I will share the results with you tomorrow.

The patch "net: phy: add support for clause 37 auto-negotiation" works on my CMM platform, with just 1 minor change in phy.h (I guess it's typo?). Thanks again for the help!

-int genphy_c37_aneg_done(struct phy_device *phydev);
+int genphy_c37_config_aneg(struct phy_device *phydev);

BTW, shall I send out my patch v5 now (based on your patch)? Or I should wait till your patch is included in net-next and then send out my patch?


Cheers,

Tao

^ permalink raw reply

* Re: [PATCH v5 bpf-next] BPF: helpers: New helper to obtain namespacedata from current task
From: Carlos Antonio Neira Bustos @ 2019-08-08 21:17 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Y Song, netdev@vger.kernel.org, ebiederm@xmission.com,
	brouer@redhat.com, quentin.monnet@netronome.com
In-Reply-To: <1c24077d-ed17-86b6-8d3f-81994105f302@fb.com>

Thanks a lot, Yonghong. I'll fix and split up the patch.
Thanks again for your help.

Bests

On Thu, Aug 08, 2019 at 08:47:14PM +0000, Yonghong Song wrote:
> 
> 
> On 8/8/19 1:26 PM, carlos antonio neira bustos wrote:
> > Hi Yonghong,
> > 
> > I’m sorry, just to be sure, I’m just missing the error codes from 
> > filename_lookup(), right?
> 
>  From kernel functionality point of view. Yes, I am talking about
> error codes returned by filename_lookup().
> For example, if CONFIG_PID_NS or CONFIG_NAMESPACES is not
> defined in the config, the path "/proc/self/ns/pid" will not exist,
> an error code will be returned. It may be -ENOTDIR
> if CONFIG_NAMESPACES not defined or -ECHILD if CONFIG_PID_NS
> is not defined. Please double check.
> 
> Please do follow the advice in
>  > https://lore.kernel.org/netdev/20190808174848.poybtaagg5ctle7t@dev00/T/#t
> to break the single patch to multiple patches.
> 
> I only reviewed the kernel code. Will review tools/ code
> in the next properly-formatted (broken-up) commits.
> 
> Also, please also cc commits to bpf mailing list at
> bpf@vger.kernel.org
> 
> > 
> > Bests
> > 
> > Maybe some other error codes in filename_lookup() function?
> > 
> >  > + *
> > 
> >  > + *                      If unable to get the inode from 
> > /proc/self/ns/pid an error code
> > 
> >  > + *                      will be returned.
> > 
> > *From: *Y Song <mailto:ys114321@gmail.com>
> > *Sent: *08 August 2019 15:44
> > *To: *Carlos Antonio Neira Bustos <mailto:cneirabustos@gmail.com>
> > *Cc: *Yonghong Song <mailto:yhs@fb.com>; netdev@vger.kernel.org 
> > <mailto:netdev@vger.kernel.org>; ebiederm@xmission.com 
> > <mailto:ebiederm@xmission.com>; brouer@redhat.com 
> > <mailto:brouer@redhat.com>; quentin.monnet@netronome.com 
> > <mailto:quentin.monnet@netronome.com>
> > *Subject: *Re: [PATCH v5 bpf-next] BPF: helpers: New helper to obtain 
> > namespacedata from current task
> > 
> > On Thu, Aug 8, 2019 at 10:52 AM Carlos Antonio Neira Bustos
> > 
> > <cneirabustos@gmail.com> wrote:
> > 
> >  >
> > 
> >  > Yonghong,
> > 
> >  >
> > 
> >  > I have modified the patch following your feedback.
> > 
> >  > Let me know if I'm missing something.
> > 
> > Yes, I have some other requests about formatting.
> > 
> > https://lore.kernel.org/netdev/20190808174848.poybtaagg5ctle7t@dev00/T/#t
> > 
> > Could you address it as well?
> > 
> >  >
> > 
> >  > Bests
> > 
> >  >
> > 
> >  > From 70f8d5584700c9cfc82c006901d8ee9595c53f15 Mon Sep 17 00:00:00 2001
> > 
> >  > From: Carlos <cneirabustos@gmail.com>
> > 
> >  > Date: Wed, 7 Aug 2019 20:04:30 -0400
> > 
> >  > Subject: [PATCH] [PATCH v6 bpf-next] BPF: New helper to obtain 
> > namespace data
> > 
> >  >  from current task
> > 
> >  >
> > 
> >  > This helper obtains the active namespace from current and returns 
> > pid, tgid,
> > 
> >  > device and namespace id as seen from that namespace, allowing to 
> > instrument
> > 
> >  > a process inside a container.
> > 
> >  > Device is read from /proc/self/ns/pid, as in the future it's possible 
> > that
> > 
> >  > different pid_ns files may belong to different devices, according
> > 
> >  > to the discussion between Eric Biederman and Yonghong in 2017 linux 
> > plumbers
> > 
> >  > conference.
> > 
> >  > Currently bpf_get_current_pid_tgid(), is used to do pid filtering in 
> > bcc's
> > 
> >  > scripts but this helper returns the pid as seen by the root namespace 
> > which is
> > 
> >  > fine when a bcc script is not executed inside a container.
> > 
> >  > When the process of interest is inside a container, pid filtering 
> > will not work
> > 
> >  > if bpf_get_current_pid_tgid() is used. This helper addresses this 
> > limitation
> > 
> >  > returning the pid as it's seen by the current namespace where the 
> > script is
> > 
> >  > executing.
> > 
> >  >
> > 
> >  > This helper has the same use cases as bpf_get_current_pid_tgid() as 
> > it can be
> > 
> >  > used to do pid filtering even inside a container.
> > 
> >  >
> > 
> >  > For example a bcc script using bpf_get_current_pid_tgid() 
> > (tools/funccount.py):
> > 
> >  >
> > 
> >  >         u32 pid = bpf_get_current_pid_tgid() >> 32;
> > 
> >  >         if (pid != <pid_arg_passed_in>)
> > 
> >  >                 return 0;
> > 
> >  > Could be modified to use bpf_get_current_pidns_info() as follows:
> > 
> >  >
> > 
> >  >         struct bpf_pidns pidns;
> > 
> >  >         bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns));
> > 
> >  >         u32 pid = pidns.tgid;
> > 
> >  >         u32 nsid = pidns.nsid;
> > 
> >  >         if ((pid != <pid_arg_passed_in>) && (nsid != 
> > <nsid_arg_passed_in>))
> > 
> >  >                 return 0;
> > 
> >  >
> > 
> >  > To find out the PID namespace id of a process, you could use 
> > this command:
> > 
> >  >
> > 
> >  > $ ps -h -o pidns -p <pid_of_interest>
> > 
> >  >
> > 
> >  > Or this other command:
> > 
> >  >
> > 
> >  > $ ls -Li /proc/<pid_of_interest>/ns/pid
> > 
> >  >
> > 
> >  > Signed-off-by: Carlos Neira <cneirabustos@gmail.com>
> > 
> >  > ---
> > 
> >  >  fs/internal.h                                      |   2 -
> > 
> >  >  fs/namei.c                                         |   1 -
> > 
> >  >  include/linux/bpf.h                                |   1 +
> > 
> >  >  include/linux/namei.h                              |   4 +
> > 
> >  >  include/uapi/linux/bpf.h                           |  27 +++-
> > 
> >  >  kernel/bpf/core.c                                  |   1 +
> > 
> >  >  kernel/bpf/helpers.c                               |  64 ++++++++++
> > 
> >  >  kernel/trace/bpf_trace.c                           |   2 +
> > 
> >  >  samples/bpf/Makefile                               |   3 +
> > 
> >  >  samples/bpf/trace_ns_info_user.c                   |  35 ++++++
> > 
> >  >  samples/bpf/trace_ns_info_user_kern.c              |  44 +++++++
> > 
> >  >  tools/include/uapi/linux/bpf.h                     |  27 +++-
> > 
> >  >  tools/testing/selftests/bpf/Makefile               |   2 +-
> > 
> >  >  tools/testing/selftests/bpf/bpf_helpers.h          |   3 +
> > 
> >  >  .../testing/selftests/bpf/progs/test_pidns_kern.c  |  51 ++++++++
> > 
> >  >  tools/testing/selftests/bpf/test_pidns.c           | 138 
> > +++++++++++++++++++++
> > 
> >  >  16 files changed, 399 insertions(+), 6 deletions(-)
> > 
> >  >  create mode 100644 samples/bpf/trace_ns_info_user.c
> > 
> >  >  create mode 100644 samples/bpf/trace_ns_info_user_kern.c
> > 
> >  >  create mode 100644 tools/testing/selftests/bpf/progs/test_pidns_kern.c
> > 
> >  >  create mode 100644 tools/testing/selftests/bpf/test_pidns.c
> > 
> >  >
> > 
> >  > diff --git a/fs/internal.h b/fs/internal.h
> > 
> >  > index 315fcd8d237c..6647e15dd419 100644
> > 
> >  > --- a/fs/internal.h
> > 
> >  > +++ b/fs/internal.h
> > 
> >  > @@ -59,8 +59,6 @@ extern int finish_clean_context(struct fs_context *fc);
> > 
> >  >  /*
> > 
> >  >   * namei.c
> > 
> >  >   */
> > 
> >  > -extern int filename_lookup(int dfd, struct filename *name, unsigned 
> > flags,
> > 
> >  > -                          struct path *path, struct path *root);
> > 
> >  >  extern int user_path_mountpoint_at(int, const char __user *, 
> > unsigned int, struct path *);
> > 
> >  >  extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
> > 
> >  >                            const char *, unsigned int, struct path *);
> > 
> >  > diff --git a/fs/namei.c b/fs/namei.c
> > 
> >  > index 209c51a5226c..a89fc72a4a10 100644
> > 
> >  > --- a/fs/namei.c
> > 
> >  > +++ b/fs/namei.c
> > 
> >  > @@ -19,7 +19,6 @@
> > 
> >  >  #include <linux/export.h>
> > 
> >  >  #include <linux/kernel.h>
> > 
> >  >  #include <linux/slab.h>
> > 
> >  > -#include <linux/fs.h>
> > 
> >  >  #include <linux/namei.h>
> > 
> >  >  #include <linux/pagemap.h>
> > 
> >  >  #include <linux/fsnotify.h>
> > 
> >  > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > 
> >  > index f9a506147c8a..e4adf5e05afd 100644
> > 
> >  > --- a/include/linux/bpf.h
> > 
> >  > +++ b/include/linux/bpf.h
> > 
> >  > @@ -1050,6 +1050,7 @@ extern const struct bpf_func_proto 
> > bpf_get_local_storage_proto;
> > 
> >  >  extern const struct bpf_func_proto bpf_strtol_proto;
> > 
> >  >  extern const struct bpf_func_proto bpf_strtoul_proto;
> > 
> >  >  extern const struct bpf_func_proto bpf_tcp_sock_proto;
> > 
> >  > +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;
> > 
> >  >
> > 
> >  >  /* Shared helpers among cBPF and eBPF. */
> > 
> >  >  void bpf_user_rnd_init_once(void);
> > 
> >  > diff --git a/include/linux/namei.h b/include/linux/namei.h
> > 
> >  > index 9138b4471dbf..b45c8b6f7cb4 100644
> > 
> >  > --- a/include/linux/namei.h
> > 
> >  > +++ b/include/linux/namei.h
> > 
> >  > @@ -6,6 +6,7 @@
> > 
> >  >  #include <linux/path.h>
> > 
> >  >  #include <linux/fcntl.h>
> > 
> >  >  #include <linux/errno.h>
> > 
> >  > +#include <linux/fs.h>
> > 
> >  >
> > 
> >  >  enum { MAX_NESTED_LINKS = 8 };
> > 
> >  >
> > 
> >  > @@ -97,6 +98,9 @@ extern void unlock_rename(struct dentry *, struct 
> > dentry *);
> > 
> >  >
> > 
> >  >  extern void nd_jump_link(struct path *path);
> > 
> >  >
> > 
> >  > +extern int filename_lookup(int dfd, struct filename *name, unsigned 
> > flags,
> > 
> >  > +                          struct path *path, struct path *root);
> > 
> >  > +
> > 
> >  >  static inline void nd_terminate_link(void *name, size_t len, size_t 
> > maxlen)
> > 
> >  >  {
> > 
> >  >         ((char *) name)[min(len, maxlen)] = '\0';
> > 
> >  > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > 
> >  > index 4393bd4b2419..b0d4869fb860 100644
> > 
> >  > --- a/include/uapi/linux/bpf.h
> > 
> >  > +++ b/include/uapi/linux/bpf.h
> > 
> >  > @@ -2741,6 +2741,24 @@ union bpf_attr {
> > 
> >  >   *             **-EOPNOTSUPP** kernel configuration does not enable 
> > SYN cookies
> > 
> >  >   *
> > 
> >  >   *             **-EPROTONOSUPPORT** IP packet version is not 4 or 6
> > 
> >  > + *
> > 
> >  > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 
> > size_of_pidns)
> > 
> >  > + *     Description
> > 
> >  > + *             Copies into *pidns* pid, namespace id and tgid as 
> > seen by the
> > 
> >  > + *             current namespace and also device from /proc/self/ns/pid.
> > 
> >  > + *             *size_of_pidns* must be the size of *pidns*
> > 
> >  > + *
> > 
> >  > + *             This helper is used when pid filtering is needed inside a
> > 
> >  > + *             container as the bpf_get_current_pid_tgid() helper always 
> > returns the
> > 
> >  > + *             pid as seen by the root namespace.
> > 
> >  > + *     Return
> > 
> >  > + *             0 on success
> > 
> >  > + *
> > 
> >  > + *             **-EINVAL** if *size_of_pidns* is not valid or unable 
> > to get ns, pid
> > 
> >  > + *             or tgid of the current task.
> > 
> >  > + *
> > 
> >  > + *             **-ENOMEM**  if allocation fails.
> > 
> >  > + *
> > 
> >  >   */
> > 
> >  >  #define __BPF_FUNC_MAPPER(FN)          \
> > 
> >  >         FN(unspec),                     \
> > 
> >  > @@ -2853,7 +2871,8 @@ union bpf_attr {
> > 
> >  >         FN(sk_storage_get),             \
> > 
> >  >         FN(sk_storage_delete),          \
> > 
> >  >         FN(send_signal),                \
> > 
> >  > -       FN(tcp_gen_syncookie),
> > 
> >  > +       FN(tcp_gen_syncookie),          \
> > 
> >  > +       FN(get_current_pidns_info),
> > 
> >  >
> > 
> >  >  /* integer value in 'imm' field of BPF_CALL instruction selects 
> > which helper
> > 
> >  >   * function eBPF program intends to call
> > 
> >  > @@ -3604,4 +3623,10 @@ struct bpf_sockopt {
> > 
> >  >         __s32   retval;
> > 
> >  >  };
> > 
> >  >
> > 
> >  > +struct bpf_pidns_info {
> > 
> >  > +       __u32 dev;
> > 
> >  > +       __u32 nsid;
> > 
> >  > +       __u32 tgid;
> > 
> >  > +       __u32 pid;
> > 
> >  > +};
> > 
> >  >  #endif /* _UAPI__LINUX_BPF_H__ */
> > 
> >  > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> > 
> >  > index 8191a7db2777..3159f2a0188c 100644
> > 
> >  > --- a/kernel/bpf/core.c
> > 
> >  > +++ b/kernel/bpf/core.c
> > 
> >  > @@ -2038,6 +2038,7 @@ const struct bpf_func_proto 
> > bpf_get_current_uid_gid_proto __weak;
> > 
> >  >  const struct bpf_func_proto bpf_get_current_comm_proto __weak;
> > 
> >  >  const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
> > 
> >  >  const struct bpf_func_proto bpf_get_local_storage_proto __weak;
> > 
> >  > +const struct bpf_func_proto bpf_get_current_pidns_info __weak;
> > 
> >  >
> > 
> >  >  const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
> > 
> >  >  {
> > 
> >  > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > 
> >  > index 5e28718928ca..41fbf1f28a48 100644
> > 
> >  > --- a/kernel/bpf/helpers.c
> > 
> >  > +++ b/kernel/bpf/helpers.c
> > 
> >  > @@ -11,6 +11,12 @@
> > 
> >  >  #include <linux/uidgid.h>
> > 
> >  >  #include <linux/filter.h>
> > 
> >  >  #include <linux/ctype.h>
> > 
> >  > +#include <linux/pid_namespace.h>
> > 
> >  > +#include <linux/major.h>
> > 
> >  > +#include <linux/stat.h>
> > 
> >  > +#include <linux/namei.h>
> > 
> >  > +#include <linux/version.h>
> > 
> >  > +
> > 
> >  >
> > 
> >  >  #include "../../lib/kstrtox.h"
> > 
> >  >
> > 
> >  > @@ -312,6 +318,64 @@ void copy_map_value_locked(struct bpf_map *map, 
> > void *dst, void *src,
> > 
> >  >         preempt_enable();
> > 
> >  >  }
> > 
> >  >
> > 
> >  > +BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, 
> > pidns_info, u32,
> > 
> >  > +        size)
> > 
> >  > +{
> > 
> >  > +       const char *pidns_path = "/proc/self/ns/pid";
> > 
> >  > +       struct pid_namespace *pidns = NULL;
> > 
> >  > +       struct filename *tmp = NULL;
> > 
> >  > +       struct inode *inode;
> > 
> >  > +       struct path kp;
> > 
> >  > +       pid_t tgid = 0;
> > 
> >  > +       pid_t pid = 0;
> > 
> >  > +       int ret;
> > 
> >  > +       int len;
> > 
> >  > +
> > 
> >  > +       if (unlikely(size != sizeof(struct bpf_pidns_info)))
> > 
> >  > +               return -EINVAL;
> > 
> >  > +       pidns = task_active_pid_ns(current);
> > 
> >  > +       if (unlikely(!pidns))
> > 
> >  > +               goto clear;
> > 
> >  > +       pidns_info->nsid =  pidns->ns.inum;
> > 
> >  > +       pid = task_pid_nr_ns(current, pidns);
> > 
> >  > +       if (unlikely(!pid))
> > 
> >  > +               goto clear;
> > 
> >  > +       tgid = task_tgid_nr_ns(current, pidns);
> > 
> >  > +       if (unlikely(!tgid))
> > 
> >  > +               goto clear;
> > 
> >  > +       pidns_info->tgid = (u32) tgid;
> > 
> >  > +       pidns_info->pid = (u32) pid;
> > 
> >  > +       tmp = kmem_cache_alloc(names_cachep, GFP_ATOMIC);
> > 
> >  > +       if (unlikely(!tmp)) {
> > 
> >  > +               memset((void *)pidns_info, 0, (size_t) size);
> > 
> >  > +               return -ENOMEM;
> > 
> >  > +       }
> > 
> >  > +       len = strlen(pidns_path) + 1;
> > 
> >  > +       memcpy((char *)tmp->name, pidns_path, len);
> > 
> >  > +       tmp->uptr = NULL;
> > 
> >  > +       tmp->aname = NULL;
> > 
> >  > +       tmp->refcnt = 1;
> > 
> >  > +       ret = filename_lookup(AT_FDCWD, tmp, 0, &kp, NULL);
> > 
> >  > +       if (ret) {
> > 
> >  > +               memset((void *)pidns_info, 0, (size_t) size);
> > 
> >  > +               return ret;
> > 
> >  > +       }
> > 
> >  > +       inode = d_backing_inode(kp.dentry);
> > 
> >  > +       pidns_info->dev = inode->i_sb->s_dev;
> > 
> >  > +       return 0;
> > 
> >  > +clear:
> > 
> >  > +       memset((void *)pidns_info, 0, (size_t) size);
> > 
> >  > +       return -EINVAL;
> > 
> >  > +}
> > 
> >  > +
> > 
> >  > +const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
> > 
> >  > +       .func           = bpf_get_current_pidns_info,
> > 
> >  > +       .gpl_only       = false,
> > 
> >  > +       .ret_type       = RET_INTEGER,
> > 
> >  > +       .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
> > 
> >  > +       .arg2_type      = ARG_CONST_SIZE,
> > 
> >  > +};
> > 
> >  > +
> > 
> >  >  #ifdef CONFIG_CGROUPS
> > 
> >  >  BPF_CALL_0(bpf_get_current_cgroup_id)
> > 
> >  >  {
> > 
> >  > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > 
> >  > index ca1255d14576..5e1dc22765a5 100644
> > 
> >  > --- a/kernel/trace/bpf_trace.c
> > 
> >  > +++ b/kernel/trace/bpf_trace.c
> > 
> >  > @@ -709,6 +709,8 @@ tracing_func_proto(enum bpf_func_id func_id, 
> > const struct bpf_prog *prog)
> > 
> >  >  #endif
> > 
> >  >         case BPF_FUNC_send_signal:
> > 
> >  >                 return &bpf_send_signal_proto;
> > 
> >  > +       case BPF_FUNC_get_current_pidns_info:
> > 
> >  > +               return &bpf_get_current_pidns_info_proto;
> > 
> >  >         default:
> > 
> >  >                 return NULL;
> > 
> >  >         }
> > 
> >  > diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> > 
> >  > index 1d9be26b4edd..238453ff27d2 100644
> > 
> >  > --- a/samples/bpf/Makefile
> > 
> >  > +++ b/samples/bpf/Makefile
> > 
> >  > @@ -53,6 +53,7 @@ hostprogs-y += task_fd_query
> > 
> >  >  hostprogs-y += xdp_sample_pkts
> > 
> >  >  hostprogs-y += ibumad
> > 
> >  >  hostprogs-y += hbm
> > 
> >  > +hostprogs-y += trace_ns_info
> > 
> >  >
> > 
> >  >  # Libbpf dependencies
> > 
> >  >  LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
> > 
> >  > @@ -109,6 +110,7 @@ task_fd_query-objs := bpf_load.o 
> > task_fd_query_user.o $(TRACE_HELPERS)
> > 
> >  >  xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
> > 
> >  >  ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
> > 
> >  >  hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
> > 
> >  > +trace_ns_info-objs := bpf_load.o trace_ns_info_user.o
> > 
> >  >
> > 
> >  >  # Tell kbuild to always build the programs
> > 
> >  >  always := $(hostprogs-y)
> > 
> >  > @@ -170,6 +172,7 @@ always += xdp_sample_pkts_kern.o
> > 
> >  >  always += ibumad_kern.o
> > 
> >  >  always += hbm_out_kern.o
> > 
> >  >  always += hbm_edt_kern.o
> > 
> >  > +always += trace_ns_info_user_kern.o
> > 
> >  >
> > 
> >  >  KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
> > 
> >  >  KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/bpf/
> > 
> >  > diff --git a/samples/bpf/trace_ns_info_user.c 
> > b/samples/bpf/trace_ns_info_user.c
> > 
> >  > new file mode 100644
> > 
> >  > index 000000000000..e06d08db6f30
> > 
> >  > --- /dev/null
> > 
> >  > +++ b/samples/bpf/trace_ns_info_user.c
> > 
> >  > @@ -0,0 +1,35 @@
> > 
> >  > +// SPDX-License-Identifier: GPL-2.0
> > 
> >  > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> > 
> >  > + *
> > 
> >  > + * This program is free software; you can redistribute it and/or
> > 
> >  > + * modify it under the terms of version 2 of the GNU General Public
> > 
> >  > + * License as published by the Free Software Foundation.
> > 
> >  > + */
> > 
> >  > +
> > 
> >  > +#include <stdio.h>
> > 
> >  > +#include <linux/bpf.h>
> > 
> >  > +#include <unistd.h>
> > 
> >  > +#include "bpf/libbpf.h"
> > 
> >  > +#include "bpf_load.h"
> > 
> >  > +
> > 
> >  > +/* This code was taken verbatim from tracex1_user.c, it's used
> > 
> >  > + * to exercise the bpf_get_current_pidns_info() helper call.
> > 
> >  > + */
> > 
> >  > +int main(int ac, char **argv)
> > 
> >  > +{
> > 
> >  > +       FILE *f;
> > 
> >  > +       char filename[256];
> > 
> >  > +
> > 
> >  > +       snprintf(filename, sizeof(filename), "%s_user_kern.o", argv[0]);
> > 
> >  > +       printf("loading %s\n", filename);
> > 
> >  > +
> > 
> >  > +       if (load_bpf_file(filename)) {
> > 
> >  > +               printf("%s", bpf_log_buf);
> > 
> >  > +               return 1;
> > 
> >  > +       }
> > 
> >  > +
> > 
> >  > +       f = popen("taskset 1 ping  localhost", "r");
> > 
> >  > +       (void) f;
> > 
> >  > +       read_trace_pipe();
> > 
> >  > +       return 0;
> > 
> >  > +}
> > 
> >  > diff --git a/samples/bpf/trace_ns_info_user_kern.c 
> > b/samples/bpf/trace_ns_info_user_kern.c
> > 
> >  > new file mode 100644
> > 
> >  > index 000000000000..96675e02b707
> > 
> >  > --- /dev/null
> > 
> >  > +++ b/samples/bpf/trace_ns_info_user_kern.c
> > 
> >  > @@ -0,0 +1,44 @@
> > 
> >  > +// SPDX-License-Identifier: GPL-2.0
> > 
> >  > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> > 
> >  > + *
> > 
> >  > + * This program is free software; you can redistribute it and/or
> > 
> >  > + * modify it under the terms of version 2 of the GNU General Public
> > 
> >  > + * License as published by the Free Software Foundation.
> > 
> >  > + */
> > 
> >  > +#include <linux/skbuff.h>
> > 
> >  > +#include <linux/netdevice.h>
> > 
> >  > +#include <linux/version.h>
> > 
> >  > +#include <uapi/linux/bpf.h>
> > 
> >  > +#include "bpf_helpers.h"
> > 
> >  > +
> > 
> >  > +typedef __u64 u64;
> > 
> >  > +typedef __u32 u32;
> > 
> >  > +
> > 
> >  > +
> > 
> >  > +/* kprobe is NOT a stable ABI
> > 
> >  > + * kernel functions can be removed, renamed or completely change 
> > semantics.
> > 
> >  > + * Number of arguments and their positions can change, etc.
> > 
> >  > + * In such case this bpf+kprobe example will no longer be meaningful
> > 
> >  > + */
> > 
> >  > +
> > 
> >  > +/* This will call bpf_get_current_pidns_info() to display pid and ns 
> > values
> > 
> >  > + * as seen by the current namespace, on the far left you will see 
> > the pid as
> > 
> >  > + * seen by the root namespace.
> > 
> >  > + */
> > 
> >  > +
> > 
> >  > +SEC("kprobe/__netif_receive_skb_core")
> > 
> >  > +int bpf_prog1(struct pt_regs *ctx)
> > 
> >  > +{
> > 
> >  > +       char fmt[] = "nsid:%u, dev: %u,  pid:%u\n";
> > 
> >  > +       struct bpf_pidns_info nsinfo;
> > 
> >  > +       int ok = 0;
> > 
> >  > +
> > 
> >  > +       ok = bpf_get_current_pidns_info(&nsinfo, sizeof(nsinfo));
> > 
> >  > +       if (ok == 0)
> > 
> >  > +               bpf_trace_printk(fmt, sizeof(fmt), (u32)nsinfo.nsid,
> > 
> >  > +                                (u32) nsinfo.dev, (u32)nsinfo.pid);
> > 
> >  > +
> > 
> >  > +       return 0;
> > 
> >  > +}
> > 
> >  > +char _license[] SEC("license") = "GPL";
> > 
> >  > +u32 _version SEC("version") = LINUX_VERSION_CODE;
> > 
> >  > diff --git a/tools/include/uapi/linux/bpf.h 
> > b/tools/include/uapi/linux/bpf.h
> > 
> >  > index 4393bd4b2419..b0d4869fb860 100644
> > 
> >  > --- a/tools/include/uapi/linux/bpf.h
> > 
> >  > +++ b/tools/include/uapi/linux/bpf.h
> > 
> >  > @@ -2741,6 +2741,24 @@ union bpf_attr {
> > 
> >  >   *             **-EOPNOTSUPP** kernel configuration does not enable 
> > SYN cookies
> > 
> >  >   *
> > 
> >  >   *             **-EPROTONOSUPPORT** IP packet version is not 4 or 6
> > 
> >  > + *
> > 
> >  > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 
> > size_of_pidns)
> > 
> >  > + *     Description
> > 
> >  > + *             Copies into *pidns* pid, namespace id and tgid as 
> > seen by the
> > 
> >  > + *             current namespace and also device from /proc/self/ns/pid.
> > 
> >  > + *             *size_of_pidns* must be the size of *pidns*
> > 
> >  > + *
> > 
> >  > + *             This helper is used when pid filtering is needed inside a
> > 
> >  > + *             container as bpf_get_current_tgid() helper returns 
> > always the
> > 
> >  > + *             pid id as seen by the root namespace.
> > 
> >  > + *     Return
> > 
> >  > + *             0 on success
> > 
> >  > + *
> > 
> >  > + *             **-EINVAL** if *size_of_pidns* is not valid or unable 
> > to get ns, pid
> > 
> >  > + *             or tgid of the current task.
> > 
> >  > + *
> > 
> >  > + *             **-ENOMEM**  if allocation fails.
> > 
> >  > + *
> > 
> >  >   */
> > 
> >  >  #define __BPF_FUNC_MAPPER(FN)          \
> > 
> >  >         FN(unspec),                     \
> > 
> >  > @@ -2853,7 +2871,8 @@ union bpf_attr {
> > 
> >  >         FN(sk_storage_get),             \
> > 
> >  >         FN(sk_storage_delete),          \
> > 
> >  >         FN(send_signal),                \
> > 
> >  > -       FN(tcp_gen_syncookie),
> > 
> >  > +       FN(tcp_gen_syncookie),          \
> > 
> >  > +       FN(get_current_pidns_info),
> > 
> >  >
> > 
> >  >  /* integer value in 'imm' field of BPF_CALL instruction selects 
> > which helper
> > 
> >  >   * function eBPF program intends to call
> > 
> >  > @@ -3604,4 +3623,10 @@ struct bpf_sockopt {
> > 
> >  >         __s32   retval;
> > 
> >  >  };
> > 
> >  >
> > 
> >  > +struct bpf_pidns_info {
> > 
> >  > +       __u32 dev;
> > 
> >  > +       __u32 nsid;
> > 
> >  > +       __u32 tgid;
> > 
> >  > +       __u32 pid;
> > 
> >  > +};
> > 
> >  >  #endif /* _UAPI__LINUX_BPF_H__ */
> > 
> >  > diff --git a/tools/testing/selftests/bpf/Makefile 
> > b/tools/testing/selftests/bpf/Makefile
> > 
> >  > index 3bd0f4a0336a..1f97b571b581 100644
> > 
> >  > --- a/tools/testing/selftests/bpf/Makefile
> > 
> >  > +++ b/tools/testing/selftests/bpf/Makefile
> > 
> >  > @@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps 
> > test_lru_map test_lpm_map test
> > 
> >  >         test_cgroup_storage test_select_reuseport test_section_names \
> > 
> >  >         test_netcnt test_tcpnotify_user test_sock_fields test_sysctl 
> > test_hashmap \
> > 
> >  >         test_btf_dump test_cgroup_attach xdping test_sockopt 
> > test_sockopt_sk \
> > 
> >  > -       test_sockopt_multi test_tcp_rtt
> > 
> >  > +       test_sockopt_multi test_tcp_rtt test_pidns
> > 
> >  >
> > 
> >  >  BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
> > 
> >  >  TEST_GEN_FILES = $(BPF_OBJ_FILES)
> > 
> >  > diff --git a/tools/testing/selftests/bpf/bpf_helpers.h 
> > b/tools/testing/selftests/bpf/bpf_helpers.h
> > 
> >  > index 120aa86c58d3..c96795a9d983 100644
> > 
> >  > --- a/tools/testing/selftests/bpf/bpf_helpers.h
> > 
> >  > +++ b/tools/testing/selftests/bpf/bpf_helpers.h
> > 
> >  > @@ -231,6 +231,9 @@ static int (*bpf_send_signal)(unsigned sig) = 
> > (void *)BPF_FUNC_send_signal;
> > 
> >  >  static long long (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *ip,
> > 
> >  >                                           int ip_len, void *tcp, int 
> > tcp_len) =
> > 
> >  >         (void *) BPF_FUNC_tcp_gen_syncookie;
> > 
> >  > +static int (*bpf_get_current_pidns_info)(struct bpf_pidns_info *buf,
> > 
> >  > +                                        unsigned int buf_size) =
> > 
> >  > +       (void *) BPF_FUNC_get_current_pidns_info;
> > 
> >  >
> > 
> >  >  /* llvm builtin functions that eBPF C program may use to
> > 
> >  >   * emit BPF_LD_ABS and BPF_LD_IND instructions
> > 
> >  > diff --git a/tools/testing/selftests/bpf/progs/test_pidns_kern.c 
> > b/tools/testing/selftests/bpf/progs/test_pidns_kern.c
> > 
> >  > new file mode 100644
> > 
> >  > index 000000000000..e1d2facfa762
> > 
> >  > --- /dev/null
> > 
> >  > +++ b/tools/testing/selftests/bpf/progs/test_pidns_kern.c
> > 
> >  > @@ -0,0 +1,51 @@
> > 
> >  > +// SPDX-License-Identifier: GPL-2.0
> > 
> >  > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> > 
> >  > + *
> > 
> >  > + * This program is free software; you can redistribute it and/or
> > 
> >  > + * modify it under the terms of version 2 of the GNU General Public
> > 
> >  > + * License as published by the Free Software Foundation.
> > 
> >  > + */
> > 
> >  > +
> > 
> >  > +#include <linux/bpf.h>
> > 
> >  > +#include <errno.h>
> > 
> >  > +#include "bpf_helpers.h"
> > 
> >  > +
> > 
> >  > +struct bpf_map_def SEC("maps") nsidmap = {
> > 
> >  > +       .type = BPF_MAP_TYPE_ARRAY,
> > 
> >  > +       .key_size = sizeof(__u32),
> > 
> >  > +       .value_size = sizeof(__u32),
> > 
> >  > +       .max_entries = 1,
> > 
> >  > +};
> > 
> >  > +
> > 
> >  > +struct bpf_map_def SEC("maps") pidmap = {
> > 
> >  > +       .type = BPF_MAP_TYPE_ARRAY,
> > 
> >  > +       .key_size = sizeof(__u32),
> > 
> >  > +       .value_size = sizeof(__u32),
> > 
> >  > +       .max_entries = 1,
> > 
> >  > +};
> > 
> >  > +
> > 
> >  > +SEC("tracepoint/syscalls/sys_enter_nanosleep")
> > 
> >  > +int trace(void *ctx)
> > 
> >  > +{
> > 
> >  > +       struct bpf_pidns_info nsinfo;
> > 
> >  > +       __u32 key = 0, *expected_pid, *val;
> > 
> >  > +       char fmt[] = "ERROR nspid:%d\n";
> > 
> >  > +
> > 
> >  > +       if (bpf_get_current_pidns_info(&nsinfo, sizeof(nsinfo)))
> > 
> >  > +               return -EINVAL;
> > 
> >  > +
> > 
> >  > +       expected_pid = bpf_map_lookup_elem(&pidmap, &key);
> > 
> >  > +
> > 
> >  > +
> > 
> >  > +       if (!expected_pid || *expected_pid != nsinfo.pid)
> > 
> >  > +               return 0;
> > 
> >  > +
> > 
> >  > +       val = bpf_map_lookup_elem(&nsidmap, &key);
> > 
> >  > +       if (val)
> > 
> >  > +               *val = nsinfo.nsid;
> > 
> >  > +
> > 
> >  > +       return 0;
> > 
> >  > +}
> > 
> >  > +
> > 
> >  > +char _license[] SEC("license") = "GPL";
> > 
> >  > +__u32 _version SEC("version") = 1;
> > 
> >  > diff --git a/tools/testing/selftests/bpf/test_pidns.c 
> > b/tools/testing/selftests/bpf/test_pidns.c
> > 
> >  > new file mode 100644
> > 
> >  > index 000000000000..a7254055f294
> > 
> >  > --- /dev/null
> > 
> >  > +++ b/tools/testing/selftests/bpf/test_pidns.c
> > 
> >  > @@ -0,0 +1,138 @@
> > 
> >  > +// SPDX-License-Identifier: GPL-2.0
> > 
> >  > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> > 
> >  > + *
> > 
> >  > + * This program is free software; you can redistribute it and/or
> > 
> >  > + * modify it under the terms of version 2 of the GNU General Public
> > 
> >  > + * License as published by the Free Software Foundation.
> > 
> >  > + */
> > 
> >  > +
> > 
> >  > +#include <stdio.h>
> > 
> >  > +#include <stdlib.h>
> > 
> >  > +#include <string.h>
> > 
> >  > +#include <errno.h>
> > 
> >  > +#include <fcntl.h>
> > 
> >  > +#include <syscall.h>
> > 
> >  > +#include <unistd.h>
> > 
> >  > +#include <linux/perf_event.h>
> > 
> >  > +#include <sys/ioctl.h>
> > 
> >  > +#include <sys/time.h>
> > 
> >  > +#include <sys/types.h>
> > 
> >  > +#include <sys/stat.h>
> > 
> >  > +
> > 
> >  > +#include <linux/bpf.h>
> > 
> >  > +#include <bpf/bpf.h>
> > 
> >  > +#include <bpf/libbpf.h>
> > 
> >  > +
> > 
> >  > +#include "cgroup_helpers.h"
> > 
> >  > +#include "bpf_rlimit.h"
> > 
> >  > +
> > 
> >  > +#define CHECK(condition, tag, format...) ({            \
> > 
> >  > +       int __ret = !!(condition);                      \
> > 
> >  > +       if (__ret) {                                    \
> > 
> >  > +               printf("%s:FAIL:%s ", __func__, tag);   \
> > 
> >  > +               printf(format);                         \
> > 
> >  > +       } else {                                        \
> > 
> >  > +               printf("%s:PASS:%s\n", __func__, tag);  \
> > 
> >  > +       }                                               \
> > 
> >  > +       __ret;                                          \
> > 
> >  > +})
> > 
> >  > +
> > 
> >  > +static int bpf_find_map(const char *test, struct bpf_object *obj,
> > 
> >  > +                       const char *name)
> > 
> >  > +{
> > 
> >  > +       struct bpf_map *map;
> > 
> >  > +
> > 
> >  > +       map = bpf_object__find_map_by_name(obj, name);
> > 
> >  > +       if (!map)
> > 
> >  > +               return -1;
> > 
> >  > +       return bpf_map__fd(map);
> > 
> >  > +}
> > 
> >  > +
> > 
> >  > +
> > 
> >  > +int main(int argc, char **argv)
> > 
> >  > +{
> > 
> >  > +       const char *probe_name = "syscalls/sys_enter_nanosleep";
> > 
> >  > +       const char *file = "test_pidns_kern.o";
> > 
> >  > +       int err, bytes, efd, prog_fd, pmu_fd;
> > 
> >  > +       int pidmap_fd, nsidmap_fd;
> > 
> >  > +       struct perf_event_attr attr = {};
> > 
> >  > +       struct bpf_object *obj;
> > 
> >  > +       __u32 knsid = 0;
> > 
> >  > +       __u32 key = 0, pid;
> > 
> >  > +       int exit_code = 1;
> > 
> >  > +       struct stat st;
> > 
> >  > +       char buf[256];
> > 
> >  > +
> > 
> >  > +       err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, 
> > &prog_fd);
> > 
> >  > +       if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
> > 
> >  > +               goto cleanup_cgroup_env;
> > 
> >  > +
> > 
> >  > +       nsidmap_fd = bpf_find_map(__func__, obj, "nsidmap");
> > 
> >  > +       if (CHECK(nsidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
> > 
> >  > +                 nsidmap_fd, errno))
> > 
> >  > +               goto close_prog;
> > 
> >  > +
> > 
> >  > +       pidmap_fd = bpf_find_map(__func__, obj, "pidmap");
> > 
> >  > +       if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
> > 
> >  > +                 pidmap_fd, errno))
> > 
> >  > +               goto close_prog;
> > 
> >  > +
> > 
> >  > +       pid = getpid();
> > 
> >  > +       bpf_map_update_elem(pidmap_fd, &key, &pid, 0);
> > 
> >  > +
> > 
> >  > +       snprintf(buf, sizeof(buf),
> > 
> >  > +                "/sys/kernel/debug/tracing/events/%s/id", probe_name);
> > 
> >  > +       efd = open(buf, O_RDONLY, 0);
> > 
> >  > +       if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
> > 
> >  > +               goto close_prog;
> > 
> >  > +       bytes = read(efd, buf, sizeof(buf));
> > 
> >  > +       close(efd);
> > 
> >  > +       if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
> > 
> >  > +                 "bytes %d errno %d\n", bytes, errno))
> > 
> >  > +               goto close_prog;
> > 
> >  > +
> > 
> >  > +       attr.config = strtol(buf, NULL, 0);
> > 
> >  > +       attr.type = PERF_TYPE_TRACEPOINT;
> > 
> >  > +       attr.sample_type = PERF_SAMPLE_RAW;
> > 
> >  > +       attr.sample_period = 1;
> > 
> >  > +       attr.wakeup_events = 1;
> > 
> >  > +
> > 
> >  > +       pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, 
> > -1, 0);
> > 
> >  > +       if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", 
> > pmu_fd,
> > 
> >  > +                 errno))
> > 
> >  > +               goto close_prog;
> > 
> >  > +
> > 
> >  > +       err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
> > 
> >  > +       if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
> > 
> >  > +                 errno))
> > 
> >  > +               goto close_pmu;
> > 
> >  > +
> > 
> >  > +       err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
> > 
> >  > +       if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", 
> > err,
> > 
> >  > +                 errno))
> > 
> >  > +               goto close_pmu;
> > 
> >  > +
> > 
> >  > +       /* trigger some syscalls */
> > 
> >  > +       sleep(1);
> > 
> >  > +
> > 
> >  > +       err = bpf_map_lookup_elem(nsidmap_fd, &key, &knsid);
> > 
> >  > +       if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", 
> > err, errno))
> > 
> >  > +               goto close_pmu;
> > 
> >  > +
> > 
> >  > +       if (stat("/proc/self/ns/pid", &st))
> > 
> >  > +               goto close_pmu;
> > 
> >  > +
> > 
> >  > +       if (CHECK(knsid != (__u32) st.st_ino, "compare_namespace_id",
> > 
> >  > +                 "kern knsid %u user unsid %u\n", knsid, (__u32) 
> > st.st_ino))
> > 
> >  > +               goto close_pmu;
> > 
> >  > +
> > 
> >  > +       exit_code = 0;
> > 
> >  > +       printf("%s:PASS\n", argv[0]);
> > 
> >  > +
> > 
> >  > +close_pmu:
> > 
> >  > +       close(pmu_fd);
> > 
> >  > +close_prog:
> > 
> >  > +       bpf_object__close(obj);
> > 
> >  > +cleanup_cgroup_env:
> > 
> >  > +       return exit_code;
> > 
> >  > +}
> > 
> >  > --
> > 
> >  > 2.11.0
> > 
> >  >
> > 
> >  >
> > 
> >  >
> > 
> >  >
> > 
> >  >
> > 
> >  >
> > 
> >  > On Thu, Aug 08, 2019 at 05:09:51AM +0000, Yonghong Song wrote:
> > 
> >  > >
> > 
> >  > >
> > 
> >  > > On 8/7/19 6:22 PM, Carlos Antonio Neira Bustos wrote:
> > 
> >  > > > The code has been modified to avoid syscalls that could sleep.
> > 
> >  > > > Please let me know if any other modification is needed.
> > 
> >  > > >
> > 
> >  > > >  From be0384c0fa209a78c1567936e8db4e35b9a7c0f8 Mon Sep 17 
> > 00:00:00 2001
> > 
> >  > > > From: Carlos <cneirabustos@gmail.com>
> > 
> >  > > > Date: Wed, 7 Aug 2019 20:04:30 -0400
> > 
> >  > > > Subject: [PATCH] [PATCH v5 bpf-next] BPF: New helper to obtain 
> > namespace data
> > 
> >  > > >   from current task
> > 
> >  > > >
> > 
> >  > > > This helper obtains the active namespace from current and returns 
> > pid, tgid,
> > 
> >  > > > device and namespace id as seen from that namespace, allowing to 
> > instrument
> > 
> >  > > > a process inside a container.
> > 
> >  > > > Device is read from /proc/self/ns/pid, as in the future it's 
> > possible that
> > 
> >  > > > different pid_ns files may belong to different devices, according
> > 
> >  > > > to the discussion between Eric Biederman and Yonghong in 2017 
> > linux plumbers
> > 
> >  > > > conference.
> > 
> >  > > > Currently bpf_get_current_pid_tgid(), is used to do pid filtering 
> > in bcc's
> > 
> >  > > > scripts but this helper returns the pid as seen by the root 
> > namespace which is
> > 
> >  > > > fine when a bcc script is not executed inside a container.
> > 
> >  > > > When the process of interest is inside a container, pid filtering 
> > will not work
> > 
> >  > > > if bpf_get_current_pid_tgid() is used. This helper addresses this 
> > limitation
> > 
> >  > > > returning the pid as it's seen by the current namespace where the 
> > script is
> > 
> >  > > > executing.
> > 
> >  > > >
> > 
> >  > > > This helper has the same use cases as bpf_get_current_pid_tgid() 
> > as it can be
> > 
> >  > > > used to do pid filtering even inside a container.
> > 
> >  > > >
> > 
> >  > > > For example a bcc script using bpf_get_current_pid_tgid() 
> > (tools/funccount.py):
> > 
> >  > > >
> > 
> >  > > >          u32 pid = bpf_get_current_pid_tgid() >> 32;
> > 
> >  > > >          if (pid != <pid_arg_passed_in>)
> > 
> >  > > >                  return 0;
> > 
> >  > > > Could be modified to use bpf_get_current_pidns_info() as follows:
> > 
> >  > > >
> > 
> >  > > >          struct bpf_pidns pidns;
> > 
> >  > > >          bpf_get_current_pidns_info(&pidns, sizeof(struct 
> > bpf_pidns));
> > 
> >  > > >          u32 pid = pidns.tgid;
> > 
> >  > > >          u32 nsid = pidns.nsid;
> > 
> >  > > >          if ((pid != <pid_arg_passed_in>) && (nsid != 
> > <nsid_arg_passed_in>))
> > 
> >  > > >                  return 0;
> > 
> >  > > >
> > 
> >  > > > To find out the PID namespace id of a process, you could use 
> > this command:
> > 
> >  > > >
> > 
> >  > > > $ ps -h -o pidns -p <pid_of_interest>
> > 
> >  > > >
> > 
> >  > > > Or this other command:
> > 
> >  > > >
> > 
> >  > > > $ ls -Li /proc/<pid_of_interest>/ns/pid
> > 
> >  > > >
> > 
> >  > > > Signed-off-by: Carlos Neira <cneirabustos@gmail.com>
> > 
> >  > > > ---
> > 
> >  > > >   fs/namei.c                                         |   2 +-
> > 
> >  > > >   include/linux/bpf.h                                |   1 +
> > 
> >  > > >   include/linux/namei.h                              |   4 +
> > 
> >  > > >   include/uapi/linux/bpf.h                           |  29 ++++-
> > 
> >  > > >   kernel/bpf/core.c                                  |   1 +
> > 
> >  > > >   kernel/bpf/helpers.c                               |  78 
> > ++++++++++++
> > 
> >  > > >   kernel/trace/bpf_trace.c                           |   2 +
> > 
> >  > > >   samples/bpf/Makefile                               |   3 +
> > 
> >  > > >   samples/bpf/trace_ns_info_user.c                   |  35 ++++++
> > 
> >  > > >   samples/bpf/trace_ns_info_user_kern.c              |  44 +++++++
> > 
> >  > > >   tools/include/uapi/linux/bpf.h                     |  29 ++++-
> > 
> >  > > >   tools/testing/selftests/bpf/Makefile               |   2 +-
> > 
> >  > > >   tools/testing/selftests/bpf/bpf_helpers.h          |   3 +
> > 
> >  > > >   .../testing/selftests/bpf/progs/test_pidns_kern.c  |  51 ++++++++
> > 
> >  > > >   tools/testing/selftests/bpf/test_pidns.c           | 138 
> > +++++++++++++++++++++
> > 
> >  > > >   15 files changed, 418 insertions(+), 4 deletions(-)
> > 
> >  > > >   create mode 100644 samples/bpf/trace_ns_info_user.c
> > 
> >  > > >   create mode 100644 samples/bpf/trace_ns_info_user_kern.c
> > 
> >  > > >   create mode 100644 
> > tools/testing/selftests/bpf/progs/test_pidns_kern.c
> > 
> >  > > >   create mode 100644 tools/testing/selftests/bpf/test_pidns.c
> > 
> >  > > >
> > 
> >  > > > diff --git a/fs/namei.c b/fs/namei.c
> > 
> >  > > > index 209c51a5226c..d1eca36972d2 100644
> > 
> >  > > > --- a/fs/namei.c
> > 
> >  > > > +++ b/fs/namei.c
> > 
> >  > > > @@ -19,7 +19,6 @@
> > 
> >  > > >   #include <linux/export.h>
> > 
> >  > > >   #include <linux/kernel.h>
> > 
> >  > > >   #include <linux/slab.h>
> > 
> >  > > > -#include <linux/fs.h>
> > 
> >  > > >   #include <linux/namei.h>
> > 
> >  > > >   #include <linux/pagemap.h>
> > 
> >  > > >   #include <linux/fsnotify.h>
> > 
> >  > > > @@ -2355,6 +2354,7 @@ int filename_lookup(int dfd, struct 
> > filename *name, unsigned flags,
> > 
> >  > > >     putname(name);
> > 
> >  > > >     return retval;
> > 
> >  > > >   }
> > 
> >  > > > +EXPORT_SYMBOL(filename_lookup);
> > 
> >  > >
> > 
> >  > > No need to export symbols. bpf uses it and bpf is in the core, not in
> > 
> >  > > modules.
> > 
> >  > >
> > 
> >  > > >
> > 
> >  > > >   /* Returns 0 and nd will be valid on success; Retuns error, 
> > otherwise. */
> > 
> >  > > >   static int path_parentat(struct nameidata *nd, unsigned flags,
> > 
> >  > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > 
> >  > > > index f9a506147c8a..e4adf5e05afd 100644
> > 
> >  > > > --- a/include/linux/bpf.h
> > 
> >  > > > +++ b/include/linux/bpf.h
> > 
> >  > > > @@ -1050,6 +1050,7 @@ extern const struct bpf_func_proto 
> > bpf_get_local_storage_proto;
> > 
> >  > > >   extern const struct bpf_func_proto bpf_strtol_proto;
> > 
> >  > > >   extern const struct bpf_func_proto bpf_strtoul_proto;
> > 
> >  > > >   extern const struct bpf_func_proto bpf_tcp_sock_proto;
> > 
> >  > > > +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;
> > 
> >  > > >
> > 
> >  > > >   /* Shared helpers among cBPF and eBPF. */
> > 
> >  > > >   void bpf_user_rnd_init_once(void);
> > 
> >  > > > diff --git a/include/linux/namei.h b/include/linux/namei.h
> > 
> >  > > > index 9138b4471dbf..2c24e8c71d46 100644
> > 
> >  > > > --- a/include/linux/namei.h
> > 
> >  > > > +++ b/include/linux/namei.h
> > 
> >  > > > @@ -6,6 +6,7 @@
> > 
> >  > > >   #include <linux/path.h>
> > 
> >  > > >   #include <linux/fcntl.h>
> > 
> >  > > >   #include <linux/errno.h>
> > 
> >  > > > +#include <linux/fs.h>
> > 
> >  > > >
> > 
> >  > > >   enum { MAX_NESTED_LINKS = 8 };
> > 
> >  > > >
> > 
> >  > > > @@ -97,6 +98,9 @@ extern void unlock_rename(struct dentry *, 
> > struct dentry *);
> > 
> >  > > >
> > 
> >  > > >   extern void nd_jump_link(struct path *path);
> > 
> >  > > >
> > 
> >  > > > +extern int filename_lookup(int dfd, struct filename *name, 
> > unsigned int flags,
> > 
> >  > > > +               struct path *path, struct path *root);
> > 
> >  > >
> > 
> >  > > The previous definition in fs/internal.h should be removed.
> > 
> >  > >
> > 
> >  > > > +
> > 
> >  > > >   static inline void nd_terminate_link(void *name, size_t len, 
> > size_t maxlen)
> > 
> >  > > >   {
> > 
> >  > > >     ((char *) name)[min(len, maxlen)] = '\0';
> > 
> >  > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > 
> >  > > > index 4393bd4b2419..6f601f7106e2 100644
> > 
> >  > > > --- a/include/uapi/linux/bpf.h
> > 
> >  > > > +++ b/include/uapi/linux/bpf.h
> > 
> >  > > > @@ -2741,6 +2741,26 @@ union bpf_attr {
> > 
> >  > > >    *                **-EOPNOTSUPP** kernel configuration does not 
> > enable SYN cookies
> > 
> >  > > >    *
> > 
> >  > > >    *                **-EPROTONOSUPPORT** IP packet version is not 
> > 4 or 6
> > 
> >  > > > + *
> > 
> >  > > > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, 
> > u32 size_of_pidns)
> > 
> >  > > > + * Description
> > 
> >  > > > + *         Copies into *pidns* pid, namespace id and tgid as 
> > seen by the
> > 
> >  > > > + *         current namespace and also device from /proc/self/ns/pid.
> > 
> >  > > > + *         *size_of_pidns* must be the size of *pidns*
> > 
> >  > > > + *
> > 
> >  > > > + *         This helper is used when pid filtering is needed inside a
> > 
> >  > > > + *         container as bpf_get_current_tgid() helper returns 
> > always the
> > 
> >  > > > + *         pid id as seen by the root namespace.
> > 
> >  > > > + * Return
> > 
> >  > > > + *         0 on success
> > 
> >  > > > + *
> > 
> >  > > > + *         **-EINVAL**  if unable to get ns, pid or tgid of 
> > current task.
> > 
> >  > > > + *         Or if size_of_pidns is not valid.
> > 
> >  > >
> > 
> >  > > Maybe reword by following the code sequence.
> > 
> >  > >     if *size_of_pidns* is not valid or unable to get ns, pid or tgid of
> > 
> >  > >     the current task.
> > 
> >  > >
> > 
> >  > > > + *
> > 
> >  > > > + *         **-ENOMEM**  if allocation fails.
> > 
> >  > >
> > 
> >  > > Maybe some other error codes in filename_lookup() function?
> > 
> >  > >
> > 
> >  > > > + *
> > 
> >  > > > + *         If unable to get the inode from /proc/self/ns/pid an 
> > error code
> > 
> >  > > > + *         will be returned.
> > 
> >  > >
> > 
> >  > > You do not need this. The description of error code cases should 
> > cover this.
> > 
> >  > >
> > 
> >  > > >    */
> > 
> >  > > >   #define __BPF_FUNC_MAPPER(FN)             \
> > 
> >  > > >     FN(unspec),                     \
> > 
> >  > > > @@ -2853,7 +2873,8 @@ union bpf_attr {
> > 
> >  > > >     FN(sk_storage_get),             \
> > 
> >  > > >     FN(sk_storage_delete),          \
> > 
> >  > > >     FN(send_signal),                \
> > 
> >  > > > -   FN(tcp_gen_syncookie),
> > 
> >  > > > +   FN(tcp_gen_syncookie),          \
> > 
> >  > > > +   FN(get_current_pidns_info),
> > 
> >  > > >
> > 
> >  > > >   /* integer value in 'imm' field of BPF_CALL instruction selects 
> > which helper
> > 
> >  > > >    * function eBPF program intends to call
> > 
> >  > > > @@ -3604,4 +3625,10 @@ struct bpf_sockopt {
> > 
> >  > > >     __s32   retval;
> > 
> >  > > >   };
> > 
> >  > > >
> > 
> >  > > > +struct bpf_pidns_info {
> > 
> >  > > > +   __u32 dev;
> > 
> >  > > > +   __u32 nsid;
> > 
> >  > > > +   __u32 tgid;
> > 
> >  > > > +   __u32 pid;
> > 
> >  > > > +};
> > 
> >  > > >   #endif /* _UAPI__LINUX_BPF_H__ */
> > 
> >  > > > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> > 
> >  > > > index 8191a7db2777..3159f2a0188c 100644
> > 
> >  > > > --- a/kernel/bpf/core.c
> > 
> >  > > > +++ b/kernel/bpf/core.c
> > 
> >  > > > @@ -2038,6 +2038,7 @@ const struct bpf_func_proto 
> > bpf_get_current_uid_gid_proto __weak;
> > 
> >  > > >   const struct bpf_func_proto bpf_get_current_comm_proto __weak;
> > 
> >  > > >   const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
> > 
> >  > > >   const struct bpf_func_proto bpf_get_local_storage_proto __weak;
> > 
> >  > > > +const struct bpf_func_proto bpf_get_current_pidns_info __weak;
> > 
> >  > > >
> > 
> >  > > >   const struct bpf_func_proto * __weak 
> > bpf_get_trace_printk_proto(void)
> > 
> >  > > >   {
> > 
> >  > > > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > 
> >  > > > index 5e28718928ca..571f24077db2 100644
> > 
> >  > > > --- a/kernel/bpf/helpers.c
> > 
> >  > > > +++ b/kernel/bpf/helpers.c
> > 
> >  > > > @@ -11,6 +11,12 @@
> > 
> >  > > >   #include <linux/uidgid.h>
> > 
> >  > > >   #include <linux/filter.h>
> > 
> >  > > >   #include <linux/ctype.h>
> > 
> >  > > > +#include <linux/pid_namespace.h>
> > 
> >  > > > +#include <linux/major.h>
> > 
> >  > > > +#include <linux/stat.h>
> > 
> >  > > > +#include <linux/namei.h>
> > 
> >  > > > +#include <linux/version.h>
> > 
> >  > > > +
> > 
> >  > > >
> > 
> >  > > >   #include "../../lib/kstrtox.h"
> > 
> >  > > >
> > 
> >  > > > @@ -312,6 +318,78 @@ void copy_map_value_locked(struct bpf_map 
> > *map, void *dst, void *src,
> > 
> >  > > >     preempt_enable();
> > 
> >  > > >   }
> > 
> >  > > >
> > 
> >  > > > +BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, 
> > pidns_info, u32,
> > 
> >  > > > +    size)
> > 
> >  > > > +{
> > 
> >  > > > +   const char *name = "/proc/self/ns/pid";
> > 
> >  > >
> > 
> >  > > maybe rename this variable to pidns_path?
> > 
> >  > >
> > 
> >  > > > +   struct pid_namespace *pidns = NULL;
> > 
> >  > > > +   struct filename *tmp = NULL;
> > 
> >  > >
> > 
> >  > > Maybe rename this variable to name?
> > 
> >  > >
> > 
> >  > > > +   int len = strlen(name) + 1;
> > 
> >  > >
> > 
> >  > > We can delay this assignment later until it is needed.
> > 
> >  > >
> > 
> >  > > > +   struct inode *inode;
> > 
> >  > > > +   struct path kp;
> > 
> >  > > > +   pid_t tgid = 0;
> > 
> >  > > > +   pid_t pid = 0;
> > 
> >  > > > +   int ret;
> > 
> >  > > > +
> > 
> >  > > > +   if (unlikely(size != sizeof(struct bpf_pidns_info)))
> > 
> >  > > > +           return -EINVAL;
> > 
> >  > > > +
> > 
> >  > > > +   pidns = task_active_pid_ns(current);
> > 
> >  > > > +
> > 
> >  > >
> > 
> >  > > we can save an empty line here.
> > 
> >  > >
> > 
> >  > > > +   if (unlikely(!pidns))
> > 
> >  > > > +           goto clear;
> > 
> >  > > > +
> > 
> >  > > > +   pidns_info->nsid =  pidns->ns.inum;
> > 
> >  > > > +   pid = task_pid_nr_ns(current, pidns);
> > 
> >  > > > +
> > 
> >  > >
> > 
> >  > > We can save an empty line here.
> > 
> >  > >
> > 
> >  > > > +   if (unlikely(!pid))
> > 
> >  > > > +           goto clear;
> > 
> >  > > > +
> > 
> >  > > > +   tgid = task_tgid_nr_ns(current, pidns);
> > 
> >  > > > +
> > 
> >  > > ditto. save an empty line.
> > 
> >  > > > +   if (unlikely(!tgid))
> > 
> >  > > > +           goto clear;
> > 
> >  > > > +
> > 
> >  > > > +   pidns_info->tgid = (u32) tgid;
> > 
> >  > > > +   pidns_info->pid = (u32) pid;
> > 
> >  > > > +
> > 
> >  > > > +   tmp = kmem_cache_alloc(names_cachep, GFP_ATOMIC);
> > 
> >  > > > +   if (unlikely(!tmp)) {
> > 
> >  > > > +           memset((void *)pidns_info, 0, (size_t) size);
> > 
> >  > > > +           return -ENOMEM;
> > 
> >  > > > +   }
> > 
> >  > > > +
> > 
> >  > > > +   memcpy((char *)tmp->name, name, len);
> > 
> >  > > > +   tmp->uptr = NULL;
> > 
> >  > > > +   tmp->aname = NULL;
> > 
> >  > > > +   tmp->refcnt = 1;
> > 
> >  > > > +
> > 
> >  > > ditto. save an empty line.
> > 
> >  > > > +   ret = filename_lookup(AT_FDCWD, tmp, 0, &kp, NULL);
> > 
> >  > > > +
> > 
> >  > > ditto. save an empty line.
> > 
> >  > > > +   if (ret) {
> > 
> >  > > > +           memset((void *)pidns_info, 0, (size_t) size);
> > 
> >  > > > +           return ret;
> > 
> >  > > > +   }
> > 
> >  > > > +
> > 
> >  > > > +   inode = d_backing_inode(kp.dentry);
> > 
> >  > > > +   pidns_info->dev = inode->i_sb->s_dev;
> > 
> >  > > > +
> > 
> >  > > > +   return 0;
> > 
> >  > > > +
> > 
> >  > > > +clear:
> > 
> >  > > > +   memset((void *)pidns_info, 0, (size_t) size);
> > 
> >  > > > +
> > 
> >  > > save an empty line.
> > 
> >  > > > +   return -EINVAL;
> > 
> >  > > > +}
> > 
> >  > > > +
> > 
> >  > > > +const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
> > 
> >  > > > +   .func   = bpf_get_current_pidns_info,
> > 
> >  > > make the "= " aligned with others?
> > 
> >  > > > +   .gpl_only       = false,
> > 
> >  > > > +   .ret_type       = RET_INTEGER,
> > 
> >  > > > +   .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
> > 
> >  > > > +   .arg2_type      = ARG_CONST_SIZE,
> > 
> >  > > > +};
> > 
> >  > > > +
> > 
> >  > > >   #ifdef CONFIG_CGROUPS
> > 
> >  > > >   BPF_CALL_0(bpf_get_current_cgroup_id)
> > 
> >  > > >   {
> > 
> >  > > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > 
> >  > > > index ca1255d14576..5e1dc22765a5 100644
> > 
> >  > > > --- a/kernel/trace/bpf_trace.c
> > 
> >  > > > +++ b/kernel/trace/bpf_trace.c
> > 
> >  > > > @@ -709,6 +709,8 @@ tracing_func_proto(enum bpf_func_id func_id, 
> > const struct bpf_prog *prog)
> > 
> >  > > >   #endif
> > 
> >  > > >     case BPF_FUNC_send_signal:
> > 
> >  > > >             return &bpf_send_signal_proto;
> > 
> >  > > > +   case BPF_FUNC_get_current_pidns_info:
> > 
> >  > > > +           return &bpf_get_current_pidns_info_proto;
> > 
> >  > > >     default:
> > 
> >  > > >             return NULL;
> > 
> >  > > >     }
> > 
> >  > > > diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> > 
> >  > > > index 1d9be26b4edd..238453ff27d2 100644
> > 
> >  > > > --- a/samples/bpf/Makefile
> > 
> >  > > > +++ b/samples/bpf/Makefile
> > 
> >  > > > @@ -53,6 +53,7 @@ hostprogs-y += task_fd_query
> > 
> >  > > >   hostprogs-y += xdp_sample_pkts
> > 
> >  > > >   hostprogs-y += ibumad
> > 
> >  > > >   hostprogs-y += hbm
> > 
> >  > > > +hostprogs-y += trace_ns_info
> > 
> >  > > [...]
> > 

^ permalink raw reply

* Re: [PATCH net-next] net/ncsi: allow to customize BMC MAC Address offset
From: Andrew Lunn @ 2019-08-08 21:16 UTC (permalink / raw)
  To: Tao Ren
  Cc: Jakub Kicinski, Samuel Mendoza-Jonas, David S . Miller,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	openbmc@lists.ozlabs.org, William Kennington, Joel Stanley
In-Reply-To: <77762b10-b8e7-b8a4-3fc0-e901707a1d54@fb.com>

On Thu, Aug 08, 2019 at 07:02:54PM +0000, Tao Ren wrote:
> Hi Andrew,
> 
> On 8/8/19 6:32 AM, Andrew Lunn wrote:
> >> Let me prepare patch v2 using device tree. I'm not sure if standard
> >> "mac-address" fits this situation because all we need is an offset
> >> (integer) and BMC MAC is calculated by adding the offset to NIC's
> >> MAC address. Anyways, let me work out v2 patch we can discuss more
> >> then.
> > 
> > Hi Tao
> > 
> > I don't know BMC terminology. By NICs MAC address, you are referring
> > to the hosts MAC address? The MAC address the big CPU is using for its
> > interface?  Where does this NIC get its MAC address from? If the BMCs
> > bootloader has access to it, it can set the mac-address property in
> > the device tree.
> 
> Sorry for the confusion and let me clarify more:
> 

> The NIC here refers to the Network controller which provide network
> connectivity for both BMC (via NC-SI) and Host (for example, via
> PCIe).
> 

> On Facebook Yamp BMC, BMC sends NCSI_OEM_GET_MAC command (as an
> ethernet packet) to the Network Controller while bringing up eth0,
> and the (Broadcom) Network Controller replies with the Base MAC
> Address reserved for the platform. As for Yamp, Base-MAC and
> Base-MAC+1 are used by Host (big CPU) and Base-MAC+2 are assigned to
> BMC. In my opinion, Base MAC and MAC address assignments are
> controlled by Network Controller, which is transparent to both BMC
> and Host.

Hi Tao

I've not done any work in the BMC field, so thanks for explaining
this.

In a typical embedded system, each network interface is assigned a MAC
address by the vendor. But here, things are different. The BMC SoC
network interface has not been assigned a MAC address, it needs to ask
the network controller for its MAC address, and then do some magical
transformation on the answer to derive a MAC address for
itself. Correct?

It seems like a better design would have been for the BMC to send a
NCSI_OEM_GET_BMC_MAC and get back the MAC address the BMC should
use. No magic involved. But I guess it is too late to do that now.

> I'm not sure if I understand your suggestion correctly: do you mean
> we should move the logic (GET_MAC from Network Controller, adding
> offset and configuring BMC MAC) from kernel to boot loader?

In general, the kernel is generic. It probably boots on any ARM system
which it has the needed modules for. The bootloader is often much more
specific. It might not be fully platform specific, but it will be at
least specific to the general family of BMC SoCs. If you consider the
combination of the BMC bootloader and the device tree blob, you have
something specific to the platform. This magical transformation of
adding 2 seems to be very platform specific. So having this magic in
the bootloader+DT seems like the best place to put it.

However, how you pass the resulting MAC address to the kernel should
be as generic as possible. The DT "mac-address" property is very
generic, many MAC drivers understand it. Using it also allows for
vendors which actually assign a MAC address to the BMC to pass it to
the BMC, avoiding all this NCSI_OEM_GET_MAC handshake. Having an API
which just passes '2' is not generic at all.

    Andrew

^ permalink raw reply

* Re: [PATCH] powerpc/kmcent2: update the ethernet devices' phy properties
From: Valentin Longchamp @ 2019-08-08 21:09 UTC (permalink / raw)
  To: Madalin-cristian Bucur
  Cc: Scott Wood, linuxppc-dev@lists.ozlabs.org,
	galak@kernel.crashing.org, netdev@vger.kernel.org
In-Reply-To: <VI1PR04MB55679AAE8DDC3160B9CCE073ECDC0@VI1PR04MB5567.eurprd04.prod.outlook.com>

Le mar. 30 juil. 2019 à 11:44, Madalin-cristian Bucur
<madalin.bucur@nxp.com> a écrit :
>
> > -----Original Message-----
> >
> > > Le dim. 14 juil. 2019 à 22:05, Valentin Longchamp
> > > <valentin@longchamp.me> a écrit :
> > > >
> > > > Change all phy-connection-type properties to phy-mode that are better
> > > > supported by the fman driver.
> > > >
> > > > Use the more readable fixed-link node for the 2 sgmii links.
> > > >
> > > > Change the RGMII link to rgmii-id as the clock delays are added by the
> > > > phy.
> > > >
> > > > Signed-off-by: Valentin Longchamp <valentin@longchamp.me>
> >
> > I don't see any other uses of phy-mode in arch/powerpc/boot/dts/fsl, and I see
> > lots of phy-connection-type with fman.  Madalin, does this patch look OK?
> >
> > -Scott
>
> Hi,
>
> we are using "phy-connection-type" not "phy-mode" for the NXP (former Freescale)
> DPAA platforms. While the two seem to be interchangeable ("phy-mode" seems to be
> more recent, looking at the device tree bindings), the driver code in Linux seems
> to use one or the other, not both so one should stick with the variant the driver
> is using. To make things more complex, there may be dependencies in bootloaders,
> I see code in u-boot using only "phy-connection-type" or only "phy-mode".
>
> I'd leave "phy-connection-type" as is.

So I have finally had time to have a look and now I understand what
happens. You are right, there are bootloader dependencies: u-boot
calls fdt_fixup_phy_connection() that somehow in our case adds (or
changes if already in the device tree) the phy-connection-type
property to a wrong value! By having a phy-mode in the device tree,
which is not changed by u-boot and which the kernel fman driver happens
to pick up (via of_get_phy_mode()) in preference to phy-connection-type,
the patch below fixes it for us.

I agree with you, it's not correct to have both phy-connection-type
and phy-mode. Ideally, u-boot on the board should be reworked so that
it does not perform the above wrong fixup. However, in an "unfixed"
.dtb (I have disabled fdt_fixup_phy_connection), the device tree in
the end only has either phy-connection-type or phy-mode, according to
what was chosen in the .dts file. And the fman driver works well with
both (thanks to the call to of_get_phy_mode() ). I would therefore
argue that even if all other DPAA platforms use phy-connection-type,
phy-mode is valid as well. (Furthermore we already have hundreds of
such boards in the field and we don't really support "remote" u-boot
update, so the u-boot fix is going to be difficult for us to pull).

Valentin

>
> Madalin
>
> > > > ---
> > > >  arch/powerpc/boot/dts/fsl/kmcent2.dts | 16 +++++++++++-----
> > > >  1 file changed, 11 insertions(+), 5 deletions(-)
> > > >
> > > > diff --git a/arch/powerpc/boot/dts/fsl/kmcent2.dts
> > > > b/arch/powerpc/boot/dts/fsl/kmcent2.dts
> > > > index 48b7f9797124..c3e0741cafb1 100644
> > > > --- a/arch/powerpc/boot/dts/fsl/kmcent2.dts
> > > > +++ b/arch/powerpc/boot/dts/fsl/kmcent2.dts
> > > > @@ -210,13 +210,19 @@
> > > >
> > > >                 fman@400000 {
> > > >                         ethernet@e0000 {
> > > > -                               fixed-link = <0 1 1000 0 0>;
> > > > -                               phy-connection-type = "sgmii";
> > > > +                               phy-mode = "sgmii";
> > > > +                               fixed-link {
> > > > +                                       speed = <1000>;
> > > > +                                       full-duplex;
> > > > +                               };
> > > >                         };
> > > >
> > > >                         ethernet@e2000 {
> > > > -                               fixed-link = <1 1 1000 0 0>;
> > > > -                               phy-connection-type = "sgmii";
> > > > +                               phy-mode = "sgmii";
> > > > +                               fixed-link {
> > > > +                                       speed = <1000>;
> > > > +                                       full-duplex;
> > > > +                               };
> > > >                         };
> > > >
> > > >                         ethernet@e4000 {
> > > > @@ -229,7 +235,7 @@
> > > >
> > > >                         ethernet@e8000 {
> > > >                                 phy-handle = <&front_phy>;
> > > > -                               phy-connection-type = "rgmii";
> > > > +                               phy-mode = "rgmii-id";
> > > >                         };
> > > >
> > > >                         mdio0: mdio@fc000 {
> > > > --
> > > > 2.17.1
> > > >
> > >
> > >
>

^ permalink raw reply

* Re: [PATCH net-next 00/10] drop_monitor: Capture dropped packets and metadata
From: David Ahern @ 2019-08-08 21:08 UTC (permalink / raw)
  To: Ido Schimmel, netdev
  Cc: davem, nhorman, jiri, toke, roopa, nikolay, jakub.kicinski, andy,
	f.fainelli, andrew, vivien.didelot, mlxsw, Ido Schimmel
In-Reply-To: <20190807103059.15270-1-idosch@idosch.org>

On 8/7/19 4:30 AM, Ido Schimmel wrote:
> Example usage with patched dropwatch [1] can be found here [2]. Example
> dissection of drop monitor netlink events with patched wireshark [3] can
> be found here [4]. I will submit both changes upstream after the kernel
> changes are accepted. Another change worth making is adding a dropmon
> pseudo interface to libpcap, similar to the nflog interface [5]. This
> will allow users to specifically listen on dropmon traffic instead of
> capturing all netlink packets via the nlmon netdev.

Nice work, Ido.

On top of your dropwatch changes I added the ability to print the
payload as hex. e.g.,

Issue Ctrl-C to stop monitoring
drop at: nf_hook_slow+0x59/0x98 (0xffffffff814ec532)
input port ifindex: 1
timestamp: Thu Aug  8 15:04:02 2019 360015026 nsec
length: 64
00 00 00 00 00 00 00 00  00 00 00 00 08 00 45 00      ........ ......E.
00 3c e7 50 40 00 40 06  55 69 7f 00 00 01 7f 00      .<.P@.@. Ui......
00 01 80 2c 30 39 74 b9  c7 4d 00 00 00 00 a0 02      ...,09t. .M......
ff d7 fe 30 00 00 02 04  ff d7 04 02 08 0a 53 79       ...0.... ......Sy
original length: 74


Seems like the skb protocol is also needed to properly parse the payload
- i.e., to know it is an ethernet header, followed by ip and tcp.

^ permalink raw reply

* Re: [PATCH net-next] net: openvswitch: Set OvS recirc_id from tc chain index
From: Pravin Shelar @ 2019-08-08 20:53 UTC (permalink / raw)
  To: Paul Blakey
  Cc: Linux Kernel Network Developers, David S. Miller, Justin Pettit,
	Simon Horman, Marcelo Ricardo Leitner, Vlad Buslov, Jiri Pirko,
	Roi Dayan, Yossi Kuperman, Rony Efraim, Oz Shlomo
In-Reply-To: <1565179722-22488-1-git-send-email-paulb@mellanox.com>

On Wed, Aug 7, 2019 at 5:08 AM Paul Blakey <paulb@mellanox.com> wrote:
>
> Offloaded OvS datapath rules are translated one to one to tc rules,
> for example the following simplified OvS rule:
>
> recirc_id(0),in_port(dev1),eth_type(0x0800),ct_state(-trk) actions:ct(),recirc(2)
>
> Will be translated to the following tc rule:
>
> $ tc filter add dev dev1 ingress \
>             prio 1 chain 0 proto ip \
>                 flower tcp ct_state -trk \
>                 action ct pipe \
>                 action goto chain 2
>
> Received packets will first travel through tc, and if they aren't stolen
> by it, like in the above rule, they will continue to OvS datapath.
> Since we already did some actions (action ct in this case) which might
> modify the packets, and updated action stats, we would like to continue
> the processing with the correct recirc_id in OvS (here recirc_id(2))
> where we left off.
>
> To support this, introduce a new skb extension for tc, which
> will be used for translating tc chain to ovs recirc_id to
> handle these miss cases. Last tc chain index will be set
> by tc goto chain action and read by OvS datapath.
>
> Signed-off-by: Paul Blakey <paulb@mellanox.com>
> Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
> Acked-by: Jiri Pirko <jiri@mellanox.com>
> ---
>  include/linux/skbuff.h    | 13 +++++++++++++
>  include/net/sch_generic.h |  5 ++++-
>  net/core/skbuff.c         |  6 ++++++
>  net/openvswitch/flow.c    |  9 +++++++++
>  net/sched/Kconfig         | 13 +++++++++++++
>  net/sched/act_api.c       |  1 +
>  net/sched/cls_api.c       | 12 ++++++++++++
>  7 files changed, 58 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 3aef8d8..fb2a792 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -279,6 +279,16 @@ struct nf_bridge_info {
>  };
>  #endif
>
> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> +/* Chain in tc_skb_ext will be used to share the tc chain with
> + * ovs recirc_id. It will be set to the current chain by tc
> + * and read by ovs to recirc_id.
> + */
> +struct tc_skb_ext {
> +       __u32 chain;
> +};
> +#endif
> +
>  struct sk_buff_head {
>         /* These two members must be first. */
>         struct sk_buff  *next;
> @@ -4050,6 +4060,9 @@ enum skb_ext_id {
>  #ifdef CONFIG_XFRM
>         SKB_EXT_SEC_PATH,
>  #endif
> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> +       TC_SKB_EXT,
> +#endif
>         SKB_EXT_NUM, /* must be last */
>  };
>
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index 6b6b012..871feea 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -275,7 +275,10 @@ struct tcf_result {
>                         unsigned long   class;
>                         u32             classid;
>                 };
> -               const struct tcf_proto *goto_tp;
> +               struct {
> +                       const struct tcf_proto *goto_tp;
> +                       u32 goto_index;
> +               };
>
>                 /* used in the skb_tc_reinsert function */
>                 struct {
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index ea8e8d3..2b40b5a 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -4087,6 +4087,9 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
>  #ifdef CONFIG_XFRM
>         [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
>  #endif
> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> +       [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
> +#endif
>  };
>
>  static __always_inline unsigned int skb_ext_total_length(void)
> @@ -4098,6 +4101,9 @@ static __always_inline unsigned int skb_ext_total_length(void)
>  #ifdef CONFIG_XFRM
>                 skb_ext_type_len[SKB_EXT_SEC_PATH] +
>  #endif
> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> +               skb_ext_type_len[TC_SKB_EXT] +
> +#endif
>                 0;
>  }
>
> diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
> index bc89e16..0287ead 100644
> --- a/net/openvswitch/flow.c
> +++ b/net/openvswitch/flow.c
> @@ -816,6 +816,9 @@ static int key_extract_mac_proto(struct sk_buff *skb)
>  int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
>                          struct sk_buff *skb, struct sw_flow_key *key)
>  {
> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> +       struct tc_skb_ext *tc_ext;
> +#endif
>         int res, err;
>
>         /* Extract metadata from packet. */
> @@ -848,7 +851,13 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
>         if (res < 0)
>                 return res;
>         key->mac_proto = res;
> +
> +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
> +       tc_ext = skb_ext_find(skb, TC_SKB_EXT);
> +       key->recirc_id = tc_ext ? tc_ext->chain : 0;
> +#else
>         key->recirc_id = 0;
> +#endif
>
In most cases the config would be turned on, so the ifdef is not that
useful. Can you add a static key to avoid searching the skb-ext in
non-offload cases?

^ permalink raw reply

* Re: [PATCH v5 bpf-next] BPF: helpers: New helper to obtain namespacedata from current task
From: Yonghong Song @ 2019-08-08 20:47 UTC (permalink / raw)
  To: carlos antonio neira bustos, Y Song
  Cc: netdev@vger.kernel.org, ebiederm@xmission.com, brouer@redhat.com,
	quentin.monnet@netronome.com
In-Reply-To: <5d4c856b.1c69fb81.2aa4f.32dd@mx.google.com>



On 8/8/19 1:26 PM, carlos antonio neira bustos wrote:
> Hi Yonghong,
> 
> I’m sorry, just to be sure, I’m just missing the error codes from 
> filename_lookup() right ?.

From a kernel functionality point of view, yes, I am talking about
error codes returned by filename_lookup().
For example, if CONFIG_PID_NS or CONFIG_NAMESPACES is not
defined in the config, the path "/proc/self/ns/pid" will not exist
and an error code will be returned. It may be -ENOTDIR
if CONFIG_NAMESPACES is not defined or -ECHILD if CONFIG_PID_NS
is not defined. Please double check.

Please do follow the advice in
 > https://lore.kernel.org/netdev/20190808174848.poybtaagg5ctle7t@dev00/T/#t
to break the single patch to multiple patches.

I only reviewed the kernel code. Will review tools/ code
in the next properly-formatted (broken-up) commits.

Also, please cc the commits to the bpf mailing list at
bpf@vger.kernel.org

> 
> Bests
> 
> Maybe some other error codes in filename_lookup() function?
> 
>  > + *
> 
>  > + *                      If unable to get the inode from 
> /proc/self/ns/pid an error code
> 
>  > + *                      will be returned.
> 
> *From: *Y Song <mailto:ys114321@gmail.com>
> *Sent: *08 August 2019 15:44
> *To: *Carlos Antonio Neira Bustos <mailto:cneirabustos@gmail.com>
> *Cc: *Yonghong Song <mailto:yhs@fb.com>; netdev@vger.kernel.org 
> <mailto:netdev@vger.kernel.org>; ebiederm@xmission.com 
> <mailto:ebiederm@xmission.com>; brouer@redhat.com 
> <mailto:brouer@redhat.com>; quentin.monnet@netronome.com 
> <mailto:quentin.monnet@netronome.com>
> *Subject: *Re: [PATCH v5 bpf-next] BPF: helpers: New helper to obtain 
> namespacedata from current task
> 
> On Thu, Aug 8, 2019 at 10:52 AM Carlos Antonio Neira Bustos
> 
> <cneirabustos@gmail.com> wrote:
> 
>  >
> 
>  > Yonghong,
> 
>  >
> 
>  > I have modified the patch following your feedback.
> 
>  > Let me know if I'm missing something.
> 
> Yes, I have some other requests about formating.
> 
> https://lore.kernel.org/netdev/20190808174848.poybtaagg5ctle7t@dev00/T/#t
> 
> Could you address it as well?
> 
>  >
> 
>  > Bests
> 
>  >
> 
>  > From 70f8d5584700c9cfc82c006901d8ee9595c53f15 Mon Sep 17 00:00:00 2001
> 
>  > From: Carlos <cneirabustos@gmail.com>
> 
>  > Date: Wed, 7 Aug 2019 20:04:30 -0400
> 
>  > Subject: [PATCH] [PATCH v6 bpf-next] BPF: New helper to obtain 
> namespace data
> 
>  >  from current task
> 
>  >
> 
>  > This helper obtains the active namespace from current and returns 
> pid, tgid,
> 
>  > device and namespace id as seen from that namespace, allowing to 
> instrument
> 
>  > a process inside a container.
> 
>  > Device is read from /proc/self/ns/pid, as in the future it's possible 
> that
> 
>  > different pid_ns files may belong to different devices, according
> 
>  > to the discussion between Eric Biederman and Yonghong in 2017 linux 
> plumbers
> 
>  > conference.
> 
>  > Currently bpf_get_current_pid_tgid(), is used to do pid filtering in 
> bcc's
> 
>  > scripts but this helper returns the pid as seen by the root namespace 
> which is
> 
>  > fine when a bcc script is not executed inside a container.
> 
>  > When the process of interest is inside a container, pid filtering 
> will not work
> 
>  > if bpf_get_current_pid_tgid() is used. This helper addresses this 
> limitation
> 
>  > returning the pid as it's seen by the current namespace where the 
> script is
> 
>  > executing.
> 
>  >
> 
>  > This helper has the same use cases as bpf_get_current_pid_tgid() as 
> it can be
> 
>  > used to do pid filtering even inside a container.
> 
>  >
> 
>  > For example a bcc script using bpf_get_current_pid_tgid() 
> (tools/funccount.py):
> 
>  >
> 
>  >         u32 pid = bpf_get_current_pid_tgid() >> 32;
> 
>  >         if (pid != <pid_arg_passed_in>)
> 
>  >                 return 0;
> 
>  > Could be modified to use bpf_get_current_pidns_info() as follows:
> 
>  >
> 
>  >         struct bpf_pidns pidns;
> 
>  >         bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns));
> 
>  >         u32 pid = pidns.tgid;
> 
>  >         u32 nsid = pidns.nsid;
> 
>  >         if ((pid != <pid_arg_passed_in>) && (nsid != 
> <nsid_arg_passed_in>))
> 
>  >                 return 0;
> 
>  >
> 
>  > To find out the name PID namespace id of a process, you could use 
> this command:
> 
>  >
> 
>  > $ ps -h -o pidns -p <pid_of_interest>
> 
>  >
> 
>  > Or this other command:
> 
>  >
> 
>  > $ ls -Li /proc/<pid_of_interest>/ns/pid
> 
>  >
> 
>  > Signed-off-by: Carlos Neira <cneirabustos@gmail.com>
> 
>  > ---
> 
>  >  fs/internal.h                                      |   2 -
> 
>  >  fs/namei.c                                         |   1 -
> 
>  >  include/linux/bpf.h                                |   1 +
> 
>  >  include/linux/namei.h                              |   4 +
> 
>  >  include/uapi/linux/bpf.h                           |  27 +++-
> 
>  >  kernel/bpf/core.c                                  |   1 +
> 
>  >  kernel/bpf/helpers.c                               |  64 ++++++++++
> 
>  >  kernel/trace/bpf_trace.c                           |   2 +
> 
>  >  samples/bpf/Makefile                               |   3 +
> 
>  >  samples/bpf/trace_ns_info_user.c                   |  35 ++++++
> 
>  >  samples/bpf/trace_ns_info_user_kern.c              |  44 +++++++
> 
>  >  tools/include/uapi/linux/bpf.h                     |  27 +++-
> 
>  >  tools/testing/selftests/bpf/Makefile               |   2 +-
> 
>  >  tools/testing/selftests/bpf/bpf_helpers.h          |   3 +
> 
>  >  .../testing/selftests/bpf/progs/test_pidns_kern.c  |  51 ++++++++
> 
>  >  tools/testing/selftests/bpf/test_pidns.c           | 138 
> +++++++++++++++++++++
> 
>  >  16 files changed, 399 insertions(+), 6 deletions(-)
> 
>  >  create mode 100644 samples/bpf/trace_ns_info_user.c
> 
>  >  create mode 100644 samples/bpf/trace_ns_info_user_kern.c
> 
>  >  create mode 100644 tools/testing/selftests/bpf/progs/test_pidns_kern.c
> 
>  >  create mode 100644 tools/testing/selftests/bpf/test_pidns.c
> 
>  >
> 
>  > diff --git a/fs/internal.h b/fs/internal.h
> 
>  > index 315fcd8d237c..6647e15dd419 100644
> 
>  > --- a/fs/internal.h
> 
>  > +++ b/fs/internal.h
> 
>  > @@ -59,8 +59,6 @@ extern int finish_clean_context(struct fs_context *fc);
> 
>  >  /*
> 
>  >   * namei.c
> 
>  >   */
> 
>  > -extern int filename_lookup(int dfd, struct filename *name, unsigned 
> flags,
> 
>  > -                          struct path *path, struct path *root);
> 
>  >  extern int user_path_mountpoint_at(int, const char __user *, 
> unsigned int, struct path *);
> 
>  >  extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
> 
>  >                            const char *, unsigned int, struct path *);
> 
>  > diff --git a/fs/namei.c b/fs/namei.c
> 
>  > index 209c51a5226c..a89fc72a4a10 100644
> 
>  > --- a/fs/namei.c
> 
>  > +++ b/fs/namei.c
> 
>  > @@ -19,7 +19,6 @@
> 
>  >  #include <linux/export.h>
> 
>  >  #include <linux/kernel.h>
> 
>  >  #include <linux/slab.h>
> 
>  > -#include <linux/fs.h>
> 
>  >  #include <linux/namei.h>
> 
>  >  #include <linux/pagemap.h>
> 
>  >  #include <linux/fsnotify.h>
> 
>  > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> 
>  > index f9a506147c8a..e4adf5e05afd 100644
> 
>  > --- a/include/linux/bpf.h
> 
>  > +++ b/include/linux/bpf.h
> 
>  > @@ -1050,6 +1050,7 @@ extern const struct bpf_func_proto 
> bpf_get_local_storage_proto;
> 
>  >  extern const struct bpf_func_proto bpf_strtol_proto;
> 
>  >  extern const struct bpf_func_proto bpf_strtoul_proto;
> 
>  >  extern const struct bpf_func_proto bpf_tcp_sock_proto;
> 
>  > +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;
> 
>  >
> 
>  >  /* Shared helpers among cBPF and eBPF. */
> 
>  >  void bpf_user_rnd_init_once(void);
> 
>  > diff --git a/include/linux/namei.h b/include/linux/namei.h
> 
>  > index 9138b4471dbf..b45c8b6f7cb4 100644
> 
>  > --- a/include/linux/namei.h
> 
>  > +++ b/include/linux/namei.h
> 
>  > @@ -6,6 +6,7 @@
> 
>  >  #include <linux/path.h>
> 
>  >  #include <linux/fcntl.h>
> 
>  >  #include <linux/errno.h>
> 
>  > +#include <linux/fs.h>
> 
>  >
> 
>  >  enum { MAX_NESTED_LINKS = 8 };
> 
>  >
> 
>  > @@ -97,6 +98,9 @@ extern void unlock_rename(struct dentry *, struct 
> dentry *);
> 
>  >
> 
>  >  extern void nd_jump_link(struct path *path);
> 
>  >
> 
>  > +extern int filename_lookup(int dfd, struct filename *name, unsigned 
> flags,
> 
>  > +                          struct path *path, struct path *root);
> 
>  > +
> 
>  >  static inline void nd_terminate_link(void *name, size_t len, size_t 
> maxlen)
> 
>  >  {
> 
>  >         ((char *) name)[min(len, maxlen)] = '\0';
> 
>  > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> 
>  > index 4393bd4b2419..b0d4869fb860 100644
> 
>  > --- a/include/uapi/linux/bpf.h
> 
>  > +++ b/include/uapi/linux/bpf.h
> 
>  > @@ -2741,6 +2741,24 @@ union bpf_attr {
> 
>  >   *             **-EOPNOTSUPP** kernel configuration does not enable 
> SYN cookies
> 
>  >   *
> 
>  >   *             **-EPROTONOSUPPORT** IP packet version is not 4 or 6
> 
>  > + *
> 
>  > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 
> size_of_pidns)
> 
>  > + *     Description
> 
>  > + *             Copies into *pidns* pid, namespace id and tgid as 
> seen by the
> 
>  > + *             current namespace and also device from /proc/self/ns/pid.
> 
>  > + *             *size_of_pidns* must be the size of *pidns*
> 
>  > + *
> 
>  > + *             This helper is used when pid filtering is needed inside a
> 
>  > + *             container as bpf_get_current_tgid() helper returns 
> always the
> 
>  > + *             pid id as seen by the root namespace.
> 
>  > + *     Return
> 
>  > + *             0 on success
> 
>  > + *
> 
>  > + *             **-EINVAL** if *size_of_pidns* is not valid or unable 
> to get ns, pid
> 
>  > + *             or tgid of the current task.
> 
>  > + *
> 
>  > + *             **-ENOMEM**  if allocation fails.
> 
>  > + *
> 
>  >   */
> 
>  >  #define __BPF_FUNC_MAPPER(FN)          \
> 
>  >         FN(unspec),                     \
> 
>  > @@ -2853,7 +2871,8 @@ union bpf_attr {
> 
>  >         FN(sk_storage_get),             \
> 
>  >         FN(sk_storage_delete),          \
> 
>  >         FN(send_signal),                \
> 
>  > -       FN(tcp_gen_syncookie),
> 
>  > +       FN(tcp_gen_syncookie),          \
> 
>  > +       FN(get_current_pidns_info),
> 
>  >
> 
>  >  /* integer value in 'imm' field of BPF_CALL instruction selects 
> which helper
> 
>  >   * function eBPF program intends to call
> 
>  > @@ -3604,4 +3623,10 @@ struct bpf_sockopt {
> 
>  >         __s32   retval;
> 
>  >  };
> 
>  >
> 
>  > +struct bpf_pidns_info {
> 
>  > +       __u32 dev;
> 
>  > +       __u32 nsid;
> 
>  > +       __u32 tgid;
> 
>  > +       __u32 pid;
> 
>  > +};
> 
>  >  #endif /* _UAPI__LINUX_BPF_H__ */
> 
>  > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> 
>  > index 8191a7db2777..3159f2a0188c 100644
> 
>  > --- a/kernel/bpf/core.c
> 
>  > +++ b/kernel/bpf/core.c
> 
>  > @@ -2038,6 +2038,7 @@ const struct bpf_func_proto 
> bpf_get_current_uid_gid_proto __weak;
> 
>  >  const struct bpf_func_proto bpf_get_current_comm_proto __weak;
> 
>  >  const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
> 
>  >  const struct bpf_func_proto bpf_get_local_storage_proto __weak;
> 
>  > +const struct bpf_func_proto bpf_get_current_pidns_info __weak;
> 
>  >
> 
>  >  const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
> 
>  >  {
> 
>  > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> 
>  > index 5e28718928ca..41fbf1f28a48 100644
> 
>  > --- a/kernel/bpf/helpers.c
> 
>  > +++ b/kernel/bpf/helpers.c
> 
>  > @@ -11,6 +11,12 @@
> 
>  >  #include <linux/uidgid.h>
> 
>  >  #include <linux/filter.h>
> 
>  >  #include <linux/ctype.h>
> 
>  > +#include <linux/pid_namespace.h>
> 
>  > +#include <linux/major.h>
> 
>  > +#include <linux/stat.h>
> 
>  > +#include <linux/namei.h>
> 
>  > +#include <linux/version.h>
> 
>  > +
> 
>  >
> 
>  >  #include "../../lib/kstrtox.h"
> 
>  >
> 
>  > @@ -312,6 +318,64 @@ void copy_map_value_locked(struct bpf_map *map, 
> void *dst, void *src,
> 
>  >         preempt_enable();
> 
>  >  }
> 
>  >
> 
>  > +BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, 
> pidns_info, u32,
> 
>  > +        size)
> 
>  > +{
> 
>  > +       const char *pidns_path = "/proc/self/ns/pid";
> 
>  > +       struct pid_namespace *pidns = NULL;
> 
>  > +       struct filename *tmp = NULL;
> 
>  > +       struct inode *inode;
> 
>  > +       struct path kp;
> 
>  > +       pid_t tgid = 0;
> 
>  > +       pid_t pid = 0;
> 
>  > +       int ret;
> 
>  > +       int len;
> 
>  > +
> 
>  > +       if (unlikely(size != sizeof(struct bpf_pidns_info)))
> 
>  > +               return -EINVAL;
> 
>  > +       pidns = task_active_pid_ns(current);
> 
>  > +       if (unlikely(!pidns))
> 
>  > +               goto clear;
> 
>  > +       pidns_info->nsid =  pidns->ns.inum;
> 
>  > +       pid = task_pid_nr_ns(current, pidns);
> 
>  > +       if (unlikely(!pid))
> 
>  > +               goto clear;
> 
>  > +       tgid = task_tgid_nr_ns(current, pidns);
> 
>  > +       if (unlikely(!tgid))
> 
>  > +               goto clear;
> 
>  > +       pidns_info->tgid = (u32) tgid;
> 
>  > +       pidns_info->pid = (u32) pid;
> 
>  > +       tmp = kmem_cache_alloc(names_cachep, GFP_ATOMIC);
> 
>  > +       if (unlikely(!tmp)) {
> 
>  > +               memset((void *)pidns_info, 0, (size_t) size);
> 
>  > +               return -ENOMEM;
> 
>  > +       }
> 
>  > +       len = strlen(pidns_path) + 1;
> 
>  > +       memcpy((char *)tmp->name, pidns_path, len);
> 
>  > +       tmp->uptr = NULL;
> 
>  > +       tmp->aname = NULL;
> 
>  > +       tmp->refcnt = 1;
> 
>  > +       ret = filename_lookup(AT_FDCWD, tmp, 0, &kp, NULL);
> 
>  > +       if (ret) {
> 
>  > +               memset((void *)pidns_info, 0, (size_t) size);
> 
>  > +               return ret;
> 
>  > +       }
> 
>  > +       inode = d_backing_inode(kp.dentry);
> 
>  > +       pidns_info->dev = inode->i_sb->s_dev;
> 
>  > +       return 0;
> 
>  > +clear:
> 
>  > +       memset((void *)pidns_info, 0, (size_t) size);
> 
>  > +       return -EINVAL;
> 
>  > +}
> 
>  > +
> 
>  > +const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
> 
>  > +       .func           = bpf_get_current_pidns_info,
> 
>  > +       .gpl_only       = false,
> 
>  > +       .ret_type       = RET_INTEGER,
> 
>  > +       .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
> 
>  > +       .arg2_type      = ARG_CONST_SIZE,
> 
>  > +};
> 
>  > +
> 
>  >  #ifdef CONFIG_CGROUPS
> 
>  >  BPF_CALL_0(bpf_get_current_cgroup_id)
> 
>  >  {
> 
>  > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> 
>  > index ca1255d14576..5e1dc22765a5 100644
> 
>  > --- a/kernel/trace/bpf_trace.c
> 
>  > +++ b/kernel/trace/bpf_trace.c
> 
>  > @@ -709,6 +709,8 @@ tracing_func_proto(enum bpf_func_id func_id, 
> const struct bpf_prog *prog)
> 
>  >  #endif
> 
>  >         case BPF_FUNC_send_signal:
> 
>  >                 return &bpf_send_signal_proto;
> 
>  > +       case BPF_FUNC_get_current_pidns_info:
> 
>  > +               return &bpf_get_current_pidns_info_proto;
> 
>  >         default:
> 
>  >                 return NULL;
> 
>  >         }
> 
>  > diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> 
>  > index 1d9be26b4edd..238453ff27d2 100644
> 
>  > --- a/samples/bpf/Makefile
> 
>  > +++ b/samples/bpf/Makefile
> 
>  > @@ -53,6 +53,7 @@ hostprogs-y += task_fd_query
> 
>  >  hostprogs-y += xdp_sample_pkts
> 
>  >  hostprogs-y += ibumad
> 
>  >  hostprogs-y += hbm
> 
>  > +hostprogs-y += trace_ns_info
> 
>  >
> 
>  >  # Libbpf dependencies
> 
>  >  LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
> 
>  > @@ -109,6 +110,7 @@ task_fd_query-objs := bpf_load.o 
> task_fd_query_user.o $(TRACE_HELPERS)
> 
>  >  xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
> 
>  >  ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
> 
>  >  hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
> 
>  > +trace_ns_info-objs := bpf_load.o trace_ns_info_user.o
> 
>  >
> 
>  >  # Tell kbuild to always build the programs
> 
>  >  always := $(hostprogs-y)
> 
>  > @@ -170,6 +172,7 @@ always += xdp_sample_pkts_kern.o
> 
>  >  always += ibumad_kern.o
> 
>  >  always += hbm_out_kern.o
> 
>  >  always += hbm_edt_kern.o
> 
>  > +always += trace_ns_info_user_kern.o
> 
>  >
> 
>  >  KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
> 
>  >  KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/bpf/
> 
>  > diff --git a/samples/bpf/trace_ns_info_user.c 
> b/samples/bpf/trace_ns_info_user.c
> 
>  > new file mode 100644
> 
>  > index 000000000000..e06d08db6f30
> 
>  > --- /dev/null
> 
>  > +++ b/samples/bpf/trace_ns_info_user.c
> 
>  > @@ -0,0 +1,35 @@
> 
>  > +// SPDX-License-Identifier: GPL-2.0
> 
>  > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> 
>  > + *
> 
>  > + * This program is free software; you can redistribute it and/or
> 
>  > + * modify it under the terms of version 2 of the GNU General Public
> 
>  > + * License as published by the Free Software Foundation.
> 
>  > + */
> 
>  > +
> 
>  > +#include <stdio.h>
> 
>  > +#include <linux/bpf.h>
> 
>  > +#include <unistd.h>
> 
>  > +#include "bpf/libbpf.h"
> 
>  > +#include "bpf_load.h"
> 
>  > +
> 
>  > +/* This code was taken verbatim from tracex1_user.c, it's used
> 
>  > + * to exercise the bpf_get_current_pidns_info() helper call.
> 
>  > + */
> 
>  > +int main(int ac, char **argv)
> 
>  > +{
> 
>  > +       FILE *f;
> 
>  > +       char filename[256];
> 
>  > +
> 
>  > +       snprintf(filename, sizeof(filename), "%s_user_kern.o", argv[0]);
> 
>  > +       printf("loading %s\n", filename);
> 
>  > +
> 
>  > +       if (load_bpf_file(filename)) {
> 
>  > +               printf("%s", bpf_log_buf);
> 
>  > +               return 1;
> 
>  > +       }
> 
>  > +
> 
>  > +       f = popen("taskset 1 ping  localhost", "r");
> 
>  > +       (void) f;
> 
>  > +       read_trace_pipe();
> 
>  > +       return 0;
> 
>  > +}
> 
>  > diff --git a/samples/bpf/trace_ns_info_user_kern.c 
> b/samples/bpf/trace_ns_info_user_kern.c
> 
>  > new file mode 100644
> 
>  > index 000000000000..96675e02b707
> 
>  > --- /dev/null
> 
>  > +++ b/samples/bpf/trace_ns_info_user_kern.c
> 
>  > @@ -0,0 +1,44 @@
> 
>  > +// SPDX-License-Identifier: GPL-2.0
> 
>  > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> 
>  > + *
> 
>  > + * This program is free software; you can redistribute it and/or
> 
>  > + * modify it under the terms of version 2 of the GNU General Public
> 
>  > + * License as published by the Free Software Foundation.
> 
>  > + */
> 
>  > +#include <linux/skbuff.h>
> 
>  > +#include <linux/netdevice.h>
> 
>  > +#include <linux/version.h>
> 
>  > +#include <uapi/linux/bpf.h>
> 
>  > +#include "bpf_helpers.h"
> 
>  > +
> 
>  > +typedef __u64 u64;
> 
>  > +typedef __u32 u32;
> 
>  > +
> 
>  > +
> 
>  > +/* kprobe is NOT a stable ABI
> 
>  > + * kernel functions can be removed, renamed or completely change 
> semantics.
> 
>  > + * Number of arguments and their positions can change, etc.
> 
>  > + * In such case this bpf+kprobe example will no longer be meaningful
> 
>  > + */
> 
>  > +
> 
>  > +/* This will call bpf_get_current_pidns_info() to display pid and ns 
> values
> 
>  > + * as seen by the current namespace, on the far left you will see 
> the pid as
> 
>  > + * seen by the root namespace.
> 
>  > + */
> 
>  > +
> 
>  > +SEC("kprobe/__netif_receive_skb_core")
> 
>  > +int bpf_prog1(struct pt_regs *ctx)
> 
>  > +{
> 
>  > +       char fmt[] = "nsid:%u, dev: %u,  pid:%u\n";
> 
>  > +       struct bpf_pidns_info nsinfo;
> 
>  > +       int ok = 0;
> 
>  > +
> 
>  > +       ok = bpf_get_current_pidns_info(&nsinfo, sizeof(nsinfo));
> 
>  > +       if (ok == 0)
> 
>  > +               bpf_trace_printk(fmt, sizeof(fmt), (u32)nsinfo.nsid,
> 
>  > +                                (u32) nsinfo.dev, (u32)nsinfo.pid);
> 
>  > +
> 
>  > +       return 0;
> 
>  > +}
> 
>  > +char _license[] SEC("license") = "GPL";
> 
>  > +u32 _version SEC("version") = LINUX_VERSION_CODE;
> 
>  > diff --git a/tools/include/uapi/linux/bpf.h 
> b/tools/include/uapi/linux/bpf.h
> 
>  > index 4393bd4b2419..b0d4869fb860 100644
> 
>  > --- a/tools/include/uapi/linux/bpf.h
> 
>  > +++ b/tools/include/uapi/linux/bpf.h
> 
>  > @@ -2741,6 +2741,24 @@ union bpf_attr {
> 
>  >   *             **-EOPNOTSUPP** kernel configuration does not enable 
> SYN cookies
> 
>  >   *
> 
>  >   *             **-EPROTONOSUPPORT** IP packet version is not 4 or 6
> 
>  > + *
> 
>  > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 
> size_of_pidns)
> 
>  > + *     Description
> 
>  > + *             Copies into *pidns* pid, namespace id and tgid as 
> seen by the
> 
>  > + *             current namespace and also device from /proc/self/ns/pid.
> 
>  > + *             *size_of_pidns* must be the size of *pidns*
> 
>  > + *
> 
>  > + *             This helper is used when pid filtering is needed inside a
> 
>  > + *             container, as the bpf_get_current_pid_tgid() helper 
> always returns the
> 
>  > + *             pid as seen by the root namespace.
> 
>  > + *     Return
> 
>  > + *             0 on success
> 
>  > + *
> 
>  > + *             **-EINVAL** if *size_of_pidns* is not valid or unable 
> to get ns, pid
> 
>  > + *             or tgid of the current task.
> 
>  > + *
> 
>  > + *             **-ENOMEM**  if allocation fails.
> 
>  > + *
> 
>  >   */
> 
>  >  #define __BPF_FUNC_MAPPER(FN)          \
> 
>  >         FN(unspec),                     \
> 
>  > @@ -2853,7 +2871,8 @@ union bpf_attr {
> 
>  >         FN(sk_storage_get),             \
> 
>  >         FN(sk_storage_delete),          \
> 
>  >         FN(send_signal),                \
> 
>  > -       FN(tcp_gen_syncookie),
> 
>  > +       FN(tcp_gen_syncookie),          \
> 
>  > +       FN(get_current_pidns_info),
> 
>  >
> 
>  >  /* integer value in 'imm' field of BPF_CALL instruction selects 
> which helper
> 
>  >   * function eBPF program intends to call
> 
>  > @@ -3604,4 +3623,10 @@ struct bpf_sockopt {
> 
>  >         __s32   retval;
> 
>  >  };
> 
>  >
> 
>  > +struct bpf_pidns_info {
> 
>  > +       __u32 dev;
> 
>  > +       __u32 nsid;
> 
>  > +       __u32 tgid;
> 
>  > +       __u32 pid;
> 
>  > +};
> 
>  >  #endif /* _UAPI__LINUX_BPF_H__ */
> 
>  > diff --git a/tools/testing/selftests/bpf/Makefile 
> b/tools/testing/selftests/bpf/Makefile
> 
>  > index 3bd0f4a0336a..1f97b571b581 100644
> 
>  > --- a/tools/testing/selftests/bpf/Makefile
> 
>  > +++ b/tools/testing/selftests/bpf/Makefile
> 
>  > @@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps 
> test_lru_map test_lpm_map test
> 
>  >         test_cgroup_storage test_select_reuseport test_section_names \
> 
>  >         test_netcnt test_tcpnotify_user test_sock_fields test_sysctl 
> test_hashmap \
> 
>  >         test_btf_dump test_cgroup_attach xdping test_sockopt 
> test_sockopt_sk \
> 
>  > -       test_sockopt_multi test_tcp_rtt
> 
>  > +       test_sockopt_multi test_tcp_rtt test_pidns
> 
>  >
> 
>  >  BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
> 
>  >  TEST_GEN_FILES = $(BPF_OBJ_FILES)
> 
>  > diff --git a/tools/testing/selftests/bpf/bpf_helpers.h 
> b/tools/testing/selftests/bpf/bpf_helpers.h
> 
>  > index 120aa86c58d3..c96795a9d983 100644
> 
>  > --- a/tools/testing/selftests/bpf/bpf_helpers.h
> 
>  > +++ b/tools/testing/selftests/bpf/bpf_helpers.h
> 
>  > @@ -231,6 +231,9 @@ static int (*bpf_send_signal)(unsigned sig) = 
> (void *)BPF_FUNC_send_signal;
> 
>  >  static long long (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *ip,
> 
>  >                                           int ip_len, void *tcp, int 
> tcp_len) =
> 
>  >         (void *) BPF_FUNC_tcp_gen_syncookie;
> 
>  > +static int (*bpf_get_current_pidns_info)(struct bpf_pidns_info *buf,
> 
>  > +                                        unsigned int buf_size) =
> 
>  > +       (void *) BPF_FUNC_get_current_pidns_info;
> 
>  >
> 
>  >  /* llvm builtin functions that eBPF C program may use to
> 
>  >   * emit BPF_LD_ABS and BPF_LD_IND instructions
> 
>  > diff --git a/tools/testing/selftests/bpf/progs/test_pidns_kern.c 
> b/tools/testing/selftests/bpf/progs/test_pidns_kern.c
> 
>  > new file mode 100644
> 
>  > index 000000000000..e1d2facfa762
> 
>  > --- /dev/null
> 
>  > +++ b/tools/testing/selftests/bpf/progs/test_pidns_kern.c
> 
>  > @@ -0,0 +1,51 @@
> 
>  > +// SPDX-License-Identifier: GPL-2.0
> 
>  > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> 
>  > + *
> 
>  > + * This program is free software; you can redistribute it and/or
> 
>  > + * modify it under the terms of version 2 of the GNU General Public
> 
>  > + * License as published by the Free Software Foundation.
> 
>  > + */
> 
>  > +
> 
>  > +#include <linux/bpf.h>
> 
>  > +#include <errno.h>
> 
>  > +#include "bpf_helpers.h"
> 
>  > +
> 
>  > +struct bpf_map_def SEC("maps") nsidmap = {
> 
>  > +       .type = BPF_MAP_TYPE_ARRAY,
> 
>  > +       .key_size = sizeof(__u32),
> 
>  > +       .value_size = sizeof(__u32),
> 
>  > +       .max_entries = 1,
> 
>  > +};
> 
>  > +
> 
>  > +struct bpf_map_def SEC("maps") pidmap = {
> 
>  > +       .type = BPF_MAP_TYPE_ARRAY,
> 
>  > +       .key_size = sizeof(__u32),
> 
>  > +       .value_size = sizeof(__u32),
> 
>  > +       .max_entries = 1,
> 
>  > +};
> 
>  > +
> 
>  > +SEC("tracepoint/syscalls/sys_enter_nanosleep")
> 
>  > +int trace(void *ctx)
> 
>  > +{
> 
>  > +       struct bpf_pidns_info nsinfo;
> 
>  > +       __u32 key = 0, *expected_pid, *val;
> 
>  > +       char fmt[] = "ERROR nspid:%d\n";
> 
>  > +
> 
>  > +       if (bpf_get_current_pidns_info(&nsinfo, sizeof(nsinfo)))
> 
>  > +               return -EINVAL;
> 
>  > +
> 
>  > +       expected_pid = bpf_map_lookup_elem(&pidmap, &key);
> 
>  > +
> 
>  > +
> 
>  > +       if (!expected_pid || *expected_pid != nsinfo.pid)
> 
>  > +               return 0;
> 
>  > +
> 
>  > +       val = bpf_map_lookup_elem(&nsidmap, &key);
> 
>  > +       if (val)
> 
>  > +               *val = nsinfo.nsid;
> 
>  > +
> 
>  > +       return 0;
> 
>  > +}
> 
>  > +
> 
>  > +char _license[] SEC("license") = "GPL";
> 
>  > +__u32 _version SEC("version") = 1;
> 
>  > diff --git a/tools/testing/selftests/bpf/test_pidns.c 
> b/tools/testing/selftests/bpf/test_pidns.c
> 
>  > new file mode 100644
> 
>  > index 000000000000..a7254055f294
> 
>  > --- /dev/null
> 
>  > +++ b/tools/testing/selftests/bpf/test_pidns.c
> 
>  > @@ -0,0 +1,138 @@
> 
>  > +// SPDX-License-Identifier: GPL-2.0
> 
>  > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> 
>  > + *
> 
>  > + * This program is free software; you can redistribute it and/or
> 
>  > + * modify it under the terms of version 2 of the GNU General Public
> 
>  > + * License as published by the Free Software Foundation.
> 
>  > + */
> 
>  > +
> 
>  > +#include <stdio.h>
> 
>  > +#include <stdlib.h>
> 
>  > +#include <string.h>
> 
>  > +#include <errno.h>
> 
>  > +#include <fcntl.h>
> 
>  > +#include <syscall.h>
> 
>  > +#include <unistd.h>
> 
>  > +#include <linux/perf_event.h>
> 
>  > +#include <sys/ioctl.h>
> 
>  > +#include <sys/time.h>
> 
>  > +#include <sys/types.h>
> 
>  > +#include <sys/stat.h>
> 
>  > +
> 
>  > +#include <linux/bpf.h>
> 
>  > +#include <bpf/bpf.h>
> 
>  > +#include <bpf/libbpf.h>
> 
>  > +
> 
>  > +#include "cgroup_helpers.h"
> 
>  > +#include "bpf_rlimit.h"
> 
>  > +
> 
>  > +#define CHECK(condition, tag, format...) ({            \
> 
>  > +       int __ret = !!(condition);                      \
> 
>  > +       if (__ret) {                                    \
> 
>  > +               printf("%s:FAIL:%s ", __func__, tag);   \
> 
>  > +               printf(format);                         \
> 
>  > +       } else {                                        \
> 
>  > +               printf("%s:PASS:%s\n", __func__, tag);  \
> 
>  > +       }                                               \
> 
>  > +       __ret;                                          \
> 
>  > +})
> 
>  > +
> 
>  > +static int bpf_find_map(const char *test, struct bpf_object *obj,
> 
>  > +                       const char *name)
> 
>  > +{
> 
>  > +       struct bpf_map *map;
> 
>  > +
> 
>  > +       map = bpf_object__find_map_by_name(obj, name);
> 
>  > +       if (!map)
> 
>  > +               return -1;
> 
>  > +       return bpf_map__fd(map);
> 
>  > +}
> 
>  > +
> 
>  > +
> 
>  > +int main(int argc, char **argv)
> 
>  > +{
> 
>  > +       const char *probe_name = "syscalls/sys_enter_nanosleep";
> 
>  > +       const char *file = "test_pidns_kern.o";
> 
>  > +       int err, bytes, efd, prog_fd, pmu_fd;
> 
>  > +       int pidmap_fd, nsidmap_fd;
> 
>  > +       struct perf_event_attr attr = {};
> 
>  > +       struct bpf_object *obj;
> 
>  > +       __u32 knsid = 0;
> 
>  > +       __u32 key = 0, pid;
> 
>  > +       int exit_code = 1;
> 
>  > +       struct stat st;
> 
>  > +       char buf[256];
> 
>  > +
> 
>  > +       err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, 
> &prog_fd);
> 
>  > +       if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
> 
>  > +               goto cleanup_cgroup_env;
> 
>  > +
> 
>  > +       nsidmap_fd = bpf_find_map(__func__, obj, "nsidmap");
> 
>  > +       if (CHECK(nsidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
> 
>  > +                 nsidmap_fd, errno))
> 
>  > +               goto close_prog;
> 
>  > +
> 
>  > +       pidmap_fd = bpf_find_map(__func__, obj, "pidmap");
> 
>  > +       if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
> 
>  > +                 pidmap_fd, errno))
> 
>  > +               goto close_prog;
> 
>  > +
> 
>  > +       pid = getpid();
> 
>  > +       bpf_map_update_elem(pidmap_fd, &key, &pid, 0);
> 
>  > +
> 
>  > +       snprintf(buf, sizeof(buf),
> 
>  > +                "/sys/kernel/debug/tracing/events/%s/id", probe_name);
> 
>  > +       efd = open(buf, O_RDONLY, 0);
> 
>  > +       if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
> 
>  > +               goto close_prog;
> 
>  > +       bytes = read(efd, buf, sizeof(buf));
> 
>  > +       close(efd);
> 
>  > +       if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
> 
>  > +                 "bytes %d errno %d\n", bytes, errno))
> 
>  > +               goto close_prog;
> 
>  > +
> 
>  > +       attr.config = strtol(buf, NULL, 0);
> 
>  > +       attr.type = PERF_TYPE_TRACEPOINT;
> 
>  > +       attr.sample_type = PERF_SAMPLE_RAW;
> 
>  > +       attr.sample_period = 1;
> 
>  > +       attr.wakeup_events = 1;
> 
>  > +
> 
>  > +       pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, 
> -1, 0);
> 
>  > +       if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", 
> pmu_fd,
> 
>  > +                 errno))
> 
>  > +               goto close_prog;
> 
>  > +
> 
>  > +       err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
> 
>  > +       if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
> 
>  > +                 errno))
> 
>  > +               goto close_pmu;
> 
>  > +
> 
>  > +       err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
> 
>  > +       if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", 
> err,
> 
>  > +                 errno))
> 
>  > +               goto close_pmu;
> 
>  > +
> 
>  > +       /* trigger some syscalls */
> 
>  > +       sleep(1);
> 
>  > +
> 
>  > +       err = bpf_map_lookup_elem(nsidmap_fd, &key, &knsid);
> 
>  > +       if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", 
> err, errno))
> 
>  > +               goto close_pmu;
> 
>  > +
> 
>  > +       if (stat("/proc/self/ns/pid", &st))
> 
>  > +               goto close_pmu;
> 
>  > +
> 
>  > +       if (CHECK(knsid != (__u32) st.st_ino, "compare_namespace_id",
> 
>  > +                 "kern knsid %u user unsid %u\n", knsid, (__u32) 
> st.st_ino))
> 
>  > +               goto close_pmu;
> 
>  > +
> 
>  > +       exit_code = 0;
> 
>  > +       printf("%s:PASS\n", argv[0]);
> 
>  > +
> 
>  > +close_pmu:
> 
>  > +       close(pmu_fd);
> 
>  > +close_prog:
> 
>  > +       bpf_object__close(obj);
> 
>  > +cleanup_cgroup_env:
> 
>  > +       return exit_code;
> 
>  > +}
> 
>  > --
> 
>  > 2.11.0
> 
>  >
> 
>  >
> 
>  >
> 
>  >
> 
>  >
> 
>  >
> 
>  > On Thu, Aug 08, 2019 at 05:09:51AM +0000, Yonghong Song wrote:
> 
>  > >
> 
>  > >
> 
>  > > On 8/7/19 6:22 PM, Carlos Antonio Neira Bustos wrote:
> 
>  > > > The code has been modified to avoid syscalls that could sleep.
> 
>  > > > Please let me know if any other modification is needed.
> 
>  > > >
> 
>  > > >  From be0384c0fa209a78c1567936e8db4e35b9a7c0f8 Mon Sep 17 
> 00:00:00 2001
> 
>  > > > From: Carlos <cneirabustos@gmail.com>
> 
>  > > > Date: Wed, 7 Aug 2019 20:04:30 -0400
> 
>  > > > Subject: [PATCH] [PATCH v5 bpf-next] BPF: New helper to obtain 
> namespace data
> 
>  > > >   from current task
> 
>  > > >
> 
>  > > > This helper obtains the active namespace from current and returns 
> pid, tgid,
> 
>  > > > device and namespace id as seen from that namespace, allowing to 
> instrument
> 
>  > > > a process inside a container.
> 
>  > > > Device is read from /proc/self/ns/pid, as in the future it's 
> possible that
> 
>  > > > different pid_ns files may belong to different devices, according
> 
>  > > > to the discussion between Eric Biederman and Yonghong in 2017 
> linux plumbers
> 
>  > > > conference.
> 
>  > > > Currently bpf_get_current_pid_tgid(), is used to do pid filtering 
> in bcc's
> 
>  > > > scripts but this helper returns the pid as seen by the root 
> namespace which is
> 
>  > > > fine when a bcc script is not executed inside a container.
> 
>  > > > When the process of interest is inside a container, pid filtering 
> will not work
> 
>  > > > if bpf_get_current_pid_tgid() is used. This helper addresses this 
> limitation
> 
>  > > > returning the pid as it's seen by the current namespace where the 
> script is
> 
>  > > > executing.
> 
>  > > >
> 
>  > > > This helper has the same use cases as bpf_get_current_pid_tgid() 
> as it can be
> 
>  > > > used to do pid filtering even inside a container.
> 
>  > > >
> 
>  > > > For example a bcc script using bpf_get_current_pid_tgid() 
> (tools/funccount.py):
> 
>  > > >
> 
>  > > >          u32 pid = bpf_get_current_pid_tgid() >> 32;
> 
>  > > >          if (pid != <pid_arg_passed_in>)
> 
>  > > >                  return 0;
> 
>  > > > Could be modified to use bpf_get_current_pidns_info() as follows:
> 
>  > > >
> 
>  > > >          struct bpf_pidns pidns;
> 
>  > > >          bpf_get_current_pidns_info(&pidns, sizeof(struct 
> bpf_pidns));
> 
>  > > >          u32 pid = pidns.tgid;
> 
>  > > >          u32 nsid = pidns.nsid;
> 
>  > > >          if ((pid != <pid_arg_passed_in>) && (nsid != 
> <nsid_arg_passed_in>))
> 
>  > > >                  return 0;
> 
>  > > >
> 
>  > > > To find out the PID namespace id of a process, you could use 
> this command:
> 
>  > > >
> 
>  > > > $ ps -h -o pidns -p <pid_of_interest>
> 
>  > > >
> 
>  > > > Or this other command:
> 
>  > > >
> 
>  > > > $ ls -Li /proc/<pid_of_interest>/ns/pid
> 
>  > > >
> 
>  > > > Signed-off-by: Carlos Neira <cneirabustos@gmail.com>
> 
>  > > > ---
> 
>  > > >   fs/namei.c                                         |   2 +-
> 
>  > > >   include/linux/bpf.h                                |   1 +
> 
>  > > >   include/linux/namei.h                              |   4 +
> 
>  > > >   include/uapi/linux/bpf.h                           |  29 ++++-
> 
>  > > >   kernel/bpf/core.c                                  |   1 +
> 
>  > > >   kernel/bpf/helpers.c                               |  78 
> ++++++++++++
> 
>  > > >   kernel/trace/bpf_trace.c                           |   2 +
> 
>  > > >   samples/bpf/Makefile                               |   3 +
> 
>  > > >   samples/bpf/trace_ns_info_user.c                   |  35 ++++++
> 
>  > > >   samples/bpf/trace_ns_info_user_kern.c              |  44 +++++++
> 
>  > > >   tools/include/uapi/linux/bpf.h                     |  29 ++++-
> 
>  > > >   tools/testing/selftests/bpf/Makefile               |   2 +-
> 
>  > > >   tools/testing/selftests/bpf/bpf_helpers.h          |   3 +
> 
>  > > >   .../testing/selftests/bpf/progs/test_pidns_kern.c  |  51 ++++++++
> 
>  > > >   tools/testing/selftests/bpf/test_pidns.c           | 138 
> +++++++++++++++++++++
> 
>  > > >   15 files changed, 418 insertions(+), 4 deletions(-)
> 
>  > > >   create mode 100644 samples/bpf/trace_ns_info_user.c
> 
>  > > >   create mode 100644 samples/bpf/trace_ns_info_user_kern.c
> 
>  > > >   create mode 100644 
> tools/testing/selftests/bpf/progs/test_pidns_kern.c
> 
>  > > >   create mode 100644 tools/testing/selftests/bpf/test_pidns.c
> 
>  > > >
> 
>  > > > diff --git a/fs/namei.c b/fs/namei.c
> 
>  > > > index 209c51a5226c..d1eca36972d2 100644
> 
>  > > > --- a/fs/namei.c
> 
>  > > > +++ b/fs/namei.c
> 
>  > > > @@ -19,7 +19,6 @@
> 
>  > > >   #include <linux/export.h>
> 
>  > > >   #include <linux/kernel.h>
> 
>  > > >   #include <linux/slab.h>
> 
>  > > > -#include <linux/fs.h>
> 
>  > > >   #include <linux/namei.h>
> 
>  > > >   #include <linux/pagemap.h>
> 
>  > > >   #include <linux/fsnotify.h>
> 
>  > > > @@ -2355,6 +2354,7 @@ int filename_lookup(int dfd, struct 
> filename *name, unsigned flags,
> 
>  > > >     putname(name);
> 
>  > > >     return retval;
> 
>  > > >   }
> 
>  > > > +EXPORT_SYMBOL(filename_lookup);
> 
>  > >
> 
>  > > No need to export symbols. bpf uses it and bpf is in the core, not in
> 
>  > > modules.
> 
>  > >
> 
>  > > >
> 
>  > > >   /* Returns 0 and nd will be valid on success; Retuns error, 
> otherwise. */
> 
>  > > >   static int path_parentat(struct nameidata *nd, unsigned flags,
> 
>  > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> 
>  > > > index f9a506147c8a..e4adf5e05afd 100644
> 
>  > > > --- a/include/linux/bpf.h
> 
>  > > > +++ b/include/linux/bpf.h
> 
>  > > > @@ -1050,6 +1050,7 @@ extern const struct bpf_func_proto 
> bpf_get_local_storage_proto;
> 
>  > > >   extern const struct bpf_func_proto bpf_strtol_proto;
> 
>  > > >   extern const struct bpf_func_proto bpf_strtoul_proto;
> 
>  > > >   extern const struct bpf_func_proto bpf_tcp_sock_proto;
> 
>  > > > +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;
> 
>  > > >
> 
>  > > >   /* Shared helpers among cBPF and eBPF. */
> 
>  > > >   void bpf_user_rnd_init_once(void);
> 
>  > > > diff --git a/include/linux/namei.h b/include/linux/namei.h
> 
>  > > > index 9138b4471dbf..2c24e8c71d46 100644
> 
>  > > > --- a/include/linux/namei.h
> 
>  > > > +++ b/include/linux/namei.h
> 
>  > > > @@ -6,6 +6,7 @@
> 
>  > > >   #include <linux/path.h>
> 
>  > > >   #include <linux/fcntl.h>
> 
>  > > >   #include <linux/errno.h>
> 
>  > > > +#include <linux/fs.h>
> 
>  > > >
> 
>  > > >   enum { MAX_NESTED_LINKS = 8 };
> 
>  > > >
> 
>  > > > @@ -97,6 +98,9 @@ extern void unlock_rename(struct dentry *, 
> struct dentry *);
> 
>  > > >
> 
>  > > >   extern void nd_jump_link(struct path *path);
> 
>  > > >
> 
>  > > > +extern int filename_lookup(int dfd, struct filename *name, 
> unsigned int flags,
> 
>  > > > +               struct path *path, struct path *root);
> 
>  > >
> 
>  > > The previous definition in fs/internal.h should be removed.
> 
>  > >
> 
>  > > > +
> 
>  > > >   static inline void nd_terminate_link(void *name, size_t len, 
> size_t maxlen)
> 
>  > > >   {
> 
>  > > >     ((char *) name)[min(len, maxlen)] = '\0';
> 
>  > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> 
>  > > > index 4393bd4b2419..6f601f7106e2 100644
> 
>  > > > --- a/include/uapi/linux/bpf.h
> 
>  > > > +++ b/include/uapi/linux/bpf.h
> 
>  > > > @@ -2741,6 +2741,26 @@ union bpf_attr {
> 
>  > > >    *                **-EOPNOTSUPP** kernel configuration does not 
> enable SYN cookies
> 
>  > > >    *
> 
>  > > >    *                **-EPROTONOSUPPORT** IP packet version is not 
> 4 or 6
> 
>  > > > + *
> 
>  > > > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, 
> u32 size_of_pidns)
> 
>  > > > + * Description
> 
>  > > > + *         Copies into *pidns* pid, namespace id and tgid as 
> seen by the
> 
>  > > > + *         current namespace and also device from /proc/self/ns/pid.
> 
>  > > > + *         *size_of_pidns* must be the size of *pidns*
> 
>  > > > + *
> 
>  > > > + *         This helper is used when pid filtering is needed inside a
> 
>  > > > + *         container, as the bpf_get_current_pid_tgid() helper 
> always returns the
> 
>  > > > + *         pid as seen by the root namespace.
> 
>  > > > + * Return
> 
>  > > > + *         0 on success
> 
>  > > > + *
> 
>  > > > + *         **-EINVAL**  if unable to get ns, pid or tgid of 
> current task.
> 
>  > > > + *         Or if size_of_pidns is not valid.
> 
>  > >
> 
>  > > Maybe reword by following the code sequence.
> 
>  > >     if *size_of_pidns* is not valid or unable to get ns, pid or tgid of
> 
>  > >     the current task.
> 
>  > >
> 
>  > > > + *
> 
>  > > > + *         **-ENOMEM**  if allocation fails.
> 
>  > >
> 
>  > > Maybe some other error codes in filename_lookup() function?
> 
>  > >
> 
>  > > > + *
> 
>  > > > + *         If unable to get the inode from /proc/self/ns/pid an 
> error code
> 
>  > > > + *         will be returned.
> 
>  > >
> 
>  > > You do not need this. The description of error code cases should 
> cover this.
> 
>  > >
> 
>  > > >    */
> 
>  > > >   #define __BPF_FUNC_MAPPER(FN)             \
> 
>  > > >     FN(unspec),                     \
> 
>  > > > @@ -2853,7 +2873,8 @@ union bpf_attr {
> 
>  > > >     FN(sk_storage_get),             \
> 
>  > > >     FN(sk_storage_delete),          \
> 
>  > > >     FN(send_signal),                \
> 
>  > > > -   FN(tcp_gen_syncookie),
> 
>  > > > +   FN(tcp_gen_syncookie),          \
> 
>  > > > +   FN(get_current_pidns_info),
> 
>  > > >
> 
>  > > >   /* integer value in 'imm' field of BPF_CALL instruction selects 
> which helper
> 
>  > > >    * function eBPF program intends to call
> 
>  > > > @@ -3604,4 +3625,10 @@ struct bpf_sockopt {
> 
>  > > >     __s32   retval;
> 
>  > > >   };
> 
>  > > >
> 
>  > > > +struct bpf_pidns_info {
> 
>  > > > +   __u32 dev;
> 
>  > > > +   __u32 nsid;
> 
>  > > > +   __u32 tgid;
> 
>  > > > +   __u32 pid;
> 
>  > > > +};
> 
>  > > >   #endif /* _UAPI__LINUX_BPF_H__ */
> 
>  > > > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> 
>  > > > index 8191a7db2777..3159f2a0188c 100644
> 
>  > > > --- a/kernel/bpf/core.c
> 
>  > > > +++ b/kernel/bpf/core.c
> 
>  > > > @@ -2038,6 +2038,7 @@ const struct bpf_func_proto 
> bpf_get_current_uid_gid_proto __weak;
> 
>  > > >   const struct bpf_func_proto bpf_get_current_comm_proto __weak;
> 
>  > > >   const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
> 
>  > > >   const struct bpf_func_proto bpf_get_local_storage_proto __weak;
> 
>  > > > +const struct bpf_func_proto bpf_get_current_pidns_info __weak;
> 
>  > > >
> 
>  > > >   const struct bpf_func_proto * __weak 
> bpf_get_trace_printk_proto(void)
> 
>  > > >   {
> 
>  > > > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> 
>  > > > index 5e28718928ca..571f24077db2 100644
> 
>  > > > --- a/kernel/bpf/helpers.c
> 
>  > > > +++ b/kernel/bpf/helpers.c
> 
>  > > > @@ -11,6 +11,12 @@
> 
>  > > >   #include <linux/uidgid.h>
> 
>  > > >   #include <linux/filter.h>
> 
>  > > >   #include <linux/ctype.h>
> 
>  > > > +#include <linux/pid_namespace.h>
> 
>  > > > +#include <linux/major.h>
> 
>  > > > +#include <linux/stat.h>
> 
>  > > > +#include <linux/namei.h>
> 
>  > > > +#include <linux/version.h>
> 
>  > > > +
> 
>  > > >
> 
>  > > >   #include "../../lib/kstrtox.h"
> 
>  > > >
> 
>  > > > @@ -312,6 +318,78 @@ void copy_map_value_locked(struct bpf_map 
> *map, void *dst, void *src,
> 
>  > > >     preempt_enable();
> 
>  > > >   }
> 
>  > > >
> 
>  > > > +BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, 
> pidns_info, u32,
> 
>  > > > +    size)
> 
>  > > > +{
> 
>  > > > +   const char *name = "/proc/self/ns/pid";
> 
>  > >
> 
>  > > maybe rename this variable to pidns_path?
> 
>  > >
> 
>  > > > +   struct pid_namespace *pidns = NULL;
> 
>  > > > +   struct filename *tmp = NULL;
> 
>  > >
> 
>  > > Maybe rename this variable to name?
> 
>  > >
> 
>  > > > +   int len = strlen(name) + 1;
> 
>  > >
> 
>  > > We can delay this assignment later until it is needed.
> 
>  > >
> 
>  > > > +   struct inode *inode;
> 
>  > > > +   struct path kp;
> 
>  > > > +   pid_t tgid = 0;
> 
>  > > > +   pid_t pid = 0;
> 
>  > > > +   int ret;
> 
>  > > > +
> 
>  > > > +   if (unlikely(size != sizeof(struct bpf_pidns_info)))
> 
>  > > > +           return -EINVAL;
> 
>  > > > +
> 
>  > > > +   pidns = task_active_pid_ns(current);
> 
>  > > > +
> 
>  > >
> 
>  > > we can save an empty line here.
> 
>  > >
> 
>  > > > +   if (unlikely(!pidns))
> 
>  > > > +           goto clear;
> 
>  > > > +
> 
>  > > > +   pidns_info->nsid =  pidns->ns.inum;
> 
>  > > > +   pid = task_pid_nr_ns(current, pidns);
> 
>  > > > +
> 
>  > >
> 
>  > > We can save an empty line here.
> 
>  > >
> 
>  > > > +   if (unlikely(!pid))
> 
>  > > > +           goto clear;
> 
>  > > > +
> 
>  > > > +   tgid = task_tgid_nr_ns(current, pidns);
> 
>  > > > +
> 
>  > > ditto. save an empty line.
> 
>  > > > +   if (unlikely(!tgid))
> 
>  > > > +           goto clear;
> 
>  > > > +
> 
>  > > > +   pidns_info->tgid = (u32) tgid;
> 
>  > > > +   pidns_info->pid = (u32) pid;
> 
>  > > > +
> 
>  > > > +   tmp = kmem_cache_alloc(names_cachep, GFP_ATOMIC);
> 
>  > > > +   if (unlikely(!tmp)) {
> 
>  > > > +           memset((void *)pidns_info, 0, (size_t) size);
> 
>  > > > +           return -ENOMEM;
> 
>  > > > +   }
> 
>  > > > +
> 
>  > > > +   memcpy((char *)tmp->name, name, len);
> 
>  > > > +   tmp->uptr = NULL;
> 
>  > > > +   tmp->aname = NULL;
> 
>  > > > +   tmp->refcnt = 1;
> 
>  > > > +
> 
>  > > ditto. save an empty line.
> 
>  > > > +   ret = filename_lookup(AT_FDCWD, tmp, 0, &kp, NULL);
> 
>  > > > +
> 
>  > > ditto. save an empty line.
> 
>  > > > +   if (ret) {
> 
>  > > > +           memset((void *)pidns_info, 0, (size_t) size);
> 
>  > > > +           return ret;
> 
>  > > > +   }
> 
>  > > > +
> 
>  > > > +   inode = d_backing_inode(kp.dentry);
> 
>  > > > +   pidns_info->dev = inode->i_sb->s_dev;
> 
>  > > > +
> 
>  > > > +   return 0;
> 
>  > > > +
> 
>  > > > +clear:
> 
>  > > > +   memset((void *)pidns_info, 0, (size_t) size);
> 
>  > > > +
> 
>  > > save an empty line.
> 
>  > > > +   return -EINVAL;
> 
>  > > > +}
> 
>  > > > +
> 
>  > > > +const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
> 
>  > > > +   .func   = bpf_get_current_pidns_info,
> 
>  > > make the "= " aligned with others?
> 
>  > > > +   .gpl_only       = false,
> 
>  > > > +   .ret_type       = RET_INTEGER,
> 
>  > > > +   .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
> 
>  > > > +   .arg2_type      = ARG_CONST_SIZE,
> 
>  > > > +};
> 
>  > > > +
> 
>  > > >   #ifdef CONFIG_CGROUPS
> 
>  > > >   BPF_CALL_0(bpf_get_current_cgroup_id)
> 
>  > > >   {
> 
>  > > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> 
>  > > > index ca1255d14576..5e1dc22765a5 100644
> 
>  > > > --- a/kernel/trace/bpf_trace.c
> 
>  > > > +++ b/kernel/trace/bpf_trace.c
> 
>  > > > @@ -709,6 +709,8 @@ tracing_func_proto(enum bpf_func_id func_id, 
> const struct bpf_prog *prog)
> 
>  > > >   #endif
> 
>  > > >     case BPF_FUNC_send_signal:
> 
>  > > >             return &bpf_send_signal_proto;
> 
>  > > > +   case BPF_FUNC_get_current_pidns_info:
> 
>  > > > +           return &bpf_get_current_pidns_info_proto;
> 
>  > > >     default:
> 
>  > > >             return NULL;
> 
>  > > >     }
> 
>  > > > diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> 
>  > > > index 1d9be26b4edd..238453ff27d2 100644
> 
>  > > > --- a/samples/bpf/Makefile
> 
>  > > > +++ b/samples/bpf/Makefile
> 
>  > > > @@ -53,6 +53,7 @@ hostprogs-y += task_fd_query
> 
>  > > >   hostprogs-y += xdp_sample_pkts
> 
>  > > >   hostprogs-y += ibumad
> 
>  > > >   hostprogs-y += hbm
> 
>  > > > +hostprogs-y += trace_ns_info
> 
>  > > [...]
> 

^ permalink raw reply

* Re: [PATCH net-next] taprio: remove unused variable 'entry_list_policy'
From: Vinicius Costa Gomes @ 2019-08-08 20:42 UTC (permalink / raw)
  To: David Miller, yuehaibing; +Cc: jhs, xiyou.wangcong, jiri, linux-kernel, netdev
In-Reply-To: <20190808.113813.478689798535715440.davem@davemloft.net>

Hi,

David Miller <davem@davemloft.net> writes:

> From: YueHaibing <yuehaibing@huawei.com>
> Date: Thu, 8 Aug 2019 22:26:23 +0800
>
>> net/sched/sch_taprio.c:680:32: warning:
>>  entry_list_policy defined but not used [-Wunused-const-variable=]
>> 
>> It is not used since commit a3d43c0d56f1 ("taprio: Add
>> support adding an admin schedule")
>> 
>> Reported-by: Hulk Robot <hulkci@huawei.com>
>> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
>
> This is probably unintentional and a bug, we should be using that
> policy value to validate that the sched list is indeed a nested
> attribute.

Removing this policy should be fine.

One of the points of commit a3d43c0d56f1 ("taprio: Add support adding an
admin schedule"), as explained in its commit message, is that it removes
support (it now returns "not supported") for schedules using the
TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY attribute (which were never used).
The parsing of those types of schedules was the only user of this
policy.

>
> I'm not applying this without at least a better and clear commit
> message explaining why we shouldn't be using this policy any more.

YueHaibing may use the text above in the commit message of a new spin of
this patch if you think it's clear enough.


Cheers,
--
Vinicius

^ permalink raw reply

* Re: [PATCH v2 13/15] net: phy: adin: configure downshift on config_init
From: Andrew Lunn @ 2019-08-08 20:39 UTC (permalink / raw)
  To: Heiner Kallweit
  Cc: Alexandru Ardelean, netdev, devicetree, linux-kernel, davem,
	robh+dt, mark.rutland, f.fainelli
In-Reply-To: <420c8e15-3361-a722-4ad1-3c448b1d3bc1@gmail.com>

On Thu, Aug 08, 2019 at 09:38:40PM +0200, Heiner Kallweit wrote:
> On 08.08.2019 14:30, Alexandru Ardelean wrote:
> > Down-speed auto-negotiation may not always be enabled, in which case the
> > PHY won't down-shift to 100 or 10 during auto-negotiation.
> > 
> > This change enables downshift and configures the number of retries to
> > default 8 (maximum supported value).
> > 
> > The change has been adapted from the Marvell PHY driver.
> > 
> Instead of a fixed downshift setting (like in the Marvell driver) you
> may consider to implement the ethtool phy-tunable ETHTOOL_PHY_DOWNSHIFT.

Hi Alexandru

Oops, sorry, my bad.

I looked at marvell_set_downshift(), and assumed it was connected to
the phy-tunable. I have patches somewhere which do that, but they
have not made it into mainline yet.

> See the Aquantia PHY driver for an example.

Yes, that does have all the tunable stuff.

     Andrew

^ permalink raw reply

* Re: [PATCH net] net: phy: rtl8211f: do a double read to get real time link status
From: Andrew Lunn @ 2019-08-08 20:34 UTC (permalink / raw)
  To: Heiner Kallweit
  Cc: Yonglong Liu, davem, netdev, linux-kernel, linuxarm, salil.mehta,
	yisen.zhuang, shiju.jose
In-Reply-To: <26e2c5c9-915c-858b-d091-e5bfa7ab6a5b@gmail.com>

On Thu, Aug 08, 2019 at 10:01:39PM +0200, Heiner Kallweit wrote:
> On 08.08.2019 21:40, Andrew Lunn wrote:
> >> @@ -568,6 +568,11 @@ int phy_start_aneg(struct phy_device *phydev)
> >>  	if (err < 0)
> >>  		goto out_unlock;
> >>  
> >> +	/* The PHY may not yet have cleared aneg-completed and link-up bit
> >> +	 * w/o this delay when the following read is done.
> >> +	 */
> >> +	usleep_range(1000, 2000);
> >> +
> > 
> > Hi Heiner
> > 
> > Does 802.3 C22 say anything about this?
> > 
> C22 says:
> "The Auto-Negotiation process shall be restarted by setting bit 0.9 to a logic one. This bit is self-
> clearing, and a PHY shall return a value of one in bit 0.9 until the Auto-Negotiation process has been
> initiated."
> 
> Maybe we should read bit 0.9 in genphy_update_link() after having read BMSR and report
> aneg-complete and link-up as false (regardless of their current value) if 0.9 is set.

Yes. That sounds sensible.

     Andrew

^ permalink raw reply

* Re: [PATCH v5 bpf-next] BPF: helpers: New helper to obtain namespace data from current task
From: Carlos Antonio Neira Bustos @ 2019-08-08 20:32 UTC (permalink / raw)
  To: Y Song
  Cc: Yonghong Song, netdev@vger.kernel.org, ebiederm@xmission.com,
	brouer@redhat.com, quentin.monnet@netronome.com
In-Reply-To: <CAH3MdRUiQJ4e4rRAE4WrbzG8LWvnuDC4J-UYQc1wRA7AEN=7+g@mail.gmail.com>

Hi Yonghong,

Sorry, just to be sure: am I only missing the error codes from filename_lookup()?
I'll work on that.

Bests

> > > Maybe reword by following the code sequence.
> > >     if *size_of_pidns* is not valid or unable to get ns, pid or tgid of
> > >     the current task.
> > >
> > > > + *
> > > > + *         **-ENOMEM**  if allocation fails.
> > >
> > > Maybe some other error codes in filename_lookup() function?
> > >
> > > > + *
> > > > + *         If unable to get the inode from /proc/self/ns/pid an error code
> > > > + *         will be returned.
> > >
> > > You do not need this. The description of error code cases should cover this.
>

On Thu, Aug 08, 2019 at 12:44:22PM -0700, Y Song wrote:
> On Thu, Aug 8, 2019 at 10:52 AM Carlos Antonio Neira Bustos
> <cneirabustos@gmail.com> wrote:
> >
> > Yonghong,
> >
> > I have modified the patch following your feedback.
> > Let me know if I'm missing something.
> 
> Yes, I have some other requests about formatting.
> https://lore.kernel.org/netdev/20190808174848.poybtaagg5ctle7t@dev00/T/#t
> Could you address it as well?
> 
> >
> > Bests
> >
> > From 70f8d5584700c9cfc82c006901d8ee9595c53f15 Mon Sep 17 00:00:00 2001
> > From: Carlos <cneirabustos@gmail.com>
> > Date: Wed, 7 Aug 2019 20:04:30 -0400
> > Subject: [PATCH] [PATCH v6 bpf-next] BPF: New helper to obtain namespace data
> >  from current task
> >
> > This helper obtains the active namespace from current and returns pid, tgid,
> > device and namespace id as seen from that namespace, allowing to instrument
> > a process inside a container.
> > Device is read from /proc/self/ns/pid, as in the future it's possible that
> > different pid_ns files may belong to different devices, according
> > to the discussion between Eric Biederman and Yonghong in 2017 linux plumbers
> > conference.
> > Currently bpf_get_current_pid_tgid() is used to do pid filtering in bcc's
> > scripts but this helper returns the pid as seen by the root namespace which is
> > fine when a bcc script is not executed inside a container.
> > When the process of interest is inside a container, pid filtering will not work
> > if bpf_get_current_pid_tgid() is used. This helper addresses this limitation
> > returning the pid as it's seen by the current namespace where the script is
> > executing.
> >
> > This helper has the same use cases as bpf_get_current_pid_tgid() as it can be
> > used to do pid filtering even inside a container.
> >
> > For example a bcc script using bpf_get_current_pid_tgid() (tools/funccount.py):
> >
> >         u32 pid = bpf_get_current_pid_tgid() >> 32;
> >         if (pid != <pid_arg_passed_in>)
> >                 return 0;
> > Could be modified to use bpf_get_current_pidns_info() as follows:
> >
> >         struct bpf_pidns pidns;
> >         bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns));
> >         u32 pid = pidns.tgid;
> >         u32 nsid = pidns.nsid;
> >         if ((pid != <pid_arg_passed_in>) && (nsid != <nsid_arg_passed_in>))
> >                 return 0;
> >
> > To find out the PID namespace id of a process, you could use this command:
> >
> > $ ps -h -o pidns -p <pid_of_interest>
> >
> > Or this other command:
> >
> > $ ls -Li /proc/<pid_of_interest>/ns/pid
> >
> > Signed-off-by: Carlos Neira <cneirabustos@gmail.com>
> > ---
> >  fs/internal.h                                      |   2 -
> >  fs/namei.c                                         |   1 -
> >  include/linux/bpf.h                                |   1 +
> >  include/linux/namei.h                              |   4 +
> >  include/uapi/linux/bpf.h                           |  27 +++-
> >  kernel/bpf/core.c                                  |   1 +
> >  kernel/bpf/helpers.c                               |  64 ++++++++++
> >  kernel/trace/bpf_trace.c                           |   2 +
> >  samples/bpf/Makefile                               |   3 +
> >  samples/bpf/trace_ns_info_user.c                   |  35 ++++++
> >  samples/bpf/trace_ns_info_user_kern.c              |  44 +++++++
> >  tools/include/uapi/linux/bpf.h                     |  27 +++-
> >  tools/testing/selftests/bpf/Makefile               |   2 +-
> >  tools/testing/selftests/bpf/bpf_helpers.h          |   3 +
> >  .../testing/selftests/bpf/progs/test_pidns_kern.c  |  51 ++++++++
> >  tools/testing/selftests/bpf/test_pidns.c           | 138 +++++++++++++++++++++
> >  16 files changed, 399 insertions(+), 6 deletions(-)
> >  create mode 100644 samples/bpf/trace_ns_info_user.c
> >  create mode 100644 samples/bpf/trace_ns_info_user_kern.c
> >  create mode 100644 tools/testing/selftests/bpf/progs/test_pidns_kern.c
> >  create mode 100644 tools/testing/selftests/bpf/test_pidns.c
> >
> > diff --git a/fs/internal.h b/fs/internal.h
> > index 315fcd8d237c..6647e15dd419 100644
> > --- a/fs/internal.h
> > +++ b/fs/internal.h
> > @@ -59,8 +59,6 @@ extern int finish_clean_context(struct fs_context *fc);
> >  /*
> >   * namei.c
> >   */
> > -extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
> > -                          struct path *path, struct path *root);
> >  extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
> >  extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
> >                            const char *, unsigned int, struct path *);
> > diff --git a/fs/namei.c b/fs/namei.c
> > index 209c51a5226c..a89fc72a4a10 100644
> > --- a/fs/namei.c
> > +++ b/fs/namei.c
> > @@ -19,7 +19,6 @@
> >  #include <linux/export.h>
> >  #include <linux/kernel.h>
> >  #include <linux/slab.h>
> > -#include <linux/fs.h>
> >  #include <linux/namei.h>
> >  #include <linux/pagemap.h>
> >  #include <linux/fsnotify.h>
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index f9a506147c8a..e4adf5e05afd 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -1050,6 +1050,7 @@ extern const struct bpf_func_proto bpf_get_local_storage_proto;
> >  extern const struct bpf_func_proto bpf_strtol_proto;
> >  extern const struct bpf_func_proto bpf_strtoul_proto;
> >  extern const struct bpf_func_proto bpf_tcp_sock_proto;
> > +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;
> >
> >  /* Shared helpers among cBPF and eBPF. */
> >  void bpf_user_rnd_init_once(void);
> > diff --git a/include/linux/namei.h b/include/linux/namei.h
> > index 9138b4471dbf..b45c8b6f7cb4 100644
> > --- a/include/linux/namei.h
> > +++ b/include/linux/namei.h
> > @@ -6,6 +6,7 @@
> >  #include <linux/path.h>
> >  #include <linux/fcntl.h>
> >  #include <linux/errno.h>
> > +#include <linux/fs.h>
> >
> >  enum { MAX_NESTED_LINKS = 8 };
> >
> > @@ -97,6 +98,9 @@ extern void unlock_rename(struct dentry *, struct dentry *);
> >
> >  extern void nd_jump_link(struct path *path);
> >
> > +extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
> > +                          struct path *path, struct path *root);
> > +
> >  static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
> >  {
> >         ((char *) name)[min(len, maxlen)] = '\0';
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 4393bd4b2419..b0d4869fb860 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -2741,6 +2741,24 @@ union bpf_attr {
> >   *             **-EOPNOTSUPP** kernel configuration does not enable SYN cookies
> >   *
> >   *             **-EPROTONOSUPPORT** IP packet version is not 4 or 6
> > + *
> > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 size_of_pidns)
> > + *     Description
> > + *             Copies into *pidns* pid, namespace id and tgid as seen by the
> > + *             current namespace and also device from /proc/self/ns/pid.
> > + *             *size_of_pidns* must be the size of *pidns*
> > + *
> > + *             This helper is used when pid filtering is needed inside a
> > + *             container as bpf_get_current_tgid() helper returns always the
> > + *             pid id as seen by the root namespace.
> > + *     Return
> > + *             0 on success
> > + *
> > + *             **-EINVAL** if *size_of_pidns* is not valid or unable to get ns, pid
> > + *             or tgid of the current task.
> > + *
> > + *             **-ENOMEM**  if allocation fails.
> > + *
> >   */
> >  #define __BPF_FUNC_MAPPER(FN)          \
> >         FN(unspec),                     \
> > @@ -2853,7 +2871,8 @@ union bpf_attr {
> >         FN(sk_storage_get),             \
> >         FN(sk_storage_delete),          \
> >         FN(send_signal),                \
> > -       FN(tcp_gen_syncookie),
> > +       FN(tcp_gen_syncookie),          \
> > +       FN(get_current_pidns_info),
> >
> >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> >   * function eBPF program intends to call
> > @@ -3604,4 +3623,10 @@ struct bpf_sockopt {
> >         __s32   retval;
> >  };
> >
> > +struct bpf_pidns_info {
> > +       __u32 dev;
> > +       __u32 nsid;
> > +       __u32 tgid;
> > +       __u32 pid;
> > +};
> >  #endif /* _UAPI__LINUX_BPF_H__ */
> > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> > index 8191a7db2777..3159f2a0188c 100644
> > --- a/kernel/bpf/core.c
> > +++ b/kernel/bpf/core.c
> > @@ -2038,6 +2038,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
> >  const struct bpf_func_proto bpf_get_current_comm_proto __weak;
> >  const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
> >  const struct bpf_func_proto bpf_get_local_storage_proto __weak;
> > +const struct bpf_func_proto bpf_get_current_pidns_info __weak;
> >
> >  const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
> >  {
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index 5e28718928ca..41fbf1f28a48 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -11,6 +11,12 @@
> >  #include <linux/uidgid.h>
> >  #include <linux/filter.h>
> >  #include <linux/ctype.h>
> > +#include <linux/pid_namespace.h>
> > +#include <linux/major.h>
> > +#include <linux/stat.h>
> > +#include <linux/namei.h>
> > +#include <linux/version.h>
> > +
> >
> >  #include "../../lib/kstrtox.h"
> >
> > @@ -312,6 +318,64 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
> >         preempt_enable();
> >  }
> >
> > +BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, pidns_info, u32,
> > +        size)
> > +{
> > +       const char *pidns_path = "/proc/self/ns/pid";
> > +       struct pid_namespace *pidns = NULL;
> > +       struct filename *tmp = NULL;
> > +       struct inode *inode;
> > +       struct path kp;
> > +       pid_t tgid = 0;
> > +       pid_t pid = 0;
> > +       int ret;
> > +       int len;
> > +
> > +       if (unlikely(size != sizeof(struct bpf_pidns_info)))
> > +               return -EINVAL;
> > +       pidns = task_active_pid_ns(current);
> > +       if (unlikely(!pidns))
> > +               goto clear;
> > +       pidns_info->nsid =  pidns->ns.inum;
> > +       pid = task_pid_nr_ns(current, pidns);
> > +       if (unlikely(!pid))
> > +               goto clear;
> > +       tgid = task_tgid_nr_ns(current, pidns);
> > +       if (unlikely(!tgid))
> > +               goto clear;
> > +       pidns_info->tgid = (u32) tgid;
> > +       pidns_info->pid = (u32) pid;
> > +       tmp = kmem_cache_alloc(names_cachep, GFP_ATOMIC);
> > +       if (unlikely(!tmp)) {
> > +               memset((void *)pidns_info, 0, (size_t) size);
> > +               return -ENOMEM;
> > +       }
> > +       len = strlen(pidns_path) + 1;
> > +       memcpy((char *)tmp->name, pidns_path, len);
> > +       tmp->uptr = NULL;
> > +       tmp->aname = NULL;
> > +       tmp->refcnt = 1;
> > +       ret = filename_lookup(AT_FDCWD, tmp, 0, &kp, NULL);
> > +       if (ret) {
> > +               memset((void *)pidns_info, 0, (size_t) size);
> > +               return ret;
> > +       }
> > +       inode = d_backing_inode(kp.dentry);
> > +       pidns_info->dev = inode->i_sb->s_dev;
> > +       return 0;
> > +clear:
> > +       memset((void *)pidns_info, 0, (size_t) size);
> > +       return -EINVAL;
> > +}
> > +
> > +const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
> > +       .func           = bpf_get_current_pidns_info,
> > +       .gpl_only       = false,
> > +       .ret_type       = RET_INTEGER,
> > +       .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
> > +       .arg2_type      = ARG_CONST_SIZE,
> > +};
> > +
> >  #ifdef CONFIG_CGROUPS
> >  BPF_CALL_0(bpf_get_current_cgroup_id)
> >  {
> > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > index ca1255d14576..5e1dc22765a5 100644
> > --- a/kernel/trace/bpf_trace.c
> > +++ b/kernel/trace/bpf_trace.c
> > @@ -709,6 +709,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> >  #endif
> >         case BPF_FUNC_send_signal:
> >                 return &bpf_send_signal_proto;
> > +       case BPF_FUNC_get_current_pidns_info:
> > +               return &bpf_get_current_pidns_info_proto;
> >         default:
> >                 return NULL;
> >         }
> > diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> > index 1d9be26b4edd..238453ff27d2 100644
> > --- a/samples/bpf/Makefile
> > +++ b/samples/bpf/Makefile
> > @@ -53,6 +53,7 @@ hostprogs-y += task_fd_query
> >  hostprogs-y += xdp_sample_pkts
> >  hostprogs-y += ibumad
> >  hostprogs-y += hbm
> > +hostprogs-y += trace_ns_info
> >
> >  # Libbpf dependencies
> >  LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
> > @@ -109,6 +110,7 @@ task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
> >  xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
> >  ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
> >  hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
> > +trace_ns_info-objs := bpf_load.o trace_ns_info_user.o
> >
> >  # Tell kbuild to always build the programs
> >  always := $(hostprogs-y)
> > @@ -170,6 +172,7 @@ always += xdp_sample_pkts_kern.o
> >  always += ibumad_kern.o
> >  always += hbm_out_kern.o
> >  always += hbm_edt_kern.o
> > +always += trace_ns_info_user_kern.o
> >
> >  KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
> >  KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/bpf/
> > diff --git a/samples/bpf/trace_ns_info_user.c b/samples/bpf/trace_ns_info_user.c
> > new file mode 100644
> > index 000000000000..e06d08db6f30
> > --- /dev/null
> > +++ b/samples/bpf/trace_ns_info_user.c
> > @@ -0,0 +1,35 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of version 2 of the GNU General Public
> > + * License as published by the Free Software Foundation.
> > + */
> > +
> > +#include <stdio.h>
> > +#include <linux/bpf.h>
> > +#include <unistd.h>
> > +#include "bpf/libbpf.h"
> > +#include "bpf_load.h"
> > +
> > +/* This code was taken verbatim from tracex1_user.c, it's used
> > + * to exercise the bpf_get_current_pidns_info() helper call.
> > + */
> > +int main(int ac, char **argv)
> > +{
> > +       FILE *f;
> > +       char filename[256];
> > +
> > +       snprintf(filename, sizeof(filename), "%s_user_kern.o", argv[0]);
> > +       printf("loading %s\n", filename);
> > +
> > +       if (load_bpf_file(filename)) {
> > +               printf("%s", bpf_log_buf);
> > +               return 1;
> > +       }
> > +
> > +       f = popen("taskset 1 ping  localhost", "r");
> > +       (void) f;
> > +       read_trace_pipe();
> > +       return 0;
> > +}
> > diff --git a/samples/bpf/trace_ns_info_user_kern.c b/samples/bpf/trace_ns_info_user_kern.c
> > new file mode 100644
> > index 000000000000..96675e02b707
> > --- /dev/null
> > +++ b/samples/bpf/trace_ns_info_user_kern.c
> > @@ -0,0 +1,44 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of version 2 of the GNU General Public
> > + * License as published by the Free Software Foundation.
> > + */
> > +#include <linux/skbuff.h>
> > +#include <linux/netdevice.h>
> > +#include <linux/version.h>
> > +#include <uapi/linux/bpf.h>
> > +#include "bpf_helpers.h"
> > +
> > +typedef __u64 u64;
> > +typedef __u32 u32;
> > +
> > +
> > +/* kprobe is NOT a stable ABI
> > + * kernel functions can be removed, renamed or completely change semantics.
> > + * Number of arguments and their positions can change, etc.
> > + * In such case this bpf+kprobe example will no longer be meaningful
> > + */
> > +
> > +/* This will call bpf_get_current_pidns_info() to display pid and ns values
> > + * as seen by the current namespace, on the far left you will see the pid as
> > + * seen by the root namespace.
> > + */
> > +
> > +SEC("kprobe/__netif_receive_skb_core")
> > +int bpf_prog1(struct pt_regs *ctx)
> > +{
> > +       char fmt[] = "nsid:%u, dev: %u,  pid:%u\n";
> > +       struct bpf_pidns_info nsinfo;
> > +       int ok = 0;
> > +
> > +       ok = bpf_get_current_pidns_info(&nsinfo, sizeof(nsinfo));
> > +       if (ok == 0)
> > +               bpf_trace_printk(fmt, sizeof(fmt), (u32)nsinfo.nsid,
> > +                                (u32) nsinfo.dev, (u32)nsinfo.pid);
> > +
> > +       return 0;
> > +}
> > +char _license[] SEC("license") = "GPL";
> > +u32 _version SEC("version") = LINUX_VERSION_CODE;
> > diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> > index 4393bd4b2419..b0d4869fb860 100644
> > --- a/tools/include/uapi/linux/bpf.h
> > +++ b/tools/include/uapi/linux/bpf.h
> > @@ -2741,6 +2741,24 @@ union bpf_attr {
> >   *             **-EOPNOTSUPP** kernel configuration does not enable SYN cookies
> >   *
> >   *             **-EPROTONOSUPPORT** IP packet version is not 4 or 6
> > + *
> > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 size_of_pidns)
> > + *     Description
> > + *             Copies into *pidns* pid, namespace id and tgid as seen by the
> > + *             current namespace and also device from /proc/self/ns/pid.
> > + *             *size_of_pidns* must be the size of *pidns*
> > + *
> > + *             This helper is used when pid filtering is needed inside a
> > + *             container as bpf_get_current_tgid() helper returns always the
> > + *             pid id as seen by the root namespace.
> > + *     Return
> > + *             0 on success
> > + *
> > + *             **-EINVAL** if *size_of_pidns* is not valid or unable to get ns, pid
> > + *             or tgid of the current task.
> > + *
> > + *             **-ENOMEM**  if allocation fails.
> > + *
> >   */
> >  #define __BPF_FUNC_MAPPER(FN)          \
> >         FN(unspec),                     \
> > @@ -2853,7 +2871,8 @@ union bpf_attr {
> >         FN(sk_storage_get),             \
> >         FN(sk_storage_delete),          \
> >         FN(send_signal),                \
> > -       FN(tcp_gen_syncookie),
> > +       FN(tcp_gen_syncookie),          \
> > +       FN(get_current_pidns_info),
> >
> >  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> >   * function eBPF program intends to call
> > @@ -3604,4 +3623,10 @@ struct bpf_sockopt {
> >         __s32   retval;
> >  };
> >
> > +struct bpf_pidns_info {
> > +       __u32 dev;
> > +       __u32 nsid;
> > +       __u32 tgid;
> > +       __u32 pid;
> > +};
> >  #endif /* _UAPI__LINUX_BPF_H__ */
> > diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> > index 3bd0f4a0336a..1f97b571b581 100644
> > --- a/tools/testing/selftests/bpf/Makefile
> > +++ b/tools/testing/selftests/bpf/Makefile
> > @@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
> >         test_cgroup_storage test_select_reuseport test_section_names \
> >         test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
> >         test_btf_dump test_cgroup_attach xdping test_sockopt test_sockopt_sk \
> > -       test_sockopt_multi test_tcp_rtt
> > +       test_sockopt_multi test_tcp_rtt test_pidns
> >
> >  BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
> >  TEST_GEN_FILES = $(BPF_OBJ_FILES)
> > diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
> > index 120aa86c58d3..c96795a9d983 100644
> > --- a/tools/testing/selftests/bpf/bpf_helpers.h
> > +++ b/tools/testing/selftests/bpf/bpf_helpers.h
> > @@ -231,6 +231,9 @@ static int (*bpf_send_signal)(unsigned sig) = (void *)BPF_FUNC_send_signal;
> >  static long long (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *ip,
> >                                           int ip_len, void *tcp, int tcp_len) =
> >         (void *) BPF_FUNC_tcp_gen_syncookie;
> > +static int (*bpf_get_current_pidns_info)(struct bpf_pidns_info *buf,
> > +                                        unsigned int buf_size) =
> > +       (void *) BPF_FUNC_get_current_pidns_info;
> >
> >  /* llvm builtin functions that eBPF C program may use to
> >   * emit BPF_LD_ABS and BPF_LD_IND instructions
> > diff --git a/tools/testing/selftests/bpf/progs/test_pidns_kern.c b/tools/testing/selftests/bpf/progs/test_pidns_kern.c
> > new file mode 100644
> > index 000000000000..e1d2facfa762
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/progs/test_pidns_kern.c
> > @@ -0,0 +1,51 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of version 2 of the GNU General Public
> > + * License as published by the Free Software Foundation.
> > + */
> > +
> > +#include <linux/bpf.h>
> > +#include <errno.h>
> > +#include "bpf_helpers.h"
> > +
> > +struct bpf_map_def SEC("maps") nsidmap = {
> > +       .type = BPF_MAP_TYPE_ARRAY,
> > +       .key_size = sizeof(__u32),
> > +       .value_size = sizeof(__u32),
> > +       .max_entries = 1,
> > +};
> > +
> > +struct bpf_map_def SEC("maps") pidmap = {
> > +       .type = BPF_MAP_TYPE_ARRAY,
> > +       .key_size = sizeof(__u32),
> > +       .value_size = sizeof(__u32),
> > +       .max_entries = 1,
> > +};
> > +
> > +SEC("tracepoint/syscalls/sys_enter_nanosleep")
> > +int trace(void *ctx)
> > +{
> > +       struct bpf_pidns_info nsinfo;
> > +       __u32 key = 0, *expected_pid, *val;
> > +       char fmt[] = "ERROR nspid:%d\n";
> > +
> > +       if (bpf_get_current_pidns_info(&nsinfo, sizeof(nsinfo)))
> > +               return -EINVAL;
> > +
> > +       expected_pid = bpf_map_lookup_elem(&pidmap, &key);
> > +
> > +
> > +       if (!expected_pid || *expected_pid != nsinfo.pid)
> > +               return 0;
> > +
> > +       val = bpf_map_lookup_elem(&nsidmap, &key);
> > +       if (val)
> > +               *val = nsinfo.nsid;
> > +
> > +       return 0;
> > +}
> > +
> > +char _license[] SEC("license") = "GPL";
> > +__u32 _version SEC("version") = 1;
> > diff --git a/tools/testing/selftests/bpf/test_pidns.c b/tools/testing/selftests/bpf/test_pidns.c
> > new file mode 100644
> > index 000000000000..a7254055f294
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/test_pidns.c
> > @@ -0,0 +1,138 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> > + *
> > + * This program is free software; you can redistribute it and/or
> > + * modify it under the terms of version 2 of the GNU General Public
> > + * License as published by the Free Software Foundation.
> > + */
> > +
> > +#include <stdio.h>
> > +#include <stdlib.h>
> > +#include <string.h>
> > +#include <errno.h>
> > +#include <fcntl.h>
> > +#include <syscall.h>
> > +#include <unistd.h>
> > +#include <linux/perf_event.h>
> > +#include <sys/ioctl.h>
> > +#include <sys/time.h>
> > +#include <sys/types.h>
> > +#include <sys/stat.h>
> > +
> > +#include <linux/bpf.h>
> > +#include <bpf/bpf.h>
> > +#include <bpf/libbpf.h>
> > +
> > +#include "cgroup_helpers.h"
> > +#include "bpf_rlimit.h"
> > +
> > +#define CHECK(condition, tag, format...) ({            \
> > +       int __ret = !!(condition);                      \
> > +       if (__ret) {                                    \
> > +               printf("%s:FAIL:%s ", __func__, tag);   \
> > +               printf(format);                         \
> > +       } else {                                        \
> > +               printf("%s:PASS:%s\n", __func__, tag);  \
> > +       }                                               \
> > +       __ret;                                          \
> > +})
> > +
> > +static int bpf_find_map(const char *test, struct bpf_object *obj,
> > +                       const char *name)
> > +{
> > +       struct bpf_map *map;
> > +
> > +       map = bpf_object__find_map_by_name(obj, name);
> > +       if (!map)
> > +               return -1;
> > +       return bpf_map__fd(map);
> > +}
> > +
> > +
> > +int main(int argc, char **argv)
> > +{
> > +       const char *probe_name = "syscalls/sys_enter_nanosleep";
> > +       const char *file = "test_pidns_kern.o";
> > +       int err, bytes, efd, prog_fd, pmu_fd;
> > +       int pidmap_fd, nsidmap_fd;
> > +       struct perf_event_attr attr = {};
> > +       struct bpf_object *obj;
> > +       __u32 knsid = 0;
> > +       __u32 key = 0, pid;
> > +       int exit_code = 1;
> > +       struct stat st;
> > +       char buf[256];
> > +
> > +       err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
> > +       if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
> > +               goto cleanup_cgroup_env;
> > +
> > +       nsidmap_fd = bpf_find_map(__func__, obj, "nsidmap");
> > +       if (CHECK(nsidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
> > +                 nsidmap_fd, errno))
> > +               goto close_prog;
> > +
> > +       pidmap_fd = bpf_find_map(__func__, obj, "pidmap");
> > +       if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
> > +                 pidmap_fd, errno))
> > +               goto close_prog;
> > +
> > +       pid = getpid();
> > +       bpf_map_update_elem(pidmap_fd, &key, &pid, 0);
> > +
> > +       snprintf(buf, sizeof(buf),
> > +                "/sys/kernel/debug/tracing/events/%s/id", probe_name);
> > +       efd = open(buf, O_RDONLY, 0);
> > +       if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
> > +               goto close_prog;
> > +       bytes = read(efd, buf, sizeof(buf));
> > +       close(efd);
> > +       if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
> > +                 "bytes %d errno %d\n", bytes, errno))
> > +               goto close_prog;
> > +
> > +       attr.config = strtol(buf, NULL, 0);
> > +       attr.type = PERF_TYPE_TRACEPOINT;
> > +       attr.sample_type = PERF_SAMPLE_RAW;
> > +       attr.sample_period = 1;
> > +       attr.wakeup_events = 1;
> > +
> > +       pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
> > +       if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
> > +                 errno))
> > +               goto close_prog;
> > +
> > +       err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
> > +       if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
> > +                 errno))
> > +               goto close_pmu;
> > +
> > +       err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
> > +       if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
> > +                 errno))
> > +               goto close_pmu;
> > +
> > +       /* trigger some syscalls */
> > +       sleep(1);
> > +
> > +       err = bpf_map_lookup_elem(nsidmap_fd, &key, &knsid);
> > +       if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", err, errno))
> > +               goto close_pmu;
> > +
> > +       if (stat("/proc/self/ns/pid", &st))
> > +               goto close_pmu;
> > +
> > +       if (CHECK(knsid != (__u32) st.st_ino, "compare_namespace_id",
> > +                 "kern knsid %u user unsid %u\n", knsid, (__u32) st.st_ino))
> > +               goto close_pmu;
> > +
> > +       exit_code = 0;
> > +       printf("%s:PASS\n", argv[0]);
> > +
> > +close_pmu:
> > +       close(pmu_fd);
> > +close_prog:
> > +       bpf_object__close(obj);
> > +cleanup_cgroup_env:
> > +       return exit_code;
> > +}
> > --
> > 2.11.0
> >
> >
> >
> >
> >
> >
> > On Thu, Aug 08, 2019 at 05:09:51AM +0000, Yonghong Song wrote:
> > >
> > >
> > > On 8/7/19 6:22 PM, Carlos Antonio Neira Bustos wrote:
> > > > The code has been modified to avoid syscalls that could sleep.
> > > > Please let me know if any other modification is needed.
> > > >
> > > >  From be0384c0fa209a78c1567936e8db4e35b9a7c0f8 Mon Sep 17 00:00:00 2001
> > > > From: Carlos <cneirabustos@gmail.com>
> > > > Date: Wed, 7 Aug 2019 20:04:30 -0400
> > > > Subject: [PATCH] [PATCH v5 bpf-next] BPF: New helper to obtain namespace data
> > > >   from current task
> > > >
> > > > This helper obtains the active namespace from current and returns pid, tgid,
> > > > device and namespace id as seen from that namespace, allowing to instrument
> > > > a process inside a container.
> > > > Device is read from /proc/self/ns/pid, as in the future it's possible that
> > > > different pid_ns files may belong to different devices, according
> > > > to the discussion between Eric Biederman and Yonghong in 2017 linux plumbers
> > > > conference.
> > > > > Currently bpf_get_current_pid_tgid() is used to do pid filtering in bcc's
> > > > > scripts, but this helper returns the pid as seen by the root namespace, which is
> > > > > fine when a bcc script is not executed inside a container.
> > > > When the process of interest is inside a container, pid filtering will not work
> > > > if bpf_get_current_pid_tgid() is used. This helper addresses this limitation
> > > > returning the pid as it's seen by the current namespace where the script is
> > > > executing.
> > > >
> > > > This helper has the same use cases as bpf_get_current_pid_tgid() as it can be
> > > > used to do pid filtering even inside a container.
> > > >
> > > > For example a bcc script using bpf_get_current_pid_tgid() (tools/funccount.py):
> > > >
> > > >          u32 pid = bpf_get_current_pid_tgid() >> 32;
> > > >          if (pid != <pid_arg_passed_in>)
> > > >                  return 0;
> > > > Could be modified to use bpf_get_current_pidns_info() as follows:
> > > >
> > > >          struct bpf_pidns pidns;
> > > >          bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns));
> > > >          u32 pid = pidns.tgid;
> > > >          u32 nsid = pidns.nsid;
> > > >          if ((pid != <pid_arg_passed_in>) && (nsid != <nsid_arg_passed_in>))
> > > >                  return 0;
> > > >
> > > > > To find out the PID namespace id of a process, you could use this command:
> > > >
> > > > $ ps -h -o pidns -p <pid_of_interest>
> > > >
> > > > Or this other command:
> > > >
> > > > $ ls -Li /proc/<pid_of_interest>/ns/pid
> > > >
> > > > Signed-off-by: Carlos Neira <cneirabustos@gmail.com>
> > > > ---
> > > >   fs/namei.c                                         |   2 +-
> > > >   include/linux/bpf.h                                |   1 +
> > > >   include/linux/namei.h                              |   4 +
> > > >   include/uapi/linux/bpf.h                           |  29 ++++-
> > > >   kernel/bpf/core.c                                  |   1 +
> > > >   kernel/bpf/helpers.c                               |  78 ++++++++++++
> > > >   kernel/trace/bpf_trace.c                           |   2 +
> > > >   samples/bpf/Makefile                               |   3 +
> > > >   samples/bpf/trace_ns_info_user.c                   |  35 ++++++
> > > >   samples/bpf/trace_ns_info_user_kern.c              |  44 +++++++
> > > >   tools/include/uapi/linux/bpf.h                     |  29 ++++-
> > > >   tools/testing/selftests/bpf/Makefile               |   2 +-
> > > >   tools/testing/selftests/bpf/bpf_helpers.h          |   3 +
> > > >   .../testing/selftests/bpf/progs/test_pidns_kern.c  |  51 ++++++++
> > > >   tools/testing/selftests/bpf/test_pidns.c           | 138 +++++++++++++++++++++
> > > >   15 files changed, 418 insertions(+), 4 deletions(-)
> > > >   create mode 100644 samples/bpf/trace_ns_info_user.c
> > > >   create mode 100644 samples/bpf/trace_ns_info_user_kern.c
> > > >   create mode 100644 tools/testing/selftests/bpf/progs/test_pidns_kern.c
> > > >   create mode 100644 tools/testing/selftests/bpf/test_pidns.c
> > > >
> > > > diff --git a/fs/namei.c b/fs/namei.c
> > > > index 209c51a5226c..d1eca36972d2 100644
> > > > --- a/fs/namei.c
> > > > +++ b/fs/namei.c
> > > > @@ -19,7 +19,6 @@
> > > >   #include <linux/export.h>
> > > >   #include <linux/kernel.h>
> > > >   #include <linux/slab.h>
> > > > -#include <linux/fs.h>
> > > >   #include <linux/namei.h>
> > > >   #include <linux/pagemap.h>
> > > >   #include <linux/fsnotify.h>
> > > > @@ -2355,6 +2354,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags,
> > > >     putname(name);
> > > >     return retval;
> > > >   }
> > > > +EXPORT_SYMBOL(filename_lookup);
> > >
> > > No need to export symbols. bpf uses it and bpf is in the core, not in
> > > modules.
> > >
> > > >
> > > >   /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
> > > >   static int path_parentat(struct nameidata *nd, unsigned flags,
> > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > > index f9a506147c8a..e4adf5e05afd 100644
> > > > --- a/include/linux/bpf.h
> > > > +++ b/include/linux/bpf.h
> > > > @@ -1050,6 +1050,7 @@ extern const struct bpf_func_proto bpf_get_local_storage_proto;
> > > >   extern const struct bpf_func_proto bpf_strtol_proto;
> > > >   extern const struct bpf_func_proto bpf_strtoul_proto;
> > > >   extern const struct bpf_func_proto bpf_tcp_sock_proto;
> > > > +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;
> > > >
> > > >   /* Shared helpers among cBPF and eBPF. */
> > > >   void bpf_user_rnd_init_once(void);
> > > > diff --git a/include/linux/namei.h b/include/linux/namei.h
> > > > index 9138b4471dbf..2c24e8c71d46 100644
> > > > --- a/include/linux/namei.h
> > > > +++ b/include/linux/namei.h
> > > > @@ -6,6 +6,7 @@
> > > >   #include <linux/path.h>
> > > >   #include <linux/fcntl.h>
> > > >   #include <linux/errno.h>
> > > > +#include <linux/fs.h>
> > > >
> > > >   enum { MAX_NESTED_LINKS = 8 };
> > > >
> > > > @@ -97,6 +98,9 @@ extern void unlock_rename(struct dentry *, struct dentry *);
> > > >
> > > >   extern void nd_jump_link(struct path *path);
> > > >
> > > > +extern int filename_lookup(int dfd, struct filename *name, unsigned int flags,
> > > > +               struct path *path, struct path *root);
> > >
> > > The previous definition in fs/internal.h should be removed.
> > >
> > > > +
> > > >   static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
> > > >   {
> > > >     ((char *) name)[min(len, maxlen)] = '\0';
> > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > > index 4393bd4b2419..6f601f7106e2 100644
> > > > --- a/include/uapi/linux/bpf.h
> > > > +++ b/include/uapi/linux/bpf.h
> > > > @@ -2741,6 +2741,26 @@ union bpf_attr {
> > > >    *                **-EOPNOTSUPP** kernel configuration does not enable SYN cookies
> > > >    *
> > > >    *                **-EPROTONOSUPPORT** IP packet version is not 4 or 6
> > > > + *
> > > > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 size_of_pidns)
> > > > + * Description
> > > > + *         Copies into *pidns* pid, namespace id and tgid as seen by the
> > > > + *         current namespace and also device from /proc/self/ns/pid.
> > > > + *         *size_of_pidns* must be the size of *pidns*
> > > > + *
> > > > + *         This helper is used when pid filtering is needed inside a
> > > > + *         container as bpf_get_current_tgid() helper returns always the
> > > > + *         pid id as seen by the root namespace.
> > > > + * Return
> > > > + *         0 on success
> > > > + *
> > > > + *         **-EINVAL**  if unable to get ns, pid or tgid of current task.
> > > > + *         Or if size_of_pidns is not valid.
> > >
> > > Maybe reword by following the code sequence.
> > >     if *size_of_pidns* is not valid or unable to get ns, pid or tgid of
> > >     the current task.
> > >
> > > > + *
> > > > + *         **-ENOMEM**  if allocation fails.
> > >
> > > Maybe some other error codes in filename_lookup() function?
> > >
> > > > + *
> > > > + *         If unable to get the inode from /proc/self/ns/pid an error code
> > > > + *         will be returned.
> > >
> > > You do not need this. The description of error code cases should cover this.
> > >
> > > >    */
> > > >   #define __BPF_FUNC_MAPPER(FN)             \
> > > >     FN(unspec),                     \
> > > > @@ -2853,7 +2873,8 @@ union bpf_attr {
> > > >     FN(sk_storage_get),             \
> > > >     FN(sk_storage_delete),          \
> > > >     FN(send_signal),                \
> > > > -   FN(tcp_gen_syncookie),
> > > > +   FN(tcp_gen_syncookie),          \
> > > > +   FN(get_current_pidns_info),
> > > >
> > > >   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> > > >    * function eBPF program intends to call
> > > > @@ -3604,4 +3625,10 @@ struct bpf_sockopt {
> > > >     __s32   retval;
> > > >   };
> > > >
> > > > +struct bpf_pidns_info {
> > > > +   __u32 dev;
> > > > +   __u32 nsid;
> > > > +   __u32 tgid;
> > > > +   __u32 pid;
> > > > +};
> > > >   #endif /* _UAPI__LINUX_BPF_H__ */
> > > > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> > > > index 8191a7db2777..3159f2a0188c 100644
> > > > --- a/kernel/bpf/core.c
> > > > +++ b/kernel/bpf/core.c
> > > > @@ -2038,6 +2038,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
> > > >   const struct bpf_func_proto bpf_get_current_comm_proto __weak;
> > > >   const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
> > > >   const struct bpf_func_proto bpf_get_local_storage_proto __weak;
> > > > +const struct bpf_func_proto bpf_get_current_pidns_info __weak;
> > > >
> > > >   const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
> > > >   {
> > > > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > > > index 5e28718928ca..571f24077db2 100644
> > > > --- a/kernel/bpf/helpers.c
> > > > +++ b/kernel/bpf/helpers.c
> > > > @@ -11,6 +11,12 @@
> > > >   #include <linux/uidgid.h>
> > > >   #include <linux/filter.h>
> > > >   #include <linux/ctype.h>
> > > > +#include <linux/pid_namespace.h>
> > > > +#include <linux/major.h>
> > > > +#include <linux/stat.h>
> > > > +#include <linux/namei.h>
> > > > +#include <linux/version.h>
> > > > +
> > > >
> > > >   #include "../../lib/kstrtox.h"
> > > >
> > > > @@ -312,6 +318,78 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
> > > >     preempt_enable();
> > > >   }
> > > >
> > > > +BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, pidns_info, u32,
> > > > +    size)
> > > > +{
> > > > +   const char *name = "/proc/self/ns/pid";
> > >
> > > maybe rename this variable to pidns_path?
> > >
> > > > +   struct pid_namespace *pidns = NULL;
> > > > +   struct filename *tmp = NULL;
> > >
> > > Maybe rename this variable to name?
> > >
> > > > +   int len = strlen(name) + 1;
> > >
> > > We can delay this assignment later until it is needed.
> > >
> > > > +   struct inode *inode;
> > > > +   struct path kp;
> > > > +   pid_t tgid = 0;
> > > > +   pid_t pid = 0;
> > > > +   int ret;
> > > > +
> > > > +   if (unlikely(size != sizeof(struct bpf_pidns_info)))
> > > > +           return -EINVAL;
> > > > +
> > > > +   pidns = task_active_pid_ns(current);
> > > > +
> > >
> > > we can save an empty line here.
> > >
> > > > +   if (unlikely(!pidns))
> > > > +           goto clear;
> > > > +
> > > > +   pidns_info->nsid =  pidns->ns.inum;
> > > > +   pid = task_pid_nr_ns(current, pidns);
> > > > +
> > >
> > > We can save an empty line here.
> > >
> > > > +   if (unlikely(!pid))
> > > > +           goto clear;
> > > > +
> > > > +   tgid = task_tgid_nr_ns(current, pidns);
> > > > +
> > > ditto. save an empty line.
> > > > +   if (unlikely(!tgid))
> > > > +           goto clear;
> > > > +
> > > > +   pidns_info->tgid = (u32) tgid;
> > > > +   pidns_info->pid = (u32) pid;
> > > > +
> > > > +   tmp = kmem_cache_alloc(names_cachep, GFP_ATOMIC);
> > > > +   if (unlikely(!tmp)) {
> > > > +           memset((void *)pidns_info, 0, (size_t) size);
> > > > +           return -ENOMEM;
> > > > +   }
> > > > +
> > > > +   memcpy((char *)tmp->name, name, len);
> > > > +   tmp->uptr = NULL;
> > > > +   tmp->aname = NULL;
> > > > +   tmp->refcnt = 1;
> > > > +
> > > ditto. save an empty line.
> > > > +   ret = filename_lookup(AT_FDCWD, tmp, 0, &kp, NULL);
> > > > +
> > > ditto. save an empty line.
> > > > +   if (ret) {
> > > > +           memset((void *)pidns_info, 0, (size_t) size);
> > > > +           return ret;
> > > > +   }
> > > > +
> > > > +   inode = d_backing_inode(kp.dentry);
> > > > +   pidns_info->dev = inode->i_sb->s_dev;
> > > > +
> > > > +   return 0;
> > > > +
> > > > +clear:
> > > > +   memset((void *)pidns_info, 0, (size_t) size);
> > > > +
> > > save an empty line.
> > > > +   return -EINVAL;
> > > > +}
> > > > +
> > > > +const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
> > > > +   .func   = bpf_get_current_pidns_info,
> > > make the "= " aligned with others?
> > > > +   .gpl_only       = false,
> > > > +   .ret_type       = RET_INTEGER,
> > > > +   .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
> > > > +   .arg2_type      = ARG_CONST_SIZE,
> > > > +};
> > > > +
> > > >   #ifdef CONFIG_CGROUPS
> > > >   BPF_CALL_0(bpf_get_current_cgroup_id)
> > > >   {
> > > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > > > index ca1255d14576..5e1dc22765a5 100644
> > > > --- a/kernel/trace/bpf_trace.c
> > > > +++ b/kernel/trace/bpf_trace.c
> > > > @@ -709,6 +709,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
> > > >   #endif
> > > >     case BPF_FUNC_send_signal:
> > > >             return &bpf_send_signal_proto;
> > > > +   case BPF_FUNC_get_current_pidns_info:
> > > > +           return &bpf_get_current_pidns_info_proto;
> > > >     default:
> > > >             return NULL;
> > > >     }
> > > > diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> > > > index 1d9be26b4edd..238453ff27d2 100644
> > > > --- a/samples/bpf/Makefile
> > > > +++ b/samples/bpf/Makefile
> > > > @@ -53,6 +53,7 @@ hostprogs-y += task_fd_query
> > > >   hostprogs-y += xdp_sample_pkts
> > > >   hostprogs-y += ibumad
> > > >   hostprogs-y += hbm
> > > > +hostprogs-y += trace_ns_info
> > > [...]

^ permalink raw reply

* Re: [v3,4/4] tools: bpftool: add documentation for net attach/detach
From: Daniel T. Lee @ 2019-08-08 20:28 UTC (permalink / raw)
  To: Quentin Monnet; +Cc: Daniel Borkmann, Alexei Starovoitov, netdev
In-Reply-To: <1cc16243-ad5a-87f3-7727-31a58599bf04@netronome.com>

On Fri, Aug 9, 2019 at 1:48 AM Quentin Monnet
<quentin.monnet@netronome.com> wrote:
>
> 2019-08-07 11:25 UTC+0900 ~ Daniel T. Lee <danieltimlee@gmail.com>
> > Since a new sub-command 'net attach/detach' has been added for
> > attaching an XDP program to an interface,
> > this commit documents the usage and sample output of `net attach/detach`.
> >
> > Signed-off-by: Daniel T. Lee <danieltimlee@gmail.com>
> > ---
> >  .../bpf/bpftool/Documentation/bpftool-net.rst | 51 +++++++++++++++++--
> >  1 file changed, 48 insertions(+), 3 deletions(-)
> >
> > diff --git a/tools/bpf/bpftool/Documentation/bpftool-net.rst b/tools/bpf/bpftool/Documentation/bpftool-net.rst
> > index d8e5237a2085..4ad1a380e186 100644
> > --- a/tools/bpf/bpftool/Documentation/bpftool-net.rst
> > +++ b/tools/bpf/bpftool/Documentation/bpftool-net.rst
> > @@ -15,17 +15,22 @@ SYNOPSIS
> >       *OPTIONS* := { [{ **-j** | **--json** }] [{ **-p** | **--pretty** }] }
> >
> >       *COMMANDS* :=
> > -     { **show** | **list** } [ **dev** name ] | **help**
> > +     { **show** | **list** | **attach** | **detach** | **help** }
> >
> >  NET COMMANDS
> >  ============
> >
> > -|    **bpftool** **net { show | list } [ dev name ]**
> > +|    **bpftool** **net { show | list }** [ **dev** *name* ]
> > +|    **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *name* [ **overwrite** ]
> > +|    **bpftool** **net detach** *ATTACH_TYPE* **dev** *name*
>
> Nit: Could we have "name" in capital letters (everywhere in the file),
> to make this file consistent with the formatting used for
> bpftool-prog.rst and bpftool-map.rst?
>

I'll replace every "name" with capitalized "NAME" in the next version of the patch.

> >  |    **bpftool** **net help**
> > +|
> > +|    *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* }
> > +|    *ATTACH_TYPE* := { **xdp** | **xdpgeneric** | **xdpdrv** | **xdpoffload** }
> >
> >  DESCRIPTION
> >  ===========
> > -     **bpftool net { show | list } [ dev name ]**
> > +     **bpftool net { show | list }** [ **dev** *name* ]
> >                    List bpf program attachments in the kernel networking subsystem.
> >
> >                    Currently, only device driver xdp attachments and tc filter
> > @@ -47,6 +52,18 @@ DESCRIPTION
> >                    all bpf programs attached to non clsact qdiscs, and finally all
> >                    bpf programs attached to root and clsact qdisc.
> >
> > +     **bpftool** **net attach** *ATTACH_TYPE* *PROG* **dev** *name* [ **overwrite** ]
> > +                  Attach bpf program *PROG* to network interface *name* with
> > +                  type specified by *ATTACH_TYPE*. Previously attached bpf program
> > +                  can be replaced by the command used with **overwrite** option.
> > +                  Currently, *ATTACH_TYPE* only contains XDP programs.
>
> Other nit: "ATTACH_TYPE only contains XDP programs" sounds odd to me.
> Could we maybe phrase this something like: "Currently, only XDP-related
> modes are supported for ATTACH_TYPE"?
>
> Also, could you please provide a brief description of the different
> attach types? In particular, explaining what "xdp" alone stands for
> might be useful.
>

I'll replace the phrase and add a brief description of the attach types.

> Thanks,
> Quentin
>
> > +
> > +     **bpftool** **net detach** *ATTACH_TYPE* **dev** *name*
> > +                  Detach bpf program attached to network interface *name* with
> > +                  type specified by *ATTACH_TYPE*. To detach bpf program, same
> > +                  *ATTACH_TYPE* previously used for attach must be specified.
> > +                  Currently, *ATTACH_TYPE* only contains XDP programs.

Thank you for taking the time to review this.

^ permalink raw reply

* Re: [PATCH net-next 3/3] net: phy: realtek: add support for the 2.5Gbps PHY in RTL8125
From: Heiner Kallweit @ 2019-08-08 20:24 UTC (permalink / raw)
  To: Andrew Lunn; +Cc: Florian Fainelli, David Miller, netdev@vger.kernel.org
In-Reply-To: <20190808202029.GN27917@lunn.ch>

On 08.08.2019 22:20, Andrew Lunn wrote:
>> I have a contact in Realtek who provided the information about
>> the vendor-specific registers used in the patch. I also asked for
>> a method to auto-detect 2.5Gbps support but have no feedback so far.
>> What may contribute to the problem is that the integrated 1Gbps
>> PHYs (all with the same PHY ID) also differ significantly from each other,
>> depending on the network chip version.
> 
> Hi Heiner
> 
> Some of the PHYs embedded in Marvell switches have an OUI, but no
> product ID. We work around this brokenness by trapping the reads to
> the ID registers in the MDIO bus controller driver and inserting the
> switch product ID. The Marvell PHY driver then recognises these IDs
> and does the right thing.
> 
> Maybe you can do something similar here?
> 
Yes, this would be an idea. Let me check.

>       Andrew
> 
Thanks, Heiner


^ permalink raw reply

* [net 12/12] net/mlx5e: Remove redundant check in CQE recovery flow of tx reporter
From: Saeed Mahameed @ 2019-08-08 20:22 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev@vger.kernel.org, Aya Levin, Saeed Mahameed
In-Reply-To: <20190808202025.11303-1-saeedm@mellanox.com>

From: Aya Levin <ayal@mellanox.com>

Remove the check of the recovery bit at the beginning of the CQE recovery
function. This test is already performed right before the reporter
is invoked, when a CQE error is detected.

Fixes: de8650a82071 ("net/mlx5e: Add tx reporter support")
Signed-off-by: Aya Levin <ayal@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index b91814ecfbc9..c7f86453c638 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -76,9 +76,6 @@ static int mlx5e_tx_reporter_err_cqe_recover(struct mlx5e_txqsq *sq)
 	u8 state;
 	int err;
 
-	if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
-		return 0;
-
 	err = mlx5_core_query_sq_state(mdev, sq->sqn, &state);
 	if (err) {
 		netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n",
-- 
2.21.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox