Netdev List
 help / color / mirror / Atom feed
* [net-next 09/15] net/mlx5e: Extend encap entry with reference counter
From: Saeed Mahameed @ 2019-08-09 22:04 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Vlad Buslov, Jianbo Liu, Roi Dayan,
	Saeed Mahameed
In-Reply-To: <20190809220359.11516-1-saeedm@mellanox.com>

From: Vlad Buslov <vladbu@mellanox.com>

The list of flows attached to an encap entry is used as an implicit
reference counter (the encap entry is deallocated when the list becomes
empty) and as a mechanism to obtain the encap entry that a flow is attached
to (through the list head). This is not safe when concurrent modification
of the list of flows attached to an encap entry is possible. A proper
atomic reference counter is required to support concurrent access.

As preparation for extending encap with reference counting, extract the
code that looks up and deletes an encap entry into standalone put/get
helpers. In order to remove the dependency on external locking, extend the
encap entry with a reference counter to manage its lifetime, and extend the
flow structure with a direct pointer to the encap entry that the flow is
attached to.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Reviewed-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_rep.c  |  5 ++
 .../net/ethernet/mellanox/mlx5/core/en_rep.h  |  1 +
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   | 84 ++++++++++++-------
 .../net/ethernet/mellanox/mlx5/core/en_tc.h   |  2 +
 4 files changed, 64 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index b7f113e996e5..cd957ff4e207 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -613,12 +613,17 @@ static void mlx5e_rep_neigh_update(struct work_struct *work)
 	neigh_connected = (nud_state & NUD_VALID) && !dead;
 
 	list_for_each_entry(e, &nhe->encap_list, encap_list) {
+		if (!mlx5e_encap_take(e))
+			continue;
+
 		encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID);
 		priv = netdev_priv(e->out_dev);
 
 		if (encap_connected != neigh_connected ||
 		    !ether_addr_equal(e->h_dest, ha))
 			mlx5e_rep_update_flows(priv, e, neigh_connected, ha);
+
+		mlx5e_encap_put(priv, e);
 	}
 	mlx5e_rep_neigh_entry_release(nhe);
 	rtnl_unlock();
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
index 43eeebe9c8d2..2e970d0729be 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h
@@ -164,6 +164,7 @@ struct mlx5e_encap_entry {
 	u8 flags;
 	char *encap_header;
 	int encap_size;
+	refcount_t refcnt;
 };
 
 struct mlx5e_rep_sq {
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index fcaf9ab9e373..4e378200a9d2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -103,6 +103,7 @@ enum {
  *        container_of(helper item, containing struct type, helper field[index])
  */
 struct encap_flow_item {
+	struct mlx5e_encap_entry *e; /* attached encap instance */
 	struct list_head list;
 	int index;
 };
@@ -1433,8 +1434,11 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
 
 	list_for_each_entry(e, &nhe->encap_list, encap_list) {
 		struct encap_flow_item *efi, *tmp;
-		if (!(e->flags & MLX5_ENCAP_ENTRY_VALID))
+
+		if (!(e->flags & MLX5_ENCAP_ENTRY_VALID) ||
+		    !mlx5e_encap_take(e))
 			continue;
+
 		list_for_each_entry_safe(efi, tmp, &e->flows, list) {
 			flow = container_of(efi, struct mlx5e_tc_flow,
 					    encaps[efi->index]);
@@ -1453,6 +1457,8 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
 
 			mlx5e_flow_put(netdev_priv(e->out_dev), flow);
 		}
+
+		mlx5e_encap_put(netdev_priv(e->out_dev), e);
 		if (neigh_used)
 			break;
 	}
@@ -1472,29 +1478,33 @@ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
 	}
 }
 
+void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e)
+{
+	if (!refcount_dec_and_test(&e->refcnt))
+		return;
+
+	WARN_ON(!list_empty(&e->flows));
+	mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
+
+	if (e->flags & MLX5_ENCAP_ENTRY_VALID)
+		mlx5_packet_reformat_dealloc(priv->mdev, e->encap_id);
+
+	hash_del_rcu(&e->encap_hlist);
+	kfree(e->encap_header);
+	kfree(e);
+}
+
 static void mlx5e_detach_encap(struct mlx5e_priv *priv,
 			       struct mlx5e_tc_flow *flow, int out_index)
 {
-	struct list_head *next = flow->encaps[out_index].list.next;
-
 	/* flow wasn't fully initialized */
-	if (list_empty(&flow->encaps[out_index].list))
+	if (!flow->encaps[out_index].e)
 		return;
 
 	list_del(&flow->encaps[out_index].list);
-	if (list_empty(next)) {
-		struct mlx5e_encap_entry *e;
-
-		e = list_entry(next, struct mlx5e_encap_entry, flows);
-		mlx5e_rep_encap_entry_detach(netdev_priv(e->out_dev), e);
 
-		if (e->flags & MLX5_ENCAP_ENTRY_VALID)
-			mlx5_packet_reformat_dealloc(priv->mdev, e->encap_id);
-
-		hash_del_rcu(&e->encap_hlist);
-		kfree(e->encap_header);
-		kfree(e);
-	}
+	mlx5e_encap_put(priv, flow->encaps[out_index].e);
+	flow->encaps[out_index].e = NULL;
 }
 
 static void __mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow)
@@ -2817,6 +2827,31 @@ static bool is_merged_eswitch_dev(struct mlx5e_priv *priv,
 
 
 
+bool mlx5e_encap_take(struct mlx5e_encap_entry *e)
+{
+	return refcount_inc_not_zero(&e->refcnt);
+}
+
+static struct mlx5e_encap_entry *
+mlx5e_encap_get(struct mlx5e_priv *priv, struct encap_key *key,
+		uintptr_t hash_key)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	struct mlx5e_encap_entry *e;
+	struct encap_key e_key;
+
+	hash_for_each_possible_rcu(esw->offloads.encap_tbl, e,
+				   encap_hlist, hash_key) {
+		e_key.ip_tun_key = &e->tun_info->key;
+		e_key.tc_tunnel = e->tunnel;
+		if (!cmp_encap_info(&e_key, key) &&
+		    mlx5e_encap_take(e))
+			return e;
+	}
+
+	return NULL;
+}
+
 static int mlx5e_attach_encap(struct mlx5e_priv *priv,
 			      struct mlx5e_tc_flow *flow,
 			      struct net_device *mirred_dev,
@@ -2829,11 +2864,10 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
 	struct mlx5_esw_flow_attr *attr = flow->esw_attr;
 	struct mlx5e_tc_flow_parse_attr *parse_attr;
 	const struct ip_tunnel_info *tun_info;
-	struct encap_key key, e_key;
+	struct encap_key key;
 	struct mlx5e_encap_entry *e;
 	unsigned short family;
 	uintptr_t hash_key;
-	bool found = false;
 	int err = 0;
 
 	parse_attr = attr->parse_attr;
@@ -2848,24 +2882,17 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
 
 	hash_key = hash_encap_info(&key);
 
-	hash_for_each_possible_rcu(esw->offloads.encap_tbl, e,
-				   encap_hlist, hash_key) {
-		e_key.ip_tun_key = &e->tun_info->key;
-		e_key.tc_tunnel = e->tunnel;
-		if (!cmp_encap_info(&e_key, &key)) {
-			found = true;
-			break;
-		}
-	}
+	e = mlx5e_encap_get(priv, &key, hash_key);
 
 	/* must verify if encap is valid or not */
-	if (found)
+	if (e)
 		goto attach_flow;
 
 	e = kzalloc(sizeof(*e), GFP_KERNEL);
 	if (!e)
 		return -ENOMEM;
 
+	refcount_set(&e->refcnt, 1);
 	e->tun_info = tun_info;
 	err = mlx5e_tc_tun_init_encap_attr(mirred_dev, priv, e, extack);
 	if (err)
@@ -2884,6 +2911,7 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
 	hash_add_rcu(esw->offloads.encap_tbl, &e->encap_hlist, hash_key);
 
 attach_flow:
+	flow->encaps[out_index].e = e;
 	list_add(&flow->encaps[out_index].list, &e->flows);
 	flow->encaps[out_index].index = out_index;
 	*encap_dev = e->out_dev;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
index 20f045e96c92..ea2072e2fe84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
@@ -75,6 +75,8 @@ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
 			      struct mlx5e_encap_entry *e);
 void mlx5e_tc_encap_flows_del(struct mlx5e_priv *priv,
 			      struct mlx5e_encap_entry *e);
+bool mlx5e_encap_take(struct mlx5e_encap_entry *e);
+void mlx5e_encap_put(struct mlx5e_priv *priv, struct mlx5e_encap_entry *e);
 
 struct mlx5e_neigh_hash_entry;
 void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe);
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH v4 9/9] Input: add IOC3 serio driver
From: Dmitry Torokhov @ 2019-08-09 22:04 UTC (permalink / raw)
  To: Thomas Bogendoerfer
  Cc: Ralf Baechle, Paul Burton, James Hogan, Lee Jones,
	David S. Miller, Srinivas Kandagatla, Alessandro Zummo,
	Alexandre Belloni, Greg Kroah-Hartman, Jiri Slaby,
	Evgeniy Polyakov, linux-mips, linux-kernel, linux-input, netdev,
	linux-rtc, linux-serial
In-Reply-To: <20190809103235.16338-10-tbogendoerfer@suse.de>

On Fri, Aug 09, 2019 at 12:32:31PM +0200, Thomas Bogendoerfer wrote:
> This patch adds a platform driver for supporting keyboard and mouse
> interface of SGI IOC3 chips.
> 
> Signed-off-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
> ---
>  drivers/input/serio/Kconfig   |  10 +++
>  drivers/input/serio/Makefile  |   1 +
>  drivers/input/serio/ioc3kbd.c | 163 ++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 174 insertions(+)
>  create mode 100644 drivers/input/serio/ioc3kbd.c
> 
> diff --git a/drivers/input/serio/Kconfig b/drivers/input/serio/Kconfig
> index f3e18f8ef9ca..373a1646019e 100644
> --- a/drivers/input/serio/Kconfig
> +++ b/drivers/input/serio/Kconfig
> @@ -165,6 +165,16 @@ config SERIO_MACEPS2
>  	  To compile this driver as a module, choose M here: the
>  	  module will be called maceps2.
>  
> +config SERIO_SGI_IOC3
> +	tristate "SGI IOC3 PS/2 controller"
> +	depends on SGI_MFD_IOC3
> +	help
> +	  Say Y here if you have an SGI Onyx2, SGI Octane or IOC3 PCI card
> +	  and you want to attach and use a keyboard, mouse, or both.
> +
> +	  To compile this driver as a module, choose M here: the
> +	  module will be called ioc3kbd.
> +
>  config SERIO_LIBPS2
>  	tristate "PS/2 driver library"
>  	depends on SERIO_I8042 || SERIO_I8042=n
> diff --git a/drivers/input/serio/Makefile b/drivers/input/serio/Makefile
> index 67950a5ccb3f..6d97bad7b844 100644
> --- a/drivers/input/serio/Makefile
> +++ b/drivers/input/serio/Makefile
> @@ -20,6 +20,7 @@ obj-$(CONFIG_HIL_MLC)		+= hp_sdc_mlc.o hil_mlc.o
>  obj-$(CONFIG_SERIO_PCIPS2)	+= pcips2.o
>  obj-$(CONFIG_SERIO_PS2MULT)	+= ps2mult.o
>  obj-$(CONFIG_SERIO_MACEPS2)	+= maceps2.o
> +obj-$(CONFIG_SERIO_SGI_IOC3)	+= ioc3kbd.o
>  obj-$(CONFIG_SERIO_LIBPS2)	+= libps2.o
>  obj-$(CONFIG_SERIO_RAW)		+= serio_raw.o
>  obj-$(CONFIG_SERIO_AMS_DELTA)	+= ams_delta_serio.o
> diff --git a/drivers/input/serio/ioc3kbd.c b/drivers/input/serio/ioc3kbd.c
> new file mode 100644
> index 000000000000..6840e3c23fed
> --- /dev/null
> +++ b/drivers/input/serio/ioc3kbd.c
> @@ -0,0 +1,163 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * SGI IOC3 PS/2 controller driver for linux
> + *
> + * Copyright (C) 2019 Thomas Bogendoerfer <tbogendoerfer@suse.de>
> + *
> + * Based on code Copyright (C) 2005 Stanislaw Skowronek <skylark@unaligned.org>
> + *               Copyright (C) 2009 Johannes Dickgreber <tanzy@gmx.de>
> + */
> +
> +#include <linux/delay.h>
> +#include <linux/init.h>
> +#include <linux/io.h>
> +#include <linux/serio.h>
> +#include <linux/module.h>
> +#include <linux/platform_device.h>
> +
> +#include <asm/sn/ioc3.h>
> +
> +struct ioc3kbd_data {
> +	struct ioc3_serioregs __iomem *regs;
> +	struct serio *kbd, *aux;
> +	int irq;
> +};
> +
> +static int ioc3kbd_write(struct serio *dev, u8 val)
> +{
> +	struct ioc3kbd_data *d = dev->port_data;
> +	unsigned long timeout = 0;
> +	u32 mask;
> +
> +	mask = (dev == d->aux) ? KM_CSR_M_WRT_PEND : KM_CSR_K_WRT_PEND;
> +	while ((readl(&d->regs->km_csr) & mask) && (timeout < 1000)) {
> +		udelay(100);
> +		timeout++;
> +	}
> +
> +	if (timeout >= 1000)
> +		return -ETIMEDOUT;
> +
> +	writel(val, dev == d->aux ? &d->regs->m_wd : &d->regs->k_wd);
> +
> +	return 0;
> +}
> +
> +static irqreturn_t ioc3kbd_intr(int itq, void *dev_id)
> +{
> +	struct ioc3kbd_data *d = dev_id;
> +	u32 data_k, data_m;
> +
> +	data_k = readl(&d->regs->k_rd);
> +	data_m = readl(&d->regs->m_rd);
> +
> +	if (data_k & KM_RD_VALID_0)
> +		serio_interrupt(d->kbd, (data_k >> KM_RD_DATA_0_SHIFT) & 0xff,
> +				0);
> +	if (data_k & KM_RD_VALID_1)
> +		serio_interrupt(d->kbd, (data_k >> KM_RD_DATA_1_SHIFT) & 0xff,
> +				0);
> +	if (data_k & KM_RD_VALID_2)
> +		serio_interrupt(d->kbd, (data_k >> KM_RD_DATA_2_SHIFT) & 0xff,
> +				0);
> +	if (data_m & KM_RD_VALID_0)
> +		serio_interrupt(d->aux, (data_m >> KM_RD_DATA_0_SHIFT) & 0xff,
> +				0);
> +	if (data_m & KM_RD_VALID_1)
> +		serio_interrupt(d->aux, (data_m >> KM_RD_DATA_1_SHIFT) & 0xff,
> +				0);
> +	if (data_m & KM_RD_VALID_2)
> +		serio_interrupt(d->aux, (data_m >> KM_RD_DATA_2_SHIFT) & 0xff,
> +				0);
> +
> +	return 0;

IRQ_NONE? Or IRQ_HANDLED?

> +}
> +
> +static int ioc3kbd_probe(struct platform_device *pdev)
> +{
> +	struct ioc3_serioregs __iomem *regs;
> +	struct device *dev = &pdev->dev;
> +	struct ioc3kbd_data *d;
> +	struct serio *sk, *sa;
> +	int irq, ret;
> +
> +	regs = devm_platform_ioremap_resource(pdev, 0);
> +	if (IS_ERR(regs))
> +		return PTR_ERR(regs);
> +
> +	irq = platform_get_irq(pdev, 0);
> +	if (irq < 0)
> +		return -ENXIO;
> +
> +	d = devm_kzalloc(&pdev->dev, sizeof(*d), GFP_KERNEL);
> +	if (!d)
> +		return -ENOMEM;
> +
> +	sk = kzalloc(sizeof(*sk), GFP_KERNEL);
> +	if (!sk)
> +		return -ENOMEM;
> +
> +	sa = kzalloc(sizeof(*sa), GFP_KERNEL);
> +	if (!sa) {
> +		kfree(sk);
> +		return -ENOMEM;
> +	}
> +
> +	sk->id.type = SERIO_8042;
> +	sk->write = ioc3kbd_write;
> +	snprintf(sk->name, sizeof(sk->name), "IOC3 keyboard %d", pdev->id);
> +	snprintf(sk->phys, sizeof(sk->phys), "ioc3/serio%dkbd", pdev->id);
> +	sk->port_data = d;
> +	sk->dev.parent = &pdev->dev;
> +
> +	sa->id.type = SERIO_8042;
> +	sa->write = ioc3kbd_write;
> +	snprintf(sa->name, sizeof(sa->name), "IOC3 auxiliary %d", pdev->id);
> +	snprintf(sa->phys, sizeof(sa->phys), "ioc3/serio%daux", pdev->id);
> +	sa->port_data = d;
> +	sa->dev.parent = dev;
> +
> +	d->regs = regs;
> +	d->kbd = sk;
> +	d->aux = sa;
> +	d->irq = irq;
> +
> +	platform_set_drvdata(pdev, d);
> +	serio_register_port(d->kbd);
> +	serio_register_port(d->aux);
> +
> +	ret = devm_request_irq(&pdev->dev, irq, ioc3kbd_intr, IRQF_SHARED,
> +			       "ioc3-kbd", d);

Just request_irq(); there is not really any benefit from devm since you
free it manually.

What else is sharing this interrupt?

> +	if (ret) {
> +		dev_err(&pdev->dev, "could not request IRQ %d\n", irq);
> +		serio_unregister_port(d->kbd);
> +		serio_unregister_port(d->aux);
> +		kfree(sk);
> +		kfree(sa);
> +		return ret;
> +	}
> +	return 0;
> +}
> +
> +static int ioc3kbd_remove(struct platform_device *pdev)
> +{
> +	struct ioc3kbd_data *d = platform_get_drvdata(pdev);
> +
> +	devm_free_irq(&pdev->dev, d->irq, d);
> +	serio_unregister_port(d->kbd);
> +	serio_unregister_port(d->aux);
> +	return 0;
> +}
> +
> +static struct platform_driver ioc3kbd_driver = {
> +	.probe          = ioc3kbd_probe,
> +	.remove         = ioc3kbd_remove,
> +	.driver = {
> +		.name = "ioc3-kbd",
> +	},
> +};
> +module_platform_driver(ioc3kbd_driver);
> +
> +MODULE_AUTHOR("Thomas Bogendoerfer <tbogendoerfer@suse.de>");
> +MODULE_DESCRIPTION("SGI IOC3 serio driver");
> +MODULE_LICENSE("GPL");
> -- 
> 2.13.7
> 

Thanks.

-- 
Dmitry

^ permalink raw reply

* [net-next 08/15] net/mlx5e: Allow concurrent creation of mod_hdr entries
From: Saeed Mahameed @ 2019-08-09 22:04 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Vlad Buslov, Roi Dayan, Saeed Mahameed
In-Reply-To: <20190809220359.11516-1-saeedm@mellanox.com>

From: Vlad Buslov <vladbu@mellanox.com>

Creation of mod_hdr entries is fully synchronized by mod_hdr_tbl->lock. In
order to allow concurrent allocation of hardware resources used to offload
header rewrite, extend mlx5e_mod_hdr_entry with a 'res_ready' completion.
Move call to mlx5_modify_header_alloc() out of mod_hdr_tbl->lock critical
section. Modify code that attaches new flows to existing mh to wait for
'res_ready' completion before using the entry. Insert mh to mod_hdr table
before provisioning it to hardware and modify all users of mod_hdr table to
verify that mh was fully initialized by checking completion result for
negative value (and to wait for 'res_ready' completion, if necessary).

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   | 41 +++++++++++++------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 0600b7878600..fcaf9ab9e373 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -199,6 +199,8 @@ struct mlx5e_mod_hdr_entry {
 	u32 mod_hdr_id;
 
 	refcount_t refcnt;
+	struct completion res_ready;
+	int compl_result;
 };
 
 #define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)
@@ -326,7 +328,8 @@ static void mlx5e_mod_hdr_put(struct mlx5e_priv *priv,
 	mutex_unlock(&tbl->lock);
 
 	WARN_ON(!list_empty(&mh->flows));
-	mlx5_modify_header_dealloc(priv->mdev, mh->mod_hdr_id);
+	if (mh->compl_result > 0)
+		mlx5_modify_header_dealloc(priv->mdev, mh->mod_hdr_id);
 
 	kfree(mh);
 }
@@ -359,13 +362,21 @@ static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 
 	mutex_lock(&tbl->lock);
 	mh = mlx5e_mod_hdr_get(tbl, &key, hash_key);
-	if (mh)
+	if (mh) {
+		mutex_unlock(&tbl->lock);
+		wait_for_completion(&mh->res_ready);
+
+		if (mh->compl_result < 0) {
+			err = -EREMOTEIO;
+			goto attach_header_err;
+		}
 		goto attach_flow;
+	}
 
 	mh = kzalloc(sizeof(*mh) + actions_size, GFP_KERNEL);
 	if (!mh) {
-		err = -ENOMEM;
-		goto out_err;
+		mutex_unlock(&tbl->lock);
+		return -ENOMEM;
 	}
 
 	mh->key.actions = (void *)mh + sizeof(*mh);
@@ -374,18 +385,23 @@ static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 	spin_lock_init(&mh->flows_lock);
 	INIT_LIST_HEAD(&mh->flows);
 	refcount_set(&mh->refcnt, 1);
+	init_completion(&mh->res_ready);
+
+	hash_add(tbl->hlist, &mh->mod_hdr_hlist, hash_key);
+	mutex_unlock(&tbl->lock);
 
 	err = mlx5_modify_header_alloc(priv->mdev, namespace,
 				       mh->key.num_actions,
 				       mh->key.actions,
 				       &mh->mod_hdr_id);
-	if (err)
-		goto out_err;
-
-	hash_add(tbl->hlist, &mh->mod_hdr_hlist, hash_key);
+	if (err) {
+		mh->compl_result = err;
+		goto alloc_header_err;
+	}
+	mh->compl_result = 1;
+	complete_all(&mh->res_ready);
 
 attach_flow:
-	mutex_unlock(&tbl->lock);
 	flow->mh = mh;
 	spin_lock(&mh->flows_lock);
 	list_add(&flow->mod_hdr, &mh->flows);
@@ -397,9 +413,10 @@ static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 
 	return 0;
 
-out_err:
-	mutex_unlock(&tbl->lock);
-	kfree(mh);
+alloc_header_err:
+	complete_all(&mh->res_ready);
+attach_header_err:
+	mlx5e_mod_hdr_put(priv, mh, namespace);
 	return err;
 }
 
-- 
2.21.0


^ permalink raw reply related

* [net-next 14/15] net/mlx5e: Use vhca_id in generating representor port_index
From: Saeed Mahameed @ 2019-08-09 22:04 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev@vger.kernel.org, Parav Pandit, Saeed Mahameed
In-Reply-To: <20190809220359.11516-1-saeedm@mellanox.com>

From: Parav Pandit <parav@mellanox.com>

It is desirable to use unique port indices when the devlink instances of
multiple PCI devices have the same switch-id.

Make use of vhca-id to generate such unique devlink port indices.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_rep.c  | 20 +++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index 33ae66dc72e2..7ce5cb6e527e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -1746,34 +1746,46 @@ is_devlink_port_supported(const struct mlx5_core_dev *dev,
 	       mlx5_eswitch_is_vf_vport(dev->priv.eswitch, rpriv->rep->vport);
 }
 
+static unsigned int
+vport_to_devlink_port_index(const struct mlx5_core_dev *dev, u16 vport_num)
+{
+	return (MLX5_CAP_GEN(dev, vhca_id) << 16) | vport_num;
+}
+
 static int register_devlink_port(struct mlx5_core_dev *dev,
 				 struct mlx5e_rep_priv *rpriv)
 {
 	struct devlink *devlink = priv_to_devlink(dev);
 	struct mlx5_eswitch_rep *rep = rpriv->rep;
 	struct netdev_phys_item_id ppid = {};
+	unsigned int dl_port_index = 0;
 
 	if (!is_devlink_port_supported(dev, rpriv))
 		return 0;
 
 	mlx5e_rep_get_port_parent_id(rpriv->netdev, &ppid);
 
-	if (rep->vport == MLX5_VPORT_UPLINK)
+	if (rep->vport == MLX5_VPORT_UPLINK) {
 		devlink_port_attrs_set(&rpriv->dl_port,
 				       DEVLINK_PORT_FLAVOUR_PHYSICAL,
 				       PCI_FUNC(dev->pdev->devfn), false, 0,
 				       &ppid.id[0], ppid.id_len);
-	else if (rep->vport == MLX5_VPORT_PF)
+		dl_port_index = vport_to_devlink_port_index(dev, rep->vport);
+	} else if (rep->vport == MLX5_VPORT_PF) {
 		devlink_port_attrs_pci_pf_set(&rpriv->dl_port,
 					      &ppid.id[0], ppid.id_len,
 					      dev->pdev->devfn);
-	else if (mlx5_eswitch_is_vf_vport(dev->priv.eswitch, rpriv->rep->vport))
+		dl_port_index = rep->vport;
+	} else if (mlx5_eswitch_is_vf_vport(dev->priv.eswitch,
+					    rpriv->rep->vport)) {
 		devlink_port_attrs_pci_vf_set(&rpriv->dl_port,
 					      &ppid.id[0], ppid.id_len,
 					      dev->pdev->devfn,
 					      rep->vport - 1);
+		dl_port_index = vport_to_devlink_port_index(dev, rep->vport);
+	}
 
-	return devlink_port_register(devlink, &rpriv->dl_port, rep->vport);
+	return devlink_port_register(devlink, &rpriv->dl_port, dl_port_index);
 }
 
 static void unregister_devlink_port(struct mlx5_core_dev *dev,
-- 
2.21.0


^ permalink raw reply related

* [net-next 13/15] net/mlx5e: Simplify querying port representor parent id
From: Saeed Mahameed @ 2019-08-09 22:04 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Parav Pandit, Vu Pham, Saeed Mahameed
In-Reply-To: <20190809220359.11516-1-saeedm@mellanox.com>

From: Parav Pandit <parav@mellanox.com>

System image GUID doesn't depend on eswitch switchdev mode.

Hence, remove the check, which simplifies the code.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Vu Pham <vuhuong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
index cd957ff4e207..33ae66dc72e2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
@@ -389,24 +389,17 @@ static const struct ethtool_ops mlx5e_uplink_rep_ethtool_ops = {
 	.set_pauseparam    = mlx5e_uplink_rep_set_pauseparam,
 };
 
-static int mlx5e_rep_get_port_parent_id(struct net_device *dev,
-					struct netdev_phys_item_id *ppid)
+static void mlx5e_rep_get_port_parent_id(struct net_device *dev,
+					 struct netdev_phys_item_id *ppid)
 {
-	struct mlx5_eswitch *esw;
 	struct mlx5e_priv *priv;
 	u64 parent_id;
 
 	priv = netdev_priv(dev);
-	esw = priv->mdev->priv.eswitch;
-
-	if (esw->mode == MLX5_ESWITCH_NONE)
-		return -EOPNOTSUPP;
 
 	parent_id = mlx5_query_nic_system_image_guid(priv->mdev);
 	ppid->id_len = sizeof(parent_id);
 	memcpy(ppid->id, &parent_id, sizeof(parent_id));
-
-	return 0;
 }
 
 static void mlx5e_sqs2vport_stop(struct mlx5_eswitch *esw,
@@ -1759,14 +1752,11 @@ static int register_devlink_port(struct mlx5_core_dev *dev,
 	struct devlink *devlink = priv_to_devlink(dev);
 	struct mlx5_eswitch_rep *rep = rpriv->rep;
 	struct netdev_phys_item_id ppid = {};
-	int ret;
 
 	if (!is_devlink_port_supported(dev, rpriv))
 		return 0;
 
-	ret = mlx5e_rep_get_port_parent_id(rpriv->netdev, &ppid);
-	if (ret)
-		return ret;
+	mlx5e_rep_get_port_parent_id(rpriv->netdev, &ppid);
 
 	if (rep->vport == MLX5_VPORT_UPLINK)
 		devlink_port_attrs_set(&rpriv->dl_port,
-- 
2.21.0


^ permalink raw reply related

* [net-next 12/15] net/mlx5: E-switch, Removed unused hwid
From: Saeed Mahameed @ 2019-08-09 22:04 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Parav Pandit, Vu Pham, Saeed Mahameed
In-Reply-To: <20190809220359.11516-1-saeedm@mellanox.com>

From: Parav Pandit <parav@mellanox.com>

Currently mlx5_eswitch_rep stores the same hw ID for all representors.
However, it is never read from this structure; it is always taken from
mlx5_vport.

Hence, remove unused field.

Signed-off-by: Parav Pandit <parav@mellanox.com>
Reviewed-by: Vu Pham <vuhuong@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 6 +-----
 include/linux/mlx5/eswitch.h                               | 1 -
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 8fe5dddf18d0..42cc5001255b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -1393,10 +1393,9 @@ void esw_offloads_cleanup_reps(struct mlx5_eswitch *esw)
 int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 {
 	int total_vports = esw->total_vports;
-	struct mlx5_core_dev *dev = esw->dev;
 	struct mlx5_eswitch_rep *rep;
-	u8 hw_id[ETH_ALEN], rep_type;
 	int vport_index;
+	u8 rep_type;
 
 	esw->offloads.vport_reps = kcalloc(total_vports,
 					   sizeof(struct mlx5_eswitch_rep),
@@ -1404,12 +1403,9 @@ int esw_offloads_init_reps(struct mlx5_eswitch *esw)
 	if (!esw->offloads.vport_reps)
 		return -ENOMEM;
 
-	mlx5_query_mac_address(dev, hw_id);
-
 	mlx5_esw_for_all_reps(esw, vport_index, rep) {
 		rep->vport = mlx5_eswitch_index_to_vport_num(esw, vport_index);
 		rep->vport_index = vport_index;
-		ether_addr_copy(rep->hw_id, hw_id);
 
 		for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++)
 			atomic_set(&rep->rep_data[rep_type].state,
diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
index 46b5ba029802..38a70d16d8d5 100644
--- a/include/linux/mlx5/eswitch.h
+++ b/include/linux/mlx5/eswitch.h
@@ -44,7 +44,6 @@ struct mlx5_eswitch_rep_data {
 struct mlx5_eswitch_rep {
 	struct mlx5_eswitch_rep_data rep_data[NUM_REP_TYPES];
 	u16		       vport;
-	u8		       hw_id[ETH_ALEN];
 	u16		       vlan;
 	/* Only IB rep is using vport_index */
 	u16		       vport_index;
-- 
2.21.0


^ permalink raw reply related

* [net-next 07/15] net/mlx5e: Protect mod_hdr hash table with mutex
From: Saeed Mahameed @ 2019-08-09 22:04 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Vlad Buslov, Roi Dayan, Saeed Mahameed
In-Reply-To: <20190809220359.11516-1-saeedm@mellanox.com>

From: Vlad Buslov <vladbu@mellanox.com>

To remove dependency on rtnl lock, protect mod_hdr hash table from
concurrent modifications with new mutex.

Implement helper function to get flow namespace to prevent code
duplication.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   | 35 +++++++++++++------
 .../net/ethernet/mellanox/mlx5/core/eswitch.c |  2 ++
 include/linux/mlx5/fs.h                       |  1 +
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 09d5cc700297..0600b7878600 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -315,22 +315,31 @@ mlx5e_mod_hdr_get(struct mod_hdr_tbl *tbl, struct mod_hdr_key *key, u32 hash_key
 }
 
 static void mlx5e_mod_hdr_put(struct mlx5e_priv *priv,
-			      struct mlx5e_mod_hdr_entry *mh)
+			      struct mlx5e_mod_hdr_entry *mh,
+			      int namespace)
 {
-	if (!refcount_dec_and_test(&mh->refcnt))
+	struct mod_hdr_tbl *tbl = get_mod_hdr_table(priv, namespace);
+
+	if (!refcount_dec_and_mutex_lock(&mh->refcnt, &tbl->lock))
 		return;
+	hash_del(&mh->mod_hdr_hlist);
+	mutex_unlock(&tbl->lock);
 
 	WARN_ON(!list_empty(&mh->flows));
 	mlx5_modify_header_dealloc(priv->mdev, mh->mod_hdr_id);
-	hash_del(&mh->mod_hdr_hlist);
+
 	kfree(mh);
 }
 
+static int get_flow_name_space(struct mlx5e_tc_flow *flow)
+{
+	return mlx5e_is_eswitch_flow(flow) ?
+		MLX5_FLOW_NAMESPACE_FDB : MLX5_FLOW_NAMESPACE_KERNEL;
+}
 static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 				struct mlx5e_tc_flow *flow,
 				struct mlx5e_tc_flow_parse_attr *parse_attr)
 {
-	bool is_eswitch_flow = mlx5e_is_eswitch_flow(flow);
 	int num_actions, actions_size, namespace, err;
 	struct mlx5e_mod_hdr_entry *mh;
 	struct mod_hdr_tbl *tbl;
@@ -345,17 +354,19 @@ static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 
 	hash_key = hash_mod_hdr_info(&key);
 
-	namespace = is_eswitch_flow ?
-		MLX5_FLOW_NAMESPACE_FDB : MLX5_FLOW_NAMESPACE_KERNEL;
+	namespace = get_flow_name_space(flow);
 	tbl = get_mod_hdr_table(priv, namespace);
 
+	mutex_lock(&tbl->lock);
 	mh = mlx5e_mod_hdr_get(tbl, &key, hash_key);
 	if (mh)
 		goto attach_flow;
 
 	mh = kzalloc(sizeof(*mh) + actions_size, GFP_KERNEL);
-	if (!mh)
-		return -ENOMEM;
+	if (!mh) {
+		err = -ENOMEM;
+		goto out_err;
+	}
 
 	mh->key.actions = (void *)mh + sizeof(*mh);
 	memcpy(mh->key.actions, key.actions, actions_size);
@@ -374,11 +385,12 @@ static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 	hash_add(tbl->hlist, &mh->mod_hdr_hlist, hash_key);
 
 attach_flow:
+	mutex_unlock(&tbl->lock);
 	flow->mh = mh;
 	spin_lock(&mh->flows_lock);
 	list_add(&flow->mod_hdr, &mh->flows);
 	spin_unlock(&mh->flows_lock);
-	if (is_eswitch_flow)
+	if (mlx5e_is_eswitch_flow(flow))
 		flow->esw_attr->mod_hdr_id = mh->mod_hdr_id;
 	else
 		flow->nic_attr->mod_hdr_id = mh->mod_hdr_id;
@@ -386,6 +398,7 @@ static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 	return 0;
 
 out_err:
+	mutex_unlock(&tbl->lock);
 	kfree(mh);
 	return err;
 }
@@ -401,7 +414,7 @@ static void mlx5e_detach_mod_hdr(struct mlx5e_priv *priv,
 	list_del(&flow->mod_hdr);
 	spin_unlock(&flow->mh->flows_lock);
 
-	mlx5e_mod_hdr_put(priv, flow->mh);
+	mlx5e_mod_hdr_put(priv, flow->mh, get_flow_name_space(flow));
 	flow->mh = NULL;
 }
 
@@ -3865,6 +3878,7 @@ int mlx5e_tc_nic_init(struct mlx5e_priv *priv)
 	int err;
 
 	mutex_init(&tc->t_lock);
+	mutex_init(&tc->mod_hdr.lock);
 	hash_init(tc->mod_hdr.hlist);
 	mutex_init(&tc->hairpin_tbl_lock);
 	hash_init(tc->hairpin_tbl);
@@ -3898,6 +3912,7 @@ void mlx5e_tc_nic_cleanup(struct mlx5e_priv *priv)
 	if (tc->netdevice_nb.notifier_call)
 		unregister_netdevice_notifier(&tc->netdevice_nb);
 
+	mutex_destroy(&tc->mod_hdr.lock);
 	mutex_destroy(&tc->hairpin_tbl_lock);
 
 	rhashtable_destroy(&tc->ht);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 5ce3c81e3083..2d734ecae719 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -2000,6 +2000,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 		goto abort;
 
 	hash_init(esw->offloads.encap_tbl);
+	mutex_init(&esw->offloads.mod_hdr.lock);
 	hash_init(esw->offloads.mod_hdr.hlist);
 	atomic64_set(&esw->offloads.num_flows, 0);
 	mutex_init(&esw->state_lock);
@@ -2037,6 +2038,7 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw)
 	esw->dev->priv.eswitch = NULL;
 	destroy_workqueue(esw->work_queue);
 	esw_offloads_cleanup_reps(esw);
+	mutex_destroy(&esw->offloads.mod_hdr.lock);
 	kfree(esw->vports);
 	kfree(esw);
 }
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index 96650a33aa91..1cb1045ce313 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -127,6 +127,7 @@ struct mlx5_flow_destination {
 };
 
 struct mod_hdr_tbl {
+	struct mutex lock; /* protects hlist */
 	DECLARE_HASHTABLE(hlist, 8);
 };
 
-- 
2.21.0


^ permalink raw reply related

* [net-next 06/15] net/mlx5e: Protect mod header entry flows list with spinlock
From: Saeed Mahameed @ 2019-08-09 22:04 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Vlad Buslov, Jianbo Liu, Roi Dayan,
	Saeed Mahameed
In-Reply-To: <20190809220359.11516-1-saeedm@mellanox.com>

From: Vlad Buslov <vladbu@mellanox.com>

To remove dependency on rtnl lock, extend mod header entry with spinlock
and use it to protect list of flows attached to mod header entry from
concurrent modifications.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Reviewed-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index fe1b04aa910a..09d5cc700297 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -189,6 +189,8 @@ struct mlx5e_mod_hdr_entry {
 	/* a node of a hash table which keeps all the mod_hdr entries */
 	struct hlist_node mod_hdr_hlist;
 
+	/* protects flows list */
+	spinlock_t flows_lock;
 	/* flows sharing the same mod_hdr entry */
 	struct list_head flows;
 
@@ -358,6 +360,7 @@ static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 	mh->key.actions = (void *)mh + sizeof(*mh);
 	memcpy(mh->key.actions, key.actions, actions_size);
 	mh->key.num_actions = num_actions;
+	spin_lock_init(&mh->flows_lock);
 	INIT_LIST_HEAD(&mh->flows);
 	refcount_set(&mh->refcnt, 1);
 
@@ -372,7 +375,9 @@ static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 
 attach_flow:
 	flow->mh = mh;
+	spin_lock(&mh->flows_lock);
 	list_add(&flow->mod_hdr, &mh->flows);
+	spin_unlock(&mh->flows_lock);
 	if (is_eswitch_flow)
 		flow->esw_attr->mod_hdr_id = mh->mod_hdr_id;
 	else
@@ -392,7 +397,9 @@ static void mlx5e_detach_mod_hdr(struct mlx5e_priv *priv,
 	if (!flow->mh)
 		return;
 
+	spin_lock(&flow->mh->flows_lock);
 	list_del(&flow->mod_hdr);
+	spin_unlock(&flow->mh->flows_lock);
 
 	mlx5e_mod_hdr_put(priv, flow->mh);
 	flow->mh = NULL;
-- 
2.21.0


^ permalink raw reply related

* [net-next 05/15] net/mlx5e: Extend mod header entry with reference counter
From: Saeed Mahameed @ 2019-08-09 22:04 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Vlad Buslov, Jianbo Liu, Roi Dayan,
	Saeed Mahameed
In-Reply-To: <20190809220359.11516-1-saeedm@mellanox.com>

From: Vlad Buslov <vladbu@mellanox.com>

List of flows attached to mod header entry is used as implicit reference
counter (mod header entry is deallocated when list becomes free) and as a
mechanism to obtain mod header entry that flow is attached to (through list
head). This is not safe when concurrent modification of list of flows
attached to mod header entry is possible. Proper atomic reference counter
is required to support concurrent access.

As a preparation for extending mod header with reference counting, extract
code that lookups and deletes mod header entry into standalone put/get
helpers. In order to remove this dependency on external locking, extend mod
header entry with reference counter to manage its lifetime and extend flow
structure with direct pointer to mod header entry that flow is attached to.

To remove code duplication between legacy and switchdev mode
implementations that both support mod_hdr functionality, store mod_hdr
table in dedicated structure used by both fdb and kernel namespaces. New
table structure is extended with table lock by one of the following patches
in this series. Implement helper function to get correct mod_hdr table
depending on flow namespace.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Reviewed-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en/fs.h   |  2 +-
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   | 94 +++++++++++--------
 .../net/ethernet/mellanox/mlx5/core/eswitch.c |  2 +-
 .../net/ethernet/mellanox/mlx5/core/eswitch.h |  2 +-
 include/linux/mlx5/fs.h                       |  4 +
 5 files changed, 61 insertions(+), 43 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
index 100506a3dd58..ca2161b42c7f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs.h
@@ -16,7 +16,7 @@ struct mlx5e_tc_table {
 
 	struct rhashtable               ht;
 
-	DECLARE_HASHTABLE(mod_hdr_tbl, 8);
+	struct mod_hdr_tbl mod_hdr;
 	struct mutex hairpin_tbl_lock; /* protects hairpin_tbl */
 	DECLARE_HASHTABLE(hairpin_tbl, 8);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index b6a91e3054c0..fe1b04aa910a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -119,6 +119,7 @@ struct mlx5e_tc_flow {
 	 */
 	struct encap_flow_item encaps[MLX5_MAX_FLOW_FWD_VPORTS];
 	struct mlx5e_tc_flow    *peer_flow;
+	struct mlx5e_mod_hdr_entry *mh; /* attached mod header instance */
 	struct list_head	mod_hdr; /* flows sharing the same mod hdr ID */
 	struct mlx5e_hairpin_entry *hpe; /* attached hairpin instance */
 	struct list_head	hairpin; /* flows sharing the same hairpin */
@@ -194,6 +195,8 @@ struct mlx5e_mod_hdr_entry {
 	struct mod_hdr_key key;
 
 	u32 mod_hdr_id;
+
+	refcount_t refcnt;
 };
 
 #define MLX5_MH_ACT_SZ MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)
@@ -284,14 +287,51 @@ static inline int cmp_mod_hdr_info(struct mod_hdr_key *a,
 	return memcmp(a->actions, b->actions, a->num_actions * MLX5_MH_ACT_SZ);
 }
 
+static struct mod_hdr_tbl *
+get_mod_hdr_table(struct mlx5e_priv *priv, int namespace)
+{
+	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+
+	return namespace == MLX5_FLOW_NAMESPACE_FDB ? &esw->offloads.mod_hdr :
+		&priv->fs.tc.mod_hdr;
+}
+
+static struct mlx5e_mod_hdr_entry *
+mlx5e_mod_hdr_get(struct mod_hdr_tbl *tbl, struct mod_hdr_key *key, u32 hash_key)
+{
+	struct mlx5e_mod_hdr_entry *mh, *found = NULL;
+
+	hash_for_each_possible(tbl->hlist, mh, mod_hdr_hlist, hash_key) {
+		if (!cmp_mod_hdr_info(&mh->key, key)) {
+			refcount_inc(&mh->refcnt);
+			found = mh;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static void mlx5e_mod_hdr_put(struct mlx5e_priv *priv,
+			      struct mlx5e_mod_hdr_entry *mh)
+{
+	if (!refcount_dec_and_test(&mh->refcnt))
+		return;
+
+	WARN_ON(!list_empty(&mh->flows));
+	mlx5_modify_header_dealloc(priv->mdev, mh->mod_hdr_id);
+	hash_del(&mh->mod_hdr_hlist);
+	kfree(mh);
+}
+
 static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 				struct mlx5e_tc_flow *flow,
 				struct mlx5e_tc_flow_parse_attr *parse_attr)
 {
-	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+	bool is_eswitch_flow = mlx5e_is_eswitch_flow(flow);
 	int num_actions, actions_size, namespace, err;
-	bool found = false, is_eswitch_flow;
 	struct mlx5e_mod_hdr_entry *mh;
+	struct mod_hdr_tbl *tbl;
 	struct mod_hdr_key key;
 	u32 hash_key;
 
@@ -303,28 +343,12 @@ static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 
 	hash_key = hash_mod_hdr_info(&key);
 
-	is_eswitch_flow = mlx5e_is_eswitch_flow(flow);
-	if (is_eswitch_flow) {
-		namespace = MLX5_FLOW_NAMESPACE_FDB;
-		hash_for_each_possible(esw->offloads.mod_hdr_tbl, mh,
-				       mod_hdr_hlist, hash_key) {
-			if (!cmp_mod_hdr_info(&mh->key, &key)) {
-				found = true;
-				break;
-			}
-		}
-	} else {
-		namespace = MLX5_FLOW_NAMESPACE_KERNEL;
-		hash_for_each_possible(priv->fs.tc.mod_hdr_tbl, mh,
-				       mod_hdr_hlist, hash_key) {
-			if (!cmp_mod_hdr_info(&mh->key, &key)) {
-				found = true;
-				break;
-			}
-		}
-	}
+	namespace = is_eswitch_flow ?
+		MLX5_FLOW_NAMESPACE_FDB : MLX5_FLOW_NAMESPACE_KERNEL;
+	tbl = get_mod_hdr_table(priv, namespace);
 
-	if (found)
+	mh = mlx5e_mod_hdr_get(tbl, &key, hash_key);
+	if (mh)
 		goto attach_flow;
 
 	mh = kzalloc(sizeof(*mh) + actions_size, GFP_KERNEL);
@@ -335,6 +359,7 @@ static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 	memcpy(mh->key.actions, key.actions, actions_size);
 	mh->key.num_actions = num_actions;
 	INIT_LIST_HEAD(&mh->flows);
+	refcount_set(&mh->refcnt, 1);
 
 	err = mlx5_modify_header_alloc(priv->mdev, namespace,
 				       mh->key.num_actions,
@@ -343,12 +368,10 @@ static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 	if (err)
 		goto out_err;
 
-	if (is_eswitch_flow)
-		hash_add(esw->offloads.mod_hdr_tbl, &mh->mod_hdr_hlist, hash_key);
-	else
-		hash_add(priv->fs.tc.mod_hdr_tbl, &mh->mod_hdr_hlist, hash_key);
+	hash_add(tbl->hlist, &mh->mod_hdr_hlist, hash_key);
 
 attach_flow:
+	flow->mh = mh;
 	list_add(&flow->mod_hdr, &mh->flows);
 	if (is_eswitch_flow)
 		flow->esw_attr->mod_hdr_id = mh->mod_hdr_id;
@@ -365,23 +388,14 @@ static int mlx5e_attach_mod_hdr(struct mlx5e_priv *priv,
 static void mlx5e_detach_mod_hdr(struct mlx5e_priv *priv,
 				 struct mlx5e_tc_flow *flow)
 {
-	struct list_head *next = flow->mod_hdr.next;
-
 	/* flow wasn't fully initialized */
-	if (list_empty(&flow->mod_hdr))
+	if (!flow->mh)
 		return;
 
 	list_del(&flow->mod_hdr);
 
-	if (list_empty(next)) {
-		struct mlx5e_mod_hdr_entry *mh;
-
-		mh = list_entry(next, struct mlx5e_mod_hdr_entry, flows);
-
-		mlx5_modify_header_dealloc(priv->mdev, mh->mod_hdr_id);
-		hash_del(&mh->mod_hdr_hlist);
-		kfree(mh);
-	}
+	mlx5e_mod_hdr_put(priv, flow->mh);
+	flow->mh = NULL;
 }
 
 static
@@ -3844,7 +3858,7 @@ int mlx5e_tc_nic_init(struct mlx5e_priv *priv)
 	int err;
 
 	mutex_init(&tc->t_lock);
-	hash_init(tc->mod_hdr_tbl);
+	hash_init(tc->mod_hdr.hlist);
 	mutex_init(&tc->hairpin_tbl_lock);
 	hash_init(tc->hairpin_tbl);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 5fbebee7254d..5ce3c81e3083 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -2000,7 +2000,7 @@ int mlx5_eswitch_init(struct mlx5_core_dev *dev)
 		goto abort;
 
 	hash_init(esw->offloads.encap_tbl);
-	hash_init(esw->offloads.mod_hdr_tbl);
+	hash_init(esw->offloads.mod_hdr.hlist);
 	atomic64_set(&esw->offloads.num_flows, 0);
 	mutex_init(&esw->state_lock);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 804912e38dee..fd63ba4ed0da 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -182,7 +182,7 @@ struct mlx5_esw_offload {
 	struct list_head peer_flows;
 	struct mutex peer_mutex;
 	DECLARE_HASHTABLE(encap_tbl, 8);
-	DECLARE_HASHTABLE(mod_hdr_tbl, 8);
+	struct mod_hdr_tbl mod_hdr;
 	DECLARE_HASHTABLE(termtbl_tbl, 8);
 	struct mutex termtbl_mutex; /* protects termtbl hash */
 	const struct mlx5_eswitch_rep_ops *rep_ops[NUM_REP_TYPES];
diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
index f049af3f3cd8..96650a33aa91 100644
--- a/include/linux/mlx5/fs.h
+++ b/include/linux/mlx5/fs.h
@@ -126,6 +126,10 @@ struct mlx5_flow_destination {
 	};
 };
 
+struct mod_hdr_tbl {
+	DECLARE_HASHTABLE(hlist, 8);
+};
+
 struct mlx5_flow_namespace *
 mlx5_get_fdb_sub_ns(struct mlx5_core_dev *dev, int n);
 struct mlx5_flow_namespace *
-- 
2.21.0


^ permalink raw reply related

* [net-next 04/15] net/mlx5e: Allow concurrent creation of hairpin entries
From: Saeed Mahameed @ 2019-08-09 22:04 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Vlad Buslov, Roi Dayan, Saeed Mahameed
In-Reply-To: <20190809220359.11516-1-saeedm@mellanox.com>

From: Vlad Buslov <vladbu@mellanox.com>

Hairpin entries creation is fully synchronized by hairpin_tbl_lock. In
order to allow concurrent initialization of mlx5e_hairpin structure
instances and provisioning of hairpin entries to hardware, extend
mlx5e_hairpin_entry with 'res_ready' completion. Move call to
mlx5e_hairpin_create() out of hairpin_tbl_lock critical section. Modify
code that attaches new flows to existing hpe to wait for 'res_ready'
completion before using the hpe. Insert hpe to hairpin table before
provisioning it to hardware and modify all users of hairpin table to verify
that hpe was fully initialized by checking hpe->hp pointer (and to wait for
'res_ready' completion, if necessary).

Modify dead peer update event handling function to save hpe's to temporary
list with their reference counter incremented. Wait for completion of hpe's
in temporary list and update their 'peer_gone' flag outside of
hairpin_tbl_lock critical section.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   | 65 +++++++++++++------
 1 file changed, 46 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index a7acb7fcbf5a..b6a91e3054c0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -39,6 +39,7 @@
 #include <linux/mlx5/device.h>
 #include <linux/rhashtable.h>
 #include <linux/refcount.h>
+#include <linux/completion.h>
 #include <net/tc_act/tc_mirred.h>
 #include <net/tc_act/tc_vlan.h>
 #include <net/tc_act/tc_tunnel_key.h>
@@ -166,11 +167,16 @@ struct mlx5e_hairpin_entry {
 	spinlock_t flows_lock;
 	/* flows sharing the same hairpin */
 	struct list_head flows;
+	/* hpe's that were not fully initialized when dead peer update event
+	 * function traversed them.
+	 */
+	struct list_head dead_peer_wait_list;
 
 	u16 peer_vhca_id;
 	u8 prio;
 	struct mlx5e_hairpin *hp;
 	refcount_t refcnt;
+	struct completion res_ready;
 };
 
 struct mod_hdr_key {
@@ -657,11 +663,14 @@ static void mlx5e_hairpin_put(struct mlx5e_priv *priv,
 	hash_del(&hpe->hairpin_hlist);
 	mutex_unlock(&priv->fs.tc.hairpin_tbl_lock);
 
-	netdev_dbg(priv->netdev, "del hairpin: peer %s\n",
-		   dev_name(hpe->hp->pair->peer_mdev->device));
+	if (!IS_ERR_OR_NULL(hpe->hp)) {
+		netdev_dbg(priv->netdev, "del hairpin: peer %s\n",
+			   dev_name(hpe->hp->pair->peer_mdev->device));
+
+		mlx5e_hairpin_destroy(hpe->hp);
+	}
 
 	WARN_ON(!list_empty(&hpe->flows));
-	mlx5e_hairpin_destroy(hpe->hp);
 	kfree(hpe);
 }
 
@@ -733,20 +742,34 @@ static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
 
 	mutex_lock(&priv->fs.tc.hairpin_tbl_lock);
 	hpe = mlx5e_hairpin_get(priv, peer_id, match_prio);
-	if (hpe)
+	if (hpe) {
+		mutex_unlock(&priv->fs.tc.hairpin_tbl_lock);
+		wait_for_completion(&hpe->res_ready);
+
+		if (IS_ERR(hpe->hp)) {
+			err = -EREMOTEIO;
+			goto out_err;
+		}
 		goto attach_flow;
+	}
 
 	hpe = kzalloc(sizeof(*hpe), GFP_KERNEL);
 	if (!hpe) {
-		err = -ENOMEM;
-		goto create_hairpin_err;
+		mutex_unlock(&priv->fs.tc.hairpin_tbl_lock);
+		return -ENOMEM;
 	}
 
 	spin_lock_init(&hpe->flows_lock);
 	INIT_LIST_HEAD(&hpe->flows);
+	INIT_LIST_HEAD(&hpe->dead_peer_wait_list);
 	hpe->peer_vhca_id = peer_id;
 	hpe->prio = match_prio;
 	refcount_set(&hpe->refcnt, 1);
+	init_completion(&hpe->res_ready);
+
+	hash_add(priv->fs.tc.hairpin_tbl, &hpe->hairpin_hlist,
+		 hash_hairpin_info(peer_id, match_prio));
+	mutex_unlock(&priv->fs.tc.hairpin_tbl_lock);
 
 	params.log_data_size = 15;
 	params.log_data_size = min_t(u8, params.log_data_size,
@@ -768,9 +791,11 @@ static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
 	params.num_channels = link_speed64;
 
 	hp = mlx5e_hairpin_create(priv, &params, peer_ifindex);
+	hpe->hp = hp;
+	complete_all(&hpe->res_ready);
 	if (IS_ERR(hp)) {
 		err = PTR_ERR(hp);
-		goto create_hairpin_err;
+		goto out_err;
 	}
 
 	netdev_dbg(priv->netdev, "add hairpin: tirn %x rqn %x peer %s sqn %x prio %d (log) data %d packets %d\n",
@@ -778,10 +803,6 @@ static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
 		   dev_name(hp->pair->peer_mdev->device),
 		   hp->pair->sqn[0], match_prio, params.log_data_size, params.log_num_packets);
 
-	hpe->hp = hp;
-	hash_add(priv->fs.tc.hairpin_tbl, &hpe->hairpin_hlist,
-		 hash_hairpin_info(peer_id, match_prio));
-
 attach_flow:
 	if (hpe->hp->num_channels > 1) {
 		flow_flag_set(flow, HAIRPIN_RSS);
@@ -789,7 +810,6 @@ static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
 	} else {
 		flow->nic_attr->hairpin_tirn = hpe->hp->tirn;
 	}
-	mutex_unlock(&priv->fs.tc.hairpin_tbl_lock);
 
 	flow->hpe = hpe;
 	spin_lock(&hpe->flows_lock);
@@ -798,9 +818,8 @@ static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
 
 	return 0;
 
-create_hairpin_err:
-	mutex_unlock(&priv->fs.tc.hairpin_tbl_lock);
-	kfree(hpe);
+out_err:
+	mlx5e_hairpin_put(priv, hpe);
 	return err;
 }
 
@@ -3767,7 +3786,8 @@ static void mlx5e_tc_hairpin_update_dead_peer(struct mlx5e_priv *priv,
 					      struct mlx5e_priv *peer_priv)
 {
 	struct mlx5_core_dev *peer_mdev = peer_priv->mdev;
-	struct mlx5e_hairpin_entry *hpe;
+	struct mlx5e_hairpin_entry *hpe, *tmp;
+	LIST_HEAD(init_wait_list);
 	u16 peer_vhca_id;
 	int bkt;
 
@@ -3777,11 +3797,18 @@ static void mlx5e_tc_hairpin_update_dead_peer(struct mlx5e_priv *priv,
 	peer_vhca_id = MLX5_CAP_GEN(peer_mdev, vhca_id);
 
 	mutex_lock(&priv->fs.tc.hairpin_tbl_lock);
-	hash_for_each(priv->fs.tc.hairpin_tbl, bkt, hpe, hairpin_hlist) {
-		if (hpe->peer_vhca_id == peer_vhca_id)
+	hash_for_each(priv->fs.tc.hairpin_tbl, bkt, hpe, hairpin_hlist)
+		if (refcount_inc_not_zero(&hpe->refcnt))
+			list_add(&hpe->dead_peer_wait_list, &init_wait_list);
+	mutex_unlock(&priv->fs.tc.hairpin_tbl_lock);
+
+	list_for_each_entry_safe(hpe, tmp, &init_wait_list, dead_peer_wait_list) {
+		wait_for_completion(&hpe->res_ready);
+		if (!IS_ERR_OR_NULL(hpe->hp) && hpe->peer_vhca_id == peer_vhca_id)
 			hpe->hp->pair->peer_gone = true;
+
+		mlx5e_hairpin_put(priv, hpe);
 	}
-	mutex_unlock(&priv->fs.tc.hairpin_tbl_lock);
 }
 
 static int mlx5e_tc_netdev_event(struct notifier_block *this,
-- 
2.21.0


^ permalink raw reply related

* [pull request][net-next 00/15] Mellanox, mlx5 tc flow handling for concurrent execution (Part 2)
From: Saeed Mahameed @ 2019-08-09 22:04 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev@vger.kernel.org, Saeed Mahameed

Hi Dave,

This series, mostly from Vlad, is the 2nd part of 3 part series to
improve mlx5 tc flow handling by removing dependency on rtnl_lock and
providing a more fine-grained locking and rcu safe data structures to
allow tc flow handling for concurrent execution.

In this part Vlad handles hairpin, header rewrite and encapsulation
offloads.

For more information please see tag log below.

Please pull and let me know if there is any problem.

Thanks,
Saeed.


---
The following changes since commit ca497fb6aa9fbd3b0a87fd0a71e9e1df2600ac30:

  taprio: remove unused variable 'entry_list_policy' (2019-08-09 13:41:24 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux.git tags/mlx5-updates-2019-08-09

for you to fetch changes up to b51c225e6c4e987e131b8b1332f66969382bf328:

  net/mlx5e: Use refcount_t for refcount (2019-08-09 14:54:11 -0700)

----------------------------------------------------------------
mlx5-updates-2019-08-09

This series includes updates to the mlx5 ethernet and core drivers:

In the first 11 patches, Vlad submits part 2 of a 3-part series to allow
TC flow handling for concurrent execution.

1) TC flow handling for concurrent execution (part 2)

Vlad says:
==========

Refactor data structures that are shared between flows in tc.
Currently, all cls API hardware offloads driver callbacks require caller
to hold rtnl lock when calling them. Cls API has already been updated to
update software filters in parallel (on classifiers that support
unlocked execution), however hardware offloads code still obtains rtnl
lock before calling driver tc callbacks. This set implements support for
unlocked execution of tc hairpin, mod_hdr and encap subsystems. The
changes implemented in these subsystems are very similar in general.

The main difference is that hairpin is accessed through mlx5e_tc_table
(legacy mode), mod_hdr is accessed through both mlx5e_tc_table and
mlx5_esw_offload (legacy and switchdev modes) and encap is only accessed
through mlx5_esw_offload (switchdev mode).

1.1) Hairpin handling and structure mlx5e_hairpin_entry refactored in
following way:

- Hairpin structure is extended with an atomic reference counter. This
  approach allows looking up a hairpin entry and obtaining a reference to
  it under hairpin_tbl_lock protection, and then continuing to use the
  entry unlocked (including provisioning to hardware).

- To support unlocked provisioning of hairpin entry to hardware, the entry
  is extended with 'res_ready' completion and is inserted to hairpin_tbl
  before calling the firmware. With this approach any concurrent users that
  attempt to use the same hairpin entry wait for completion first to
  prevent access to entries that are not fully initialized.

- Hairpin entry is extended with new flows_lock spinlock to protect the
  list when multiple concurrent tc instances update flows attached to
  the same hairpin entry.

1.2) Modify header handling code and structure mlx5e_mod_hdr_entry
are refactored in the following way:

- Mod_hdr structure is extended with an atomic reference counter. This
  approach allows looking up a mod_hdr entry and obtaining a reference to
  it under mod_hdr_tbl_lock protection, and then continuing to use the
  entry unlocked (including provisioning to hardware).

- To support unlocked provisioning of mod_hdr entry to hardware, the entry
  is extended with 'res_ready' completion and is inserted to mod_hdr_tbl
  before calling the firmware. With this approach any concurrent users that
  attempt to use the same mod_hdr entry wait for completion first to
  prevent access to entries that are not fully initialized.

- Mod_Hdr entry is extended with new flows_lock spinlock to protect the
  list when multiple concurrent tc instances update flows attached to
  the same mod_hdr entry.

1.3) Encapsulation handling code and Structure mlx5e_encap_entry
are refactored in the following way:

- Encap structure is extended with an atomic reference counter. This
  approach allows looking up an encap entry and obtaining a reference to
  it under encap_tbl_lock protection, and then continuing to use the
  entry unlocked (including provisioning to hardware).

- To support unlocked provisioning of encap entry to hardware, the entry is
  extended with 'res_ready' completion and is inserted to encap_tbl before
  calling the firmware. With this approach any concurrent users that
  attempt to use the same encap entry wait for completion first to prevent
  access to entries that are not fully initialized.

- Unlike the approach used to refactor hairpin and mod_hdr, the
  encap entry is not extended with any per-entry fine-grained lock.
  Instead, encap_tbl_lock is used to synchronize all operations on the
  encap table and instances of mlx5e_encap_entry. This is necessary
  because a single flow can be attached to multiple encap entries
  simultaneously. During new flow creation or a neigh update event, all
  of the encaps that the flow is attached to must be accessed together in
  an atomic manner, which makes usage of a per-entry lock infeasible.

- Encap entry is extended with new flows_lock spinlock to protect the
  list when multiple concurrent tc instances update flows attached to
  the same encap entry.

==========

3) Parav improves the way port representors report their parent ID and
port index.

4) Use refcount_t for refcount in vxlan database, from Chuhong Yuan

----------------------------------------------------------------
Chuhong Yuan (1):
      net/mlx5e: Use refcount_t for refcount

Parav Pandit (3):
      net/mlx5: E-switch, Removed unused hwid
      net/mlx5e: Simplify querying port representor parent id
      net/mlx5e: Use vhca_id in generating representor port_index

Vlad Buslov (11):
      net/mlx5e: Extend hairpin entry with reference counter
      net/mlx5e: Protect hairpin entry flows list with spinlock
      net/mlx5e: Protect hairpin hash table with mutex
      net/mlx5e: Allow concurrent creation of hairpin entries
      net/mlx5e: Extend mod header entry with reference counter
      net/mlx5e: Protect mod header entry flows list with spinlock
      net/mlx5e: Protect mod_hdr hash table with mutex
      net/mlx5e: Allow concurrent creation of mod_hdr entries
      net/mlx5e: Extend encap entry with reference counter
      net/mlx5e: Protect encap hash table with mutex
      net/mlx5e: Allow concurrent creation of encap entries

 drivers/net/ethernet/mellanox/mlx5/core/en/fs.h    |   3 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.c   |  41 ++-
 drivers/net/ethernet/mellanox/mlx5/core/en_rep.h   |   3 +
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    | 401 +++++++++++++++------
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.h    |   2 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.c  |   6 +-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h  |   3 +-
 .../ethernet/mellanox/mlx5/core/eswitch_offloads.c |   6 +-
 .../net/ethernet/mellanox/mlx5/core/lib/vxlan.c    |   9 +-
 include/linux/mlx5/eswitch.h                       |   1 -
 include/linux/mlx5/fs.h                            |   5 +
 11 files changed, 340 insertions(+), 140 deletions(-)

^ permalink raw reply

* [net-next 02/15] net/mlx5e: Protect hairpin entry flows list with spinlock
From: Saeed Mahameed @ 2019-08-09 22:04 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Vlad Buslov, Jianbo Liu, Roi Dayan,
	Saeed Mahameed
In-Reply-To: <20190809220359.11516-1-saeedm@mellanox.com>

From: Vlad Buslov <vladbu@mellanox.com>

To remove dependency on rtnl lock, extend hairpin entry with spinlock and
use it to protect list of flows attached to hairpin entry from concurrent
modifications.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Reviewed-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 64ce762ec1e6..0abfa9b3ec54 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -162,6 +162,8 @@ struct mlx5e_hairpin_entry {
 	/* a node of a hash table which keeps all the  hairpin entries */
 	struct hlist_node hairpin_hlist;
 
+	/* protects flows list */
+	spinlock_t flows_lock;
 	/* flows sharing the same hairpin */
 	struct list_head flows;
 
@@ -735,6 +737,7 @@ static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
 	if (!hpe)
 		return -ENOMEM;
 
+	spin_lock_init(&hpe->flows_lock);
 	INIT_LIST_HEAD(&hpe->flows);
 	hpe->peer_vhca_id = peer_id;
 	hpe->prio = match_prio;
@@ -782,7 +785,9 @@ static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
 		flow->nic_attr->hairpin_tirn = hpe->hp->tirn;
 	}
 	flow->hpe = hpe;
+	spin_lock(&hpe->flows_lock);
 	list_add(&flow->hairpin, &hpe->flows);
+	spin_unlock(&hpe->flows_lock);
 
 	return 0;
 
@@ -798,7 +803,10 @@ static void mlx5e_hairpin_flow_del(struct mlx5e_priv *priv,
 	if (!flow->hpe)
 		return;
 
+	spin_lock(&flow->hpe->flows_lock);
 	list_del(&flow->hairpin);
+	spin_unlock(&flow->hpe->flows_lock);
+
 	mlx5e_hairpin_put(priv, flow->hpe);
 	flow->hpe = NULL;
 }
-- 
2.21.0


^ permalink raw reply related

* [net-next 01/15] net/mlx5e: Extend hairpin entry with reference counter
From: Saeed Mahameed @ 2019-08-09 22:04 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev@vger.kernel.org, Vlad Buslov, Jianbo Liu, Roi Dayan,
	Saeed Mahameed
In-Reply-To: <20190809220359.11516-1-saeedm@mellanox.com>

From: Vlad Buslov <vladbu@mellanox.com>

List of flows attached to hairpin entry is used as implicit reference
counter (hairpin entry is deallocated when list becomes free) and as a
mechanism to obtain hairpin entry that flow is attached to (through list
head). This is not safe when concurrent modification of list of flows
attached to hairpin entry is possible. Proper atomic reference counter is
required to support concurrent access.

As a preparation for extending hairpin with reference counting, extract
code that deletes hairpin entry into standalone function. In order to
remove this dependency on external locking, extend hairpin entry with
reference counter to manage its lifetime and extend flow structure with
direct pointer to hairpin entry that flow is attached to.

Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
Reviewed-by: Jianbo Liu <jianbol@mellanox.com>
Reviewed-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
---
 .../net/ethernet/mellanox/mlx5/core/en_tc.c   | 44 +++++++++++--------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 4d97cc47835f..64ce762ec1e6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -119,6 +119,7 @@ struct mlx5e_tc_flow {
 	struct encap_flow_item encaps[MLX5_MAX_FLOW_FWD_VPORTS];
 	struct mlx5e_tc_flow    *peer_flow;
 	struct list_head	mod_hdr; /* flows sharing the same mod hdr ID */
+	struct mlx5e_hairpin_entry *hpe; /* attached hairpin instance */
 	struct list_head	hairpin; /* flows sharing the same hairpin */
 	struct list_head	peer;    /* flows with peer flow */
 	struct list_head	unready; /* flows not ready to be offloaded (e.g due to missing route) */
@@ -167,6 +168,7 @@ struct mlx5e_hairpin_entry {
 	u16 peer_vhca_id;
 	u8 prio;
 	struct mlx5e_hairpin *hp;
+	refcount_t refcnt;
 };
 
 struct mod_hdr_key {
@@ -635,13 +637,31 @@ static struct mlx5e_hairpin_entry *mlx5e_hairpin_get(struct mlx5e_priv *priv,
 
 	hash_for_each_possible(priv->fs.tc.hairpin_tbl, hpe,
 			       hairpin_hlist, hash_key) {
-		if (hpe->peer_vhca_id == peer_vhca_id && hpe->prio == prio)
+		if (hpe->peer_vhca_id == peer_vhca_id && hpe->prio == prio) {
+			refcount_inc(&hpe->refcnt);
 			return hpe;
+		}
 	}
 
 	return NULL;
 }
 
+static void mlx5e_hairpin_put(struct mlx5e_priv *priv,
+			      struct mlx5e_hairpin_entry *hpe)
+{
+	/* no more hairpin flows for us, release the hairpin pair */
+	if (!refcount_dec_and_test(&hpe->refcnt))
+		return;
+
+	netdev_dbg(priv->netdev, "del hairpin: peer %s\n",
+		   dev_name(hpe->hp->pair->peer_mdev->device));
+
+	WARN_ON(!list_empty(&hpe->flows));
+	mlx5e_hairpin_destroy(hpe->hp);
+	hash_del(&hpe->hairpin_hlist);
+	kfree(hpe);
+}
+
 #define UNKNOWN_MATCH_PRIO 8
 
 static int mlx5e_hairpin_get_prio(struct mlx5e_priv *priv,
@@ -718,6 +738,7 @@ static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
 	INIT_LIST_HEAD(&hpe->flows);
 	hpe->peer_vhca_id = peer_id;
 	hpe->prio = match_prio;
+	refcount_set(&hpe->refcnt, 1);
 
 	params.log_data_size = 15;
 	params.log_data_size = min_t(u8, params.log_data_size,
@@ -760,6 +781,7 @@ static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
 	} else {
 		flow->nic_attr->hairpin_tirn = hpe->hp->tirn;
 	}
+	flow->hpe = hpe;
 	list_add(&flow->hairpin, &hpe->flows);
 
 	return 0;
@@ -772,27 +794,13 @@ static int mlx5e_hairpin_flow_add(struct mlx5e_priv *priv,
 static void mlx5e_hairpin_flow_del(struct mlx5e_priv *priv,
 				   struct mlx5e_tc_flow *flow)
 {
-	struct list_head *next = flow->hairpin.next;
-
 	/* flow wasn't fully initialized */
-	if (list_empty(&flow->hairpin))
+	if (!flow->hpe)
 		return;
 
 	list_del(&flow->hairpin);
-
-	/* no more hairpin flows for us, release the hairpin pair */
-	if (list_empty(next)) {
-		struct mlx5e_hairpin_entry *hpe;
-
-		hpe = list_entry(next, struct mlx5e_hairpin_entry, flows);
-
-		netdev_dbg(priv->netdev, "del hairpin: peer %s\n",
-			   dev_name(hpe->hp->pair->peer_mdev->device));
-
-		mlx5e_hairpin_destroy(hpe->hp);
-		hash_del(&hpe->hairpin_hlist);
-		kfree(hpe);
-	}
+	mlx5e_hairpin_put(priv, flow->hpe);
+	flow->hpe = NULL;
 }
 
 static int
-- 
2.21.0


^ permalink raw reply related

* Re: [Potential Spoof] Re: [PATCH net-next v6 3/3] net: phy: broadcom: add 1000Base-X support for BCM54616S
From: Heiner Kallweit @ 2019-08-09 21:59 UTC (permalink / raw)
  To: Tao Ren, Andrew Lunn, Florian Fainelli, David S . Miller,
	Arun Parameswaran, Justin Chen, Vladimir Oltean,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	openbmc@lists.ozlabs.org
In-Reply-To: <8f0e172b-575c-dab8-b695-c33dfc78fa8f@fb.com>

On 09.08.2019 23:13, Tao Ren wrote:
> On 8/9/19 1:54 PM, Tao Ren wrote:
>> Hi Heiner,
>>
>> On 8/9/19 1:21 PM, Heiner Kallweit wrote:
>>> On 09.08.2019 07:44, Tao Ren wrote:
>>>> The BCM54616S PHY cannot work properly in RGMII->1000Base-KX mode (for
>>>> example, on Facebook CMM BMC platform), mainly because genphy functions
>>>> are designed for copper links, and 1000Base-X (clause 37) auto negotiation
>>>> needs to be handled differently.
>>>>
>>>> This patch enables 1000Base-X support for BCM54616S by customizing 3
>>>> driver callbacks:
>>>>
>>>>   - probe: probe callback detects PHY's operation mode based on
>>>>     INTERF_SEL[1:0] pins and 1000X/100FX selection bit in SerDES 100-FX
>>>>     Control register.
>>>>
>>>>   - config_aneg: calls genphy_c37_config_aneg when the PHY is running in
>>>>     1000Base-X mode; otherwise, genphy_config_aneg will be called.
>>>>
>>>>   - read_status: calls genphy_c37_read_status when the PHY is running in
>>>>     1000Base-X mode; otherwise, genphy_read_status will be called.
>>>>
>>>> Signed-off-by: Tao Ren <taoren@fb.com>
>>>> ---
>>>>  Changes in v6:
>>>>   - nothing changed.
>>>>  Changes in v5:
>>>>   - include Heiner's patch "net: phy: add support for clause 37
>>>>     auto-negotiation" into the series.
>>>>   - use genphy_c37_config_aneg and genphy_c37_read_status in BCM54616S
>>>>     PHY driver's callback when the PHY is running in 1000Base-X mode.
>>>>  Changes in v4:
>>>>   - add bcm54616s_config_aneg_1000bx() to deal with auto negotiation in
>>>>     1000Base-X mode.
>>>>  Changes in v3:
>>>>   - rename bcm5482_read_status to bcm54xx_read_status so the callback can
>>>>     be shared by BCM5482 and BCM54616S.
>>>>  Changes in v2:
>>>>   - Auto-detect PHY operation mode instead of passing DT node.
>>>>   - move PHY mode auto-detect logic from config_init to probe callback.
>>>>   - only set speed (not including duplex) in read_status callback.
>>>>   - update patch description with more background to avoid confusion.
>>>>   - patch #1 in the series ("net: phy: broadcom: set features explicitly
>>>>     for BCM54616") is dropped: the fix should go to get_features callback
>>>>     which may potentially depend on this patch.
>>>>
>>>>  drivers/net/phy/broadcom.c | 54 +++++++++++++++++++++++++++++++++++---
>>>>  include/linux/brcmphy.h    | 10 +++++--
>>>>  2 files changed, 58 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
>>>> index 937d0059e8ac..fbd76a31c142 100644
>>>> --- a/drivers/net/phy/broadcom.c
>>>> +++ b/drivers/net/phy/broadcom.c
>>>> @@ -383,9 +383,9 @@ static int bcm5482_config_init(struct phy_device *phydev)
>>>>  		/*
>>>>  		 * Select 1000BASE-X register set (primary SerDes)
>>>>  		 */
>>>> -		reg = bcm_phy_read_shadow(phydev, BCM5482_SHD_MODE);
>>>> -		bcm_phy_write_shadow(phydev, BCM5482_SHD_MODE,
>>>> -				     reg | BCM5482_SHD_MODE_1000BX);
>>>> +		reg = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE);
>>>> +		bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE,
>>>> +				     reg | BCM54XX_SHD_MODE_1000BX);
>>>>  
>>>>  		/*
>>>>  		 * LED1=ACTIVITYLED, LED3=LINKSPD[2]
>>>> @@ -451,12 +451,44 @@ static int bcm5481_config_aneg(struct phy_device *phydev)
>>>>  	return ret;
>>>>  }
>>>>  
>>>> +static int bcm54616s_probe(struct phy_device *phydev)
>>>> +{
>>>> +	int val, intf_sel;
>>>> +
>>>> +	val = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE);
>>>> +	if (val < 0)
>>>> +		return val;
>>>> +
>>>> +	/* The PHY is strapped in RGMII to fiber mode when INTERF_SEL[1:0]
>>>> +	 * is 01b.
>>>> +	 */
>>>> +	intf_sel = (val & BCM54XX_SHD_INTF_SEL_MASK) >> 1;
>>>> +	if (intf_sel == 1) {
>>>> +		val = bcm_phy_read_shadow(phydev, BCM54616S_SHD_100FX_CTRL);
>>>> +		if (val < 0)
>>>> +			return val;
>>>> +
>>>> +		/* Bit 0 of the SerDes 100-FX Control register, when set
>>>> +		 * to 1, sets the MII/RGMII -> 100BASE-FX configuration.
>>>> +		 * When this bit is set to 0, it sets the GMII/RGMII ->
>>>> +		 * 1000BASE-X configuration.
>>>> +		 */
>>>> +		if (!(val & BCM54616S_100FX_MODE))
>>>> +			phydev->dev_flags |= PHY_BCM_FLAGS_MODE_1000BX;
>>>> +	}
>>>> +
>>>> +	return 0;
>>>> +}
>>>> +
>>>>  static int bcm54616s_config_aneg(struct phy_device *phydev)
>>>>  {
>>>>  	int ret;
>>>>  
>>>>  	/* Aneg firsly. */
>>>> -	ret = genphy_config_aneg(phydev);
>>>> +	if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX)
>>>> +		ret = genphy_c37_config_aneg(phydev);
>>>> +	else
>>>> +		ret = genphy_config_aneg(phydev);
>>>>  
>>>
>>> I'm just wondering whether it needs to be considered that 100base-FX
>>> doesn't support auto-negotiation. I suppose BMSR reports aneg as
>>> supported, therefore phylib will use aneg per default.
>>> Not sure who could set 100Base-FX mode when, but maybe at that place
>>> also phydev->autoneg needs to be cleared. Did you test 100Base-FX mode?
>>
>> I'm doubting if 100Base-FX works. Besides auto-negotiation, 100Base-FX Control/Status registers are defined in shadow register instead of MII_BMCR and MII_BMSR.
>>
>> Unfortunately I don't have environment to test 100Base-FX and that's why I only make changes when the PHY is working in 1000X mode.
> 
> I can prepare a patch for 100Base-FX based on my understanding of bcm54616s datasheet, but the patch would be just compile-tested 
> 
Support for 1000Base-X should be sufficient. Best mention the missing support for
100Base-FX in the commit message and at a suited place in the driver code.

> 
> Thanks,
> 
> Tao
> 
Heiner

^ permalink raw reply

* Re: [PATCH v3] tools: bpftool: fix reading from /proc/config.gz
From: Jakub Kicinski @ 2019-08-09 21:57 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Peter Wu, Alexei Starovoitov, Daniel Borkmann, netdev,
	Stanislav Fomichev, Quentin Monnet
In-Reply-To: <20190809214831.GE2820@mini-arch>

On Fri, 9 Aug 2019 14:48:31 -0700, Stanislav Fomichev wrote:
> I'm just being nit picky :-)
> Because changelog says we already depend on -lz, but then in the patch
> we explicitly add it.
> 
> I think you were right in pointing out that we already implicitly depend
> on -lz via -lelf and/or -lbfd. And it works for non-static builds.
> We don't need an explicit -lz unless somebody puts '-static' in
> EXTRA_CFLAGS. So maybe we should just submit the patch as is because
> it fixes make EXTRA_CFLAGS=-static.

Mm. Sounds reasonable. Fixing EXTRA_CFLAGS=-static would be really cool,
too, I always struggle to get a statically linked build.

> RE $(error): we don't do it for -lelf, right? So probably not worth
> the hassle for zlib.

Right, OTOH bpftool doesn't really care about -lelf, it's libbpf that
needs it, and libbpf does test.

^ permalink raw reply

* Re: [PATCH v3] tools: bpftool: fix reading from /proc/config.gz
From: Stanislav Fomichev @ 2019-08-09 21:48 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Peter Wu, Alexei Starovoitov, Daniel Borkmann, netdev,
	Stanislav Fomichev, Quentin Monnet
In-Reply-To: <20190809140956.24369b00@cakuba.netronome.com>

On 08/09, Jakub Kicinski wrote:
> On Fri, 9 Aug 2019 08:32:10 -0700, Stanislav Fomichev wrote:
> > On 08/09, Peter Wu wrote:
> > > /proc/config has never existed as far as I can see, but /proc/config.gz
> > > is present on Arch Linux. Add support for decompressing config.gz using
> > > zlib which is a mandatory dependency of libelf. Replace existing stdio
> > > functions with gzFile operations since the latter transparently handles
> > > uncompressed and gzip-compressed files.
> > > 
> > > Cc: Quentin Monnet <quentin.monnet@netronome.com>
> > > Signed-off-by: Peter Wu <peter@lekensteyn.nl>
> 
> Thanks for the patch, looks good to me now!
> 
> > >  tools/bpf/bpftool/Makefile  |   2 +-
> > >  tools/bpf/bpftool/feature.c | 105 ++++++++++++++++++------------------
> > >  2 files changed, 54 insertions(+), 53 deletions(-)
> > > 
> > > diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
> > > index a7afea4dec47..078bd0dcfba5 100644
> > > --- a/tools/bpf/bpftool/Makefile
> > > +++ b/tools/bpf/bpftool/Makefile
> > > @@ -52,7 +52,7 @@ ifneq ($(EXTRA_LDFLAGS),)
> > >  LDFLAGS += $(EXTRA_LDFLAGS)
> > >  endif
> > >  
> > > -LIBS = -lelf $(LIBBPF)
> > > +LIBS = -lelf -lz $(LIBBPF)  
> > You're saying in the commit description that bpftool already links
> > against -lz (via -lelf), but then explicitly add -lz here, why?
> 
> It probably won't hurt to enable the zlib test:
> 
> diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
> index 078bd0dcfba5..8176632e519c 100644
> --- a/tools/bpf/bpftool/Makefile
> +++ b/tools/bpf/bpftool/Makefile
> @@ -58,8 +58,8 @@ INSTALL ?= install
>  RM ?= rm -f
>  
>  FEATURE_USER = .bpftool
> -FEATURE_TESTS = libbfd disassembler-four-args reallocarray
> -FEATURE_DISPLAY = libbfd disassembler-four-args
> +FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib
> +FEATURE_DISPLAY = libbfd disassembler-four-args zlib
>  
>  check_feat := 1
>  NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall
> 
> And then we can test for it the way libbpf tests for elf:
> 
> all: zdep $(OUTPUT)bpftool
> 
> PHONY += zdep
> 
> zdep:
> 	@if [ "$(feature-zlib)" != "1" ]; then echo "No zlib found"; exit 1 ; fi
> 
> Or maybe just $(error ...), Stan what's your preference here? 
> We don't have a precedent for hard tests of features in bpftool.
I'm just being nit picky :-)
Because changelog says we already depend on -lz, but then in the patch
we explicitly add it.

I think you were right in pointing out that we already implicitly depend
on -lz via -lelf and/or -lbfd. And it works for non-static builds.
We don't need an explicit -lz unless somebody puts '-static' in
EXTRA_CFLAGS. So maybe we should just submit the patch as is because
it fixes make EXTRA_CFLAGS=-static.

RE $(error): we don't do it for -lelf, right? So probably not worth
the hassle for zlib.

^ permalink raw reply

* [PATCH net] rxrpc: Fix local refcounting
From: David Howells @ 2019-08-09 21:47 UTC (permalink / raw)
  To: netdev; +Cc: dhowells, jaltman, linux-afs, linux-kernel

Fix rxrpc_unuse_local() to handle a NULL local pointer as it can be called
on an unbound socket on which rx->local is not yet set.

The following reproducer (includes omitted):

	int main(void)
	{
		socket(AF_RXRPC, SOCK_DGRAM, AF_INET);
		return 0;
	}

causes the following oops to occur:

	BUG: kernel NULL pointer dereference, address: 0000000000000010
	...
	RIP: 0010:rxrpc_unuse_local+0x8/0x1b
	...
	Call Trace:
	 rxrpc_release+0x2b5/0x338
	 __sock_release+0x37/0xa1
	 sock_close+0x14/0x17
	 __fput+0x115/0x1e9
	 task_work_run+0x72/0x98
	 do_exit+0x51b/0xa7a
	 ? __context_tracking_exit+0x4e/0x10e
	 do_group_exit+0xab/0xab
	 __x64_sys_exit_group+0x14/0x17
	 do_syscall_64+0x89/0x1d4
	 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Reported-by: syzbot+20dee719a2e090427b5f@syzkaller.appspotmail.com
Fixes: 730c5fd42c1e ("rxrpc: Fix local endpoint refcounting")
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Jeffrey Altman <jaltman@auristor.com>
---

 net/rxrpc/local_object.c |   12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 9798159ee65f..c9db3e762d8d 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -402,11 +402,13 @@ void rxrpc_unuse_local(struct rxrpc_local *local)
 {
 	unsigned int au;
 
-	au = atomic_dec_return(&local->active_users);
-	if (au == 0)
-		rxrpc_queue_local(local);
-	else
-		rxrpc_put_local(local);
+	if (local) {
+		au = atomic_dec_return(&local->active_users);
+		if (au == 0)
+			rxrpc_queue_local(local);
+		else
+			rxrpc_put_local(local);
+	}
 }
 
 /*


^ permalink raw reply related

* Re: [v4,0/4] tools: bpftool: add net attach/detach command to attach XDP prog
From: Jakub Kicinski @ 2019-08-09 21:45 UTC (permalink / raw)
  To: Daniel T. Lee; +Cc: Daniel Borkmann, Alexei Starovoitov, netdev
In-Reply-To: <20190809133248.19788-1-danieltimlee@gmail.com>

On Fri,  9 Aug 2019 22:32:44 +0900, Daniel T. Lee wrote:
> Currently, bpftool net only supports dumping progs attached on the
> interface. To attach XDP prog on interface, user must use other tool
> (eg. iproute2). By this patch, with `bpftool net attach/detach`, user
> can attach/detach XDP prog on interface.
> 
>     # bpftool prog
>         16: xdp  name xdp_prog1  tag 539ec6ce11b52f98  gpl
>         loaded_at 2019-08-07T08:30:17+0900  uid 0
>         ...
>         20: xdp  name xdp_fwd_prog  tag b9cb69f121e4a274  gpl
>         loaded_at 2019-08-07T08:30:17+0900  uid 0
> 
>     # bpftool net attach xdpdrv id 16 dev enp6s0np0
>     # bpftool net
>     xdp:
>         enp6s0np0(4) driver id 16
> 
>     # bpftool net attach xdpdrv id 20 dev enp6s0np0 overwrite
>     # bpftool net
>     xdp:
>         enp6s0np0(4) driver id 20
> 
>     # bpftool net detach xdpdrv dev enp6s0np0
>     # bpftool net
>     xdp:
> 
> 
> While this patch only contains support for XDP, through `net
> attach/detach`, bpftool can further support other prog attach types.
> 
> XDP attach/detach tested on Mellanox ConnectX-4 and Netronome Agilio.

Looks good to me now*, thanks!

Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>

* apart from the entire duplication thing.

^ permalink raw reply

* Re: [patch net-next] netdevsim: register couple of devlink params
From: Jakub Kicinski @ 2019-08-09 21:26 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev, davem, mlxsw
In-Reply-To: <20190809110512.31779-1-jiri@resnulli.us>

On Fri,  9 Aug 2019 13:05:12 +0200, Jiri Pirko wrote:
> From: Jiri Pirko <jiri@mellanox.com>
> 
> Register couple of devlink params, one generic, one driver-specific.
> Make the values available over debugfs.
> 
> Example:
> $ echo "111" > /sys/bus/netdevsim/new_device
> $ devlink dev param
> netdevsim/netdevsim111:
>   name max_macs type generic
>     values:
>       cmode driverinit value 32
>   name test1 type driver-specific
>     values:
>       cmode driverinit value true
> $ cat /sys/kernel/debug/netdevsim/netdevsim111/max_macs
> 32
> $ cat /sys/kernel/debug/netdevsim/netdevsim111/test1
> Y
> $ devlink dev param set netdevsim/netdevsim111 name max_macs cmode driverinit value 16
> $ devlink dev param set netdevsim/netdevsim111 name test1 cmode driverinit value false
> $ devlink dev reload netdevsim/netdevsim111
> $ cat /sys/kernel/debug/netdevsim/netdevsim111/max_macs
> 16
> $ cat /sys/kernel/debug/netdevsim/netdevsim111/test1
> 
> Signed-off-by: Jiri Pirko <jiri@mellanox.com>

The netdevsim patch looks good, what's the plan for tests?

We don't need much; perhaps what you have in the commit message,
as a script which can be run by automated bots, would be sufficient?

^ permalink raw reply

* Re: [PATCH v4 7/9] mfd: ioc3: Add driver for SGI IOC3 chip
From: Jakub Kicinski @ 2019-08-09 21:22 UTC (permalink / raw)
  To: Thomas Bogendoerfer
  Cc: Ralf Baechle, Paul Burton, James Hogan, Dmitry Torokhov,
	Lee Jones, David S. Miller, Srinivas Kandagatla, Alessandro Zummo,
	Alexandre Belloni, Greg Kroah-Hartman, Jiri Slaby,
	Evgeniy Polyakov, linux-mips, linux-kernel, linux-input, netdev,
	linux-rtc, linux-serial
In-Reply-To: <20190809103235.16338-8-tbogendoerfer@suse.de>

On Fri,  9 Aug 2019 12:32:29 +0200, Thomas Bogendoerfer wrote:
> SGI IOC3 chip has integrated ethernet, keyboard and mouse interface.
> It also supports connecting a SuperIO chip for serial and parallel
> interfaces. IOC3 is used inside various SGI systemboards and add-on
> cards with different equipped external interfaces.
> 
> Support for ethernet and serial interfaces were implemented inside
> the network driver. This patchset moves out the not network related
> parts to a new MFD driver, which takes care of card detection,
> setup of platform devices and interrupt distribution for the subdevices.
> 
> Serial portion: Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> 
> Signed-off-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>

There are a lot of changes in the ethernet part which are not easy to
explain by the introduction of the other MFD parts.. Could you possibly
break this change up into smaller chunks?

Also please don't use stdint types in the kernel, please try checkpatch
to catch coding style issues.

^ permalink raw reply

* Re: [net-next 01/15] ice: Implement ethtool ops for channels
From: Jakub Kicinski @ 2019-08-09 21:15 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: davem, Henry Tieman, netdev, nhorman, sassmann, Tony Nguyen,
	Andrew Bowers
In-Reply-To: <20190809183139.30871-2-jeffrey.t.kirsher@intel.com>

On Fri,  9 Aug 2019 11:31:25 -0700, Jeff Kirsher wrote:
> From: Henry Tieman <henry.w.tieman@intel.com>
> 
> Add code to query and set the number of queues on the primary
> VSI for a PF. This is accessed from the 'ethtool -l' and 'ethtool -L'
> commands, respectively.
> 
> Signed-off-by: Henry Tieman <henry.w.tieman@intel.com>
> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
> Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

If you're using the same IRQ vector for RX and TX queue the channel
counts as combined. Looks like you are counting RX and TX separately
here. That's incorrect.

^ permalink raw reply

* Re: [Potential Spoof] Re: [PATCH net-next v6 3/3] net: phy: broadcom: add 1000Base-X support for BCM54616S
From: Tao Ren @ 2019-08-09 21:13 UTC (permalink / raw)
  To: Heiner Kallweit, Andrew Lunn, Florian Fainelli, David S . Miller,
	Arun Parameswaran, Justin Chen, Vladimir Oltean,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	openbmc@lists.ozlabs.org
In-Reply-To: <e556dd17-ef85-3c61-bc08-17db02d9a5dc@fb.com>

On 8/9/19 1:54 PM, Tao Ren wrote:
> Hi Heiner,
> 
> On 8/9/19 1:21 PM, Heiner Kallweit wrote:
>> On 09.08.2019 07:44, Tao Ren wrote:
>>> The BCM54616S PHY cannot work properly in RGMII->1000Base-KX mode (for
>>> example, on Facebook CMM BMC platform), mainly because genphy functions
>>> are designed for copper links, and 1000Base-X (clause 37) auto negotiation
>>> needs to be handled differently.
>>>
>>> This patch enables 1000Base-X support for BCM54616S by customizing 3
>>> driver callbacks:
>>>
>>>   - probe: probe callback detects PHY's operation mode based on
>>>     INTERF_SEL[1:0] pins and 1000X/100FX selection bit in SerDES 100-FX
>>>     Control register.
>>>
>>>   - config_aneg: calls genphy_c37_config_aneg when the PHY is running in
>>>     1000Base-X mode; otherwise, genphy_config_aneg will be called.
>>>
>>>   - read_status: calls genphy_c37_read_status when the PHY is running in
>>>     1000Base-X mode; otherwise, genphy_read_status will be called.
>>>
>>> Signed-off-by: Tao Ren <taoren@fb.com>
>>> ---
>>>  Changes in v6:
>>>   - nothing changed.
>>>  Changes in v5:
>>>   - include Heiner's patch "net: phy: add support for clause 37
>>>     auto-negotiation" into the series.
>>>   - use genphy_c37_config_aneg and genphy_c37_read_status in BCM54616S
>>>     PHY driver's callback when the PHY is running in 1000Base-X mode.
>>>  Changes in v4:
>>>   - add bcm54616s_config_aneg_1000bx() to deal with auto negotiation in
>>>     1000Base-X mode.
>>>  Changes in v3:
>>>   - rename bcm5482_read_status to bcm54xx_read_status so the callback can
>>>     be shared by BCM5482 and BCM54616S.
>>>  Changes in v2:
>>>   - Auto-detect PHY operation mode instead of passing DT node.
>>>   - move PHY mode auto-detect logic from config_init to probe callback.
>>>   - only set speed (not including duplex) in read_status callback.
>>>   - update patch description with more background to avoid confusion.
>>>   - patch #1 in the series ("net: phy: broadcom: set features explicitly
>>>     for BCM54616") is dropped: the fix should go to get_features callback
>>>     which may potentially depend on this patch.
>>>
>>>  drivers/net/phy/broadcom.c | 54 +++++++++++++++++++++++++++++++++++---
>>>  include/linux/brcmphy.h    | 10 +++++--
>>>  2 files changed, 58 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c
>>> index 937d0059e8ac..fbd76a31c142 100644
>>> --- a/drivers/net/phy/broadcom.c
>>> +++ b/drivers/net/phy/broadcom.c
>>> @@ -383,9 +383,9 @@ static int bcm5482_config_init(struct phy_device *phydev)
>>>  		/*
>>>  		 * Select 1000BASE-X register set (primary SerDes)
>>>  		 */
>>> -		reg = bcm_phy_read_shadow(phydev, BCM5482_SHD_MODE);
>>> -		bcm_phy_write_shadow(phydev, BCM5482_SHD_MODE,
>>> -				     reg | BCM5482_SHD_MODE_1000BX);
>>> +		reg = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE);
>>> +		bcm_phy_write_shadow(phydev, BCM54XX_SHD_MODE,
>>> +				     reg | BCM54XX_SHD_MODE_1000BX);
>>>  
>>>  		/*
>>>  		 * LED1=ACTIVITYLED, LED3=LINKSPD[2]
>>> @@ -451,12 +451,44 @@ static int bcm5481_config_aneg(struct phy_device *phydev)
>>>  	return ret;
>>>  }
>>>  
>>> +static int bcm54616s_probe(struct phy_device *phydev)
>>> +{
>>> +	int val, intf_sel;
>>> +
>>> +	val = bcm_phy_read_shadow(phydev, BCM54XX_SHD_MODE);
>>> +	if (val < 0)
>>> +		return val;
>>> +
>>> +	/* The PHY is strapped in RGMII to fiber mode when INTERF_SEL[1:0]
>>> +	 * is 01b.
>>> +	 */
>>> +	intf_sel = (val & BCM54XX_SHD_INTF_SEL_MASK) >> 1;
>>> +	if (intf_sel == 1) {
>>> +		val = bcm_phy_read_shadow(phydev, BCM54616S_SHD_100FX_CTRL);
>>> +		if (val < 0)
>>> +			return val;
>>> +
>>> +		/* Bit 0 of the SerDes 100-FX Control register, when set
>>> +		 * to 1, sets the MII/RGMII -> 100BASE-FX configuration.
>>> +		 * When this bit is set to 0, it sets the GMII/RGMII ->
>>> +		 * 1000BASE-X configuration.
>>> +		 */
>>> +		if (!(val & BCM54616S_100FX_MODE))
>>> +			phydev->dev_flags |= PHY_BCM_FLAGS_MODE_1000BX;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>>  static int bcm54616s_config_aneg(struct phy_device *phydev)
>>>  {
>>>  	int ret;
>>>  
>>>  	/* Aneg firsly. */
>>> -	ret = genphy_config_aneg(phydev);
>>> +	if (phydev->dev_flags & PHY_BCM_FLAGS_MODE_1000BX)
>>> +		ret = genphy_c37_config_aneg(phydev);
>>> +	else
>>> +		ret = genphy_config_aneg(phydev);
>>>  
>>
>> I'm just wondering whether it needs to be considered that 100base-FX
>> doesn't support auto-negotiation. I suppose BMSR reports aneg as
>> supported, therefore phylib will use aneg per default.
>> Not sure who could set 100Base-FX mode when, but maybe at that place
>> also phydev->autoneg needs to be cleared. Did you test 100Base-FX mode?
> 
> I'm doubting if 100Base-FX works. Besides auto-negotiation, 100Base-FX Control/Status registers are defined in shadow register instead of MII_BMCR and MII_BMSR.
> 
> Unfortunately I don't have environment to test 100Base-FX and that's why I only make changes when the PHY is working in 1000X mode.

I can prepare a patch for 100Base-FX based on my understanding of bcm54616s datasheet, but the patch would be just compile-tested 


Thanks,

Tao

^ permalink raw reply

* Re: [PATCH v3] tools: bpftool: fix reading from /proc/config.gz
From: Jakub Kicinski @ 2019-08-09 21:09 UTC (permalink / raw)
  To: Stanislav Fomichev
  Cc: Peter Wu, Alexei Starovoitov, Daniel Borkmann, netdev,
	Stanislav Fomichev, Quentin Monnet
In-Reply-To: <20190809153210.GD2820@mini-arch>

On Fri, 9 Aug 2019 08:32:10 -0700, Stanislav Fomichev wrote:
> On 08/09, Peter Wu wrote:
> > /proc/config has never existed as far as I can see, but /proc/config.gz
> > is present on Arch Linux. Add support for decompressing config.gz using
> > zlib which is a mandatory dependency of libelf. Replace existing stdio
> > functions with gzFile operations since the latter transparently handles
> > uncompressed and gzip-compressed files.
> > 
> > Cc: Quentin Monnet <quentin.monnet@netronome.com>
> > Signed-off-by: Peter Wu <peter@lekensteyn.nl>

Thanks for the patch, looks good to me now!

> >  tools/bpf/bpftool/Makefile  |   2 +-
> >  tools/bpf/bpftool/feature.c | 105 ++++++++++++++++++------------------
> >  2 files changed, 54 insertions(+), 53 deletions(-)
> > 
> > diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
> > index a7afea4dec47..078bd0dcfba5 100644
> > --- a/tools/bpf/bpftool/Makefile
> > +++ b/tools/bpf/bpftool/Makefile
> > @@ -52,7 +52,7 @@ ifneq ($(EXTRA_LDFLAGS),)
> >  LDFLAGS += $(EXTRA_LDFLAGS)
> >  endif
> >  
> > -LIBS = -lelf $(LIBBPF)
> > +LIBS = -lelf -lz $(LIBBPF)  
> You're saying in the commit description that bpftool already links
> against -lz (via -lelf), but then explicitly add -lz here, why?

It probably won't hurt to enable the zlib test:

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 078bd0dcfba5..8176632e519c 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -58,8 +58,8 @@ INSTALL ?= install
 RM ?= rm -f
 
 FEATURE_USER = .bpftool
-FEATURE_TESTS = libbfd disassembler-four-args reallocarray
-FEATURE_DISPLAY = libbfd disassembler-four-args
+FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib
+FEATURE_DISPLAY = libbfd disassembler-four-args zlib
 
 check_feat := 1
 NON_CHECK_FEAT_TARGETS := clean uninstall doc doc-clean doc-install doc-uninstall

And then we can test for it the way libbpf tests for elf:

all: zdep $(OUTPUT)bpftool

PHONY += zdep

zdep:
	@if [ "$(feature-zlib)" != "1" ]; then echo "No zlib found"; exit 1 ; fi

Or maybe just $(error ...), Stan what's your preference here? 
We don't have a precedent for hard tests of features in bpftool.

^ permalink raw reply related

* Re: [PATCH v5 bpf-next] BPF: helpers: New helper to obtain namespacedata from current task
From: Carlos Antonio Neira Bustos @ 2019-08-09 21:03 UTC (permalink / raw)
  To: Yonghong Song
  Cc: Y Song, netdev@vger.kernel.org, ebiederm@xmission.com,
	brouer@redhat.com, bpf, quentin.monnet@netronome.com
In-Reply-To: <20190808211714.taet5fjr6q43na5i@dev00>

Yonghong,

I have split the patch in 2:

- bpf_helper introduction :
 

From 40ec0781525b82d5235c45f5066a7a79dea71065 Mon Sep 17 00:00:00 2001
From: Carlos <cneirabustos@gmail.com>
Date: Fri, 9 Aug 2019 12:20:52 -0700
Subject: [PATCH 1/2] [PATCH v8 bpf-next 1/2] BPF: New helper to obtain
 namespace data  from current task

This helper obtains the active namespace from current and returns pid, tgid,
device and namespace id as seen from that namespace, allowing to instrument
a process inside a container.
Device is read from /proc/self/ns/pid, as in the future it's possible that
different pid_ns files may belong to different devices, according
to the discussion between Eric Biederman and Yonghong in 2017 linux plumbers
conference.
Currently bpf_get_current_pid_tgid() is used to do pid filtering in bcc's
scripts, but that helper returns the pid as seen by the root namespace, which
is fine when a bcc script is not executed inside a container.
When the process of interest is inside a container, pid filtering will not work
if bpf_get_current_pid_tgid() is used. The new helper addresses this limitation
by returning the pid as seen by the current namespace where the script is
executing.

This helper has the same use cases as bpf_get_current_pid_tgid() as it can be
used to do pid filtering even inside a container.

For example a bcc script using bpf_get_current_pid_tgid() (tools/funccount.py):

        u32 pid = bpf_get_current_pid_tgid() >> 32;
        if (pid != <pid_arg_passed_in>)
                return 0;
Could be modified to use bpf_get_current_pidns_info() as follows:

        struct bpf_pidns_info pidns;
        bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns_info));
        u32 pid = pidns.tgid;
        u32 nsid = pidns.nsid;
        if ((pid != <pid_arg_passed_in>) || (nsid != <nsid_arg_passed_in>))
                return 0;

To find out the PID namespace id of a process, you could use this command:

$ ps -h -o pidns -p <pid_of_interest>

Or this other command:

$ ls -Li /proc/<pid_of_interest>/ns/pid

Signed-off-by: Carlos Neira <cneirabustos@gmail.com>
---
 fs/internal.h                  |  2 --
 fs/namei.c                     |  1 -
 include/linux/bpf.h            |  1 +
 include/linux/namei.h          |  4 +++
 include/uapi/linux/bpf.h       | 31 +++++++++++++++++++-
 kernel/bpf/core.c              |  1 +
 kernel/bpf/helpers.c           | 64 ++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/bpf_trace.c       |  2 ++
 tools/include/uapi/linux/bpf.h | 31 +++++++++++++++++++-
 9 files changed, 132 insertions(+), 5 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index 315fcd8d237c..6647e15dd419 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -59,8 +59,6 @@ extern int finish_clean_context(struct fs_context *fc);
 /*
  * namei.c
  */
-extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
-			   struct path *path, struct path *root);
 extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct path *);
diff --git a/fs/namei.c b/fs/namei.c
index 209c51a5226c..a89fc72a4a10 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -19,7 +19,6 @@
 #include <linux/export.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
-#include <linux/fs.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
 #include <linux/fsnotify.h>
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f9a506147c8a..e4adf5e05afd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1050,6 +1050,7 @@ extern const struct bpf_func_proto bpf_get_local_storage_proto;
 extern const struct bpf_func_proto bpf_strtol_proto;
 extern const struct bpf_func_proto bpf_strtoul_proto;
 extern const struct bpf_func_proto bpf_tcp_sock_proto;
+extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;
 
 /* Shared helpers among cBPF and eBPF. */
 void bpf_user_rnd_init_once(void);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 9138b4471dbf..b45c8b6f7cb4 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -6,6 +6,7 @@
 #include <linux/path.h>
 #include <linux/fcntl.h>
 #include <linux/errno.h>
+#include <linux/fs.h>
 
 enum { MAX_NESTED_LINKS = 8 };
 
@@ -97,6 +98,9 @@ extern void unlock_rename(struct dentry *, struct dentry *);
 
 extern void nd_jump_link(struct path *path);
 
+extern int filename_lookup(int dfd, struct filename *name, unsigned flags,
+			   struct path *path, struct path *root);
+
 static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
 {
 	((char *) name)[min(len, maxlen)] = '\0';
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 4393bd4b2419..db241857ec15 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2741,6 +2741,28 @@ union bpf_attr {
  *		**-EOPNOTSUPP** kernel configuration does not enable SYN cookies
  *
  *		**-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 size_of_pidns)
+ *	Description
+ *		Copies into *pidns* the pid, tgid and namespace id as seen by
+ *		the current namespace, and also the device of
+ *		/proc/self/ns/pid. *size_of_pidns* must be the size of *pidns*.
+ *
+ *		This helper is used when pid filtering is needed inside a
+ *		container, as the bpf_get_current_pid_tgid() helper always
+ *		returns the pid as seen by the root namespace.
+ *	Return
+ *		0 on success
+ *
+ *		**-EINVAL** if *size_of_pidns* is not valid or unable to get ns, pid
+ *		or tgid of the current task.
+ *
+ *		**-ECHILD** if /proc/self/ns/pid does not exist.
+ *
+ *		**-ENOTDIR** if /proc/self/ns does not exist.
+ *
+ *		**-ENOMEM**  if allocation fails.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2853,7 +2875,8 @@ union bpf_attr {
 	FN(sk_storage_get),		\
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
-	FN(tcp_gen_syncookie),
+	FN(tcp_gen_syncookie),		\
+	FN(get_current_pidns_info),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3604,4 +3627,10 @@ struct bpf_sockopt {
 	__s32	retval;
 };
 
+struct bpf_pidns_info {
+	__u32 dev;
+	__u32 nsid;
+	__u32 tgid;
+	__u32 pid;
+};
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8191a7db2777..3159f2a0188c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -2038,6 +2038,7 @@ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak;
 const struct bpf_func_proto bpf_get_current_comm_proto __weak;
 const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
 const struct bpf_func_proto bpf_get_local_storage_proto __weak;
+const struct bpf_func_proto bpf_get_current_pidns_info __weak;
 
 const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
 {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 5e28718928ca..41fbf1f28a48 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -11,6 +11,12 @@
 #include <linux/uidgid.h>
 #include <linux/filter.h>
 #include <linux/ctype.h>
+#include <linux/pid_namespace.h>
+#include <linux/major.h>
+#include <linux/stat.h>
+#include <linux/namei.h>
+#include <linux/version.h>
+
 
 #include "../../lib/kstrtox.h"
 
@@ -312,6 +318,64 @@ void copy_map_value_locked(struct bpf_map *map, void *dst, void *src,
 	preempt_enable();
 }
 
+BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, pidns_info, u32,
+	 size)
+{
+	const char *pidns_path = "/proc/self/ns/pid";
+	struct pid_namespace *pidns = NULL;
+	struct filename *tmp = NULL;
+	struct inode *inode;
+	struct path kp;
+	pid_t tgid = 0;
+	pid_t pid = 0;
+	int ret;
+	int len;
+
+	if (unlikely(size != sizeof(struct bpf_pidns_info)))
+		return -EINVAL;
+	pidns = task_active_pid_ns(current);
+	if (unlikely(!pidns))
+		goto clear;
+	pidns_info->nsid =  pidns->ns.inum;
+	pid = task_pid_nr_ns(current, pidns);
+	if (unlikely(!pid))
+		goto clear;
+	tgid = task_tgid_nr_ns(current, pidns);
+	if (unlikely(!tgid))
+		goto clear;
+	pidns_info->tgid = (u32) tgid;
+	pidns_info->pid = (u32) pid;
+	tmp = kmem_cache_alloc(names_cachep, GFP_ATOMIC);
+	if (unlikely(!tmp)) {
+		memset((void *)pidns_info, 0, (size_t) size);
+		return -ENOMEM;
+	}
+	len = strlen(pidns_path) + 1;
+	memcpy((char *)tmp->name, pidns_path, len);
+	tmp->uptr = NULL;
+	tmp->aname = NULL;
+	tmp->refcnt = 1;
+	ret = filename_lookup(AT_FDCWD, tmp, 0, &kp, NULL);
+	if (ret) {
+		memset((void *)pidns_info, 0, (size_t) size);
+		return ret;
+	}
+	inode = d_backing_inode(kp.dentry);
+	pidns_info->dev = inode->i_sb->s_dev;
+	return 0;
+clear:
+	memset((void *)pidns_info, 0, (size_t) size);
+	return -EINVAL;
+}
+
+const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
+	.func		= bpf_get_current_pidns_info,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
+};
+
 #ifdef CONFIG_CGROUPS
 BPF_CALL_0(bpf_get_current_cgroup_id)
 {
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d14576..5e1dc22765a5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -709,6 +709,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 #endif
 	case BPF_FUNC_send_signal:
 		return &bpf_send_signal_proto;
+	case BPF_FUNC_get_current_pidns_info:
+		return &bpf_get_current_pidns_info_proto;
 	default:
 		return NULL;
 	}
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 4393bd4b2419..db241857ec15 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -2741,6 +2741,28 @@ union bpf_attr {
  *		**-EOPNOTSUPP** kernel configuration does not enable SYN cookies
  *
  *		**-EPROTONOSUPPORT** IP packet version is not 4 or 6
+ *
+ * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 size_of_pidns)
+ *	Description
+ *		Copies into *pidns* the pid, tgid and namespace id as seen by
+ *		the current namespace, and also the device of
+ *		/proc/self/ns/pid. *size_of_pidns* must be the size of *pidns*.
+ *
+ *		This helper is used when pid filtering is needed inside a
+ *		container, as the bpf_get_current_pid_tgid() helper always
+ *		returns the pid as seen by the root namespace.
+ *	Return
+ *		0 on success
+ *
+ *		**-EINVAL** if *size_of_pidns* is not valid or unable to get ns, pid
+ *		or tgid of the current task.
+ *
+ *		**-ECHILD** if /proc/self/ns/pid does not exist.
+ *
+ *		**-ENOTDIR** if /proc/self/ns does not exist.
+ *
+ *		**-ENOMEM**  if allocation fails.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2853,7 +2875,8 @@ union bpf_attr {
 	FN(sk_storage_get),		\
 	FN(sk_storage_delete),		\
 	FN(send_signal),		\
-	FN(tcp_gen_syncookie),
+	FN(tcp_gen_syncookie),		\
+	FN(get_current_pidns_info),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3604,4 +3627,10 @@ struct bpf_sockopt {
 	__s32	retval;
 };
 
+struct bpf_pidns_info {
+	__u32 dev;
+	__u32 nsid;
+	__u32 tgid;
+	__u32 pid;
+};
 #endif /* _UAPI__LINUX_BPF_H__ */
-- 
2.11.0


- BPF helper samples and selftests 

From a87df8b026c6374c21b2af03d83471c258ff6038 Mon Sep 17 00:00:00 2001
From: Carlos <cneirabustos@gmail.com>
Date: Fri, 9 Aug 2019 12:23:27 -0700
Subject: [PATCH 2/2] [PATCH v8 bpf-next 2/2] BPF: New helper to obtain
 namespace data  from current task

Samples and selftests for new helper.

Signed-off-by: Carlos Neira <cneirabustos@gmail.com>
---
 samples/bpf/Makefile                               |   3 +
 samples/bpf/trace_ns_info_user.c                   |  35 ++++++
 samples/bpf/trace_ns_info_user_kern.c              |  44 +++++++
 tools/testing/selftests/bpf/Makefile               |   2 +-
 tools/testing/selftests/bpf/bpf_helpers.h          |   3 +
 .../testing/selftests/bpf/progs/test_pidns_kern.c  |  51 ++++++++
 tools/testing/selftests/bpf/test_pidns.c           | 138 +++++++++++++++++++++
 7 files changed, 275 insertions(+), 1 deletion(-)
 create mode 100644 samples/bpf/trace_ns_info_user.c
 create mode 100644 samples/bpf/trace_ns_info_user_kern.c
 create mode 100644 tools/testing/selftests/bpf/progs/test_pidns_kern.c
 create mode 100644 tools/testing/selftests/bpf/test_pidns.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 1d9be26b4edd..238453ff27d2 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -53,6 +53,7 @@ hostprogs-y += task_fd_query
 hostprogs-y += xdp_sample_pkts
 hostprogs-y += ibumad
 hostprogs-y += hbm
+hostprogs-y += trace_ns_info
 
 # Libbpf dependencies
 LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
@@ -109,6 +110,7 @@ task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
 xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
 ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
 hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
+trace_ns_info-objs := bpf_load.o trace_ns_info_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -170,6 +172,7 @@ always += xdp_sample_pkts_kern.o
 always += ibumad_kern.o
 always += hbm_out_kern.o
 always += hbm_edt_kern.o
+always += trace_ns_info_user_kern.o
 
 KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
 KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/bpf/
diff --git a/samples/bpf/trace_ns_info_user.c b/samples/bpf/trace_ns_info_user.c
new file mode 100644
index 000000000000..e06d08db6f30
--- /dev/null
+++ b/samples/bpf/trace_ns_info_user.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include <stdio.h>
+#include <linux/bpf.h>
+#include <unistd.h>
+#include "bpf/libbpf.h"
+#include "bpf_load.h"
+
+/* This code was taken verbatim from tracex1_user.c; it is used
+ * to exercise the bpf_get_current_pidns_info() helper call.
+ */
+int main(int ac, char **argv)
+{
+	FILE *f;
+	char filename[256];
+
+	snprintf(filename, sizeof(filename), "%s_user_kern.o", argv[0]);
+	printf("loading %s\n", filename);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	f = popen("taskset 1 ping  localhost", "r");
+	(void) f;
+	read_trace_pipe();
+	return 0;
+}
diff --git a/samples/bpf/trace_ns_info_user_kern.c b/samples/bpf/trace_ns_info_user_kern.c
new file mode 100644
index 000000000000..96675e02b707
--- /dev/null
+++ b/samples/bpf/trace_ns_info_user_kern.c
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+typedef __u64 u64;
+typedef __u32 u32;
+
+
+/* kprobe is NOT a stable ABI
+ * kernel functions can be removed, renamed or completely change semantics.
+ * Number of arguments and their positions can change, etc.
+ * In such case this bpf+kprobe example will no longer be meaningful
+ */
+
+/* This will call bpf_get_current_pidns_info() to display pid and ns values
+ * as seen by the current namespace. On the far left you will see the pid as
+ * seen by the root namespace.
+ */
+
+SEC("kprobe/__netif_receive_skb_core")
+int bpf_prog1(struct pt_regs *ctx)
+{
+	char fmt[] = "nsid:%u, dev: %u,  pid:%u\n";
+	struct bpf_pidns_info nsinfo;
+	int ok = 0;
+
+	ok = bpf_get_current_pidns_info(&nsinfo, sizeof(nsinfo));
+	if (ok == 0)
+		bpf_trace_printk(fmt, sizeof(fmt), (u32)nsinfo.nsid,
+				 (u32) nsinfo.dev, (u32)nsinfo.pid);
+
+	return 0;
+}
+char _license[] SEC("license") = "GPL";
+u32 _version SEC("version") = LINUX_VERSION_CODE;
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 3bd0f4a0336a..1f97b571b581 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
 	test_cgroup_storage test_select_reuseport test_section_names \
 	test_netcnt test_tcpnotify_user test_sock_fields test_sysctl test_hashmap \
 	test_btf_dump test_cgroup_attach xdping test_sockopt test_sockopt_sk \
-	test_sockopt_multi test_tcp_rtt
+	test_sockopt_multi test_tcp_rtt test_pidns
 
 BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
 TEST_GEN_FILES = $(BPF_OBJ_FILES)
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h
index 8b503ea142f0..3fae3b9fcd2c 100644
--- a/tools/testing/selftests/bpf/bpf_helpers.h
+++ b/tools/testing/selftests/bpf/bpf_helpers.h
@@ -231,6 +231,9 @@ static int (*bpf_send_signal)(unsigned sig) = (void *)BPF_FUNC_send_signal;
 static long long (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *ip,
 					  int ip_len, void *tcp, int tcp_len) =
 	(void *) BPF_FUNC_tcp_gen_syncookie;
+static int (*bpf_get_current_pidns_info)(struct bpf_pidns_info *buf,
+					 unsigned int buf_size) =
+	(void *) BPF_FUNC_get_current_pidns_info;
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/tools/testing/selftests/bpf/progs/test_pidns_kern.c b/tools/testing/selftests/bpf/progs/test_pidns_kern.c
new file mode 100644
index 000000000000..e1d2facfa762
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_pidns_kern.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include <linux/bpf.h>
+#include <errno.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") nsidmap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u32),
+	.max_entries = 1,
+};
+
+struct bpf_map_def SEC("maps") pidmap = {
+	.type = BPF_MAP_TYPE_ARRAY,
+	.key_size = sizeof(__u32),
+	.value_size = sizeof(__u32),
+	.max_entries = 1,
+};
+
+SEC("tracepoint/syscalls/sys_enter_nanosleep")
+int trace(void *ctx)
+{
+	struct bpf_pidns_info nsinfo;
+	__u32 key = 0, *expected_pid, *val;
+	char fmt[] = "ERROR nspid:%d\n";
+
+	if (bpf_get_current_pidns_info(&nsinfo, sizeof(nsinfo)))
+		return -EINVAL;
+
+	expected_pid = bpf_map_lookup_elem(&pidmap, &key);
+
+
+	if (!expected_pid || *expected_pid != nsinfo.pid)
+		return 0;
+
+	val = bpf_map_lookup_elem(&nsidmap, &key);
+	if (val)
+		*val = nsinfo.nsid;
+
+	return 0;
+}
+
+char _license[] SEC("license") = "GPL";
+__u32 _version SEC("version") = 1;
diff --git a/tools/testing/selftests/bpf/test_pidns.c b/tools/testing/selftests/bpf/test_pidns.c
new file mode 100644
index 000000000000..a7254055f294
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_pidns.c
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <linux/perf_event.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include "cgroup_helpers.h"
+#include "bpf_rlimit.h"
+
+#define CHECK(condition, tag, format...) ({		\
+	int __ret = !!(condition);			\
+	if (__ret) {					\
+		printf("%s:FAIL:%s ", __func__, tag);	\
+		printf(format);				\
+	} else {					\
+		printf("%s:PASS:%s\n", __func__, tag);	\
+	}						\
+	__ret;						\
+})
+
+static int bpf_find_map(const char *test, struct bpf_object *obj,
+			const char *name)
+{
+	struct bpf_map *map;
+
+	map = bpf_object__find_map_by_name(obj, name);
+	if (!map)
+		return -1;
+	return bpf_map__fd(map);
+}
+
+
+int main(int argc, char **argv)
+{
+	const char *probe_name = "syscalls/sys_enter_nanosleep";
+	const char *file = "test_pidns_kern.o";
+	int err, bytes, efd, prog_fd, pmu_fd;
+	int pidmap_fd, nsidmap_fd;
+	struct perf_event_attr attr = {};
+	struct bpf_object *obj;
+	__u32 knsid = 0;
+	__u32 key = 0, pid;
+	int exit_code = 1;
+	struct stat st;
+	char buf[256];
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd);
+	if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
+		goto cleanup_cgroup_env;
+
+	nsidmap_fd = bpf_find_map(__func__, obj, "nsidmap");
+	if (CHECK(nsidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
+		  nsidmap_fd, errno))
+		goto close_prog;
+
+	pidmap_fd = bpf_find_map(__func__, obj, "pidmap");
+	if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
+		  pidmap_fd, errno))
+		goto close_prog;
+
+	pid = getpid();
+	bpf_map_update_elem(pidmap_fd, &key, &pid, 0);
+
+	snprintf(buf, sizeof(buf),
+		 "/sys/kernel/debug/tracing/events/%s/id", probe_name);
+	efd = open(buf, O_RDONLY, 0);
+	if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
+		goto close_prog;
+	bytes = read(efd, buf, sizeof(buf));
+	close(efd);
+	if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
+		  "bytes %d errno %d\n", bytes, errno))
+		goto close_prog;
+
+	attr.config = strtol(buf, NULL, 0);
+	attr.type = PERF_TYPE_TRACEPOINT;
+	attr.sample_type = PERF_SAMPLE_RAW;
+	attr.sample_period = 1;
+	attr.wakeup_events = 1;
+
+	pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, -1, 0);
+	if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", pmu_fd,
+		  errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", err,
+		  errno))
+		goto close_pmu;
+
+	/* trigger some syscalls */
+	sleep(1);
+
+	err = bpf_map_lookup_elem(nsidmap_fd, &key, &knsid);
+	if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", err, errno))
+		goto close_pmu;
+
+	if (stat("/proc/self/ns/pid", &st))
+		goto close_pmu;
+
+	if (CHECK(knsid != (__u32) st.st_ino, "compare_namespace_id",
+		  "kern knsid %u user unsid %u\n", knsid, (__u32) st.st_ino))
+		goto close_pmu;
+
+	exit_code = 0;
+	printf("%s:PASS\n", argv[0]);
+
+close_pmu:
+	close(pmu_fd);
+close_prog:
+	bpf_object__close(obj);
+cleanup_cgroup_env:
+	return exit_code;
+}
-- 
2.11.0

Thanks for your help.

Bests


On Thu, Aug 08, 2019 at 05:17:16PM -0400, Carlos Antonio Neira Bustos wrote:
> Thanks a lot, Yonghong. I'll fix and split up the patch.
> Thanks again for your help.
> 
> Bests
> 
> On Thu, Aug 08, 2019 at 08:47:14PM +0000, Yonghong Song wrote:
> > 
> > 
> > On 8/8/19 1:26 PM, carlos antonio neira bustos wrote:
> > > Hi Yonghong,
> > > 
> > > > Sorry, just to be sure: I’m only missing the error codes from 
> > > > filename_lookup(), right?
> > 
> > > From the kernel functionality point of view, yes, I am talking about
> > > the error codes returned by filename_lookup().
> > > For example, if CONFIG_PID_NS or CONFIG_NAMESPACES is not
> > > defined in the config, the path "/proc/self/ns/pid" will not exist
> > > and an error code will be returned. It may be -ENOTDIR
> > > if CONFIG_NAMESPACES is not defined or -ECHILD if CONFIG_PID_NS
> > > is not defined. Please double check.
> > 
> > Please do follow the advice in
> >  > https://lore.kernel.org/netdev/20190808174848.poybtaagg5ctle7t@dev00/T/#t
> > to break the single patch to multiple patches.
> > 
> > I only reviewed the kernel code. Will review tools/ code
> > in the next properly-formatted (broken-up) commits.
> > 
> > > Also, please cc the commits to the bpf mailing list at
> > bpf@vger.kernel.org
> > 
> > > 
> > > Bests
> > > 
> > > Maybe some other error codes in filename_lookup() function?
> > > 
> > >  > + *
> > > 
> > >  > + *                      If unable to get the inode from 
> > > /proc/self/ns/pid an error code
> > > 
> > >  > + *                      will be returned.
> > > 
> > > *From: *Y Song <mailto:ys114321@gmail.com>
> > > *Sent: *08 August 2019 15:44
> > > *To: *Carlos Antonio Neira Bustos <mailto:cneirabustos@gmail.com>
> > > *Cc: *Yonghong Song <mailto:yhs@fb.com>; netdev@vger.kernel.org 
> > > <mailto:netdev@vger.kernel.org>; ebiederm@xmission.com 
> > > <mailto:ebiederm@xmission.com>; brouer@redhat.com 
> > > <mailto:brouer@redhat.com>; quentin.monnet@netronome.com 
> > > <mailto:quentin.monnet@netronome.com>
> > > *Subject: *Re: [PATCH v5 bpf-next] BPF: helpers: New helper to obtain 
> > > > namespace data from current task
> > > 
> > > On Thu, Aug 8, 2019 at 10:52 AM Carlos Antonio Neira Bustos
> > > 
> > > <cneirabustos@gmail.com> wrote:
> > > 
> > >  >
> > > 
> > >  > Yonghong,
> > > 
> > >  >
> > > 
> > >  > I have modified the patch following your feedback.
> > > 
> > >  > Let me know if I'm missing something.
> > > 
> > > > Yes, I have some other requests about formatting.
> > > 
> > > https://lore.kernel.org/netdev/20190808174848.poybtaagg5ctle7t@dev00/T/#t
> > > 
> > > Could you address it as well?
> > > 
> > >  >
> > > 
> > >  > Bests
> > > 
> > >  >
> > > 
> > >  > From 70f8d5584700c9cfc82c006901d8ee9595c53f15 Mon Sep 17 00:00:00 2001
> > > 
> > >  > From: Carlos <cneirabustos@gmail.com>
> > > 
> > >  > Date: Wed, 7 Aug 2019 20:04:30 -0400
> > > 
> > >  > Subject: [PATCH] [PATCH v6 bpf-next] BPF: New helper to obtain 
> > > namespace data
> > > 
> > >  >  from current task
> > > 
> > >  >
> > > 
> > >  > This helper obtains the active namespace from current and returns 
> > > pid, tgid,
> > > 
> > >  > device and namespace id as seen from that namespace, allowing to 
> > > instrument
> > > 
> > >  > a process inside a container.
> > > 
> > >  > Device is read from /proc/self/ns/pid, as in the future it's possible 
> > > that
> > > 
> > >  > different pid_ns files may belong to different devices, according
> > > 
> > >  > to the discussion between Eric Biederman and Yonghong in 2017 linux 
> > > plumbers
> > > 
> > >  > conference.
> > > 
> > >  > Currently bpf_get_current_pid_tgid(), is used to do pid filtering in 
> > > bcc's
> > > 
> > >  > scripts but this helper returns the pid as seen by the root namespace 
> > > which is
> > > 
> > >  > fine when a bcc script is not executed inside a container.
> > > 
> > >  > When the process of interest is inside a container, pid filtering 
> > > will not work
> > > 
> > >  > if bpf_get_current_pid_tgid() is used. This helper addresses this 
> > > limitation
> > > 
> > >  > returning the pid as it's seen by the current namespace where the 
> > > script is
> > > 
> > >  > executing.
> > > 
> > >  >
> > > 
> > >  > This helper has the same use cases as bpf_get_current_pid_tgid() as 
> > > it can be
> > > 
> > >  > used to do pid filtering even inside a container.
> > > 
> > >  >
> > > 
> > >  > For example a bcc script using bpf_get_current_pid_tgid() 
> > > (tools/funccount.py):
> > > 
> > >  >
> > > 
> > >  >         u32 pid = bpf_get_current_pid_tgid() >> 32;
> > > 
> > >  >         if (pid != <pid_arg_passed_in>)
> > > 
> > >  >                 return 0;
> > > 
> > >  > Could be modified to use bpf_get_current_pidns_info() as follows:
> > > 
> > >  >
> > > 
> > >  >         struct bpf_pidns pidns;
> > > 
> > >  >         bpf_get_current_pidns_info(&pidns, sizeof(struct bpf_pidns));
> > > 
> > >  >         u32 pid = pidns.tgid;
> > > 
> > >  >         u32 nsid = pidns.nsid;
> > > 
> > >  >         if ((pid != <pid_arg_passed_in>) && (nsid != 
> > > <nsid_arg_passed_in>))
> > > 
> > >  >                 return 0;
> > > 
> > >  >
> > > 
> > > >  > To find out the PID namespace id of a process, you could use 
> > > this command:
> > > 
> > >  >
> > > 
> > >  > $ ps -h -o pidns -p <pid_of_interest>
> > > 
> > >  >
> > > 
> > >  > Or this other command:
> > > 
> > >  >
> > > 
> > >  > $ ls -Li /proc/<pid_of_interest>/ns/pid
> > > 
> > >  >
> > > 
> > >  > Signed-off-by: Carlos Neira <cneirabustos@gmail.com>
> > > 
> > >  > ---
> > > 
> > >  >  fs/internal.h                                      |   2 -
> > > 
> > >  >  fs/namei.c                                         |   1 -
> > > 
> > >  >  include/linux/bpf.h                                |   1 +
> > > 
> > >  >  include/linux/namei.h                              |   4 +
> > > 
> > >  >  include/uapi/linux/bpf.h                           |  27 +++-
> > > 
> > >  >  kernel/bpf/core.c                                  |   1 +
> > > 
> > >  >  kernel/bpf/helpers.c                               |  64 ++++++++++
> > > 
> > >  >  kernel/trace/bpf_trace.c                           |   2 +
> > > 
> > >  >  samples/bpf/Makefile                               |   3 +
> > > 
> > >  >  samples/bpf/trace_ns_info_user.c                   |  35 ++++++
> > > 
> > >  >  samples/bpf/trace_ns_info_user_kern.c              |  44 +++++++
> > > 
> > >  >  tools/include/uapi/linux/bpf.h                     |  27 +++-
> > > 
> > >  >  tools/testing/selftests/bpf/Makefile               |   2 +-
> > > 
> > >  >  tools/testing/selftests/bpf/bpf_helpers.h          |   3 +
> > > 
> > >  >  .../testing/selftests/bpf/progs/test_pidns_kern.c  |  51 ++++++++
> > > 
> > >  >  tools/testing/selftests/bpf/test_pidns.c           | 138 
> > > +++++++++++++++++++++
> > > 
> > >  >  16 files changed, 399 insertions(+), 6 deletions(-)
> > > 
> > >  >  create mode 100644 samples/bpf/trace_ns_info_user.c
> > > 
> > >  >  create mode 100644 samples/bpf/trace_ns_info_user_kern.c
> > > 
> > >  >  create mode 100644 tools/testing/selftests/bpf/progs/test_pidns_kern.c
> > > 
> > >  >  create mode 100644 tools/testing/selftests/bpf/test_pidns.c
> > > 
> > >  >
> > > 
> > >  > diff --git a/fs/internal.h b/fs/internal.h
> > > 
> > >  > index 315fcd8d237c..6647e15dd419 100644
> > > 
> > >  > --- a/fs/internal.h
> > > 
> > >  > +++ b/fs/internal.h
> > > 
> > >  > @@ -59,8 +59,6 @@ extern int finish_clean_context(struct fs_context *fc);
> > > 
> > >  >  /*
> > > 
> > >  >   * namei.c
> > > 
> > >  >   */
> > > 
> > >  > -extern int filename_lookup(int dfd, struct filename *name, unsigned 
> > > flags,
> > > 
> > >  > -                          struct path *path, struct path *root);
> > > 
> > >  >  extern int user_path_mountpoint_at(int, const char __user *, 
> > > unsigned int, struct path *);
> > > 
> > >  >  extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
> > > 
> > >  >                            const char *, unsigned int, struct path *);
> > > 
> > >  > diff --git a/fs/namei.c b/fs/namei.c
> > > 
> > >  > index 209c51a5226c..a89fc72a4a10 100644
> > > 
> > >  > --- a/fs/namei.c
> > > 
> > >  > +++ b/fs/namei.c
> > > 
> > >  > @@ -19,7 +19,6 @@
> > > 
> > >  >  #include <linux/export.h>
> > > 
> > >  >  #include <linux/kernel.h>
> > > 
> > >  >  #include <linux/slab.h>
> > > 
> > >  > -#include <linux/fs.h>
> > > 
> > >  >  #include <linux/namei.h>
> > > 
> > >  >  #include <linux/pagemap.h>
> > > 
> > >  >  #include <linux/fsnotify.h>
> > > 
> > >  > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > 
> > >  > index f9a506147c8a..e4adf5e05afd 100644
> > > 
> > >  > --- a/include/linux/bpf.h
> > > 
> > >  > +++ b/include/linux/bpf.h
> > > 
> > >  > @@ -1050,6 +1050,7 @@ extern const struct bpf_func_proto 
> > > bpf_get_local_storage_proto;
> > > 
> > >  >  extern const struct bpf_func_proto bpf_strtol_proto;
> > > 
> > >  >  extern const struct bpf_func_proto bpf_strtoul_proto;
> > > 
> > >  >  extern const struct bpf_func_proto bpf_tcp_sock_proto;
> > > 
> > >  > +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;
> > > 
> > >  >
> > > 
> > >  >  /* Shared helpers among cBPF and eBPF. */
> > > 
> > >  >  void bpf_user_rnd_init_once(void);
> > > 
> > >  > diff --git a/include/linux/namei.h b/include/linux/namei.h
> > > 
> > >  > index 9138b4471dbf..b45c8b6f7cb4 100644
> > > 
> > >  > --- a/include/linux/namei.h
> > > 
> > >  > +++ b/include/linux/namei.h
> > > 
> > >  > @@ -6,6 +6,7 @@
> > > 
> > >  >  #include <linux/path.h>
> > > 
> > >  >  #include <linux/fcntl.h>
> > > 
> > >  >  #include <linux/errno.h>
> > > 
> > >  > +#include <linux/fs.h>
> > > 
> > >  >
> > > 
> > >  >  enum { MAX_NESTED_LINKS = 8 };
> > > 
> > >  >
> > > 
> > >  > @@ -97,6 +98,9 @@ extern void unlock_rename(struct dentry *, struct 
> > > dentry *);
> > > 
> > >  >
> > > 
> > >  >  extern void nd_jump_link(struct path *path);
> > > 
> > >  >
> > > 
> > >  > +extern int filename_lookup(int dfd, struct filename *name, unsigned 
> > > flags,
> > > 
> > >  > +                          struct path *path, struct path *root);
> > > 
> > >  > +
> > > 
> > >  >  static inline void nd_terminate_link(void *name, size_t len, size_t 
> > > maxlen)
> > > 
> > >  >  {
> > > 
> > >  >         ((char *) name)[min(len, maxlen)] = '\0';
> > > 
> > >  > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > 
> > >  > index 4393bd4b2419..b0d4869fb860 100644
> > > 
> > >  > --- a/include/uapi/linux/bpf.h
> > > 
> > >  > +++ b/include/uapi/linux/bpf.h
> > > 
> > >  > @@ -2741,6 +2741,24 @@ union bpf_attr {
> > > 
> > >  >   *             **-EOPNOTSUPP** kernel configuration does not enable 
> > > SYN cookies
> > > 
> > >  >   *
> > > 
> > >  >   *             **-EPROTONOSUPPORT** IP packet version is not 4 or 6
> > > 
> > >  > + *
> > > 
> > >  > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 
> > > size_of_pidns)
> > > 
> > >  > + *     Description
> > > 
> > >  > + *             Copies into *pidns* pid, namespace id and tgid as 
> > > seen by the
> > > 
> > >  > + *             current namespace and also device from /proc/self/ns/pid.
> > > 
> > >  > + *             *size_of_pidns* must be the size of *pidns*
> > > 
> > >  > + *
> > > 
> > >  > + *             This helper is used when pid filtering is needed inside a
> > > 
> > >  > + *             container as bpf_get_current_tgid() helper returns 
> > > always the
> > > 
> > >  > + *             pid id as seen by the root namespace.
> > > 
> > >  > + *     Return
> > > 
> > >  > + *             0 on success
> > > 
> > >  > + *
> > > 
> > >  > + *             **-EINVAL** if *size_of_pidns* is not valid or unable 
> > > to get ns, pid
> > > 
> > >  > + *             or tgid of the current task.
> > > 
> > >  > + *
> > > 
> > >  > + *             **-ENOMEM**  if allocation fails.
> > > 
> > >  > + *
> > > 
> > >  >   */
> > > 
> > >  >  #define __BPF_FUNC_MAPPER(FN)          \
> > > 
> > >  >         FN(unspec),                     \
> > > 
> > >  > @@ -2853,7 +2871,8 @@ union bpf_attr {
> > > 
> > >  >         FN(sk_storage_get),             \
> > > 
> > >  >         FN(sk_storage_delete),          \
> > > 
> > >  >         FN(send_signal),                \
> > > 
> > >  > -       FN(tcp_gen_syncookie),
> > > 
> > >  > +       FN(tcp_gen_syncookie),          \
> > > 
> > >  > +       FN(get_current_pidns_info),
> > > 
> > >  >
> > > 
> > >  >  /* integer value in 'imm' field of BPF_CALL instruction selects 
> > > which helper
> > > 
> > >  >   * function eBPF program intends to call
> > > 
> > >  > @@ -3604,4 +3623,10 @@ struct bpf_sockopt {
> > > 
> > >  >         __s32   retval;
> > > 
> > >  >  };
> > > 
> > >  >
> > > 
> > >  > +struct bpf_pidns_info {
> > > 
> > >  > +       __u32 dev;
> > > 
> > >  > +       __u32 nsid;
> > > 
> > >  > +       __u32 tgid;
> > > 
> > >  > +       __u32 pid;
> > > 
> > >  > +};
> > > 
> > >  >  #endif /* _UAPI__LINUX_BPF_H__ */
> > > 
> > >  > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> > > 
> > >  > index 8191a7db2777..3159f2a0188c 100644
> > > 
> > >  > --- a/kernel/bpf/core.c
> > > 
> > >  > +++ b/kernel/bpf/core.c
> > > 
> > >  > @@ -2038,6 +2038,7 @@ const struct bpf_func_proto 
> > > bpf_get_current_uid_gid_proto __weak;
> > > 
> > >  >  const struct bpf_func_proto bpf_get_current_comm_proto __weak;
> > > 
> > >  >  const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
> > > 
> > >  >  const struct bpf_func_proto bpf_get_local_storage_proto __weak;
> > > 
> > >  > +const struct bpf_func_proto bpf_get_current_pidns_info __weak;
> > > 
> > >  >
> > > 
> > >  >  const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
> > > 
> > >  >  {
> > > 
> > >  > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > > 
> > >  > index 5e28718928ca..41fbf1f28a48 100644
> > > 
> > >  > --- a/kernel/bpf/helpers.c
> > > 
> > >  > +++ b/kernel/bpf/helpers.c
> > > 
> > >  > @@ -11,6 +11,12 @@
> > > 
> > >  >  #include <linux/uidgid.h>
> > > 
> > >  >  #include <linux/filter.h>
> > > 
> > >  >  #include <linux/ctype.h>
> > > 
> > >  > +#include <linux/pid_namespace.h>
> > > 
> > >  > +#include <linux/major.h>
> > > 
> > >  > +#include <linux/stat.h>
> > > 
> > >  > +#include <linux/namei.h>
> > > 
> > >  > +#include <linux/version.h>
> > > 
> > >  > +
> > > 
> > >  >
> > > 
> > >  >  #include "../../lib/kstrtox.h"
> > > 
> > >  >
> > > 
> > >  > @@ -312,6 +318,64 @@ void copy_map_value_locked(struct bpf_map *map, 
> > > void *dst, void *src,
> > > 
> > >  >         preempt_enable();
> > > 
> > >  >  }
> > > 
> > >  >
> > > 
> > >  > +BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, 
> > > pidns_info, u32,
> > > 
> > >  > +        size)
> > > 
> > >  > +{
> > > 
> > >  > +       const char *pidns_path = "/proc/self/ns/pid";
> > > 
> > >  > +       struct pid_namespace *pidns = NULL;
> > > 
> > >  > +       struct filename *tmp = NULL;
> > > 
> > >  > +       struct inode *inode;
> > > 
> > >  > +       struct path kp;
> > > 
> > >  > +       pid_t tgid = 0;
> > > 
> > >  > +       pid_t pid = 0;
> > > 
> > >  > +       int ret;
> > > 
> > >  > +       int len;
> > > 
> > >  > +
> > > 
> > >  > +       if (unlikely(size != sizeof(struct bpf_pidns_info)))
> > > 
> > >  > +               return -EINVAL;
> > > 
> > >  > +       pidns = task_active_pid_ns(current);
> > > 
> > >  > +       if (unlikely(!pidns))
> > > 
> > >  > +               goto clear;
> > > 
> > >  > +       pidns_info->nsid =  pidns->ns.inum;
> > > 
> > >  > +       pid = task_pid_nr_ns(current, pidns);
> > > 
> > >  > +       if (unlikely(!pid))
> > > 
> > >  > +               goto clear;
> > > 
> > >  > +       tgid = task_tgid_nr_ns(current, pidns);
> > > 
> > >  > +       if (unlikely(!tgid))
> > > 
> > >  > +               goto clear;
> > > 
> > >  > +       pidns_info->tgid = (u32) tgid;
> > > 
> > >  > +       pidns_info->pid = (u32) pid;
> > > 
> > >  > +       tmp = kmem_cache_alloc(names_cachep, GFP_ATOMIC);
> > > 
> > >  > +       if (unlikely(!tmp)) {
> > > 
> > >  > +               memset((void *)pidns_info, 0, (size_t) size);
> > > 
> > >  > +               return -ENOMEM;
> > > 
> > >  > +       }
> > > 
> > >  > +       len = strlen(pidns_path) + 1;
> > > 
> > >  > +       memcpy((char *)tmp->name, pidns_path, len);
> > > 
> > >  > +       tmp->uptr = NULL;
> > > 
> > >  > +       tmp->aname = NULL;
> > > 
> > >  > +       tmp->refcnt = 1;
> > > 
> > >  > +       ret = filename_lookup(AT_FDCWD, tmp, 0, &kp, NULL);
> > > 
> > >  > +       if (ret) {
> > > 
> > >  > +               memset((void *)pidns_info, 0, (size_t) size);
> > > 
> > >  > +               return ret;
> > > 
> > >  > +       }
> > > 
> > >  > +       inode = d_backing_inode(kp.dentry);
> > > 
> > >  > +       pidns_info->dev = inode->i_sb->s_dev;
> > > 
> > >  > +       return 0;
> > > 
> > >  > +clear:
> > > 
> > >  > +       memset((void *)pidns_info, 0, (size_t) size);
> > > 
> > >  > +       return -EINVAL;
> > > 
> > >  > +}
> > > 
> > >  > +
> > > 
> > >  > +const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
> > > 
> > >  > +       .func           = bpf_get_current_pidns_info,
> > > 
> > >  > +       .gpl_only       = false,
> > > 
> > >  > +       .ret_type       = RET_INTEGER,
> > > 
> > >  > +       .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
> > > 
> > >  > +       .arg2_type      = ARG_CONST_SIZE,
> > > 
> > >  > +};
> > > 
> > >  > +
> > > 
> > >  >  #ifdef CONFIG_CGROUPS
> > > 
> > >  >  BPF_CALL_0(bpf_get_current_cgroup_id)
> > > 
> > >  >  {
> > > 
> > >  > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > > 
> > >  > index ca1255d14576..5e1dc22765a5 100644
> > > 
> > >  > --- a/kernel/trace/bpf_trace.c
> > > 
> > >  > +++ b/kernel/trace/bpf_trace.c
> > > 
> > >  > @@ -709,6 +709,8 @@ tracing_func_proto(enum bpf_func_id func_id, 
> > > const struct bpf_prog *prog)
> > > 
> > >  >  #endif
> > > 
> > >  >         case BPF_FUNC_send_signal:
> > > 
> > >  >                 return &bpf_send_signal_proto;
> > > 
> > >  > +       case BPF_FUNC_get_current_pidns_info:
> > > 
> > >  > +               return &bpf_get_current_pidns_info_proto;
> > > 
> > >  >         default:
> > > 
> > >  >                 return NULL;
> > > 
> > >  >         }
> > > 
> > >  > diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> > > 
> > >  > index 1d9be26b4edd..238453ff27d2 100644
> > > 
> > >  > --- a/samples/bpf/Makefile
> > > 
> > >  > +++ b/samples/bpf/Makefile
> > > 
> > >  > @@ -53,6 +53,7 @@ hostprogs-y += task_fd_query
> > > 
> > >  >  hostprogs-y += xdp_sample_pkts
> > > 
> > >  >  hostprogs-y += ibumad
> > > 
> > >  >  hostprogs-y += hbm
> > > 
> > >  > +hostprogs-y += trace_ns_info
> > > 
> > >  >
> > > 
> > >  >  # Libbpf dependencies
> > > 
> > >  >  LIBBPF = $(TOOLS_PATH)/lib/bpf/libbpf.a
> > > 
> > >  > @@ -109,6 +110,7 @@ task_fd_query-objs := bpf_load.o 
> > > task_fd_query_user.o $(TRACE_HELPERS)
> > > 
> > >  >  xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
> > > 
> > >  >  ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
> > > 
> > >  >  hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
> > > 
> > >  > +trace_ns_info-objs := bpf_load.o trace_ns_info_user.o
> > > 
> > >  >
> > > 
> > >  >  # Tell kbuild to always build the programs
> > > 
> > >  >  always := $(hostprogs-y)
> > > 
> > >  > @@ -170,6 +172,7 @@ always += xdp_sample_pkts_kern.o
> > > 
> > >  >  always += ibumad_kern.o
> > > 
> > >  >  always += hbm_out_kern.o
> > > 
> > >  >  always += hbm_edt_kern.o
> > > 
> > >  > +always += trace_ns_info_user_kern.o
> > > 
> > >  >
> > > 
> > >  >  KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
> > > 
> > >  >  KBUILD_HOSTCFLAGS += -I$(srctree)/tools/lib/bpf/
> > > 
> > >  > diff --git a/samples/bpf/trace_ns_info_user.c 
> > > b/samples/bpf/trace_ns_info_user.c
> > > 
> > >  > new file mode 100644
> > > 
> > >  > index 000000000000..e06d08db6f30
> > > 
> > >  > --- /dev/null
> > > 
> > >  > +++ b/samples/bpf/trace_ns_info_user.c
> > > 
> > >  > @@ -0,0 +1,35 @@
> > > 
> > >  > +// SPDX-License-Identifier: GPL-2.0
> > > 
> > >  > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> > > 
> > >  > + *
> > > 
> > >  > + * This program is free software; you can redistribute it and/or
> > > 
> > >  > + * modify it under the terms of version 2 of the GNU General Public
> > > 
> > >  > + * License as published by the Free Software Foundation.
> > > 
> > >  > + */
> > > 
> > >  > +
> > > 
> > >  > +#include <stdio.h>
> > > 
> > >  > +#include <linux/bpf.h>
> > > 
> > >  > +#include <unistd.h>
> > > 
> > >  > +#include "bpf/libbpf.h"
> > > 
> > >  > +#include "bpf_load.h"
> > > 
> > >  > +
> > > 
> > >  > +/* This code was taken verbatim from tracex1_user.c, it's used
> > > 
> > >  > + * to exercise bpf_get_current_pidns_info() helper call.
> > > 
> > >  > + */
> > > 
> > >  > +int main(int ac, char **argv)
> > > 
> > >  > +{
> > > 
> > >  > +       FILE *f;
> > > 
> > >  > +       char filename[256];
> > > 
> > >  > +
> > > 
> > >  > +       snprintf(filename, sizeof(filename), "%s_user_kern.o", argv[0]);
> > > 
> > >  > +       printf("loading %s\n", filename);
> > > 
> > >  > +
> > > 
> > >  > +       if (load_bpf_file(filename)) {
> > > 
> > >  > +               printf("%s", bpf_log_buf);
> > > 
> > >  > +               return 1;
> > > 
> > >  > +       }
> > > 
> > >  > +
> > > 
> > >  > +       f = popen("taskset 1 ping  localhost", "r");
> > > 
> > >  > +       (void) f;
> > > 
> > >  > +       read_trace_pipe();
> > > 
> > >  > +       return 0;
> > > 
> > >  > +}
> > > 
> > >  > diff --git a/samples/bpf/trace_ns_info_user_kern.c 
> > > b/samples/bpf/trace_ns_info_user_kern.c
> > > 
> > >  > new file mode 100644
> > > 
> > >  > index 000000000000..96675e02b707
> > > 
> > >  > --- /dev/null
> > > 
> > >  > +++ b/samples/bpf/trace_ns_info_user_kern.c
> > > 
> > >  > @@ -0,0 +1,44 @@
> > > 
> > >  > +// SPDX-License-Identifier: GPL-2.0
> > > 
> > >  > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> > > 
> > >  > + *
> > > 
> > >  > + * This program is free software; you can redistribute it and/or
> > > 
> > >  > + * modify it under the terms of version 2 of the GNU General Public
> > > 
> > >  > + * License as published by the Free Software Foundation.
> > > 
> > >  > + */
> > > 
> > >  > +#include <linux/skbuff.h>
> > > 
> > >  > +#include <linux/netdevice.h>
> > > 
> > >  > +#include <linux/version.h>
> > > 
> > >  > +#include <uapi/linux/bpf.h>
> > > 
> > >  > +#include "bpf_helpers.h"
> > > 
> > >  > +
> > > 
> > >  > +typedef __u64 u64;
> > > 
> > >  > +typedef __u32 u32;
> > > 
> > >  > +
> > > 
> > >  > +
> > > 
> > >  > +/* kprobe is NOT a stable ABI
> > > 
> > >  > + * kernel functions can be removed, renamed or completely change 
> > > semantics.
> > > 
> > >  > + * Number of arguments and their positions can change, etc.
> > > 
> > >  > + * In such case this bpf+kprobe example will no longer be meaningful
> > > 
> > >  > + */
> > > 
> > >  > +
> > > 
> > >  > +/* This will call bpf_get_current_pidns_info() to display pid and ns 
> > > values
> > > 
> > >  > + * as seen by the current namespace, on the far left you will see 
> > > the pid as
> > > 
> > >  > + * seen by the root namespace.
> > > 
> > >  > + */
> > > 
> > >  > +
> > > 
> > >  > +SEC("kprobe/__netif_receive_skb_core")
> > > 
> > >  > +int bpf_prog1(struct pt_regs *ctx)
> > > 
> > >  > +{
> > > 
> > >  > +       char fmt[] = "nsid:%u, dev: %u,  pid:%u\n";
> > > 
> > >  > +       struct bpf_pidns_info nsinfo;
> > > 
> > >  > +       int ok = 0;
> > > 
> > >  > +
> > > 
> > >  > +       ok = bpf_get_current_pidns_info(&nsinfo, sizeof(nsinfo));
> > > 
> > >  > +       if (ok == 0)
> > > 
> > >  > +               bpf_trace_printk(fmt, sizeof(fmt), (u32)nsinfo.nsid,
> > > 
> > >  > +                                (u32) nsinfo.dev, (u32)nsinfo.pid);
> > > 
> > >  > +
> > > 
> > >  > +       return 0;
> > > 
> > >  > +}
> > > 
> > >  > +char _license[] SEC("license") = "GPL";
> > > 
> > >  > +u32 _version SEC("version") = LINUX_VERSION_CODE;
> > > 
> > >  > diff --git a/tools/include/uapi/linux/bpf.h 
> > > b/tools/include/uapi/linux/bpf.h
> > > 
> > >  > index 4393bd4b2419..b0d4869fb860 100644
> > > 
> > >  > --- a/tools/include/uapi/linux/bpf.h
> > > 
> > >  > +++ b/tools/include/uapi/linux/bpf.h
> > > 
> > >  > @@ -2741,6 +2741,24 @@ union bpf_attr {
> > > 
> > >  >   *             **-EOPNOTSUPP** kernel configuration does not enable 
> > > SYN cookies
> > > 
> > >  >   *
> > > 
> > >  >   *             **-EPROTONOSUPPORT** IP packet version is not 4 or 6
> > > 
> > >  > + *
> > > 
> > >  > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, u32 
> > > size_of_pidns)
> > > 
> > >  > + *     Description
> > > 
> > >  > + *             Copies into *pidns* pid, namespace id and tgid as 
> > > seen by the
> > > 
> > >  > + *             current namespace and also device from /proc/self/ns/pid.
> > > 
> > >  > + *             *size_of_pidns* must be the size of *pidns*
> > > 
> > >  > + *
> > > 
> > >  > + *             This helper is used when pid filtering is needed inside a
> > > 
> > >  > + *             container as bpf_get_current_pid_tgid() helper always 
> > > returns the
> > > 
> > >  > + *             pid as seen by the root namespace.
> > > 
> > >  > + *     Return
> > > 
> > >  > + *             0 on success
> > > 
> > >  > + *
> > > 
> > >  > + *             **-EINVAL** if *size_of_pidns* is not valid or unable 
> > > to get ns, pid
> > > 
> > >  > + *             or tgid of the current task.
> > > 
> > >  > + *
> > > 
> > >  > + *             **-ENOMEM**  if allocation fails.
> > > 
> > >  > + *
> > > 
> > >  >   */
> > > 
> > >  >  #define __BPF_FUNC_MAPPER(FN)          \
> > > 
> > >  >         FN(unspec),                     \
> > > 
> > >  > @@ -2853,7 +2871,8 @@ union bpf_attr {
> > > 
> > >  >         FN(sk_storage_get),             \
> > > 
> > >  >         FN(sk_storage_delete),          \
> > > 
> > >  >         FN(send_signal),                \
> > > 
> > >  > -       FN(tcp_gen_syncookie),
> > > 
> > >  > +       FN(tcp_gen_syncookie),          \
> > > 
> > >  > +       FN(get_current_pidns_info),
> > > 
> > >  >
> > > 
> > >  >  /* integer value in 'imm' field of BPF_CALL instruction selects 
> > > which helper
> > > 
> > >  >   * function eBPF program intends to call
> > > 
> > >  > @@ -3604,4 +3623,10 @@ struct bpf_sockopt {
> > > 
> > >  >         __s32   retval;
> > > 
> > >  >  };
> > > 
> > >  >
> > > 
> > >  > +struct bpf_pidns_info {
> > > 
> > >  > +       __u32 dev;
> > > 
> > >  > +       __u32 nsid;
> > > 
> > >  > +       __u32 tgid;
> > > 
> > >  > +       __u32 pid;
> > > 
> > >  > +};
> > > 
> > >  >  #endif /* _UAPI__LINUX_BPF_H__ */
> > > 
> > >  > diff --git a/tools/testing/selftests/bpf/Makefile 
> > > b/tools/testing/selftests/bpf/Makefile
> > > 
> > >  > index 3bd0f4a0336a..1f97b571b581 100644
> > > 
> > >  > --- a/tools/testing/selftests/bpf/Makefile
> > > 
> > >  > +++ b/tools/testing/selftests/bpf/Makefile
> > > 
> > >  > @@ -29,7 +29,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps 
> > > test_lru_map test_lpm_map test
> > > 
> > >  >         test_cgroup_storage test_select_reuseport test_section_names \
> > > 
> > >  >         test_netcnt test_tcpnotify_user test_sock_fields test_sysctl 
> > > test_hashmap \
> > > 
> > >  >         test_btf_dump test_cgroup_attach xdping test_sockopt 
> > > test_sockopt_sk \
> > > 
> > >  > -       test_sockopt_multi test_tcp_rtt
> > > 
> > >  > +       test_sockopt_multi test_tcp_rtt test_pidns
> > > 
> > >  >
> > > 
> > >  >  BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
> > > 
> > >  >  TEST_GEN_FILES = $(BPF_OBJ_FILES)
> > > 
> > >  > diff --git a/tools/testing/selftests/bpf/bpf_helpers.h 
> > > b/tools/testing/selftests/bpf/bpf_helpers.h
> > > 
> > >  > index 120aa86c58d3..c96795a9d983 100644
> > > 
> > >  > --- a/tools/testing/selftests/bpf/bpf_helpers.h
> > > 
> > >  > +++ b/tools/testing/selftests/bpf/bpf_helpers.h
> > > 
> > >  > @@ -231,6 +231,9 @@ static int (*bpf_send_signal)(unsigned sig) = 
> > > (void *)BPF_FUNC_send_signal;
> > > 
> > >  >  static long long (*bpf_tcp_gen_syncookie)(struct bpf_sock *sk, void *ip,
> > > 
> > >  >                                           int ip_len, void *tcp, int 
> > > tcp_len) =
> > > 
> > >  >         (void *) BPF_FUNC_tcp_gen_syncookie;
> > > 
> > >  > +static int (*bpf_get_current_pidns_info)(struct bpf_pidns_info *buf,
> > > 
> > >  > +                                        unsigned int buf_size) =
> > > 
> > >  > +       (void *) BPF_FUNC_get_current_pidns_info;
> > > 
> > >  >
> > > 
> > >  >  /* llvm builtin functions that eBPF C program may use to
> > > 
> > >  >   * emit BPF_LD_ABS and BPF_LD_IND instructions
> > > 
> > >  > diff --git a/tools/testing/selftests/bpf/progs/test_pidns_kern.c 
> > > b/tools/testing/selftests/bpf/progs/test_pidns_kern.c
> > > 
> > >  > new file mode 100644
> > > 
> > >  > index 000000000000..e1d2facfa762
> > > 
> > >  > --- /dev/null
> > > 
> > >  > +++ b/tools/testing/selftests/bpf/progs/test_pidns_kern.c
> > > 
> > >  > @@ -0,0 +1,51 @@
> > > 
> > >  > +// SPDX-License-Identifier: GPL-2.0
> > > 
> > >  > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> > > 
> > >  > + *
> > > 
> > >  > + * This program is free software; you can redistribute it and/or
> > > 
> > >  > + * modify it under the terms of version 2 of the GNU General Public
> > > 
> > >  > + * License as published by the Free Software Foundation.
> > > 
> > >  > + */
> > > 
> > >  > +
> > > 
> > >  > +#include <linux/bpf.h>
> > > 
> > >  > +#include <errno.h>
> > > 
> > >  > +#include "bpf_helpers.h"
> > > 
> > >  > +
> > > 
> > >  > +struct bpf_map_def SEC("maps") nsidmap = {
> > > 
> > >  > +       .type = BPF_MAP_TYPE_ARRAY,
> > > 
> > >  > +       .key_size = sizeof(__u32),
> > > 
> > >  > +       .value_size = sizeof(__u32),
> > > 
> > >  > +       .max_entries = 1,
> > > 
> > >  > +};
> > > 
> > >  > +
> > > 
> > >  > +struct bpf_map_def SEC("maps") pidmap = {
> > > 
> > >  > +       .type = BPF_MAP_TYPE_ARRAY,
> > > 
> > >  > +       .key_size = sizeof(__u32),
> > > 
> > >  > +       .value_size = sizeof(__u32),
> > > 
> > >  > +       .max_entries = 1,
> > > 
> > >  > +};
> > > 
> > >  > +
> > > 
> > >  > +SEC("tracepoint/syscalls/sys_enter_nanosleep")
> > > 
> > >  > +int trace(void *ctx)
> > > 
> > >  > +{
> > > 
> > >  > +       struct bpf_pidns_info nsinfo;
> > > 
> > >  > +       __u32 key = 0, *expected_pid, *val;
> > > 
> > >  > +       char fmt[] = "ERROR nspid:%d\n";
> > > 
> > >  > +
> > > 
> > >  > +       if (bpf_get_current_pidns_info(&nsinfo, sizeof(nsinfo)))
> > > 
> > >  > +               return -EINVAL;
> > > 
> > >  > +
> > > 
> > >  > +       expected_pid = bpf_map_lookup_elem(&pidmap, &key);
> > > 
> > >  > +
> > > 
> > >  > +
> > > 
> > >  > +       if (!expected_pid || *expected_pid != nsinfo.pid)
> > > 
> > >  > +               return 0;
> > > 
> > >  > +
> > > 
> > >  > +       val = bpf_map_lookup_elem(&nsidmap, &key);
> > > 
> > >  > +       if (val)
> > > 
> > >  > +               *val = nsinfo.nsid;
> > > 
> > >  > +
> > > 
> > >  > +       return 0;
> > > 
> > >  > +}
> > > 
> > >  > +
> > > 
> > >  > +char _license[] SEC("license") = "GPL";
> > > 
> > >  > +__u32 _version SEC("version") = 1;
> > > 
> > >  > diff --git a/tools/testing/selftests/bpf/test_pidns.c 
> > > b/tools/testing/selftests/bpf/test_pidns.c
> > > 
> > >  > new file mode 100644
> > > 
> > >  > index 000000000000..a7254055f294
> > > 
> > >  > --- /dev/null
> > > 
> > >  > +++ b/tools/testing/selftests/bpf/test_pidns.c
> > > 
> > >  > @@ -0,0 +1,138 @@
> > > 
> > >  > +// SPDX-License-Identifier: GPL-2.0
> > > 
> > >  > +/* Copyright (c) 2018 Carlos Neira cneirabustos@gmail.com
> > > 
> > >  > + *
> > > 
> > >  > + * This program is free software; you can redistribute it and/or
> > > 
> > >  > + * modify it under the terms of version 2 of the GNU General Public
> > > 
> > >  > + * License as published by the Free Software Foundation.
> > > 
> > >  > + */
> > > 
> > >  > +
> > > 
> > >  > +#include <stdio.h>
> > > 
> > >  > +#include <stdlib.h>
> > > 
> > >  > +#include <string.h>
> > > 
> > >  > +#include <errno.h>
> > > 
> > >  > +#include <fcntl.h>
> > > 
> > >  > +#include <syscall.h>
> > > 
> > >  > +#include <unistd.h>
> > > 
> > >  > +#include <linux/perf_event.h>
> > > 
> > >  > +#include <sys/ioctl.h>
> > > 
> > >  > +#include <sys/time.h>
> > > 
> > >  > +#include <sys/types.h>
> > > 
> > >  > +#include <sys/stat.h>
> > > 
> > >  > +
> > > 
> > >  > +#include <linux/bpf.h>
> > > 
> > >  > +#include <bpf/bpf.h>
> > > 
> > >  > +#include <bpf/libbpf.h>
> > > 
> > >  > +
> > > 
> > >  > +#include "cgroup_helpers.h"
> > > 
> > >  > +#include "bpf_rlimit.h"
> > > 
> > >  > +
> > > 
> > >  > +#define CHECK(condition, tag, format...) ({            \
> > > 
> > >  > +       int __ret = !!(condition);                      \
> > > 
> > >  > +       if (__ret) {                                    \
> > > 
> > >  > +               printf("%s:FAIL:%s ", __func__, tag);   \
> > > 
> > >  > +               printf(format);                         \
> > > 
> > >  > +       } else {                                        \
> > > 
> > >  > +               printf("%s:PASS:%s\n", __func__, tag);  \
> > > 
> > >  > +       }                                               \
> > > 
> > >  > +       __ret;                                          \
> > > 
> > >  > +})
> > > 
> > >  > +
> > > 
> > >  > +static int bpf_find_map(const char *test, struct bpf_object *obj,
> > > 
> > >  > +                       const char *name)
> > > 
> > >  > +{
> > > 
> > >  > +       struct bpf_map *map;
> > > 
> > >  > +
> > > 
> > >  > +       map = bpf_object__find_map_by_name(obj, name);
> > > 
> > >  > +       if (!map)
> > > 
> > >  > +               return -1;
> > > 
> > >  > +       return bpf_map__fd(map);
> > > 
> > >  > +}
> > > 
> > >  > +
> > > 
> > >  > +
> > > 
> > >  > +int main(int argc, char **argv)
> > > 
> > >  > +{
> > > 
> > >  > +       const char *probe_name = "syscalls/sys_enter_nanosleep";
> > > 
> > >  > +       const char *file = "test_pidns_kern.o";
> > > 
> > >  > +       int err, bytes, efd, prog_fd, pmu_fd;
> > > 
> > >  > +       int pidmap_fd, nsidmap_fd;
> > > 
> > >  > +       struct perf_event_attr attr = {};
> > > 
> > >  > +       struct bpf_object *obj;
> > > 
> > >  > +       __u32 knsid = 0;
> > > 
> > >  > +       __u32 key = 0, pid;
> > > 
> > >  > +       int exit_code = 1;
> > > 
> > >  > +       struct stat st;
> > > 
> > >  > +       char buf[256];
> > > 
> > >  > +
> > > 
> > >  > +       err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, 
> > > &prog_fd);
> > > 
> > >  > +       if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno))
> > > 
> > >  > +               goto cleanup_cgroup_env;
> > > 
> > >  > +
> > > 
> > >  > +       nsidmap_fd = bpf_find_map(__func__, obj, "nsidmap");
> > > 
> > >  > +       if (CHECK(nsidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
> > > 
> > >  > +                 nsidmap_fd, errno))
> > > 
> > >  > +               goto close_prog;
> > > 
> > >  > +
> > > 
> > >  > +       pidmap_fd = bpf_find_map(__func__, obj, "pidmap");
> > > 
> > >  > +       if (CHECK(pidmap_fd < 0, "bpf_find_map", "err %d errno %d\n",
> > > 
> > >  > +                 pidmap_fd, errno))
> > > 
> > >  > +               goto close_prog;
> > > 
> > >  > +
> > > 
> > >  > +       pid = getpid();
> > > 
> > >  > +       bpf_map_update_elem(pidmap_fd, &key, &pid, 0);
> > > 
> > >  > +
> > > 
> > >  > +       snprintf(buf, sizeof(buf),
> > > 
> > >  > +                "/sys/kernel/debug/tracing/events/%s/id", probe_name);
> > > 
> > >  > +       efd = open(buf, O_RDONLY, 0);
> > > 
> > >  > +       if (CHECK(efd < 0, "open", "err %d errno %d\n", efd, errno))
> > > 
> > >  > +               goto close_prog;
> > > 
> > >  > +       bytes = read(efd, buf, sizeof(buf));
> > > 
> > >  > +       close(efd);
> > > 
> > >  > +       if (CHECK(bytes <= 0 || bytes >= sizeof(buf), "read",
> > > 
> > >  > +                 "bytes %d errno %d\n", bytes, errno))
> > > 
> > >  > +               goto close_prog;
> > > 
> > >  > +
> > > 
> > >  > +       attr.config = strtol(buf, NULL, 0);
> > > 
> > >  > +       attr.type = PERF_TYPE_TRACEPOINT;
> > > 
> > >  > +       attr.sample_type = PERF_SAMPLE_RAW;
> > > 
> > >  > +       attr.sample_period = 1;
> > > 
> > >  > +       attr.wakeup_events = 1;
> > > 
> > >  > +
> > > 
> > >  > +       pmu_fd = syscall(__NR_perf_event_open, &attr, getpid(), -1, 
> > > -1, 0);
> > > 
> > >  > +       if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", 
> > > pmu_fd,
> > > 
> > >  > +                 errno))
> > > 
> > >  > +               goto close_prog;
> > > 
> > >  > +
> > > 
> > >  > +       err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
> > > 
> > >  > +       if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", err,
> > > 
> > >  > +                 errno))
> > > 
> > >  > +               goto close_pmu;
> > > 
> > >  > +
> > > 
> > >  > +       err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
> > > 
> > >  > +       if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", 
> > > err,
> > > 
> > >  > +                 errno))
> > > 
> > >  > +               goto close_pmu;
> > > 
> > >  > +
> > > 
> > >  > +       /* trigger some syscalls */
> > > 
> > >  > +       sleep(1);
> > > 
> > >  > +
> > > 
> > >  > +       err = bpf_map_lookup_elem(nsidmap_fd, &key, &knsid);
> > > 
> > >  > +       if (CHECK(err, "bpf_map_lookup_elem", "err %d errno %d\n", 
> > > err, errno))
> > > 
> > >  > +               goto close_pmu;
> > > 
> > >  > +
> > > 
> > >  > +       if (stat("/proc/self/ns/pid", &st))
> > > 
> > >  > +               goto close_pmu;
> > > 
> > >  > +
> > > 
> > >  > +       if (CHECK(knsid != (__u32) st.st_ino, "compare_namespace_id",
> > > 
> > >  > +                 "kern knsid %u user unsid %u\n", knsid, (__u32) 
> > > st.st_ino))
> > > 
> > >  > +               goto close_pmu;
> > > 
> > >  > +
> > > 
> > >  > +       exit_code = 0;
> > > 
> > >  > +       printf("%s:PASS\n", argv[0]);
> > > 
> > >  > +
> > > 
> > >  > +close_pmu:
> > > 
> > >  > +       close(pmu_fd);
> > > 
> > >  > +close_prog:
> > > 
> > >  > +       bpf_object__close(obj);
> > > 
> > >  > +cleanup_cgroup_env:
> > > 
> > >  > +       return exit_code;
> > > 
> > >  > +}
> > > 
> > >  > --
> > > 
> > >  > 2.11.0
> > > 
> > >  >
> > > 
> > >  >
> > > 
> > >  >
> > > 
> > >  >
> > > 
> > >  >
> > > 
> > >  >
> > > 
> > >  > On Thu, Aug 08, 2019 at 05:09:51AM +0000, Yonghong Song wrote:
> > > 
> > >  > >
> > > 
> > >  > >
> > > 
> > >  > > On 8/7/19 6:22 PM, Carlos Antonio Neira Bustos wrote:
> > > 
> > >  > > > The code has been modified to avoid syscalls that could sleep.
> > > 
> > >  > > > Please let me know if any other modification is needed.
> > > 
> > >  > > >
> > > 
> > >  > > >  From be0384c0fa209a78c1567936e8db4e35b9a7c0f8 Mon Sep 17 
> > > 00:00:00 2001
> > > 
> > >  > > > From: Carlos <cneirabustos@gmail.com>
> > > 
> > >  > > > Date: Wed, 7 Aug 2019 20:04:30 -0400
> > > 
> > >  > > > Subject: [PATCH] [PATCH v5 bpf-next] BPF: New helper to obtain 
> > > namespace data
> > > 
> > >  > > >   from current task
> > > 
> > >  > > >
> > > 
> > >  > > > This helper obtains the active namespace from current and returns 
> > > pid, tgid,
> > > 
> > >  > > > device and namespace id as seen from that namespace, allowing to 
> > > instrument
> > > 
> > >  > > > a process inside a container.
> > > 
> > >  > > > Device is read from /proc/self/ns/pid, as in the future it's 
> > > possible that
> > > 
> > >  > > > different pid_ns files may belong to different devices, according
> > > 
> > >  > > > to the discussion between Eric Biederman and Yonghong in 2017 
> > > linux plumbers
> > > 
> > >  > > > conference.
> > > 
> > >  > > > Currently bpf_get_current_pid_tgid() is used to do pid filtering 
> > > in bcc's
> > > 
> > >  > > > scripts but this helper returns the pid as seen by the root 
> > > namespace which is
> > > 
> > >  > > > fine when a bcc script is not executed inside a container.
> > > 
> > >  > > > When the process of interest is inside a container, pid filtering 
> > > will not work
> > > 
> > >  > > > if bpf_get_current_pid_tgid() is used. This helper addresses this 
> > > limitation
> > > 
> > >  > > > returning the pid as it's seen by the current namespace where the 
> > > script is
> > > 
> > >  > > > executing.
> > > 
> > >  > > >
> > > 
> > >  > > > This helper has the same use cases as bpf_get_current_pid_tgid() 
> > > as it can be
> > > 
> > >  > > > used to do pid filtering even inside a container.
> > > 
> > >  > > >
> > > 
> > >  > > > For example a bcc script using bpf_get_current_pid_tgid() 
> > > (tools/funccount.py):
> > > 
> > >  > > >
> > > 
> > >  > > >          u32 pid = bpf_get_current_pid_tgid() >> 32;
> > > 
> > >  > > >          if (pid != <pid_arg_passed_in>)
> > > 
> > >  > > >                  return 0;
> > > 
> > >  > > > Could be modified to use bpf_get_current_pidns_info() as follows:
> > > 
> > >  > > >
> > > 
> > >  > > >          struct bpf_pidns_info pidns;
> > > 
> > >  > > >          bpf_get_current_pidns_info(&pidns, sizeof(struct 
> > > bpf_pidns_info));
> > > 
> > >  > > >          u32 pid = pidns.tgid;
> > > 
> > >  > > >          u32 nsid = pidns.nsid;
> > > 
> > >  > > >          if ((pid != <pid_arg_passed_in>) && (nsid != 
> > > <nsid_arg_passed_in>))
> > > 
> > >  > > >                  return 0;
> > > 
> > >  > > >
> > > 
> > >  > > > To find out the PID namespace id of a process, you could use 
> > > this command:
> > > 
> > >  > > >
> > > 
> > >  > > > $ ps -h -o pidns -p <pid_of_interest>
> > > 
> > >  > > >
> > > 
> > >  > > > Or this other command:
> > > 
> > >  > > >
> > > 
> > >  > > > $ ls -Li /proc/<pid_of_interest>/ns/pid
> > > 
> > >  > > >
> > > 
> > >  > > > Signed-off-by: Carlos Neira <cneirabustos@gmail.com>
> > > 
> > >  > > > ---
> > > 
> > >  > > >   fs/namei.c                                         |   2 +-
> > > 
> > >  > > >   include/linux/bpf.h                                |   1 +
> > > 
> > >  > > >   include/linux/namei.h                              |   4 +
> > > 
> > >  > > >   include/uapi/linux/bpf.h                           |  29 ++++-
> > > 
> > >  > > >   kernel/bpf/core.c                                  |   1 +
> > > 
> > >  > > >   kernel/bpf/helpers.c                               |  78 
> > > ++++++++++++
> > > 
> > >  > > >   kernel/trace/bpf_trace.c                           |   2 +
> > > 
> > >  > > >   samples/bpf/Makefile                               |   3 +
> > > 
> > >  > > >   samples/bpf/trace_ns_info_user.c                   |  35 ++++++
> > > 
> > >  > > >   samples/bpf/trace_ns_info_user_kern.c              |  44 +++++++
> > > 
> > >  > > >   tools/include/uapi/linux/bpf.h                     |  29 ++++-
> > > 
> > >  > > >   tools/testing/selftests/bpf/Makefile               |   2 +-
> > > 
> > >  > > >   tools/testing/selftests/bpf/bpf_helpers.h          |   3 +
> > > 
> > >  > > >   .../testing/selftests/bpf/progs/test_pidns_kern.c  |  51 ++++++++
> > > 
> > >  > > >   tools/testing/selftests/bpf/test_pidns.c           | 138 
> > > +++++++++++++++++++++
> > > 
> > >  > > >   15 files changed, 418 insertions(+), 4 deletions(-)
> > > 
> > >  > > >   create mode 100644 samples/bpf/trace_ns_info_user.c
> > > 
> > >  > > >   create mode 100644 samples/bpf/trace_ns_info_user_kern.c
> > > 
> > >  > > >   create mode 100644 
> > > tools/testing/selftests/bpf/progs/test_pidns_kern.c
> > > 
> > >  > > >   create mode 100644 tools/testing/selftests/bpf/test_pidns.c
> > > 
> > >  > > >
> > > 
> > >  > > > diff --git a/fs/namei.c b/fs/namei.c
> > > 
> > >  > > > index 209c51a5226c..d1eca36972d2 100644
> > > 
> > >  > > > --- a/fs/namei.c
> > > 
> > >  > > > +++ b/fs/namei.c
> > > 
> > >  > > > @@ -19,7 +19,6 @@
> > > 
> > >  > > >   #include <linux/export.h>
> > > 
> > >  > > >   #include <linux/kernel.h>
> > > 
> > >  > > >   #include <linux/slab.h>
> > > 
> > >  > > > -#include <linux/fs.h>
> > > 
> > >  > > >   #include <linux/namei.h>
> > > 
> > >  > > >   #include <linux/pagemap.h>
> > > 
> > >  > > >   #include <linux/fsnotify.h>
> > > 
> > >  > > > @@ -2355,6 +2354,7 @@ int filename_lookup(int dfd, struct 
> > > filename *name, unsigned flags,
> > > 
> > >  > > >     putname(name);
> > > 
> > >  > > >     return retval;
> > > 
> > >  > > >   }
> > > 
> > >  > > > +EXPORT_SYMBOL(filename_lookup);
> > > 
> > >  > >
> > > 
> > >  > > No need to export symbols. bpf uses it and bpf is in the core, not in
> > > 
> > >  > > modules.
> > > 
> > >  > >
> > > 
> > >  > > >
> > > 
> > >  > > >   /* Returns 0 and nd will be valid on success; Retuns error, 
> > > otherwise. */
> > > 
> > >  > > >   static int path_parentat(struct nameidata *nd, unsigned flags,
> > > 
> > >  > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > > 
> > >  > > > index f9a506147c8a..e4adf5e05afd 100644
> > > 
> > >  > > > --- a/include/linux/bpf.h
> > > 
> > >  > > > +++ b/include/linux/bpf.h
> > > 
> > >  > > > @@ -1050,6 +1050,7 @@ extern const struct bpf_func_proto 
> > > bpf_get_local_storage_proto;
> > > 
> > >  > > >   extern const struct bpf_func_proto bpf_strtol_proto;
> > > 
> > >  > > >   extern const struct bpf_func_proto bpf_strtoul_proto;
> > > 
> > >  > > >   extern const struct bpf_func_proto bpf_tcp_sock_proto;
> > > 
> > >  > > > +extern const struct bpf_func_proto bpf_get_current_pidns_info_proto;
> > > 
> > >  > > >
> > > 
> > >  > > >   /* Shared helpers among cBPF and eBPF. */
> > > 
> > >  > > >   void bpf_user_rnd_init_once(void);
> > > 
> > >  > > > diff --git a/include/linux/namei.h b/include/linux/namei.h
> > > 
> > >  > > > index 9138b4471dbf..2c24e8c71d46 100644
> > > 
> > >  > > > --- a/include/linux/namei.h
> > > 
> > >  > > > +++ b/include/linux/namei.h
> > > 
> > >  > > > @@ -6,6 +6,7 @@
> > > 
> > >  > > >   #include <linux/path.h>
> > > 
> > >  > > >   #include <linux/fcntl.h>
> > > 
> > >  > > >   #include <linux/errno.h>
> > > 
> > >  > > > +#include <linux/fs.h>
> > > 
> > >  > > >
> > > 
> > >  > > >   enum { MAX_NESTED_LINKS = 8 };
> > > 
> > >  > > >
> > > 
> > >  > > > @@ -97,6 +98,9 @@ extern void unlock_rename(struct dentry *, 
> > > struct dentry *);
> > > 
> > >  > > >
> > > 
> > >  > > >   extern void nd_jump_link(struct path *path);
> > > 
> > >  > > >
> > > 
> > >  > > > +extern int filename_lookup(int dfd, struct filename *name, 
> > > unsigned int flags,
> > > 
> > >  > > > +               struct path *path, struct path *root);
> > > 
> > >  > >
> > > 
> > >  > > The previous declaration in fs/internal.h should be removed.
> > > 
> > >  > >
> > > 
> > >  > > > +
> > > 
> > >  > > >   static inline void nd_terminate_link(void *name, size_t len, 
> > > size_t maxlen)
> > > 
> > >  > > >   {
> > > 
> > >  > > >     ((char *) name)[min(len, maxlen)] = '\0';
> > > 
> > >  > > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > > 
> > >  > > > index 4393bd4b2419..6f601f7106e2 100644
> > > 
> > >  > > > --- a/include/uapi/linux/bpf.h
> > > 
> > >  > > > +++ b/include/uapi/linux/bpf.h
> > > 
> > >  > > > @@ -2741,6 +2741,26 @@ union bpf_attr {
> > > 
> > >  > > >    *                **-EOPNOTSUPP** kernel configuration does not 
> > > enable SYN cookies
> > > 
> > >  > > >    *
> > > 
> > >  > > >    *                **-EPROTONOSUPPORT** IP packet version is not 
> > > 4 or 6
> > > 
> > >  > > > + *
> > > 
> > >  > > > + * int bpf_get_current_pidns_info(struct bpf_pidns_info *pidns, 
> > > u32 size_of_pidns)
> > > 
> > >  > > > + * Description
> > > 
> > >  > > > + *         Copies into *pidns* pid, namespace id and tgid as 
> > > seen by the
> > > 
> > >  > > > + *         current namespace and also device from /proc/self/ns/pid.
> > > 
> > >  > > > + *         *size_of_pidns* must be the size of *pidns*
> > > 
> > >  > > > + *
> > > 
> > >  > > > + *         This helper is used when pid filtering is needed inside a
> > > 
> > >  > > > + *         container as bpf_get_current_tgid() helper returns 
> > > always the
> > > 
> > >  > > > + *         pid id as seen by the root namespace.
> > > 
> > >  > > > + * Return
> > > 
> > >  > > > + *         0 on success
> > > 
> > >  > > > + *
> > > 
> > >  > > > + *         **-EINVAL**  if unable to get ns, pid or tgid of 
> > > current task.
> > > 
> > >  > > > + *         Or if size_of_pidns is not valid.
> > > 
> > >  > >
> > > 
> > >  > > Maybe reword by following the code sequence.
> > > 
> > >  > >     if *size_of_pidns* is not valid or unable to get ns, pid or tgid of
> > > 
> > >  > >     the current task.
> > > 
> > >  > >
> > > 
> > >  > > > + *
> > > 
> > >  > > > + *         **-ENOMEM**  if allocation fails.
> > > 
> > >  > >
> > > 
> > >  > > Maybe some other error codes in filename_lookup() function?
> > > 
> > >  > >
> > > 
> > >  > > > + *
> > > 
> > >  > > > + *         If unable to get the inode from /proc/self/ns/pid an 
> > > error code
> > > 
> > >  > > > + *         will be returned.
> > > 
> > >  > >
> > > 
> > >  > > You do not need this. The description of error code cases should 
> > > cover this.
> > > 
> > >  > >
> > > 
> > >  > > >    */
> > > 
> > >  > > >   #define __BPF_FUNC_MAPPER(FN)             \
> > > 
> > >  > > >     FN(unspec),                     \
> > > 
> > >  > > > @@ -2853,7 +2873,8 @@ union bpf_attr {
> > > 
> > >  > > >     FN(sk_storage_get),             \
> > > 
> > >  > > >     FN(sk_storage_delete),          \
> > > 
> > >  > > >     FN(send_signal),                \
> > > 
> > >  > > > -   FN(tcp_gen_syncookie),
> > > 
> > >  > > > +   FN(tcp_gen_syncookie),          \
> > > 
> > >  > > > +   FN(get_current_pidns_info),
> > > 
> > >  > > >
> > > 
> > >  > > >   /* integer value in 'imm' field of BPF_CALL instruction selects 
> > > which helper
> > > 
> > >  > > >    * function eBPF program intends to call
> > > 
> > >  > > > @@ -3604,4 +3625,10 @@ struct bpf_sockopt {
> > > 
> > >  > > >     __s32   retval;
> > > 
> > >  > > >   };
> > > 
> > >  > > >
> > > 
> > >  > > > +struct bpf_pidns_info {
> > > 
> > >  > > > +   __u32 dev;
> > > 
> > >  > > > +   __u32 nsid;
> > > 
> > >  > > > +   __u32 tgid;
> > > 
> > >  > > > +   __u32 pid;
> > > 
> > >  > > > +};
> > > 
> > >  > > >   #endif /* _UAPI__LINUX_BPF_H__ */
> > > 
> > >  > > > diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> > > 
> > >  > > > index 8191a7db2777..3159f2a0188c 100644
> > > 
> > >  > > > --- a/kernel/bpf/core.c
> > > 
> > >  > > > +++ b/kernel/bpf/core.c
> > > 
> > >  > > > @@ -2038,6 +2038,7 @@ const struct bpf_func_proto 
> > > bpf_get_current_uid_gid_proto __weak;
> > > 
> > >  > > >   const struct bpf_func_proto bpf_get_current_comm_proto __weak;
> > > 
> > >  > > >   const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak;
> > > 
> > >  > > >   const struct bpf_func_proto bpf_get_local_storage_proto __weak;
> > > 
> > >  > > > +const struct bpf_func_proto bpf_get_current_pidns_info __weak;
> > > 
> > >  > > >
> > > 
> > >  > > >   const struct bpf_func_proto * __weak 
> > > bpf_get_trace_printk_proto(void)
> > > 
> > >  > > >   {
> > > 
> > >  > > > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > > 
> > >  > > > index 5e28718928ca..571f24077db2 100644
> > > 
> > >  > > > --- a/kernel/bpf/helpers.c
> > > 
> > >  > > > +++ b/kernel/bpf/helpers.c
> > > 
> > >  > > > @@ -11,6 +11,12 @@
> > > 
> > >  > > >   #include <linux/uidgid.h>
> > > 
> > >  > > >   #include <linux/filter.h>
> > > 
> > >  > > >   #include <linux/ctype.h>
> > > 
> > >  > > > +#include <linux/pid_namespace.h>
> > > 
> > >  > > > +#include <linux/major.h>
> > > 
> > >  > > > +#include <linux/stat.h>
> > > 
> > >  > > > +#include <linux/namei.h>
> > > 
> > >  > > > +#include <linux/version.h>
> > > 
> > >  > > > +
> > > 
> > >  > > >
> > > 
> > >  > > >   #include "../../lib/kstrtox.h"
> > > 
> > >  > > >
> > > 
> > >  > > > @@ -312,6 +318,78 @@ void copy_map_value_locked(struct bpf_map 
> > > *map, void *dst, void *src,
> > > 
> > >  > > >     preempt_enable();
> > > 
> > >  > > >   }
> > > 
> > >  > > >
> > > 
> > >  > > > +BPF_CALL_2(bpf_get_current_pidns_info, struct bpf_pidns_info *, 
> > > pidns_info, u32,
> > > 
> > >  > > > +    size)
> > > 
> > >  > > > +{
> > > 
> > >  > > > +   const char *name = "/proc/self/ns/pid";
> > > 
> > >  > >
> > > 
> > >  > > maybe rename this variable to pidns_path?
> > > 
> > >  > >
> > > 
> > >  > > > +   struct pid_namespace *pidns = NULL;
> > > 
> > >  > > > +   struct filename *tmp = NULL;
> > > 
> > >  > >
> > > 
> > >  > > Maybe rename this variable to name?
> > > 
> > >  > >
> > > 
> > >  > > > +   int len = strlen(name) + 1;
> > > 
> > >  > >
> > > 
> > >  > > We can delay this assignment later until it is needed.
> > > 
> > >  > >
> > > 
> > >  > > > +   struct inode *inode;
> > > 
> > >  > > > +   struct path kp;
> > > 
> > >  > > > +   pid_t tgid = 0;
> > > 
> > >  > > > +   pid_t pid = 0;
> > > 
> > >  > > > +   int ret;
> > > 
> > >  > > > +
> > > 
> > >  > > > +   if (unlikely(size != sizeof(struct bpf_pidns_info)))
> > > 
> > >  > > > +           return -EINVAL;
> > > 
> > >  > > > +
> > > 
> > >  > > > +   pidns = task_active_pid_ns(current);
> > > 
> > >  > > > +
> > > 
> > >  > >
> > > 
> > >  > > we can save an empty line here.
> > > 
> > >  > >
> > > 
> > >  > > > +   if (unlikely(!pidns))
> > > 
> > >  > > > +           goto clear;
> > > 
> > >  > > > +
> > > 
> > >  > > > +   pidns_info->nsid =  pidns->ns.inum;
> > > 
> > >  > > > +   pid = task_pid_nr_ns(current, pidns);
> > > 
> > >  > > > +
> > > 
> > >  > >
> > > 
> > >  > > We can save an empty line here.
> > > 
> > >  > >
> > > 
> > >  > > > +   if (unlikely(!pid))
> > > 
> > >  > > > +           goto clear;
> > > 
> > >  > > > +
> > > 
> > >  > > > +   tgid = task_tgid_nr_ns(current, pidns);
> > > 
> > >  > > > +
> > > 
> > >  > > ditto. save an empty line.
> > > 
> > >  > > > +   if (unlikely(!tgid))
> > > 
> > >  > > > +           goto clear;
> > > 
> > >  > > > +
> > > 
> > >  > > > +   pidns_info->tgid = (u32) tgid;
> > > 
> > >  > > > +   pidns_info->pid = (u32) pid;
> > > 
> > >  > > > +
> > > 
> > >  > > > +   tmp = kmem_cache_alloc(names_cachep, GFP_ATOMIC);
> > > 
> > >  > > > +   if (unlikely(!tmp)) {
> > > 
> > >  > > > +           memset((void *)pidns_info, 0, (size_t) size);
> > > 
> > >  > > > +           return -ENOMEM;
> > > 
> > >  > > > +   }
> > > 
> > >  > > > +
> > > 
> > >  > > > +   memcpy((char *)tmp->name, name, len);
> > > 
> > >  > > > +   tmp->uptr = NULL;
> > > 
> > >  > > > +   tmp->aname = NULL;
> > > 
> > >  > > > +   tmp->refcnt = 1;
> > > 
> > >  > > > +
> > > 
> > >  > > ditto. save an empty line.
> > > 
> > >  > > > +   ret = filename_lookup(AT_FDCWD, tmp, 0, &kp, NULL);
> > > 
> > >  > > > +
> > > 
> > >  > > ditto. save an empty line.
> > > 
> > >  > > > +   if (ret) {
> > > 
> > >  > > > +           memset((void *)pidns_info, 0, (size_t) size);
> > > 
> > >  > > > +           return ret;
> > > 
> > >  > > > +   }
> > > 
> > >  > > > +
> > > 
> > >  > > > +   inode = d_backing_inode(kp.dentry);
> > > 
> > >  > > > +   pidns_info->dev = inode->i_sb->s_dev;
> > > 
> > >  > > > +
> > > 
> > >  > > > +   return 0;
> > > 
> > >  > > > +
> > > 
> > >  > > > +clear:
> > > 
> > >  > > > +   memset((void *)pidns_info, 0, (size_t) size);
> > > 
> > >  > > > +
> > > 
> > >  > > save an empty line.
> > > 
> > >  > > > +   return -EINVAL;
> > > 
> > >  > > > +}
> > > 
> > >  > > > +
> > > 
> > >  > > > +const struct bpf_func_proto bpf_get_current_pidns_info_proto = {
> > > 
> > >  > > > +   .func   = bpf_get_current_pidns_info,
> > > 
> > >  > > make the "= " aligned with others?
> > > 
> > >  > > > +   .gpl_only       = false,
> > > 
> > >  > > > +   .ret_type       = RET_INTEGER,
> > > 
> > >  > > > +   .arg1_type      = ARG_PTR_TO_UNINIT_MEM,
> > > 
> > >  > > > +   .arg2_type      = ARG_CONST_SIZE,
> > > 
> > >  > > > +};
> > > 
> > >  > > > +
> > > 
> > >  > > >   #ifdef CONFIG_CGROUPS
> > > 
> > >  > > >   BPF_CALL_0(bpf_get_current_cgroup_id)
> > > 
> > >  > > >   {
> > > 
> > >  > > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> > > 
> > >  > > > index ca1255d14576..5e1dc22765a5 100644
> > > 
> > >  > > > --- a/kernel/trace/bpf_trace.c
> > > 
> > >  > > > +++ b/kernel/trace/bpf_trace.c
> > > 
> > >  > > > @@ -709,6 +709,8 @@ tracing_func_proto(enum bpf_func_id func_id, 
> > > const struct bpf_prog *prog)
> > > 
> > >  > > >   #endif
> > > 
> > >  > > >     case BPF_FUNC_send_signal:
> > > 
> > >  > > >             return &bpf_send_signal_proto;
> > > 
> > >  > > > +   case BPF_FUNC_get_current_pidns_info:
> > > 
> > >  > > > +           return &bpf_get_current_pidns_info_proto;
> > > 
> > >  > > >     default:
> > > 
> > >  > > >             return NULL;
> > > 
> > >  > > >     }
> > > 
> > >  > > > diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
> > > 
> > >  > > > index 1d9be26b4edd..238453ff27d2 100644
> > > 
> > >  > > > --- a/samples/bpf/Makefile
> > > 
> > >  > > > +++ b/samples/bpf/Makefile
> > > 
> > >  > > > @@ -53,6 +53,7 @@ hostprogs-y += task_fd_query
> > > 
> > >  > > >   hostprogs-y += xdp_sample_pkts
> > > 
> > >  > > >   hostprogs-y += ibumad
> > > 
> > >  > > >   hostprogs-y += hbm
> > > 
> > >  > > > +hostprogs-y += trace_ns_info
> > > 
> > >  > > [...]
> > > 

^ permalink raw reply related

* [PATCH net-next] r8169: inline rtl8169_free_rx_databuff
From: Heiner Kallweit @ 2019-08-09 20:59 UTC (permalink / raw)
  To: Realtek linux nic maintainers, David Miller; +Cc: netdev@vger.kernel.org

rtl8169_free_rx_databuff is used in only one place, so let's inline it.
We can improve the loop because rtl8169_init_ring zeroes Rx_databuff
before calling rtl8169_rx_fill, and rtl8169_rx_fill fills
Rx_databuff starting from index 0. The populated entries are therefore
contiguous, and the loop can stop at the first NULL entry.

Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/ethernet/realtek/r8169_main.c | 24 +++++++----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index b2a275d85..641a34942 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -5260,18 +5260,6 @@ static inline void rtl8169_make_unusable_by_asic(struct RxDesc *desc)
 	desc->opts1 &= ~cpu_to_le32(DescOwn | RsvdMask);
 }
 
-static void rtl8169_free_rx_databuff(struct rtl8169_private *tp,
-				     struct page **data_buff,
-				     struct RxDesc *desc)
-{
-	dma_unmap_page(tp_to_dev(tp), le64_to_cpu(desc->addr),
-		       R8169_RX_BUF_SIZE, DMA_FROM_DEVICE);
-
-	__free_pages(*data_buff, get_order(R8169_RX_BUF_SIZE));
-	*data_buff = NULL;
-	rtl8169_make_unusable_by_asic(desc);
-}
-
 static inline void rtl8169_mark_to_asic(struct RxDesc *desc)
 {
 	u32 eor = le32_to_cpu(desc->opts1) & RingEnd;
@@ -5312,11 +5300,13 @@ static void rtl8169_rx_clear(struct rtl8169_private *tp)
 {
 	unsigned int i;
 
-	for (i = 0; i < NUM_RX_DESC; i++) {
-		if (tp->Rx_databuff[i]) {
-			rtl8169_free_rx_databuff(tp, tp->Rx_databuff + i,
-					    tp->RxDescArray + i);
-		}
+	for (i = 0; i < NUM_RX_DESC && tp->Rx_databuff[i]; i++) {
+		dma_unmap_page(tp_to_dev(tp),
+			       le64_to_cpu(tp->RxDescArray[i].addr),
+			       R8169_RX_BUF_SIZE, DMA_FROM_DEVICE);
+		__free_pages(tp->Rx_databuff[i], get_order(R8169_RX_BUF_SIZE));
+		tp->Rx_databuff[i] = NULL;
+		rtl8169_make_unusable_by_asic(tp->RxDescArray + i);
 	}
 }
 
-- 
2.22.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox