Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH 09/14] net: sched: don't release reference on action overwrite
From: Jiri Pirko @ 2018-05-16  7:50 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <vbfd0xw11pn.fsf@reg-r-vrt-018-180.mtr.labs.mlnx>

Wed, May 16, 2018 at 09:47:32AM CEST, vladbu@mellanox.com wrote:
>
>On Wed 16 May 2018 at 07:43, Jiri Pirko <jiri@resnulli.us> wrote:
>> Mon, May 14, 2018 at 04:27:10PM CEST, vladbu@mellanox.com wrote:
>>>Return from action init function with reference to action taken,
>>>even when overwriting existing action.
>>>
>>>Action init API initializes its fourth argument (pointer to pointer to
>>>tc action) to either existing action with same index or newly created
>>>action. In case of existing index(and bind argument is zero), init
>>>function returns without incrementing action reference counter. Caller
>>>of action init then proceeds working with action without actually
>>>holding reference to it. This means that action could be deleted
>>>concurrently. To prevent such scenario this patch changes action init
>>
>> Be imperative to the codebase in the patch description.
>>
>>
>>>behavior to always take reference to action before returning
>>>successfully.
>>
>> Where's the balance? Who does the release instead? I'm probably missing
>> something.
>
>I've resplit these patches for V2 to always do take/release in same
>patch.

Good. Thanks.

>
>>
>>>
>>>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>>>---
>>> net/sched/act_bpf.c        |  8 ++++----
>>> net/sched/act_connmark.c   |  5 +++--
>>> net/sched/act_csum.c       |  8 ++++----
>>> net/sched/act_gact.c       |  5 +++--
>>> net/sched/act_ife.c        | 12 +++++-------
>>> net/sched/act_ipt.c        |  5 +++--
>>> net/sched/act_mirred.c     |  5 ++---
>>> net/sched/act_nat.c        |  5 +++--
>>> net/sched/act_pedit.c      |  5 +++--
>>> net/sched/act_police.c     |  8 +++-----
>>> net/sched/act_sample.c     |  8 +++-----
>>> net/sched/act_simple.c     |  5 +++--
>>> net/sched/act_skbedit.c    |  5 +++--
>>> net/sched/act_skbmod.c     |  8 +++-----
>>> net/sched/act_tunnel_key.c |  8 +++-----
>>> net/sched/act_vlan.c       |  8 +++-----
>>> 16 files changed, 51 insertions(+), 57 deletions(-)
>>>
>>>diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
>>>index 5d95c43..5554bf7 100644
>>>--- a/net/sched/act_bpf.c
>>>+++ b/net/sched/act_bpf.c
>>>@@ -311,9 +311,10 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
>>> 		if (bind)
>>> 			return 0;
>>> 
>>>-		tcf_idr_release(*act, bind);
>>>-		if (!replace)
>>>+		if (!replace) {
>>>+			tcf_idr_release(*act, bind);
>>> 			return -EEXIST;
>>>+		}
>>> 	}
>>> 
>>> 	is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
>>
>> [...]
>

^ permalink raw reply

* Re: [PATCH 10/14] net: sched: extend act API for lockless actions
From: Jiri Pirko @ 2018-05-16  7:50 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <1526308035-12484-11-git-send-email-vladbu@mellanox.com>

Mon, May 14, 2018 at 04:27:11PM CEST, vladbu@mellanox.com wrote:
>Implement new action API function to atomically delete action with
>specified index and to atomically insert unique action. These functions are
>required to implement init and delete functions for specific actions that
>do not rely on rtnl lock.
>
>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>---
> include/net/act_api.h |  2 ++
> net/sched/act_api.c   | 45 +++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 47 insertions(+)
>
>diff --git a/include/net/act_api.h b/include/net/act_api.h
>index a8c8570..bce0cf1 100644
>--- a/include/net/act_api.h
>+++ b/include/net/act_api.h
>@@ -153,7 +153,9 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
> 		   struct tc_action **a, const struct tc_action_ops *ops,
> 		   int bind, bool cpustats);
> void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a);
>+void tcf_idr_insert_unique(struct tc_action_net *tn, struct tc_action *a);
> 
>+int tcf_idr_find_delete(struct tc_action_net *tn, u32 index);
> int __tcf_idr_release(struct tc_action *a, bool bind, bool strict);
> 
> static inline int tcf_idr_release(struct tc_action *a, bool bind)
>diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>index 2772276e..a5193dc 100644
>--- a/net/sched/act_api.c
>+++ b/net/sched/act_api.c
>@@ -330,6 +330,41 @@ bool tcf_idr_check(struct tc_action_net *tn, u32 index, struct tc_action **a,
> }
> EXPORT_SYMBOL(tcf_idr_check);
> 
>+int tcf_idr_find_delete(struct tc_action_net *tn, u32 index)
>+{
>+	struct tcf_idrinfo *idrinfo = tn->idrinfo;
>+	struct tc_action *p;
>+	int ret = 0;
>+
>+	spin_lock_bh(&idrinfo->lock);

Why "_bh" is needed here?


>+	p = idr_find(&idrinfo->action_idr, index);
>+	if (!p) {
>+		spin_unlock(&idrinfo->lock);
>+		return -ENOENT;
>+	}
>+
>+	if (!atomic_read(&p->tcfa_bindcnt)) {
>+		if (refcount_dec_and_test(&p->tcfa_refcnt)) {
>+			struct module *owner = p->ops->owner;
>+
>+			WARN_ON(p != idr_remove(&idrinfo->action_idr,
>+						p->tcfa_index));
>+			spin_unlock_bh(&idrinfo->lock);
>+
>+			tcf_action_cleanup(p);
>+			module_put(owner);
>+			return 0;
>+		}
>+		ret = 0;
>+	} else {
>+		ret = -EPERM;

I wonder if "-EPERM" is the best error code for this...


>+	}
>+
>+	spin_unlock_bh(&idrinfo->lock);
>+	return ret;
>+}
>+EXPORT_SYMBOL(tcf_idr_find_delete);
>+
> int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
> 		   struct tc_action **a, const struct tc_action_ops *ops,
> 		   int bind, bool cpustats)
>@@ -407,6 +442,16 @@ void tcf_idr_insert(struct tc_action_net *tn, struct tc_action *a)
> }
> EXPORT_SYMBOL(tcf_idr_insert);
> 
>+void tcf_idr_insert_unique(struct tc_action_net *tn, struct tc_action *a)
>+{
>+	struct tcf_idrinfo *idrinfo = tn->idrinfo;
>+
>+	spin_lock_bh(&idrinfo->lock);
>+	WARN_ON(idr_replace(&idrinfo->action_idr, a, a->tcfa_index));

Under which condition this WARN_ON is hit?


>+	spin_unlock_bh(&idrinfo->lock);
>+}
>+EXPORT_SYMBOL(tcf_idr_insert_unique);
>+
> void tcf_idrinfo_destroy(const struct tc_action_ops *ops,
> 			 struct tcf_idrinfo *idrinfo)
> {
>-- 
>2.7.5
>

^ permalink raw reply

* RE: i40e - Is i40e_force_link_state doing the right thing ?
From: Stachura, Mariusz @ 2018-05-16  7:49 UTC (permalink / raw)
  To: Chaitanya Lala
  Cc: netdev@vger.kernel.org, Williams, Mitch A, Bowers, AndrewX,
	Kirsher, Jeffrey T
In-Reply-To: <CABOGejK=HjghOat91xYkXbGAaKMtb_ECKcZRhD-=8Bet1AA=ew@mail.gmail.com>

> Hi Mariusz, ...
>
> On Tue, May 15, 2018 at 2:24 PM, Stachura, Mariusz <mariusz.stachura@intel.com> wrote:
>> On Tue, May 15, 2018 at 1:15 PM, Chaitanya Lala <chaitanya.lala@gmail.com> wrote:
>>> Hi,
>>>
>>> I am trying to bring up a Intel XL710 4x10G Intel card using the 
>>> latest mainline top-of-tree.
>>> The problem is that "ifconfig up" and "ifconfig down" do not take 
>>> effect at the link state level.
>>> I tracked the problem down to i40e_force_link_state() when it is 
>>> called from i40e_down().
>>> It calls i40e_force_link_state with "is_up" == false. In-turn it 
>>> calls, i40e_aq_set_link_restart_an(hw, true, NULL).
>>>
>>> Should the second argument of  i40e_aq_set_link_restart_an be "is_up"
>>> vs the current "true"
>>> i.e. i40e_aq_set_link_restart_an(hw, is_up, NULL). ? When I make this 
>>> change, the link state syncs-up with the interface administrative 
>>> state.
>>>
>>> Is this a bug ?
>>>
>>> Thanks,
>>>  Chaitanya
>>
>> Hello Chaitanya,
>>
>> i40e_down() calls i40e_force_link_state with "is_up" == false only if interface's private flag "link-down-on-close" is set. By default the link is left up for manageability and VF traffic, user can use this flag to power down the interface on the link level. Does that work for you?
>> The command is:
>> "ethtool --set-priv-flags IFNAME link-down-on-close on" and then
>
> This flag is _on_ in my setup and hencet i40e_force_link_state is being called with is_up == false in my setup. The problem is that irrespective of value of "is_up" flag, i40e_force_link_state invokes i40e_aq_set_link_restart_an with second argument (enable_link) as "true". So i40e_aq_set_link_restart_an is always trying to enable link even if is_up was false. Is that correct behavior ?
>
> I have pasted code with my annotations below marked with "//XXX".
> (...)
> Thanks,
> Chaitanya

Hey,
i40e_aq_set_link_restart_an has second argument set to "true" intentionally, as I understand the "link-down-on-close" does not work for you, right? I will double check if this feature works for me and get back to you, thank you again.
--------------------------------------------------------------------

Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek
przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by
others is strictly prohibited.

^ permalink raw reply

* Re: [PATCH 09/14] net: sched: don't release reference on action overwrite
From: Vlad Buslov @ 2018-05-16  7:47 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <20180516074332.GB1972@nanopsycho>


On Wed 16 May 2018 at 07:43, Jiri Pirko <jiri@resnulli.us> wrote:
> Mon, May 14, 2018 at 04:27:10PM CEST, vladbu@mellanox.com wrote:
>>Return from action init function with reference to action taken,
>>even when overwriting existing action.
>>
>>Action init API initializes its fourth argument (pointer to pointer to
>>tc action) to either existing action with same index or newly created
>>action. In case of existing index(and bind argument is zero), init
>>function returns without incrementing action reference counter. Caller
>>of action init then proceeds working with action without actually
>>holding reference to it. This means that action could be deleted
>>concurrently. To prevent such scenario this patch changes action init
>
> Be imperative to the codebase in the patch description.
>
>
>>behavior to always take reference to action before returning
>>successfully.
>
> Where's the balance? Who does the release instead? I'm probably missing
> something.

I've resplit these patches for V2 to always do take/release in same
patch.

>
>>
>>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>>---
>> net/sched/act_bpf.c        |  8 ++++----
>> net/sched/act_connmark.c   |  5 +++--
>> net/sched/act_csum.c       |  8 ++++----
>> net/sched/act_gact.c       |  5 +++--
>> net/sched/act_ife.c        | 12 +++++-------
>> net/sched/act_ipt.c        |  5 +++--
>> net/sched/act_mirred.c     |  5 ++---
>> net/sched/act_nat.c        |  5 +++--
>> net/sched/act_pedit.c      |  5 +++--
>> net/sched/act_police.c     |  8 +++-----
>> net/sched/act_sample.c     |  8 +++-----
>> net/sched/act_simple.c     |  5 +++--
>> net/sched/act_skbedit.c    |  5 +++--
>> net/sched/act_skbmod.c     |  8 +++-----
>> net/sched/act_tunnel_key.c |  8 +++-----
>> net/sched/act_vlan.c       |  8 +++-----
>> 16 files changed, 51 insertions(+), 57 deletions(-)
>>
>>diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
>>index 5d95c43..5554bf7 100644
>>--- a/net/sched/act_bpf.c
>>+++ b/net/sched/act_bpf.c
>>@@ -311,9 +311,10 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
>> 		if (bind)
>> 			return 0;
>> 
>>-		tcf_idr_release(*act, bind);
>>-		if (!replace)
>>+		if (!replace) {
>>+			tcf_idr_release(*act, bind);
>> 			return -EEXIST;
>>+		}
>> 	}
>> 
>> 	is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];
>
> [...]

^ permalink raw reply

* Re: [PATCH 09/14] net: sched: don't release reference on action overwrite
From: Jiri Pirko @ 2018-05-16  7:43 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <1526308035-12484-10-git-send-email-vladbu@mellanox.com>

Mon, May 14, 2018 at 04:27:10PM CEST, vladbu@mellanox.com wrote:
>Return from action init function with reference to action taken,
>even when overwriting existing action.
>
>Action init API initializes its fourth argument (pointer to pointer to
>tc action) to either existing action with same index or newly created
>action. In case of existing index(and bind argument is zero), init
>function returns without incrementing action reference counter. Caller
>of action init then proceeds working with action without actually
>holding reference to it. This means that action could be deleted
>concurrently. To prevent such scenario this patch changes action init

Be imperative to the codebase in the patch description.


>behavior to always take reference to action before returning
>successfully.

Where's the balance? Who does the release instead? I'm probably missing
something.

>
>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>---
> net/sched/act_bpf.c        |  8 ++++----
> net/sched/act_connmark.c   |  5 +++--
> net/sched/act_csum.c       |  8 ++++----
> net/sched/act_gact.c       |  5 +++--
> net/sched/act_ife.c        | 12 +++++-------
> net/sched/act_ipt.c        |  5 +++--
> net/sched/act_mirred.c     |  5 ++---
> net/sched/act_nat.c        |  5 +++--
> net/sched/act_pedit.c      |  5 +++--
> net/sched/act_police.c     |  8 +++-----
> net/sched/act_sample.c     |  8 +++-----
> net/sched/act_simple.c     |  5 +++--
> net/sched/act_skbedit.c    |  5 +++--
> net/sched/act_skbmod.c     |  8 +++-----
> net/sched/act_tunnel_key.c |  8 +++-----
> net/sched/act_vlan.c       |  8 +++-----
> 16 files changed, 51 insertions(+), 57 deletions(-)
>
>diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
>index 5d95c43..5554bf7 100644
>--- a/net/sched/act_bpf.c
>+++ b/net/sched/act_bpf.c
>@@ -311,9 +311,10 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
> 		if (bind)
> 			return 0;
> 
>-		tcf_idr_release(*act, bind);
>-		if (!replace)
>+		if (!replace) {
>+			tcf_idr_release(*act, bind);
> 			return -EEXIST;
>+		}
> 	}
> 
> 	is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS];

[...]

^ permalink raw reply

* Re: linux-next: BUG: KASAN: use-after-free in tun_chr_close
From: Andrei Vagin @ 2018-05-16  7:40 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev
In-Reply-To: <9a7440ca-05dd-7ff6-0fa0-a96afc9ed780@redhat.com>

On Wed, May 16, 2018 at 03:32:59PM +0800, Jason Wang wrote:
> 
> 
> On 2018年05月16日 15:12, Andrei Vagin wrote:
> > Hi Jason,
> > 
> > I think the problem is in "tun: hold a tun socket during ptr_ring_cleanup".
> > 
> > Pls take a look at the attached patch.
> 
> Yes.
> 
> It looks to me it's not necessary to take extra refcnt during release, we
> can just do the cleanup at __tun_detach().
> 
> Could you help to test the attached patch?

I've run my test on the kernel with this patch. It fixes the problem.
The patch looks correct for me.

Acked-by: Andrei Vagin <avagin@virtuozzo.com>

> 
> Thanks
> 

> From 4b5ad75208e379dcb32abb9ac4790a0446f8558b Mon Sep 17 00:00:00 2001
> From: Jason Wang <jasowang@redhat.com>
> Date: Wed, 16 May 2018 15:26:52 +0800
> Subject: [PATCH] tuntap: fix use after free during release
> 
> After commit b196d88aba8a ("tun: fix use after free for ptr_ring") we
> need clean up tx ring during release(). But unfortunately, it tries to
> do the cleanup after socket were destroyed which will lead another
> use-after-free. Fix this by doing the cleanup before dropping the last
> reference of the socket in __tun_detach().
> 
> Reported-by: Andrei Vagin <avagin@virtuozzo.com>
> Fixes: b196d88aba8a ("tun: fix use after free for ptr_ring")
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/net/tun.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 9fbbb32..d45ac37 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -729,6 +729,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
>  		}
>  		if (tun)
>  			xdp_rxq_info_unreg(&tfile->xdp_rxq);
> +		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
>  		sock_put(&tfile->sk);
>  	}
>  }
> @@ -3245,7 +3246,6 @@ static int tun_chr_close(struct inode *inode, struct file *file)
>  	struct tun_file *tfile = file->private_data;
>  
>  	tun_detach(tfile, true);
> -	ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
>  
>  	return 0;
>  }
> -- 
> 2.7.4
> 

^ permalink raw reply

* Re: linux-next: BUG: KASAN: use-after-free in tun_chr_close
From: Jason Wang @ 2018-05-16  7:32 UTC (permalink / raw)
  To: Andrei Vagin, netdev
In-Reply-To: <20180516071224.GB11416@outlook.office365.com>

[-- Attachment #1: Type: text/plain, Size: 363 bytes --]



On 2018年05月16日 15:12, Andrei Vagin wrote:
> Hi Jason,
>
> I think the problem is in "tun: hold a tun socket during ptr_ring_cleanup".
>
> Pls take a look at the attached patch.

Yes.

It looks to me it's not necessary to take extra refcnt during release, 
we can just do the cleanup at __tun_detach().

Could you help to test the attached patch?

Thanks


[-- Attachment #2: 0001-tuntap-fix-use-after-free-during-release.patch --]
[-- Type: text/x-patch, Size: 1362 bytes --]

>From 4b5ad75208e379dcb32abb9ac4790a0446f8558b Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Wed, 16 May 2018 15:26:52 +0800
Subject: [PATCH] tuntap: fix use after free during release

After commit b196d88aba8a ("tun: fix use after free for ptr_ring") we
need clean up tx ring during release(). But unfortunately, it tries to
do the cleanup after socket were destroyed which will lead another
use-after-free. Fix this by doing the cleanup before dropping the last
reference of the socket in __tun_detach().

Reported-by: Andrei Vagin <avagin@virtuozzo.com>
Fixes: b196d88aba8a ("tun: fix use after free for ptr_ring")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/tun.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9fbbb32..d45ac37 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -729,6 +729,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
 		}
 		if (tun)
 			xdp_rxq_info_unreg(&tfile->xdp_rxq);
+		ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 		sock_put(&tfile->sk);
 	}
 }
@@ -3245,7 +3246,6 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 	struct tun_file *tfile = file->private_data;
 
 	tun_detach(tfile, true);
-	ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
 
 	return 0;
 }
-- 
2.7.4


^ permalink raw reply related

* Re: xdp and fragments with virtio
From: Jason Wang @ 2018-05-16  7:24 UTC (permalink / raw)
  To: David Ahern, netdev@vger.kernel.org
In-Reply-To: <e35c149e-cb15-bb8c-2e03-d1641d21d694@gmail.com>



On 2018年05月16日 11:51, David Ahern wrote:
> Hi Jason:
>
> I am trying to test MTU changes to the BPF fib_lookup helper and seeing
> something odd. Hoping you can help.
>
> I have a VM with multiple virtio based NICs and tap backends. I install
> the xdp program on eth1 and eth2 to do forwarding. In the host I send a
> large packet to eth1:
>
> $ ping -s 1500 9.9.9.9
>
>
> The tap device in the host sees 2 packets:
>
> $ sudo tcpdump -nv -i vm02-eth1
> 20:44:33.943160 IP (tos 0x0, ttl 64, id 58746, offset 0, flags [+],
> proto ICMP (1), length 1500)
>      10.100.1.254 > 9.9.9.9: ICMP echo request, id 17917, seq 1, length 1480
> 20:44:33.943172 IP (tos 0x0, ttl 64, id 58746, offset 1480, flags
> [none], proto ICMP (1), length 48)
>      10.100.1.254 > 9.9.9.9: ip-proto-1
>
>
> In the VM, the XDP program only sees the first packet, not the fragment.
> I added a printk to the program (see diff below):
>
> $ cat trace_pipe
>            <idle>-0     [003] ..s2   254.436467: 0: packet length 1514
>
>
> Anything come to mind in the virtio xdp implementation that affects
> fragment packets? I see this with both IPv4 and v6.

Not yet. But we do turn of tap gso when virtio has XDP set, but it 
shouldn't matter this case.

Will try to see what's wrong.

Thanks

>
> Thanks,
> David
>
> [1] xdp program diff showing printk that dumps packet length:
>
> diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
> index 4a6be0f87505..f119b506e782 100644
> --- a/samples/bpf/xdp_fwd_kern.c
> +++ b/samples/bpf/xdp_fwd_kern.c
> @@ -52,6 +52,11 @@ static __always_inline int xdp_fwd_flags(struct
> xdp_md *ctx, u32 flags)
>          u16 h_proto;
>          u64 nh_off;
>
> +       {
> +               char fmt[] = "packet length %u\n";
> +
> +               bpf_trace_printk(fmt, sizeof(fmt), ctx->data_end-ctx->data);
> +       }
>          nh_off = sizeof(*eth);
>          if (data + nh_off > data_end)
>                  return XDP_DROP;
>

^ permalink raw reply

* Re: KMSAN: uninit-value in __sctp_v6_cmp_addr
From: Xin Long @ 2018-05-16  7:17 UTC (permalink / raw)
  To: syzbot
  Cc: davem, LKML, linux-sctp, network dev, Neil Horman, syzkaller-bugs,
	Vlad Yasevich
In-Reply-To: <0000000000004f4075056c410b96@google.com>

On Wed, May 16, 2018 at 12:25 AM, syzbot
<syzbot+85490c30c260afff22f2@syzkaller.appspotmail.com> wrote:
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit:    74ee2200b89f kmsan: bump .config.example to v4.17-rc3
> git tree:       https://github.com/google/kmsan.git/master
> console output: https://syzkaller.appspot.com/x/log.txt?x=169efb5b800000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=4ca1e57bafa8ab1f
> dashboard link: https://syzkaller.appspot.com/bug?extid=85490c30c260afff22f2
> compiler:       clang version 7.0.0 (trunk 329391)
> syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=157e9237800000
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=10fe5de7800000
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+85490c30c260afff22f2@syzkaller.appspotmail.com
>
> random: sshd: uninitialized urandom read (32 bytes read)
> random: sshd: uninitialized urandom read (32 bytes read)
> random: sshd: uninitialized urandom read (32 bytes read)
> random: sshd: uninitialized urandom read (32 bytes read)
> ==================================================================
> BUG: KMSAN: uninit-value in __sctp_v6_cmp_addr+0x49a/0x850
> net/sctp/ipv6.c:580
> CPU: 0 PID: 4453 Comm: syz-executor325 Not tainted 4.17.0-rc3+ #88
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>  <IRQ>
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x185/0x1d0 lib/dump_stack.c:113
>  kmsan_report+0x142/0x240 mm/kmsan/kmsan.c:1067
>  __msan_warning_32+0x6c/0xb0 mm/kmsan/kmsan_instr.c:683
>  __sctp_v6_cmp_addr+0x49a/0x850 net/sctp/ipv6.c:580
Pls check if the testing kernel has this commit:
commit d625329b06e46bd20baf9ee40847d11982569204
Author: Xin Long <lucien.xin@gmail.com>
Date:   Thu Apr 26 14:13:57 2018 +0800

    sctp: handle two v4 addrs comparison in sctp_inet6_cmp_addr

Thanks.

>  sctp_inet6_cmp_addr+0x3dc/0x400 net/sctp/ipv6.c:898
>  sctp_bind_addr_match+0x18b/0x2f0 net/sctp/bind_addr.c:330
>  sctp_addrs_lookup_transport+0x904/0xa20 net/sctp/input.c:942
>  __sctp_lookup_association net/sctp/input.c:985 [inline]
>  __sctp_rcv_lookup net/sctp/input.c:1249 [inline]
>  sctp_rcv+0x15e6/0x4d30 net/sctp/input.c:170
>  ip_local_deliver_finish+0x874/0xec0 net/ipv4/ip_input.c:215
>  NF_HOOK include/linux/netfilter.h:288 [inline]
>  ip_local_deliver+0x43c/0x4e0 net/ipv4/ip_input.c:256
>  dst_input include/net/dst.h:450 [inline]
>  ip_rcv_finish+0xa36/0x1d00 net/ipv4/ip_input.c:396
>  NF_HOOK include/linux/netfilter.h:288 [inline]
>  ip_rcv+0x118f/0x16d0 net/ipv4/ip_input.c:492
>  __netif_receive_skb_core+0x47df/0x4a90 net/core/dev.c:4592
>  __netif_receive_skb net/core/dev.c:4657 [inline]
>  process_backlog+0x62d/0xe20 net/core/dev.c:5337
>  napi_poll net/core/dev.c:5735 [inline]
>  net_rx_action+0x7c1/0x1a70 net/core/dev.c:5801
>  __do_softirq+0x56d/0x93d kernel/softirq.c:285
>  do_softirq_own_stack+0x2a/0x40 arch/x86/entry/entry_64.S:1046
>  </IRQ>
>  do_softirq kernel/softirq.c:329 [inline]
>  __local_bh_enable_ip+0x114/0x140 kernel/softirq.c:182
>  local_bh_enable+0x36/0x40 include/linux/bottom_half.h:32
>  rcu_read_unlock_bh include/linux/rcupdate.h:728 [inline]
>  ip_finish_output2+0x135a/0x1470 net/ipv4/ip_output.c:231
>  ip_finish_output+0xcb2/0xff0 net/ipv4/ip_output.c:317
>  NF_HOOK_COND include/linux/netfilter.h:277 [inline]
>  ip_output+0x505/0x5d0 net/ipv4/ip_output.c:405
>  dst_output include/net/dst.h:444 [inline]
>  ip_local_out net/ipv4/ip_output.c:124 [inline]
>  ip_queue_xmit+0x1a1e/0x1d10 net/ipv4/ip_output.c:504
>  sctp_v4_xmit+0x188/0x210 net/sctp/protocol.c:983
>  sctp_packet_transmit+0x3eaa/0x4350 net/sctp/output.c:650
>  sctp_outq_flush+0x1a7a/0x6320 net/sctp/outqueue.c:1197
>  sctp_outq_uncork+0xd2/0xf0 net/sctp/outqueue.c:776
>  sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
>  sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
>  sctp_do_sm+0x8707/0x8d20 net/sctp/sm_sideeffect.c:1191
>  sctp_primitive_REQUESTHEARTBEAT+0x175/0x1a0 net/sctp/primitive.c:200
>  sctp_apply_peer_addr_params+0x207/0x1670 net/sctp/socket.c:2487
>  sctp_setsockopt_peer_addr_params net/sctp/socket.c:2683 [inline]
>  sctp_setsockopt+0x10e5f/0x11600 net/sctp/socket.c:4258
>  sock_common_setsockopt+0x136/0x170 net/core/sock.c:3039
>  __sys_setsockopt+0x4af/0x560 net/socket.c:1903
>  __do_sys_setsockopt net/socket.c:1914 [inline]
>  __se_sys_setsockopt net/socket.c:1911 [inline]
>  __x64_sys_setsockopt+0x15c/0x1c0 net/socket.c:1911
>  do_syscall_64+0x154/0x220 arch/x86/entry/common.c:287
>  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> RIP: 0033:0x43fef9
> RSP: 002b:00007ffc00d9bfd8 EFLAGS: 00000207 ORIG_RAX: 0000000000000036
> RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fef9
> RDX: 0000000000000009 RSI: 0000000000000084 RDI: 0000000000000003
> RBP: 00000000006ca018 R08: 0000000000000098 R09: 000000000000001c
> R10: 0000000020000180 R11: 0000000000000207 R12: 0000000000401820
> R13: 00000000004018b0 R14: 0000000000000000 R15: 0000000000000000
>
> Local variable description: ----dest@sctp_rcv
> Variable was created at:
>  sctp_rcv+0x13d/0x4d30 net/sctp/input.c:97
>  ip_local_deliver_finish+0x874/0xec0 net/ipv4/ip_input.c:215
> ==================================================================
>
>
> ---
> This bug is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at syzkaller@googlegroups.com.
>
> syzbot will keep track of this bug report. See:
> https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
> syzbot.
> syzbot can test patches for this bug, for details see:
> https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [PATCH 00/14] Modify action API for implementing lockless actions
From: Vlad Buslov @ 2018-05-16  7:13 UTC (permalink / raw)
  To: Lucas Bates
  Cc: Jamal Hadi Salim, Linux Kernel Network Developers, David Miller,
	Cong Wang, Jiri Pirko, pablo, kadlec, fw, ast, Daniel Borkmann,
	edumazet, kliteyn
In-Reply-To: <CAMDBHYLg0sU4AWhr0YCE5R4RA=jhJ3cPciujLYOyKHGx8Rpcjg@mail.gmail.com>


On Tue 15 May 2018 at 22:07, Lucas Bates <lucasb@mojatatu.com> wrote:
> On Tue, May 15, 2018 at 6:03 PM, Lucas Bates <lucasb@mojatatu.com> wrote:
>> On Tue, May 15, 2018 at 5:49 PM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>>>> Test 7d50: Add skbmod action to set destination mac
>>>> exit: 255 0
>>>> dst MAC address <11:22:33:44:55:66>
>>>> RTNETLINK answers: No such file or directory
>>>> We have an error talking to the kernel
>>>>
>>>
>>> You may actually have broken something with your patches in this case.
>>>
>>> Lucas - does this test pass on latest net-next?
>>
>> Yes, 7d50 has been passing on our builds for at least the last month.
>
> Also, Vlad, you can look at the JSON to see the test case data, or run
> tdc.py -s | less and search for the ID to see the commands being run.
> I'm here if you need help using tdc.

Hello Lucas,

I'll look into JSON test definition and try to understand whats wrong.

^ permalink raw reply

* Re: linux-next: BUG: KASAN: use-after-free in tun_chr_close
From: Andrei Vagin @ 2018-05-16  7:12 UTC (permalink / raw)
  To: netdev, Jason Wang
In-Reply-To: <20180516062825.GA11416@outlook.office365.com>

[-- Attachment #1: Type: text/plain, Size: 5091 bytes --]

Hi Jason,

I think the problem is in "tun: hold a tun socket during ptr_ring_cleanup".

Pls take a look at the attached patch.


On Tue, May 15, 2018 at 11:28:25PM -0700, Andrei Vagin wrote:
> We run CRIU tests on linux-next regularly and today we caught this bug:
> 
> https://travis-ci.org/avagin/linux/jobs/379450631
> 
> [   50.264837] ==================================================================
> [   50.264986] BUG: KASAN: use-after-free in __lock_acquire.isra.30+0x1ad4/0x1bb0
> [   50.265088] Read of size 8 at addr ffff88018e1728f8 by task criu/1819
> [   50.265167] 
> [   50.265249] CPU: 0 PID: 1819 Comm: criu Not tainted 4.17.0-rc5-next-20180515+ #1
> [   50.265251] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
> [   50.265252] Call Trace:
> [   50.265262]  dump_stack+0x71/0xab
> [   50.265265]  ? __lock_acquire.isra.30+0x1ad4/0x1bb0
> [   50.265271]  print_address_description+0x6a/0x270
> [   50.265273]  ? __lock_acquire.isra.30+0x1ad4/0x1bb0
> [   50.265275]  kasan_report+0x237/0x360
> [   50.265278]  __lock_acquire.isra.30+0x1ad4/0x1bb0
> [   50.265285]  ? register_netdev+0x30/0x30
> [   50.265288]  lock_acquire+0x10b/0x2a0
> [   50.265294]  ? tun_chr_close+0x1d7/0x4c0
> [   50.265298]  ? kfree+0xd6/0x1f0
> [   50.265303]  _raw_spin_lock+0x25/0x30
> [   50.265306]  ? tun_chr_close+0x1d7/0x4c0
> [   50.265308]  tun_chr_close+0x1d7/0x4c0
> [   50.265313]  ? fcntl_setlk+0xaf0/0xaf0
> [   50.265320]  __fput+0x251/0x770
> [   50.265324]  task_work_run+0x10e/0x180
> [   50.265330]  exit_to_usermode_loop+0xcb/0xf0
> [   50.265332]  do_syscall_64+0x21d/0x280
> [   50.265335]  ? prepare_exit_to_usermode+0x88/0x130
> [   50.265338]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   50.265342] RIP: 0033:0x1494fa6f93f0
> [   50.265342] Code: 73 01 c3 48 8b 0d b8 9b 20 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 e0 20 00 00 75 10 b8 03 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 0e fa ff ff 48 89 04 24 
> [   50.265388] RSP: 002b:00007ffd229fe7f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000003
> [   50.265391] RAX: 0000000000000000 RBX: 0000000000000004 RCX: 00001494fa6f93f0
> [   50.265393] RDX: 00007ffd229fe80c RSI: 00000000400454da RDI: 0000000000000004
> [   50.265395] RBP: 0000000000000000 R08: 000000000000420b R09: 0000000000000000
> [   50.265396] R10: 0000000000000000 R11: 0000000000000246 R12: 00001494fab116a0
> [   50.265398] R13: 0000000000000d06 R14: 0000000000000000 R15: 0000000000000000
> [   50.265400] 
> [   50.265476] Allocated by task 1819:
> [   50.265554]  kasan_kmalloc+0xa0/0xd0
> [   50.265556]  __kmalloc+0x13a/0x250
> [   50.265561]  sk_prot_alloc+0xd3/0x250
> [   50.265564]  sk_alloc+0x35/0x9d0
> [   50.265566]  tun_chr_open+0x7b/0x5a0
> [   50.265570]  misc_open+0x313/0x480
> [   50.265573]  chrdev_open+0x1d6/0x4b0
> [   50.265575]  do_dentry_open+0x6ae/0xee0
> [   50.265578]  path_openat+0xce6/0x2890
> [   50.265580]  do_filp_open+0x17a/0x270
> [   50.265582]  do_sys_open+0x203/0x340
> [   50.265584]  do_syscall_64+0xa0/0x280
> [   50.265586]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   50.265587] 
> [   50.265667] Freed by task 1819:
> [   50.265745]  __kasan_slab_free+0x130/0x180
> [   50.265747]  kfree+0xd6/0x1f0
> [   50.265750]  __sk_destruct+0x46f/0x580
> [   50.265752]  tun_chr_close+0x330/0x4c0
> [   50.265754]  __fput+0x251/0x770
> [   50.265756]  task_work_run+0x10e/0x180
> [   50.265758]  exit_to_usermode_loop+0xcb/0xf0
> [   50.265760]  do_syscall_64+0x21d/0x280
> [   50.265762]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [   50.265762] 
> [   50.265840] The buggy address belongs to the object at ffff88018e172200
> [   50.265840]  which belongs to the cache kmalloc-2048 of size 2048
> [   50.265927] The buggy address is located 1784 bytes inside of
> [   50.265927]  2048-byte region [ffff88018e172200, ffff88018e172a00)
> [   50.266011] The buggy address belongs to the page:
> [   50.266089] page:ffffea0006385c00 count:1 mapcount:0 mapping:0000000000000000 index:0x0 compound_mapcount: 0
> [   50.266178] flags: 0x17fff8000008100(slab|head)
> [   50.266257] raw: 017fff8000008100 0000000000000000 0000000000000000 00000001800f000f
> [   50.266342] raw: dead000000000100 dead000000000200 ffff8801d9016800 0000000000000000
> [   50.266425] page dumped because: kasan: bad access detected
> [   50.266501] 
> [   50.266590] Memory state around the buggy address:
> [   50.266693]  ffff88018e172780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [   50.266776]  ffff88018e172800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [   50.266860] >ffff88018e172880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [   50.266943]                                                                 ^
> [   50.267020]  ffff88018e172900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [   50.267103]  ffff88018e172980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
> [   50.267192] ==================================================================
> [   50.267275] Disabling lock debugging due to kernel taint

[-- Attachment #2: 0001-tun-hold-a-tun-socket-during-ptr_ring_cleanup.patch --]
[-- Type: text/plain, Size: 1867 bytes --]

>From 7efa9d087be20a67c5c3953f7bf26ae5bafaa061 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@openvz.org>
Date: Wed, 16 May 2018 00:03:39 -0700
Subject: [PATCH] tun: hold a tun socket during ptr_ring_cleanup

Otherwise a socket will be destroyed together with a tun_file structure,
which is used in ptr_ring_cleanup.

This issue was reported by kasan:

BUG: KASAN: use-after-free in __lock_acquire.isra.30+0x1ad4/0x1bb0
Read of size 8 at addr ffff88018e1728f8 by task criu/1819

CPU: 0 PID: 1819 Comm: criu Not tainted 4.17.0-rc5-next-20180515+ #1
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 dump_stack+0x71/0xab
 print_address_description+0x6a/0x270
 kasan_report+0x237/0x360
 __lock_acquire.isra.30+0x1ad4/0x1bb0
 lock_acquire+0x10b/0x2a0
 _raw_spin_lock+0x25/0x30
 tun_chr_close+0x1d7/0x4c0
 __fput+0x251/0x770
 task_work_run+0x10e/0x180
 exit_to_usermode_loop+0xcb/0xf0
 do_syscall_64+0x21d/0x280
 ? prepare_exit_to_usermode+0x88/0x130
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Freed by task 1819:
 __kasan_slab_free+0x130/0x180
 kfree+0xd6/0x1f0
 __sk_destruct+0x46f/0x580
 tun_chr_close+0x330/0x4c0
 __fput+0x251/0x770
 task_work_run+0x10e/0x180
 exit_to_usermode_loop+0xcb/0xf0
 do_syscall_64+0x21d/0x280
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

Signed-off-by: Andrei Vagin <avagin@openvz.org>
---
 drivers/net/tun.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 8b0f0a0baab4..f3eae203cc58 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -3246,8 +3246,10 @@ static int tun_chr_close(struct inode *inode, struct file *file)
 {
 	struct tun_file *tfile = file->private_data;
 
+	sock_hold(&tfile->sk);
 	tun_detach(tfile, true);
 	ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
+	sock_put(&tfile->sk);
 
 	return 0;
 }
-- 
2.17.0


^ permalink raw reply related

* Re: [PATCH 08/14] net: sched: account for temporary action reference
From: Jiri Pirko @ 2018-05-16  7:12 UTC (permalink / raw)
  To: Vlad Buslov
  Cc: netdev, davem, jhs, xiyou.wangcong, pablo, kadlec, fw, ast,
	daniel, edumazet, keescook, linux-kernel, netfilter-devel,
	coreteam, kliteyn
In-Reply-To: <1526308035-12484-9-git-send-email-vladbu@mellanox.com>

Mon, May 14, 2018 at 04:27:09PM CEST, vladbu@mellanox.com wrote:
>tca_get_fill function has 'bind' and 'ref' arguments that get passed
>down to action dump function. These arguments values are subtracted from
>actual reference and bind counter values before writing them to skb.
>
>In order to prevent concurrent action delete, RTM_GETACTION handler
>acquires a reference to action before 'dumping' it and releases it
>afterwards. This reference is temporal and should not be accounted by
>userspace clients. (both logically and to preserver current API
>behavior)
>
>Use existing infrastructure of tca_get_fill arguments to subtract that
>temporary reference and not expose it to userspace.
>
>Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
>---
> net/sched/act_api.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
>diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>index 3f02cd1..2772276e 100644
>--- a/net/sched/act_api.c
>+++ b/net/sched/act_api.c
>@@ -935,7 +935,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n,
> 	if (!skb)
> 		return -ENOBUFS;
> 	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event,
>-			 0, 0) <= 0) {
>+			 0, 1) <= 0) {
> 		NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action");
> 		kfree_skb(skb);
> 		return -EINVAL;
>@@ -1125,7 +1125,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct list_head *actions,
> 		return -ENOBUFS;
> 
> 	if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION,
>-			 0, 1) <= 0) {
>+			 0, 2) <= 0) {

So now you are adjusting dump because of a change in a different patch
right? This also breaks bisect.

^ permalink raw reply

* Re: [PATCH V2] mlx4_core: allocate ICM memory in page size chunks
From: Tariq Toukan @ 2018-05-16  7:04 UTC (permalink / raw)
  To: Qing Huang, Tariq Toukan, davem, haakon.bugge, yanjun.zhu
  Cc: netdev, linux-rdma, linux-kernel
In-Reply-To: <f4c5d8d3-1eea-8566-9921-a9dc48435f66@oracle.com>



On 15/05/2018 9:53 PM, Qing Huang wrote:
> 
> 
> On 5/15/2018 2:19 AM, Tariq Toukan wrote:
>>
>>
>> On 14/05/2018 7:41 PM, Qing Huang wrote:
>>>
>>>
>>> On 5/13/2018 2:00 AM, Tariq Toukan wrote:
>>>>
>>>>
>>>> On 11/05/2018 10:23 PM, Qing Huang wrote:
>>>>> When a system is under memory presure (high usage with fragments),
>>>>> the original 256KB ICM chunk allocations will likely trigger kernel
>>>>> memory management to enter slow path doing memory compact/migration
>>>>> ops in order to complete high order memory allocations.
>>>>>
>>>>> When that happens, user processes calling uverb APIs may get stuck
>>>>> for more than 120s easily even though there are a lot of free pages
>>>>> in smaller chunks available in the system.
>>>>>
>>>>> Syslog:
>>>>> ...
>>>>> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
>>>>> oracle_205573_e:205573 blocked for more than 120 seconds.
>>>>> ...
>>>>>
>>>>> With 4KB ICM chunk size on x86_64 arch, the above issue is fixed.
>>>>>
>>>>> However in order to support smaller ICM chunk size, we need to fix
>>>>> another issue in large size kcalloc allocations.
>>>>>
>>>>> E.g.
>>>>> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
>>>>> size, each ICM chunk can only hold 512 mtt entries (8 bytes for 
>>>>> each mtt
>>>>> entry). So we need a 16MB allocation for a table->icm pointer array to
>>>>> hold 2M pointers which can easily cause kcalloc to fail.
>>>>>
>>>>> The solution is to use vzalloc to replace kcalloc. There is no need
>>>>> for contiguous memory pages for a driver meta data structure (no need
>>>>> of DMA ops).
>>>>>
>>>>> Signed-off-by: Qing Huang <qing.huang@oracle.com>
>>>>> Acked-by: Daniel Jurgens <danielj@mellanox.com>
>>>>> Reviewed-by: Zhu Yanjun <yanjun.zhu@oracle.com>
>>>>> ---
>>>>> v2 -> v1: adjusted chunk size to reflect different architectures.
>>>>>
>>>>>   drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
>>>>>   1 file changed, 7 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c 
>>>>> b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>> index a822f7a..ccb62b8 100644
>>>>> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
>>>>> @@ -43,12 +43,12 @@
>>>>>   #include "fw.h"
>>>>>     /*
>>>>> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
>>>>> - * per chunk.
>>>>> + * We allocate in page size (default 4KB on many archs) chunks to 
>>>>> avoid high
>>>>> + * order memory allocations in fragmented/high usage memory 
>>>>> situation.
>>>>>    */
>>>>>   enum {
>>>>> -    MLX4_ICM_ALLOC_SIZE    = 1 << 18,
>>>>> -    MLX4_TABLE_CHUNK_SIZE    = 1 << 18
>>>>> +    MLX4_ICM_ALLOC_SIZE    = 1 << PAGE_SHIFT,
>>>>> +    MLX4_TABLE_CHUNK_SIZE    = 1 << PAGE_SHIFT
>>>>
>>>> Which is actually PAGE_SIZE.
>>>
>>> Yes, we wanted to avoid high order memory allocations.
>>>
>>
>> Then please use PAGE_SIZE instead.
> 
> PAGE_SIZE is usually defined as 1 << PAGE_SHIFT. So I think PAGE_SHIFT 
> is actually more appropriate here.
> 

Definition of PAGE_SIZE varies among different archs.
It is not always as simple as 1 << PAGE_SHIFT.
It might be:
PAGE_SIZE (1UL << PAGE_SHIFT)
PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT)
etc...

Please replace 1 << PAGE_SHIFT with PAGE_SIZE.

> 
>>
>>>> Also, please add a comma at the end of the last entry.
>>>
>>> Hmm..., followed the existing code style and checkpatch.pl didn't 
>>> complain about the comma.
>>>
>>
>> I am in favor of having a comma also after the last element, so that 
>> when another enum element is added we do not modify this line again, 
>> which would falsely affect git blame.
>>
>> I know it didn't exist before your patch, but once we're here, let's 
>> do it.
> 
> I'm okay either way. If adding an extra comma is preferred by many 
> people, someone should update checkpatch.pl to enforce it. :)
> 
I agree.
Until then, please use an extra comma in this patch.

>>
>>>>
>>>>>   };
>>>>>     static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct 
>>>>> mlx4_icm_chunk *chunk)
>>>>> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, 
>>>>> struct mlx4_icm_table *table,
>>>>>       obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
>>>>>       num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
>>>>>   -    table->icm      = kcalloc(num_icm, sizeof(*table->icm), 
>>>>> GFP_KERNEL);
>>>>> +    table->icm      = vzalloc(num_icm * sizeof(*table->icm));
>>>>
>>>> Why not kvzalloc ?
>>>
>>> I think table->icm pointer array doesn't really need physically 
>>> contiguous memory. Sometimes high order
>>> memory allocation by kmalloc variants may trigger slow path and cause 
>>> tasks to be blocked.
>>>
>>
>> This is control path so it is less latency-sensitive.
>> Let's not produce unnecessary degradation here, please call kvzalloc 
>> so we maintain a similar behavior when contiguous memory is available, 
>> and a fallback for resiliency.
> 
> No sure what exactly degradation is caused by vzalloc here. I think it's 
> better to keep physically contiguous pages
> to other requests which really need them. Besides slow path/mem 
> compacting can be really expensive.
> 

Degradation is expected when you replace a contig memory with non-contig 
memory, without any perf test.
We agree that when contig memory is not available, we should use 
non-contig instead of simply failing, and for this you can call kvzalloc.

>>
>>> Thanks,
>>> Qing
>>>
>>>>
>>>>>       if (!table->icm)
>>>>>           return -ENOMEM;
>>>>>       table->virt     = virt;
>>>>> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, 
>>>>> struct mlx4_icm_table *table,
>>>>>               mlx4_free_icm(dev, table->icm[i], use_coherent);
>>>>>           }
>>>>>   -    kfree(table->icm);
>>>>> +    vfree(table->icm);
>>>>>         return -ENOMEM;
>>>>>   }
>>>>> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev 
>>>>> *dev, struct mlx4_icm_table *table)
>>>>>               mlx4_free_icm(dev, table->icm[i], table->coherent);
>>>>>           }
>>>>>   -    kfree(table->icm);
>>>>> +    vfree(table->icm);
>>>>>   }
>>>>>
>>>>
>>>> Thanks for your patch.
>>>>
>>>> I need to verify there is no dramatic performance degradation here.
>>>> You can prepare and send a v3 in the meanwhile.
>>>>
>>>> Thanks,
>>>> Tariq
>>>> -- 
>>>> To unsubscribe from this list: send the line "unsubscribe 
>>>> linux-rdma" in
>>>> the body of a message to majordomo@vger.kernel.org
>>>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>>>
>> -- 
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply

* Re: [PATCH net-next v2 2/2] drivers: net: Remove device_node checks with of_mdiobus_register()
From: Antoine Tenart @ 2018-05-16  6:49 UTC (permalink / raw)
  To: Florian Fainelli
  Cc: netdev, Andrew Lunn, Vivien Didelot, David S. Miller,
	Nicolas Ferre, Fugang Duan, Sergei Shtylyov, Giuseppe Cavallaro,
	Alexandre Torgue, Jose Abreu, Grygorii Strashko, Woojung Huh,
	Microchip Linux Driver Support, Rob Herring, Frank Rowand,
	Antoine Tenart, Tobias Jordan, Russell King
In-Reply-To: <20180515235619.27773-3-f.fainelli@gmail.com>

Hi Florian,

On Tue, May 15, 2018 at 04:56:19PM -0700, Florian Fainelli wrote:
> A number of drivers have the following pattern:
> 
> if (np)
> 	of_mdiobus_register()
> else
> 	mdiobus_register()
> 
> which the implementation of of_mdiobus_register() now takes care of.
> Remove that pattern in drivers that strictly adhere to it.
> 
> Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
> ---
>  drivers/net/dsa/bcm_sf2.c                         |  8 ++------
>  drivers/net/dsa/mv88e6xxx/chip.c                  |  5 +----
>  drivers/net/ethernet/cadence/macb_main.c          | 12 +++---------
>  drivers/net/ethernet/freescale/fec_main.c         |  8 ++------
>  drivers/net/ethernet/marvell/mvmdio.c             |  5 +----

For mvmdio,
Reviewed-by: Antoine Tenart <antoine.tenart@bootlin.com>

Thanks!
Antoine

>  drivers/net/ethernet/renesas/sh_eth.c             | 11 +++--------
>  drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c |  5 +----
>  drivers/net/ethernet/ti/davinci_mdio.c            |  8 +++-----
>  drivers/net/phy/mdio-gpio.c                       |  6 +-----
>  drivers/net/phy/mdio-mscc-miim.c                  |  6 +-----
>  drivers/net/usb/lan78xx.c                         |  7 ++-----
>  11 files changed, 20 insertions(+), 61 deletions(-)
> 
> diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
> index ac621f44237a..02e8982519ce 100644
> --- a/drivers/net/dsa/bcm_sf2.c
> +++ b/drivers/net/dsa/bcm_sf2.c
> @@ -450,12 +450,8 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
>  	priv->slave_mii_bus->parent = ds->dev->parent;
>  	priv->slave_mii_bus->phy_mask = ~priv->indir_phy_mask;
>  
> -	if (dn)
> -		err = of_mdiobus_register(priv->slave_mii_bus, dn);
> -	else
> -		err = mdiobus_register(priv->slave_mii_bus);
> -
> -	if (err)
> +	err = of_mdiobus_register(priv->slave_mii_bus, dn);
> +	if (err && dn)
>  		of_node_put(dn);
>  
>  	return err;
> diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c
> index b23c11d9f4b2..2bb3f03ee1cb 100644
> --- a/drivers/net/dsa/mv88e6xxx/chip.c
> +++ b/drivers/net/dsa/mv88e6xxx/chip.c
> @@ -2454,10 +2454,7 @@ static int mv88e6xxx_mdio_register(struct mv88e6xxx_chip *chip,
>  			return err;
>  	}
>  
> -	if (np)
> -		err = of_mdiobus_register(bus, np);
> -	else
> -		err = mdiobus_register(bus);
> +	err = of_mdiobus_register(bus, np);
>  	if (err) {
>  		dev_err(chip->dev, "Cannot register MDIO bus (%d)\n", err);
>  		mv88e6xxx_g2_irq_mdio_free(chip, bus);
> diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c
> index b4c9268100bb..3e93df5d4e3b 100644
> --- a/drivers/net/ethernet/cadence/macb_main.c
> +++ b/drivers/net/ethernet/cadence/macb_main.c
> @@ -591,16 +591,10 @@ static int macb_mii_init(struct macb *bp)
>  	dev_set_drvdata(&bp->dev->dev, bp->mii_bus);
>  
>  	np = bp->pdev->dev.of_node;
> +	if (pdata)
> +		bp->mii_bus->phy_mask = pdata->phy_mask;
>  
> -	if (np) {
> -		err = of_mdiobus_register(bp->mii_bus, np);
> -	} else {
> -		if (pdata)
> -			bp->mii_bus->phy_mask = pdata->phy_mask;
> -
> -		err = mdiobus_register(bp->mii_bus);
> -	}
> -
> +	err = of_mdiobus_register(bp->mii_bus, np);
>  	if (err)
>  		goto err_out_free_mdiobus;
>  
> diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
> index d4604bc8eb5b..f3e43db0d6cb 100644
> --- a/drivers/net/ethernet/freescale/fec_main.c
> +++ b/drivers/net/ethernet/freescale/fec_main.c
> @@ -2052,13 +2052,9 @@ static int fec_enet_mii_init(struct platform_device *pdev)
>  	fep->mii_bus->parent = &pdev->dev;
>  
>  	node = of_get_child_by_name(pdev->dev.of_node, "mdio");
> -	if (node) {
> -		err = of_mdiobus_register(fep->mii_bus, node);
> +	err = of_mdiobus_register(fep->mii_bus, node);
> +	if (node)
>  		of_node_put(node);
> -	} else {
> -		err = mdiobus_register(fep->mii_bus);
> -	}
> -
>  	if (err)
>  		goto err_out_free_mdiobus;
>  
> diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c
> index 0495487f7b42..c5dac6bd2be4 100644
> --- a/drivers/net/ethernet/marvell/mvmdio.c
> +++ b/drivers/net/ethernet/marvell/mvmdio.c
> @@ -348,10 +348,7 @@ static int orion_mdio_probe(struct platform_device *pdev)
>  		goto out_mdio;
>  	}
>  
> -	if (pdev->dev.of_node)
> -		ret = of_mdiobus_register(bus, pdev->dev.of_node);
> -	else
> -		ret = mdiobus_register(bus);
> +	ret = of_mdiobus_register(bus, pdev->dev.of_node);
>  	if (ret < 0) {
>  		dev_err(&pdev->dev, "Cannot register MDIO bus (%d)\n", ret);
>  		goto out_mdio;
> diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c
> index 5970d9e5ddf1..8dd41e08a6c6 100644
> --- a/drivers/net/ethernet/renesas/sh_eth.c
> +++ b/drivers/net/ethernet/renesas/sh_eth.c
> @@ -3025,15 +3025,10 @@ static int sh_mdio_init(struct sh_eth_private *mdp,
>  		 pdev->name, pdev->id);
>  
>  	/* register MDIO bus */
> -	if (dev->of_node) {
> -		ret = of_mdiobus_register(mdp->mii_bus, dev->of_node);
> -	} else {
> -		if (pd->phy_irq > 0)
> -			mdp->mii_bus->irq[pd->phy] = pd->phy_irq;
> -
> -		ret = mdiobus_register(mdp->mii_bus);
> -	}
> +	if (pd->phy_irq > 0)
> +		mdp->mii_bus->irq[pd->phy] = pd->phy_irq;
>  
> +	ret = of_mdiobus_register(mdp->mii_bus, dev->of_node);
>  	if (ret)
>  		goto out_free_bus;
>  
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
> index f5f37bfa1d58..5df1a608e566 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
> @@ -233,10 +233,7 @@ int stmmac_mdio_register(struct net_device *ndev)
>  	new_bus->phy_mask = mdio_bus_data->phy_mask;
>  	new_bus->parent = priv->device;
>  
> -	if (mdio_node)
> -		err = of_mdiobus_register(new_bus, mdio_node);
> -	else
> -		err = mdiobus_register(new_bus);
> +	err = of_mdiobus_register(new_bus, mdio_node);
>  	if (err != 0) {
>  		dev_err(dev, "Cannot register the MDIO bus\n");
>  		goto bus_register_fail;
> diff --git a/drivers/net/ethernet/ti/davinci_mdio.c b/drivers/net/ethernet/ti/davinci_mdio.c
> index 98a1c97fb95e..8ac72831af05 100644
> --- a/drivers/net/ethernet/ti/davinci_mdio.c
> +++ b/drivers/net/ethernet/ti/davinci_mdio.c
> @@ -429,12 +429,10 @@ static int davinci_mdio_probe(struct platform_device *pdev)
>  	 * defined to support backward compatibility with DTs which assume that
>  	 * Davinci MDIO will always scan the bus for PHYs detection.
>  	 */
> -	if (dev->of_node && of_get_child_count(dev->of_node)) {
> +	if (dev->of_node && of_get_child_count(dev->of_node))
>  		data->skip_scan = true;
> -		ret = of_mdiobus_register(data->bus, dev->of_node);
> -	} else {
> -		ret = mdiobus_register(data->bus);
> -	}
> +
> +	ret = of_mdiobus_register(data->bus, dev->of_node);
>  	if (ret)
>  		goto bail_out;
>  
> diff --git a/drivers/net/phy/mdio-gpio.c b/drivers/net/phy/mdio-gpio.c
> index b501221819e1..4e4c8daf44c3 100644
> --- a/drivers/net/phy/mdio-gpio.c
> +++ b/drivers/net/phy/mdio-gpio.c
> @@ -179,11 +179,7 @@ static int mdio_gpio_probe(struct platform_device *pdev)
>  	if (!new_bus)
>  		return -ENODEV;
>  
> -	if (pdev->dev.of_node)
> -		ret = of_mdiobus_register(new_bus, pdev->dev.of_node);
> -	else
> -		ret = mdiobus_register(new_bus);
> -
> +	ret = of_mdiobus_register(new_bus, pdev->dev.of_node);
>  	if (ret)
>  		mdio_gpio_bus_deinit(&pdev->dev);
>  
> diff --git a/drivers/net/phy/mdio-mscc-miim.c b/drivers/net/phy/mdio-mscc-miim.c
> index 8c689ccfdbca..badbc99bedd3 100644
> --- a/drivers/net/phy/mdio-mscc-miim.c
> +++ b/drivers/net/phy/mdio-mscc-miim.c
> @@ -151,11 +151,7 @@ static int mscc_miim_probe(struct platform_device *pdev)
>  		}
>  	}
>  
> -	if (pdev->dev.of_node)
> -		ret = of_mdiobus_register(bus, pdev->dev.of_node);
> -	else
> -		ret = mdiobus_register(bus);
> -
> +	ret = of_mdiobus_register(bus, pdev->dev.of_node);
>  	if (ret < 0) {
>  		dev_err(&pdev->dev, "Cannot register MDIO bus (%d)\n", ret);
>  		return ret;
> diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
> index 91761436709a..8dff87ec6d99 100644
> --- a/drivers/net/usb/lan78xx.c
> +++ b/drivers/net/usb/lan78xx.c
> @@ -1843,12 +1843,9 @@ static int lan78xx_mdio_init(struct lan78xx_net *dev)
>  	}
>  
>  	node = of_get_child_by_name(dev->udev->dev.of_node, "mdio");
> -	if (node) {
> -		ret = of_mdiobus_register(dev->mdiobus, node);
> +	ret = of_mdiobus_register(dev->mdiobus, node);
> +	if (node)
>  		of_node_put(node);
> -	} else {
> -		ret = mdiobus_register(dev->mdiobus);
> -	}
>  	if (ret) {
>  		netdev_err(dev->net, "can't register MDIO bus\n");
>  		goto exit1;
> -- 
> 2.14.1
> 

-- 
Antoine Ténart, Bootlin (formerly Free Electrons)
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* Re: [PATCH net-next v2 15/15] arm64: dts: allwinner: a64: add SRAM controller device tree node
From: Chen-Yu Tsai @ 2018-05-16  6:47 UTC (permalink / raw)
  To: Maxime Ripard
  Cc: Icenowy Zheng, linux-arm-kernel, Mark Rutland, devicetree,
	Stephen Boyd, netdev, Michael Turquette, Rob Herring,
	Corentin Labbe, Mark Brown, Giuseppe Cavallaro, linux-clk
In-Reply-To: <20180514080310.ngev5h6cqe4taedl@flea>

On Mon, May 14, 2018 at 1:03 AM, Maxime Ripard
<maxime.ripard@bootlin.com> wrote:
> 1;5201;0c
> On Sun, May 13, 2018 at 12:37:49PM -0700, Chen-Yu Tsai wrote:
>> On Wed, May 2, 2018 at 4:54 AM, Maxime Ripard <maxime.ripard@bootlin.com> wrote:
>> > On Wed, May 02, 2018 at 06:19:51PM +0800, Icenowy Zheng wrote:
>> >>
>> >>
>> >> 于 2018年5月2日 GMT+08:00 下午5:53:21, Chen-Yu Tsai <wens@csie.org> 写到:
>> >> >On Wed, May 2, 2018 at 5:51 PM, Maxime Ripard
>> >> ><maxime.ripard@bootlin.com> wrote:
>> >> >> Hi,
>> >> >>
>> >> >> On Wed, May 02, 2018 at 12:12:27AM +0800, Chen-Yu Tsai wrote:
>> >> >>> From: Icenowy Zheng <icenowy@aosc.io>
>> >> >>>
>> >> >>> Allwinner A64 has a SRAM controller, and in the device tree
>> >> >currently
>> >> >>> we have a syscon node to enable EMAC driver to access the EMAC clock
>> >> >>> register. As SRAM controller driver can now export regmap for this
>> >> >>> register, replace the syscon node to the SRAM controller device
>> >> >node,
>> >> >>> and let EMAC driver to acquire its EMAC clock regmap.
>> >> >>>
>> >> >>> Signed-off-by: Icenowy Zheng <icenowy@aosc.io>
>> >> >>> Signed-off-by: Chen-Yu Tsai <wens@csie.org>
>> >> >>> ---
>> >> >>>  arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi | 23
>> >> >+++++++++++++++----
>> >> >>>  1 file changed, 19 insertions(+), 4 deletions(-)
>> >> >>>
>> >> >>> diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
>> >> >b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
>> >> >>> index 1b2ef28c42bd..1c37659d9d41 100644
>> >> >>> --- a/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
>> >> >>> +++ b/arch/arm64/boot/dts/allwinner/sun50i-a64.dtsi
>> >> >>> @@ -168,10 +168,25 @@
>> >> >>>               #size-cells = <1>;
>> >> >>>               ranges;
>> >> >>>
>> >> >>> -             syscon: syscon@1c00000 {
>> >> >>> -                     compatible =
>> >> >"allwinner,sun50i-a64-system-controller",
>> >> >>> -                             "syscon";
>> >> >>> +             sram_controller: sram-controller@1c00000 {
>> >> >>> +                     compatible =
>> >> >"allwinner,sun50i-a64-sram-controller";
>> >> >>
>> >> >> I don't think there's anything preventing us from keeping the
>> >> >> -system-controller compatible. It's what was in the DT before, and
>> >> >> it's how it's called in the datasheet.
>> >> >
>> >> >I actually meant to ask you about this. The -system-controller
>> >> >compatible matches the datasheet better. Maybe we should just
>> >> >switch to that one?
>> >>
>> >> No, if we do the switch the system-controller compatible,
>> >> the device will be probed on the same memory region with
>> >> a syscon on old DTs.
>> >
>> > The device hasn't magically changed either. Maybe we just need to add
>> > a check to make sure we don't have the syscon compatible in the SRAM
>> > driver probe so that the double driver issue doesn't happen?
>>
>> The syscon interface (which is not even a full blown device driver)
>> only looks at the "syscon" compatible. Either way we're removing that
>> part from the device tree so things should be ok for new device trees.
>> As Maxime mentioned we can do a check for the syscon compatible and
>> either give a warning to the user asking them to update their device
>> tree, or not register our custom regmap, or not probe the SRAM driver.
>> Personally I prefer the first option. The system controller block is
>> probed before any syscon users, so we should be fine, given the dwmac
>> driver goes the custom regmap path first.
>>
>> BTW, I still might end up changing the compatible. The manual uses
>> "system control", not "system controller", which I think makes sense,
>> since it is just a bunch of register files, kind of like the GRF
>> (General Register Files) block found in Rockchip SoCs [1], and not an
>> actual "controller".
>
> I'm not really fond of that, but we should at least make it consistent
> on the other patches Paul sent then.

For the A10s / A13 right?

I think my naming is slightly better, but it's just a minor detail.
While we're still debating this, can I merge the R40 stuff first?
The driver bits are already in.

Thanks
ChenYu

^ permalink raw reply

* Re: [PATCH 00/14] Modify action API for implementing lockless actions
From: Vlad Buslov @ 2018-05-16  6:43 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: netdev, davem, xiyou.wangcong, jiri, pablo, kadlec, fw, ast,
	daniel, edumazet, kliteyn
In-Reply-To: <2ee4066e-643a-f901-8926-7001f8699163@mojatatu.com>


On Tue 15 May 2018 at 21:49, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> On 15/05/18 05:21 PM, Vlad Buslov wrote:
>> 
>> On Tue 15 May 2018 at 18:25, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>>> On 14/05/18 04:46 PM, Vlad Buslov wrote:
>>>>
>>>> On Mon 14 May 2018 at 18:03, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
>>>>> On 14/05/18 10:27 AM, Vlad Buslov wrote:
>>>
>>>
>>>> Hello Jamal,
>>>>
>>>> I'm trying to run tdc, but keep getting following error even on clean
>>>> branch without my patches:
>>>
>>> Vlad, not sure if you saw my email:
>>> Apply Roman's patch and try again
>>>
>>> https://marc.info/?l=linux-netdev&m=152639369112020&w=2
>>>
>>> cheers,
>>> jamal
>> 
>> With patch applied I get following error:
>> 
>> Test 7d50: Add skbmod action to set destination mac
>> exit: 255 0
>> dst MAC address <11:22:33:44:55:66>
>> RTNETLINK answers: No such file or directory
>> We have an error talking to the kernel
>> 
>
> You may actually have broken something with your patches in this case.

Results is for net-next without my patches.

>
> Lucas - does this test pass on latest net-next?
>
> cheers,
> jamal
>
> PS:- any reason for the big Cc? I have trimmed it down.

This Cc was generated by gen_maintainer.

^ permalink raw reply

* Re: [PATCH] ethernet: stmmac: dwmac-rk: Add GMAC support for px30
From: Shawn Lin @ 2018-05-16  6:34 UTC (permalink / raw)
  To: David Wu, robh+dt
  Cc: davem, heiko, shawn.lin, mark.rutland, huangtao, netdev,
	linux-kernel, linux-rockchip, linux-arm-kernel
In-Reply-To: <1526441925-12654-1-git-send-email-david.wu@rock-chips.com>

Hi David,

On 2018/5/16 11:38, David Wu wrote:
> Add constants and callback functions for the dwmac on px30 soc.

s/soc/SoC

> The base structure is the same, but registers and the bits in
> them moved slightly, and add the clk_mac_speed for the select

s/moved/are moved

> of mac speed.

for selecting mas speed.

> 
> Signed-off-by: David Wu <david.wu@rock-chips.com>

git log --oneline  drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c

shows very inconsistent format wrt. commit title, so please
follow the exsiting exsamples as possible.

> ---
>   .../devicetree/bindings/net/rockchip-dwmac.txt     |  1 +
>   drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c     | 64 ++++++++++++++++++++++
>   2 files changed, 65 insertions(+)
> 
> diff --git a/Documentation/devicetree/bindings/net/rockchip-dwmac.txt b/Documentation/devicetree/bindings/net/rockchip-dwmac.txt
> index 9c16ee2..3b71da7 100644
> --- a/Documentation/devicetree/bindings/net/rockchip-dwmac.txt
> +++ b/Documentation/devicetree/bindings/net/rockchip-dwmac.txt

It'd be better to split doc changes into a separate patch.

> @@ -4,6 +4,7 @@ The device node has following properties.
>   
>   Required properties:
>    - compatible: should be "rockchip,<name>-gamc"
> +   "rockchip,px30-gmac":   found on PX30 SoCs
>      "rockchip,rk3128-gmac": found on RK312x SoCs
>      "rockchip,rk3228-gmac": found on RK322x SoCs
>      "rockchip,rk3288-gmac": found on RK3288 SoCs
> diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
> index 13133b3..4b2ab71 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-rk.c
> @@ -61,6 +61,7 @@ struct rk_priv_data {
>   	struct clk *mac_clk_tx;
>   	struct clk *clk_mac_ref;
>   	struct clk *clk_mac_refout;
> +	struct clk *clk_mac_speed;

No need to do anything now but it seems you could consider doing some
cleanup by using clk bulk APIs in the future.

>   	struct clk *aclk_mac;
>   	struct clk *pclk_mac;
>   	struct clk *clk_phy;
> @@ -83,6 +84,64 @@ struct rk_priv_data {
>   	(((tx) ? soc##_GMAC_TXCLK_DLY_ENABLE : soc##_GMAC_TXCLK_DLY_DISABLE) | \
>   	 ((rx) ? soc##_GMAC_RXCLK_DLY_ENABLE : soc##_GMAC_RXCLK_DLY_DISABLE))
>   
> +#define PX30_GRF_GMAC_CON1		0X0904

s/0X0904/0x0904 , since the other constants in this file follow the
same format.

> +
> +/* PX30_GRF_GMAC_CON1 */
> +#define PX30_GMAC_PHY_INTF_SEL_RMII	(GRF_CLR_BIT(4) | GRF_CLR_BIT(5) | \
> +					GRF_BIT(6))
> +#define PX30_GMAC_SPEED_10M		GRF_CLR_BIT(2)
> +#define PX30_GMAC_SPEED_100M		GRF_BIT(2)
> +
> +static void px30_set_to_rmii(struct rk_priv_data *bsp_priv)
> +{
> +	struct device *dev = &bsp_priv->pdev->dev;
> +
> +	if (IS_ERR(bsp_priv->grf)) {
> +		dev_err(dev, "%s: Missing rockchip,grf property\n", __func__);
> +		return;
> +	}
> +
> +	regmap_write(bsp_priv->grf, PX30_GRF_GMAC_CON1,
> +		     PX30_GMAC_PHY_INTF_SEL_RMII);
> +}
> +
> +static void px30_set_rmii_speed(struct rk_priv_data *bsp_priv, int speed)
> +{
> +	struct device *dev = &bsp_priv->pdev->dev;
> +	int ret;
> +
> +	if (IS_ERR(bsp_priv->clk_mac_speed)) {
> +		dev_err(dev, "%s: Missing clk_mac_speed clock\n", __func__);
> +		return;
> +	}
> +
> +	if (speed == 10) {
> +		regmap_write(bsp_priv->grf, PX30_GRF_GMAC_CON1,
> +			     PX30_GMAC_SPEED_10M);
> +
> +		ret = clk_set_rate(bsp_priv->clk_mac_speed, 2500000);
> +		if (ret)
> +			dev_err(dev, "%s: set clk_mac_speed rate 2500000 failed: %d\n",
> +				__func__, ret);
> +	} else if (speed == 100) {
> +		regmap_write(bsp_priv->grf, PX30_GRF_GMAC_CON1,
> +			     PX30_GMAC_SPEED_100M);
> +
> +		ret = clk_set_rate(bsp_priv->clk_mac_speed, 25000000);
> +		if (ret)
> +			dev_err(dev, "%s: set clk_mac_speed rate 25000000 failed: %d\n",
> +				__func__, ret);

I know it follows the existing examples, but IMHO it duplicates
unnecessary code as all the difference is PX30_GMAC_SPEED_*

> +
> +	} else {
> +		dev_err(dev, "unknown speed value for RMII! speed=%d", speed);
> +	}
> +}
> +
> +static const struct rk_gmac_ops px30_ops = {
> +	.set_to_rmii = px30_set_to_rmii,
> +	.set_rmii_speed = px30_set_rmii_speed,
> +};
> +
>   #define RK3128_GRF_MAC_CON0	0x0168
>   #define RK3128_GRF_MAC_CON1	0x016c
>   
> @@ -1042,6 +1101,10 @@ static int rk_gmac_clk_init(struct plat_stmmacenet_data *plat)
>   		}
>   	}
>   
> +	bsp_priv->clk_mac_speed = devm_clk_get(dev, "clk_mac_speed");

Mightbe it'd be better to use "mac-speed" in DT bindings.

> +	if (IS_ERR(bsp_priv->clk_mac_speed))
> +		dev_err(dev, "cannot get clock %s\n", "clk_mac_speed");
> +

Would you like to handle deferred probe?

>   	if (bsp_priv->clock_input) {
>   		dev_info(dev, "clock input from PHY\n");
>   	} else {
> @@ -1424,6 +1487,7 @@ static int rk_gmac_resume(struct device *dev)
>   static SIMPLE_DEV_PM_OPS(rk_gmac_pm_ops, rk_gmac_suspend, rk_gmac_resume);
>   
>   static const struct of_device_id rk_gmac_dwmac_match[] = {
> +	{ .compatible = "rockchip,px30-gmac",	.data = &px30_ops   },
>   	{ .compatible = "rockchip,rk3128-gmac", .data = &rk3128_ops },
>   	{ .compatible = "rockchip,rk3228-gmac", .data = &rk3228_ops },
>   	{ .compatible = "rockchip,rk3288-gmac", .data = &rk3288_ops },
> 

^ permalink raw reply

* linux-next: BUG: KASAN: use-after-free in tun_chr_close
From: Andrei Vagin @ 2018-05-16  6:28 UTC (permalink / raw)
  To: netdev; +Cc: Jason Wang

We run CRIU tests on linux-next regularly and today we caught this bug:

https://travis-ci.org/avagin/linux/jobs/379450631

[   50.264837] ==================================================================
[   50.264986] BUG: KASAN: use-after-free in __lock_acquire.isra.30+0x1ad4/0x1bb0
[   50.265088] Read of size 8 at addr ffff88018e1728f8 by task criu/1819
[   50.265167] 
[   50.265249] CPU: 0 PID: 1819 Comm: criu Not tainted 4.17.0-rc5-next-20180515+ #1
[   50.265251] Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
[   50.265252] Call Trace:
[   50.265262]  dump_stack+0x71/0xab
[   50.265265]  ? __lock_acquire.isra.30+0x1ad4/0x1bb0
[   50.265271]  print_address_description+0x6a/0x270
[   50.265273]  ? __lock_acquire.isra.30+0x1ad4/0x1bb0
[   50.265275]  kasan_report+0x237/0x360
[   50.265278]  __lock_acquire.isra.30+0x1ad4/0x1bb0
[   50.265285]  ? register_netdev+0x30/0x30
[   50.265288]  lock_acquire+0x10b/0x2a0
[   50.265294]  ? tun_chr_close+0x1d7/0x4c0
[   50.265298]  ? kfree+0xd6/0x1f0
[   50.265303]  _raw_spin_lock+0x25/0x30
[   50.265306]  ? tun_chr_close+0x1d7/0x4c0
[   50.265308]  tun_chr_close+0x1d7/0x4c0
[   50.265313]  ? fcntl_setlk+0xaf0/0xaf0
[   50.265320]  __fput+0x251/0x770
[   50.265324]  task_work_run+0x10e/0x180
[   50.265330]  exit_to_usermode_loop+0xcb/0xf0
[   50.265332]  do_syscall_64+0x21d/0x280
[   50.265335]  ? prepare_exit_to_usermode+0x88/0x130
[   50.265338]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   50.265342] RIP: 0033:0x1494fa6f93f0
[   50.265342] Code: 73 01 c3 48 8b 0d b8 9b 20 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 59 e0 20 00 00 75 10 b8 03 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 0e fa ff ff 48 89 04 24 
[   50.265388] RSP: 002b:00007ffd229fe7f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000003
[   50.265391] RAX: 0000000000000000 RBX: 0000000000000004 RCX: 00001494fa6f93f0
[   50.265393] RDX: 00007ffd229fe80c RSI: 00000000400454da RDI: 0000000000000004
[   50.265395] RBP: 0000000000000000 R08: 000000000000420b R09: 0000000000000000
[   50.265396] R10: 0000000000000000 R11: 0000000000000246 R12: 00001494fab116a0
[   50.265398] R13: 0000000000000d06 R14: 0000000000000000 R15: 0000000000000000
[   50.265400] 
[   50.265476] Allocated by task 1819:
[   50.265554]  kasan_kmalloc+0xa0/0xd0
[   50.265556]  __kmalloc+0x13a/0x250
[   50.265561]  sk_prot_alloc+0xd3/0x250
[   50.265564]  sk_alloc+0x35/0x9d0
[   50.265566]  tun_chr_open+0x7b/0x5a0
[   50.265570]  misc_open+0x313/0x480
[   50.265573]  chrdev_open+0x1d6/0x4b0
[   50.265575]  do_dentry_open+0x6ae/0xee0
[   50.265578]  path_openat+0xce6/0x2890
[   50.265580]  do_filp_open+0x17a/0x270
[   50.265582]  do_sys_open+0x203/0x340
[   50.265584]  do_syscall_64+0xa0/0x280
[   50.265586]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   50.265587] 
[   50.265667] Freed by task 1819:
[   50.265745]  __kasan_slab_free+0x130/0x180
[   50.265747]  kfree+0xd6/0x1f0
[   50.265750]  __sk_destruct+0x46f/0x580
[   50.265752]  tun_chr_close+0x330/0x4c0
[   50.265754]  __fput+0x251/0x770
[   50.265756]  task_work_run+0x10e/0x180
[   50.265758]  exit_to_usermode_loop+0xcb/0xf0
[   50.265760]  do_syscall_64+0x21d/0x280
[   50.265762]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   50.265762] 
[   50.265840] The buggy address belongs to the object at ffff88018e172200
[   50.265840]  which belongs to the cache kmalloc-2048 of size 2048
[   50.265927] The buggy address is located 1784 bytes inside of
[   50.265927]  2048-byte region [ffff88018e172200, ffff88018e172a00)
[   50.266011] The buggy address belongs to the page:
[   50.266089] page:ffffea0006385c00 count:1 mapcount:0 mapping:0000000000000000 index:0x0 compound_mapcount: 0
[   50.266178] flags: 0x17fff8000008100(slab|head)
[   50.266257] raw: 017fff8000008100 0000000000000000 0000000000000000 00000001800f000f
[   50.266342] raw: dead000000000100 dead000000000200 ffff8801d9016800 0000000000000000
[   50.266425] page dumped because: kasan: bad access detected
[   50.266501] 
[   50.266590] Memory state around the buggy address:
[   50.266693]  ffff88018e172780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   50.266776]  ffff88018e172800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   50.266860] >ffff88018e172880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   50.266943]                                                                 ^
[   50.267020]  ffff88018e172900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   50.267103]  ffff88018e172980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[   50.267192] ==================================================================
[   50.267275] Disabling lock debugging due to kernel taint

^ permalink raw reply

* Re: [PATCH bpf-next 7/7] tools/bpftool: add perf subcommand
From: Yonghong Song @ 2018-05-16  5:54 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: peterz, ast, daniel, netdev, kernel-team, Quentin Monnet
In-Reply-To: <20180515214145.4c6b31bc@cakuba.netronome.com>



On 5/15/18 9:41 PM, Jakub Kicinski wrote:
> On Tue, 15 May 2018 16:45:21 -0700, Yonghong Song wrote:
>> The new command "bpftool perf [show]" will traverse
>> all processes under /proc, and if any fd is associated
>> with a perf event, it will print out related perf event
>> information.
>>
>> Below is an example to show the results using bcc commands.
>> Running the following 4 bcc commands:
>>    kprobe:     trace.py '__x64_sys_nanosleep'
>>    kretprobe:  trace.py 'r::__x64_sys_nanosleep'
>>    tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
>>    uprobe:     trace.py 'p:/home/yhs/a.out:main'
>>
>> The bpftool command line and result:
>>
>>    $ bpftool perf
>>    21711: prog_id 5 kprobe func __x64_sys_write offset 0
>>    21765: prog_id 7 kretprobe func __x64_sys_nanosleep offset 0
>>    21767: prog_id 8 tracepoint sys_enter_nanosleep
>>    21800: prog_id 9 uprobe filename /home/yhs/a.out offset 1159
>>
>>    $ bpftool -j perf
>>    {"pid":21711,"prog_id":5,"prog_info":"kprobe","func":"__x64_sys_write","offset":0}, \
>>    {"pid":21765,"prog_id":7,"prog_info":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \
>>    {"pid":21767,"prog_id":8,"prog_info":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \
>>    {"pid":21800,"prog_id":9,"prog_info":"uprobe","filename":"/home/yhs/a.out","offset":1159}
> 
> You need to wrap the objects inside an array, so
> 
> 	if (json_output)
> 		jsonw_start_array(json_wtr);
> 	nftw();
> 	if (json_output)
> 		jsonw_end_array(json_wtr);
> 
> otherwise output will not be a valid JSON.  To validate JSON try:
> 
> $ bpftool -j perf | python -m json.tool

Thanks for detailed review! All of your comments make sense.
I will address them in next revision after getting some feedback
for other patches.

> 
>>    $ bpftool prog
>>    5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
>> 	  loaded_at 2018-05-15T04:46:37-0700  uid 0
>> 	  xlated 200B  not jited  memlock 4096B  map_ids 4
>>    7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
>> 	  loaded_at 2018-05-15T04:48:32-0700  uid 0
>> 	  xlated 200B  not jited  memlock 4096B  map_ids 7
>>    8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
>> 	  loaded_at 2018-05-15T04:48:48-0700  uid 0
>> 	  xlated 200B  not jited  memlock 4096B  map_ids 8
>>    9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
>> 	  loaded_at 2018-05-15T04:49:52-0700  uid 0
>> 	  xlated 200B  not jited  memlock 4096B  map_ids 9
>>
>>    $ ps ax | grep "python ./trace.py"
>>    21711 pts/0    T      0:03 python ./trace.py __x64_sys_write
>>    21765 pts/0    S+     0:00 python ./trace.py r::__x64_sys_nanosleep
>>    21767 pts/2    S+     0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
>>    21800 pts/3    S+     0:00 python ./trace.py p:/home/yhs/a.out:main
>>    22374 pts/1    S+     0:00 grep --color=auto python ./trace.py
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   tools/bpf/bpftool/main.c |   3 +-
>>   tools/bpf/bpftool/main.h |   1 +
>>   tools/bpf/bpftool/perf.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++
> 
> Would you be able to also extend the Documentation/ and bash
> completions?
> 
>>   3 files changed, 191 insertions(+), 1 deletion(-)
>>   create mode 100644 tools/bpf/bpftool/perf.c
>>
>> diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
>> index 1ec852d..eea7f14 100644
>> --- a/tools/bpf/bpftool/main.c
>> +++ b/tools/bpf/bpftool/main.c
>> @@ -87,7 +87,7 @@ static int do_help(int argc, char **argv)
>>   		"       %s batch file FILE\n"
>>   		"       %s version\n"
>>   		"\n"
>> -		"       OBJECT := { prog | map | cgroup }\n"
>> +		"       OBJECT := { prog | map | cgroup | perf }\n"
>>   		"       " HELP_SPEC_OPTIONS "\n"
>>   		"",
>>   		bin_name, bin_name, bin_name);
>> @@ -216,6 +216,7 @@ static const struct cmd cmds[] = {
>>   	{ "prog",	do_prog },
>>   	{ "map",	do_map },
>>   	{ "cgroup",	do_cgroup },
>> +	{ "perf",	do_perf },
>>   	{ "version",	do_version },
>>   	{ 0 }
>>   };
>> diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
>> index 6173cd9..63fdb31 100644
>> --- a/tools/bpf/bpftool/main.h
>> +++ b/tools/bpf/bpftool/main.h
>> @@ -119,6 +119,7 @@ int do_prog(int argc, char **arg);
>>   int do_map(int argc, char **arg);
>>   int do_event_pipe(int argc, char **argv);
>>   int do_cgroup(int argc, char **arg);
>> +int do_perf(int argc, char **arg);
>>   
>>   int prog_parse_fd(int *argc, char ***argv);
>>   int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
>> diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c
>> new file mode 100644
>> index 0000000..6d676e4
>> --- /dev/null
>> +++ b/tools/bpf/bpftool/perf.c
>> @@ -0,0 +1,188 @@
>> +// SPDX-License-Identifier: GPL-2.0+
>> +// Copyright (C) 2018 Facebook
>> +// Author: Yonghong Song <yhs@fb.com>
>> +
>> +#define _GNU_SOURCE
>> +#include <fcntl.h>
>> +#include <stdlib.h>
>> +#include <string.h>
>> +#include <sys/stat.h>
>> +#include <sys/types.h>
>> +#include <unistd.h>
>> +#include <ftw.h>
>> +
>> +#include <bpf.h>
>> +
>> +#include "main.h"
>> +
>> +static void print_perf_json(int pid, __u32 prog_id, __u32 prog_info,
>> +			    char *buf, __u64 probe_offset, __u64 probe_addr)
>> +{
>> +	jsonw_start_object(json_wtr);
>> +	jsonw_int_field(json_wtr, "pid", pid);
>> +	jsonw_uint_field(json_wtr, "prog_id", prog_id);
>> +	switch (prog_info) {
>> +	case BPF_PERF_INFO_TP_NAME:
>> +		jsonw_string_field(json_wtr, "prog_info", "tracepoint");
>> +		jsonw_string_field(json_wtr, "tracepoint", buf);
>> +		break;
>> +	case BPF_PERF_INFO_KPROBE:
>> +		jsonw_string_field(json_wtr, "prog_info", "kprobe");
>> +		if (buf[0] != '\0') {
>> +			jsonw_string_field(json_wtr, "func", buf);
>> +			jsonw_lluint_field(json_wtr, "offset", probe_offset);
>> +		} else {
>> +			jsonw_lluint_field(json_wtr, "addr", probe_addr);
>> +		}
>> +		break;
>> +	case BPF_PERF_INFO_KRETPROBE:
>> +		jsonw_string_field(json_wtr, "prog_info", "kretprobe");
>> +		if (buf[0] != '\0') {
>> +			jsonw_string_field(json_wtr, "func", buf);
>> +			jsonw_lluint_field(json_wtr, "offset", probe_offset);
>> +		} else {
>> +			jsonw_lluint_field(json_wtr, "addr", probe_addr);
>> +		}
>> +		break;
>> +	case BPF_PERF_INFO_UPROBE:
>> +		jsonw_string_field(json_wtr, "prog_info", "uprobe");
>> +		jsonw_string_field(json_wtr, "filename", buf);
>> +		jsonw_lluint_field(json_wtr, "offset", probe_offset);
>> +		break;
>> +	case BPF_PERF_INFO_URETPROBE:
>> +		jsonw_string_field(json_wtr, "prog_info", "uretprobe");
>> +		jsonw_string_field(json_wtr, "filename", buf);
>> +		jsonw_lluint_field(json_wtr, "offset", probe_offset);
>> +		break;
>> +	}
>> +	jsonw_end_object(json_wtr);
>> +}
>> +
>> +static void print_perf_plain(int pid, __u32 prog_id, __u32 prog_info,
>> +			    char *buf, __u64 probe_offset, __u64 probe_addr)
>> +{
>> +	printf("%d: prog_id %u ", pid, prog_id);
> 
> nit: for consistency with prog and map listings consider using double
> spaces after prog_id (i.e. between fields).  Not a hard requirement,
> though, perhaps I'm the only one who finds that more readable :)
> 
>> +	switch (prog_info) {
>> +	case BPF_PERF_INFO_TP_NAME:
>> +		printf("tracepoint %s\n", buf);
>> +		break;
>> +	case BPF_PERF_INFO_KPROBE:
>> +		if (buf[0] != '\0')
>> +			printf("kprobe func %s offset %llu\n", buf,
>> +			       probe_offset);
>> +		else
>> +			printf("kprobe addr %llu\n", probe_addr);
>> +		break;
>> +	case BPF_PERF_INFO_KRETPROBE:
>> +		if (buf[0] != '\0')
>> +			printf("kretprobe func %s offset %llu\n", buf,
>> +			       probe_offset);
>> +		else
>> +			printf("kretprobe addr %llu\n", probe_addr);
>> +		break;
>> +	case BPF_PERF_INFO_UPROBE:
>> +		printf("uprobe filename %s offset %llu\n", buf, probe_offset);
>> +		break;
>> +	case BPF_PERF_INFO_URETPROBE:
>> +		printf("uretprobe filename %s offset %llu\n", buf,
>> +		       probe_offset);
>> +		break;
>> +	}
>> +}
>> +
>> +static int show_proc(const char *fpath, const struct stat *sb,
>> +		     int tflag, struct FTW *ftwbuf)
>> +{
>> +	__u64 probe_offset, probe_addr;
>> +	__u32 prog_id, prog_info;
>> +	int err, pid = 0, fd = 0;
>> +	const char *pch;
>> +	char buf[4096];
>> +
>> +	/* prefix always /proc */
>> +	pch = fpath + 5;
>> +	if (*pch == '\0')
>> +		return 0;
>> +
>> +	/* pid should be all numbers */
>> +	pch++;
>> +	while (*pch >= '0' && *pch <= '9') {
> 
> nit: isdigit()?  strtoul() with its endptr also an option.  That said
>       the code is actually quite readable as is, so I'm not sure if it's
>       worth complicating it.
> 
>> +		pid = pid * 10 + *pch - '0';
>> +		pch++;
>> +	}
>> +	if (*pch == '\0')
>> +		return 0;
>> +	if (*pch != '/')
>> +		return FTW_SKIP_SUBTREE;
>> +
>> +	/* check /proc/<pid>/fd directory */
>> +	pch++;
>> +	if (*pch == '\0' || *pch != 'f')
>> +		return FTW_SKIP_SUBTREE;
> 
> but == '\0' implies != 'f'
> 
>> +	pch++;
>> +	if (*pch == '\0' || *pch != 'd')
>> +		return FTW_SKIP_SUBTREE;
> 
> nit: possibly just:
>       if (strncmp(pch, "fd", 2))
>            return FTW_SKIP_SUBTREE;
>       pch += 2;
> 
>> +	pch++;
>> +	if (*pch == '\0')
>> +		return 0;
>> +	if (*pch != '/')
>> +		return FTW_SKIP_SUBTREE;
>> +
>> +	/* check /proc/<pid>/fd/<fd_num> */
>> +	pch++;
>> +	while (*pch >= '0' && *pch <= '9') {
>> +		fd = fd * 10 + *pch - '0';
>> +		pch++;
>> +	}
>> +	if (*pch != '\0')
>> +		return FTW_SKIP_SUBTREE;
>> +
>> +	/* query (pid, fd) for potential perf events */
>> +	err = bpf_trace_event_query(pid, fd, buf, sizeof(buf),
>> +		&prog_id, &prog_info, &probe_offset, &probe_addr);
> 
> nit: continuation line not aligned with opening bracket
> 
>> +	if (err < 0)
>> +		return 0;
>> +
>> +	if (json_output)
>> +		print_perf_json(pid, prog_id, prog_info, buf, probe_offset,
>> +				probe_addr);
>> +	else
>> +		print_perf_plain(pid, prog_id, prog_info, buf, probe_offset,
>> +				 probe_addr);
>> +
>> +	return 0;
>> +}
>> +
>> +static int do_show(int argc, char **argv)
>> +{
>> +	int nopenfd = 16;
>> +	int flags = FTW_ACTIONRETVAL | FTW_PHYS;
> 
> nit: reverse christmas tree networking style if you don't mind
> 
>> +	if (nftw("/proc", show_proc, nopenfd, flags) == -1) {
>> +		perror("nftw");
> 
> nit: p_err("%s", strerror(errno)); would also show up in JSON output
> 
>> +		return -1;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int do_help(int argc, char **argv)
>> +{
>> +	fprintf(stderr,
>> +		"Usage: %s %s { show | help }\n"
>> +		"",
>> +		bin_name, argv[-2]);
>> +
>> +	return 0;
>> +}
>> +
>> +static const struct cmd cmds[] = {
>> +	{ "show",	do_show },
> 
> Other commands alias show and list, so could you add:
> 
> 	{ "list",	do_show },
> 
> and list to help output?
> 
>> +	{ "help",	do_help },
>> +	{ 0 }
>> +};
>> +
>> +int do_perf(int argc, char **argv)
>> +{
>> +	return cmd_select(cmds, argc, argv, do_help);
>> +}
> 
> Thanks a lot for adding bpftool support, and with JSON output included!
> 

^ permalink raw reply

* Re: [RFC v3 4/5] virtio_ring: add event idx support in packed ring
From: Tiwei Bie @ 2018-05-16  5:55 UTC (permalink / raw)
  To: Jason Wang; +Cc: mst, virtualization, linux-kernel, netdev, wexu, jfreimann
In-Reply-To: <915efeaa-4e47-9004-a722-54b0c40ebcbb@redhat.com>

On Wed, May 16, 2018 at 01:01:04PM +0800, Jason Wang wrote:
> On 2018年04月25日 13:15, Tiwei Bie wrote:
[...]
> > @@ -1143,10 +1160,17 @@ static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
> >   	/* We optimistically turn back on interrupts, then check if there was
> >   	 * more to do. */
> > +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > +	 * either clear the flags bit or point the event index at the next
> > +	 * entry. Always update the event index to keep code simple. */
> > +
> > +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> > +			vq->last_used_idx | (vq->wrap_counter << 15));
> 
> 
> Using vq->wrap_counter seems not correct, what we need is the warp counter
> for the last_used_idx not next_avail_idx.

Yes, you're right. I have fixed it in my local repo,
but haven't sent out a new version yet.

I'll try to send out a new RFC today.

> 
> And I think there's even no need to bother with event idx here, how about
> just set VRING_EVENT_F_ENABLE?

We had a similar discussion before. Michael prefers
to use VRING_EVENT_F_DESC when possible to avoid
extra interrupts if host is fast:

https://lkml.org/lkml/2018/4/16/1085
"""
I suspect this will lead to extra interrupts if host is fast.
So I think for now we should always use VRING_EVENT_F_DESC
if EVENT_IDX is negotiated.
"""

> 
> >   	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
> >   		virtio_wmb(vq->weak_barriers);
> > -		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> > +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> > +						     VRING_EVENT_F_ENABLE;
> >   		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
> >   							vq->event_flags_shadow);
> >   	}
> > @@ -1172,15 +1196,34 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
> >   static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
> >   {
> >   	struct vring_virtqueue *vq = to_vvq(_vq);
> > +	u16 bufs, used_idx, wrap_counter;
> >   	START_USE(vq);
> >   	/* We optimistically turn back on interrupts, then check if there was
> >   	 * more to do. */
> > +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> > +	 * either clear the flags bit or point the event index at the next
> > +	 * entry. Always update the event index to keep code simple. */
> > +
> > +	/* TODO: tune this threshold */
> > +	bufs = (u16)(vq->next_avail_idx - vq->last_used_idx) * 3 / 4;
> 
> bufs could be more than vq->num here, is this intended?

Yes, you're right. Like the above one -- I have fixed
it in my local repo, but haven't sent out a new version
yet. Thanks for spotting this!

> 
> > +
> > +	used_idx = vq->last_used_idx + bufs;
> > +	wrap_counter = vq->wrap_counter;
> > +
> > +	if (used_idx >= vq->vring_packed.num) {
> > +		used_idx -= vq->vring_packed.num;
> > +		wrap_counter ^= 1;
> 
> When used_idx is greater or equal vq->num, there's no need to flip
> warp_counter bit since it should match next_avail_idx.
> 
> And we need also care about the case when next_avail wraps but used_idx not.
> so we probaly need:
> 
> else if (vq->next_avail_idx < used_idx) {
>     wrap_counter ^= 1;
> }
> 
> I think maybe it's time to add some sample codes in the spec to avoid
> duplicating the efforts(bugs).

+1

Best regards,
Tiwei Bie

^ permalink raw reply

* Re: [PATCH 1/2] Convert target drivers to use sbitmap
From: Felipe Balbi @ 2018-05-16  5:54 UTC (permalink / raw)
  To: Matthew Wilcox, linux-kernel, linux-scsi, target-devel,
	linux1394-devel, linux-usb, kvm, virtualization, netdev,
	Juergen Gross, qla2xxx-upstream, Kent Overstreet, Jens Axboe
  Cc: Matthew Wilcox
In-Reply-To: <20180515160043.27044-2-willy@infradead.org>

[-- Attachment #1: Type: text/plain, Size: 564 bytes --]


Hi,

Matthew Wilcox <willy@infradead.org> writes:
> From: Matthew Wilcox <mawilcox@microsoft.com>
>
> The sbitmap and the percpu_ida perform essentially the same task,
> allocating tags for commands.  Since the sbitmap is more used than
> the percpu_ida, convert the percpu_ida users to the sbitmap API.
>
> Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
> ---

[...]

>  drivers/usb/gadget/function/f_tcm.c      |  8 +++---

for drivers/usb/gadget/function/f_tcm.c

Acked-by: Felipe Balbi <felipe.balbi@linux.intel.com>

-- 
balbi

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 832 bytes --]

^ permalink raw reply

* Re: [RFC v3 4/5] virtio_ring: add event idx support in packed ring
From: Jason Wang @ 2018-05-16  5:01 UTC (permalink / raw)
  To: Tiwei Bie, mst, virtualization, linux-kernel, netdev; +Cc: wexu, jfreimann
In-Reply-To: <20180425051550.24342-5-tiwei.bie@intel.com>



On 2018年04月25日 13:15, Tiwei Bie wrote:
> This commit introduces the event idx support in packed
> ring. This feature is temporarily disabled, because the
> implementation in this patch may not work as expected,
> and some further discussions on the implementation are
> needed, e.g. do we have to check the wrap counter when
> checking whether a kick is needed?
>
> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> ---
>   drivers/virtio/virtio_ring.c | 53 ++++++++++++++++++++++++++++++++++++++++----
>   1 file changed, 49 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 0181e93897be..b1039c2985b9 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -986,7 +986,7 @@ static inline int virtqueue_add_packed(struct virtqueue *_vq,
>   static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> -	u16 flags;
> +	u16 new, old, off_wrap, flags;
>   	bool needs_kick;
>   	u32 snapshot;
>   
> @@ -995,7 +995,12 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>   	 * suppressions. */
>   	virtio_mb(vq->weak_barriers);
>   
> +	old = vq->next_avail_idx - vq->num_added;
> +	new = vq->next_avail_idx;
> +	vq->num_added = 0;
> +
>   	snapshot = *(u32 *)vq->vring_packed.device;
> +	off_wrap = virtio16_to_cpu(_vq->vdev, snapshot & 0xffff);
>   	flags = cpu_to_virtio16(_vq->vdev, snapshot >> 16) & 0x3;
>   
>   #ifdef DEBUG
> @@ -1006,7 +1011,10 @@ static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq)
>   	vq->last_add_time_valid = false;
>   #endif
>   
> -	needs_kick = (flags != VRING_EVENT_F_DISABLE);
> +	if (flags == VRING_EVENT_F_DESC)
> +		needs_kick = vring_need_event(off_wrap & ~(1<<15), new, old);
> +	else
> +		needs_kick = (flags != VRING_EVENT_F_DISABLE);
>   	END_USE(vq);
>   	return needs_kick;
>   }
> @@ -1116,6 +1124,15 @@ static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq,
>   	if (vq->last_used_idx >= vq->vring_packed.num)
>   		vq->last_used_idx -= vq->vring_packed.num;
>   
> +	/* If we expect an interrupt for the next entry, tell host
> +	 * by writing event index and flush out the write before
> +	 * the read in the next get_buf call. */
> +	if (vq->event_flags_shadow == VRING_EVENT_F_DESC)
> +		virtio_store_mb(vq->weak_barriers,
> +				&vq->vring_packed.driver->off_wrap,
> +				cpu_to_virtio16(_vq->vdev, vq->last_used_idx |
> +						(vq->wrap_counter << 15)));
> +
>   #ifdef DEBUG
>   	vq->last_add_time_valid = false;
>   #endif
> @@ -1143,10 +1160,17 @@ static unsigned virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq)
>   
>   	/* We optimistically turn back on interrupts, then check if there was
>   	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always update the event index to keep code simple. */
> +
> +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> +			vq->last_used_idx | (vq->wrap_counter << 15));


Using vq->wrap_counter seems not correct, what we need is the warp 
counter for the last_used_idx not next_avail_idx.

And I think there's even no need to bother with event idx here, how 
about just set VRING_EVENT_F_ENABLE?

>   
>   	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
>   		virtio_wmb(vq->weak_barriers);
> -		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> +						     VRING_EVENT_F_ENABLE;
>   		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
>   							vq->event_flags_shadow);
>   	}
> @@ -1172,15 +1196,34 @@ static bool virtqueue_poll_packed(struct virtqueue *_vq, unsigned last_used_idx)
>   static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq)
>   {
>   	struct vring_virtqueue *vq = to_vvq(_vq);
> +	u16 bufs, used_idx, wrap_counter;
>   
>   	START_USE(vq);
>   
>   	/* We optimistically turn back on interrupts, then check if there was
>   	 * more to do. */
> +	/* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
> +	 * either clear the flags bit or point the event index at the next
> +	 * entry. Always update the event index to keep code simple. */
> +
> +	/* TODO: tune this threshold */
> +	bufs = (u16)(vq->next_avail_idx - vq->last_used_idx) * 3 / 4;

bufs could be more than vq->num here, is this intended?

> +
> +	used_idx = vq->last_used_idx + bufs;
> +	wrap_counter = vq->wrap_counter;
> +
> +	if (used_idx >= vq->vring_packed.num) {
> +		used_idx -= vq->vring_packed.num;
> +		wrap_counter ^= 1;

When used_idx is greater or equal vq->num, there's no need to flip 
warp_counter bit since it should match next_avail_idx.

And we need also care about the case when next_avail wraps but used_idx 
not. so we probaly need:

else if (vq->next_avail_idx < used_idx) {
     wrap_counter ^= 1;
}

I think maybe it's time to add some sample codes in the spec to avoid 
duplicating the efforts(bugs).

Thanks
> +	}
> +
> +	vq->vring_packed.driver->off_wrap = cpu_to_virtio16(_vq->vdev,
> +			used_idx | (wrap_counter << 15));
>   
>   	if (vq->event_flags_shadow == VRING_EVENT_F_DISABLE) {
>   		virtio_wmb(vq->weak_barriers);
> -		vq->event_flags_shadow = VRING_EVENT_F_ENABLE;
> +		vq->event_flags_shadow = vq->event ? VRING_EVENT_F_DESC :
> +						     VRING_EVENT_F_ENABLE;
>   		vq->vring_packed.driver->flags = cpu_to_virtio16(_vq->vdev,
>   							vq->event_flags_shadow);
>   	}
> @@ -1822,8 +1865,10 @@ void vring_transport_features(struct virtio_device *vdev)
>   		switch (i) {
>   		case VIRTIO_RING_F_INDIRECT_DESC:
>   			break;
> +#if 0
>   		case VIRTIO_RING_F_EVENT_IDX:
>   			break;
> +#endif
>   		case VIRTIO_F_VERSION_1:
>   			break;
>   		case VIRTIO_F_IOMMU_PLATFORM:

^ permalink raw reply

* Re: [PATCH bpf-next 7/7] tools/bpftool: add perf subcommand
From: Jakub Kicinski @ 2018-05-16  4:41 UTC (permalink / raw)
  To: Yonghong Song; +Cc: peterz, ast, daniel, netdev, kernel-team, Quentin Monnet
In-Reply-To: <20180515234521.856763-8-yhs@fb.com>

On Tue, 15 May 2018 16:45:21 -0700, Yonghong Song wrote:
> The new command "bpftool perf [show]" will traverse
> all processes under /proc, and if any fd is associated
> with a perf event, it will print out related perf event
> information.
> 
> Below is an example to show the results using bcc commands.
> Running the following 4 bcc commands:
>   kprobe:     trace.py '__x64_sys_nanosleep'
>   kretprobe:  trace.py 'r::__x64_sys_nanosleep'
>   tracepoint: trace.py 't:syscalls:sys_enter_nanosleep'
>   uprobe:     trace.py 'p:/home/yhs/a.out:main'
> 
> The bpftool command line and result:
> 
>   $ bpftool perf
>   21711: prog_id 5 kprobe func __x64_sys_write offset 0
>   21765: prog_id 7 kretprobe func __x64_sys_nanosleep offset 0
>   21767: prog_id 8 tracepoint sys_enter_nanosleep
>   21800: prog_id 9 uprobe filename /home/yhs/a.out offset 1159
> 
>   $ bpftool -j perf
>   {"pid":21711,"prog_id":5,"prog_info":"kprobe","func":"__x64_sys_write","offset":0}, \
>   {"pid":21765,"prog_id":7,"prog_info":"kretprobe","func":"__x64_sys_nanosleep","offset":0}, \
>   {"pid":21767,"prog_id":8,"prog_info":"tracepoint","tracepoint":"sys_enter_nanosleep"}, \
>   {"pid":21800,"prog_id":9,"prog_info":"uprobe","filename":"/home/yhs/a.out","offset":1159}

You need to wrap the objects inside an array, so

	if (json_output)
		jsonw_start_array(json_wtr);
	nftw();
	if (json_output)
		jsonw_end_array(json_wtr);

otherwise output will not be a valid JSON.  To validate JSON try:

$ bpftool -j perf | python -m json.tool

>   $ bpftool prog
>   5: kprobe  name probe___x64_sys  tag e495a0c82f2c7a8d  gpl
> 	  loaded_at 2018-05-15T04:46:37-0700  uid 0
> 	  xlated 200B  not jited  memlock 4096B  map_ids 4
>   7: kprobe  name probe___x64_sys  tag f2fdee479a503abf  gpl
> 	  loaded_at 2018-05-15T04:48:32-0700  uid 0
> 	  xlated 200B  not jited  memlock 4096B  map_ids 7
>   8: tracepoint  name tracepoint__sys  tag 5390badef2395fcf  gpl
> 	  loaded_at 2018-05-15T04:48:48-0700  uid 0
> 	  xlated 200B  not jited  memlock 4096B  map_ids 8
>   9: kprobe  name probe_main_1  tag 0a87bdc2e2953b6d  gpl
> 	  loaded_at 2018-05-15T04:49:52-0700  uid 0
> 	  xlated 200B  not jited  memlock 4096B  map_ids 9
> 
>   $ ps ax | grep "python ./trace.py"
>   21711 pts/0    T      0:03 python ./trace.py __x64_sys_write
>   21765 pts/0    S+     0:00 python ./trace.py r::__x64_sys_nanosleep
>   21767 pts/2    S+     0:00 python ./trace.py t:syscalls:sys_enter_nanosleep
>   21800 pts/3    S+     0:00 python ./trace.py p:/home/yhs/a.out:main
>   22374 pts/1    S+     0:00 grep --color=auto python ./trace.py
> 
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  tools/bpf/bpftool/main.c |   3 +-
>  tools/bpf/bpftool/main.h |   1 +
>  tools/bpf/bpftool/perf.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++

Would you be able to also extend the Documentation/ and bash
completions?

>  3 files changed, 191 insertions(+), 1 deletion(-)
>  create mode 100644 tools/bpf/bpftool/perf.c
> 
> diff --git a/tools/bpf/bpftool/main.c b/tools/bpf/bpftool/main.c
> index 1ec852d..eea7f14 100644
> --- a/tools/bpf/bpftool/main.c
> +++ b/tools/bpf/bpftool/main.c
> @@ -87,7 +87,7 @@ static int do_help(int argc, char **argv)
>  		"       %s batch file FILE\n"
>  		"       %s version\n"
>  		"\n"
> -		"       OBJECT := { prog | map | cgroup }\n"
> +		"       OBJECT := { prog | map | cgroup | perf }\n"
>  		"       " HELP_SPEC_OPTIONS "\n"
>  		"",
>  		bin_name, bin_name, bin_name);
> @@ -216,6 +216,7 @@ static const struct cmd cmds[] = {
>  	{ "prog",	do_prog },
>  	{ "map",	do_map },
>  	{ "cgroup",	do_cgroup },
> +	{ "perf",	do_perf },
>  	{ "version",	do_version },
>  	{ 0 }
>  };
> diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
> index 6173cd9..63fdb31 100644
> --- a/tools/bpf/bpftool/main.h
> +++ b/tools/bpf/bpftool/main.h
> @@ -119,6 +119,7 @@ int do_prog(int argc, char **arg);
>  int do_map(int argc, char **arg);
>  int do_event_pipe(int argc, char **argv);
>  int do_cgroup(int argc, char **arg);
> +int do_perf(int argc, char **arg);
>  
>  int prog_parse_fd(int *argc, char ***argv);
>  int map_parse_fd_and_info(int *argc, char ***argv, void *info, __u32 *info_len);
> diff --git a/tools/bpf/bpftool/perf.c b/tools/bpf/bpftool/perf.c
> new file mode 100644
> index 0000000..6d676e4
> --- /dev/null
> +++ b/tools/bpf/bpftool/perf.c
> @@ -0,0 +1,188 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +// Copyright (C) 2018 Facebook
> +// Author: Yonghong Song <yhs@fb.com>
> +
> +#define _GNU_SOURCE
> +#include <fcntl.h>
> +#include <stdlib.h>
> +#include <string.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <unistd.h>
> +#include <ftw.h>
> +
> +#include <bpf.h>
> +
> +#include "main.h"
> +
> +static void print_perf_json(int pid, __u32 prog_id, __u32 prog_info,
> +			    char *buf, __u64 probe_offset, __u64 probe_addr)
> +{
> +	jsonw_start_object(json_wtr);
> +	jsonw_int_field(json_wtr, "pid", pid);
> +	jsonw_uint_field(json_wtr, "prog_id", prog_id);
> +	switch (prog_info) {
> +	case BPF_PERF_INFO_TP_NAME:
> +		jsonw_string_field(json_wtr, "prog_info", "tracepoint");
> +		jsonw_string_field(json_wtr, "tracepoint", buf);
> +		break;
> +	case BPF_PERF_INFO_KPROBE:
> +		jsonw_string_field(json_wtr, "prog_info", "kprobe");
> +		if (buf[0] != '\0') {
> +			jsonw_string_field(json_wtr, "func", buf);
> +			jsonw_lluint_field(json_wtr, "offset", probe_offset);
> +		} else {
> +			jsonw_lluint_field(json_wtr, "addr", probe_addr);
> +		}
> +		break;
> +	case BPF_PERF_INFO_KRETPROBE:
> +		jsonw_string_field(json_wtr, "prog_info", "kretprobe");
> +		if (buf[0] != '\0') {
> +			jsonw_string_field(json_wtr, "func", buf);
> +			jsonw_lluint_field(json_wtr, "offset", probe_offset);
> +		} else {
> +			jsonw_lluint_field(json_wtr, "addr", probe_addr);
> +		}
> +		break;
> +	case BPF_PERF_INFO_UPROBE:
> +		jsonw_string_field(json_wtr, "prog_info", "uprobe");
> +		jsonw_string_field(json_wtr, "filename", buf);
> +		jsonw_lluint_field(json_wtr, "offset", probe_offset);
> +		break;
> +	case BPF_PERF_INFO_URETPROBE:
> +		jsonw_string_field(json_wtr, "prog_info", "uretprobe");
> +		jsonw_string_field(json_wtr, "filename", buf);
> +		jsonw_lluint_field(json_wtr, "offset", probe_offset);
> +		break;
> +	}
> +	jsonw_end_object(json_wtr);
> +}
> +
> +static void print_perf_plain(int pid, __u32 prog_id, __u32 prog_info,
> +			    char *buf, __u64 probe_offset, __u64 probe_addr)
> +{
> +	printf("%d: prog_id %u ", pid, prog_id);

nit: for consistency with prog and map listings consider using double
spaces after prog_id (i.e. between fields).  Not a hard requirement,
though, perhaps I'm the only one who finds that more readable :)

> +	switch (prog_info) {
> +	case BPF_PERF_INFO_TP_NAME:
> +		printf("tracepoint %s\n", buf);
> +		break;
> +	case BPF_PERF_INFO_KPROBE:
> +		if (buf[0] != '\0')
> +			printf("kprobe func %s offset %llu\n", buf,
> +			       probe_offset);
> +		else
> +			printf("kprobe addr %llu\n", probe_addr);
> +		break;
> +	case BPF_PERF_INFO_KRETPROBE:
> +		if (buf[0] != '\0')
> +			printf("kretprobe func %s offset %llu\n", buf,
> +			       probe_offset);
> +		else
> +			printf("kretprobe addr %llu\n", probe_addr);
> +		break;
> +	case BPF_PERF_INFO_UPROBE:
> +		printf("uprobe filename %s offset %llu\n", buf, probe_offset);
> +		break;
> +	case BPF_PERF_INFO_URETPROBE:
> +		printf("uretprobe filename %s offset %llu\n", buf,
> +		       probe_offset);
> +		break;
> +	}
> +}
> +
> +static int show_proc(const char *fpath, const struct stat *sb,
> +		     int tflag, struct FTW *ftwbuf)
> +{
> +	__u64 probe_offset, probe_addr;
> +	__u32 prog_id, prog_info;
> +	int err, pid = 0, fd = 0;
> +	const char *pch;
> +	char buf[4096];
> +
> +	/* prefix always /proc */
> +	pch = fpath + 5;
> +	if (*pch == '\0')
> +		return 0;
> +
> +	/* pid should be all numbers */
> +	pch++;
> +	while (*pch >= '0' && *pch <= '9') {

nit: isdigit()?  strtoul() with its endptr also an option.  That said
     the code is actually quite readable as is, so I'm not sure if it's
     worth complicating it.

> +		pid = pid * 10 + *pch - '0';
> +		pch++;
> +	}
> +	if (*pch == '\0')
> +		return 0;
> +	if (*pch != '/')
> +		return FTW_SKIP_SUBTREE;
> +
> +	/* check /proc/<pid>/fd directory */
> +	pch++;
> +	if (*pch == '\0' || *pch != 'f')
> +		return FTW_SKIP_SUBTREE;

but == '\0' implies != 'f'

> +	pch++;
> +	if (*pch == '\0' || *pch != 'd')
> +		return FTW_SKIP_SUBTREE;

nit: possibly just:
     if (strncmp(pch, "fd", 2))
          return FTW_SKIP_SUBTREE;
     pch += 2;

> +	pch++;
> +	if (*pch == '\0')
> +		return 0;
> +	if (*pch != '/')
> +		return FTW_SKIP_SUBTREE;
> +
> +	/* check /proc/<pid>/fd/<fd_num> */
> +	pch++;
> +	while (*pch >= '0' && *pch <= '9') {
> +		fd = fd * 10 + *pch - '0';
> +		pch++;
> +	}
> +	if (*pch != '\0')
> +		return FTW_SKIP_SUBTREE;
> +
> +	/* query (pid, fd) for potential perf events */
> +	err = bpf_trace_event_query(pid, fd, buf, sizeof(buf),
> +		&prog_id, &prog_info, &probe_offset, &probe_addr);

nit: continuation line not aligned with opening bracket

> +	if (err < 0)
> +		return 0;
> +
> +	if (json_output)
> +		print_perf_json(pid, prog_id, prog_info, buf, probe_offset,
> +				probe_addr);
> +	else
> +		print_perf_plain(pid, prog_id, prog_info, buf, probe_offset,
> +				 probe_addr);
> +
> +	return 0;
> +}
> +
> +static int do_show(int argc, char **argv)
> +{
> +	int nopenfd = 16;
> +	int flags = FTW_ACTIONRETVAL | FTW_PHYS;

nit: reverse christmas tree networking style if you don't mind

> +	if (nftw("/proc", show_proc, nopenfd, flags) == -1) {
> +		perror("nftw");

nit: p_err("%s", strerror(errno)); would also show up in JSON output

> +		return -1;
> +	}
> +
> +	return 0;
> +}
> +
> +static int do_help(int argc, char **argv)
> +{
> +	fprintf(stderr,
> +		"Usage: %s %s { show | help }\n"
> +		"",
> +		bin_name, argv[-2]);
> +
> +	return 0;
> +}
> +
> +static const struct cmd cmds[] = {
> +	{ "show",	do_show },

Other commands alias show and list, so could you add:

	{ "list",	do_show },

and list to help output?

> +	{ "help",	do_help },
> +	{ 0 }
> +};
> +
> +int do_perf(int argc, char **argv)
> +{
> +	return cmd_select(cmds, argc, argv, do_help);
> +}

Thanks a lot for adding bpftool support, and with JSON output included!

^ permalink raw reply

* [PATCH net-next v3 3/3] selftests: net: initial fib rule tests
From: Roopa Prabhu @ 2018-05-16  3:55 UTC (permalink / raw)
  To: davem; +Cc: netdev, dsa, nikolay, idosch
In-Reply-To: <1526442908-13183-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This adds a first set of tests for fib rule match/action for
ipv4 and ipv6. Initial tests only cover action lookup table.
can be extended to cover other actions in the future.
Uses ip route get to validate the rule lookup.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 tools/testing/selftests/net/Makefile          |   2 +-
 tools/testing/selftests/net/fib_rule_tests.sh | 248 ++++++++++++++++++++++++++
 2 files changed, 249 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/net/fib_rule_tests.sh

diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index e60dddb..7cb0f49 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -6,7 +6,7 @@ CFLAGS += -I../../../../usr/include/
 
 TEST_PROGS := run_netsocktests run_afpackettests test_bpf.sh netdevice.sh rtnetlink.sh
 TEST_PROGS += fib_tests.sh fib-onlink-tests.sh pmtu.sh udpgso.sh
-TEST_PROGS += udpgso_bench.sh
+TEST_PROGS += udpgso_bench.sh fib_rule_tests.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy
diff --git a/tools/testing/selftests/net/fib_rule_tests.sh b/tools/testing/selftests/net/fib_rule_tests.sh
new file mode 100644
index 0000000..d4cfb6a
--- /dev/null
+++ b/tools/testing/selftests/net/fib_rule_tests.sh
@@ -0,0 +1,248 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# This test is for checking IPv4 and IPv6 FIB rules API
+
+ret=0
+
+PAUSE_ON_FAIL=${PAUSE_ON_FAIL:=no}
+IP="ip -netns testns"
+
+RTABLE=100
+GW_IP4=192.51.100.2
+SRC_IP=192.51.100.3
+GW_IP6=2001:db8:1::2
+SRC_IP6=2001:db8:1::3
+
+DEV_ADDR=192.51.100.1
+DEV=dummy0
+
+log_test()
+{
+	local rc=$1
+	local expected=$2
+	local msg="$3"
+
+	if [ ${rc} -eq ${expected} ]; then
+		nsuccess=$((nsuccess+1))
+		printf "\n    TEST: %-50s  [ OK ]\n" "${msg}"
+	else
+		nfail=$((nfail+1))
+		printf "\n    TEST: %-50s  [FAIL]\n" "${msg}"
+		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+			echo
+			echo "hit enter to continue, 'q' to quit"
+			read a
+			[ "$a" = "q" ] && exit 1
+		fi
+	fi
+}
+
+log_section()
+{
+	echo
+	echo "######################################################################"
+	echo "TEST SECTION: $*"
+	echo "######################################################################"
+}
+
+setup()
+{
+	set -e
+	ip netns add testns
+	$IP link set dev lo up
+
+	$IP link add dummy0 type dummy
+	$IP link set dev dummy0 up
+	$IP address add 198.51.100.1/24 dev dummy0
+	$IP -6 address add 2001:db8:1::1/64 dev dummy0
+
+	set +e
+}
+
+cleanup()
+{
+	$IP link del dev dummy0 &> /dev/null
+	ip netns del testns
+}
+
+fib_check_iproute_support()
+{
+	ip rule help 2>&1 | grep -q $1
+	if [ $? -ne 0 ]; then
+		echo "SKIP: iproute2 iprule too old, missing $1 match"
+		return 1
+	fi
+
+	ip route get help 2>&1 | grep -q $2
+	if [ $? -ne 0 ]; then
+		echo "SKIP: iproute2 get route too old, missing $2 match"
+		return 1
+	fi
+
+	return 0
+}
+
+fib_rule6_del()
+{
+	$IP -6 rule del $1
+	log_test $? 0 "rule6 del $1"
+}
+
+fib_rule6_del_by_pref()
+{
+	pref=$($IP -6 rule show | grep "$1 lookup $TABLE" | cut -d ":" -f 1)
+	$IP -6 rule del pref $pref
+}
+
+fib_rule6_test_match_n_redirect()
+{
+	local match="$1"
+	local getmatch="$2"
+
+	$IP -6 rule add $match table $RTABLE
+	$IP -6 route get $GW_IP6 $getmatch | grep -q "table $RTABLE"
+	log_test $? 0 "rule6 check: $1"
+
+	fib_rule6_del_by_pref "$match"
+	log_test $? 0 "rule6 del by pref: $match"
+}
+
+fib_rule6_test()
+{
+	# setup the fib rule redirect route
+	$IP -6 route add table $RTABLE default via $GW_IP6 dev $DEV onlink
+
+	match="oif $DEV"
+	fib_rule6_test_match_n_redirect "$match" "$match" "oif redirect to table"
+
+	match="from $SRC_IP6 iif $DEV"
+	fib_rule6_test_match_n_redirect "$match" "$match" "iif redirect to table"
+
+	match="tos 0x10"
+	fib_rule6_test_match_n_redirect "$match" "$match" "tos redirect to table"
+
+	match="fwmark 0x64"
+	getmatch="mark 0x64"
+	fib_rule6_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+
+	fib_check_iproute_support "uidrange" "uid"
+	if [ $? -eq 0 ]; then
+		match="uidrange 100-100"
+		getmatch="uid 100"
+		fib_rule6_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+	fi
+
+	fib_check_iproute_support "sport" "sport"
+	if [ $? -eq 0 ]; then
+		match="sport 666 dport 777"
+		fib_rule6_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto tcp"
+		fib_rule6_test_match_n_redirect "$match" "$match" "ipproto match"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto icmp"
+		fib_rule6_test_match_n_redirect "$match" "$match" "ipproto icmp match"
+	fi
+}
+
+fib_rule4_del()
+{
+	$IP rule del $1
+	log_test $? 0 "del $1"
+}
+
+fib_rule4_del_by_pref()
+{
+	pref=$($IP rule show | grep "$1 lookup $TABLE" | cut -d ":" -f 1)
+	$IP rule del pref $pref
+}
+
+fib_rule4_test_match_n_redirect()
+{
+	local match="$1"
+	local getmatch="$2"
+
+	$IP rule add $match table $RTABLE
+	$IP route get $GW_IP4 $getmatch | grep -q "table $RTABLE"
+	log_test $? 0 "rule4 check: $1"
+
+	fib_rule4_del_by_pref "$match"
+	log_test $? 0 "rule4 del by pref: $match"
+}
+
+fib_rule4_test()
+{
+	# setup the fib rule redirect route
+	$IP route add table $RTABLE default via $GW_IP4 dev $DEV onlink
+
+	match="oif $DEV"
+	fib_rule4_test_match_n_redirect "$match" "$match" "oif redirect to table"
+
+	match="from $SRC_IP iif $DEV"
+	fib_rule4_test_match_n_redirect "$match" "$match" "iif redirect to table"
+
+	match="tos 0x10"
+	fib_rule4_test_match_n_redirect "$match" "$match" "tos redirect to table"
+
+	match="fwmark 0x64"
+	getmatch="mark 0x64"
+	fib_rule4_test_match_n_redirect "$match" "$getmatch" "fwmark redirect to table"
+
+	fib_check_iproute_support "uidrange" "uid"
+	if [ $? -eq 0 ]; then
+		match="uidrange 100-100"
+		getmatch="uid 100"
+		fib_rule4_test_match_n_redirect "$match" "$getmatch" "uid redirect to table"
+	fi
+
+	fib_check_iproute_support "sport" "sport"
+	if [ $? -eq 0 ]; then
+		match="sport 666 dport 777"
+		fib_rule4_test_match_n_redirect "$match" "$match" "sport and dport redirect to table"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto tcp"
+		fib_rule4_test_match_n_redirect "$match" "$match" "ipproto tcp match"
+	fi
+
+	fib_check_iproute_support "ipproto" "ipproto"
+	if [ $? -eq 0 ]; then
+		match="ipproto icmp"
+		fib_rule4_test_match_n_redirect "$match" "$match" "ipproto icmp match"
+	fi
+}
+
+run_fibrule_tests()
+{
+	log_section "IPv4 fib rule"
+	fib_rule4_test
+	log_section "IPv6 fib rule"
+	fib_rule6_test
+}
+
+if [ "$(id -u)" -ne 0 ];then
+	echo "SKIP: Need root privileges"
+	exit 0
+fi
+
+if [ ! -x "$(command -v ip)" ]; then
+	echo "SKIP: Could not run test without ip tool"
+	exit 0
+fi
+
+# start clean
+cleanup &> /dev/null
+setup
+run_fibrule_tests
+cleanup
+
+exit $ret
-- 
2.1.4

^ permalink raw reply related

* [PATCH net-next v3 2/3] ipv6: support sport, dport and ip_proto in RTM_GETROUTE
From: Roopa Prabhu @ 2018-05-16  3:55 UTC (permalink / raw)
  To: davem; +Cc: netdev, dsa, nikolay, idosch
In-Reply-To: <1526442908-13183-1-git-send-email-roopa@cumulusnetworks.com>

From: Roopa Prabhu <roopa@cumulusnetworks.com>

This is a followup to fib6 rules sport, dport and ipproto
match support. Only supports tcp, udp and icmp for ipproto.
Used by fib rule self tests.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
---
 net/ipv6/route.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index af04167..31fb5bc 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -63,6 +63,7 @@
 #include <net/lwtunnel.h>
 #include <net/ip_tunnels.h>
 #include <net/l3mdev.h>
+#include <net/ip.h>
 #include <trace/events/fib6.h>
 
 #include <linux/uaccess.h>
@@ -4073,6 +4074,9 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
 	[RTA_UID]		= { .type = NLA_U32 },
 	[RTA_MARK]		= { .type = NLA_U32 },
 	[RTA_TABLE]		= { .type = NLA_U32 },
+	[RTA_IP_PROTO]		= { .type = NLA_U8 },
+	[RTA_SPORT]		= { .type = NLA_U16 },
+	[RTA_DPORT]		= { .type = NLA_U16 },
 };
 
 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -4784,6 +4788,19 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
 	else
 		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
 
+	if (tb[RTA_SPORT])
+		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
+
+	if (tb[RTA_DPORT])
+		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
+
+	if (tb[RTA_IP_PROTO]) {
+		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
+						  &fl6.flowi6_proto, extack);
+		if (err)
+			goto errout;
+	}
+
 	if (iif) {
 		struct net_device *dev;
 		int flags = 0;
-- 
2.1.4

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox