Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 0/2] mlxsw: spectrum_span: Two minor adjustments
From: Ido Schimmel @ 2018-05-11  8:57 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, petrm, mlxsw, Ido Schimmel

Petr says:

This patch set fixes a couple of nits in mlxsw's SPAN implementation:
two counts of inaccurate variable name and one count of unsuitable error
code, fixed, respectively, in patches #1 and #2.

Petr Machata (2):
  mlxsw: spectrum_span: Rename misnamed variable l3edev
  mlxsw: spectrum_span: Use a more fitting error code

 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    | 34 +++++++++++-----------
 1 file changed, 17 insertions(+), 17 deletions(-)

-- 
2.14.3

^ permalink raw reply

* [PATCH net-next 1/2] mlxsw: spectrum_span: Rename misnamed variable l3edev
From: Ido Schimmel @ 2018-05-11  8:57 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, petrm, mlxsw, Ido Schimmel
In-Reply-To: <20180511085731.9256-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

Calling the variable l3edev was relevant when neighbor lookup was the
last stage in the simulated pipeline. Now that mlxsw handles bridges and
vlan devices as well, calling it "L3" is a misnomer.

Thus in mlxsw_sp_span_dmac(), rename to "dev", because that function is
just a service routine where the distinction between tunnel and egress
device isn't necessary.

In mlxsw_sp_span_entry_tunnel_parms_common(), rename to "edev" to
emphasize that the routine traces packet egress.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_span.c    | 32 +++++++++++-----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index 3b77990df599..adf1f789b166 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -137,14 +137,14 @@ struct mlxsw_sp_span_entry_ops mlxsw_sp_span_entry_ops_phys = {
 
 static int mlxsw_sp_span_dmac(struct neigh_table *tbl,
 			      const void *pkey,
-			      struct net_device *l3edev,
+			      struct net_device *dev,
 			      unsigned char dmac[ETH_ALEN])
 {
-	struct neighbour *neigh = neigh_lookup(tbl, pkey, l3edev);
+	struct neighbour *neigh = neigh_lookup(tbl, pkey, dev);
 	int err = 0;
 
 	if (!neigh) {
-		neigh = neigh_create(tbl, pkey, l3edev);
+		neigh = neigh_create(tbl, pkey, dev);
 		if (IS_ERR(neigh))
 			return PTR_ERR(neigh);
 	}
@@ -246,7 +246,7 @@ mlxsw_sp_span_entry_vlan(const struct net_device *vlan_dev,
 }
 
 static __maybe_unused int
-mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
+mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *edev,
 					union mlxsw_sp_l3addr saddr,
 					union mlxsw_sp_l3addr daddr,
 					union mlxsw_sp_l3addr gw,
@@ -260,31 +260,31 @@ mlxsw_sp_span_entry_tunnel_parms_common(struct net_device *l3edev,
 	if (mlxsw_sp_l3addr_is_zero(gw))
 		gw = daddr;
 
-	if (!l3edev || mlxsw_sp_span_dmac(tbl, &gw, l3edev, dmac))
+	if (!edev || mlxsw_sp_span_dmac(tbl, &gw, edev, dmac))
 		goto unoffloadable;
 
-	if (is_vlan_dev(l3edev))
-		l3edev = mlxsw_sp_span_entry_vlan(l3edev, &vid);
+	if (is_vlan_dev(edev))
+		edev = mlxsw_sp_span_entry_vlan(edev, &vid);
 
-	if (netif_is_bridge_master(l3edev)) {
-		l3edev = mlxsw_sp_span_entry_bridge(l3edev, dmac, &vid);
-		if (!l3edev)
+	if (netif_is_bridge_master(edev)) {
+		edev = mlxsw_sp_span_entry_bridge(edev, dmac, &vid);
+		if (!edev)
 			goto unoffloadable;
 	}
 
-	if (is_vlan_dev(l3edev)) {
-		if (vid || !(l3edev->flags & IFF_UP))
+	if (is_vlan_dev(edev)) {
+		if (vid || !(edev->flags & IFF_UP))
 			goto unoffloadable;
-		l3edev = mlxsw_sp_span_entry_vlan(l3edev, &vid);
+		edev = mlxsw_sp_span_entry_vlan(edev, &vid);
 	}
 
-	if (!mlxsw_sp_port_dev_check(l3edev))
+	if (!mlxsw_sp_port_dev_check(edev))
 		goto unoffloadable;
 
-	sparmsp->dest_port = netdev_priv(l3edev);
+	sparmsp->dest_port = netdev_priv(edev);
 	sparmsp->ttl = ttl;
 	memcpy(sparmsp->dmac, dmac, ETH_ALEN);
-	memcpy(sparmsp->smac, l3edev->dev_addr, ETH_ALEN);
+	memcpy(sparmsp->smac, edev->dev_addr, ETH_ALEN);
 	sparmsp->saddr = saddr;
 	sparmsp->daddr = daddr;
 	sparmsp->vid = vid;
-- 
2.14.3

^ permalink raw reply related

* [PATCH net-next 2/2] mlxsw: spectrum_span: Use a more fitting error code
From: Ido Schimmel @ 2018-05-11  8:57 UTC (permalink / raw)
  To: netdev; +Cc: davem, jiri, petrm, mlxsw, Ido Schimmel
In-Reply-To: <20180511085731.9256-1-idosch@mellanox.com>

From: Petr Machata <petrm@mellanox.com>

ENOENT is suitable when an item is looked for in a collection and can't
be found. The failure here is actually a depletion of a resource, where
ENOBUFS is the more fitting error code.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
index adf1f789b166..e5f4f7620ab7 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c
@@ -924,7 +924,7 @@ int mlxsw_sp_span_mirror_add(struct mlxsw_sp_port *from,
 
 	span_entry = mlxsw_sp_span_entry_get(mlxsw_sp, to_dev, ops, sparms);
 	if (!span_entry)
-		return -ENOENT;
+		return -ENOBUFS;
 
 	netdev_dbg(from->dev, "Adding inspected port to SPAN entry %d\n",
 		   span_entry->id);
-- 
2.14.3

^ permalink raw reply related

* Re: [PATCH] [PATCH net v2] ipv6: remove min MTU check for ipsec tunnels
From: Sergei Shtylyov @ 2018-05-11  9:02 UTC (permalink / raw)
  To: Ashwanth Goli, netdev, davem; +Cc: pabeni, dsahern, eric.dumazet
In-Reply-To: <1525972764-30904-1-git-send-email-ashwanth@codeaurora.org>

Hello!

On 5/10/2018 8:19 PM, Ashwanth Goli wrote:

> With 749439bfac "fix udpv6 sendmsg crash caused by too small MTU"

    When you cite a comnmit, you must specify at least 12-digit SHA1 and 
enclose the summary in (""), not just "".

> ipsec tunnels that report a MTU less than IPV6_MIN_MTU are broken
> even for packets that are smaller than IPV6_MIN_MTU.
> 
> According to rfc2473#section-7.1
> 
>      if the original IPv6 packet is equal or  smaller  than  the
>      IPv6 minimum link MTU, the tunnel entry-point node
>      encapsulates the original packet, and subsequently
>      fragments the resulting IPv6 tunnel packet into IPv6
>      fragments that do not exceed the Path MTU to the tunnel
>      exit-point.
> 
> Dropping the MTU check for ipsec tunnel destinations.
> 
> Signed-off-by: Ashwanth Goli <ashwanth@codeaurora.org>
[...]

MBR, Sergei

^ permalink raw reply

* Re: [PATCH] [PATCH net v2] ipv6: remove min MTU check for ipsec tunnels
From: Sergei Shtylyov @ 2018-05-11  9:05 UTC (permalink / raw)
  To: Ashwanth Goli, netdev, davem; +Cc: pabeni, dsahern, eric.dumazet
In-Reply-To: <a7572ee6-17d1-801d-bf9a-0eca7757fb00@cogentembedded.com>

On 5/11/2018 12:02 PM, Sergei Shtylyov wrote:

>> With 749439bfac "fix udpv6 sendmsg crash caused by too small MTU"
> 
>     When you cite a comnmit, you must specify at least 12-digit SHA1 and 
> enclose the summary in (""), not just "".
> 
>> ipsec tunnels that report a MTU less than IPV6_MIN_MTU are broken
>> even for packets that are smaller than IPV6_MIN_MTU.
>>
>> According to rfc2473#section-7.1
>>
>>      if the original IPv6 packet is equal or  smaller  than  the
>>      IPv6 minimum link MTU, the tunnel entry-point node
>>      encapsulates the original packet, and subsequently
>>      fragments the resulting IPv6 tunnel packet into IPv6
>>      fragments that do not exceed the Path MTU to the tunnel
>>      exit-point.
>>
>> Dropping the MTU check for ipsec tunnel destinations.
>>
>> Signed-off-by: Ashwanth Goli <ashwanth@codeaurora.org>
> [...]

    And you need to specify the patch version change log after the --- tear line.

MBR, Sergei

^ permalink raw reply

* Re: [PATCH net-next 01/10] net: stmmac: Let descriptor code set skbuff address
From: Jose Abreu @ 2018-05-11  9:09 UTC (permalink / raw)
  To: David Miller, Jose.Abreu
  Cc: netdev, Joao.Pinto, Vitor.Soares, peppe.cavallaro,
	alexandre.torgue
In-Reply-To: <20180510.150613.1334467102643804220.davem@davemloft.net>

On 10-05-2018 20:06, David Miller wrote:
> From: Jose Abreu <Jose.Abreu@synopsys.com>
> Date: Tue,  8 May 2018 15:45:24 +0100
>
>> Stop using if conditions depending on the GMAC version for setting the
>> the descriptor skbuff address and use instead a helper implemented in
>> the descriptor files.
>>
>> Signed-off-by: Jose Abreu <joabreu@synopsys.com>
> With Spectre mitigations, indirect calls are extremely expensive.  Much
> more expensive than conditional checks.
>
> And since this is the descriptor setup in the fast paths of the driver,

Hmm, no. This is only done in HW setup. The only patch in this
series that can impact performance because of what you mentioned
is 09.

> I advise that you keep these conditionals.

I see your point but I'm trying to make the code more flexible
because we will add support for XGMAC soon. This XGMAC will need
a similar structure to what we have in GMAC.

By removing this conditionals we make stmmac_main totally
agnostic of HW and we will prevent patterns like this in the future:

if (priv->has_xgmac)
    do_stuff()
else if (priv->synopsys_id > GMAC_SOME_VERSION)
    do_other_stuff()
else
    do_other_other_stuff()

Thanks and Best Regards,
Jose Miguel Abreu

^ permalink raw reply

* Re: net: hang in unregister_netdevice: waiting for lo to become free
From: Dmitry Vyukov @ 2018-05-11  9:19 UTC (permalink / raw)
  To: Dan Streetman
  Cc: Tommi Rantala, Neil Horman, Xin Long, David Ahern,
	Daniel Borkmann, Cong Wang, David Miller, Eric Dumazet,
	Willem de Bruijn, Jakub Kicinski, Rasmus Villemoes, netdev, LKML,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, syzkaller, Dan Streetman,
	Eric W. Biederman, Alexey Kodanev
In-Reply-To: <CALZtONBohVvUKcEa1K78KpcebP4JYEHNS7JFNpdtYuC+1ZJKiw@mail.gmail.com>

On Thu, May 10, 2018 at 12:23 PM, Dan Streetman <ddstreet@ieee.org> wrote:
>>>>>>> <tommi.t.rantala@nokia.com> wrote:
>>>>>>>> On 20.02.2018 18:26, Neil Horman wrote:
>>>>>>>>>
>>>>>>>>> On Tue, Feb 20, 2018 at 09:14:41AM +0100, Dmitry Vyukov wrote:
>>>>>>>>>>
>>>>>>>>>> On Tue, Feb 20, 2018 at 8:56 AM, Tommi Rantala
>>>>>>>>>> <tommi.t.rantala@nokia.com> wrote:
>>>>>>>>>>>
>>>>>>>>>>> On 19.02.2018 20:59, Dmitry Vyukov wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> Is this meant to be fixed already? I am still seeing this on the
>>>>>>>>>>>> latest upstream tree.
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> These two commits are in v4.16-rc1:
>>>>>>>>>>>
>>>>>>>>>>> commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8
>>>>>>>>>>> Author: Tommi Rantala <tommi.t.rantala@nokia.com>
>>>>>>>>>>> Date:   Mon Feb 5 21:48:14 2018 +0200
>>>>>>>>>>>
>>>>>>>>>>>      sctp: fix dst refcnt leak in sctp_v4_get_dst
>>>>>>>>>>> ...
>>>>>>>>>>>      Fixes: 410f03831 ("sctp: add routing output fallback")
>>>>>>>>>>>      Fixes: 0ca50d12f ("sctp: fix src address selection if using
>>>>>>>>>>> secondary
>>>>>>>>>>> addresses")
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2
>>>>>>>>>>> Author: Alexey Kodanev <alexey.kodanev@oracle.com>
>>>>>>>>>>> Date:   Mon Feb 5 15:10:35 2018 +0300
>>>>>>>>>>>
>>>>>>>>>>>      sctp: fix dst refcnt leak in sctp_v6_get_dst()
>>>>>>>>>>> ...
>>>>>>>>>>>      Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using
>>>>>>>>>>> secondary
>>>>>>>>>>> addresses for ipv6")
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> I guess we missed something if it's still reproducible.
>>>>>>>>>>>
>>>>>>>>>>> I can check it later this week, unless someone else beat me to it.
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Hi Tommi,
>>>>>>>>>>
>>>>>>>>>> Hmmm, I can't claim that it's exactly the same bug. Perhaps it's
>>>>>>>>>> another one then. But I am still seeing these:
>>>>>>>>>>
>>>>>>>>>> [   58.799130] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>> Usage count = 4
>>>>>>>>>> [   60.847138] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>> Usage count = 4
>>>>>>>>>> [   62.895093] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>> Usage count = 4
>>>>>>>>>> [   64.943103] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>> Usage count = 4
>>>>>>>>>>
>>>>>>>>>> on upstream tree pulled ~12 hours ago.
>>>>>>>>>>
>>>>>>>>> Can you write a systemtap script to probe dev_hold, and dev_put, printing
>>>>>>>>> out a
>>>>>>>>> backtrace if the device name matches "lo".  That should tell us
>>>>>>>>> definitively if
>>>>>>>>> the problem is in the same location or not
>>>>>>>>
>>>>>>>>
>>>>>>>> Hi Dmitry, I tested with the reproducer and the kernel .config file that you
>>>>>>>> sent in the first email in this thread:
>>>>>>>>
>>>>>>>> With 4.16-rc2 unable to reproduce.
>>>>>>>>
>>>>>>>> With 4.15-rc9 bug reproducible, and I get "unregister_netdevice: waiting for
>>>>>>>> lo to become free. Usage count = 3"
>>>>>>>>
>>>>>>>> With 4.15-rc9 and Alexey's "sctp: fix dst refcnt leak in sctp_v6_get_dst()"
>>>>>>>> cherry-picked on top, unable to reproduce.
>>>>>>>>
>>>>>>>>
>>>>>>>> Is syzkaller doing something else now to trigger the bug...?
>>>>>>>> Can you still trigger the bug with the same reproducer?
>>>>>>>
>>>>>>> Hi Neil, Tommi,
>>>>>>>
>>>>>>> Reviving this old thread about "unregister_netdevice: waiting for lo
>>>>>>> to become free. Usage count = 3" hangs.
>>>>>>> I still did not have time to deep dive into what happens there (too
>>>>>>> many bugs coming from syzbot). But this still actively happens and I
>>>>>>> suspect accounts to a significant portion of various hang reports,
>>>>>>> which are quite unpleasant.
>>>>>>>
>>>>>>> One idea that could make it all simpler:
>>>>>>>
>>>>>>> Is this wait loop in netdev_wait_allrefs() supposed to wait for any
>>>>>>> prolonged periods of time under any non-buggy conditions? E.g. more
>>>>>>> than 1-2 minutes?
>>>>>>> If it only supposed to wait briefly for things that already supposed
>>>>>>> to be shutting down, and we add a WARNING there after some timeout,
>>>>>>> then syzbot will report all info how/when it happens, hopefully
>>>>>>> extracting reproducers, and all the nice things.
>>>>>>> But this WARNING should not have any false positives under any
>>>>>>> realistic conditions (e.g. waiting for arrival of remote packets with
>>>>>>> large timeouts).
>>>>>>>
>>>>>>> Looking at some task hung reports, it seems that this code holds some
>>>>>>> mutexes, takes workqueue thread and prevents any progress with
>>>>>>> destruction of other devices (and net namespace creation/destruction),
>>>>>>> so I guess it should not wait for any indefinite periods of time?
>>>>>>
>>>>>> I'm working on this currently:
>>>>>> https://bugs.launchpad.net/ubuntu/zesty/+source/linux/+bug/1711407
>>>>>>
>>>>>> I added a summary of what I've found to be the cause (or at least, one
>>>>>> possible cause) of this:
>>>>>> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407/comments/72
>>>>>>
>>>>>> I'm working on a patch to work around the main side-effect of this,
>>>>>> which is hanging while holding the global net mutex.  Hangs will still
>>>>>> happen (e.g. if a dst leaks) but should not affect anything else,
>>>>>> other than a leak of the dst and its net namespace.
>>>>>>
>>>>>> Fixing the dst leaks is important too, of course, but a dst leak (or
>>>>>> other cause) shouldn't break the entire system.
>>>>>
>>>>> Leaking some memory is definitely better than hanging the system.
>>>>>
>>>>> So I've made syzkaller to recognize "unregister_netdevice: waiting for
>>>>> (.*) to become free" as a kernel bug:
>>>>> https://github.com/google/syzkaller/commit/7a67784ca8bdc3b26cce2f0ec9a40d2dd9ec9396
>>>>> Unfortunately it does not make it catch these bugs because creating a
>>>>> net namespace per test is too damn slow, so namespaces are reused for
>>>>> lots of tests and when/if it's eventually destroyed it's already too
>>>>> late to find root cause.
>>>>>
>>>>> But I've run a one-off experiment with prompt net namespace
>>>>> destruction and syzkaller was able to easily extract a C reproducer:
>>>>> https://gist.githubusercontent.com/dvyukov/d571e8fff24e127ca48a8c4790d42bfa/raw/52050e93ba9afbb5126b9d7bb39b7e71a82af016/gistfile1.txt
>>>>>
>>>>> On upstream 16e205cf42da1f497b10a4a24f563e6c0d574eec with this config:
>>>>> https://gist.githubusercontent.com/dvyukov/9663c57443adb21f2795b92ef0829d62/raw/bbea0652e23746096dd56855a28f6c681aebcdee/gistfile1.txt
>>>>>
>>>>> this gives me:
>>>>>
>>>>> [   83.183198] unregister_netdevice: waiting for lo to become free.
>>>>> Usage count = 9
>>>>> [   85.231202] unregister_netdevice: waiting for lo to become free.
>>>>> Usage count = 9
>>>>> ...
>>>>> [  523.511205] unregister_netdevice: waiting for lo to become free.
>>>>> Usage count = 9
>>>>> ...
>>>>>
>>>>> This is generated from this syzkaller program:
>>>>>
>>>>> r0 = socket$inet6(0xa, 0x1, 0x84)
>>>>> setsockopt$inet6_IPV6_XFRM_POLICY(r0, 0x29, 0x23,
>>>>> &(0x7f0000000380)={{{@in6=@remote={0xfe, 0x80, [], 0xbb},
>>>>> @in=@dev={0xac, 0x14, 0x14}, 0x0, 0x0, 0x0, 0x0, 0xa}, {}, {}, 0x0,
>>>>> 0x0, 0x1}, {{@in=@local={0xac, 0x14, 0x14, 0xaa}, 0x0, 0x32}, 0x0,
>>>>> @in=@local={0xac, 0x14, 0x14, 0xaa}, 0x3504}}, 0xe8)
>>>>> bind$inet6(r0, &(0x7f0000000000)={0xa, 0x4e20}, 0x1c)
>>>>> connect$inet(r0, &(0x7f0000000040)={0x2, 0x4e20, @dev={0xac, 0x14,
>>>>> 0x14, 0xd}}, 0x10)
>>>>> syz_emit_ethernet(0x3e, &(0x7f00000001c0)={@local={[0xaa, 0xaa, 0xaa,
>>>>> 0xaa, 0xaa], 0xaa}, @dev={[0xaa, 0xaa, 0xaa, 0xaa, 0xaa]}, [],
>>>>> {@ipv6={0x86dd, {0x0, 0x6, "50a09c", 0x8, 0xffffff11, 0x0,
>>>>> @remote={0xfe, 0x80, [], 0xbb}, @local={0xfe, 0x80, [], 0xaa}, {[],
>>>>> @udp={0x0, 0x4e20, 0x8}}}}}}, &(0x7f0000000040))
>>>>>
>>>>> So this seems to be related to IPv6 and/or xfrm and is potentially
>>>>> caused by external packets (that syz_emit_ethernet call).
>>>>
>>>>
>>>>
>>>> Here is another repro which seems to be a different bug (note that it
>>>> requires fault injection):
>>>>
>>>> https://gist.githubusercontent.com/dvyukov/1c56623016cc4c24a69d433c5114ad5b/raw/530478f571b195193101b912aa646948528baa8e/gistfile1.txt
>>>>
>>>> Dan, do you mind taking a look at them? Fixing these should eliminate
>>>> root causes of these hangs/leaks.
>>>
>>> Yep I will look at them, thanks for the reproducers.
>>
>> Hi Dan,
>>
>> Any updates on this? syzbot is hitting this all the time.
>
> Sorry, the recent changes from net_mutex -> net_rwsem/pernet_ops_rwsem
> have complicated what I had done to workaround this, but I'm still
> working on it.  Apologies for the delay.

Are you looking at the mitigation? Or the bugs that trigger it? Or both?

^ permalink raw reply

* Re: [PATCH net] macmace: Set platform device coherent_dma_mask
From: Michael Schmitz @ 2018-05-11  9:30 UTC (permalink / raw)
  To: Finn Thain, Geert Uytterhoeven
  Cc: David S. Miller, linux-m68k, netdev, Linux Kernel Mailing List,
	Christoph Hellwig
In-Reply-To: <alpine.LNX.2.21.1805111451220.8@nippy.intranet>

Hi Finn,

Am 11.05.2018 um 17:28 schrieb Finn Thain:
> On Fri, 11 May 2018, Michael Schmitz wrote:
> 
>>
>> I'm afraid using platform_device_register() (which you already use for 
>> the SCC devices) is the only option handling this on a per-device basis 
>> without touching platform core code, while at the same time keeping the 
>> DMA mask setup out of device drivers
> 
> I don't think that will fly. If you call platform_device_register() and 
> follow that with a dma mask assignment, you could race with the bus 
> matching and driver probe, and we are back to the same WARNING message.

I wasn't proposing to follow platform_device_register() with the mask
assignment, but rather to use the same strategy from the Coldfire FEC
patch (f61e64310b75): set pdev.dev.coherent_dma_mask and
pdev.dev.dma_mask _before_ calling platform_device_register().

> If you want to use platform_device_register(), you'd have to implement 
> arch_setup_pdev_archdata() and use that to set up the dma mask.

If you want to avoid the overhead of defining the macmace platform
device and using platform_device_register() ... seeing as you would not
be defining just the DMA mask and nothing else, that's probably the
least effort for the MACE and SONIC drivers.

>> (I can see Geert's point there - device driver code might be shared 
>> across implementations of the device on platforms with different DMA 
>> mask requirements,, something the driver can't be expected to know 
>> about).
> 
> As I said, these drivers might be expected to be portable between Macs and 
> early PowerMacs, but the same dma mask would apply AFAIK.
> 
> If a platform driver isn't expected to be portable, I think either method 
> is reasonable: arch_setup_pdev_archdata() or the method in the patch.
> 
> Anyway, there is this in arch/powerpc/kernel/setup-common.c:
> 
> void arch_setup_pdev_archdata(struct platform_device *pdev)
> {
>         pdev->archdata.dma_mask = DMA_BIT_MASK(32);
>         pdev->dev.dma_mask = &pdev->archdata.dma_mask;
> 	...

You would have to be careful not to overwrite a pdev->dev.dma_mask and
pdev->dev.dma_coherent_mask that might have been set in a platform
device passed via platform_device_register here. Coldfire is the only
m68k platform currently using that, but there might be others in future.

> I'm inclined to propose something similar for m68k. That should fix the 
> problem, since arch_setup_pdev_archdata() is already in the call chain:
> 
> 	platform_device_register_simple()
> 		platform_device_register_resndata()
> 			platform_device_register_full()
> 				platform_device_alloc()
> 					arch_setup_pdev_archdata()
> 
> Thoughts? Will this have nasty side effects for m68k platforms that use 
> smaller dma masks?

These can still set a smaller mask in pdev->dev directly and use
platform_device_register() instead. But I don't think there are smaller
DMA masks used by m68k drivers that use the platform device mechanism at
present. I've only looked at arch/m68k though.

Cheers,

	Michael

^ permalink raw reply

* [PATCH 1/3] bonding: replace the return value type
From: Tonghao Zhang @ 2018-05-11  9:52 UTC (permalink / raw)
  To: netdev; +Cc: Tonghao Zhang

The method ndo_start_xmit is defined as returning a
netdev_tx_t, which is a typedef for an enum type,
but the implementation in this driver returns an int.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
 drivers/net/bonding/bond_alb.c  |  8 ++++----
 drivers/net/bonding/bond_main.c | 12 ++++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 1ed9529..601f678 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1316,8 +1316,8 @@ void bond_alb_deinitialize(struct bonding *bond)
 		rlb_deinitialize(bond);
 }
 
-static int bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
-			    struct slave *tx_slave)
+static netdev_tx_t bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
+				    struct slave *tx_slave)
 {
 	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
 	struct ethhdr *eth_data = eth_hdr(skb);
@@ -1351,7 +1351,7 @@ static int bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
 	return NETDEV_TX_OK;
 }
 
-int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
+netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct ethhdr *eth_data;
@@ -1389,7 +1389,7 @@ int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 	return bond_do_alb_xmit(skb, bond, tx_slave);
 }
 
-int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
+netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct ethhdr *eth_data;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 718e491..01bca86 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3805,7 +3805,8 @@ static u32 bond_rr_gen_slave_id(struct bonding *bond)
 	return slave_id;
 }
 
-static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
+					struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct iphdr *iph = ip_hdr(skb);
@@ -3841,7 +3842,8 @@ static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev
 /* In active-backup mode, we know that bond->curr_active_slave is always valid if
  * the bond has a usable interface.
  */
-static int bond_xmit_activebackup(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
+					  struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct slave *slave;
@@ -3979,7 +3981,8 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
  * usable slave array is formed in the control path. The xmit function
  * just calculates hash and sends the packet out.
  */
-static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb,
+				     struct net_device *dev)
 {
 	struct bonding *bond = netdev_priv(dev);
 	struct slave *slave;
@@ -3999,7 +4002,8 @@ static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 /* in broadcast mode, we send everything to all usable interfaces. */
-static int bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb,
+				       struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct slave *slave = NULL;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH 1/3] bonding: replace the return value type
From: Tonghao Zhang @ 2018-05-11  9:53 UTC (permalink / raw)
  To: netdev; +Cc: Tonghao Zhang

The method ndo_start_xmit is defined as returning a
netdev_tx_t, which is a typedef for an enum type,
but the implementation in this driver returns an int.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
 drivers/net/bonding/bond_alb.c  |  8 ++++----
 drivers/net/bonding/bond_main.c | 12 ++++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 1ed9529..601f678 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1316,8 +1316,8 @@ void bond_alb_deinitialize(struct bonding *bond)
 		rlb_deinitialize(bond);
 }
 
-static int bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
-			    struct slave *tx_slave)
+static netdev_tx_t bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
+				    struct slave *tx_slave)
 {
 	struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
 	struct ethhdr *eth_data = eth_hdr(skb);
@@ -1351,7 +1351,7 @@ static int bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
 	return NETDEV_TX_OK;
 }
 
-int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
+netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct ethhdr *eth_data;
@@ -1389,7 +1389,7 @@ int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 	return bond_do_alb_xmit(skb, bond, tx_slave);
 }
 
-int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
+netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct ethhdr *eth_data;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 718e491..01bca86 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3805,7 +3805,8 @@ static u32 bond_rr_gen_slave_id(struct bonding *bond)
 	return slave_id;
 }
 
-static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
+					struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct iphdr *iph = ip_hdr(skb);
@@ -3841,7 +3842,8 @@ static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev
 /* In active-backup mode, we know that bond->curr_active_slave is always valid if
  * the bond has a usable interface.
  */
-static int bond_xmit_activebackup(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
+					  struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct slave *slave;
@@ -3979,7 +3981,8 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
  * usable slave array is formed in the control path. The xmit function
  * just calculates hash and sends the packet out.
  */
-static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb,
+				     struct net_device *dev)
 {
 	struct bonding *bond = netdev_priv(dev);
 	struct slave *slave;
@@ -3999,7 +4002,8 @@ static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 /* in broadcast mode, we send everything to all usable interfaces. */
-static int bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb,
+				       struct net_device *bond_dev)
 {
 	struct bonding *bond = netdev_priv(bond_dev);
 	struct slave *slave = NULL;
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH 2/3] bonding: use the skb_get/set_queue_mapping
From: Tonghao Zhang @ 2018-05-11  9:53 UTC (permalink / raw)
  To: netdev; +Cc: Tonghao Zhang
In-Reply-To: <1526032392-65791-1-git-send-email-xiangxia.m.yue@gmail.com>

Use the skb_get_queue_mapping, skb_set_queue_mapping
and skb_rx_queue_recorded for skb queue_mapping in bonding
driver, but not use it directly.

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
 drivers/net/bonding/bond_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 01bca86..966d091 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -247,7 +247,7 @@ void bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb,
 
 	BUILD_BUG_ON(sizeof(skb->queue_mapping) !=
 		     sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping));
-	skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+	skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping);
 
 	if (unlikely(netpoll_tx_running(bond->dev)))
 		bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb);
@@ -4040,12 +4040,12 @@ static inline int bond_slave_override(struct bonding *bond,
 	struct slave *slave = NULL;
 	struct list_head *iter;
 
-	if (!skb->queue_mapping)
+	if (!skb_rx_queue_recorded(skb))
 		return 1;
 
 	/* Find out if any slaves have the same mapping as this skb. */
 	bond_for_each_slave_rcu(bond, slave, iter) {
-		if (slave->queue_id == skb->queue_mapping) {
+		if (slave->queue_id == skb_get_queue_mapping(skb)) {
 			if (bond_slave_is_up(slave) &&
 			    slave->link == BOND_LINK_UP) {
 				bond_dev_queue_xmit(bond, skb, slave->dev);
@@ -4071,7 +4071,7 @@ static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
 	u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
 
 	/* Save the original txq to restore before passing to the driver */
-	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+	qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb_get_queue_mapping(skb);
 
 	if (unlikely(txq >= dev->real_num_tx_queues)) {
 		do {
-- 
1.8.3.1

^ permalink raw reply related

* [PATCH 3/3] net: doc: fix spelling mistake: "modrobe.d" -> "modprobe.d"
From: Tonghao Zhang @ 2018-05-11  9:53 UTC (permalink / raw)
  To: netdev; +Cc: Tonghao Zhang
In-Reply-To: <1526032392-65791-1-git-send-email-xiangxia.m.yue@gmail.com>

Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
 Documentation/networking/bonding.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 9ba04c0..c13214d 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -140,7 +140,7 @@ bonding module at load time, or are specified via sysfs.
 
 	Module options may be given as command line arguments to the
 insmod or modprobe command, but are usually specified in either the
-/etc/modrobe.d/*.conf configuration files, or in a distro-specific
+/etc/modprobe.d/*.conf configuration files, or in a distro-specific
 configuration file (some of which are detailed in the next section).
 
 	Details on bonding support for sysfs is provided in the
-- 
1.8.3.1

^ permalink raw reply related

* Re: INFO: rcu detected stall in kfree_skbmem
From: Dmitry Vyukov @ 2018-05-11 10:00 UTC (permalink / raw)
  To: syzbot, Vladislav Yasevich, Neil Horman, Marcelo Ricardo Leitner,
	linux-sctp
  Cc: Andrei Vagin, David Miller, Kirill Tkhai, LKML, netdev,
	syzkaller-bugs
In-Reply-To: <000000000000a9b0e3056b14bfb2@google.com>

On Mon, Apr 30, 2018 at 8:09 PM, syzbot
<syzbot+fc78715ba3b3257caf6a@syzkaller.appspotmail.com> wrote:
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit:    5d1365940a68 Merge
> git://git.kernel.org/pub/scm/linux/kerne...
> git tree:       net-next
> console output: https://syzkaller.appspot.com/x/log.txt?id=5667997129637888
> kernel config:
> https://syzkaller.appspot.com/x/.config?id=-5947642240294114534
> dashboard link: https://syzkaller.appspot.com/bug?extid=fc78715ba3b3257caf6a
> compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
>
> Unfortunately, I don't have any reproducer for this crash yet.

This looks sctp-related, +sctp maintainers.

> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+fc78715ba3b3257caf6a@syzkaller.appspotmail.com
>
> INFO: rcu_sched self-detected stall on CPU
>         1-...!: (1 GPs behind) idle=a3e/1/4611686018427387908
> softirq=71980/71983 fqs=33
>          (t=125000 jiffies g=39438 c=39437 q=958)
> rcu_sched kthread starved for 124829 jiffies! g39438 c39437 f0x0
> RCU_GP_WAIT_FQS(3) ->state=0x0 ->cpu=0
> RCU grace-period kthread stack dump:
> rcu_sched       R  running task    23768     9      2 0x80000000
> Call Trace:
>  context_switch kernel/sched/core.c:2848 [inline]
>  __schedule+0x801/0x1e30 kernel/sched/core.c:3490
>  schedule+0xef/0x430 kernel/sched/core.c:3549
>  schedule_timeout+0x138/0x240 kernel/time/timer.c:1801
>  rcu_gp_kthread+0x6b5/0x1940 kernel/rcu/tree.c:2231
>  kthread+0x345/0x410 kernel/kthread.c:238
>  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:411
> NMI backtrace for cpu 1
> CPU: 1 PID: 20560 Comm: syz-executor4 Not tainted 4.16.0+ #1
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>  <IRQ>
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x1b9/0x294 lib/dump_stack.c:113
>  nmi_cpu_backtrace.cold.4+0x19/0xce lib/nmi_backtrace.c:103
>  nmi_trigger_cpumask_backtrace+0x151/0x192 lib/nmi_backtrace.c:62
>  arch_trigger_cpumask_backtrace+0x14/0x20 arch/x86/kernel/apic/hw_nmi.c:38
>  trigger_single_cpu_backtrace include/linux/nmi.h:156 [inline]
>  rcu_dump_cpu_stacks+0x175/0x1c2 kernel/rcu/tree.c:1376
>  print_cpu_stall kernel/rcu/tree.c:1525 [inline]
>  check_cpu_stall.isra.61.cold.80+0x36c/0x59a kernel/rcu/tree.c:1593
>  __rcu_pending kernel/rcu/tree.c:3356 [inline]
>  rcu_pending kernel/rcu/tree.c:3401 [inline]
>  rcu_check_callbacks+0x21b/0xad0 kernel/rcu/tree.c:2763
>  update_process_times+0x2d/0x70 kernel/time/timer.c:1636
>  tick_sched_handle+0x9f/0x180 kernel/time/tick-sched.c:173
>  tick_sched_timer+0x45/0x130 kernel/time/tick-sched.c:1283
>  __run_hrtimer kernel/time/hrtimer.c:1386 [inline]
>  __hrtimer_run_queues+0x3e3/0x10a0 kernel/time/hrtimer.c:1448
>  hrtimer_interrupt+0x286/0x650 kernel/time/hrtimer.c:1506
>  local_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1025 [inline]
>  smp_apic_timer_interrupt+0x15d/0x710 arch/x86/kernel/apic/apic.c:1050
>  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:862
> RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783
> [inline]
> RIP: 0010:kmem_cache_free+0xb3/0x2d0 mm/slab.c:3757
> RSP: 0018:ffff8801db105228 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
> RAX: 0000000000000007 RBX: ffff8800b055c940 RCX: 1ffff1003b2345a5
> RDX: 0000000000000000 RSI: ffff8801d91a2d80 RDI: 0000000000000282
> RBP: ffff8801db105248 R08: ffff8801d91a2cb8 R09: 0000000000000002
> R10: ffff8801d91a2480 R11: 0000000000000000 R12: ffff8801d9848e40
> R13: 0000000000000282 R14: ffffffff85b7f27c R15: 0000000000000000
>  kfree_skbmem+0x13c/0x210 net/core/skbuff.c:582
>  __kfree_skb net/core/skbuff.c:642 [inline]
>  kfree_skb+0x19d/0x560 net/core/skbuff.c:659
>  enqueue_to_backlog+0x2fc/0xc90 net/core/dev.c:3968
>  netif_rx_internal+0x14d/0xae0 net/core/dev.c:4181
>  netif_rx+0xba/0x400 net/core/dev.c:4206
>  loopback_xmit+0x283/0x741 drivers/net/loopback.c:91
>  __netdev_start_xmit include/linux/netdevice.h:4087 [inline]
>  netdev_start_xmit include/linux/netdevice.h:4096 [inline]
>  xmit_one net/core/dev.c:3053 [inline]
>  dev_hard_start_xmit+0x264/0xc10 net/core/dev.c:3069
>  __dev_queue_xmit+0x2724/0x34c0 net/core/dev.c:3584
>  dev_queue_xmit+0x17/0x20 net/core/dev.c:3617
>  neigh_hh_output include/net/neighbour.h:472 [inline]
>  neigh_output include/net/neighbour.h:480 [inline]
>  ip6_finish_output2+0x134e/0x2810 net/ipv6/ip6_output.c:120
>  ip6_finish_output+0x5fe/0xbc0 net/ipv6/ip6_output.c:154
>  NF_HOOK_COND include/linux/netfilter.h:277 [inline]
>  ip6_output+0x227/0x9b0 net/ipv6/ip6_output.c:171
>  dst_output include/net/dst.h:444 [inline]
>  NF_HOOK include/linux/netfilter.h:288 [inline]
>  ip6_xmit+0xf51/0x23f0 net/ipv6/ip6_output.c:277
>  sctp_v6_xmit+0x4a5/0x6b0 net/sctp/ipv6.c:225
>  sctp_packet_transmit+0x26f6/0x3ba0 net/sctp/output.c:650
>  sctp_outq_flush+0x1373/0x4370 net/sctp/outqueue.c:1197
>  sctp_outq_uncork+0x6a/0x80 net/sctp/outqueue.c:776
>  sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1820 [inline]
>  sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
>  sctp_do_sm+0x596/0x7160 net/sctp/sm_sideeffect.c:1191
>  sctp_generate_heartbeat_event+0x218/0x450 net/sctp/sm_sideeffect.c:406
>  call_timer_fn+0x230/0x940 kernel/time/timer.c:1326
>  expire_timers kernel/time/timer.c:1363 [inline]
>  __run_timers+0x79e/0xc50 kernel/time/timer.c:1666
>  run_timer_softirq+0x4c/0x70 kernel/time/timer.c:1692
>  __do_softirq+0x2e0/0xaf5 kernel/softirq.c:285
>  invoke_softirq kernel/softirq.c:365 [inline]
>  irq_exit+0x1d1/0x200 kernel/softirq.c:405
>  exiting_irq arch/x86/include/asm/apic.h:525 [inline]
>  smp_apic_timer_interrupt+0x17e/0x710 arch/x86/kernel/apic/apic.c:1052
>  apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:862
>  </IRQ>
> RIP: 0010:arch_local_irq_restore arch/x86/include/asm/paravirt.h:783
> [inline]
> RIP: 0010:lock_release+0x4d4/0xa10 kernel/locking/lockdep.c:3942
> RSP: 0018:ffff8801971ce7b0 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff13
> RAX: dffffc0000000000 RBX: 1ffff10032e39cfb RCX: 1ffff1003b234595
> RDX: 1ffffffff11630ed RSI: 0000000000000002 RDI: 0000000000000282
> RBP: ffff8801971ce8e0 R08: 1ffff10032e39cff R09: ffffed003b6246c2
> R10: 0000000000000003 R11: 0000000000000001 R12: ffff8801d91a2480
> R13: ffffffff88b8df60 R14: ffff8801d91a2480 R15: ffff8801971ce7f8
>  rcu_lock_release include/linux/rcupdate.h:251 [inline]
>  rcu_read_unlock include/linux/rcupdate.h:688 [inline]
>  __unlock_page_memcg+0x72/0x100 mm/memcontrol.c:1654
>  unlock_page_memcg+0x2c/0x40 mm/memcontrol.c:1663
>  page_remove_file_rmap mm/rmap.c:1248 [inline]
>  page_remove_rmap+0x6f2/0x1250 mm/rmap.c:1299
>  zap_pte_range mm/memory.c:1337 [inline]
>  zap_pmd_range mm/memory.c:1441 [inline]
>  zap_pud_range mm/memory.c:1470 [inline]
>  zap_p4d_range mm/memory.c:1491 [inline]
>  unmap_page_range+0xeb4/0x2200 mm/memory.c:1512
>  unmap_single_vma+0x1a0/0x310 mm/memory.c:1557
>  unmap_vmas+0x120/0x1f0 mm/memory.c:1587
>  exit_mmap+0x265/0x570 mm/mmap.c:3038
>  __mmput kernel/fork.c:962 [inline]
>  mmput+0x251/0x610 kernel/fork.c:983
>  exit_mm kernel/exit.c:544 [inline]
>  do_exit+0xe98/0x2730 kernel/exit.c:852
>  do_group_exit+0x16f/0x430 kernel/exit.c:968
>  get_signal+0x886/0x1960 kernel/signal.c:2469
>  do_signal+0x98/0x2040 arch/x86/kernel/signal.c:810
>  exit_to_usermode_loop+0x28a/0x310 arch/x86/entry/common.c:162
>  prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
>  syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
>  do_syscall_64+0x792/0x9d0 arch/x86/entry/common.c:292
>  entry_SYSCALL_64_after_hwframe+0x42/0xb7
> RIP: 0033:0x455319
> RSP: 002b:00007fa346e81ce8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca
> RAX: fffffffffffffe00 RBX: 000000000072bf80 RCX: 0000000000455319
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: 000000000072bf80
> RBP: 000000000072bf80 R08: 0000000000000000 R09: 000000000072bf58
> R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
> R13: 0000000000a3e81f R14: 00007fa346e829c0 R15: 0000000000000001
>
>
> ---
> This bug is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at syzkaller@googlegroups.com.
>
> syzbot will keep track of this bug report.
> If you forgot to add the Reported-by tag, once the fix for this bug is
> merged
> into any tree, please reply to this email with:
> #syz fix: exact-commit-title
> To mark this as a duplicate of another syzbot report, please reply with:
> #syz dup: exact-subject-of-another-report
> If it's a one-off invalid bug report, please reply with:
> #syz invalid
> Note: if the crash happens again, it will cause creation of a new bug
> report.
> Note: all commands must start from beginning of the line in the email body.
>
> --
> You received this message because you are subscribed to the Google Groups
> "syzkaller-bugs" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to syzkaller-bugs+unsubscribe@googlegroups.com.
> To view this discussion on the web visit
> https://groups.google.com/d/msgid/syzkaller-bugs/000000000000a9b0e3056b14bfb2%40google.com.
> For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply

* Re: [PATCH net] macmace: Set platform device coherent_dma_mask
From: Finn Thain @ 2018-05-11 10:06 UTC (permalink / raw)
  To: Michael Schmitz
  Cc: Geert Uytterhoeven, David S. Miller, linux-m68k, netdev,
	Linux Kernel Mailing List, Christoph Hellwig
In-Reply-To: <8c2c3ad5-eeb5-cf61-b9fb-c6096619e310@gmail.com>

On Fri, 11 May 2018, Michael Schmitz wrote:

> 
> I wasn't proposing to follow platform_device_register() with the mask 
> assignment,

I see.

> but rather to use the same strategy from the Coldfire FEC patch 
> (f61e64310b75): set pdev.dev.coherent_dma_mask and pdev.dev.dma_mask 
> _before_ calling platform_device_register().
> 

That patch is attempting to solve the same problem using a different 
approach, but I'd prefer to keep using platform_device_register_simple().

> 
> If you want to avoid the overhead of defining the macmace platform 
> device and using platform_device_register() ... seeing as you would not 
> be defining just the DMA mask and nothing else, that's probably the 
> least effort for the MACE and SONIC drivers.
> 

I don't mind the effort. I want to avoid all the boilerplate.

> 
> You would have to be careful not to overwrite a pdev->dev.dma_mask and 
> pdev->dev.dma_coherent_mask that might have been set in a platform 
> device passed via platform_device_register here. Coldfire is the only 
> m68k platform currently using that, but there might be others in future.
> 

That Coldfire patch could be reverted if this is a better solution.

> ... But I don't think there are smaller DMA masks used by m68k drivers 
> that use the platform device mechanism at present. I've only looked at 
> arch/m68k though.

So we're back at the same problem that Geert's suggestion also raised: how 
to identify potentially affected platform devices and drivers?

Maybe we can take a leaf out of Christoph's book, and leave a noisy 
WARNING splat in the log.

void arch_setup_pdev_archdata(struct platform_device *pdev)
{
        WARN_ON_ONCE(pdev->dev.coherent_dma_mask != DMA_MASK_NONE ||
                     pdev->dev.dma_mask != NULL);
        pdev->dev.coherent_dma_mask = DMA_BIT_MASK(32);
        pdev->dev.dma_mask = &pdev->dev.coherent_dma_mask;
}

-- 

^ permalink raw reply

* Re: [PATCH] mlx4_core: allocate 4KB ICM chunks
From: Håkon Bugge @ 2018-05-11 10:27 UTC (permalink / raw)
  To: Qing Huang; +Cc: tariqt, davem, netdev, OFED mailing list, linux-kernel
In-Reply-To: <20180510233143.7236-1-qing.huang@oracle.com>



> On 11 May 2018, at 01:31, Qing Huang <qing.huang@oracle.com> wrote:
> 
> When a system is under memory presure (high usage with fragments),
> the original 256KB ICM chunk allocations will likely trigger kernel
> memory management to enter slow path doing memory compact/migration
> ops in order to complete high order memory allocations.
> 
> When that happens, user processes calling uverb APIs may get stuck
> for more than 120s easily even though there are a lot of free pages
> in smaller chunks available in the system.
> 
> Syslog:
> ...
> Dec 10 09:04:51 slcc03db02 kernel: [397078.572732] INFO: task
> oracle_205573_e:205573 blocked for more than 120 seconds.
> ...
> 
> With 4KB ICM chunk size, the above issue is fixed.
> 
> However in order to support 4KB ICM chunk size, we need to fix another
> issue in large size kcalloc allocations.
> 
> E.g.
> Setting log_num_mtt=30 requires 1G mtt entries. With the 4KB ICM chunk
> size, each ICM chunk can only hold 512 mtt entries (8 bytes for each mtt
> entry). So we need a 16MB allocation for a table->icm pointer array to
> hold 2M pointers which can easily cause kcalloc to fail.
> 
> The solution is to use vzalloc to replace kcalloc. There is no need
> for contiguous memory pages for a driver meta data structure (no need
> of DMA ops).
> 
> Signed-off-by: Qing Huang <qing.huang@oracle.com>
> Acked-by: Daniel Jurgens <danielj@mellanox.com>
> ---
> drivers/net/ethernet/mellanox/mlx4/icm.c | 14 +++++++-------
> 1 file changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/ethernet/mellanox/mlx4/icm.c b/drivers/net/ethernet/mellanox/mlx4/icm.c
> index a822f7a..2b17a4b 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/icm.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/icm.c
> @@ -43,12 +43,12 @@
> #include "fw.h"
> 
> /*
> - * We allocate in as big chunks as we can, up to a maximum of 256 KB
> - * per chunk.
> + * We allocate in 4KB page size chunks to avoid high order memory
> + * allocations in fragmented/high usage memory situation.
>  */
> enum {
> -	MLX4_ICM_ALLOC_SIZE	= 1 << 18,
> -	MLX4_TABLE_CHUNK_SIZE	= 1 << 18
> +	MLX4_ICM_ALLOC_SIZE	= 1 << 12,
> +	MLX4_TABLE_CHUNK_SIZE	= 1 << 12

Shouldn’t these be the arch’s page size order? E.g., if running on SPARC, the hw page size is 8KiB.


Thxs, Håkon

> };
> 
> static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
> @@ -400,7 +400,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
> 	obj_per_chunk = MLX4_TABLE_CHUNK_SIZE / obj_size;
> 	num_icm = (nobj + obj_per_chunk - 1) / obj_per_chunk;
> 
> -	table->icm      = kcalloc(num_icm, sizeof(*table->icm), GFP_KERNEL);
> +	table->icm      = vzalloc(num_icm * sizeof(*table->icm));
> 	if (!table->icm)
> 		return -ENOMEM;
> 	table->virt     = virt;
> @@ -446,7 +446,7 @@ int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
> 			mlx4_free_icm(dev, table->icm[i], use_coherent);
> 		}
> 
> -	kfree(table->icm);
> +	vfree(table->icm);
> 
> 	return -ENOMEM;
> }
> @@ -462,5 +462,5 @@ void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table)
> 			mlx4_free_icm(dev, table->icm[i], table->coherent);
> 		}
> 
> -	kfree(table->icm);
> +	vfree(table->icm);
> }
> -- 
> 2.9.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [rds-devel] KASAN: null-ptr-deref Read in rds_ib_get_mr
From: Sowmini Varadhan @ 2018-05-11 10:46 UTC (permalink / raw)
  To: Yanjun Zhu
  Cc: DaeRyong Jeong, santosh.shilimkar, davem, rds-devel, kt0755,
	linux-rdma, netdev, linux-kernel, byoungyoung
In-Reply-To: <fa3461d4-8872-48af-9b67-be0affd16bbd@oracle.com>

On (05/11/18 15:48), Yanjun Zhu wrote:
> diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
> index e678699..2228b50 100644
> --- a/net/rds/ib_rdma.c
> +++ b/net/rds/ib_rdma.c
> @@ -539,11 +539,17 @@ void rds_ib_flush_mrs(void)
>    void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
>                                         struct rds_sock *rs, u32 *key_ret)
>    {
> -             struct rds_ib_device *rds_ibdev;
> +             struct rds_ib_device *rds_ibdev = NULL;
>                 struct rds_ib_mr *ibmr = NULL;
> -             struct rds_ib_connection *ic = rs->rs_conn->c_transport_data;
> +             struct rds_ib_connection *ic = NULL;
>                 int ret;
> 
> +             if (rs->rs_bound_addr == 0) {
> +                             ret = -EPERM;
> +                             goto out;
> +             }
> +
> +             ic = rs->rs_conn->c_transport_data;
>                 rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
>                 if (!rds_ibdev) {
>                                 ret = -ENODEV;
> 
> I made this raw patch. If you can reproduce this bug, please make tests 
> with it.

I dont think this solves the problem, I think it
just changes the timing under which it can still happen. 

what if the rds_remove_bound() in rds_bind() happens after the check
for if (rs->rs_bound_addr == 0) added above by the patch 

I believe you need some type of synchronization (either
through mutex, or some atomic flag in the rs or similar) to make
sure rds_bind() and rds_ib_get_mr() are mutually exclusive.

--Sowmini
 

^ permalink raw reply

* Re: [RFC PATCH 1/3] arcnet: com20020: Add memory map of com20020
From: Andrea Greco @ 2018-05-11 10:50 UTC (permalink / raw)
  To: Rob Herring
  Cc: m.grzeschik, Andrea Greco, Mark Rutland, netdev, devicetree,
	linux-kernel
In-Reply-To: <20180508161636.GA23960@rob-hp-laptop>

On 05/08/2018 06:16 PM, Rob Herring wrote:
> On Sat, May 05, 2018 at 11:34:45PM +0200, Andrea Greco wrote:
>> From: Andrea Greco <a.greco@4sigma.it>
>>
>> Add support for com20022I/com20020, memory mapped chip version.
>> Support bus: Intel 80xx and Motorola 68xx.
>> Bus size: Only 8 bit bus size is supported.
>> Added related device tree bindings
>>
>> Signed-off-by: Andrea Greco <a.greco@4sigma.it>
>> ---
>>   .../devicetree/bindings/net/smsc-com20020.txt      |  23 +++
>
> Please split bindings to separate patch.

Ok
>
>>   drivers/net/arcnet/Kconfig                         |  12 +-
>>   drivers/net/arcnet/Makefile                        |   1 +
>>   drivers/net/arcnet/arcdevice.h                     |  27 ++-
>>   drivers/net/arcnet/com20020-membus.c               | 191 +++++++++++++++++++++
>>   drivers/net/arcnet/com20020.c                      |   9 +-
>>   6 files changed, 253 insertions(+), 10 deletions(-)
>>   create mode 100644 Documentation/devicetree/bindings/net/smsc-com20020.txt
>>   create mode 100644 drivers/net/arcnet/com20020-membus.c
>>
>> diff --git a/Documentation/devicetree/bindings/net/smsc-com20020.txt b/Documentation/devicetree/bindings/net/smsc-com20020.txt
>> new file mode 100644
>> index 000000000000..39c5b19c55af
>> --- /dev/null
>> +++ b/Documentation/devicetree/bindings/net/smsc-com20020.txt
>> @@ -0,0 +1,23 @@
>> +SMSC com20020, com20022I
>
> What does this device do?
>

Changed in:
SMSC com20020 Arcnet network controller

>> +
>> +timeout: Arcnet timeout, checkout datashet
>> +clockp: Clock Prescaler, checkout datashet
>
> s/datashet/datasheet/
>
>> +clockm: Clock multiplier, checkout datasheet
>
> Would these 3 properties be common for arcnet devices? If not, then they
> should have a vendor prefix.
>

Timeout is arcnet propelty:
Other is smsc params, then become:
- timeout: Arcnet timeout
- smsc-clockp: Clock Prescaler
- smsc-clockm: Clock multiplier
- smsc-backplane: Controller use backplane mode inside of transceiver

I forget backplane propelty, but is required

>> +
>> +phy-reset-gpios: Chip reset ppin
>
> Use 'reset-gpios' as that is standard.
>
>> +phy-irq-gpios: Chip irq pin
>
> Use 'interrupts'. Interrupt capable gpio controllers are also interrupt
> controllers.
>

Ok, change to standard

>> +
>> +com20020_A@0 {
>
> Node names should be generic based on the class of device. I don't think
> we have one defined, but how about 'arcnet'.
>
> Unit addresses must have a corresponding reg property. How is this
> device accessed?
>

Then: arcnet@28000000

>> +    compatible = "smsc,com20020";
>
> Not documented.
>
I miss something? Where add this doc?
Is not this file?
Documentation/devicetree/bindings/net/smsc-com20020.txt

>> +
>> + timeout = <0x3>;
>> + backplane = <0x0>;
>> +
>> + clockp = <0x0>;
>> + clockm = <0x3>;
>> +
>> + phy-reset-gpios = <&gpio3 21 GPIO_ACTIVE_LOW>;
>> + phy-irq-gpios = <&gpio2 10 GPIO_ACTIVE_LOW>;
>> +
>> + status = "okay";
>
> Don't should status in examples.
>
>> +};
Ok

Final result of new Patch, for bindings:

SMSC com20020 Arcnet network controller

Required propelty:
- timeout: Arcnet timeout
- smsc-clockp: Clock Prescaler
- smsc-clockm: Clock multiplier
- smsc-backplane: Controller use backplane mode inside of transceiver

- reset-gpios: Chip reset pin
- interrupts: Should contain controller interrupt

arcnet@28000000 {
     compatible = "smsc,com20020";

timeout = <0x3>;
smsc-backplane = <0x0>;
smsc-clockp = <0x0>;
smsc-clockm = <0x3>;

reset-gpios = <&gpio3 21 GPIO_ACTIVE_LOW>;
interrupts = <&gpio2 10 GPIO_ACTIVE_LOW>;
};

Andrea

^ permalink raw reply

* aio poll and a new in-kernel poll API V10
From: Christoph Hellwig @ 2018-05-11 11:07 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel

Hi all,

this series adds support for the IOCB_CMD_POLL operation to poll for the
readyness of file descriptors using the aio subsystem.  The API is based
on patches that existed in RHAS2.1 and RHEL3, which means it already is
supported by libaio.  To implement the poll support efficiently new
methods to poll are introduced in struct file_operations:  get_poll_head
and poll_mask.  The first one returns a wait_queue_head to wait on
(lifetime is bound by the file), and the second does a non-blocking
check for the POLL* events.  This allows aio poll to work without
any additional context switches, unlike epoll.

This series sits on top of the aio-fsync series that also includes
support for io_pgetevents.

The changes were sponsored by Scylladb, and improve performance
of the seastar framework up to 10%, while also removing the need
for a privileged SCHED_FIFO epoll listener thread.

    git://git.infradead.org/users/hch/vfs.git aio-poll.10

Gitweb:

    http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/aio-poll.10

Libaio changes:

    https://pagure.io/libaio.git io-poll

Seastar changes (not updated for the new io_pgetevens ABI yet):

    https://github.com/avikivity/seastar/commits/aio


Changes since v9:
 - add to the delayed_cancel_reqs earlier to avoid a race
 - get rid of POLL_TO_PTR magic

Changes since v8:
 - make delayed cancellation conditional again
 - add a cancel_kiocb file operation to split delayed vs normal cancel

Changes since v7:
 - make delayed cancellation safe and unconditional

Changes since v6:
 - reworked cancellation

Changes since v5:
 - small changelog updates
 - rebased on top of the aio-fsync changes

Changes since v4:
 - rebased ontop of Linux 4.16-rc4

Changes since v3:
 - remove the pre-sleep ->poll_mask call in vfs_poll,
   allow ->get_poll_head to return POLL* values.

Changes since v2:
 - removed a double initialization
 - new vfs_get_poll_head helper
 - document that ->get_poll_head can return NULL
 - call ->poll_mask before sleeping
 - various ACKs
 - add conversion of random to ->poll_mask
 - add conversion of af_alg to ->poll_mask
 - lacking ->poll_mask support now returns -EINVAL for IOCB_CMD_POLL
 - reshuffled the series so that prep patches and everything not
   requiring the new in-kernel poll API is in the beginning

Changes since v1:
 - handle the NULL ->poll case in vfs_poll
 - dropped the file argument to the ->poll_mask socket operation
 - replace the ->pre_poll socket operation with ->get_poll_head as
   in the file operations

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply

* [PATCH 01/32] fs: unexport poll_schedule_timeout
From: Christoph Hellwig @ 2018-05-11 11:07 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180511110803.10910-1-hch@lst.de>

No users outside of select.c.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/select.c          | 3 +--
 include/linux/poll.h | 2 --
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index ba879c51288f..a87f396f0313 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -233,7 +233,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	add_wait_queue(wait_address, &entry->wait);
 }
 
-int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
+static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 			  ktime_t *expires, unsigned long slack)
 {
 	int rc = -EINTR;
@@ -258,7 +258,6 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 
 	return rc;
 }
-EXPORT_SYMBOL(poll_schedule_timeout);
 
 /**
  * poll_select_set_timeout - helper function to setup the timeout value
diff --git a/include/linux/poll.h b/include/linux/poll.h
index f45ebd017eaa..a3576da63377 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -96,8 +96,6 @@ struct poll_wqueues {
 
 extern void poll_initwait(struct poll_wqueues *pwq);
 extern void poll_freewait(struct poll_wqueues *pwq);
-extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
-				 ktime_t *expires, unsigned long slack);
 extern u64 select_estimate_accuracy(struct timespec64 *tv);
 
 #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
-- 
2.17.0

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 02/32] fs: cleanup do_pollfd
From: Christoph Hellwig @ 2018-05-11 11:07 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180511110803.10910-1-hch@lst.de>

Use straightline code with failure handling gotos instead of a lot
of nested conditionals.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/select.c | 48 +++++++++++++++++++++++-------------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/fs/select.c b/fs/select.c
index a87f396f0313..25da26253485 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -812,34 +812,32 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
 				     bool *can_busy_poll,
 				     __poll_t busy_flag)
 {
-	__poll_t mask;
-	int fd;
-
-	mask = 0;
-	fd = pollfd->fd;
-	if (fd >= 0) {
-		struct fd f = fdget(fd);
-		mask = EPOLLNVAL;
-		if (f.file) {
-			/* userland u16 ->events contains POLL... bitmap */
-			__poll_t filter = demangle_poll(pollfd->events) |
-						EPOLLERR | EPOLLHUP;
-			mask = DEFAULT_POLLMASK;
-			if (f.file->f_op->poll) {
-				pwait->_key = filter;
-				pwait->_key |= busy_flag;
-				mask = f.file->f_op->poll(f.file, pwait);
-				if (mask & busy_flag)
-					*can_busy_poll = true;
-			}
-			/* Mask out unneeded events. */
-			mask &= filter;
-			fdput(f);
-		}
+	int fd = pollfd->fd;
+	__poll_t mask = 0, filter;
+	struct fd f;
+
+	if (fd < 0)
+		goto out;
+	mask = EPOLLNVAL;
+	f = fdget(fd);
+	if (!f.file)
+		goto out;
+
+	/* userland u16 ->events contains POLL... bitmap */
+	filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
+	mask = DEFAULT_POLLMASK;
+	if (f.file->f_op->poll) {
+		pwait->_key = filter | busy_flag;
+		mask = f.file->f_op->poll(f.file, pwait);
+		if (mask & busy_flag)
+			*can_busy_poll = true;
 	}
+	mask &= filter;		/* Mask out unneeded events. */
+	fdput(f);
+
+out:
 	/* ... and so does ->revents */
 	pollfd->revents = mangle_poll(mask);
-
 	return mask;
 }
 
-- 
2.17.0

^ permalink raw reply related

* [PATCH 03/32] fs: update documentation to mention __poll_t and match the code
From: Christoph Hellwig @ 2018-05-11 11:07 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180511110803.10910-1-hch@lst.de>

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 Documentation/filesystems/Locking | 2 +-
 Documentation/filesystems/vfs.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 75d2d57e2c44..220bba28f72b 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -439,7 +439,7 @@ prototypes:
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
-	unsigned int (*poll) (struct file *, struct poll_table_struct *);
+	__poll_t (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 5fd325df59e2..f608180ad59d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -856,7 +856,7 @@ struct file_operations {
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
-	unsigned int (*poll) (struct file *, struct poll_table_struct *);
+	__poll_t (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
-- 
2.17.0

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 04/32] fs: add new vfs_poll and file_can_poll helpers
From: Christoph Hellwig @ 2018-05-11 11:07 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180511110803.10910-1-hch@lst.de>

These abstract out calls to the poll method in preparation for changes
in how we poll.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 drivers/staging/comedi/drivers/serial2002.c |  4 ++--
 drivers/vfio/virqfd.c                       |  2 +-
 drivers/vhost/vhost.c                       |  2 +-
 fs/eventpoll.c                              |  5 ++---
 fs/select.c                                 | 23 +++++++--------------
 include/linux/poll.h                        | 12 +++++++++++
 mm/memcontrol.c                             |  2 +-
 net/9p/trans_fd.c                           | 18 ++++------------
 virt/kvm/eventfd.c                          |  2 +-
 9 files changed, 32 insertions(+), 38 deletions(-)

diff --git a/drivers/staging/comedi/drivers/serial2002.c b/drivers/staging/comedi/drivers/serial2002.c
index b3f3b4a201af..5471b2212a62 100644
--- a/drivers/staging/comedi/drivers/serial2002.c
+++ b/drivers/staging/comedi/drivers/serial2002.c
@@ -113,7 +113,7 @@ static void serial2002_tty_read_poll_wait(struct file *f, int timeout)
 		long elapsed;
 		__poll_t mask;
 
-		mask = f->f_op->poll(f, &table.pt);
+		mask = vfs_poll(f, &table.pt);
 		if (mask & (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN |
 			    EPOLLHUP | EPOLLERR)) {
 			break;
@@ -136,7 +136,7 @@ static int serial2002_tty_read(struct file *f, int timeout)
 
 	result = -1;
 	if (!IS_ERR(f)) {
-		if (f->f_op->poll) {
+		if (file_can_poll(f)) {
 			serial2002_tty_read_poll_wait(f, timeout);
 
 			if (kernel_read(f, &ch, 1, &pos) == 1)
diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
index 085700f1be10..2a1be859ee71 100644
--- a/drivers/vfio/virqfd.c
+++ b/drivers/vfio/virqfd.c
@@ -166,7 +166,7 @@ int vfio_virqfd_enable(void *opaque,
 	init_waitqueue_func_entry(&virqfd->wait, virqfd_wakeup);
 	init_poll_funcptr(&virqfd->pt, virqfd_ptable_queue_proc);
 
-	events = irqfd.file->f_op->poll(irqfd.file, &virqfd->pt);
+	events = vfs_poll(irqfd.file, &virqfd->pt);
 
 	/*
 	 * Check if there was an event already pending on the eventfd
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f3bd8e941224..f6022881f147 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -208,7 +208,7 @@ int vhost_poll_start(struct vhost_poll *poll, struct file *file)
 	if (poll->wqh)
 		return 0;
 
-	mask = file->f_op->poll(file, &poll->table);
+	mask = vfs_poll(file, &poll->table);
 	if (mask)
 		vhost_poll_wakeup(&poll->wait, 0, 0, poll_to_key(mask));
 	if (mask & EPOLLERR) {
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 602ca4285b2e..67db22fe99c5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -884,8 +884,7 @@ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
 
 	pt->_key = epi->event.events;
 	if (!is_file_epoll(epi->ffd.file))
-		return epi->ffd.file->f_op->poll(epi->ffd.file, pt) &
-		       epi->event.events;
+		return vfs_poll(epi->ffd.file, pt) & epi->event.events;
 
 	ep = epi->ffd.file->private_data;
 	poll_wait(epi->ffd.file, &ep->poll_wait, pt);
@@ -2025,7 +2024,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 
 	/* The target file descriptor must support poll */
 	error = -EPERM;
-	if (!tf.file->f_op->poll)
+	if (!file_can_poll(tf.file))
 		goto error_tgt_fput;
 
 	/* Check if EPOLLWAKEUP is allowed */
diff --git a/fs/select.c b/fs/select.c
index 25da26253485..e30def680b2e 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -502,14 +502,10 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 					continue;
 				f = fdget(i);
 				if (f.file) {
-					const struct file_operations *f_op;
-					f_op = f.file->f_op;
-					mask = DEFAULT_POLLMASK;
-					if (f_op->poll) {
-						wait_key_set(wait, in, out,
-							     bit, busy_flag);
-						mask = (*f_op->poll)(f.file, wait);
-					}
+					wait_key_set(wait, in, out, bit,
+						     busy_flag);
+					mask = vfs_poll(f.file, wait);
+
 					fdput(f);
 					if ((mask & POLLIN_SET) && (in & bit)) {
 						res_in |= bit;
@@ -825,13 +821,10 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
 
 	/* userland u16 ->events contains POLL... bitmap */
 	filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
-	mask = DEFAULT_POLLMASK;
-	if (f.file->f_op->poll) {
-		pwait->_key = filter | busy_flag;
-		mask = f.file->f_op->poll(f.file, pwait);
-		if (mask & busy_flag)
-			*can_busy_poll = true;
-	}
+	pwait->_key = filter | busy_flag;
+	mask = vfs_poll(f.file, pwait);
+	if (mask & busy_flag)
+		*can_busy_poll = true;
 	mask &= filter;		/* Mask out unneeded events. */
 	fdput(f);
 
diff --git a/include/linux/poll.h b/include/linux/poll.h
index a3576da63377..7e0fdcf905d2 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -74,6 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 	pt->_key   = ~(__poll_t)0; /* all events enabled */
 }
 
+static inline bool file_can_poll(struct file *file)
+{
+	return file->f_op->poll;
+}
+
+static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
+{
+	if (unlikely(!file->f_op->poll))
+		return DEFAULT_POLLMASK;
+	return file->f_op->poll(file, pt);
+}
+
 struct poll_table_entry {
 	struct file *filp;
 	__poll_t key;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2bd3df3d101a..1695f38630f1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3849,7 +3849,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 	if (ret)
 		goto out_put_css;
 
-	efile.file->f_op->poll(efile.file, &event->pt);
+	vfs_poll(efile.file, &event->pt);
 
 	spin_lock(&memcg->event_list_lock);
 	list_add(&event->list, &memcg->event_list);
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 0cfba919d167..3811775692d0 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -231,7 +231,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
 static __poll_t
 p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)
 {
-	__poll_t ret, n;
+	__poll_t ret;
 	struct p9_trans_fd *ts = NULL;
 
 	if (client && client->status == Connected)
@@ -243,19 +243,9 @@ p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt, int *err)
 		return EPOLLERR;
 	}
 
-	if (!ts->rd->f_op->poll)
-		ret = DEFAULT_POLLMASK;
-	else
-		ret = ts->rd->f_op->poll(ts->rd, pt);
-
-	if (ts->rd != ts->wr) {
-		if (!ts->wr->f_op->poll)
-			n = DEFAULT_POLLMASK;
-		else
-			n = ts->wr->f_op->poll(ts->wr, pt);
-		ret = (ret & ~EPOLLOUT) | (n & ~EPOLLIN);
-	}
-
+	ret = vfs_poll(ts->rd, pt);
+	if (ts->rd != ts->wr)
+		ret = (ret & ~EPOLLOUT) | (vfs_poll(ts->wr, pt) & ~EPOLLIN);
 	return ret;
 }
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 6e865e8b5b10..90d30fbe95ae 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -397,7 +397,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	 * Check if there was an event already pending on the eventfd
 	 * before we registered, and trigger it as if we didn't miss it.
 	 */
-	events = f.file->f_op->poll(f.file, &irqfd->pt);
+	events = vfs_poll(f.file, &irqfd->pt);
 
 	if (events & EPOLLIN)
 		schedule_work(&irqfd->inject);
-- 
2.17.0

^ permalink raw reply related

* [PATCH 05/32] fs: introduce new ->get_poll_head and ->poll_mask methods
From: Christoph Hellwig @ 2018-05-11 11:07 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180511110803.10910-1-hch@lst.de>

->get_poll_head returns the waitqueue that the poll operation is going
to sleep on.  Note that this means we can only use a single waitqueue
for the poll, unlike some current drivers that use two waitqueues for
different events.  But now that we have keyed wakeups and heavily use
those for poll there aren't that many good reason left to keep the
multiple waitqueues, and if there are any ->poll is still around, the
driver just won't support aio poll.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 Documentation/filesystems/Locking |  7 ++++++-
 Documentation/filesystems/vfs.txt | 13 +++++++++++++
 fs/select.c                       | 23 +++++++++++++++++++++++
 include/linux/fs.h                |  2 ++
 include/linux/poll.h              | 12 ++++++------
 5 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 220bba28f72b..6d227f9d7bd9 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -440,6 +440,8 @@ prototypes:
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
+	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -470,7 +472,7 @@ prototypes:
 };
 
 locking rules:
-	All may block.
+	All except for ->poll_mask may block.
 
 ->llseek() locking has moved from llseek to the individual llseek
 implementations.  If your fs is not using generic_file_llseek, you
@@ -498,6 +500,9 @@ in sys_read() and friends.
 the lease within the individual filesystem to record the result of the
 operation
 
+->poll_mask can be called with or without the waitqueue lock for the waitqueue
+returned from ->get_poll_head.
+
 --------------------------- dquot_operations -------------------------------
 prototypes:
 	int (*write_dquot) (struct dquot *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index f608180ad59d..829a7b7857a4 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -857,6 +857,8 @@ struct file_operations {
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
+	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
@@ -901,6 +903,17 @@ otherwise noted.
 	activity on this file and (optionally) go to sleep until there
 	is activity. Called by the select(2) and poll(2) system calls
 
+  get_poll_head: Returns the struct wait_queue_head that callers can
+  wait on.  Callers need to check the returned events using ->poll_mask
+  once woken.  Can return NULL to indicate polling is not supported,
+  or any error code using the ERR_PTR convention to indicate that a
+  grave error occured and ->poll_mask shall not be called.
+
+  poll_mask: return the mask of EPOLL* values describing the file descriptor
+  state.  Called either before going to sleep on the waitqueue returned by
+  get_poll_head, or after it has been woken.  If ->get_poll_head and
+  ->poll_mask are implemented ->poll does not need to be implement.
+
   unlocked_ioctl: called by the ioctl(2) system call.
 
   compat_ioctl: called by the ioctl(2) system call when 32 bit system calls
diff --git a/fs/select.c b/fs/select.c
index e30def680b2e..bc3cc0f98896 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -34,6 +34,29 @@
 
 #include <linux/uaccess.h>
 
+__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
+{
+	if (file->f_op->poll) {
+		return file->f_op->poll(file, pt);
+	} else if (file_has_poll_mask(file)) {
+		unsigned int events = poll_requested_events(pt);
+		struct wait_queue_head *head;
+
+		if (pt && pt->_qproc) {
+			head = file->f_op->get_poll_head(file, events);
+			if (!head)
+				return DEFAULT_POLLMASK;
+			if (IS_ERR(head))
+				return EPOLLERR;
+			pt->_qproc(file, head, pt);
+		}
+
+		return file->f_op->poll_mask(file, events);
+	} else {
+		return DEFAULT_POLLMASK;
+	}
+}
+EXPORT_SYMBOL_GPL(vfs_poll);
 
 /*
  * Estimate expected accuracy in ns from a timeval.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 760d8da1b6c7..4b6045ebb2f2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1711,6 +1711,8 @@ struct file_operations {
 	int (*iterate) (struct file *, struct dir_context *);
 	int (*iterate_shared) (struct file *, struct dir_context *);
 	__poll_t (*poll) (struct file *, struct poll_table_struct *);
+	struct wait_queue_head * (*get_poll_head)(struct file *, __poll_t);
+	__poll_t (*poll_mask) (struct file *, __poll_t);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
 	int (*mmap) (struct file *, struct vm_area_struct *);
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 7e0fdcf905d2..fdf86b4cbc71 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -74,18 +74,18 @@ static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 	pt->_key   = ~(__poll_t)0; /* all events enabled */
 }
 
-static inline bool file_can_poll(struct file *file)
+static inline bool file_has_poll_mask(struct file *file)
 {
-	return file->f_op->poll;
+	return file->f_op->get_poll_head && file->f_op->poll_mask;
 }
 
-static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
+static inline bool file_can_poll(struct file *file)
 {
-	if (unlikely(!file->f_op->poll))
-		return DEFAULT_POLLMASK;
-	return file->f_op->poll(file, pt);
+	return file->f_op->poll || file_has_poll_mask(file);
 }
 
+__poll_t vfs_poll(struct file *file, struct poll_table_struct *pt);
+
 struct poll_table_entry {
 	struct file *filp;
 	__poll_t key;
-- 
2.17.0

^ permalink raw reply related

* [PATCH 06/32] aio: simplify KIOCB_KEY handling
From: Christoph Hellwig @ 2018-05-11 11:07 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180511110803.10910-1-hch@lst.de>

No need to pass the key field to lookup_iocb to compare it with KIOCB_KEY,
as we can do that right after retrieving it from userspace.  Also move the
KIOCB_KEY definition to aio.c as it is an internal value not used by any
other place in the kernel.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/aio.c            | 14 +++++++-------
 include/linux/aio.h |  2 --
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index f3eae5d5771b..2e0bc04f7920 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -46,6 +46,8 @@
 
 #include "internal.h"
 
+#define KIOCB_KEY		0
+
 #define AIO_RING_MAGIC			0xa10a10a1
 #define AIO_RING_COMPAT_FEATURES	1
 #define AIO_RING_INCOMPAT_FEATURES	0
@@ -1812,15 +1814,12 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
  *	Finds a given iocb for cancellation.
  */
 static struct aio_kiocb *
-lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key)
+lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb)
 {
 	struct aio_kiocb *kiocb;
 
 	assert_spin_locked(&ctx->ctx_lock);
 
-	if (key != KIOCB_KEY)
-		return NULL;
-
 	/* TODO: use a hash or array, this sucks. */
 	list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
 		if (kiocb->ki_user_iocb == iocb)
@@ -1847,9 +1846,10 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	u32 key;
 	int ret;
 
-	ret = get_user(key, &iocb->aio_key);
-	if (unlikely(ret))
+	if (unlikely(get_user(key, &iocb->aio_key)))
 		return -EFAULT;
+	if (unlikely(key != KIOCB_KEY))
+		return -EINVAL;
 
 	ctx = lookup_ioctx(ctx_id);
 	if (unlikely(!ctx))
@@ -1857,7 +1857,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 
 	spin_lock_irq(&ctx->ctx_lock);
 
-	kiocb = lookup_kiocb(ctx, iocb, key);
+	kiocb = lookup_kiocb(ctx, iocb);
 	if (kiocb)
 		ret = kiocb_cancel(kiocb);
 	else
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 9d8aabecfe2d..b83e68dd006f 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -8,8 +8,6 @@ struct kioctx;
 struct kiocb;
 struct mm_struct;
 
-#define KIOCB_KEY		0
-
 typedef int (kiocb_cancel_fn)(struct kiocb *);
 
 /* prototypes */
-- 
2.17.0

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related

* [PATCH 07/32] aio: simplify cancellation
From: Christoph Hellwig @ 2018-05-11 11:07 UTC (permalink / raw)
  To: viro; +Cc: Avi Kivity, linux-aio, linux-fsdevel, netdev, linux-api,
	linux-kernel
In-Reply-To: <20180511110803.10910-1-hch@lst.de>

With the current aio code there is no need for the magic KIOCB_CANCELLED
value, as a cancelation just kicks the driver to queue the completion
ASAP, with all actual completion handling done in another thread. Given
that both the completion path and cancelation take the context lock there
is no need for magic cmpxchg loops either.  If we remove iocbs from the
active list before calling ->ki_cancel we can also rely on the invariant
thay anything found on the list has a ->ki_cancel callback and can be
cancelled, further simplifing the code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/aio.c | 49 ++++++-------------------------------------------
 1 file changed, 6 insertions(+), 43 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 2e0bc04f7920..8991baa38d5d 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -164,19 +164,6 @@ struct fsync_iocb {
 	bool			datasync;
 };
 
-/*
- * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
- * cancelled or completed (this makes a certain amount of sense because
- * successful cancellation - io_cancel() - does deliver the completion to
- * userspace).
- *
- * And since most things don't implement kiocb cancellation and we'd really like
- * kiocb completion to be lockless when possible, we use ki_cancel to
- * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
- * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
- */
-#define KIOCB_CANCELLED		((void *) (~0ULL))
-
 struct aio_kiocb {
 	union {
 		struct kiocb		rw;
@@ -574,27 +561,6 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 
-static int kiocb_cancel(struct aio_kiocb *kiocb)
-{
-	kiocb_cancel_fn *old, *cancel;
-
-	/*
-	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
-	 * actually has a cancel function, hence the cmpxchg()
-	 */
-
-	cancel = READ_ONCE(kiocb->ki_cancel);
-	do {
-		if (!cancel || cancel == KIOCB_CANCELLED)
-			return -EINVAL;
-
-		old = cancel;
-		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
-	} while (cancel != old);
-
-	return cancel(&kiocb->rw);
-}
-
 /*
  * free_ioctx() should be RCU delayed to synchronize against the RCU
  * protected lookup_ioctx() and also needs process context to call
@@ -641,9 +607,8 @@ static void free_ioctx_users(struct percpu_ref *ref)
 	while (!list_empty(&ctx->active_reqs)) {
 		req = list_first_entry(&ctx->active_reqs,
 				       struct aio_kiocb, ki_list);
-
 		list_del_init(&req->ki_list);
-		kiocb_cancel(req);
+		req->ki_cancel(&req->rw);
 	}
 
 	spin_unlock_irq(&ctx->ctx_lock);
@@ -1843,8 +1808,8 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 {
 	struct kioctx *ctx;
 	struct aio_kiocb *kiocb;
+	int ret = -EINVAL;
 	u32 key;
-	int ret;
 
 	if (unlikely(get_user(key, &iocb->aio_key)))
 		return -EFAULT;
@@ -1856,13 +1821,11 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 		return -EINVAL;
 
 	spin_lock_irq(&ctx->ctx_lock);
-
 	kiocb = lookup_kiocb(ctx, iocb);
-	if (kiocb)
-		ret = kiocb_cancel(kiocb);
-	else
-		ret = -EINVAL;
-
+	if (kiocb) {
+		list_del_init(&kiocb->ki_list);
+		ret = kiocb->ki_cancel(&kiocb->rw);
+	}
 	spin_unlock_irq(&ctx->ctx_lock);
 
 	if (!ret) {
-- 
2.17.0

--
To unsubscribe, send a message with 'unsubscribe linux-aio' in
the body to majordomo@kvack.org.  For more info on Linux AIO,
see: http://www.kvack.org/aio/
Don't email: <a href=mailto:"aart@kvack.org">aart@kvack.org</a>

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox