LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [powerpc:fixes-test] BUILD SUCCESS 16d83a540ca4e7f1ebb2b3756869b77451d31414
From: kernel test robot @ 2020-08-27 20:00 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  fixes-test
branch HEAD: 16d83a540ca4e7f1ebb2b3756869b77451d31414  Revert "powerpc/powernv/idle: Replace CPU feature check with PVR check"

elapsed time: 736m

configs tested: 142
configs skipped: 15

The following configs have been built successfully.
More configs may be tested in the coming days.

arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
riscv                    nommu_k210_defconfig
mips                     decstation_defconfig
arm                   milbeaut_m10v_defconfig
powerpc                mpc7448_hpc2_defconfig
sh                              ul2_defconfig
arm                           u8500_defconfig
arm                          badge4_defconfig
m68k                           sun3_defconfig
arm                          ixp4xx_defconfig
m68k                          multi_defconfig
powerpc                         ps3_defconfig
arm                             rpc_defconfig
sh                          sdk7780_defconfig
arm                             ezx_defconfig
mips                            e55_defconfig
arm                          pxa910_defconfig
s390                          debug_defconfig
sh                           se7712_defconfig
arm                           stm32_defconfig
sparc64                          alldefconfig
arm                          pxa168_defconfig
sh                        apsh4ad0a_defconfig
mips                          ath25_defconfig
ia64                             allmodconfig
sh                        sh7763rdp_defconfig
arm                         orion5x_defconfig
powerpc                  mpc885_ads_defconfig
sh                          rsk7203_defconfig
powerpc                     pq2fads_defconfig
mips                           xway_defconfig
m68k                          sun3x_defconfig
sh                            shmin_defconfig
mips                          malta_defconfig
arm                           spitz_defconfig
mips                       rbtx49xx_defconfig
sh                           se7721_defconfig
powerpc                     mpc5200_defconfig
arm                      pxa255-idp_defconfig
mips                           ip22_defconfig
sh                         apsh4a3a_defconfig
microblaze                      mmu_defconfig
microblaze                          defconfig
arc                           tb10x_defconfig
sh                        sh7757lcr_defconfig
mips                      maltaaprp_defconfig
arm                         hackkit_defconfig
arm                          moxart_defconfig
parisc                           alldefconfig
arm                        mvebu_v7_defconfig
arm                       mainstone_defconfig
m68k                             allmodconfig
c6x                         dsk6455_defconfig
powerpc                     powernv_defconfig
mips                          rb532_defconfig
ia64                         bigsur_defconfig
arm                        multi_v5_defconfig
sh                         ecovec24_defconfig
arm                         axm55xx_defconfig
sh                          urquell_defconfig
arm                         assabet_defconfig
h8300                    h8300h-sim_defconfig
h8300                               defconfig
m68k                         apollo_defconfig
sh                             sh03_defconfig
sh                        edosk7760_defconfig
sh                           sh2007_defconfig
sparc                            allyesconfig
sh                         microdev_defconfig
arm                        magician_defconfig
c6x                        evmc6678_defconfig
sparc                               defconfig
arc                    vdk_hs38_smp_defconfig
s390                       zfcpdump_defconfig
x86_64                           allyesconfig
mips                      pic32mzda_defconfig
arm                       netwinder_defconfig
nios2                            alldefconfig
powerpc                    amigaone_defconfig
powerpc                     skiroot_defconfig
powerpc                      tqm8xx_defconfig
arm                       aspeed_g4_defconfig
ia64                                defconfig
ia64                             allyesconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
c6x                              allyesconfig
nds32                               defconfig
nios2                            allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allyesconfig
parisc                           allyesconfig
s390                                defconfig
i386                             allyesconfig
i386                                defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                             defconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
x86_64               randconfig-a005-20200827
x86_64               randconfig-a006-20200827
x86_64               randconfig-a004-20200827
x86_64               randconfig-a003-20200827
x86_64               randconfig-a002-20200827
x86_64               randconfig-a001-20200827
i386                 randconfig-a002-20200827
i386                 randconfig-a004-20200827
i386                 randconfig-a003-20200827
i386                 randconfig-a005-20200827
i386                 randconfig-a006-20200827
i386                 randconfig-a001-20200827
i386                 randconfig-a016-20200827
i386                 randconfig-a015-20200827
i386                 randconfig-a013-20200827
i386                 randconfig-a012-20200827
i386                 randconfig-a011-20200827
i386                 randconfig-a014-20200827
riscv                            allyesconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                            allmodconfig
x86_64                                   rhel
x86_64                    rhel-7.6-kselftests
x86_64                              defconfig
x86_64                               rhel-8.3
x86_64                                  kexec

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* [powerpc:merge] BUILD SUCCESS 2281d5a219ba2dca30e0e74e49b811ed8bfc6a81
From: kernel test robot @ 2020-08-27 20:00 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  merge
branch HEAD: 2281d5a219ba2dca30e0e74e49b811ed8bfc6a81  Automatic merge of 'master', 'next' and 'fixes' (2020-08-27 17:47)

elapsed time: 722m

configs tested: 115
configs skipped: 14

The following configs have been built successfully.
More configs may be tested in the coming days.

arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
riscv                    nommu_k210_defconfig
mips                     decstation_defconfig
arm                   milbeaut_m10v_defconfig
powerpc                mpc7448_hpc2_defconfig
sh                              ul2_defconfig
powerpc                         ps3_defconfig
arm                             rpc_defconfig
sh                            shmin_defconfig
arm                         s3c2410_defconfig
m68k                        m5272c3_defconfig
sh                            migor_defconfig
arm                        multi_v7_defconfig
sh                          sdk7780_defconfig
arm                             ezx_defconfig
mips                            e55_defconfig
arm                          pxa910_defconfig
sh                        apsh4ad0a_defconfig
mips                          ath25_defconfig
mips                         mpc30x_defconfig
powerpc                      ppc40x_defconfig
parisc                           allyesconfig
powerpc                     mpc5200_defconfig
arm                      pxa255-idp_defconfig
mips                           ip22_defconfig
arc                           tb10x_defconfig
sparc64                          alldefconfig
sh                        sh7757lcr_defconfig
mips                      maltaaprp_defconfig
arm                         hackkit_defconfig
arm                          moxart_defconfig
parisc                           alldefconfig
arm                        mvebu_v7_defconfig
nios2                         3c120_defconfig
arm                           h3600_defconfig
sparc                       sparc32_defconfig
arm                       mainstone_defconfig
m68k                             allmodconfig
c6x                         dsk6455_defconfig
s390                       zfcpdump_defconfig
x86_64                           allyesconfig
mips                      pic32mzda_defconfig
sh                        sh7763rdp_defconfig
arm                       netwinder_defconfig
arm                         at91_dt_defconfig
sh                          kfr2r09_defconfig
arm                          pcm027_defconfig
powerpc                      ppc6xx_defconfig
nios2                            alldefconfig
powerpc                    amigaone_defconfig
powerpc                     skiroot_defconfig
powerpc                      tqm8xx_defconfig
arm                       aspeed_g4_defconfig
ia64                             allmodconfig
ia64                                defconfig
ia64                             allyesconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
c6x                              allyesconfig
nds32                               defconfig
nios2                            allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allyesconfig
s390                                defconfig
i386                             allyesconfig
sparc                            allyesconfig
sparc                               defconfig
i386                                defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                             defconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
x86_64               randconfig-a003-20200827
x86_64               randconfig-a002-20200827
x86_64               randconfig-a001-20200827
x86_64               randconfig-a005-20200827
x86_64               randconfig-a006-20200827
x86_64               randconfig-a004-20200827
i386                 randconfig-a002-20200827
i386                 randconfig-a004-20200827
i386                 randconfig-a003-20200827
i386                 randconfig-a005-20200827
i386                 randconfig-a006-20200827
i386                 randconfig-a001-20200827
i386                 randconfig-a013-20200827
i386                 randconfig-a012-20200827
i386                 randconfig-a011-20200827
i386                 randconfig-a016-20200827
i386                 randconfig-a015-20200827
i386                 randconfig-a014-20200827
riscv                            allyesconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                            allmodconfig
x86_64                                   rhel
x86_64                    rhel-7.6-kselftests
x86_64                              defconfig
x86_64                               rhel-8.3
x86_64                                  kexec

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* [powerpc:next-test] BUILD SUCCESS adf3447eddc59967b18faf51dff97c07a5a8220b
From: kernel test robot @ 2020-08-27 20:00 UTC (permalink / raw)
  To: Michael Ellerman; +Cc: linuxppc-dev

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  next-test
branch HEAD: adf3447eddc59967b18faf51dff97c07a5a8220b  powerpc: Update documentation of ISA versions for Power10

elapsed time: 735m

configs tested: 118
configs skipped: 13

The following configs have been built successfully.
More configs may be tested in the coming days.

arm                                 defconfig
arm64                            allyesconfig
arm64                               defconfig
arm                              allyesconfig
arm                              allmodconfig
riscv                    nommu_k210_defconfig
mips                     decstation_defconfig
arm                   milbeaut_m10v_defconfig
powerpc                mpc7448_hpc2_defconfig
sh                              ul2_defconfig
mips                           ip28_defconfig
arm                        clps711x_defconfig
m68k                       m5208evb_defconfig
arm                         vf610m4_defconfig
m68k                            mac_defconfig
powerpc                         ps3_defconfig
arm                             rpc_defconfig
sh                            shmin_defconfig
arm                         s3c2410_defconfig
m68k                        m5272c3_defconfig
sh                            migor_defconfig
arm                        multi_v7_defconfig
mips                         mpc30x_defconfig
powerpc                      ppc40x_defconfig
parisc                           allyesconfig
powerpc                     mpc5200_defconfig
arm                      pxa255-idp_defconfig
mips                           ip22_defconfig
arc                           tb10x_defconfig
sparc64                          alldefconfig
sh                        sh7757lcr_defconfig
mips                      maltaaprp_defconfig
arm                         hackkit_defconfig
arm                          moxart_defconfig
parisc                           alldefconfig
arm                        mvebu_v7_defconfig
nios2                         3c120_defconfig
arm                           h3600_defconfig
sparc                       sparc32_defconfig
arm                       mainstone_defconfig
m68k                             allmodconfig
c6x                         dsk6455_defconfig
m68k                         apollo_defconfig
sh                             sh03_defconfig
sh                        edosk7760_defconfig
sh                           sh2007_defconfig
s390                       zfcpdump_defconfig
x86_64                           allyesconfig
mips                      pic32mzda_defconfig
sh                        sh7763rdp_defconfig
arm                       netwinder_defconfig
arm                         at91_dt_defconfig
sh                          kfr2r09_defconfig
arm                          pcm027_defconfig
powerpc                      ppc6xx_defconfig
nios2                            alldefconfig
powerpc                    amigaone_defconfig
powerpc                     skiroot_defconfig
powerpc                      tqm8xx_defconfig
arm                       aspeed_g4_defconfig
ia64                             allmodconfig
ia64                                defconfig
ia64                             allyesconfig
m68k                                defconfig
m68k                             allyesconfig
nios2                               defconfig
arc                              allyesconfig
nds32                             allnoconfig
c6x                              allyesconfig
nds32                               defconfig
nios2                            allyesconfig
csky                                defconfig
alpha                               defconfig
alpha                            allyesconfig
xtensa                           allyesconfig
h8300                            allyesconfig
arc                                 defconfig
sh                               allmodconfig
parisc                              defconfig
s390                             allyesconfig
s390                                defconfig
i386                             allyesconfig
sparc                            allyesconfig
sparc                               defconfig
i386                                defconfig
mips                             allyesconfig
mips                             allmodconfig
powerpc                             defconfig
powerpc                          allyesconfig
powerpc                          allmodconfig
powerpc                           allnoconfig
x86_64               randconfig-a003-20200827
x86_64               randconfig-a002-20200827
x86_64               randconfig-a001-20200827
x86_64               randconfig-a005-20200827
x86_64               randconfig-a006-20200827
x86_64               randconfig-a004-20200827
i386                 randconfig-a002-20200827
i386                 randconfig-a004-20200827
i386                 randconfig-a003-20200827
i386                 randconfig-a005-20200827
i386                 randconfig-a006-20200827
i386                 randconfig-a001-20200827
i386                 randconfig-a013-20200827
i386                 randconfig-a012-20200827
i386                 randconfig-a011-20200827
i386                 randconfig-a016-20200827
i386                 randconfig-a015-20200827
i386                 randconfig-a014-20200827
riscv                            allyesconfig
riscv                             allnoconfig
riscv                               defconfig
riscv                            allmodconfig
x86_64                                   rhel
x86_64                    rhel-7.6-kselftests
x86_64                              defconfig
x86_64                               rhel-8.3
x86_64                                  kexec

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

^ permalink raw reply

* Re: [PATCH v8 2/8] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()
From: Dmitry Safonov @ 2020-08-27 20:34 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: nathanl, linux-arch, Arnd Bergmann, open list, Will Deacon,
	Paul Mackerras, Andy Lutomirski, Thomas Gleixner,
	Vincenzo Frascino, linuxppc-dev
In-Reply-To: <87imd5h5kb.fsf@mpe.ellerman.id.au>

Hello,

On Wed, 26 Aug 2020 at 15:39, Michael Ellerman <mpe@ellerman.id.au> wrote:
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
[..]
> > arch_remap() gets replaced by vdso_remap()
> >
> > For arch_unmap(), I'm wondering how/what other architectures do, because
> > powerpc seems to be the only one to erase the vdso context pointer when
> > unmapping the vdso.
>
> Yeah. The original unmap/remap stuff was added for CRIU, which I thought
> people tested on other architectures (more than powerpc even).
>
> Possibly no one really cares about vdso unmap though, vs just moving the
> vdso.
>
> We added a test for vdso unmap recently because it happened to trigger a
> KAUP failure, and someone actually hit it & reported it.

You right, CRIU cares much more about moving vDSO.
It's done for each restoree and as on most setups vDSO is premapped and
used by the application - it's actively tested.
Speaking about vDSO unmap - that's concerning only for heterogeneous C/R,
i.e when an application is migrated from a system that uses vDSO to the one
which doesn't - it's much rare scenario.
(for arm it's !CONFIG_VDSO, for x86 it's `vdso=0` boot parameter)

Looking at the code, it seems quite easy to provide/maintain .close() for
vm_special_mapping. A bit harder to add a test from CRIU side
(as glibc won't know on restore that it can't use vdso anymore),
but totally not impossible.

> Running that test on arm64 segfaults:
>
>   # ./sigreturn_vdso
>   VDSO is at 0xffff8191f000-0xffff8191ffff (4096 bytes)
>   Signal delivered OK with VDSO mapped
>   VDSO moved to 0xffff8191a000-0xffff8191afff (4096 bytes)
>   Signal delivered OK with VDSO moved
>   Unmapped VDSO
>   Remapped the stack executable
>   [   48.556191] potentially unexpected fatal signal 11.
>   [   48.556752] CPU: 0 PID: 140 Comm: sigreturn_vdso Not tainted 5.9.0-rc2-00057-g2ac69819ba9e #190
>   [   48.556990] Hardware name: linux,dummy-virt (DT)
>   [   48.557336] pstate: 60001000 (nZCv daif -PAN -UAO BTYPE=--)
>   [   48.557475] pc : 0000ffff8191a7bc
>   [   48.557603] lr : 0000ffff8191a7bc
>   [   48.557697] sp : 0000ffffc13c9e90
>   [   48.557873] x29: 0000ffffc13cb0e0 x28: 0000000000000000
>   [   48.558201] x27: 0000000000000000 x26: 0000000000000000
>   [   48.558337] x25: 0000000000000000 x24: 0000000000000000
>   [   48.558754] x23: 0000000000000000 x22: 0000000000000000
>   [   48.558893] x21: 00000000004009b0 x20: 0000000000000000
>   [   48.559046] x19: 0000000000400ff0 x18: 0000000000000000
>   [   48.559180] x17: 0000ffff817da300 x16: 0000000000412010
>   [   48.559312] x15: 0000000000000000 x14: 000000000000001c
>   [   48.559443] x13: 656c626174756365 x12: 7865206b63617473
>   [   48.559625] x11: 0000000000000003 x10: 0101010101010101
>   [   48.559828] x9 : 0000ffff818afda8 x8 : 0000000000000081
>   [   48.559973] x7 : 6174732065687420 x6 : 64657070616d6552
>   [   48.560115] x5 : 000000000e0388bd x4 : 000000000040135d
>   [   48.560270] x3 : 0000000000000000 x2 : 0000000000000001
>   [   48.560412] x1 : 0000000000000003 x0 : 00000000004120b8
>   Segmentation fault
>   #
>
> So I think we need to keep the unmap hook. Maybe it should be handled by
> the special_mapping stuff generically.

I'll cook a patch for vm_special_mapping if you don't mind :-)

Thanks,
             Dmitry

^ permalink raw reply

* Re: [PATCH v1 05/10] powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper
From: Leonardo Bras @ 2020-08-27 21:23 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Christophe Leroy, Joel Stanley,
	Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <e4ea6264-578d-f8ea-a0c4-3b279b5c0411@ozlabs.ru>

On Mon, 2020-08-24 at 10:38 +1000, Alexey Kardashevskiy wrote:
> 
> On 18/08/2020 09:40, Leonardo Bras wrote:
> > Creates a helper to allow allocating a new iommu_table without the need
> > to reallocate the iommu_group.
> > 
> > This will be helpful for replacing the iommu_table for the new DMA window,
> > after we remove the old one with iommu_tce_table_put().
> > 
> > Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> > ---
> >  arch/powerpc/platforms/pseries/iommu.c | 25 ++++++++++++++-----------
> >  1 file changed, 14 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> > index 8fe23b7dff3a..39617ce0ec83 100644
> > --- a/arch/powerpc/platforms/pseries/iommu.c
> > +++ b/arch/powerpc/platforms/pseries/iommu.c
> > @@ -53,28 +53,31 @@ enum {
> >  	DDW_EXT_QUERY_OUT_SIZE = 2
> >  };
> >  
> > -static struct iommu_table_group *iommu_pseries_alloc_group(int node)
> > +static struct iommu_table *iommu_pseries_alloc_table(int node)
> >  {
> > -	struct iommu_table_group *table_group;
> >  	struct iommu_table *tbl;
> >  
> > -	table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL,
> > -			   node);
> > -	if (!table_group)
> > -		return NULL;
> > -
> >  	tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node);
> >  	if (!tbl)
> > -		goto free_group;
> > +		return NULL;
> >  
> >  	INIT_LIST_HEAD_RCU(&tbl->it_group_list);
> >  	kref_init(&tbl->it_kref);
> > +	return tbl;
> > +}
> >  
> > -	table_group->tables[0] = tbl;
> > +static struct iommu_table_group *iommu_pseries_alloc_group(int node)
> > +{
> > +	struct iommu_table_group *table_group;
> > +
> > +	table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node);
> 
> I'd prefer you did not make unrelated changes (sizeof(struct
> iommu_table_group) -> sizeof(*table_group)) so the diff stays shorter
> and easier to follow. You changed  sizeof(struct iommu_table_group) but
> not sizeof(struct iommu_table) and this confused me enough to spend more
> time than this straight forward change deserves.

Sorry, I will keep this in mind for future patches.
Thank you for the tip!

> 
> Not important in this case though so
> 
> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>

Thank you!



^ permalink raw reply

* Re: fsl_espi errors on v5.7.15
From: Chris Packham @ 2020-08-27 22:07 UTC (permalink / raw)
  To: Nicholas Piggin, benh@kernel.crashing.org, broonie@kernel.org,
	Heiner Kallweit, mpe@ellerman.id.au, paulus@samba.org
  Cc: linuxppc-dev@lists.ozlabs.org, linux-kernel@vger.kernel.org,
	linux-spi@vger.kernel.org
In-Reply-To: <1598510348.1g7wt0s02s.astroid@bobo.none>

On 27/08/20 7:12 pm, Nicholas Piggin wrote:
> Excerpts from Heiner Kallweit's message of August 26, 2020 4:38 pm:
>> On 26.08.2020 08:07, Chris Packham wrote:
>>> On 26/08/20 1:48 pm, Chris Packham wrote:
>>>> On 26/08/20 10:22 am, Chris Packham wrote:
>>>>> On 25/08/20 7:22 pm, Heiner Kallweit wrote:
>>>>>
>>>>> <snip>
>>>>>> I've been staring at spi-fsl-espi.c for while now and I think I've
>>>>>>> identified a couple of deficiencies that may or may not be related
>>>>>>> to my
>>>>>>> issue.
>>>>>>>
>>>>>>> First I think the 'Transfer done but SPIE_DON isn't set' message
>>>>>>> can be
>>>>>>> generated spuriously. In fsl_espi_irq() we read the ESPI_SPIE
>>>>>>> register.
>>>>>>> We also write back to it to clear the current events. We re-read it in
>>>>>>> fsl_espi_cpu_irq() and complain when SPIE_DON is not set. But we can
>>>>>>> naturally end up in that situation if we're doing a large read.
>>>>>>> Consider
>>>>>>> the messages for reading a block of data from a spi-nor chip
>>>>>>>
>>>>>>>     tx = READ_OP + ADDR
>>>>>>>     rx = data
>>>>>>>
>>>>>>> We setup the transfer and pump out the tx_buf. The first interrupt
>>>>>>> goes
>>>>>>> off and ESPI_SPIE has SPIM_DON and SPIM_RXT set. We empty the rx fifo,
>>>>>>> clear ESPI_SPIE and wait for the next interrupt. The next interrupt
>>>>>>> fires and this time we have ESPI_SPIE with just SPIM_RXT set. This
>>>>>>> continues until we've received all the data and we finish with
>>>>>>> ESPI_SPIE
>>>>>>> having only SPIM_RXT set. When we re-read it we complain that SPIE_DON
>>>>>>> isn't set.
>>>>>>>
>>>>>>> The other deficiency is that we only get an interrupt when the
>>>>>>> amount of
>>>>>>> data in the rx fifo is above FSL_ESPI_RXTHR. If there are fewer than
>>>>>>> FSL_ESPI_RXTHR left to be received we will never pull them out of
>>>>>>> the fifo.
>>>>>>>
>>>>>> SPIM_DON will trigger an interrupt once the last characters have been
>>>>>> transferred, and read the remaining characters from the FIFO.
>>>>> The T2080RM that I have says the following about the DON bit
>>>>>
>>>>> "Last character was transmitted. The last character was transmitted
>>>>> and a new command can be written for the next frame."
>>>>>
>>>>> That does at least seem to fit with my assertion that it's all about
>>>>> the TX direction. But the fact that it doesn't happen all the time
>>>>> throws some doubt on it.
>>>>>
>>>>>> I think the reason I'm seeing some variability is because of how fast
>>>>>>> (or slow) the interrupts get processed and how fast the spi-nor
>>>>>>> chip can
>>>>>>> fill the CPUs rx fifo.
>>>>>>>
>>>>>> To rule out timing issues at high bus frequencies I initially asked
>>>>>> for re-testing at lower frequencies. If you e.g. limit the bus to 1 MHz
>>>>>> or even less, then timing shouldn't be an issue.
>>>>> Yes I've currently got spi-max-frequency = <1000000>; in my dts. I
>>>>> would also expect a slower frequency would fit my "DON is for TX"
>>>>> narrative.
>>>>>> Last relevant functional changes have been done almost 4 years ago.
>>>>>> And yours is the first such report I see. So question is what could
>>>>>> be so
>>>>>> special with your setup that it seems you're the only one being
>>>>>> affected.
>>>>>> The scenarios you describe are standard, therefore much more people
>>>>>> should be affected in case of a driver bug.
>>>>> Agreed. But even on my hardware (which may have a latent issue
>>>>> despite being in the field for going on 5 years) the issue only
>>>>> triggers under some fairly specific circumstances.
>>>>>> You said that kernel config impacts how frequently the issue happens.
>>>>>> Therefore question is what's the diff in kernel config, and how could
>>>>>> the differences be related to SPI.
>>>>> It did seem to be somewhat random. Things like CONFIG_PREEMPT have an
>>>>> impact but every time I found something that seemed to be having an
>>>>> impact I've been able to disprove it. I actually think its about how
>>>>> busy the system is which may or may not affect when we get round to
>>>>> processing the interrupts.
>>>>>
>>>>> I have managed to get the 'Transfer done but SPIE_DON isn't set!' to
>>>>> occur on the T2080RDB.
>>>>>
>>>>> I've had to add the following to expose the environment as a mtd
>>>>> partition
>>>>>
>>>>> diff --git a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi
>>>>> b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi
>>>>> index ff87e67c70da..fbf95fc1fd68 100644
>>>>> --- a/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi
>>>>> +++ b/arch/powerpc/boot/dts/fsl/t208xrdb.dtsi
>>>>> @@ -116,6 +116,15 @@ flash@0 {
>>>>>                                  compatible = "micron,n25q512ax3",
>>>>> "jedec,spi-nor";
>>>>>                                  reg = <0>;
>>>>>                                  spi-max-frequency = <10000000>; /*
>>>>> input clock */
>>>>> +
>>>>> +                               partition@u-boot {
>>>>> +                                        reg = <0x00000000 0x00100000>;
>>>>> +                                        label = "u-boot";
>>>>> +                                };
>>>>> +                                partition@u-boot-env {
>>>>> +                                        reg = <0x00100000 0x00010000>;
>>>>> +                                        label = "u-boot-env";
>>>>> +                                };
>>>>>                          };
>>>>>                  };
>>>>>
>>>>> And I'm using the following script to poke at the environment
>>>>> (warning if anyone does try this and the bug hits it can render your
>>>>> u-boot environment invalid).
>>>>>
>>>>> cat flash/fw_env_test.sh
>>>>> #!/bin/sh
>>>>>
>>>>> generate_fw_env_config()
>>>>> {
>>>>>    cat /proc/mtd | sed 's/[:"]//g' | while read dev size erasesize
>>>>> name ; do
>>>>>       echo "$dev $size $erasesize $name"
>>>>>       [ "$name" = "u-boot-env" ] && echo "/dev/$dev 0x0000 0x2000
>>>>> $erasesize" >/flash/fw_env.config
>>>>>    done
>>>>> }
>>>>>
>>>>> cycles=10
>>>>> [ $# -ge 1 ] && cycles=$1
>>>>>
>>>>> generate_fw_env_config
>>>>>
>>>>> fw_printenv -c /flash/fw_env.config
>>>>>
>>>>> dmesg -c >/dev/null
>>>>> x=0
>>>>> while [ $x -lt $cycles ]; do
>>>>>      fw_printenv -c /flash/fw_env.config >/dev/null || break
>>>>>      fw_setenv -c /flash/fw_env.config foo $RANDOM || break;
>>>>>      dmesg -c | grep -q fsl_espi && break;
>>>>>      let x=x+1
>>>>> done
>>>>>
>>>>> echo "Ran $x cycles"
>>>> I've also now seen the RX FIFO not empty error on the T2080RDB
>>>>
>>>> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
>>>> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
>>>> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
>>>> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
>>>> fsl_espi ffe110000.spi: Transfer done but rx/tx fifo's aren't empty!
>>>> fsl_espi ffe110000.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32
>>>>
>>>> With my current workaround of emptying the RX FIFO. It seems
>>>> survivable. Interestingly it only ever seems to be 1 extra byte in the
>>>> RX FIFO and it seems to be after either a READ_SR or a READ_FSR.
>>>>
>>>> fsl_espi ffe110000.spi: tx 70
>>>> fsl_espi ffe110000.spi: rx 03
>>>> fsl_espi ffe110000.spi: Extra RX 00
>>>> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
>>>> fsl_espi ffe110000.spi: Transfer done but rx/tx fifo's aren't empty!
>>>> fsl_espi ffe110000.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32
>>>> fsl_espi ffe110000.spi: tx 05
>>>> fsl_espi ffe110000.spi: rx 00
>>>> fsl_espi ffe110000.spi: Extra RX 03
>>>> fsl_espi ffe110000.spi: Transfer done but SPIE_DON isn't set!
>>>> fsl_espi ffe110000.spi: Transfer done but rx/tx fifo's aren't empty!
>>>> fsl_espi ffe110000.spi: SPIE_RXCNT = 1, SPIE_TXCNT = 32
>>>> fsl_espi ffe110000.spi: tx 05
>>>> fsl_espi ffe110000.spi: rx 00
>>>> fsl_espi ffe110000.spi: Extra RX 03
>>>>
>>>>  From all the Micron SPI-NOR datasheets I've got access to it is
>>>> possible to continually read the SR/FSR. But I've no idea why it
>>>> happens some times and not others.
>>> So I think I've got a reproduction and I think I've bisected the problem
>>> to commit 3282a3da25bd ("powerpc/64: Implement soft interrupt replay in
>>> C"). My day is just finishing now so I haven't applied too much scrutiny
>>> to this result. Given the various rabbit holes I've been down on this
>>> issue already I'd take this information with a good degree of skepticism.
>>>
>> OK, so an easy test should be to re-test with a 5.4 kernel.
>> It doesn't have yet the change you're referring to, and the fsl-espi driver
>> is basically the same as in 5.7 (just two small changes in 5.7).
> There's 6cc0c16d82f88 and maybe also other interrupt related patches
> around this time that could affect book E, so it's good if that exact
> patch is confirmed.

My confirmation is basically that I can induce the issue in a 5.4 kernel 
by cherry-picking 3282a3da25bd. I'm also able to "fix" the issue in 
5.9-rc2 by reverting that one commit.

I both cases it's not exactly a clean cherry-pick/revert so I also 
confirmed the bisection result by building at 3282a3da25bd (which sees 
the issue) and the commit just before (which does not).

> I've been staring at 3282a3da25bd for a while and nothing immediately
> stands out. It doesn't look like the low level handlers do anything
> special (well 0x900 does ack the decrementer, but so does the masked
> handler).
>
> Can you try this patch and also enable CONFIG_PPC_IRQ_SOFT_MASK_DEBUG?
>
> Thanks,
> Nick
>
> ---
> diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
> index bf21ebd36190..10d339042330 100644
> --- a/arch/powerpc/kernel/irq.c
> +++ b/arch/powerpc/kernel/irq.c
> @@ -214,7 +214,7 @@ void replay_soft_interrupts(void)
>   	struct pt_regs regs;
>   
>   	ppc_save_regs(&regs);
> -	regs.softe = IRQS_ALL_DISABLED;
> +	regs.softe = IRQS_ENABLED;
>   
>   again:
>   	if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
> @@ -349,6 +349,7 @@ notrace void arch_local_irq_restore(unsigned long mask)
>   		if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG))
>   			WARN_ON_ONCE(!(mfmsr() & MSR_EE));
>   		__hard_irq_disable();
> +		local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
>   	} else {
>   		/*
>   		 * We should already be hard disabled here. We had bugs
> @@ -368,6 +369,7 @@ notrace void arch_local_irq_restore(unsigned long mask)
>   		}
>   	}
>   
> +	preempt_disable();
>   	irq_soft_mask_set(IRQS_ALL_DISABLED);
>   	trace_hardirqs_off();
>   
> @@ -377,6 +379,7 @@ notrace void arch_local_irq_restore(unsigned long mask)
>   	trace_hardirqs_on();
>   	irq_soft_mask_set(IRQS_ENABLED);
>   	__hard_irq_enable();
> +	preempt_enable();
>   }
>   EXPORT_SYMBOL(arch_local_irq_restore);
>   
I still saw the issue with this change applied. PPC_IRQ_SOFT_MASK_DEBUG 
didn't report anything (either with or without the change above).

^ permalink raw reply

* Re: [PATCH v1 06/10] powerpc/pseries/iommu: Add ddw_list_add() helper
From: Leonardo Bras @ 2020-08-27 22:11 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Christophe Leroy, Joel Stanley,
	Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <af4246bb-9357-098e-f167-8f30c6b893f2@ozlabs.ru>

On Mon, 2020-08-24 at 13:46 +1000, Alexey Kardashevskiy wrote:
> >  static int find_existing_ddw_windows(void)
> >  {
> >  	int len;
> > @@ -887,18 +905,11 @@ static int find_existing_ddw_windows(void)
> >  		if (!direct64)
> >  			continue;
> >  
> > -		window = kzalloc(sizeof(*window), GFP_KERNEL);
> > -		if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
> > +		window = ddw_list_add(pdn, direct64);
> > +		if (!window || len < sizeof(*direct64)) {
> 
> Since you are touching this code, it looks like the "len <
> sizeof(*direct64)" part should go above to "if (!direct64)".

Sure, makes sense.
It will be fixed for v2.

> 
> 
> 
> >  			kfree(window);
> >  			remove_ddw(pdn, true);
> > -			continue;
> >  		}
> > -
> > -		window->device = pdn;
> > -		window->prop = direct64;
> > -		spin_lock(&direct_window_list_lock);
> > -		list_add(&window->list, &direct_window_list);
> > -		spin_unlock(&direct_window_list_lock);
> >  	}
> >  
> >  	return 0;
> > @@ -1261,7 +1272,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> >  	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
> >  		  create.liobn, dn);
> >  
> > -	window = kzalloc(sizeof(*window), GFP_KERNEL);
> > +	/* Add new window to existing DDW list */
> 
> The comment seems to duplicate what the ddw_list_add name already suggests.

Ok, I will remove it then.

> > +	window = ddw_list_add(pdn, ddwprop);
> >  	if (!window)
> >  		goto out_clear_window;
> >  
> > @@ -1280,16 +1292,14 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
> >  		goto out_free_window;
> >  	}
> >  
> > -	window->device = pdn;
> > -	window->prop = ddwprop;
> > -	spin_lock(&direct_window_list_lock);
> > -	list_add(&window->list, &direct_window_list);
> > -	spin_unlock(&direct_window_list_lock);
> 
> I'd leave these 3 lines here and in find_existing_ddw_windows() (which
> would make  ddw_list_add -> ddw_prop_alloc). In general you want to have
> less stuff to do on the failure path. kmalloc may fail and needs kfree
> but you can safely delay list_add (which cannot fail) and avoid having
> the lock help twice in the same function (one of them is hidden inside
> ddw_list_add).
> Not sure if this change is really needed after all. Thanks,

I understand this leads to better performance in case anything fails.
Also, I think list_add happening in the end is less error-prone (in
case the list is checked between list_add and a fail).

But what if we put it at the end?
What is the chance of a kzalloc of 4 pointers (struct direct_window)
failing after walk_system_ram_range?  

Is it not worthy doing that for making enable_ddw() easier to
understand?

Best regards,
Leonardo


^ permalink raw reply

* Re: [PATCH v1 02/10] powerpc/kernel/iommu: Align size for IOMMU_PAGE_SIZE on iommu_*_coherent()
From: Alexey Kardashevskiy @ 2020-08-28  1:40 UTC (permalink / raw)
  To: Leonardo Bras, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Christophe Leroy, Joel Stanley,
	Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <c67c66e466ad27d15aa2b970c48d2336d95b2971.camel@gmail.com>



On 28/08/2020 02:51, Leonardo Bras wrote:
> On Sat, 2020-08-22 at 20:07 +1000, Alexey Kardashevskiy wrote:
>>
>> On 18/08/2020 09:40, Leonardo Bras wrote:
>>> Both iommu_alloc_coherent() and iommu_free_coherent() assume that once
>>> size is aligned to PAGE_SIZE it will be aligned to IOMMU_PAGE_SIZE.
>>
>> The only case when it is not aligned is when IOMMU_PAGE_SIZE > PAGE_SIZE
>> which is unlikely but not impossible, we could configure the kernel for
>> 4K system pages and 64K IOMMU pages I suppose. Do we really want to do
>> this here, or simply put WARN_ON(tbl->it_page_shift > PAGE_SHIFT)?
> 
> I think it would be better to keep the code as much generic as possible
> regarding page sizes. 

Then you need to test it. Does 4K guest even boot (it should but I would
not bet much on it)?

> 
>> Because if we want the former (==support), then we'll have to align the
>> size up to the bigger page size when allocating/zeroing system pages,
>> etc. 
> 
> This part I don't understand. Why do we need to align everything to the
> bigger pagesize? 
> 
> I mean, is not that enough that the range [ret, ret + size[ is both
> allocated by mm and mapped on a iommu range?
> 
> Suppose a iommu_alloc_coherent() of 16kB on PAGESIZE = 4k and
> IOMMU_PAGE_SIZE() == 64k.
> Why 4 * cpu_pages mapped by a 64k IOMMU page is not enough? 
> All the space the user asked for is allocated and mapped for DMA.


The user asked to map 16K, the rest - 48K - is used for something else
(may be even mapped to another device) but you are making all 64K
accessible by the device which only should be able to access 16K.

In practice, if this happens, H_PUT_TCE will simply fail.


> 
>> Bigger pages are not the case here as I understand it.
> 
> I did not get this part, what do you mean?


Possible IOMMU page sizes are 4K, 64K, 2M, 16M, 256M, 1GB, and the
supported set of sizes is different for P8/P9 and type of IO (PHB,
NVLink/CAPI).


> 
>>> Update those functions to guarantee alignment with requested size
>>> using IOMMU_PAGE_ALIGN() before doing iommu_alloc() / iommu_free().
>>>
>>> Also, on iommu_range_alloc(), replace ALIGN(n, 1 << tbl->it_page_shift)
>>> with IOMMU_PAGE_ALIGN(n, tbl), which seems easier to read.
>>>
>>> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
>>> ---
>>>  arch/powerpc/kernel/iommu.c | 17 +++++++++--------
>>>  1 file changed, 9 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>>> index 9704f3f76e63..d7086087830f 100644
>>> --- a/arch/powerpc/kernel/iommu.c
>>> +++ b/arch/powerpc/kernel/iommu.c
>>> @@ -237,10 +237,9 @@ static unsigned long iommu_range_alloc(struct device *dev,
>>>  	}
>>>  
>>>  	if (dev)
>>> -		boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
>>> -				      1 << tbl->it_page_shift);
>>> +		boundary_size = IOMMU_PAGE_ALIGN(dma_get_seg_boundary(dev) + 1, tbl);
>>
>> Run checkpatch.pl, should complain about a long line.
> 
> It's 86 columns long, which is less than the new limit of 100 columns
> Linus announced a few weeks ago. checkpatch.pl was updated too:
> https://www.phoronix.com/scan.php?page=news_item&px=Linux-Kernel-Deprecates-80-Col

Yay finally :) Thanks,


> 
>>
>>
>>>  	else
>>> -		boundary_size = ALIGN(1UL << 32, 1 << tbl->it_page_shift);
>>> +		boundary_size = IOMMU_PAGE_ALIGN(1UL << 32, tbl);
>>>  	/* 4GB boundary for iseries_hv_alloc and iseries_hv_map */
>>>  
>>>  	n = iommu_area_alloc(tbl->it_map, limit, start, npages, tbl->it_offset,
>>> @@ -858,6 +857,7 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
>>>  	unsigned int order;
>>>  	unsigned int nio_pages, io_order;
>>>  	struct page *page;
>>> +	size_t size_io = size;
>>>  
>>>  	size = PAGE_ALIGN(size);
>>>  	order = get_order(size);
>>> @@ -884,8 +884,9 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
>>>  	memset(ret, 0, size);
>>>  
>>>  	/* Set up tces to cover the allocated range */
>>> -	nio_pages = size >> tbl->it_page_shift;
>>> -	io_order = get_iommu_order(size, tbl);
>>> +	size_io = IOMMU_PAGE_ALIGN(size_io, tbl);
>>> +	nio_pages = size_io >> tbl->it_page_shift;
>>> +	io_order = get_iommu_order(size_io, tbl);
>>>  	mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
>>>  			      mask >> tbl->it_page_shift, io_order, 0);
>>>  	if (mapping == DMA_MAPPING_ERROR) {
>>> @@ -900,11 +901,11 @@ void iommu_free_coherent(struct iommu_table *tbl, size_t size,
>>>  			 void *vaddr, dma_addr_t dma_handle)
>>>  {
>>>  	if (tbl) {
>>> -		unsigned int nio_pages;
>>> +		size_t size_io = IOMMU_PAGE_ALIGN(size, tbl);
>>> +		unsigned int nio_pages = size_io >> tbl->it_page_shift;
>>>  
>>> -		size = PAGE_ALIGN(size);
>>> -		nio_pages = size >> tbl->it_page_shift;
>>>  		iommu_free(tbl, dma_handle, nio_pages);
>>> +
>>
>> Unrelated new line.
> 
> Will be removed. Thanks!
> 
>>
>>
>>>  		size = PAGE_ALIGN(size);
>>>  		free_pages((unsigned long)vaddr, get_order(size));
>>>  	}
>>>
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH v1 04/10] powerpc/kernel/iommu: Add new iommu_table_in_use() helper
From: Alexey Kardashevskiy @ 2020-08-28  1:51 UTC (permalink / raw)
  To: Leonardo Bras, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Christophe Leroy, Joel Stanley,
	Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <5f26d433abcde7cd3b4da705742e17ca6c0f0f0b.camel@gmail.com>



On 28/08/2020 04:34, Leonardo Bras wrote:
> On Sat, 2020-08-22 at 20:34 +1000, Alexey Kardashevskiy wrote:
>>> +
>>> +	/*ignore reserved bit0*/
>>
>> s/ignore reserved bit0/ ignore reserved bit0 /  (add spaces)
> 
> Fixed
> 
>>> +	if (tbl->it_offset == 0)
>>> +		p1_start = 1;
>>> +
>>> +	/* Check if reserved memory is valid*/
>>
>> A missing space here.
> 
> Fixed
> 
>>
>>> +	if (tbl->it_reserved_start >= tbl->it_offset &&
>>> +	    tbl->it_reserved_start <= (tbl->it_offset + tbl->it_size) &&
>>> +	    tbl->it_reserved_end   >= tbl->it_offset &&
>>> +	    tbl->it_reserved_end   <= (tbl->it_offset + tbl->it_size)) {
>>
>> Uff. What if tbl->it_reserved_end is bigger than tbl->it_offset +
>> tbl->it_size?
>>
>> The reserved area is to preserve MMIO32 so it is for it_offset==0 only
>> and the boundaries are checked in the only callsite, and it is unlikely
>> to change soon or ever.
>>
>> Rather that bothering with fixing that, may be just add (did not test):
>>
>> if (WARN_ON((
>> (tbl->it_reserved_start || tbl->it_reserved_end) && (it_offset != 0))
>> (tbl->it_reserved_start > it_offset && tbl->it_reserved_end < it_offset
>> + it_size) && (it_offset == 0)) )
>>  return true;
>>
>> Or simply always look for it_offset..it_reserved_start and
>> it_reserved_end..it_offset+it_size and if there is no reserved area,
>> initialize it_reserved_start=it_reserved_end=it_offset so the first
>> it_offset..it_reserved_start becomes a no-op.
> 
> The problem here is that the values of it_reserved_{start,end} are not
> necessarily valid. I mean, on iommu_table_reserve_pages() the values
> are stored however they are given (bit reserving is done only if they
> are valid). 
> 
> Having a it_reserved_{start,end} value outside the valid ranges would
> cause find_next_bit() to run over memory outside the bitmap.
> Even if the those values are < tbl->it_offset, the resulting
> subtraction on unsigned would cause it to become a big value and run
> over memory outside the bitmap.
> 
> But I think you are right. That is not the place to check if the
> reserved values are valid. It should just trust them here.
> I intent to change iommu_table_reserve_pages() to only store the
> parameters in it_reserved_{start,end} if they are in the range, and or
> it_offset in both of them if they are not.
> 
> What do you think?

This should work, yes.


> 
> Thanks for the feedback!
> Leonardo Bras
> 
> 
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH v1 06/10] powerpc/pseries/iommu: Add ddw_list_add() helper
From: Alexey Kardashevskiy @ 2020-08-28  1:58 UTC (permalink / raw)
  To: Leonardo Bras, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Christophe Leroy, Joel Stanley,
	Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <f80040bf941755469918fb75cf520590a4a5e3db.camel@gmail.com>



On 28/08/2020 08:11, Leonardo Bras wrote:
> On Mon, 2020-08-24 at 13:46 +1000, Alexey Kardashevskiy wrote:
>>>  static int find_existing_ddw_windows(void)
>>>  {
>>>  	int len;
>>> @@ -887,18 +905,11 @@ static int find_existing_ddw_windows(void)
>>>  		if (!direct64)
>>>  			continue;
>>>  
>>> -		window = kzalloc(sizeof(*window), GFP_KERNEL);
>>> -		if (!window || len < sizeof(struct dynamic_dma_window_prop)) {
>>> +		window = ddw_list_add(pdn, direct64);
>>> +		if (!window || len < sizeof(*direct64)) {
>>
>> Since you are touching this code, it looks like the "len <
>> sizeof(*direct64)" part should go above to "if (!direct64)".
> 
> Sure, makes sense.
> It will be fixed for v2.
> 
>>
>>
>>
>>>  			kfree(window);
>>>  			remove_ddw(pdn, true);
>>> -			continue;
>>>  		}
>>> -
>>> -		window->device = pdn;
>>> -		window->prop = direct64;
>>> -		spin_lock(&direct_window_list_lock);
>>> -		list_add(&window->list, &direct_window_list);
>>> -		spin_unlock(&direct_window_list_lock);
>>>  	}
>>>  
>>>  	return 0;
>>> @@ -1261,7 +1272,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>>>  	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",
>>>  		  create.liobn, dn);
>>>  
>>> -	window = kzalloc(sizeof(*window), GFP_KERNEL);
>>> +	/* Add new window to existing DDW list */
>>
>> The comment seems to duplicate what the ddw_list_add name already suggests.
> 
> Ok, I will remove it then.
> 
>>> +	window = ddw_list_add(pdn, ddwprop);
>>>  	if (!window)
>>>  		goto out_clear_window;
>>>  
>>> @@ -1280,16 +1292,14 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>>>  		goto out_free_window;
>>>  	}
>>>  
>>> -	window->device = pdn;
>>> -	window->prop = ddwprop;
>>> -	spin_lock(&direct_window_list_lock);
>>> -	list_add(&window->list, &direct_window_list);
>>> -	spin_unlock(&direct_window_list_lock);
>>
>> I'd leave these 3 lines here and in find_existing_ddw_windows() (which
>> would make  ddw_list_add -> ddw_prop_alloc). In general you want to have
>> less stuff to do on the failure path. kmalloc may fail and needs kfree
>> but you can safely delay list_add (which cannot fail) and avoid having
>> the lock help twice in the same function (one of them is hidden inside
>> ddw_list_add).
>> Not sure if this change is really needed after all. Thanks,
> 
> I understand this leads to better performance in case anything fails.
> Also, I think list_add happening in the end is less error-prone (in
> case the list is checked between list_add and a fail).

Performance was not in my mind at all.

I noticed you remove from a list with a lock help and it was not there
before and there is a bunch on labels on the exit path and started
looking for list_add() and if you do not double remove from the list.


> But what if we put it at the end?
> What is the chance of a kzalloc of 4 pointers (struct direct_window)
> failing after walk_system_ram_range?

This is not about chances really, it is about readability. If let's say
kmalloc failed, you just to the error exit label and simply call kfree()
on that pointer, kfree will do nothing if it is NULL already, simple.
list_del() does not have this simplicity.


> Is it not worthy doing that for making enable_ddw() easier to
> understand?

This is my goal here :)


> 
> Best regards,
> Leonardo
> 

-- 
Alexey

^ permalink raw reply

* [PATCH] powerpc/tools: Remove 90 line limit in checkpatch script
From: Russell Currey @ 2020-08-28  2:05 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: Russell Currey

As of commit bdc48fa11e46, scripts/checkpatch.pl now has a default line
length warning of 100 characters.  The powerpc wrapper script was using
a length of 90 instead of 80 in order to make checkpatch less
restrictive, but now it's making it more restrictive instead.

I think it makes sense to just use the default value now.

Signed-off-by: Russell Currey <ruscur@russell.cc>
---
 arch/powerpc/tools/checkpatch.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/tools/checkpatch.sh b/arch/powerpc/tools/checkpatch.sh
index 3ce5c093b19d..91c04802ec31 100755
--- a/arch/powerpc/tools/checkpatch.sh
+++ b/arch/powerpc/tools/checkpatch.sh
@@ -9,7 +9,6 @@ script_base=$(realpath $(dirname $0))
 exec $script_base/../../../scripts/checkpatch.pl \
 	--subjective \
 	--no-summary \
-	--max-line-length=90 \
 	--show-types \
 	--ignore ARCH_INCLUDE_LINUX \
 	--ignore BIT_MACRO \
-- 
2.28.0


^ permalink raw reply related

* Re: [PATCH v8 2/8] powerpc/vdso: Remove __kernel_datapage_offset and simplify __get_datapage()
From: Michael Ellerman @ 2020-08-28  2:14 UTC (permalink / raw)
  To: Dmitry Safonov
  Cc: nathanl, linux-arch, Arnd Bergmann, open list, Will Deacon,
	Paul Mackerras, Andy Lutomirski, Thomas Gleixner,
	Vincenzo Frascino, linuxppc-dev
In-Reply-To: <CAJwJo6ZANqYkSHbQ+3b+Fi_VT80MtrzEV5yreQAWx-L8j8x2zA@mail.gmail.com>

Dmitry Safonov <0x7f454c46@gmail.com> writes:
> Hello,
>
> On Wed, 26 Aug 2020 at 15:39, Michael Ellerman <mpe@ellerman.id.au> wrote:
>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
> [..]
>> > arch_remap() gets replaced by vdso_remap()
>> >
>> > For arch_unmap(), I'm wondering how/what other architectures do, because
>> > powerpc seems to be the only one to erase the vdso context pointer when
>> > unmapping the vdso.
>>
>> Yeah. The original unmap/remap stuff was added for CRIU, which I thought
>> people tested on other architectures (more than powerpc even).
>>
>> Possibly no one really cares about vdso unmap though, vs just moving the
>> vdso.
>>
>> We added a test for vdso unmap recently because it happened to trigger a
>> KAUP failure, and someone actually hit it & reported it.
>
> You right, CRIU cares much more about moving vDSO.
> It's done for each restoree and as on most setups vDSO is premapped and
> used by the application - it's actively tested.
> Speaking about vDSO unmap - that's concerning only for heterogeneous C/R,
> i.e when an application is migrated from a system that uses vDSO to the one
> which doesn't - it's much rare scenario.
> (for arm it's !CONFIG_VDSO, for x86 it's `vdso=0` boot parameter)

Ah OK that explains it.

The case we hit of VDSO unmapping was some strange "library OS" thing
which had explicitly unmapped the VDSO, so also very rare.

> Looking at the code, it seems quite easy to provide/maintain .close() for
> vm_special_mapping. A bit harder to add a test from CRIU side
> (as glibc won't know on restore that it can't use vdso anymore),
> but totally not impossible.
>
>> Running that test on arm64 segfaults:
>>
>>   # ./sigreturn_vdso
>>   VDSO is at 0xffff8191f000-0xffff8191ffff (4096 bytes)
>>   Signal delivered OK with VDSO mapped
>>   VDSO moved to 0xffff8191a000-0xffff8191afff (4096 bytes)
>>   Signal delivered OK with VDSO moved
>>   Unmapped VDSO
>>   Remapped the stack executable
>>   [   48.556191] potentially unexpected fatal signal 11.
>>   [   48.556752] CPU: 0 PID: 140 Comm: sigreturn_vdso Not tainted 5.9.0-rc2-00057-g2ac69819ba9e #190
>>   [   48.556990] Hardware name: linux,dummy-virt (DT)
>>   [   48.557336] pstate: 60001000 (nZCv daif -PAN -UAO BTYPE=--)
>>   [   48.557475] pc : 0000ffff8191a7bc
>>   [   48.557603] lr : 0000ffff8191a7bc
>>   [   48.557697] sp : 0000ffffc13c9e90
>>   [   48.557873] x29: 0000ffffc13cb0e0 x28: 0000000000000000
>>   [   48.558201] x27: 0000000000000000 x26: 0000000000000000
>>   [   48.558337] x25: 0000000000000000 x24: 0000000000000000
>>   [   48.558754] x23: 0000000000000000 x22: 0000000000000000
>>   [   48.558893] x21: 00000000004009b0 x20: 0000000000000000
>>   [   48.559046] x19: 0000000000400ff0 x18: 0000000000000000
>>   [   48.559180] x17: 0000ffff817da300 x16: 0000000000412010
>>   [   48.559312] x15: 0000000000000000 x14: 000000000000001c
>>   [   48.559443] x13: 656c626174756365 x12: 7865206b63617473
>>   [   48.559625] x11: 0000000000000003 x10: 0101010101010101
>>   [   48.559828] x9 : 0000ffff818afda8 x8 : 0000000000000081
>>   [   48.559973] x7 : 6174732065687420 x6 : 64657070616d6552
>>   [   48.560115] x5 : 000000000e0388bd x4 : 000000000040135d
>>   [   48.560270] x3 : 0000000000000000 x2 : 0000000000000001
>>   [   48.560412] x1 : 0000000000000003 x0 : 00000000004120b8
>>   Segmentation fault
>>   #
>>
>> So I think we need to keep the unmap hook. Maybe it should be handled by
>> the special_mapping stuff generically.
>
> I'll cook a patch for vm_special_mapping if you don't mind :-)

That would be great, thanks!

cheers

^ permalink raw reply

* Re: [PATCH v1 01/10] powerpc/pseries/iommu: Replace hard-coded page shift
From: Alexey Kardashevskiy @ 2020-08-28  2:27 UTC (permalink / raw)
  To: Leonardo Bras, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Christophe Leroy, Joel Stanley,
	Thiago Jung Bauermann, Ram Pai, Brian King,
	Murilo Fossa Vicentini, David Dai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <31e913d842693b6e107cb2b8e51fd45118b1bd2c.camel@gmail.com>



On 28/08/2020 01:32, Leonardo Bras wrote:
> Hello Alexey, thank you for this feedback!
> 
> On Sat, 2020-08-22 at 19:33 +1000, Alexey Kardashevskiy wrote:
>>> +#define TCE_RPN_BITS		52		/* Bits 0-51 represent RPN on TCE */
>>
>> Ditch this one and use MAX_PHYSMEM_BITS instead? I am pretty sure this
>> is the actual limit.
> 
> I understand this MAX_PHYSMEM_BITS(51) comes from the maximum physical memory addressable in the machine. IIUC, it means we can access physical address up to (1ul << MAX_PHYSMEM_BITS). 
> 
> This 52 comes from PAPR "Table 9. TCE Definition" which defines bits
> 0-51 as the RPN. By looking at code, I understand that it means we may input any address < (1ul << 52) to TCE.
> 
> In practice, MAX_PHYSMEM_BITS should be enough as of today, because I suppose we can't ever pass a physical page address over 
> (1ul << 51), and TCE accepts up to (1ul << 52).
> But if we ever increase MAX_PHYSMEM_BITS, it doesn't necessarily means that TCE_RPN_BITS will also be increased, so I think they are independent values. 
> 
> Does it make sense? Please let me know if I am missing something.

The underlying hardware is PHB3/4 about which the IODA2 Version 2.4
6Apr2012.pdf spec says:

"The number of most significant RPN bits implemented in the TCE is
dependent on the max size of System Memory to be supported by the platform".

IODA3 is the same on this matter.

This is MAX_PHYSMEM_BITS and PHB itself does not have any other limits
on top of that. So the only real limit comes from MAX_PHYSMEM_BITS and
where TCE_RPN_BITS comes from exactly - I have no idea.


> 
>>
>>
>>> +#define TCE_RPN_MASK(ps)	((1ul << (TCE_RPN_BITS - (ps))) - 1)
>>>  #define TCE_VALID		0x800		/* TCE valid */
>>>  #define TCE_ALLIO		0x400		/* TCE valid for all lpars */
>>>  #define TCE_PCI_WRITE		0x2		/* write from PCI allowed */
>>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>>> index e4198700ed1a..8fe23b7dff3a 100644
>>> --- a/arch/powerpc/platforms/pseries/iommu.c
>>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>>> @@ -107,6 +107,9 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index,
>>>  	u64 proto_tce;
>>>  	__be64 *tcep;
>>>  	u64 rpn;
>>> +	const unsigned long tceshift = tbl->it_page_shift;
>>> +	const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl);
>>> +	const u64 rpn_mask = TCE_RPN_MASK(tceshift);
>>
>> Using IOMMU_PAGE_SIZE macro for the page size and not using
>> IOMMU_PAGE_MASK for the mask - this incosistency makes my small brain
>> explode :) I understand the history but maaaaan... Oh well, ok.
>>
> 
> Yeah, it feels kind of weird after two IOMMU related consts. :)
> But sure IOMMU_PAGE_MASK() would not be useful here :)
> 
> And this kind of let me thinking:
>>> +		rpn = __pa(uaddr) >> tceshift;
>>> +		*tcep = cpu_to_be64(proto_tce | (rpn & rpn_mask) << tceshift);
> Why not:
> 	rpn_mask =  TCE_RPN_MASK(tceshift) << tceshift;


A mask for a page number (but not the address!) hurts my brain, masks
are good against addresses but numbers should already have all bits
adjusted imho, may be it is just me :-/


> 	
> 	rpn = __pa(uaddr) & rpn_mask;
> 	*tcep = cpu_to_be64(proto_tce | rpn)
> 
> I am usually afraid of changing stuff like this, but I think it's safe.
> 
>> Good, otherwise. Thanks,
> 
> Thank you for reviewing!
>  
> 
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH v1 4/9] powerpc/vdso: Remove unnecessary ifdefs in vdso_pagelist initialization
From: Christophe Leroy @ 2020-08-28  5:40 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <87d03c2plb.fsf@mpe.ellerman.id.au>



Le 27/08/2020 à 15:19, Michael Ellerman a écrit :
> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>> On 08/26/2020 02:58 PM, Michael Ellerman wrote:
>>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>>> diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
>>>> index daef14a284a3..bbb69832fd46 100644
>>>> --- a/arch/powerpc/kernel/vdso.c
>>>> +++ b/arch/powerpc/kernel/vdso.c
>>>> @@ -718,16 +710,14 @@ static int __init vdso_init(void)
>>> ...
>>>>    
>>>> -
>>>> -#ifdef CONFIG_VDSO32
>>>>    	vdso32_kbase = &vdso32_start;
>>>>    
>>>>    	/*
>>>> @@ -735,8 +725,6 @@ static int __init vdso_init(void)
>>>>    	 */
>>>>    	vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
>>>>    	DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages);
>>>> -#endif
>>>
>>> This didn't build for ppc64le:
>>>
>>>     /opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: arch/powerpc/kernel/vdso.o:(.toc+0x0): undefined reference to `vdso32_end'
>>>     /opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: arch/powerpc/kernel/vdso.o:(.toc+0x8): undefined reference to `vdso32_start'
>>>     make[1]: *** [/scratch/michael/build/maint/Makefile:1166: vmlinux] Error 1
>>>     make: *** [Makefile:185: __sub-make] Error 2
>>>
>>> So I just put that ifdef back.
>>>
>>
>> The problem is because is_32bit() can still return true even when
>> CONFIG_VDSO32 is not set.
> 
> Hmm, you're right. My config had CONFIG_COMPAT enabled.
> 
> But that seems like a bug, if someone enables COMPAT on ppc64le they are
> almost certainly going to want VDSO32 as well.
> 
> So I think I'll do a lead up patch as below.

Ah yes, and with that then no need to consider the case where 
is_32bit_task() is true and CONFIG_VDSO32 is not selected.

I'll update my leading series accordingly.

Christophe

> 
> cheers
> 
> diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
> index d4fd109f177e..cf2da1e401ef 100644
> --- a/arch/powerpc/platforms/Kconfig.cputype
> +++ b/arch/powerpc/platforms/Kconfig.cputype
> @@ -501,13 +501,12 @@ endmenu
>   
>   config VDSO32
>   	def_bool y
> -	depends on PPC32 || CPU_BIG_ENDIAN
> +	depends on PPC32 || COMPAT
>   	help
>   	  This symbol controls whether we build the 32-bit VDSO. We obviously
>   	  want to do that if we're building a 32-bit kernel. If we're building
> -	  a 64-bit kernel then we only want a 32-bit VDSO if we're building for
> -	  big endian. That is because the only little endian configuration we
> -	  support is ppc64le which is 64-bit only.
> +	  a 64-bit kernel then we only want a 32-bit VDSO if we're also enabling
> +	  COMPAT.
>   
>   choice
>   	prompt "Endianness selection"
> 

^ permalink raw reply

* Re: [PATCH v1 4/9] powerpc/vdso: Remove unnecessary ifdefs in vdso_pagelist initialization
From: Christophe Leroy @ 2020-08-28  5:46 UTC (permalink / raw)
  To: Michael Ellerman, Benjamin Herrenschmidt, Paul Mackerras
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <e9795fb5-81bc-5707-79d6-42d9dddf7ac4@csgroup.eu>



Le 28/08/2020 à 07:40, Christophe Leroy a écrit :
> 
> 
> Le 27/08/2020 à 15:19, Michael Ellerman a écrit :
>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>> On 08/26/2020 02:58 PM, Michael Ellerman wrote:
>>>> Christophe Leroy <christophe.leroy@csgroup.eu> writes:
>>>>> diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
>>>>> index daef14a284a3..bbb69832fd46 100644
>>>>> --- a/arch/powerpc/kernel/vdso.c
>>>>> +++ b/arch/powerpc/kernel/vdso.c
>>>>> @@ -718,16 +710,14 @@ static int __init vdso_init(void)
>>>> ...
>>>>> -
>>>>> -#ifdef CONFIG_VDSO32
>>>>>        vdso32_kbase = &vdso32_start;
>>>>>        /*
>>>>> @@ -735,8 +725,6 @@ static int __init vdso_init(void)
>>>>>         */
>>>>>        vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
>>>>>        DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, 
>>>>> vdso32_pages);
>>>>> -#endif
>>>>
>>>> This didn't build for ppc64le:
>>>>
>>>>     
>>>> /opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: 
>>>> arch/powerpc/kernel/vdso.o:(.toc+0x0): undefined reference to 
>>>> `vdso32_end'
>>>>     
>>>> /opt/cross/gcc-8.20_binutils-2.32/powerpc64-unknown-linux-gnu/bin/powerpc64-unknown-linux-gnu-ld: 
>>>> arch/powerpc/kernel/vdso.o:(.toc+0x8): undefined reference to 
>>>> `vdso32_start'
>>>>     make[1]: *** [/scratch/michael/build/maint/Makefile:1166: 
>>>> vmlinux] Error 1
>>>>     make: *** [Makefile:185: __sub-make] Error 2
>>>>
>>>> So I just put that ifdef back.
>>>>
>>>
>>> The problem is because is_32bit() can still return true even when
>>> CONFIG_VDSO32 is not set.
>>
>> Hmm, you're right. My config had CONFIG_COMPAT enabled.
>>
>> But that seems like a bug, if someone enables COMPAT on ppc64le they are
>> almost certainly going to want VDSO32 as well.
>>
>> So I think I'll do a lead up patch as below.
> 
> Ah yes, and with that then no need to consider the case where 
> is_32bit_task() is true and CONFIG_VDSO32 is not selected.
> 
> I'll update my leading series accordingly.

I meant follow up series.

Christophe
> 
> Christophe
> 
>>
>> cheers
>>
>> diff --git a/arch/powerpc/platforms/Kconfig.cputype 
>> b/arch/powerpc/platforms/Kconfig.cputype
>> index d4fd109f177e..cf2da1e401ef 100644
>> --- a/arch/powerpc/platforms/Kconfig.cputype
>> +++ b/arch/powerpc/platforms/Kconfig.cputype
>> @@ -501,13 +501,12 @@ endmenu
>>   config VDSO32
>>       def_bool y
>> -    depends on PPC32 || CPU_BIG_ENDIAN
>> +    depends on PPC32 || COMPAT
>>       help
>>         This symbol controls whether we build the 32-bit VDSO. We 
>> obviously
>>         want to do that if we're building a 32-bit kernel. If we're 
>> building
>> -      a 64-bit kernel then we only want a 32-bit VDSO if we're 
>> building for
>> -      big endian. That is because the only little endian 
>> configuration we
>> -      support is ppc64le which is 64-bit only.
>> +      a 64-bit kernel then we only want a 32-bit VDSO if we're also 
>> enabling
>> +      COMPAT.
>>   choice
>>       prompt "Endianness selection"
>>

^ permalink raw reply

* [PATCH v2 2/5] powerpc/vdso: Don't rely on vdso_pages being 0 for failure
From: Christophe Leroy @ 2020-08-28  5:58 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <0f65bb24a2519e5e6c33089016cb249a7c1b1e35.1598594308.git.christophe.leroy@csgroup.eu>

If vdso initialisation failed, vdso_ready is not set.
Otherwise, vdso_pages is only 0 when it is a 32 bits task
and CONFIG_VDSO32 is not selected.

As arch_setup_additional_pages() now bails out directly in
that case, we don't need to set vdso_pages to 0.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/vdso.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 3ef3fc546ac8..8f245e988a8a 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -176,11 +176,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 
 	current->mm->context.vdso_base = 0;
 
-	/* vDSO has a problem and was disabled, just don't "enable" it for the
-	 * process
-	 */
-	if (vdso_pages == 0)
-		return 0;
 	/* Add a page to the vdso size for the data page */
 	vdso_pages ++;
 
@@ -710,14 +705,16 @@ static int __init vdso_init(void)
 	 * Initialize the vDSO images in memory, that is do necessary
 	 * fixups of vDSO symbols, locate trampolines, etc...
 	 */
-	if (vdso_setup())
-		goto setup_failed;
+	if (vdso_setup()) {
+		pr_err("vDSO setup failure, not enabled !\n");
+		return 0;
+	}
 
 	if (IS_ENABLED(CONFIG_VDSO32)) {
 		/* Make sure pages are in the correct state */
 		pagelist = kcalloc(vdso32_pages + 1, sizeof(struct page *), GFP_KERNEL);
 		if (!pagelist)
-			goto alloc_failed;
+			return 0;
 
 		pagelist[0] = virt_to_page(vdso_data);
 
@@ -730,7 +727,7 @@ static int __init vdso_init(void)
 	if (IS_ENABLED(CONFIG_PPC64)) {
 		pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *), GFP_KERNEL);
 		if (!pagelist)
-			goto alloc_failed;
+			return 0;
 
 		pagelist[0] = virt_to_page(vdso_data);
 
@@ -743,14 +740,6 @@ static int __init vdso_init(void)
 	smp_wmb();
 	vdso_ready = 1;
 
-	return 0;
-
-setup_failed:
-	pr_err("vDSO setup failure, not enabled !\n");
-alloc_failed:
-	vdso32_pages = 0;
-	vdso64_pages = 0;
-
 	return 0;
 }
 arch_initcall(vdso_init);
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 1/5] powerpc/vdso: Remove DBG()
From: Christophe Leroy @ 2020-08-28  5:58 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

DBG() is defined as void when DEBUG is not defined,
and DEBUG is explicitly undefined.

It means there is no other way than modifying source code
to get the messages printed.

It was most likely useful in the first days of VDSO, but
today the only 3 DBG() calls don't deserve a special
handling.

Just remove them. If one day someone need such messages back,
use a standard pr_debug() or equivalent.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
This is a follow up series, applying on top of the series that
switches powerpc VDSO to _install_special_mapping(),
rebased on today's powerpc/next-test (dd419a93bd99)

v2 removes the modification to arch_setup_additional_pages() to
consider when is_32bit_task() returning true when CONFIG_VDSO32
not set, as this should never happen.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/vdso.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index e2568d9ecdff..3ef3fc546ac8 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -31,14 +31,6 @@
 #include <asm/vdso_datapage.h>
 #include <asm/setup.h>
 
-#undef DEBUG
-
-#ifdef DEBUG
-#define DBG(fmt...) printk(fmt)
-#else
-#define DBG(fmt...)
-#endif
-
 /* Max supported size for symbol names */
 #define MAX_SYMNAME	64
 
@@ -567,9 +559,6 @@ static __init int vdso_fixup_alt_funcs(struct lib32_elfinfo *v32,
 		if (!match)
 			continue;
 
-		DBG("replacing %s with %s...\n", patch->gen_name,
-		    patch->fix_name ? "NONE" : patch->fix_name);
-
 		/*
 		 * Patch the 32 bits and 64 bits symbols. Note that we do not
 		 * patch the "." symbol on 64 bits.
@@ -704,7 +693,6 @@ static int __init vdso_init(void)
 	 * Calculate the size of the 64 bits vDSO
 	 */
 	vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT;
-	DBG("vdso64_kbase: %p, 0x%x pages\n", vdso64_kbase, vdso64_pages);
 
 	vdso32_kbase = &vdso32_start;
 
@@ -712,7 +700,6 @@ static int __init vdso_init(void)
 	 * Calculate the size of the 32 bits vDSO
 	 */
 	vdso32_pages = (&vdso32_end - &vdso32_start) >> PAGE_SHIFT;
-	DBG("vdso32_kbase: %p, 0x%x pages\n", vdso32_kbase, vdso32_pages);
 
 	/*
 	 * Setup the syscall map in the vDOS
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 3/5] powerpc/vdso: Initialise vdso32_kbase at compile time
From: Christophe Leroy @ 2020-08-28  5:58 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <0f65bb24a2519e5e6c33089016cb249a7c1b1e35.1598594308.git.christophe.leroy@csgroup.eu>

Initialise vdso32_kbase at compile time like vdso64_kbase.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/vdso.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 8f245e988a8a..fb393266b9cb 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -37,13 +37,12 @@
 /* The alignment of the vDSO */
 #define VDSO_ALIGNMENT	(1 << 16)
 
+extern char vdso32_start, vdso32_end;
 static unsigned int vdso32_pages;
-static void *vdso32_kbase;
+static void *vdso32_kbase = &vdso32_start;
 unsigned long vdso32_sigtramp;
 unsigned long vdso32_rt_sigtramp;
 
-extern char vdso32_start, vdso32_end;
-
 extern char vdso64_start, vdso64_end;
 static void *vdso64_kbase = &vdso64_start;
 static unsigned int vdso64_pages;
@@ -689,8 +688,6 @@ static int __init vdso_init(void)
 	 */
 	vdso64_pages = (&vdso64_end - &vdso64_start) >> PAGE_SHIFT;
 
-	vdso32_kbase = &vdso32_start;
-
 	/*
 	 * Calculate the size of the 32 bits vDSO
 	 */
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 4/5] powerpc/vdso: Declare constant vars as __ro_after_init
From: Christophe Leroy @ 2020-08-28  5:58 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <0f65bb24a2519e5e6c33089016cb249a7c1b1e35.1598594308.git.christophe.leroy@csgroup.eu>

To avoid any risk of modification of vital VDSO variables,
declare them __ro_after_init.

vdso32_kbase and vdso64_kbase could be made 'const', but it would
have high impact on all functions using them as the compiler doesn't
expect const property to be discarded.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/vdso.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index fb393266b9cb..4ad042995ccc 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -38,19 +38,19 @@
 #define VDSO_ALIGNMENT	(1 << 16)
 
 extern char vdso32_start, vdso32_end;
-static unsigned int vdso32_pages;
-static void *vdso32_kbase = &vdso32_start;
-unsigned long vdso32_sigtramp;
-unsigned long vdso32_rt_sigtramp;
+static unsigned int vdso32_pages __ro_after_init;
+static void *vdso32_kbase __ro_after_init = &vdso32_start;
+unsigned long vdso32_sigtramp __ro_after_init;
+unsigned long vdso32_rt_sigtramp __ro_after_init;
 
 extern char vdso64_start, vdso64_end;
-static void *vdso64_kbase = &vdso64_start;
-static unsigned int vdso64_pages;
+static void *vdso64_kbase __ro_after_init = &vdso64_start;
+static unsigned int vdso64_pages __ro_after_init;
 #ifdef CONFIG_PPC64
-unsigned long vdso64_rt_sigtramp;
+unsigned long vdso64_rt_sigtramp __ro_after_init;
 #endif /* CONFIG_PPC64 */
 
-static int vdso_ready;
+static int vdso_ready __ro_after_init;
 
 /*
  * The vdso data page (aka. systemcfg for old ppc64 fans) is here.
-- 
2.25.0


^ permalink raw reply related

* [PATCH v2 5/5] powerpc/vdso: Declare vdso_patches[] as __initdata
From: Christophe Leroy @ 2020-08-28  5:58 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <0f65bb24a2519e5e6c33089016cb249a7c1b1e35.1598594308.git.christophe.leroy@csgroup.eu>

vdso_patches[] table is used only at init time.

Mark it __initdata.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
---
 arch/powerpc/kernel/vdso.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index 4ad042995ccc..dfa08a7b4e7c 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -76,7 +76,7 @@ struct vdso_patch_def
  * Currently, we only change sync_dicache to do nothing on processors
  * with a coherent icache
  */
-static struct vdso_patch_def vdso_patches[] = {
+static struct vdso_patch_def vdso_patches[] __initdata = {
 	{
 		CPU_FTR_COHERENT_ICACHE, CPU_FTR_COHERENT_ICACHE,
 		"__kernel_sync_dicache", "__kernel_sync_dicache_p5"
-- 
2.25.0


^ permalink raw reply related

* Re: [PATCHv5 2/2] powerpc/pseries: update device tree before ejecting hotplug uevents
From: Pingfan Liu @ 2020-08-28  8:10 UTC (permalink / raw)
  To: Laurent Dufour
  Cc: Nathan Lynch, Kexec Mailing List, Nathan Fontenot, linuxppc-dev,
	Hari Bathini
In-Reply-To: <17369122-3e37-1360-0f68-827a8104cd35@linux.ibm.com>

On Thu, Aug 27, 2020 at 3:53 PM Laurent Dufour <ldufour@linux.ibm.com> wrote:
>
> Le 10/08/2020 à 10:52, Pingfan Liu a écrit :
> > A bug is observed on pseries by taking the following steps on rhel:
> > -1. drmgr -c mem -r -q 5
> > -2. echo c > /proc/sysrq-trigger
> >
> > And then, the failure looks like:
> > kdump: saving to /sysroot//var/crash/127.0.0.1-2020-01-16-02:06:14/
> > kdump: saving vmcore-dmesg.txt
> > kdump: saving vmcore-dmesg.txt complete
> > kdump: saving vmcore
> >   Checking for memory holes                         : [  0.0 %] /                   Checking for memory holes                         : [100.0 %] |                   Excluding unnecessary pages                       : [100.0 %] \                   Copying data                                      : [  0.3 %] -          eta: 38s[   44.337636] hash-mmu: mm: Hashing failure ! EA=0x7fffba400000 access=0x8000000000000004 current=makedumpfile
> > [   44.337663] hash-mmu:     trap=0x300 vsid=0x13a109c ssize=1 base psize=2 psize 2 pte=0xc000000050000504
> > [   44.337677] hash-mmu: mm: Hashing failure ! EA=0x7fffba400000 access=0x8000000000000004 current=makedumpfile
> > [   44.337692] hash-mmu:     trap=0x300 vsid=0x13a109c ssize=1 base psize=2 psize 2 pte=0xc000000050000504
> > [   44.337708] makedumpfile[469]: unhandled signal 7 at 00007fffba400000 nip 00007fffbbc4d7fc lr 000000011356ca3c code 2
> > [   44.338548] Core dump to |/bin/false pipe failed
> > /lib/kdump-lib-initramfs.sh: line 98:   469 Bus error               $CORE_COLLECTOR /proc/vmcore $_mp/$KDUMP_PATH/$HOST_IP-$DATEDIR/vmcore-incomplete
> > kdump: saving vmcore failed
> >
> > * Root cause *
> >    After analyzing, it turns out that in the current implementation,
> > when hot-removing lmb, the KOBJ_REMOVE event ejects before the dt updating as
> > the code __remove_memory() comes before drmem_update_dt().
> > So in kdump kernel, when read_from_oldmem() resorts to
> > pSeries_lpar_hpte_insert() to install hpte, but fails with -2 due to
> > non-exist pfn. And finally, low_hash_fault() raise SIGBUS to process, as it
> > can be observed "Bus error"
> >
> >  From a viewpoint of listener and publisher, the publisher notifies the
> > listener before data is ready.  This introduces a problem where udev
> > launches kexec-tools (due to KOBJ_REMOVE) and loads a stale dt before
> > updating. And in capture kernel, makedumpfile will access the memory based
> > on the stale dt info, and hit a SIGBUS error due to an un-existed lmb.
> >
> > * Fix *
> > This bug is introduced by commit 063b8b1251fd
> > ("powerpc/pseries/memory-hotplug: Only update DT once per memory DLPAR
> > request"), which tried to combine all the dt updating into one.
> >
> > To fix this issue, meanwhile not to introduce a quadratic runtime
> > complexity by the model:
> >    dlpar_memory_add_by_count
> >      for_each_drmem_lmb             <--
> >        dlpar_add_lmb
> >          drmem_update_dt(_v1|_v2)
> >            for_each_drmem_lmb       <--
> > The dt should still be only updated once, and just before the last memory
> > online/offline event is ejected to user space. Achieve this by tracing the
> > num of lmb added or removed.
> >
> > Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
> > Cc: Michael Ellerman <mpe@ellerman.id.au>
> > Cc: Hari Bathini <hbathini@linux.ibm.com>
> > Cc: Nathan Lynch <nathanl@linux.ibm.com>
> > Cc: Nathan Fontenot <nfont@linux.vnet.ibm.com>
> > Cc: Laurent Dufour <ldufour@linux.ibm.com>
> > To: linuxppc-dev@lists.ozlabs.org
> > Cc: kexec@lists.infradead.org
> > ---
> > v4 -> v5: change dlpar_add_lmb()/dlpar_remove_lmb() prototype to report
> >            whether dt is updated successfully.
> >            Fix a condition boundary check bug
> > v3 -> v4: resolve a quadratic runtime complexity issue.
> >            This series is applied on next-test branch
> >   arch/powerpc/platforms/pseries/hotplug-memory.c | 102 +++++++++++++++++++-----
> >   1 file changed, 80 insertions(+), 22 deletions(-)
> >
> > diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> > index 46cbcd1..1567d9f 100644
> > --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> > +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> > @@ -350,13 +350,22 @@ static bool lmb_is_removable(struct drmem_lmb *lmb)
> >       return true;
> >   }
> >
> > -static int dlpar_add_lmb(struct drmem_lmb *);
> > +enum dt_update_status {
> > +     DT_NOUPDATE,
> > +     DT_TOUPDATE,
> > +     DT_UPDATED,
> > +};
> > +
> > +/* "*dt_update" returns DT_UPDATED if updated */
> > +static int dlpar_add_lmb(struct drmem_lmb *lmb,
> > +             enum dt_update_status *dt_update);
> >
> > -static int dlpar_remove_lmb(struct drmem_lmb *lmb)
> > +static int dlpar_remove_lmb(struct drmem_lmb *lmb,
> > +             enum dt_update_status *dt_update)
> >   {
> >       unsigned long block_sz;
> >       phys_addr_t base_addr;
> > -     int rc, nid;
> > +     int rc, ret, nid;
> >
> >       if (!lmb_is_removable(lmb))
> >               return -EINVAL;
> > @@ -372,6 +381,13 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
> >       invalidate_lmb_associativity_index(lmb);
> >       lmb_clear_nid(lmb);
> >       lmb->flags &= ~DRCONF_MEM_ASSIGNED;
> > +     if (*dt_update) {
Original, I plan to use it to exclude the case of DT_NOUPDATE, which is value 0.
And I think it looks better by using if (*dt_update == DT_TOUPDATE)
>
> That test is wrong, you should do:
>          if (*dt_update && *dt_update == DT_TOUPDATE) {
I think you mean  if (dt_update && *dt_update == DT_TOUPDATE) {
>
> With the current code, the device tree is updated all the time.
>
> Another option would be to pass a valid pointer (!= NULL) only when DT update is
> required, this way you don't need the DT_TOUPDATE value. The caller would have
> to set the pointer accordingly. The advantage with this option is the caller is
> guaranteed that its variable is not touched by the callee when no device tree is
> requested. A simple boolean pointer would be enough without the need to this enum.
It is expected that dlpar_remove_lmb/dlpar_add_lmb can report whether
they successfully update dt or not. So the caller can handle the
different cases.

Thanks,
Pingfan

^ permalink raw reply

* [PATCH 0/4] more mm switching vs TLB shootdown and lazy tlb
From: Nicholas Piggin @ 2020-08-28 10:00 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-arch, Jens Axboe, Peter Zijlstra, Aneesh Kumar K.V,
	linux-kernel, Nicholas Piggin, Andrew Morton, linuxppc-dev,
	David S. Miller

This is an attempt to fix a few different related issues around
switching mm, TLB flushing, and lazy tlb mm handling.

This will require all architectures to eventually move to disabling
irqs over activate_mm, but it's possible we could add another arch
call after irqs are re-enabled for those few which can't do their
entire activation with irqs disabled.

I'd like some feedback on the sparc/powerpc vs kthread_use_mm
problem too.

Thanks,
Nick

Nicholas Piggin (4):
  mm: fix exec activate_mm vs TLB shootdown and lazy tlb switching race
  powerpc: select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
  sparc64: remove mm_cpumask clearing to fix kthread_use_mm race
  powerpc/64s/radix: Fix mm_cpumask trimming race vs kthread_use_mm

 arch/Kconfig                           |  7 +++
 arch/powerpc/Kconfig                   |  1 +
 arch/powerpc/include/asm/mmu_context.h |  2 +-
 arch/powerpc/include/asm/tlb.h         | 13 ------
 arch/powerpc/mm/book3s64/radix_tlb.c   | 23 ++++++---
 arch/sparc/kernel/smp_64.c             | 65 ++++++--------------------
 fs/exec.c                              | 17 ++++++-
 7 files changed, 54 insertions(+), 74 deletions(-)

-- 
2.23.0


^ permalink raw reply

* [PATCH 1/4] mm: fix exec activate_mm vs TLB shootdown and lazy tlb switching race
From: Nicholas Piggin @ 2020-08-28 10:00 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-arch, Jens Axboe, Peter Zijlstra, Aneesh Kumar K.V,
	linux-kernel, Nicholas Piggin, Andrew Morton, linuxppc-dev,
	David S. Miller
In-Reply-To: <20200828100022.1099682-1-npiggin@gmail.com>

Reading and modifying current->mm and current->active_mm and switching
mm should be done with irqs off, to prevent races seeing an intermediate
state.

This is similar to commit 38cf307c1f20 ("mm: fix kthread_use_mm() vs TLB
invalidate"). At exec-time when the new mm is activated, the old one
should usually be single-threaded and no longer used, unless something
else is holding an mm_users reference (which may be possible).

Absent other mm_users, there is also a race with preemption and lazy tlb
switching. Consider the kernel_execve case where the current thread is
using a lazy tlb active mm:

  call_usermodehelper()
    kernel_execve()
      old_mm = current->mm;
      active_mm = current->active_mm;
      *** preempt *** -------------------->  schedule()
                                               prev->active_mm = NULL;
                                               mmdrop(prev active_mm);
                                             ...
                      <--------------------  schedule()
      current->mm = mm;
      current->active_mm = mm;
      if (!old_mm)
          mmdrop(active_mm);

If we switch back to the kernel thread from a different mm, there is a
double free of the old active_mm, and a missing free of the new one.

Closing this race only requires interrupts to be disabled while ->mm
and ->active_mm are being switched, but the TLB problem requires also
holding interrupts off over activate_mm. Unfortunately not all archs
can do that yet, e.g., arm defers the switch if irqs are disabled and
expects finish_arch_post_lock_switch() to be called to complete the
flush; um takes a blocking lock in activate_mm().

So as a first step, disable interrupts across the mm/active_mm updates
to close the lazy tlb preempt race, and provide an arch option to
extend that to activate_mm which allows architectures doing IPI based
TLB shootdowns to close the second race.

This is a bit ugly, but in the interest of fixing the bug and backporting
before all architectures are converted this is a compromise.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/Kconfig |  7 +++++++
 fs/exec.c    | 17 +++++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index af14a567b493..94821e3f94d1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -414,6 +414,13 @@ config MMU_GATHER_NO_GATHER
 	bool
 	depends on MMU_GATHER_TABLE_FREE
 
+config ARCH_WANT_IRQS_OFF_ACTIVATE_MM
+	bool
+	help
+	  Temporary select until all architectures can be converted to have
+	  irqs disabled over activate_mm. Architectures that do IPI based TLB
+	  shootdowns should enable this.
+
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
 	bool
 
diff --git a/fs/exec.c b/fs/exec.c
index a91003e28eaa..d4fb18baf1fb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1130,11 +1130,24 @@ static int exec_mmap(struct mm_struct *mm)
 	}
 
 	task_lock(tsk);
-	active_mm = tsk->active_mm;
 	membarrier_exec_mmap(mm);
-	tsk->mm = mm;
+
+	local_irq_disable();
+	active_mm = tsk->active_mm;
 	tsk->active_mm = mm;
+	tsk->mm = mm;
+	/*
+	 * This prevents preemption while active_mm is being loaded and
+	 * it and mm are being updated, which could cause problems for
+	 * lazy tlb mm refcounting when these are updated by context
+	 * switches. Not all architectures can handle irqs off over
+	 * activate_mm yet.
+	 */
+	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+		local_irq_enable();
 	activate_mm(active_mm, mm);
+	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
+		local_irq_enable();
 	tsk->mm->vmacache_seqnum = 0;
 	vmacache_flush(tsk);
 	task_unlock(tsk);
-- 
2.23.0


^ permalink raw reply related

* [PATCH 2/4] powerpc: select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
From: Nicholas Piggin @ 2020-08-28 10:00 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-arch, Jens Axboe, Peter Zijlstra, Aneesh Kumar K.V,
	linux-kernel, Nicholas Piggin, Andrew Morton, linuxppc-dev,
	David S. Miller
In-Reply-To: <20200828100022.1099682-1-npiggin@gmail.com>

powerpc uses IPIs in some situations to switch a kernel thread away
from a lazy tlb mm, which is subject to the TLB flushing race
described in the changelog introducing ARCH_WANT_IRQS_OFF_ACTIVATE_MM.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/powerpc/Kconfig                   | 1 +
 arch/powerpc/include/asm/mmu_context.h | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1f48bbfb3ce9..65cb32211574 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -149,6 +149,7 @@ config PPC
 	select ARCH_USE_QUEUED_RWLOCKS		if PPC_QUEUED_SPINLOCKS
 	select ARCH_USE_QUEUED_SPINLOCKS	if PPC_QUEUED_SPINLOCKS
 	select ARCH_WANT_IPC_PARSE_VERSION
+	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	select ARCH_WEAK_RELEASE_ACQUIRE
 	select BINFMT_ELF
 	select BUILDTIME_TABLE_SORT
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 7f3658a97384..e02aa793420b 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -244,7 +244,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
  */
 static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next)
 {
-	switch_mm(prev, next, current);
+	switch_mm_irqs_off(prev, next, current);
 }
 
 /* We don't currently use enter_lazy_tlb() for anything */
-- 
2.23.0


^ permalink raw reply related

* [PATCH 3/4] sparc64: remove mm_cpumask clearing to fix kthread_use_mm race
From: Nicholas Piggin @ 2020-08-28 10:00 UTC (permalink / raw)
  To: linux-mm
  Cc: linux-arch, Jens Axboe, Peter Zijlstra, Aneesh Kumar K.V,
	linux-kernel, Nicholas Piggin, sparclinux, Andrew Morton,
	linuxppc-dev, David S. Miller
In-Reply-To: <20200828100022.1099682-1-npiggin@gmail.com>

The de facto (and apparently uncommented) standard for using an mm had,
thanks to this code in sparc if nothing else, been that you must have a
reference on mm_users *and that reference must have been obtained with
mmget()*, i.e., from a thread with a reference to mm_users that had used
the mm.

The introduction of mmget_not_zero() in commit d2005e3f41d4
("userfaultfd: don't pin the user memory in userfaultfd_file_create()")
allowed mm_count holders to aoperate on user mappings asynchronously
from the actual threads using the mm, but they were not to load those
mappings into their TLB (i.e., walking vmas and page tables is okay,
kthread_use_mm() is not).

io_uring 2b188cc1bb857 ("Add io_uring IO interface") added code which
does a kthread_use_mm() from a mmget_not_zero() refcount.

The problem with this is code which previously assumed mm == current->mm
and mm->mm_users == 1 implies the mm will remain single-threaded at
least until this thread creates another mm_users reference, has now
broken.

arch/sparc/kernel/smp_64.c:

    if (atomic_read(&mm->mm_users) == 1) {
        cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
        goto local_flush_and_out;
    }

vs fs/io_uring.c

    if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
                 !mmget_not_zero(ctx->sqo_mm)))
        return -EFAULT;
    kthread_use_mm(ctx->sqo_mm);

mmget_not_zero() could come in right after the mm_users == 1 test, then
kthread_use_mm() which sets its CPU in the mm_cpumask. That update could
be lost if cpumask_copy() occurs afterward.

I propose we fix this by allowing mmget_not_zero() to be a first-class
reference, and not have this obscure undocumented and unchecked
restriction.

The basic fix for sparc64 is to remove its mm_cpumask clearing code. The
optimisation could be effectively restored by sending IPIs to mm_cpumask
members and having them remove themselves from mm_cpumask. This is more
tricky so I leave it as an exercise for someone with a sparc64 SMP.
powerpc has a (currently similarly broken) example.

Cc: sparclinux@vger.kernel.org
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
 arch/sparc/kernel/smp_64.c | 65 ++++++++------------------------------
 1 file changed, 14 insertions(+), 51 deletions(-)

diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c
index e286e2badc8a..e38d8bf454e8 100644
--- a/arch/sparc/kernel/smp_64.c
+++ b/arch/sparc/kernel/smp_64.c
@@ -1039,38 +1039,9 @@ void smp_fetch_global_pmu(void)
  * are flush_tlb_*() routines, and these run after flush_cache_*()
  * which performs the flushw.
  *
- * The SMP TLB coherency scheme we use works as follows:
- *
- * 1) mm->cpu_vm_mask is a bit mask of which cpus an address
- *    space has (potentially) executed on, this is the heuristic
- *    we use to avoid doing cross calls.
- *
- *    Also, for flushing from kswapd and also for clones, we
- *    use cpu_vm_mask as the list of cpus to make run the TLB.
- *
- * 2) TLB context numbers are shared globally across all processors
- *    in the system, this allows us to play several games to avoid
- *    cross calls.
- *
- *    One invariant is that when a cpu switches to a process, and
- *    that processes tsk->active_mm->cpu_vm_mask does not have the
- *    current cpu's bit set, that tlb context is flushed locally.
- *
- *    If the address space is non-shared (ie. mm->count == 1) we avoid
- *    cross calls when we want to flush the currently running process's
- *    tlb state.  This is done by clearing all cpu bits except the current
- *    processor's in current->mm->cpu_vm_mask and performing the
- *    flush locally only.  This will force any subsequent cpus which run
- *    this task to flush the context from the local tlb if the process
- *    migrates to another cpu (again).
- *
- * 3) For shared address spaces (threads) and swapping we bite the
- *    bullet for most cases and perform the cross call (but only to
- *    the cpus listed in cpu_vm_mask).
- *
- *    The performance gain from "optimizing" away the cross call for threads is
- *    questionable (in theory the big win for threads is the massive sharing of
- *    address space state across processors).
+ * mm->cpu_vm_mask is a bit mask of which cpus an address
+ * space has (potentially) executed on, this is the heuristic
+ * we use to limit cross calls.
  */
 
 /* This currently is only used by the hugetlb arch pre-fault
@@ -1080,18 +1051,13 @@ void smp_fetch_global_pmu(void)
 void smp_flush_tlb_mm(struct mm_struct *mm)
 {
 	u32 ctx = CTX_HWBITS(mm->context);
-	int cpu = get_cpu();
 
-	if (atomic_read(&mm->mm_users) == 1) {
-		cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
-		goto local_flush_and_out;
-	}
+	get_cpu();
 
 	smp_cross_call_masked(&xcall_flush_tlb_mm,
 			      ctx, 0, 0,
 			      mm_cpumask(mm));
 
-local_flush_and_out:
 	__flush_tlb_mm(ctx, SECONDARY_CONTEXT);
 
 	put_cpu();
@@ -1114,17 +1080,15 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
 {
 	u32 ctx = CTX_HWBITS(mm->context);
 	struct tlb_pending_info info;
-	int cpu = get_cpu();
+
+	get_cpu();
 
 	info.ctx = ctx;
 	info.nr = nr;
 	info.vaddrs = vaddrs;
 
-	if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
-		cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
-	else
-		smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
-				       &info, 1);
+	smp_call_function_many(mm_cpumask(mm), tlb_pending_func,
+			       &info, 1);
 
 	__flush_tlb_pending(ctx, nr, vaddrs);
 
@@ -1134,14 +1098,13 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
 void smp_flush_tlb_page(struct mm_struct *mm, unsigned long vaddr)
 {
 	unsigned long context = CTX_HWBITS(mm->context);
-	int cpu = get_cpu();
 
-	if (mm == current->mm && atomic_read(&mm->mm_users) == 1)
-		cpumask_copy(mm_cpumask(mm), cpumask_of(cpu));
-	else
-		smp_cross_call_masked(&xcall_flush_tlb_page,
-				      context, vaddr, 0,
-				      mm_cpumask(mm));
+	get_cpu();
+
+	smp_cross_call_masked(&xcall_flush_tlb_page,
+			      context, vaddr, 0,
+			      mm_cpumask(mm));
+
 	__flush_tlb_page(context, vaddr);
 
 	put_cpu();
-- 
2.23.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox