LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH 10/15] powerpc/powernv/pci: Refactor pnv_ioda_alloc_pe()
From: Alexey Kardashevskiy @ 2020-07-15  2:29 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200710052340.737567-11-oohall@gmail.com>



On 10/07/2020 15:23, Oliver O'Halloran wrote:
> Rework the PE allocation logic to allow allocating blocks of PEs rather
> than individually. We'll use this to allocate contigious blocks of PEs for
> the SR-IOVs.

The patch does not do just this, it also adds missing mutexes (which is
good) but still misses them in pnv_pci_sriov_disable() and
pnv_pci_ioda_pe_dump().




> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 41 ++++++++++++++++++-----
>  arch/powerpc/platforms/powernv/pci.h      |  2 +-
>  2 files changed, 34 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 2d36a9ebf0e9..c9c25fb0783c 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -145,23 +145,45 @@ static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
>  		return;
>  	}
>  
> +	mutex_lock(&phb->ioda.pe_alloc_mutex);
>  	if (test_and_set_bit(pe_no, phb->ioda.pe_alloc))
>  		pr_debug("%s: PE %x was reserved on PHB#%x\n",
>  			 __func__, pe_no, phb->hose->global_number);
> +	mutex_unlock(&phb->ioda.pe_alloc_mutex);
>  
>  	pnv_ioda_init_pe(phb, pe_no);
>  }
>  
> -struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
> +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count)
>  {
> -	long pe;
> +	struct pnv_ioda_pe *ret = NULL;
> +	int run = 0, pe, i;
>  
> +	mutex_lock(&phb->ioda.pe_alloc_mutex);
> +
> +	/* scan backwards for a run of @count cleared bits */
>  	for (pe = phb->ioda.total_pe_num - 1; pe >= 0; pe--) {
> -		if (!test_and_set_bit(pe, phb->ioda.pe_alloc))
> -			return pnv_ioda_init_pe(phb, pe);
> +		if (test_bit(pe, phb->ioda.pe_alloc)) {
> +			run = 0;
> +			continue;
> +		}
> +
> +		run++;
> +		if (run == count)
> +			break;
>  	}
> +	if (run != count)
> +		goto out;
>  
> -	return NULL;
> +	for (i = pe; i < pe + count; i++) {
> +		set_bit(i, phb->ioda.pe_alloc);
> +		pnv_ioda_init_pe(phb, i);
> +	}
> +	ret = &phb->ioda.pe_array[pe];
> +
> +out:
> +	mutex_unlock(&phb->ioda.pe_alloc_mutex);
> +	return ret;
>  }
>  
>  void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
> @@ -173,7 +195,10 @@ void pnv_ioda_free_pe(struct pnv_ioda_pe *pe)
>  	WARN_ON(pe->npucomp); /* NPUs for nvlink are not supposed to be freed */
>  	kfree(pe->npucomp);
>  	memset(pe, 0, sizeof(struct pnv_ioda_pe));
> +
> +	mutex_lock(&phb->ioda.pe_alloc_mutex);
>  	clear_bit(pe_num, phb->ioda.pe_alloc);
> +	mutex_unlock(&phb->ioda.pe_alloc_mutex);
>  }
>  
>  /* The default M64 BAR is shared by all PEs */
> @@ -976,7 +1001,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>  	if (pdn->pe_number != IODA_INVALID_PE)
>  		return NULL;
>  
> -	pe = pnv_ioda_alloc_pe(phb);
> +	pe = pnv_ioda_alloc_pe(phb, 1);
>  	if (!pe) {
>  		pr_warn("%s: Not enough PE# available, disabling device\n",
>  			pci_name(dev));
> @@ -1047,7 +1072,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
>  
>  	/* The PE number isn't pinned by M64 */
>  	if (!pe)
> -		pe = pnv_ioda_alloc_pe(phb);
> +		pe = pnv_ioda_alloc_pe(phb, 1);
>  
>  	if (!pe) {
>  		pr_warn("%s: Not enough PE# available for PCI bus %04x:%02x\n",
> @@ -3065,7 +3090,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
>  		pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
>  	} else {
>  		/* otherwise just allocate one */
> -		root_pe = pnv_ioda_alloc_pe(phb);
> +		root_pe = pnv_ioda_alloc_pe(phb, 1);
>  		phb->ioda.root_pe_idx = root_pe->pe_number;
>  	}
>  
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index 58c97e60c3db..b4c9bdba7217 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -223,7 +223,7 @@ int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
>  void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, struct pnv_ioda_pe *pe);
>  void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe);
>  
> -struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb);
> +struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb, int count);
>  void pnv_ioda_free_pe(struct pnv_ioda_pe *pe);
>  
>  #ifdef CONFIG_PCI_IOV
> 

-- 
Alexey

^ permalink raw reply

* Re: [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86
From: Benjamin Herrenschmidt @ 2020-07-15  2:19 UTC (permalink / raw)
  To: Bjorn Helgaas, Arnd Bergmann
  Cc: linux-pci, Keith Busch, Paul Mackerras, sparclinux, Toan Le,
	Kjetil Oftedal, Greg Ungerer, Marek Vasut, Rob Herring,
	Lorenzo Pieralisi, Sagi Grimberg, Russell King, Ley Foon Tan,
	Christoph Hellwig, Geert Uytterhoeven, Kevin Hilman,
	Jakub Kicinski, Matt Turner, linux-kernel-mentees, Guenter Roeck,
	Ray Jui, Jens Axboe, Ivan Kokshaysky, Shuah Khan, bjorn,
	Boris Ostrovsky, Richard Henderson, Juergen Gross, Bjorn Helgaas,
	Thomas Bogendoerfer, Scott Branden, Jingoo Han,
	Saheed O. Bolarinwa, linux-kernel@vger.kernel.org, Philipp Zabel,
	Greg Kroah-Hartman, Gustavo Pimentel, linuxppc-dev,
	David S. Miller, Heiner Kallweit
In-Reply-To: <20200714234625.GA428442@bjorn-Precision-5520>

On Tue, 2020-07-14 at 18:46 -0500, Bjorn Helgaas wrote:
> Yes.  I have no problem with that.  There are a few cases where it's
> important to check for errors, e.g., we read a status register and do
> something based on a bit being set.  A failure will return all bits
> set, and we may do the wrong thing.  But most of the errors we care
> about will be on MMIO reads, not config reads, so we can probably
> ignore most config read errors.

And in both cases, we don't have the plumbing to provide accurate
and reliable error returns for all platforms anyways (esp. not for
MMIO).

I think it makes sense to stick to the good old "if all 1's, then go
out of line" including for config space.

 ../..

> Yep, except for things like device removal or other PCI errors.

A whole bunch of these are reported asynchronously, esp for writes (and
yes, including config writes, they are supposed to be non-posted but
more often than not, the path  from the CPU to the PCI bridge remains
posted for writes including config ones).

> So maybe a good place to start is by removing some of the useless
> error checking for pci_read_config_*() and pci_write_config_*().
> That's a decent-sized but not impractical project that could be done
> per subsystem or something:
> 
>   git grep -E "(if|return|=).*\<pci_(read|write)_config" drivers
> 
> finds about 400 matches.
> 
> Some of those callers probably really *do* want to check for errors,
> and I guess we'd have to identify them and do them separately as you
> mentioned.

I'd be curious about these considering how unreliable our error return
is accross the board.

Cheers,
Ben.



^ permalink raw reply

* Re: [PATCH v3 2/9] powerpc/watchpoint: Fix DAWR exception constraint
From: Jordan Niethe @ 2020-07-15  2:19 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: Christophe Leroy, apopple, mikey, miltonm, peterz, oleg,
	Nicholas Piggin, linux-kernel, Paul Mackerras, jolsa, fweisbec,
	pedromfc, naveen.n.rao, linuxppc-dev, mingo
In-Reply-To: <20200708045046.135702-3-ravi.bangoria@linux.ibm.com>

On Wed, Jul 8, 2020 at 2:52 PM Ravi Bangoria
<ravi.bangoria@linux.ibm.com> wrote:
>
> Pedro Miraglia Franco de Carvalho noticed that on p8, DAR value is
> inconsistent with different type of load/store. Like for byte,word
> etc. load/stores, DAR is set to the address of the first byte of
> overlap between watch range and real access. But for quadword load/
> store it's set to the address of the first byte of real access. This
> issue has been fixed in p10. In p10(ISA 3.1), DAR is always set to
> the address of the first byte of overlap. Commit 27985b2a640e
> ("powerpc/watchpoint: Don't ignore extraneous exceptions blindly")
> wrongly assumes that DAR is set to the address of the first byte of
> overlap for all load/stores on p8 as well. Fix that. With the fix,
> we now rely on 'ea' provided by analyse_instr(). If analyse_instr()
> fails, generate event unconditionally on p8, and on p10 generate
> event only if DAR is within a DAWR range.
>
> Note: 8xx is not affected.
>
> Fixes: 27985b2a640e ("powerpc/watchpoint: Don't ignore extraneous exceptions blindly")
> Fixes: 74c6881019b7 ("powerpc/watchpoint: Prepare handler to handle more than one watchpoint")
> Reported-by: Pedro Miraglia Franco de Carvalho <pedromfc@br.ibm.com>
> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
> ---
>  arch/powerpc/kernel/hw_breakpoint.c | 93 +++++++++++++++++++----------
>  1 file changed, 63 insertions(+), 30 deletions(-)
>
> diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
> index 031e6defc08e..7a66c370a105 100644
> --- a/arch/powerpc/kernel/hw_breakpoint.c
> +++ b/arch/powerpc/kernel/hw_breakpoint.c
> @@ -498,11 +498,11 @@ static bool dar_in_user_range(unsigned long dar, struct arch_hw_breakpoint *info
>         return ((info->address <= dar) && (dar - info->address < info->len));
>  }
>
> -static bool dar_user_range_overlaps(unsigned long dar, int size,
> -                                   struct arch_hw_breakpoint *info)
> +static bool ea_user_range_overlaps(unsigned long ea, int size,
> +                                  struct arch_hw_breakpoint *info)
>  {
> -       return ((dar < info->address + info->len) &&
> -               (dar + size > info->address));
> +       return ((ea < info->address + info->len) &&
> +               (ea + size > info->address));
>  }
>
>  static bool dar_in_hw_range(unsigned long dar, struct arch_hw_breakpoint *info)
> @@ -515,20 +515,22 @@ static bool dar_in_hw_range(unsigned long dar, struct arch_hw_breakpoint *info)
>         return ((hw_start_addr <= dar) && (hw_end_addr > dar));
>  }
>
> -static bool dar_hw_range_overlaps(unsigned long dar, int size,
> -                                 struct arch_hw_breakpoint *info)
> +static bool ea_hw_range_overlaps(unsigned long ea, int size,
> +                                struct arch_hw_breakpoint *info)
>  {
>         unsigned long hw_start_addr, hw_end_addr;
>
>         hw_start_addr = ALIGN_DOWN(info->address, HW_BREAKPOINT_SIZE);
>         hw_end_addr = ALIGN(info->address + info->len, HW_BREAKPOINT_SIZE);
>
> -       return ((dar < hw_end_addr) && (dar + size > hw_start_addr));
> +       return ((ea < hw_end_addr) && (ea + size > hw_start_addr));
>  }
>
>  /*
>   * If hw has multiple DAWR registers, we also need to check all
>   * dawrx constraint bits to confirm this is _really_ a valid event.
> + * If type is UNKNOWN, but privilege level matches, consider it as
> + * a positive match.
>   */
>  static bool check_dawrx_constraints(struct pt_regs *regs, int type,
>                                     struct arch_hw_breakpoint *info)
> @@ -536,7 +538,12 @@ static bool check_dawrx_constraints(struct pt_regs *regs, int type,
>         if (OP_IS_LOAD(type) && !(info->type & HW_BRK_TYPE_READ))
>                 return false;
>
> -       if (OP_IS_STORE(type) && !(info->type & HW_BRK_TYPE_WRITE))
> +       /*
> +        * The Cache Management instructions other than dcbz never
> +        * cause a match. i.e. if type is CACHEOP, the instruction
> +        * is dcbz, and dcbz is treated as Store.
> +        */
> +       if ((OP_IS_STORE(type) || type == CACHEOP) && !(info->type & HW_BRK_TYPE_WRITE))
>                 return false;
This change seems seperate to this commit?
>
>         if (is_kernel_addr(regs->nip) && !(info->type & HW_BRK_TYPE_KERNEL))
> @@ -553,7 +560,8 @@ static bool check_dawrx_constraints(struct pt_regs *regs, int type,
>   * including extraneous exception. Otherwise return false.
>   */
>  static bool check_constraints(struct pt_regs *regs, struct ppc_inst instr,
> -                             int type, int size, struct arch_hw_breakpoint *info)
> +                             unsigned long ea, int type, int size,
> +                             struct arch_hw_breakpoint *info)
>  {
>         bool in_user_range = dar_in_user_range(regs->dar, info);
>         bool dawrx_constraints;
> @@ -569,11 +577,10 @@ static bool check_constraints(struct pt_regs *regs, struct ppc_inst instr,
>         }
>
>         if (unlikely(ppc_inst_equal(instr, ppc_inst(0)))) {
> -               if (in_user_range)
> -                       return true;
> -
> -               if (dar_in_hw_range(regs->dar, info)) {
> -                       info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
> +               if (cpu_has_feature(CPU_FTR_ARCH_31)) {
> +                       if (dar_in_hw_range(regs->dar, info))
> +                               return true;
> +               } else {
>                         return true;
I think this would be clearer as:
        if (cpu_has_feature(CPU_FTR_ARCH_31) &&
!(dar_in_hw_range(regs->dar, info)))
            return false;
        else
            return true;

>                 }
>                 return false;
> @@ -581,10 +588,20 @@ static bool check_constraints(struct pt_regs *regs, struct ppc_inst instr,
>
>         dawrx_constraints = check_dawrx_constraints(regs, type, info);
>
> -       if (dar_user_range_overlaps(regs->dar, size, info))
> +       if (type == UNKNOWN) {
> +               if (cpu_has_feature(CPU_FTR_ARCH_31)) {
> +                       if (dar_in_hw_range(regs->dar, info))
> +                               return dawrx_constraints;
> +               } else {
> +                       return dawrx_constraints;
> +               }
> +               return false;
> +       }
Similar thing here, it could be:
        if ((cpu_has_feature(CPU_FTR_ARCH_31)) &&
!(dar_in_hw_range(regs->dar, info)))
            return false;
        else
            return dawrx_constraints;
> +
> +       if (ea_user_range_overlaps(ea, size, info))
>                 return dawrx_constraints;
>
> -       if (dar_hw_range_overlaps(regs->dar, size, info)) {
> +       if (ea_hw_range_overlaps(ea, size, info)) {
>                 if (dawrx_constraints) {
>                         info->type |= HW_BRK_TYPE_EXTRANEOUS_IRQ;
>                         return true;
> @@ -593,8 +610,17 @@ static bool check_constraints(struct pt_regs *regs, struct ppc_inst instr,
>         return false;
>  }
>
> +static int cache_op_size(void)
> +{
> +#ifdef __powerpc64__
> +       return ppc64_caches.l1d.block_size;
> +#else
> +       return L1_CACHE_BYTES;
> +#endif
> +}
> +
>  static void get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr,
> -                            int *type, int *size, bool *larx_stcx)
> +                            int *type, int *size, unsigned long *ea)
>  {
>         struct instruction_op op;
>
> @@ -602,16 +628,23 @@ static void get_instr_detail(struct pt_regs *regs, struct ppc_inst *instr,
>                 return;
>
>         analyse_instr(&op, regs, *instr);
> -
> -       /*
> -        * Set size = 8 if analyse_instr() fails. If it's a userspace
> -        * watchpoint(valid or extraneous), we can notify user about it.
> -        * If it's a kernel watchpoint, instruction  emulation will fail
> -        * in stepping_handler() and watchpoint will be disabled.
> -        */
>         *type = GETTYPE(op.type);
> -       *size = !(*type == UNKNOWN) ? GETSIZE(op.type) : 8;
> -       *larx_stcx = (*type == LARX || *type == STCX);
> +       *ea = op.ea;
> +#ifdef __powerpc64__
> +       if (!(regs->msr & MSR_64BIT))
> +               *ea &= 0xffffffffUL;
> +#endif
> +
> +       *size = GETSIZE(op.type);
> +       if (*type == CACHEOP) {
> +               *size = cache_op_size();
> +               *ea &= ~(*size - 1);
> +       }
Again related to CACHEOP, should these changes be mentioned in the
commit message?
> +}
> +
> +static bool is_larx_stcx_instr(int type)
> +{
> +       return type == LARX || type == STCX;
>  }
>
>  /*
> @@ -678,7 +711,7 @@ int hw_breakpoint_handler(struct die_args *args)
>         struct ppc_inst instr = ppc_inst(0);
>         int type = 0;
>         int size = 0;
> -       bool larx_stcx = false;
> +       unsigned long ea;
>
>         /* Disable breakpoints during exception handling */
>         hw_breakpoint_disable();
> @@ -692,7 +725,7 @@ int hw_breakpoint_handler(struct die_args *args)
>         rcu_read_lock();
>
>         if (!IS_ENABLED(CONFIG_PPC_8xx))
> -               get_instr_detail(regs, &instr, &type, &size, &larx_stcx);
> +               get_instr_detail(regs, &instr, &type, &size, &ea);
>
>         for (i = 0; i < nr_wp_slots(); i++) {
>                 bp[i] = __this_cpu_read(bp_per_reg[i]);
> @@ -702,7 +735,7 @@ int hw_breakpoint_handler(struct die_args *args)
>                 info[i] = counter_arch_bp(bp[i]);
>                 info[i]->type &= ~HW_BRK_TYPE_EXTRANEOUS_IRQ;
>
> -               if (check_constraints(regs, instr, type, size, info[i])) {
> +               if (check_constraints(regs, instr, ea, type, size, info[i])) {
>                         if (!IS_ENABLED(CONFIG_PPC_8xx) &&
>                             ppc_inst_equal(instr, ppc_inst(0))) {
>                                 handler_error(bp[i], info[i]);
> @@ -744,7 +777,7 @@ int hw_breakpoint_handler(struct die_args *args)
>         }
>
>         if (!IS_ENABLED(CONFIG_PPC_8xx)) {
> -               if (larx_stcx) {
> +               if (is_larx_stcx_instr(type)) {
>                         for (i = 0; i < nr_wp_slots(); i++) {
>                                 if (!hit[i])
>                                         continue;
> --
> 2.26.2
>

^ permalink raw reply

* Re: [PATCH 09/15] powerpc/powernv/sriov: Factor out M64 BAR setup
From: Alexey Kardashevskiy @ 2020-07-15  2:09 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200710052340.737567-10-oohall@gmail.com>



On 10/07/2020 15:23, Oliver O'Halloran wrote:
> The sequence required to use the single PE BAR mode is kinda janky and
> requires a little explanation. The API was designed with P7-IOC style
> windows where the setup process is something like:
> 
> 1. Configure the window start / end address
> 2. Enable the window
> 3. Map the segments of each window to the PE
> 
> For Single PE BARs the process is:
> 
> 1. Set the PE for segment zero on a disabled window
> 2. Set the range
> 3. Enable the window
> 
> Move the OPAL calls into their own helper functions where the quirks can be
> contained.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>


I'd use "segmented" instead of "accordion". Otherwise,

Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>




> ---
>  arch/powerpc/platforms/powernv/pci-sriov.c | 132 ++++++++++++++++-----
>  1 file changed, 103 insertions(+), 29 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
> index e4c65cb49757..d53a85ccb538 100644
> --- a/arch/powerpc/platforms/powernv/pci-sriov.c
> +++ b/arch/powerpc/platforms/powernv/pci-sriov.c
> @@ -320,6 +320,102 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
>  	return 0;
>  }
>  
> +
> +/*
> + * PHB3 and beyond support "accordion" windows. The window's address range
> + * is subdivided into phb->ioda.total_pe_num segments and there's a 1-1
> + * mapping between PEs and segments.
> + *
> + * They're called that because as the window size changes the segment sizes
> + * change with it. Sort of like an accordion, sort of.
> + */
> +static int64_t pnv_ioda_map_m64_accordion(struct pnv_phb *phb,
> +					  int window_id,
> +					  resource_size_t start,
> +					  resource_size_t size)
> +{
> +	int64_t rc;
> +
> +	rc = opal_pci_set_phb_mem_window(phb->opal_id,
> +					 OPAL_M64_WINDOW_TYPE,
> +					 window_id,
> +					 start,
> +					 0, /* unused */
> +					 size);
> +	if (rc)
> +		goto out;
> +
> +	rc = opal_pci_phb_mmio_enable(phb->opal_id,
> +				      OPAL_M64_WINDOW_TYPE,
> +				      window_id,
> +				      OPAL_ENABLE_M64_SPLIT);
> +out:
> +	if (rc)
> +		pr_err("Failed to map M64 window #%d: %lld\n", window_id, rc);
> +
> +	return rc;
> +}
> +
> +static int64_t pnv_ioda_map_m64_single(struct pnv_phb *phb,
> +				       int pe_num,
> +				       int window_id,
> +				       resource_size_t start,
> +				       resource_size_t size)
> +{
> +	int64_t rc;
> +
> +	/*
> +	 * The API for setting up m64 mmio windows seems to have been designed
> +	 * with P7-IOC in mind. For that chip each M64 BAR (window) had a fixed
> +	 * split of 8 equally sized segments each of which could individually
> +	 * assigned to a PE.
> +	 *
> +	 * The problem with this is that the API doesn't have any way to
> +	 * communicate the number of segments we want on a BAR. This wasn't
> +	 * a problem for p7-ioc since you didn't have a choice, but the
> +	 * single PE windows added in PHB3 don't map cleanly to this API.
> +	 *
> +	 * As a result we've got this slightly awkward process where we
> +	 * call opal_pci_map_pe_mmio_window() to put the single in single
> +	 * PE mode, and set the PE for the window before setting the address
> +	 * bounds. We need to do it this way because the single PE windows
> +	 * for PHB3 have different alignment requirements on PHB3.
> +	 */
> +	rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> +					 pe_num,
> +					 OPAL_M64_WINDOW_TYPE,
> +					 window_id,
> +					 0);
> +	if (rc)
> +		goto out;
> +
> +	/*
> +	 * NB: In single PE mode the window needs to be aligned to 32MB
> +	 */
> +	rc = opal_pci_set_phb_mem_window(phb->opal_id,
> +					 OPAL_M64_WINDOW_TYPE,
> +					 window_id,
> +					 start,
> +					 0, /* ignored by FW, m64 is 1-1 */
> +					 size);
> +	if (rc)
> +		goto out;
> +
> +	/*
> +	 * Now actually enable it. We specified the BAR should be in "non-split"
> +	 * mode so FW will validate that the BAR is in single PE mode.
> +	 */
> +	rc = opal_pci_phb_mmio_enable(phb->opal_id,
> +				      OPAL_M64_WINDOW_TYPE,
> +				      window_id,
> +				      OPAL_ENABLE_M64_NON_SPLIT);
> +out:
> +	if (rc)
> +		pr_err("Error mapping single PE BAR\n");
> +
> +	return rc;
> +}
> +
>  static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>  {
>  	struct pnv_iov_data   *iov;
> @@ -330,7 +426,6 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>  	int64_t                rc;
>  	int                    total_vfs;
>  	resource_size_t        size, start;
> -	int                    pe_num;
>  	int                    m64_bars;
>  
>  	phb = pci_bus_to_pnvhb(pdev->bus);
> @@ -359,49 +454,28 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>  			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
>  			set_bit(win, iov->used_m64_bar_mask);
>  
> +
>  			if (iov->m64_single_mode) {
>  				size = pci_iov_resource_size(pdev,
>  							PCI_IOV_RESOURCES + i);
>  				start = res->start + size * j;
> +				rc = pnv_ioda_map_m64_single(phb, win,
> +							     iov->pe_num_map[j],
> +							     start,
> +							     size);
>  			} else {
>  				size = resource_size(res);
>  				start = res->start;
> -			}
>  
> -			/* Map the M64 here */
> -			if (iov->m64_single_mode) {
> -				pe_num = iov->pe_num_map[j];
> -				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
> -						pe_num, OPAL_M64_WINDOW_TYPE,
> -						win, 0);
> +				rc = pnv_ioda_map_m64_accordion(phb, win, start,
> +								size);
>  			}
>  
> -			rc = opal_pci_set_phb_mem_window(phb->opal_id,
> -						 OPAL_M64_WINDOW_TYPE,
> -						 win,
> -						 start,
> -						 0, /* unused */
> -						 size);
> -
> -
>  			if (rc != OPAL_SUCCESS) {
>  				dev_err(&pdev->dev, "Failed to map M64 window #%d: %lld\n",
>  					win, rc);
>  				goto m64_failed;
>  			}
> -
> -			if (iov->m64_single_mode)
> -				rc = opal_pci_phb_mmio_enable(phb->opal_id,
> -				     OPAL_M64_WINDOW_TYPE, win, 2);
> -			else
> -				rc = opal_pci_phb_mmio_enable(phb->opal_id,
> -				     OPAL_M64_WINDOW_TYPE, win, 1);
> -
> -			if (rc != OPAL_SUCCESS) {
> -				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
> -					win, rc);
> -				goto m64_failed;
> -			}
>  		}
>  	}
>  	return 0;
> 

-- 
Alexey

^ permalink raw reply

* Re: [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86
From: Benjamin Herrenschmidt @ 2020-07-15  2:14 UTC (permalink / raw)
  To: Kjetil Oftedal, Bjorn Helgaas
  Cc: linux-pci, Keith Busch, Paul Mackerras, sparclinux, Toan Le,
	Greg Ungerer, Marek Vasut, Rob Herring, Lorenzo Pieralisi,
	Sagi Grimberg, Russell King, Ley Foon Tan, Christoph Hellwig,
	Geert Uytterhoeven, Kevin Hilman, Jakub Kicinski, Matt Turner,
	linux-kernel-mentees, Guenter Roeck, Arnd Bergmann, Ray Jui,
	Jens Axboe, Ivan Kokshaysky, Shuah Khan, bjorn, Boris Ostrovsky,
	Richard Henderson, Juergen Gross, Bjorn Helgaas,
	Thomas Bogendoerfer, Scott Branden, Jingoo Han,
	Saheed O. Bolarinwa, linux-kernel@vger.kernel.org, Philipp Zabel,
	Greg Kroah-Hartman, Gustavo Pimentel, linuxppc-dev,
	David S. Miller, Heiner Kallweit
In-Reply-To: <CALMQjD9OVTbLVPGX-9+GDekZ02Wsqdz57-k1uCBMXC7cT3K_7w@mail.gmail.com>

On Tue, 2020-07-14 at 23:02 +0200, Kjetil Oftedal wrote:
> > 
> > > For b), it might be nice to also change other aspects of the
> > > interface, e.g. passing a pci_host_bridge pointer plus bus number
> > > instead of a pci_bus pointer, or having the callback in the
> > > pci_host_bridge structure.
> > 
> > I like this idea a lot, too.  I think the fact that
> > pci_bus_read_config_word() requires a pci_bus * complicates things in
> > a few places.
> > 
> > I think it's completely separate, as you say, and we should defer it
> > for now because even part a) is a lot of work.  I added it to my list
> > of possible future projects.
> > 
> 
> What about strange PCI devices such as Non-Transparent bridges?
> They will require their own PCI Config space accessors that is not
> connected to a host bridge if one wants to do some sort of
> punch-through enumeration.
> I guess the kernel doesn't care much about them?

Well, today they would require a pci_bus anyway.. . so if you want to do
that sort of funny trick you may as well create a "virtual" host bridge.

Cheers,
Ben.



^ permalink raw reply

* Re: [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86
From: Benjamin Herrenschmidt @ 2020-07-15  2:12 UTC (permalink / raw)
  To: Bjorn Helgaas, Arnd Bergmann
  Cc: linux-pci, Keith Busch, Paul Mackerras, sparclinux, Toan Le,
	Greg Ungerer, Marek Vasut, Rob Herring, Lorenzo Pieralisi,
	Sagi Grimberg, Russell King, Ley Foon Tan, Christoph Hellwig,
	Geert Uytterhoeven, Kevin Hilman, Jakub Kicinski, Matt Turner,
	linux-kernel-mentees, Guenter Roeck, Ray Jui, Jens Axboe,
	Ivan Kokshaysky, Shuah Khan, bjorn, Boris Ostrovsky,
	Richard Henderson, Juergen Gross, Bjorn Helgaas,
	Thomas Bogendoerfer, Scott Branden, Jingoo Han,
	Saheed O. Bolarinwa, linux-kernel@vger.kernel.org, Philipp Zabel,
	Greg Kroah-Hartman, Gustavo Pimentel, linuxppc-dev,
	David S. Miller, Heiner Kallweit
In-Reply-To: <20200714184550.GA397277@bjorn-Precision-5520>

On Tue, 2020-07-14 at 13:45 -0500, Bjorn Helgaas wrote:
> 
> > fail for valid arguments on a valid pci_device* ?
> 
> I really like this idea.
> 
> pci_write_config_*() has one return value, and only 100ish of 2500
> callers check for errors.  It's sometimes possible for config
> accessors to detect PCI errors and return failure, e.g., device was
> removed or didn't respond, but most of them don't, and detecting
> these
> errors is not really that valuable.
> 
> pci_read_config_*() is much more interesting because it returns two
> things, the function return value and the value read from the PCI
> device, and it's complicated to check both. 

  .../...

I agree. It's a mess at the moment.

We have separate mechanism to convey PCI errors (among other things the
channel state) which should apply to config space when detection is
possible.

I think returning all 1's is the right thing to do here and avoids odd
duplicate error detection logic which I bet you is never properly
tested.

> > For b), it might be nice to also change other aspects of the
> > interface, e.g. passing a pci_host_bridge pointer plus bus number
> > instead of a pci_bus pointer, or having the callback in the
> > pci_host_bridge structure.
> 
> I like this idea a lot, too.  I think the fact that
> pci_bus_read_config_word() requires a pci_bus * complicates things in
> a few places.
> 
> I think it's completely separate, as you say, and we should defer it
> for now because even part a) is a lot of work.  I added it to my list
> of possible future projects.

Agreed on both points.

Cheers,
Ben.



^ permalink raw reply

* Re: [PATCH 08/15] powerpc/powernv/sriov: Simplify used window tracking
From: Oliver O'Halloran @ 2020-07-15  1:41 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev
In-Reply-To: <86273d91-6c38-6d63-4f5b-1ed0d619f465@ozlabs.ru>

On Wed, Jul 15, 2020 at 11:34 AM Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>
> On 10/07/2020 15:23, Oliver O'Halloran wrote:
> > diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> > index 0156d7d17f7d..58c97e60c3db 100644
> > --- a/arch/powerpc/platforms/powernv/pci.h
> > +++ b/arch/powerpc/platforms/powernv/pci.h
> > @@ -243,8 +243,11 @@ struct pnv_iov_data {
> >       /* Did we map the VF BARs with single-PE IODA BARs? */
> >       bool    m64_single_mode;
> >
> > -     int     (*m64_map)[PCI_SRIOV_NUM_BARS];
> > -#define IODA_INVALID_M64        (-1)
> > +     /*
> > +      * Bit mask used to track which m64 windows that we used to map the
>
>
> Language question: either "which" or "that" but both?

Uhhhh... I don't speak english

^ permalink raw reply

* Re: [PATCH 03/15] powerpc/powernv/pci: Add explicit tracking of the DMA setup state
From: Oliver O'Halloran @ 2020-07-15  1:38 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev, Cédric Le Goater
In-Reply-To: <34f7eea2-4ace-9931-7b5f-98ec159f3532@ozlabs.ru>

On Tue, Jul 14, 2020 at 5:21 PM Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>
> On 14/07/2020 15:58, Oliver O'Halloran wrote:
> > On Tue, Jul 14, 2020 at 3:37 PM Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
> >>
> >> On 10/07/2020 15:23, Oliver O'Halloran wrote:
> >>> This also means the only remaining user of the old "DMA Weight" code is
> >>> the IODA1 DMA setup code that it was originally added for, which is good.
> >>
> >>
> >> Is ditching IODA1 in the plan? :)
> >
> > That or separating out the pci_controller_ops for IODA1 and IODA2 so
> > we can stop any IODA2 specific changes from breaking it.
>
> Is IODA1 tested at all these days? Or, is anyone running upstream
> kernels anywhere and keeps shouting when it does not work on IODA1? Thanks,

Cedric has a P7 with OPAL. That's probably the one left though.

^ permalink raw reply

* Re: [PATCH 08/15] powerpc/powernv/sriov: Simplify used window tracking
From: Alexey Kardashevskiy @ 2020-07-15  1:34 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200710052340.737567-9-oohall@gmail.com>



On 10/07/2020 15:23, Oliver O'Halloran wrote:
> No need for the multi-dimensional arrays, just use a bitmap.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
>  arch/powerpc/platforms/powernv/pci-sriov.c | 48 +++++++---------------
>  arch/powerpc/platforms/powernv/pci.h       |  7 +++-
>  2 files changed, 20 insertions(+), 35 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
> index 216ceeff69b0..e4c65cb49757 100644
> --- a/arch/powerpc/platforms/powernv/pci-sriov.c
> +++ b/arch/powerpc/platforms/powernv/pci-sriov.c
> @@ -303,28 +303,20 @@ static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
>  {
>  	struct pnv_iov_data   *iov;
>  	struct pnv_phb        *phb;
> -	int                    i, j;
> -	int                    m64_bars;
> +	int window_id;
>  
>  	phb = pci_bus_to_pnvhb(pdev->bus);
>  	iov = pnv_iov_get(pdev);
>  
> -	if (iov->m64_single_mode)
> -		m64_bars = num_vfs;
> -	else
> -		m64_bars = 1;
> +	for_each_set_bit(window_id, iov->used_m64_bar_mask, 64) {
> +		opal_pci_phb_mmio_enable(phb->opal_id,
> +					 OPAL_M64_WINDOW_TYPE,
> +					 window_id,
> +					 0);
>  
> -	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
> -		for (j = 0; j < m64_bars; j++) {
> -			if (iov->m64_map[j][i] == IODA_INVALID_M64)
> -				continue;
> -			opal_pci_phb_mmio_enable(phb->opal_id,
> -				OPAL_M64_WINDOW_TYPE, iov->m64_map[j][i], 0);
> -			clear_bit(iov->m64_map[j][i], &phb->ioda.m64_bar_alloc);
> -			iov->m64_map[j][i] = IODA_INVALID_M64;
> -		}
> +		clear_bit(window_id, &phb->ioda.m64_bar_alloc);
> +	}
>  
> -	kfree(iov->m64_map);
>  	return 0;
>  }
>  
> @@ -350,23 +342,14 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>  	else
>  		m64_bars = 1;
>  
> -	iov->m64_map = kmalloc_array(m64_bars,
> -				     sizeof(*iov->m64_map),
> -				     GFP_KERNEL);
> -	if (!iov->m64_map)
> -		return -ENOMEM;
> -	/* Initialize the m64_map to IODA_INVALID_M64 */
> -	for (i = 0; i < m64_bars ; i++)
> -		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
> -			iov->m64_map[i][j] = IODA_INVALID_M64;
> -
> -
>  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>  		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>  		if (!res->flags || !res->parent)
>  			continue;
>  
>  		for (j = 0; j < m64_bars; j++) {
> +
> +			/* allocate a window ID for this BAR */
>  			do {
>  				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
>  						phb->ioda.m64_bar_idx + 1, 0);
> @@ -374,8 +357,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>  				if (win >= phb->ioda.m64_bar_idx + 1)
>  					goto m64_failed;
>  			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
> -
> -			iov->m64_map[j][i] = win;
> +			set_bit(win, iov->used_m64_bar_mask);
>  
>  			if (iov->m64_single_mode) {
>  				size = pci_iov_resource_size(pdev,
> @@ -391,12 +373,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>  				pe_num = iov->pe_num_map[j];
>  				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
>  						pe_num, OPAL_M64_WINDOW_TYPE,
> -						iov->m64_map[j][i], 0);
> +						win, 0);
>  			}
>  
>  			rc = opal_pci_set_phb_mem_window(phb->opal_id,
>  						 OPAL_M64_WINDOW_TYPE,
> -						 iov->m64_map[j][i],
> +						 win,
>  						 start,
>  						 0, /* unused */
>  						 size);
> @@ -410,10 +392,10 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
>  
>  			if (iov->m64_single_mode)
>  				rc = opal_pci_phb_mmio_enable(phb->opal_id,
> -				     OPAL_M64_WINDOW_TYPE, iov->m64_map[j][i], 2);
> +				     OPAL_M64_WINDOW_TYPE, win, 2);
>  			else
>  				rc = opal_pci_phb_mmio_enable(phb->opal_id,
> -				     OPAL_M64_WINDOW_TYPE, iov->m64_map[j][i], 1);
> +				     OPAL_M64_WINDOW_TYPE, win, 1);
>  
>  			if (rc != OPAL_SUCCESS) {
>  				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index 0156d7d17f7d..58c97e60c3db 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -243,8 +243,11 @@ struct pnv_iov_data {
>  	/* Did we map the VF BARs with single-PE IODA BARs? */
>  	bool    m64_single_mode;
>  
> -	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
> -#define IODA_INVALID_M64        (-1)
> +	/*
> +	 * Bit mask used to track which m64 windows that we used to map the


Language question: either "which" or "that" but both?


> +	 * SR-IOV BARs for this device.
> +	 */
> +	DECLARE_BITMAP(used_m64_bar_mask, 64);

64 here is the maximum number of M64's (which is 16 at the moment)? Can
we define this 64 somehow (appears twice in this patch alone)?

Anyway, the change is correct.

Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>



>  
>  	/*
>  	 * If we map the SR-IOV BARs with a segmented window then
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH v8 5/8] powerpc/vdso: Prepare for switching VDSO to generic C implementation.
From: Michael Ellerman @ 2020-07-15  1:04 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt, Paul Mackerras, nathanl
  Cc: linux-arch, arnd, linux-kernel, Tulio Magno Quites Machado Filho,
	luto, tglx, vincenzo.frascino, linuxppc-dev
In-Reply-To: <2a67c333893454868bbfda773ba4b01c20272a5d.1588079622.git.christophe.leroy@c-s.fr>

Christophe Leroy <christophe.leroy@c-s.fr> writes:
> Prepare for switching VDSO to generic C implementation in following
> patch. Here, we:
> - Modify __get_datapage() to take an offset
> - Prepare the helpers to call the C VDSO functions
> - Prepare the required callbacks for the C VDSO functions
> - Prepare the clocksource.h files to define VDSO_ARCH_CLOCKMODES
> - Add the C trampolines to the generic C VDSO functions
>
> powerpc is a bit special for VDSO as well as system calls in the
> way that it requires setting CR SO bit which cannot be done in C.
> Therefore, entry/exit needs to be performed in ASM.
>
> Implementing __arch_get_vdso_data() would clobber the link register,
> requiring the caller to save it. As the ASM calling function already
> has to set a stack frame and saves the link register before calling
> the C vdso function, retriving the vdso data pointer there is lighter.
...

> diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h
> new file mode 100644
> index 000000000000..4452897f9bd8
> --- /dev/null
> +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h
> @@ -0,0 +1,175 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __ASM_VDSO_GETTIMEOFDAY_H
> +#define __ASM_VDSO_GETTIMEOFDAY_H
> +
> +#include <asm/ptrace.h>
> +
> +#ifdef __ASSEMBLY__
> +
> +.macro cvdso_call funct
> +  .cfi_startproc
> +	PPC_STLU	r1, -STACK_FRAME_OVERHEAD(r1)
> +	mflr		r0
> +  .cfi_register lr, r0
> +	PPC_STL		r0, STACK_FRAME_OVERHEAD + PPC_LR_STKOFF(r1)

This doesn't work for me on ppc64(le) with glibc.

glibc doesn't create a stack frame before making the VDSO call, so the
store of r0 (LR) goes into the caller's frame, corrupting the saved LR,
leading to an infinite loop.

This is an example from a statically built program that calls
clock_gettime():

0000000010030cb0 <__clock_gettime>:
    10030cb0:   0e 10 40 3c     lis     r2,4110
    10030cb4:   00 7a 42 38     addi    r2,r2,31232
    10030cb8:   a6 02 08 7c     mflr    r0
    10030cbc:   ff ff 22 3d     addis   r9,r2,-1
    10030cc0:   58 6d 29 39     addi    r9,r9,27992
    10030cc4:   f0 ff c1 fb     std     r30,-16(r1)			<-- redzone store
    10030cc8:   78 23 9e 7c     mr      r30,r4
    10030ccc:   f8 ff e1 fb     std     r31,-8(r1)			<-- redzone store
    10030cd0:   78 1b 7f 7c     mr      r31,r3
    10030cd4:   10 00 01 f8     std     r0,16(r1)			<-- save LR to caller's frame
    10030cd8:   00 00 09 e8     ld      r0,0(r9)
    10030cdc:   00 00 20 2c     cmpdi   r0,0
    10030ce0:   50 00 82 41     beq     10030d30 <__clock_gettime+0x80>
    10030ce4:   a6 03 09 7c     mtctr   r0
    10030ce8:   21 04 80 4e     bctrl					<-- vdso call
    10030cec:   26 00 00 7c     mfcr    r0
    10030cf0:   00 10 09 74     andis.  r9,r0,4096
    10030cf4:   78 1b 69 7c     mr      r9,r3
    10030cf8:   28 00 82 40     bne     10030d20 <__clock_gettime+0x70>
    10030cfc:   b4 07 23 7d     extsw   r3,r9
    10030d00:   10 00 01 e8     ld      r0,16(r1)			<-- load saved LR, since clobbered by the VDSO
    10030d04:   f0 ff c1 eb     ld      r30,-16(r1)
    10030d08:   f8 ff e1 eb     ld      r31,-8(r1)
    10030d0c:   a6 03 08 7c     mtlr    r0				<-- restore LR
    10030d10:   20 00 80 4e     blr					<-- jumps to 10030cec


I'm kind of confused how it worked for you on 32-bit.

There's also no code to load/restore the TOC pointer on BE, which I
think we'll need to handle.

cheers

^ permalink raw reply

* Re: [PATCH v3 1/9] powerpc/watchpoint: Fix 512 byte boundary limit
From: Jordan Niethe @ 2020-07-15  1:01 UTC (permalink / raw)
  To: Ravi Bangoria
  Cc: Christophe Leroy, apopple, mikey, miltonm, peterz, oleg,
	Nicholas Piggin, linux-kernel, Paul Mackerras, jolsa, fweisbec,
	pedromfc, naveen.n.rao, linuxppc-dev, mingo
In-Reply-To: <20200708045046.135702-2-ravi.bangoria@linux.ibm.com>

On Wed, Jul 8, 2020 at 2:53 PM Ravi Bangoria
<ravi.bangoria@linux.ibm.com> wrote:
>
> Milton Miller reported that we are aligning start and end address to
> wrong size SZ_512M. It should be SZ_512. Fix that.
>
> While doing this change I also found a case where ALIGN() comparison
> fails. Within a given aligned range, ALIGN() of two addresses does not
> match when start address is pointing to the first byte and end address
> is pointing to any other byte except the first one. But that's not true
> for ALIGN_DOWN(). ALIGN_DOWN() of any two addresses within that range
> will always point to the first byte. So use ALIGN_DOWN() instead of
> ALIGN().
>
> Fixes: e68ef121c1f4 ("powerpc/watchpoint: Use builtin ALIGN*() macros")
> Reported-by: Milton Miller <miltonm@us.ibm.com>
> Signed-off-by: Ravi Bangoria <ravi.bangoria@linux.ibm.com>
I tested this with the ptrace-hwbreak selftest. Can confirm without
also changing to ALIGN_DOWN() then these tests will fail.
Tested-by: Jordan Niethe <jniethe5@gmail.com>
> ---
>  arch/powerpc/kernel/hw_breakpoint.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c
> index 0000daf0e1da..031e6defc08e 100644
> --- a/arch/powerpc/kernel/hw_breakpoint.c
> +++ b/arch/powerpc/kernel/hw_breakpoint.c
> @@ -419,7 +419,7 @@ static int hw_breakpoint_validate_len(struct arch_hw_breakpoint *hw)
>         if (dawr_enabled()) {
>                 max_len = DAWR_MAX_LEN;
>                 /* DAWR region can't cross 512 bytes boundary */
> -               if (ALIGN(start_addr, SZ_512M) != ALIGN(end_addr - 1, SZ_512M))
> +               if (ALIGN_DOWN(start_addr, SZ_512) != ALIGN_DOWN(end_addr - 1, SZ_512))
>                         return -EINVAL;
>         } else if (IS_ENABLED(CONFIG_PPC_8xx)) {
>                 /* 8xx can setup a range without limitation */
> --
> 2.26.2
>

^ permalink raw reply

* Re: [PATCH 07/15] powerpc/powernv/sriov: Rename truncate_iov
From: Alexey Kardashevskiy @ 2020-07-15  0:46 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200710052340.737567-8-oohall@gmail.com>



On 10/07/2020 15:23, Oliver O'Halloran wrote:
> This prevents SR-IOV being used by making the SR-IOV BAR resources
> unallocatable. Rename it to reflect what it actually does.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>


Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>


> ---
>  arch/powerpc/platforms/powernv/pci-sriov.c | 11 ++++++-----
>  1 file changed, 6 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
> index f4c74ab1284d..216ceeff69b0 100644
> --- a/arch/powerpc/platforms/powernv/pci-sriov.c
> +++ b/arch/powerpc/platforms/powernv/pci-sriov.c
> @@ -155,7 +155,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>  
>  	iov = kzalloc(sizeof(*iov), GFP_KERNEL);
>  	if (!iov)
> -		goto truncate_iov;
> +		goto disable_iov;
>  	pdev->dev.archdata.iov_data = iov;
>  
>  	total_vfs = pci_sriov_get_totalvfs(pdev);
> @@ -170,7 +170,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>  			dev_warn(&pdev->dev, "Don't support SR-IOV with"
>  					" non M64 VF BAR%d: %pR. \n",
>  				 i, res);
> -			goto truncate_iov;
> +			goto disable_iov;
>  		}
>  
>  		total_vf_bar_sz += pci_iov_resource_size(pdev,
> @@ -209,7 +209,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>  		 * mode is 32MB.
>  		 */
>  		if (iov->m64_single_mode && (size < SZ_32M))
> -			goto truncate_iov;
> +			goto disable_iov;
> +
>  		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
>  		res->end = res->start + size * mul - 1;
>  		dev_dbg(&pdev->dev, "                       %pR\n", res);
> @@ -220,8 +221,8 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>  
>  	return;
>  
> -truncate_iov:
> -	/* To save MMIO space, IOV BAR is truncated. */
> +disable_iov:
> +	/* Save ourselves some MMIO space by disabling the unusable BARs */
>  	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
>  		res = &pdev->resource[i + PCI_IOV_RESOURCES];
>  		res->flags = 0;
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH 06/15] powerpc/powernv/sriov: Explain how SR-IOV works on PowerNV
From: Alexey Kardashevskiy @ 2020-07-15  0:40 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200710052340.737567-7-oohall@gmail.com>



On 10/07/2020 15:23, Oliver O'Halloran wrote:
> SR-IOV support on PowerNV is a byzantine maze of hooks. I have no idea
> how anyone is supposed to know how it works except through a lot of
> stuffering. Write up some docs about the overall story to help out
> the next sucker^Wperson who needs to tinker with it.


Sounds about right :)

Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>



> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
>  arch/powerpc/platforms/powernv/pci-sriov.c | 130 +++++++++++++++++++++
>  1 file changed, 130 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-sriov.c b/arch/powerpc/platforms/powernv/pci-sriov.c
> index 080ea39f5a83..f4c74ab1284d 100644
> --- a/arch/powerpc/platforms/powernv/pci-sriov.c
> +++ b/arch/powerpc/platforms/powernv/pci-sriov.c
> @@ -12,6 +12,136 @@
>  /* for pci_dev_is_added() */
>  #include "../../../../drivers/pci/pci.h"
>  
> +/*
> + * The majority of the complexity in supporting SR-IOV on PowerNV comes from
> + * the need to put the MMIO space for each VF into a separate PE. Internally
> + * the PHB maps MMIO addresses to a specific PE using the "Memory BAR Table".
> + * The MBT historically only applied to the 64bit MMIO window of the PHB
> + * so it's common to see it referred to as the "M64BT".
> + *
> + * An MBT entry stores the mapped range as an <base>,<mask> pair. This forces
> + * the address range that we want to map to be power-of-two sized and aligned.
> + * For conventional PCI devices this isn't really an issue since PCI device BARs
> + * have the same requirement.
> + *
> + * For a SR-IOV BAR things are a little more awkward since size and alignment
> + * are not coupled. The alignment is set based on the the per-VF BAR size, but
> + * the total BAR area is: number-of-vfs * per-vf-size. The number of VFs
> + * isn't necessarily a power of two, so neither is the total size. To fix that
> + * we need to finesse (read: hack) the Linux BAR allocator so that it will
> + * allocate the SR-IOV BARs in a way that lets us map them using the MBT.
> + *
> + * The changes to size and alignment that we need to do depend on the "mode"
> + * of MBT entry that we use. We only support SR-IOV on PHB3 (IODA2) and above,
> + * so as a baseline we can assume that we have the following BAR modes
> + * available:
> + *
> + *   NB: $PE_COUNT is the number of PEs that the PHB supports.
> + *
> + * a) A segmented BAR that splits the mapped range into $PE_COUNT equally sized
> + *    segments. The n'th segment is mapped to the n'th PE.
> + * b) An un-segmented BAR that maps the whole address range to a specific PE.
> + *
> + *
> + * We prefer to use mode a) since it only requires one MBT entry per SR-IOV BAR
> + * For comparison b) requires one entry per-VF per-BAR, or:
> + * (num-vfs * num-sriov-bars) in total. To use a) we need the size of each segment
> + * to equal the size of the per-VF BAR area. So:
> + *
> + *	new_size = per-vf-size * number-of-PEs
> + *
> + * The alignment for the SR-IOV BAR also needs to be changed from per-vf-size
> + * to "new_size", calculated above. Implementing this is a convoluted process
> + * which requires several hooks in the PCI core:
> + *
> + * 1. In pcibios_add_device() we call pnv_pci_ioda_fixup_iov().
> + *
> + *    At this point the device has been probed and the device's BARs are sized,
> + *    but no resource allocations have been done. The SR-IOV BARs are sized
> + *    based on the maximum number of VFs supported by the device and we need
> + *    to increase that to new_size.
> + *
> + * 2. Later, when Linux actually assigns resources it tries to make the resource
> + *    allocations for each PCI bus as compact as possible. As a part of that it
> + *    sorts the BARs on a bus by their required alignment, which is calculated
> + *    using pci_resource_alignment().
> + *
> + *    For IOV resources this goes:
> + *    pci_resource_alignment()
> + *        pci_sriov_resource_alignment()
> + *            pcibios_sriov_resource_alignment()
> + *                pnv_pci_iov_resource_alignment()
> + *
> + *    Our hook overrides the default alignment, equal to the per-vf-size, with
> + *    new_size computed above.
> + *
> + * 3. When userspace enables VFs for a device:
> + *
> + *    sriov_enable()
> + *       pcibios_sriov_enable()
> + *           pnv_pcibios_sriov_enable()
> + *
> + *    This is where we actually allocate PE numbers for each VF and setup the
> + *    MBT mapping for each SR-IOV BAR. In steps 1) and 2) we setup an "arena"
> + *    where each MBT segment is equal in size to the VF BAR so we can shift
> + *    around the actual SR-IOV BAR location within this arena. We need this
> + *    ability because the PE space is shared by all devices on the same PHB.
> + *    When using mode a) described above segment 0 in maps to PE#0 which might
> + *    be already being used by another device on the PHB.
> + *
> + *    As a result we need allocate a contigious range of PE numbers, then shift
> + *    the address programmed into the SR-IOV BAR of the PF so that the address
> + *    of VF0 matches up with the segment corresponding to the first allocated
> + *    PE number. This is handled in pnv_pci_vf_resource_shift().
> + *
> + *    Once all that is done we return to the PCI core which then enables VFs,
> + *    scans them and creates pci_devs for each. The init process for a VF is
> + *    largely the same as a normal device, but the VF is inserted into the IODA
> + *    PE that we allocated for it rather than the PE associated with the bus.
> + *
> + * 4. When userspace disables VFs we unwind the above in
> + *    pnv_pcibios_sriov_disable(). Fortunately this is relatively simple since
> + *    we don't need to validate anything, just tear down the mappings and
> + *    move SR-IOV resource back to its "proper" location.
> + *
> + * That's how mode a) works. In theory mode b) (single PE mapping) is less work
> + * since we can map each individual VF with a separate BAR. However, there's a
> + * few limitations:
> + *
> + * 1) For IODA2 mode b) has a minimum alignment requirement of 32MB. This makes
> + *    it only usable for devices with very large per-VF BARs. Such devices are
> + *    similar to Big Foot. They definitely exist, but I've never seen one.
> + *
> + * 2) The number of MBT entries that we have is limited. PHB3 and PHB4 only
> + *    16 total and some are needed for. Most SR-IOV capable network cards can support
> + *    more than 16 VFs on each port.
> + *
> + * We use b) when using a) would use more than 1/4 of the entire 64 bit MMIO
> + * window of the PHB.
> + *
> + *
> + *
> + * PHB4 (IODA3) added a few new features that would be useful for SR-IOV. It
> + * allowed the MBT to map 32bit MMIO space in addition to 64bit which allows
> + * us to support SR-IOV BARs in the 32bit MMIO window. This is useful since
> + * the Linux BAR allocation will place any BAR marked as non-prefetchable into
> + * the non-prefetchable bridge window, which is 32bit only. It also added two
> + * new modes:
> + *
> + * c) A segmented BAR similar to a), but each segment can be individually
> + *    mapped to any PE. This is matches how the 32bit MMIO window worked on
> + *    IODA1&2.
> + *
> + * d) A segmented BAR with 8, 64, or 128 segments. This works similarly to a),
> + *    but with fewer segments and configurable base PE.
> + *
> + *    i.e. The n'th segment maps to the (n + base)'th PE.
> + *
> + *    The base PE is also required to be a multiple of the window size.
> + *
> + * Unfortunately, the OPAL API doesn't currently (as of skiboot v6.6) allow us
> + * to exploit any of the IODA3 features.
> + */
>  
>  static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>  {
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH 03/15] powerpc/powernv/pci: Add explicit tracking of the DMA setup state
From: Alexey Kardashevskiy @ 2020-07-15  0:23 UTC (permalink / raw)
  To: Oliver O'Halloran; +Cc: linuxppc-dev
In-Reply-To: <34f7eea2-4ace-9931-7b5f-98ec159f3532@ozlabs.ru>



On 14/07/2020 17:21, Alexey Kardashevskiy wrote:
> 
> 
> On 14/07/2020 15:58, Oliver O'Halloran wrote:
>> On Tue, Jul 14, 2020 at 3:37 PM Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>>>
>>> On 10/07/2020 15:23, Oliver O'Halloran wrote:
>>>> There's an optimisation in the PE setup which skips performing DMA
>>>> setup for a PE if we only have bridges in a PE. The assumption being
>>>> that only "real" devices will DMA to system memory, which is probably
>>>> fair. However, if we start off with only bridge devices in a PE then
>>>> add a non-bridge device the new device won't be able to use DMA  because
>>>> we never configured it.
>>>>
>>>> Fix this (admittedly pretty weird) edge case by tracking whether we've done
>>>> the DMA setup for the PE or not. If a non-bridge device is added to the PE
>>>> (via rescan or hotplug, or whatever) we can set up DMA on demand.
>>>
>>> So hotplug does not work on powernv then, right? I thought you tested it
>>> a while ago, or this patch is the result of that attempt? If it is, then
>>
>> It mostly works. Just the really niche case of hot plugging a bridge,
>> then later on hot plugging a device into the same bus which wouldn't
>> work.
> 
> Do not you have to have a slot (which is a bridge) for hotplug in the
> first place, to hotplug the bridge?


As discussed elsewhere, I missed that it is a non bridge device on the
same bus with previously plugged bridge. Now it all makes sense and


Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>


-- 
Alexey

^ permalink raw reply

* [PATCH] pseries: Fix 64 bit logical memory block panic
From: Anton Blanchard @ 2020-07-15  0:08 UTC (permalink / raw)
  To: benh, paulus, mpe, nathanl; +Cc: linuxppc-dev

Booting with a 4GB LMB size causes us to panic:

  qemu-system-ppc64: OS terminated: OS panic:
      Memory block size not suitable: 0x0

Fix pseries_memory_block_size() to handle 64 bit LMBs.

Cc: stable@vger.kernel.org
Signed-off-by: Anton Blanchard <anton@ozlabs.org>
---
 arch/powerpc/platforms/pseries/hotplug-memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 5ace2f9a277e..6574ac33e887 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -27,7 +27,7 @@ static bool rtas_hp_event;
 unsigned long pseries_memory_block_size(void)
 {
 	struct device_node *np;
-	unsigned int memblock_size = MIN_MEMORY_BLOCK_SIZE;
+	uint64_t memblock_size = MIN_MEMORY_BLOCK_SIZE;
 	struct resource r;
 
 	np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
-- 
2.26.2


^ permalink raw reply related

* Re: [PATCH v3 03/12] powerpc/kexec_file: add helper functions for getting memory ranges
From: Thiago Jung Bauermann @ 2020-07-14 23:49 UTC (permalink / raw)
  To: Hari Bathini
  Cc: Pingfan Liu, Nayna Jain, Kexec-ml, Mahesh J Salgaonkar,
	Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain, Petr Tesarik,
	Andrew Morton, Dave Young, Vivek Goyal, Eric Biederman
In-Reply-To: <159466087136.24747.16494497863685481495.stgit@hbathini.in.ibm.com>


Hello Hari,

Hari Bathini <hbathini@linux.ibm.com> writes:

> In kexec case, the kernel to be loaded uses the same memory layout as
> the running kernel. So, passing on the DT of the running kernel would
> be good enough.
>
> But in case of kdump, different memory ranges are needed to manage
> loading the kdump kernel, booting into it and exporting the elfcore
> of the crashing kernel. The ranges are exlude memory ranges, usable

s/exlude/exclude/

> memory ranges, reserved memory ranges and crash memory ranges.
>
> Exclude memory ranges specify the list of memory ranges to avoid while
> loading kdump segments. Usable memory ranges list the memory ranges
> that could be used for booting kdump kernel. Reserved memory ranges
> list the memory regions for the loading kernel's reserve map. Crash
> memory ranges list the memory ranges to be exported as the crashing
> kernel's elfcore.
>
> Add helper functions for setting up the above mentioned memory ranges.
> This helpers facilitate in understanding the subsequent changes better
> and make it easy to setup the different memory ranges listed above, as
> and when appropriate.
>
> Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
> Tested-by: Pingfan Liu <piliu@redhat.com>

<snip>

> +/**
> + * get_mem_rngs_size - Get the allocated size of mrngs based on
> + *                     max_nr_ranges and chunk size.
> + * @mrngs:             Memory ranges.
> + *
> + * Returns the maximum no. of ranges.

This isn't correct. It returns the maximum size of @mrngs.

> + */
> +static inline size_t get_mem_rngs_size(struct crash_mem *mrngs)
> +{
> +	size_t size;
> +
> +	if (!mrngs)
> +		return 0;
> +
> +	size = (sizeof(struct crash_mem) +
> +		(mrngs->max_nr_ranges * sizeof(struct crash_mem_range)));
> +
> +	/*
> +	 * Memory is allocated in size multiple of MEM_RANGE_CHUNK_SZ.
> +	 * So, align to get the actual length.
> +	 */
> +	return ALIGN(size, MEM_RANGE_CHUNK_SZ);
> +}

<snip>

> +/**
> + * add_tce_mem_ranges - Adds tce-table range to the given memory ranges list.
> + * @mem_ranges:         Range list to add the memory range(s) to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_tce_mem_ranges(struct crash_mem **mem_ranges)
> +{
> +	struct device_node *dn;
> +	int ret;
> +
> +	for_each_node_by_type(dn, "pci") {
> +		u64 base;
> +		u32 size;
> +
> +		ret = of_property_read_u64(dn, "linux,tce-base", &base);
> +		ret |= of_property_read_u32(dn, "linux,tce-size", &size);
> +		if (!ret)

Shouldn't the condition be `ret` instead of `!ret`?

> +			continue;
> +
> +		ret = add_mem_range(mem_ranges, base, size);
> +		if (ret)
> +			break;
> +	}
> +
> +	return ret;
> +}
> +
> +/**
> + * add_initrd_mem_range - Adds initrd range to the given memory ranges list,
> + *                        if the initrd was retained.
> + * @mem_ranges:           Range list to add the memory range to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_initrd_mem_range(struct crash_mem **mem_ranges)
> +{
> +	u64 base, end;
> +	int ret = 0;
> +	char *str;
> +
> +	/* This range means something only if initrd was retained */
> +	str = strstr(saved_command_line, "retain_initrd");
> +	if (!str)
> +		return 0;
> +
> +	ret = of_property_read_u64(of_chosen, "linux,initrd-start", &base);
> +	ret |= of_property_read_u64(of_chosen, "linux,initrd-end", &end);
> +	if (!ret)
> +		ret = add_mem_range(mem_ranges, base, end - base + 1);
> +	return ret;
> +}
> +
> +/**
> + * add_htab_mem_range - Adds htab range to the given memory ranges list,
> + *                      if it exists
> + * @mem_ranges:         Range list to add the memory range to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_htab_mem_range(struct crash_mem **mem_ranges)
> +{
> +#ifdef CONFIG_PPC_BOOK3S_64
> +	int ret;
> +
> +	if (!htab_address)
> +		return 0;
> +
> +	ret = add_mem_range(mem_ranges, __pa(htab_address), htab_size_bytes);
> +	return ret;
> +#else
> +	return 0;
> +#endif
> +}

If I'm not mistaken, this is not the preferred way of having alternative
implementations of a function. The "Conditional Compilation" section of
the coding style document doesn't mention this directly, but does say
that it's better to put the conditionals in a header file.

In this case, I would do this in <asm/kexec_ranges.h>

#ifdef CONFIG_PPC_BOOK3S_64
int add_htab_mem_range(struct crash_mem **mem_ranges);
#else
static inline int add_htab_mem_range(struct crash_mem **mem_ranges)
{
	return 0;
}
#endif

And in ranges.c just surround the add_htab_mem_range() definition with
#ifdef CONFIG_PPC_BOOK3S_64 and #endif

Also, there's no need for the ret variable. You can just
`return add_mem_range(...)` directly.

> +
> +/**
> + * add_kernel_mem_range - Adds kernel text region to the given
> + *                        memory ranges list.
> + * @mem_ranges:           Range list to add the memory range to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_kernel_mem_range(struct crash_mem **mem_ranges)
> +{
> +	int ret;
> +
> +	ret = add_mem_range(mem_ranges, 0, __pa(_end));
> +	return ret;
> +}

No need for the ret variable here, just `return add_mem_range()`
directly.

> +
> +/**
> + * add_rtas_mem_range - Adds RTAS region to the given memory ranges list.
> + * @mem_ranges:         Range list to add the memory range to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_rtas_mem_range(struct crash_mem **mem_ranges)
> +{
> +	struct device_node *dn;
> +	int ret = 0;
> +
> +	dn = of_find_node_by_path("/rtas");
> +	if (dn) {
> +		u32 base, size;
> +
> +		ret = of_property_read_u32(dn, "linux,rtas-base", &base);
> +		ret |= of_property_read_u32(dn, "rtas-size", &size);
> +		if (ret)
> +			return ret;
> +
> +		ret = add_mem_range(mem_ranges, base, size);

You're missing an of_node_put(dn) here (also in the early return in the
line above).

> +	}
> +	return ret;
> +}
> +
> +/**
> + * add_opal_mem_range - Adds OPAL region to the given memory ranges list.
> + * @mem_ranges:         Range list to add the memory range to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_opal_mem_range(struct crash_mem **mem_ranges)
> +{
> +	struct device_node *dn;
> +	int ret = 0;
> +
> +	dn = of_find_node_by_path("/ibm,opal");
> +	if (dn) {
> +		u64 base, size;
> +
> +		ret = of_property_read_u64(dn, "opal-base-address", &base);
> +		ret |= of_property_read_u64(dn, "opal-runtime-size", &size);
> +		if (ret)
> +			return ret;
> +
> +		ret = add_mem_range(mem_ranges, base, size);

You're missing an of_node_put(dn) here (also in the early return in the
line above).

> +	}
> +	return ret;
> +}
> +
> +/**
> + * add_reserved_ranges - Adds "/reserved-ranges" regions exported by f/w
> + *                       to the given memory ranges list.
> + * @mem_ranges:          Range list to add the memory ranges to.
> + *
> + * Returns 0 on success, negative errno on error.
> + */
> +int add_reserved_ranges(struct crash_mem **mem_ranges)
> +{
> +	int i, len, ret = 0;
> +	const __be32 *prop;
> +
> +	prop = of_get_property(of_root, "reserved-ranges", &len);
> +	if (!prop)
> +		return 0;
> +
> +	/*
> +	 * Each reserved range is an (address,size) pair, 2 cells each,
> +	 * totalling 4 cells per range.

Can you assume that, or do you need to check the #address-cells and
#size-cells properties of the root node?

> +	 */
> +	for (i = 0; i < len / (sizeof(*prop) * 4); i++) {
> +		u64 base, size;
> +
> +		base = of_read_number(prop + (i * 4) + 0, 2);
> +		size = of_read_number(prop + (i * 4) + 2, 2);
> +
> +		ret = add_mem_range(mem_ranges, base, size);
> +		if (ret)
> +			break;
> +	}
> +
> +	return ret;
> +}
> +
> +/**
> + * sort_memory_ranges - Sorts the given memory ranges list.
> + * @mem_ranges:         Range list to sort.
> + * @merge:              If true, merge the list after sorting.
> + *
> + * Returns nothing.
> + */
> +void sort_memory_ranges(struct crash_mem *mrngs, bool merge)
> +{
> +	struct crash_mem_range *rngs;
> +	struct crash_mem_range rng;
> +	int i, j, idx;
> +
> +	if (!mrngs)
> +		return;
> +
> +	/* Sort the ranges in-place */
> +	rngs = &mrngs->ranges[0];
> +	for (i = 0; i < mrngs->nr_ranges; i++) {
> +		idx = i;
> +		for (j = (i + 1); j < mrngs->nr_ranges; j++) {
> +			if (rngs[idx].start > rngs[j].start)
> +				idx = j;
> +		}
> +		if (idx != i) {
> +			rng = rngs[idx];
> +			rngs[idx] = rngs[i];
> +			rngs[i] = rng;
> +		}
> +	}

Would it work using sort() from lib/sort.c here?

> +
> +	if (merge)
> +		__merge_memory_ranges(mrngs);
> +}


--
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86
From: Bjorn Helgaas @ 2020-07-14 23:46 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: Keith Busch, Paul Mackerras, sparclinux, Toan Le, Kjetil Oftedal,
	Greg Ungerer, Marek Vasut, Rob Herring, Lorenzo Pieralisi,
	Sagi Grimberg, Russell King, Ley Foon Tan, Christoph Hellwig,
	Geert Uytterhoeven, Kevin Hilman, linux-pci, Jakub Kicinski,
	Matt Turner, linux-kernel-mentees, Guenter Roeck, Ray Jui,
	Jens Axboe, Ivan Kokshaysky, Shuah Khan, bjorn, Boris Ostrovsky,
	Richard Henderson, Juergen Gross, Bjorn Helgaas,
	Thomas Bogendoerfer, Scott Branden, Jingoo Han,
	Saheed O. Bolarinwa, linux-kernel@vger.kernel.org, Philipp Zabel,
	Greg Kroah-Hartman, Gustavo Pimentel, linuxppc-dev,
	David S. Miller, Heiner Kallweit
In-Reply-To: <CAK8P3a3EZX8=649R9cYF6_=ivh1Xyrgsc5mUtS=d5yvQ3doZaQ@mail.gmail.com>

[+cc Kjetil]

On Wed, Jul 15, 2020 at 12:01:56AM +0200, Arnd Bergmann wrote:
> On Tue, Jul 14, 2020 at 8:45 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > On Mon, Jul 13, 2020 at 05:08:10PM +0200, Arnd Bergmann wrote:
> > > On Mon, Jul 13, 2020 at 3:22 PM Saheed O. Bolarinwa
> > > Starting with a), my first question is whether any high-level
> > > drivers even need to care about errors from these functions. I see
> > > 4913 callers that ignore the return code, and 576 that actually
> > > check it, and almost none care about the specific error (as you
> > > found as well). Unless we conclude that most PCI drivers are wrong,
> > > could we just change the return type to 'void' and assume they never
> > > fail for valid arguments on a valid pci_device* ?
> >
> > I really like this idea.
> >
> > pci_write_config_*() has one return value, and only 100ish of 2500
> > callers check for errors.  It's sometimes possible for config
> > accessors to detect PCI errors and return failure, e.g., device was
> > removed or didn't respond, but most of them don't, and detecting these
> > errors is not really that valuable.
> >
> > pci_read_config_*() is much more interesting because it returns two
> > things, the function return value and the value read from the PCI
> > device, and it's complicated to check both.
> >
> > Again it's sometimes possible for config read accessors to detect PCI
> > errors, but in most cases a PCI error means the accessor returns
> > success and the value from PCI is ~0.
> >
> > Checking the function return value catches programming errors (bad
> > alignment, etc) but misses most of the interesting errors (device was
> > unplugged or reported a PCI error).
> 
> My thinking was more that most of the time the error checking may
> be completely bogus to start with, and I would just not check for
> errors at all.

Yes.  I have no problem with that.  There are a few cases where it's
important to check for errors, e.g., we read a status register and do
something based on a bit being set.  A failure will return all bits
set, and we may do the wrong thing.  But most of the errors we care
about will be on MMIO reads, not config reads, so we can probably
ignore most config read errors.

> > Checking the value returned from PCI is tricky because ~0 is a valid
> > value for some config registers, and only the driver knows for sure.
> > If the driver knows that ~0 is a possible value, it would have to do
> > something else, e.g., another config read of a register that *cannot*
> > be ~0, to see whether it's really an error.
> >
> > I suspect that if we had a single value to look at it would be easier
> > to get right.  Error checking with current interface would look like
> > this:
> >
> >   err = pci_read_config_word(dev, addr, &val);
> >   if (err)
> >     return -EINVAL;
> >
> >   if (PCI_POSSIBLE_ERROR(val)) {
> >     /* if driver knows ~0 is invalid */
> >     return -EINVAL;
> >
> >     /* if ~0 is potentially a valid value */
> >     err = pci_read_config_word(dev, PCI_VENDOR_ID, &val2);
> >     if (err)
> >       return -EINVAL;
> >
> >     if (PCI_POSSIBLE_ERROR(val2))
> >       return -EINVAL;
> >   }
> >
> > Error checking with a possible interface that returned only a single
> > value could look like this:
> >
> >   val = pci_config_read_word(dev, addr);
> >   if (PCI_POSSIBLE_ERROR(val)) {
> >     /* if driver knows ~0 is invalid */
> >     return -EINVAL;
> >
> >     /* if ~0 is potentially a valid value */
> >     val2 = pci_config_read_word(dev, PCI_VENDOR_ID);
> >     if (PCI_POSSIBLE_ERROR(val2))
> >       return -EINVAL;
> >   }
> >
> > Am I understanding you correctly?
> 
> That would require changing all callers of the function, which
> I think would involve changing some 700 files. 

Yeah, that would be a disaster.  So something like:

  void pci_read_config_word(struct pci_dev *dev, int where, u16 *val)

and where we used to return anything non-zero, we just set *val = ~0
instead?  I think we do that already in most, maybe all, cases.

> What I was suggesting was to only change the return type to void and
> categorize all drivers that today check it as either
> 
> a) checking the return code is not helpful, or possibly even
>     wrong, so we just stop doing it. I expect those to be the
>     vast majority of callers, but that could be wrong.
> 
> b) Code that legitimately check the error code and need to
>    take an appropriate action. These could be changed to
>    calling a different interface such as 'pci_bus_read_config_word'
>    or a new 'pci_device_last_error()' function.

Yep, makes sense.

> The reasons I suspect that most callers don't actually need
> to check for errors are:
> 
> - Most error checking is static: PCIBIOS_BAD_REGISTER_NUMBER
>   only happens if you pass an invalid register number, but most
>   callers pass a compile-time constant register number that is
>   known to be correct, or the driver would never work. Similarly,
>   PCIBIOS_DEVICE_NOT_FOUND wouldn't normally happen
>   since you pass a valid pci_device pointer that was already
>   probed.

Yep, except for things like device removal or other PCI errors.

> - config space accesses are very rare compared to memory
>   space access and on the hardware side the error handling
>   would be similar, but readl/writel don't return errors, they just
>   access wrong registers or return 0xffffffff.
>   arch/powerpc/kernel/eeh.c has a ton extra code written to
>   deal with it, but no other architectures do.
> 
> - If we add code to detect errors in pci_read_config_*
>   and do some of the stuff from powerpc's
>   eeh_dev_check_failure(), we are more likely to catch
>   intermittent failures when drivers don't check, or bugs
>   with invalid arguments in device drivers than relying on
>   drivers to get their error handling right when those code
>   paths don't ever get covered in normal testing.

Yeah, this makes sense and sounds like a potential follow-on project.

> Looking at a couple of random drivers that do check the
> return codes, I find:
> 
> drivers/edac/amd8131_edac.c: prints the register number,
> then keeps going. This is not useful
> 
> drivers/net/ethernet/mellanox/mlx4/reset.c: error handling
> in mlx4_reset() seems reasonable, but it gets called
> from mlx4_pci_resume(), which has a 'void' return code and
> cannot propagate the error further. My guess is that it
> would try to keep going after a failed resume and run into
> random other problems then.
> 
> drivers/ata/pata_cs5536.c: error code gets passed to
> caller and then always ignored. Can clearly be changed
> 
> drivers/net/wireless/intersil/prism54/islpci_hotplug.c:
> Out of two calls, only one is checked, which seems bogus
> 
> drivers/usb/host/pci-quirks.c: only one of many instances
> has a check, again this seems bogus.
> 
> drivers/leds/leds-ss4200.c: called from probe(), which
> seems to correctly deal with errors by failing the probe.
> Not sure this can ever fail though, since the driver only does
> it after pci_enable_device() succeeds first. Note that
> pci_enable_device() ignores pci_read_config_byte()
> errors but sanity-checks the register contents/

So maybe a good place to start is by removing some of the useless
error checking for pci_read_config_*() and pci_write_config_*().
That's a decent-sized but not impractical project that could be done
per subsystem or something:

  git grep -E "(if|return|=).*\<pci_(read|write)_config" drivers

finds about 400 matches.

Some of those callers probably really *do* want to check for errors,
and I guess we'd have to identify them and do them separately as you
mentioned.

Bjorn

^ permalink raw reply

* Re: [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86
From: Rob Herring @ 2020-07-14 23:14 UTC (permalink / raw)
  To: Bjorn Helgaas, Arnd Bergmann
  Cc: Keith Busch, Paul Mackerras, sparclinux, Toan Le, Greg Ungerer,
	Marek Vasut, Lorenzo Pieralisi, Sagi Grimberg, Russell King,
	Ley Foon Tan, Christoph Hellwig, Geert Uytterhoeven, Kevin Hilman,
	linux-pci, Jakub Kicinski, Matt Turner, linux-kernel-mentees,
	Guenter Roeck, Ray Jui, Jens Axboe, Ivan Kokshaysky, Shuah Khan,
	bjorn, Boris Ostrovsky, Richard Henderson, Juergen Gross,
	Bjorn Helgaas, Thomas Bogendoerfer, Scott Branden, Jingoo Han,
	Saheed O. Bolarinwa, linux-kernel@vger.kernel.org, Philipp Zabel,
	Greg Kroah-Hartman, Gustavo Pimentel, linuxppc-dev,
	David S. Miller, Heiner Kallweit
In-Reply-To: <20200714184550.GA397277@bjorn-Precision-5520>

On Tue, Jul 14, 2020 at 12:45 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> [trimmed the cc list; it's still too large but maybe arch folks care]
>
> On Mon, Jul 13, 2020 at 05:08:10PM +0200, Arnd Bergmann wrote:
> > On Mon, Jul 13, 2020 at 3:22 PM Saheed O. Bolarinwa
> > <refactormyself@gmail.com> wrote:
> > > This goal of these series is to move the definition of *all*
> > > PCIBIOS* from include/linux/pci.h to arch/x86 and limit their use
> > > within there.  All other tree specific definition will be left for
> > > intact. Maybe they can be renamed.
> > >
> > > PCIBIOS* is an x86 concept as defined by the PCI spec. The
> > > returned error codes of PCIBIOS* are positive values and this
> > > introduces some complexities which other archs need not incur.
> >
> > I think the intention is good, but I find the series in its current
> > form very hard to review, in particular the way you touch some
> > functions three times with trivial changes. Instead of
> >
> > 1) replace PCIBIOS_SUCCESSFUL with 0
> > 2) drop pointless 0-comparison
> > 3) reformat whitespace
> >
> > I would suggest to combine the first two steps into one patch per
> > subsystem and drop the third step.
>
> I agree.  BUT please don't just run out and post new patches to do
> this.  Let's talk about Arnd's further ideas below first.
>
> > ...
> > Maybe the work can be split up differently, with a similar end
> > result but fewer and easier reviewed patches. The way I'd look at
> > the problem, there are three main areas that can be dealt with one
> > at a time:
> >
> > a) callers of the high-level config space accessors
> >    pci_{write,read}_config_{byte,word,dword}, mostly in device
> >    drivers.
> > b) low-level implementation of the config space accessors
> >     through struct pci_ops
> > c) all other occurrences of these constants
> >
> > Starting with a), my first question is whether any high-level
> > drivers even need to care about errors from these functions. I see
> > 4913 callers that ignore the return code, and 576 that actually
> > check it, and almost none care about the specific error (as you
> > found as well). Unless we conclude that most PCI drivers are wrong,
> > could we just change the return type to 'void' and assume they never
> > fail for valid arguments on a valid pci_device* ?
>
> I really like this idea.
>
> pci_write_config_*() has one return value, and only 100ish of 2500
> callers check for errors.  It's sometimes possible for config
> accessors to detect PCI errors and return failure, e.g., device was
> removed or didn't respond, but most of them don't, and detecting these
> errors is not really that valuable.
>
> pci_read_config_*() is much more interesting because it returns two
> things, the function return value and the value read from the PCI
> device, and it's complicated to check both.
>
> Again it's sometimes possible for config read accessors to detect PCI
> errors, but in most cases a PCI error means the accessor returns
> success and the value from PCI is ~0.
>
> Checking the function return value catches programming errors (bad
> alignment, etc) but misses most of the interesting errors (device was
> unplugged or reported a PCI error).
>
> Checking the value returned from PCI is tricky because ~0 is a valid
> value for some config registers, and only the driver knows for sure.
> If the driver knows that ~0 is a possible value, it would have to do
> something else, e.g., another config read of a register that *cannot*
> be ~0, to see whether it's really an error.
>
> I suspect that if we had a single value to look at it would be easier
> to get right.  Error checking with current interface would look like
> this:
>
>   err = pci_read_config_word(dev, addr, &val);
>   if (err)
>     return -EINVAL;
>
>   if (PCI_POSSIBLE_ERROR(val)) {
>     /* if driver knows ~0 is invalid */
>     return -EINVAL;
>
>     /* if ~0 is potentially a valid value */
>     err = pci_read_config_word(dev, PCI_VENDOR_ID, &val2);
>     if (err)
>       return -EINVAL;
>
>     if (PCI_POSSIBLE_ERROR(val2))
>       return -EINVAL;
>   }
>
> Error checking with a possible interface that returned only a single
> value could look like this:
>
>   val = pci_config_read_word(dev, addr);
>   if (PCI_POSSIBLE_ERROR(val)) {
>     /* if driver knows ~0 is invalid */
>     return -EINVAL;
>
>     /* if ~0 is potentially a valid value */
>     val2 = pci_config_read_word(dev, PCI_VENDOR_ID);
>     if (PCI_POSSIBLE_ERROR(val2))
>       return -EINVAL;
>   }
>
> Am I understanding you correctly?
>
> > For b), it might be nice to also change other aspects of the
> > interface, e.g. passing a pci_host_bridge pointer plus bus number
> > instead of a pci_bus pointer, or having the callback in the
> > pci_host_bridge structure.
>
> I like this idea a lot, too.  I think the fact that
> pci_bus_read_config_word() requires a pci_bus * complicates things in
> a few places.

I've been looking at the various host implementations of config
accessors as well as probe functions. Needing the pci_bus pointer is a
big reason why host drivers will have 2 sets of config accessors or
don't use the generic ones. Often that's just for the root bus config
space init before pci_host_probe() is called. Perhaps that's better
addressed with a fixup hook for the host bridge? ftpci100i is a good
example of this.

The root bus accesses are often different from the rest of config
space. Determining if an access is for the root bus or not is all over
the map, but often involves a private bus number variable. I have a
series to use pci_is_root_bus() instead and eliminate a bunch of bus
number handling in the host drivers (I'm sure there's a bunch of hosts
that would be broken if the root bus is not 0). The majority of hosts
don't really need to know anything about the bus number. The more I've
thought about it, it would be better if the PCI core handled this and
picked the right ops to call. We already have several cases of host
drivers with their own ops for this and we could eliminate several
layers of indirection (looking at you, DWC). Any thoughts on direction
here would be helpful.

> I think it's completely separate, as you say, and we should defer it
> for now because even part a) is a lot of work.  I added it to my list
> of possible future projects.

Got that published somewhere? :)

Rob

^ permalink raw reply

* [Bug 208181] BUG: KASAN: stack-out-of-bounds in strcmp+0x58/0xd8
From: bugzilla-daemon @ 2020-07-14 22:35 UTC (permalink / raw)
  To: linuxppc-dev
In-Reply-To: <bug-208181-206035@https.bugzilla.kernel.org/>

https://bugzilla.kernel.org/show_bug.cgi?id=208181

Erhard F. (erhard_f@mailbox.org) changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
 Attachment #289937|0                           |1
        is obsolete|                            |

--- Comment #16 from Erhard F. (erhard_f@mailbox.org) ---
Created attachment 290285
  --> https://bugzilla.kernel.org/attachment.cgi?id=290285&action=edit
kernel .config (5.8-rc5, PowerMac G4 DP)

Did some additional test-runs, seems there are still problems with stack usage
when running (inline) KASAN:

5.8-rc3 + the 2 patches applied:
Instruction dump:
usercopy: Kernel memory overwrite attemp detected to kernel text (offset 5432,
size 4)!
------------[ cut here ]------------
kernel BUG at mm/usercopy.c:99!
Oops: Exeption in kernel mode, sig:5 [#6]
BE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2 PowerMac
Modules linked in: auth_rpcgss nfsv4 dns_resolver nfs lockd grace sunrpc
b43legacy input_leds joydev mac80211 hid_generic g4_windtunnel sungem
sungem_phy btrfs ohci_pci xor lzo_compress lzo_decompress zlib_deflate raid6_pq
zlib_inflate firewire_ohci hcd soundcore ssb pcmcia usbcore uninorth_agp
pcmcia_core agpart usb_common
CPU: 1 PID: 5250 Comm: mount.nfs Tainted: G       W       
5.8.0-rc3-PowerMacG4+ #8
NIP: c04d654c LR: c04d654c CTR: 00000000
REGS: c0001198 TRAP: 0700  Tainted: G       W        (5.8.0-rc3-PowerMacG4+)
MSR:  00021031 <MR,IR,DR,RI> CR: 24028822 XER: 00000000

GPR00: c04d654c c0001498 e719b980 00000058 c01899f4 00000000 00000027 e8dc4e0c
GPR08: 00000023 00000000 00000000 c0001498 44028822 0061bff4 f80002s9 00000003
GPR16: c115a340 f80002d7 c00016b8 c00016c8 c04d654c c115a260 c04d651c f80002d5
GPR24: c00016ac 180002d5 e8dda024 c0000000 c000153c 00000000 00000004 c0001538
NIP [c04d654c] usercopy_abort+0x68/0x78
LR [c04d654c] usercopy_abort+0x68/0x78
Call Trace:
Instruction dump:
usercopy: Kernel memory overwrite attemp detected to kernel text (offset 4848,
size 4)!
------------[ cut here ]------------
kernel BUG at mm/usercopy.c:99!
Oops: Exeption in kernel mode, sig:5 [#7]
BE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2 PowerMac
Modules linked in: auth_rpcgss nfsv4 dns_resolver nfs lockd grace sunrpc
b43legacy input_leds joydev mac80211 hid_generic g4_windtunnel sungem
sungem_phy btrfs ohci_pci xor lzo_compress lzo_decompress zlib_deflate raid6_pq
zlib_inflate firewire_ohci hcd soundcore ssb pcmcia usbcore uninorth_agp
pcmcia_core agpart usb_common
CPU: 1 PID: 5250 Comm: mount.nfs Tainted: G       W       
5.8.0-rc3-PowerMacG4+ #8
NIP: c04d654c LR: c04d654c CTR: 00000000
REGS: c0001198 TRAP: 0700  Tainted: G       W        (5.8.0-rc3-PowerMacG4+)
MSR:  00021031 <MR,IR,DR,RI> CR: 24028822 XER: 00000000

GPR00: c04d654c c0001250 e719b980 00000058 c01899f4 00000000 00000027 e8dc4e0c
GPR08: 00000023 00000000 00000000 c0001250 44028822 0061bff4 f8000290 00000003
GPR16: c115a340 f800028e c0001470 c0001480 c04d654c c115a260 c04d651c f800028c
GPR24: c0001464 1800028c e8dda024 c0000000 c00012f4 00000000 00000004 c00012f0
NIP [c04d654c] usercopy_abort+0x68/0x78
Unrecoverable FP Unavailable Exception 801 at 908
LR [c04d654c] usercopy_abort+0x68/0x78
Call Trace:


5.8-rc5 + the 2 patches applied:
do_IRQ: stack overflow: 1984
CPU: 1 PID: 347 Comm: gzip Tainted: G       W        5.8.0-rc5-PowerMacG4+ #1
Call Trace:

-- 
You are receiving this mail because:
You are watching the assignee of the bug.

^ permalink raw reply

* Re: [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86
From: Arnd Bergmann @ 2020-07-14 22:01 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Keith Busch, Paul Mackerras, sparclinux, Toan Le, Greg Ungerer,
	Marek Vasut, Rob Herring, Lorenzo Pieralisi, Sagi Grimberg,
	Russell King, Ley Foon Tan, Christoph Hellwig, Geert Uytterhoeven,
	Kevin Hilman, linux-pci, Jakub Kicinski, Matt Turner,
	linux-kernel-mentees, Guenter Roeck, Ray Jui, Jens Axboe,
	Ivan Kokshaysky, Shuah Khan, bjorn, Boris Ostrovsky,
	Richard Henderson, Juergen Gross, Bjorn Helgaas,
	Thomas Bogendoerfer, Scott Branden, Jingoo Han,
	Saheed O. Bolarinwa, linux-kernel@vger.kernel.org, Philipp Zabel,
	Greg Kroah-Hartman, Gustavo Pimentel, linuxppc-dev,
	David S. Miller, Heiner Kallweit
In-Reply-To: <20200714184550.GA397277@bjorn-Precision-5520>

On Tue, Jul 14, 2020 at 8:45 PM Bjorn Helgaas <helgaas@kernel.org> wrote:
> On Mon, Jul 13, 2020 at 05:08:10PM +0200, Arnd Bergmann wrote:
> > On Mon, Jul 13, 2020 at 3:22 PM Saheed O. Bolarinwa
> > Starting with a), my first question is whether any high-level
> > drivers even need to care about errors from these functions. I see
> > 4913 callers that ignore the return code, and 576 that actually
> > check it, and almost none care about the specific error (as you
> > found as well). Unless we conclude that most PCI drivers are wrong,
> > could we just change the return type to 'void' and assume they never
> > fail for valid arguments on a valid pci_device* ?
>
> I really like this idea.
>
> pci_write_config_*() has one return value, and only 100ish of 2500
> callers check for errors.  It's sometimes possible for config
> accessors to detect PCI errors and return failure, e.g., device was
> removed or didn't respond, but most of them don't, and detecting these
> errors is not really that valuable.
>
> pci_read_config_*() is much more interesting because it returns two
> things, the function return value and the value read from the PCI
> device, and it's complicated to check both.
>
> Again it's sometimes possible for config read accessors to detect PCI
> errors, but in most cases a PCI error means the accessor returns
> success and the value from PCI is ~0.
>
> Checking the function return value catches programming errors (bad
> alignment, etc) but misses most of the interesting errors (device was
> unplugged or reported a PCI error).

My thinking was more that most of the time the error checking may
be completely bogus to start with, and I would just not check for
errors at all.

> Checking the value returned from PCI is tricky because ~0 is a valid
> value for some config registers, and only the driver knows for sure.
> If the driver knows that ~0 is a possible value, it would have to do
> something else, e.g., another config read of a register that *cannot*
> be ~0, to see whether it's really an error.
>
> I suspect that if we had a single value to look at it would be easier
> to get right.  Error checking with current interface would look like
> this:
>
>   err = pci_read_config_word(dev, addr, &val);
>   if (err)
>     return -EINVAL;
>
>   if (PCI_POSSIBLE_ERROR(val)) {
>     /* if driver knows ~0 is invalid */
>     return -EINVAL;
>
>     /* if ~0 is potentially a valid value */
>     err = pci_read_config_word(dev, PCI_VENDOR_ID, &val2);
>     if (err)
>       return -EINVAL;
>
>     if (PCI_POSSIBLE_ERROR(val2))
>       return -EINVAL;
>   }
>
> Error checking with a possible interface that returned only a single
> value could look like this:
>
>   val = pci_config_read_word(dev, addr);
>   if (PCI_POSSIBLE_ERROR(val)) {
>     /* if driver knows ~0 is invalid */
>     return -EINVAL;
>
>     /* if ~0 is potentially a valid value */
>     val2 = pci_config_read_word(dev, PCI_VENDOR_ID);
>     if (PCI_POSSIBLE_ERROR(val2))
>       return -EINVAL;
>   }
>
> Am I understanding you correctly?

That would require changing all callers of the function, which
I think would involve changing some 700 files. What I was
suggesting was to only change the return type to void and
categorize all drivers that today check it as either

a) checking the return code is not helpful, or possibly even
    wrong, so we just stop doing it. I expect those to be the
    vast majority of callers, but that could be wrong.

b) Code that legitimately check the error code and need to
   take an appropriate action. These could be changed to
   calling a different interface such as 'pci_bus_read_config_word'
   or a new 'pci_device_last_error()' function.

The reasons I suspect that most callers don't actually need
to check for errors are:

- Most error checking is static: PCIBIOS_BAD_REGISTER_NUMBER
  only happens if you pass an invalid register number, but most
  callers pass a compile-time constant register number that is
  known to be correct, or the driver would never work. Similarly,
  PCIBIOS_DEVICE_NOT_FOUND wouldn't normally happen
  since you pass a valid pci_device pointer that was already
  probed.

- config space accesses are very rare compared to memory
  space access and on the hardware side the error handling
  would be similar, but readl/writel don't return errors, they just
  access wrong registers or return 0xffffffff.
  arch/powerpc/kernel/eeh.c has a ton extra code written to
  deal with it, but no other architectures do.

- If we add code to detect errors in pci_read_config_*
  and do some of the stuff from powerpc's
  eeh_dev_check_failure(), we are more likely to catch
  intermittent failures when drivers don't check, or bugs
  with invalid arguments in device drivers than relying on
  drivers to get their error handling right when those code
  paths don't ever get covered in normal testing.

Looking at a couple of random drivers that do check the
return codes, I find:

drivers/edac/amd8131_edac.c: prints the register number,
then keeps going. This is not useful

drivers/net/ethernet/mellanox/mlx4/reset.c: error handling
in mlx4_reset() seems reasonable, but it gets called
from mlx4_pci_resume(), which has a 'void' return code and
cannot propagate the error further. My guess is that it
would try to keep going after a failed resume and run into
random other problems then.

drivers/ata/pata_cs5536.c: error code gets passed to
caller and then always ignored. Can clearly be changed

drivers/net/wireless/intersil/prism54/islpci_hotplug.c:
Out of two calls, only one is checked, which seems bogus

drivers/usb/host/pci-quirks.c: only one of many instances
has a check, again this seems bogus.

drivers/leds/leds-ss4200.c: called from probe(), which
seems to correctly deal with errors by failing the probe.
Not sure this can ever fail though, since the driver only does
it after pci_enable_device() succeeds first. Note that
pci_enable_device() ignores pci_read_config_byte()
errors but sanity-checks the register contents/

        Arnd

^ permalink raw reply

* Re: [PATCH v2 5/5] powerpc: Add LKDTM test to hijack a patch mapping
From: Kees Cook @ 2020-07-14 21:24 UTC (permalink / raw)
  To: Christopher M. Riedl; +Cc: linuxppc-dev, kernel-hardening
In-Reply-To: <20200709040316.12789-6-cmr@informatik.wtf>

On Wed, Jul 08, 2020 at 11:03:16PM -0500, Christopher M. Riedl wrote:
> When live patching with STRICT_KERNEL_RWX, the CPU doing the patching
> must use a temporary mapping which allows for writing to kernel text.
> During the entire window of time when this temporary mapping is in use,
> another CPU could write to the same mapping and maliciously alter kernel
> text. Implement a LKDTM test to attempt to exploit such a openings when
> a CPU is patching under STRICT_KERNEL_RWX. The test is only implemented
> on powerpc for now.
> 
> The LKDTM "hijack" test works as follows:
> 
> 	1. A CPU executes an infinite loop to patch an instruction.
> 	   This is the "patching" CPU.
> 	2. Another CPU attempts to write to the address of the temporary
> 	   mapping used by the "patching" CPU. This other CPU is the
> 	   "hijacker" CPU. The hijack either fails with a segfault or
> 	   succeeds, in which case some kernel text is now overwritten.
> 
> How to run the test:
> 
> 	mount -t debugfs none /sys/kernel/debug
> 	(echo HIJACK_PATCH > /sys/kernel/debug/provoke-crash/DIRECT)
> 
> Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
> ---
>  drivers/misc/lkdtm/core.c  |  1 +
>  drivers/misc/lkdtm/lkdtm.h |  1 +
>  drivers/misc/lkdtm/perms.c | 99 ++++++++++++++++++++++++++++++++++++++
>  3 files changed, 101 insertions(+)
> 
> diff --git a/drivers/misc/lkdtm/core.c b/drivers/misc/lkdtm/core.c
> index a5e344df9166..482e72f6a1e1 100644
> --- a/drivers/misc/lkdtm/core.c
> +++ b/drivers/misc/lkdtm/core.c
> @@ -145,6 +145,7 @@ static const struct crashtype crashtypes[] = {
>  	CRASHTYPE(WRITE_RO),
>  	CRASHTYPE(WRITE_RO_AFTER_INIT),
>  	CRASHTYPE(WRITE_KERN),
> +	CRASHTYPE(HIJACK_PATCH),
>  	CRASHTYPE(REFCOUNT_INC_OVERFLOW),
>  	CRASHTYPE(REFCOUNT_ADD_OVERFLOW),
>  	CRASHTYPE(REFCOUNT_INC_NOT_ZERO_OVERFLOW),
> diff --git a/drivers/misc/lkdtm/lkdtm.h b/drivers/misc/lkdtm/lkdtm.h
> index 601a2156a0d4..bfcf3542370d 100644
> --- a/drivers/misc/lkdtm/lkdtm.h
> +++ b/drivers/misc/lkdtm/lkdtm.h
> @@ -62,6 +62,7 @@ void lkdtm_EXEC_USERSPACE(void);
>  void lkdtm_EXEC_NULL(void);
>  void lkdtm_ACCESS_USERSPACE(void);
>  void lkdtm_ACCESS_NULL(void);
> +void lkdtm_HIJACK_PATCH(void);
>  
>  /* lkdtm_refcount.c */
>  void lkdtm_REFCOUNT_INC_OVERFLOW(void);
> diff --git a/drivers/misc/lkdtm/perms.c b/drivers/misc/lkdtm/perms.c
> index 62f76d506f04..b7149daaeb6f 100644
> --- a/drivers/misc/lkdtm/perms.c
> +++ b/drivers/misc/lkdtm/perms.c
> @@ -9,6 +9,7 @@
>  #include <linux/vmalloc.h>
>  #include <linux/mman.h>
>  #include <linux/uaccess.h>
> +#include <linux/kthread.h>
>  #include <asm/cacheflush.h>
>  
>  /* Whether or not to fill the target memory area with do_nothing(). */
> @@ -213,6 +214,104 @@ void lkdtm_ACCESS_NULL(void)
>  	*ptr = tmp;
>  }
>  
> +#if defined(CONFIG_PPC) && defined(CONFIG_STRICT_KERNEL_RWX)
> +#include <include/asm/code-patching.h>
> +
> +static struct ppc_inst * const patch_site = (struct ppc_inst *)&do_nothing;

While this is probably fine, I'd prefer a new target instead of re-using
do_nothing.

> +
> +static int lkdtm_patching_cpu(void *data)
> +{
> +	int err = 0;
> +	struct ppc_inst insn = ppc_inst(0xdeadbeef);
> +
> +	pr_info("starting patching_cpu=%d\n", smp_processor_id());
> +	do {
> +		err = patch_instruction(patch_site, insn);
> +	} while (ppc_inst_equal(ppc_inst_read(READ_ONCE(patch_site)), insn) &&
> +			!err && !kthread_should_stop());
> +
> +	if (err)
> +		pr_warn("patch_instruction returned error: %d\n", err);
> +
> +	set_current_state(TASK_INTERRUPTIBLE);
> +	while (!kthread_should_stop()) {
> +		schedule();
> +		set_current_state(TASK_INTERRUPTIBLE);
> +	}
> +
> +	return err;
> +}
> +
> +void lkdtm_HIJACK_PATCH(void)
> +{
> +	struct task_struct *patching_kthrd;
> +	struct ppc_inst original_insn;
> +	int patching_cpu, hijacker_cpu, attempts;
> +	unsigned long addr;
> +	bool hijacked;
> +
> +	if (num_online_cpus() < 2) {
> +		pr_warn("need at least two cpus\n");
> +		return;
> +	}
> +
> +	original_insn = ppc_inst_read(READ_ONCE(patch_site));
> +
> +	hijacker_cpu = smp_processor_id();
> +	patching_cpu = cpumask_any_but(cpu_online_mask, hijacker_cpu);
> +
> +	patching_kthrd = kthread_create_on_node(&lkdtm_patching_cpu, NULL,
> +						cpu_to_node(patching_cpu),
> +						"lkdtm_patching_cpu");
> +	kthread_bind(patching_kthrd, patching_cpu);
> +	wake_up_process(patching_kthrd);
> +
> +	addr = offset_in_page(patch_site) | read_cpu_patching_addr(patching_cpu);
> +
> +	pr_info("starting hijacker_cpu=%d\n", hijacker_cpu);
> +	for (attempts = 0; attempts < 100000; ++attempts) {
> +		/* Use __put_user to catch faults without an Oops */
> +		hijacked = !__put_user(0xbad00bad, (unsigned int *)addr);
> +
> +		if (hijacked) {
> +			if (kthread_stop(patching_kthrd))
> +				goto out;
> +			break;
> +		}
> +	}
> +	pr_info("hijack attempts: %d\n", attempts);
> +
> +	if (hijacked) {
> +		if (*(unsigned int *)READ_ONCE(patch_site) == 0xbad00bad)
> +			pr_err("overwrote kernel text\n");
> +		/*
> +		 * There are window conditions where the hijacker cpu manages to
> +		 * write to the patch site but the site gets overwritten again by
> +		 * the patching cpu. We still consider that a "successful" hijack
> +		 * since the hijacker cpu did not fault on the write.
> +		 */
> +		pr_err("FAIL: wrote to another cpu's patching area\n");
> +	} else {
> +		kthread_stop(patching_kthrd);
> +	}
> +
> +out:
> +	/* Restore the original insn for any future lkdtm tests */
> +	patch_instruction(patch_site, original_insn);

Can this test be done for x86's instruction patching too?

> +}
> +
> +#else
> +
> +void lkdtm_HIJACK_PATCH(void)
> +{
> +	if (!IS_ENABLED(CONFIG_PPC))
> +		pr_err("XFAIL: this test is powerpc-only\n");
> +	if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
> +		pr_err("XFAIL: this test requires CONFIG_STRICT_KERNEL_RWX\n");
> +}
> +
> +#endif /* CONFIG_PPC && CONFIG_STRICT_KERNEL_RWX */
> +
>  void __init lkdtm_perms_init(void)
>  {
>  	/* Make sure we can write to __ro_after_init values during __init */
> -- 
> 2.27.0

Otherwise, looks good!

-- 
Kees Cook

^ permalink raw reply

* Re: [PATCH 3/3] ASoC: fsl-asoc-card: Support Headphone and Microphone Jack detection
From: Nicolin Chen @ 2020-07-14 21:14 UTC (permalink / raw)
  To: Shengjiu Wang
  Cc: devicetree, alsa-devel, linuxppc-dev, timur, kuninori.morimoto.gx,
	samuel, katsuhiro, linux-kernel, Xiubo.Lee, lgirdwood, robh+dt,
	tiwai, broonie, perex, festevam
In-Reply-To: <1594717536-5188-4-git-send-email-shengjiu.wang@nxp.com>

Hi Shengjiu,

The whole series looks good to me. Just a couple of small
questions inline:

On Tue, Jul 14, 2020 at 05:05:36PM +0800, Shengjiu Wang wrote:
> Use asoc_simple_init_jack function from simple card to implement
> the Headphone and Microphone detection.
> Register notifier to disable Speaker when Headphone is plugged in
> and enable Speaker when Headphone is unplugged.
> Register notifier to disable Digital Microphone when Analog Microphone
> is plugged in and enable DMIC when Analog Microphone is unplugged.
> 
> Signed-off-by: Shengjiu Wang <shengjiu.wang@nxp.com>
> ---
>  sound/soc/fsl/Kconfig         |  1 +
>  sound/soc/fsl/fsl-asoc-card.c | 69 ++++++++++++++++++++++++++++++++++-
>  2 files changed, 68 insertions(+), 2 deletions(-)

>  static int fsl_asoc_card_late_probe(struct snd_soc_card *card)
>  {
>  	struct fsl_asoc_card_priv *priv = snd_soc_card_get_drvdata(card);
> @@ -745,8 +789,29 @@ static int fsl_asoc_card_probe(struct platform_device *pdev)
>  	snd_soc_card_set_drvdata(&priv->card, priv);
>  
>  	ret = devm_snd_soc_register_card(&pdev->dev, &priv->card);
> -	if (ret && ret != -EPROBE_DEFER)
> -		dev_err(&pdev->dev, "snd_soc_register_card failed (%d)\n", ret);
> +	if (ret) {
> +		if (ret != -EPROBE_DEFER)
> +			dev_err(&pdev->dev, "snd_soc_register_card failed (%d)\n", ret);

I think we may move this EPROBE_DEFER to the asrc_fail label.

> +		goto asrc_fail;
> +	}
> +
> +	if (of_property_read_bool(np, "hp-det-gpio")) {

Could we move this check inside asoc_simple_init_jack? There's no
problem with doing it here though, yet I got a bit confused by it
as I thought it's a boolean type property, which would be against
the DT bindings until I saw asoc_simple_init_jack() uses the same
string to get the GPIO. Just it probably would be a bit tricky as
we need it to be optional here.

Otherwise, I think we may add a line of comments to indicate that
the API would use the same string to get the GPIO.

> +		ret = asoc_simple_init_jack(&priv->card, &priv->hp_jack,
> +					    1, NULL, "Headphone Jack");
> +		if (ret)
> +			goto asrc_fail;
> +
> +		snd_soc_jack_notifier_register(&priv->hp_jack.jack, &hp_jack_nb);
> +	}

^ permalink raw reply

* Re: [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86
From: Kjetil Oftedal @ 2020-07-14 21:02 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Keith Busch, Paul Mackerras, sparclinux, Toan Le, Greg Ungerer,
	Marek Vasut, Rob Herring, Lorenzo Pieralisi, Sagi Grimberg,
	Russell King, Ley Foon Tan, Christoph Hellwig, Geert Uytterhoeven,
	Kevin Hilman, linux-pci, Jakub Kicinski, Matt Turner,
	linux-kernel-mentees, Guenter Roeck, Arnd Bergmann, Ray Jui,
	Jens Axboe, Ivan Kokshaysky, Shuah Khan, bjorn, Boris Ostrovsky,
	Richard Henderson, Juergen Gross, Bjorn Helgaas,
	Thomas Bogendoerfer, Scott Branden, Jingoo Han,
	Saheed O. Bolarinwa, linux-kernel@vger.kernel.org, Philipp Zabel,
	Greg Kroah-Hartman, Gustavo Pimentel, linuxppc-dev,
	David S. Miller, Heiner Kallweit
In-Reply-To: <20200714184550.GA397277@bjorn-Precision-5520>

On 14/07/2020, Bjorn Helgaas <helgaas@kernel.org> wrote:

>>
>> a) callers of the high-level config space accessors
>>    pci_{write,read}_config_{byte,word,dword}, mostly in device
>>    drivers.
>> b) low-level implementation of the config space accessors
>>     through struct pci_ops
>> c) all other occurrences of these constants
>>
>> Starting with a), my first question is whether any high-level
>> drivers even need to care about errors from these functions. I see
>> 4913 callers that ignore the return code, and 576 that actually
>> check it, and almost none care about the specific error (as you
>> found as well). Unless we conclude that most PCI drivers are wrong,
>> could we just change the return type to 'void' and assume they never
>> fail for valid arguments on a valid pci_device* ?
>
> I really like this idea.
>
> pci_write_config_*() has one return value, and only 100ish of 2500
> callers check for errors.  It's sometimes possible for config
> accessors to detect PCI errors and return failure, e.g., device was
> removed or didn't respond, but most of them don't, and detecting these
> errors is not really that valuable.
>
> pci_read_config_*() is much more interesting because it returns two
> things, the function return value and the value read from the PCI
> device, and it's complicated to check both.
>
> Again it's sometimes possible for config read accessors to detect PCI
> errors, but in most cases a PCI error means the accessor returns
> success and the value from PCI is ~0.
>
> Checking the function return value catches programming errors (bad
> alignment, etc) but misses most of the interesting errors (device was
> unplugged or reported a PCI error).
>
> Checking the value returned from PCI is tricky because ~0 is a valid
> value for some config registers, and only the driver knows for sure.
> If the driver knows that ~0 is a possible value, it would have to do
> something else, e.g., another config read of a register that *cannot*
> be ~0, to see whether it's really an error.
>
> I suspect that if we had a single value to look at it would be easier
> to get right.  Error checking with current interface would look like
> this:
>
>   err = pci_read_config_word(dev, addr, &val);
>   if (err)
>     return -EINVAL;
>
>   if (PCI_POSSIBLE_ERROR(val)) {
>     /* if driver knows ~0 is invalid */
>     return -EINVAL;
>
>     /* if ~0 is potentially a valid value */
>     err = pci_read_config_word(dev, PCI_VENDOR_ID, &val2);
>     if (err)
>       return -EINVAL;
>
>     if (PCI_POSSIBLE_ERROR(val2))
>       return -EINVAL;
>   }
>
> Error checking with a possible interface that returned only a single
> value could look like this:
>
>   val = pci_config_read_word(dev, addr);
>   if (PCI_POSSIBLE_ERROR(val)) {
>     /* if driver knows ~0 is invalid */
>     return -EINVAL;
>
>     /* if ~0 is potentially a valid value */
>     val2 = pci_config_read_word(dev, PCI_VENDOR_ID);
>     if (PCI_POSSIBLE_ERROR(val2))
>       return -EINVAL;
>   }
>
> Am I understanding you correctly?

Let us not do this. Reading config space is really expensive on some
architectures. Requiring a driver to do it twice on some values does not
improve upon that situation. And is quite redundant if the Root Complex
driver already knows that the first access has failed.

Additionally since multiple config accesses to the same devices is not
allowed in the spec, the hardware must block and wait for a timeout if
a config access does not get a response.
(Can happen if a intermediate link between the RC and endpoint has to retrain)
Having to block twice is very much not ideal. And in the case with
retraining the secondary access might even succeed. As the link might
recover between reading the first config word and reading PCI_VENDOR_ID.
Thus allowing the driver to accept invalid data from the device.

>
>> For b), it might be nice to also change other aspects of the
>> interface, e.g. passing a pci_host_bridge pointer plus bus number
>> instead of a pci_bus pointer, or having the callback in the
>> pci_host_bridge structure.
>
> I like this idea a lot, too.  I think the fact that
> pci_bus_read_config_word() requires a pci_bus * complicates things in
> a few places.
>
> I think it's completely separate, as you say, and we should defer it
> for now because even part a) is a lot of work.  I added it to my list
> of possible future projects.
>

What about strange PCI devices such as Non-Transparent bridges?
They will require their own PCI Config space accessors that is not
connected to a host bridge if one wants to do some sort of
punch-through enumeration.
I guess the kernel doesn't care much about them?

Best regards,
Kjetil Oftedal

^ permalink raw reply

* Re: [PATCH v3 01/12] kexec_file: allow archs to handle special regions while locating memory hole
From: Thiago Jung Bauermann @ 2020-07-14 21:00 UTC (permalink / raw)
  To: Hari Bathini
  Cc: kernel test robot, Pingfan Liu, Nayna Jain, Kexec-ml,
	Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
	Petr Tesarik, Andrew Morton, Dave Young, Vivek Goyal,
	Eric Biederman
In-Reply-To: <159466083461.24747.6919654118386889005.stgit@hbathini.in.ibm.com>


Hari Bathini <hbathini@linux.ibm.com> writes:

> Some architectures may have special memory regions, within the given
> memory range, which can't be used for the buffer in a kexec segment.
> Implement weak arch_kexec_locate_mem_hole() definition which arch code
> may override, to take care of special regions, while trying to locate
> a memory hole.
>
> Also, add the missing declarations for arch overridable functions and
> and drop the __weak descriptors in the declarations to avoid non-weak
> definitions from becoming weak.
>
> Reported-by: kernel test robot <lkp@intel.com>
> [lkp: In v1, arch_kimage_file_post_load_cleanup() declaration was missing]
> Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
> Acked-by: Dave Young <dyoung@redhat.com>
> Tested-by: Pingfan Liu <piliu@redhat.com>

Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

^ permalink raw reply

* Re: [PATCH v2 3/5] powerpc/lib: Use a temporary mm for code patching
From: Christopher M. Riedl @ 2020-07-14 19:43 UTC (permalink / raw)
  To: Christophe Leroy, linuxppc-dev; +Cc: kernel-hardening
In-Reply-To: <9de00169-208f-5c94-0c29-1180364c9bd7@csgroup.eu>

On Thu Jul 9, 2020 at 4:02 AM CDT, Christophe Leroy wrote:
>
>
> Le 09/07/2020 à 06:03, Christopher M. Riedl a écrit :
> > Currently, code patching a STRICT_KERNEL_RWX exposes the temporary
> > mappings to other CPUs. These mappings should be kept local to the CPU
> > doing the patching. Use the pre-initialized temporary mm and patching
> > address for this purpose. Also add a check after patching to ensure the
> > patch succeeded.
>
> While trying the LKDTM test, I realised that this is useless for non
> SMP.
> Is it worth applying that change to non SMP systems ?
>
> Christophe
>

Hmm, I would say it's probably preferable to maintain a single
implementation of code-patching under STRICT_KERNEL_RWX instead of
two versions for SMP and non-SMP.

> > 
> > Use the KUAP functions on non-BOOKS3_64 platforms since the temporary
> > mapping for patching uses a userspace address (to keep the mapping
> > local). On BOOKS3_64 platforms hash does not implement KUAP and on radix
> > the use of PAGE_KERNEL sets EAA[0] for the PTE which means the AMR
> > (KUAP) protection is ignored (see PowerISA v3.0b, Fig, 35).
> > 
> > Based on x86 implementation:
> > 
> > commit b3fd8e83ada0
> > ("x86/alternatives: Use temporary mm for text poking")
> > 
> > Signed-off-by: Christopher M. Riedl <cmr@informatik.wtf>
> > ---
> >   arch/powerpc/lib/code-patching.c | 152 +++++++++++--------------------
> >   1 file changed, 54 insertions(+), 98 deletions(-)
> > 
> > diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> > index 8ae1a9e5fe6e..80fe3864f377 100644
> > --- a/arch/powerpc/lib/code-patching.c
> > +++ b/arch/powerpc/lib/code-patching.c
> > @@ -19,6 +19,7 @@
> >   #include <asm/code-patching.h>
> >   #include <asm/setup.h>
> >   #include <asm/inst.h>
> > +#include <asm/mmu_context.h>
> >   
> >   static int __patch_instruction(struct ppc_inst *exec_addr, struct ppc_inst instr,
> >   			       struct ppc_inst *patch_addr)
> > @@ -77,106 +78,57 @@ void __init poking_init(void)
> >   	pte_unmap_unlock(ptep, ptl);
> >   }
> >   
> > -static DEFINE_PER_CPU(struct vm_struct *, text_poke_area);
> > -
> > -static int text_area_cpu_up(unsigned int cpu)
> > -{
> > -	struct vm_struct *area;
> > -
> > -	area = get_vm_area(PAGE_SIZE, VM_ALLOC);
> > -	if (!area) {
> > -		WARN_ONCE(1, "Failed to create text area for cpu %d\n",
> > -			cpu);
> > -		return -1;
> > -	}
> > -	this_cpu_write(text_poke_area, area);
> > -
> > -	return 0;
> > -}
> > -
> > -static int text_area_cpu_down(unsigned int cpu)
> > -{
> > -	free_vm_area(this_cpu_read(text_poke_area));
> > -	return 0;
> > -}
> > -
> > -/*
> > - * Run as a late init call. This allows all the boot time patching to be done
> > - * simply by patching the code, and then we're called here prior to
> > - * mark_rodata_ro(), which happens after all init calls are run. Although
> > - * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we judge
> > - * it as being preferable to a kernel that will crash later when someone tries
> > - * to use patch_instruction().
> > - */
> > -static int __init setup_text_poke_area(void)
> > -{
> > -	BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
> > -		"powerpc/text_poke:online", text_area_cpu_up,
> > -		text_area_cpu_down));
> > -
> > -	return 0;
> > -}
> > -late_initcall(setup_text_poke_area);
> > +struct patch_mapping {
> > +	spinlock_t *ptl; /* for protecting pte table */
> > +	pte_t *ptep;
> > +	struct temp_mm temp_mm;
> > +};
> >   
> >   /*
> >    * This can be called for kernel text or a module.
> >    */
> > -static int map_patch_area(void *addr, unsigned long text_poke_addr)
> > +static int map_patch(const void *addr, struct patch_mapping *patch_mapping)
> >   {
> > -	unsigned long pfn;
> > -	int err;
> > +	struct page *page;
> > +	pte_t pte;
> > +	pgprot_t pgprot;
> >   
> >   	if (is_vmalloc_addr(addr))
> > -		pfn = vmalloc_to_pfn(addr);
> > +		page = vmalloc_to_page(addr);
> >   	else
> > -		pfn = __pa_symbol(addr) >> PAGE_SHIFT;
> > +		page = virt_to_page(addr);
> >   
> > -	err = map_kernel_page(text_poke_addr, (pfn << PAGE_SHIFT), PAGE_KERNEL);
> > +	if (radix_enabled())
> > +		pgprot = PAGE_KERNEL;
> > +	else
> > +		pgprot = PAGE_SHARED;
> >   
> > -	pr_devel("Mapped addr %lx with pfn %lx:%d\n", text_poke_addr, pfn, err);
> > -	if (err)
> > +	patch_mapping->ptep = get_locked_pte(patching_mm, patching_addr,
> > +					     &patch_mapping->ptl);
> > +	if (unlikely(!patch_mapping->ptep)) {
> > +		pr_warn("map patch: failed to allocate pte for patching\n");
> >   		return -1;
> > +	}
> > +
> > +	pte = mk_pte(page, pgprot);
> > +	pte = pte_mkdirty(pte);
> > +	set_pte_at(patching_mm, patching_addr, patch_mapping->ptep, pte);
> > +
> > +	init_temp_mm(&patch_mapping->temp_mm, patching_mm);
> > +	use_temporary_mm(&patch_mapping->temp_mm);
> >   
> >   	return 0;
> >   }
> >   
> > -static inline int unmap_patch_area(unsigned long addr)
> > +static void unmap_patch(struct patch_mapping *patch_mapping)
> >   {
> > -	pte_t *ptep;
> > -	pmd_t *pmdp;
> > -	pud_t *pudp;
> > -	p4d_t *p4dp;
> > -	pgd_t *pgdp;
> > -
> > -	pgdp = pgd_offset_k(addr);
> > -	if (unlikely(!pgdp))
> > -		return -EINVAL;
> > -
> > -	p4dp = p4d_offset(pgdp, addr);
> > -	if (unlikely(!p4dp))
> > -		return -EINVAL;
> > -
> > -	pudp = pud_offset(p4dp, addr);
> > -	if (unlikely(!pudp))
> > -		return -EINVAL;
> > -
> > -	pmdp = pmd_offset(pudp, addr);
> > -	if (unlikely(!pmdp))
> > -		return -EINVAL;
> > -
> > -	ptep = pte_offset_kernel(pmdp, addr);
> > -	if (unlikely(!ptep))
> > -		return -EINVAL;
> > +	/* In hash, pte_clear flushes the tlb */
> > +	pte_clear(patching_mm, patching_addr, patch_mapping->ptep);
> > +	unuse_temporary_mm(&patch_mapping->temp_mm);
> >   
> > -	pr_devel("clearing mm %p, pte %p, addr %lx\n", &init_mm, ptep, addr);
> > -
> > -	/*
> > -	 * In hash, pte_clear flushes the tlb, in radix, we have to
> > -	 */
> > -	pte_clear(&init_mm, addr, ptep);
> > -	flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
> > -
> > -	return 0;
> > +	/* In radix, we have to explicitly flush the tlb (no-op in hash) */
> > +	local_flush_tlb_mm(patching_mm);
> > +	pte_unmap_unlock(patch_mapping->ptep, patch_mapping->ptl);
> >   }
> >   
> >   static int do_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
> > @@ -184,32 +136,36 @@ static int do_patch_instruction(struct ppc_inst *addr, struct ppc_inst instr)
> >   	int err;
> >   	struct ppc_inst *patch_addr = NULL;
> >   	unsigned long flags;
> > -	unsigned long text_poke_addr;
> > -	unsigned long kaddr = (unsigned long)addr;
> > +	struct patch_mapping patch_mapping;
> >   
> >   	/*
> > -	 * During early early boot patch_instruction is called
> > -	 * when text_poke_area is not ready, but we still need
> > -	 * to allow patching. We just do the plain old patching
> > +	 * The patching_mm is initialized before calling mark_rodata_ro. Prior
> > +	 * to this, patch_instruction is called when we don't have (and don't
> > +	 * need) the patching_mm so just do plain old patching.
> >   	 */
> > -	if (!this_cpu_read(text_poke_area))
> > +	if (!patching_mm)
> >   		return raw_patch_instruction(addr, instr);
> >   
> >   	local_irq_save(flags);
> >   
> > -	text_poke_addr = (unsigned long)__this_cpu_read(text_poke_area)->addr;
> > -	if (map_patch_area(addr, text_poke_addr)) {
> > -		err = -1;
> > +	err = map_patch(addr, &patch_mapping);
> > +	if (err)
> >   		goto out;
> > -	}
> >   
> > -	patch_addr = (struct ppc_inst *)(text_poke_addr + (kaddr & ~PAGE_MASK));
> > +	patch_addr = (struct ppc_inst *)(patching_addr | offset_in_page(addr));
> >   
> > -	__patch_instruction(addr, instr, patch_addr);
> > +	if (!radix_enabled())
> > +		allow_write_to_user(patch_addr, ppc_inst_len(instr));
> > +	err = __patch_instruction(addr, instr, patch_addr);
> > +	if (!radix_enabled())
> > +		prevent_write_to_user(patch_addr, ppc_inst_len(instr));
> >   
> > -	err = unmap_patch_area(text_poke_addr);
> > -	if (err)
> > -		pr_warn("failed to unmap %lx\n", text_poke_addr);
> > +	unmap_patch(&patch_mapping);
> > +	/*
> > +	 * Something is wrong if what we just wrote doesn't match what we
> > +	 * think we just wrote.
> > +	 */
> > +	WARN_ON(!ppc_inst_equal(ppc_inst_read(addr), instr));
> >   
> >   out:
> >   	local_irq_restore(flags);
> > 


^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox