LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* Re: [PATCH 03/15] powerpc/powernv/pci: Add explicit tracking of the DMA setup state
From: Alexey Kardashevskiy @ 2020-07-14  5:37 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev
In-Reply-To: <20200710052340.737567-4-oohall@gmail.com>



On 10/07/2020 15:23, Oliver O'Halloran wrote:
> There's an optimisation in the PE setup which skips performing DMA
> setup for a PE if we only have bridges in a PE. The assumption being
> that only "real" devices will DMA to system memory, which is probably
> fair. However, if we start off with only bridge devices in a PE then
> add a non-bridge device the new device won't be able to use DMA  because
> we never configured it.
> 
> Fix this (admittedly pretty weird) edge case by tracking whether we've done
> the DMA setup for the PE or not. If a non-bridge device is added to the PE
> (via rescan or hotplug, or whatever) we can set up DMA on demand.

So hotplug does not work on powernv then, right? I thought you tested it
a while ago, or this patch is the result of that attempt? If it is, then

Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>


> This also means the only remaining user of the old "DMA Weight" code is
> the IODA1 DMA setup code that it was originally added for, which is good.


Is ditching IODA1 in the plan? :)

> 
> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
> Alexey, do we need to have the IOMMU API stuff set/clear this flag?


I'd say no as that API only cares if a device is in a PE and for those
the PE DMA setup  optimization is skipped. Thanks,




> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 48 ++++++++++++++---------
>  arch/powerpc/platforms/powernv/pci.h      |  7 ++++
>  2 files changed, 36 insertions(+), 19 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index bfb40607aa0e..bb9c1cc60c33 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -141,6 +141,7 @@ static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb, int pe_no)
>  
>  	phb->ioda.pe_array[pe_no].phb = phb;
>  	phb->ioda.pe_array[pe_no].pe_number = pe_no;
> +	phb->ioda.pe_array[pe_no].dma_setup_done = false;
>  
>  	/*
>  	 * Clear the PE frozen state as it might be put into frozen state
> @@ -1685,6 +1686,12 @@ static int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
>  }
>  #endif /* CONFIG_PCI_IOV */
>  
> +static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
> +				       struct pnv_ioda_pe *pe);
> +
> +static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
> +				       struct pnv_ioda_pe *pe);
> +
>  static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
>  {
>  	struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus);
> @@ -1713,6 +1720,24 @@ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
>  		pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number);
>  	}
>  
> +	/*
> +	 * We assume that bridges *probably* don't need to do any DMA so we can
> +	 * skip allocating a TCE table, etc unless we get a non-bridge device.
> +	 */
> +	if (!pe->dma_setup_done && !pci_is_bridge(pdev)) {
> +		switch (phb->type) {
> +		case PNV_PHB_IODA1:
> +			pnv_pci_ioda1_setup_dma_pe(phb, pe);
> +			break;
> +		case PNV_PHB_IODA2:
> +			pnv_pci_ioda2_setup_dma_pe(phb, pe);
> +			break;
> +		default:
> +			pr_warn("%s: No DMA for PHB#%x (type %d)\n",
> +				__func__, phb->hose->global_number, phb->type);
> +		}
> +	}
> +
>  	if (pdn)
>  		pdn->pe_number = pe->pe_number;
>  	pe->device_count++;
> @@ -2222,6 +2247,7 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
>  	pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
>  	iommu_init_table(tbl, phb->hose->node, 0, 0);
>  
> +	pe->dma_setup_done = true;
>  	return;
>   fail:
>  	/* XXX Failure: Try to fallback to 64-bit only ? */
> @@ -2536,9 +2562,6 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  {
>  	int64_t rc;
>  
> -	if (!pnv_pci_ioda_pe_dma_weight(pe))
> -		return;
> -
>  	/* TVE #1 is selected by PCI address bit 59 */
>  	pe->tce_bypass_base = 1ull << 59;
>  
> @@ -2563,6 +2586,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
>  	iommu_register_group(&pe->table_group, phb->hose->global_number,
>  			     pe->pe_number);
>  #endif
> +	pe->dma_setup_done = true;
>  }
>  
>  int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
> @@ -3136,7 +3160,6 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
>  
>  static void pnv_pci_configure_bus(struct pci_bus *bus)
>  {
> -	struct pnv_phb *phb = pci_bus_to_pnvhb(bus);
>  	struct pci_dev *bridge = bus->self;
>  	struct pnv_ioda_pe *pe;
>  	bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
> @@ -3160,17 +3183,6 @@ static void pnv_pci_configure_bus(struct pci_bus *bus)
>  		return;
>  
>  	pnv_ioda_setup_pe_seg(pe);
> -	switch (phb->type) {
> -	case PNV_PHB_IODA1:
> -		pnv_pci_ioda1_setup_dma_pe(phb, pe);
> -		break;
> -	case PNV_PHB_IODA2:
> -		pnv_pci_ioda2_setup_dma_pe(phb, pe);
> -		break;
> -	default:
> -		pr_warn("%s: No DMA for PHB#%x (type %d)\n",
> -			__func__, phb->hose->global_number, phb->type);
> -	}
>  }
>  
>  static resource_size_t pnv_pci_default_alignment(void)
> @@ -3289,11 +3301,10 @@ static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
>  
>  static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
>  {
> -	unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
>  	struct iommu_table *tbl = pe->table_group.tables[0];
>  	int64_t rc;
>  
> -	if (!weight)
> +	if (!pe->dma_setup_done)
>  		return;
>  
>  	rc = pnv_pci_ioda1_unset_window(&pe->table_group, 0);
> @@ -3313,10 +3324,9 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
>  static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
>  {
>  	struct iommu_table *tbl = pe->table_group.tables[0];
> -	unsigned int weight = pnv_pci_ioda_pe_dma_weight(pe);
>  	int64_t rc;
>  
> -	if (!weight)
> +	if (!pe->dma_setup_done)
>  		return;
>  
>  	rc = pnv_pci_ioda2_unset_window(&pe->table_group, 0);
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index 0727dec9a0d1..6aa6aefb637d 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -87,6 +87,13 @@ struct pnv_ioda_pe {
>  	bool			tce_bypass_enabled;
>  	uint64_t		tce_bypass_base;
>  
> +	/*
> +	 * Used to track whether we've done DMA setup for this PE or not. We
> +	 * want to defer allocating TCE tables, etc until we've added a
> +	 * non-bridge device to the PE.
> +	 */
> +	bool			dma_setup_done;
> +
>  	/* MSIs. MVE index is identical for for 32 and 64 bit MSI
>  	 * and -1 if not supported. (It's actually identical to the
>  	 * PE number)
> 

-- 
Alexey

^ permalink raw reply

* Re: [RFC PATCH v0 2/2] KVM: PPC: Book3S HV: Use H_RPT_INVALIDATE in nested KVM
From: Bharata B Rao @ 2020-07-14  5:31 UTC (permalink / raw)
  To: Paul Mackerras; +Cc: aneesh.kumar, linuxppc-dev, npiggin, kvm-ppc
In-Reply-To: <20200709100711.GA2961345@thinks.paulus.ozlabs.org>

On Thu, Jul 09, 2020 at 08:07:11PM +1000, Paul Mackerras wrote:
> On Thu, Jul 09, 2020 at 02:38:51PM +0530, Bharata B Rao wrote:
> > On Thu, Jul 09, 2020 at 03:18:03PM +1000, Paul Mackerras wrote:
> > > On Fri, Jul 03, 2020 at 04:14:20PM +0530, Bharata B Rao wrote:
> > > > In the nested KVM case, replace H_TLB_INVALIDATE by the new hcall
> > > > H_RPT_INVALIDATE if available. The availability of this hcall
> > > > is determined from "hcall-rpt-invalidate" string in ibm,hypertas-functions
> > > > DT property.
> > > 
> > > What are we going to use when nested KVM supports HPT guests at L2?
> > > L1 will need to do partition-scoped tlbies with R=0 via a hypercall,
> > > but H_RPT_INVALIDATE says in its name that it only handles radix
> > > page tables (i.e. R=1).
> > 
> > For L2 HPT guests, the old hcall is expected to work after it adds
> > support for R=0 case?
> 
> That was the plan.
> 
> > The new hcall should be advertised via ibm,hypertas-functions only
> > for radix guests I suppose.
> 
> Well, the L1 hypervisor is a radix guest of L0, so it would have
> H_RPT_INVALIDATE available to it?
> 
> I guess the question is whether H_RPT_INVALIDATE is supposed to do
> everything, that is, radix process-scoped invalidations, radix
> partition-scoped invalidations, and HPT partition-scoped
> invalidations.  If that is the plan then we should call it something
> different.

Guess we are a bit late now to rename it and include HPT in the scope.

> 
> This patchset seems to imply that H_RPT_INVALIDATE is at least going
> to be used for radix partition-scoped invalidations as well as radix
> process-scoped invalidations.  If you are thinking that in future when
> we need HPT partition-scoped invalidations for a radix L1 hypervisor
> running a HPT L2 guest, we are going to define a new hypercall for
> that, I suppose that is OK, though it doesn't really seem necessary.

Guess a new hcall would be the way forward to cover the HPT L2 guest
requirements.

Thanks for pointing this out.

Regards,
Bharata.

^ permalink raw reply

* Re: [PATCH 00/11] Support for grouping cores
From: Srikar Dronamraju @ 2020-07-14  5:06 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Michael Ellerman, Anton Blanchard, linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

* Srikar Dronamraju <srikar@linux.vnet.ibm.com> [2020-07-14 10:06:13]:

> 
> On Power 9 (with device-tree enablement to show coregroups).
> -----------------------------------------------------------

With help of Gautham, I tried to kexec into a newer kernel with a modified
dtb. However when passing with the dtb option, kexec would get stuck.

Hence alternatively, I hardcoded some assumptions to test the same.
These hunks apply on top of all the patches.

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 8ec7ff05ae47..762aa573c313 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -56,6 +56,12 @@ static int n_mem_addr_cells, n_mem_size_cells;
 static int form1_affinity;
 
 #define MAX_DISTANCE_REF_POINTS 4
+/*
+ *  HAck2: On Power9, there are 12 SMT8 cores per chip.
+ *  Hard code this for now.
+ */
+#define CORES_PER_CHIP 12
+#define CORES_PER_COREGROUP (CORES_PER_CHIP / 2)
 static int distance_ref_points_depth;
 static const __be32 *distance_ref_points;
 static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
@@ -326,6 +332,13 @@ static int __init find_min_common_depth(void)
 
 	if (form1_affinity) {
 		depth = of_read_number(distance_ref_points, 1);
+		/*
+		 * Hack: Hack: Hack:
+		 * For now only on Phyp machines.
+		 */
+		if (!firmware_has_feature(FW_FEATURE_OPAL))
+			depth--;
 	} else {
 		if (distance_ref_points_depth < 2) {
 			printk(KERN_WARNING "NUMA: "
@@ -1247,8 +1260,8 @@ int cpu_to_coregroup_id(int cpu)
 		goto out;
 
 	index = of_read_number(associativity, 1);
-	if ((index > min_common_depth + 1) && coregroup_enabled)
-		return of_read_number(&associativity[index - 1], 1);
+	if ((index > min_common_depth + 1) && coregroup_enabled && has_big_cores)
+		return of_read_number(&associativity[index], 1) / CORES_PER_COREGROUP;
 
 out:
 	return cpu_to_core_id(cpu);
-- 
Thanks and Regards
Srikar Dronamraju

^ permalink raw reply related

* Re: [RFC PATCH 7/7] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Nicholas Piggin @ 2020-07-14  5:04 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, Andy Lutomirski, linuxppc-dev
In-Reply-To: <010054C3-7FFF-4FB5-BDA8-D2B80F7B1A5D@amacapital.net>

Excerpts from Andy Lutomirski's message of July 14, 2020 4:18 am:
> 
>> On Jul 13, 2020, at 9:48 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
>> 
>> Excerpts from Andy Lutomirski's message of July 14, 2020 1:59 am:
>>>> On Thu, Jul 9, 2020 at 6:57 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>>>> 
>>>> On big systems, the mm refcount can become highly contended when doing
>>>> a lot of context switching with threaded applications (particularly
>>>> switching between the idle thread and an application thread).
>>>> 
>>>> Abandoning lazy tlb slows switching down quite a bit in the important
>>>> user->idle->user cases, so instead implement a non-refcounted scheme
>>>> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
>>>> any remaining lazy ones.
>>>> 
>>>> On a 16-socket 192-core POWER8 system, a context switching benchmark
>>>> with as many software threads as CPUs (so each switch will go in and
>>>> out of idle), upstream can achieve a rate of about 1 million context
>>>> switches per second. After this patch it goes up to 118 million.
>>>> 
>>> 
>>> I read the patch a couple of times, and I have a suggestion that could
>>> be nonsense.  You are, effectively, using mm_cpumask() as a sort of
>>> refcount.  You're saying "hey, this mm has no more references, but it
>>> still has nonempty mm_cpumask(), so let's send an IPI and shoot down
>>> those references too."  I'm wondering whether you actually need the
>>> IPI.  What if, instead, you actually treated mm_cpumask as a refcount
>>> for real?  Roughly, in __mmdrop(), you would only free the page tables
>>> if mm_cpumask() is empty.  And, in the code that removes a CPU from
>>> mm_cpumask(), you would check if mm_users == 0 and, if so, check if
>>> you just removed the last bit from mm_cpumask and potentially free the
>>> mm.
>>> 
>>> Getting the locking right here could be a bit tricky -- you need to
>>> avoid two CPUs simultaneously exiting lazy TLB and thinking they
>>> should free the mm, and you also need to avoid an mm with mm_users
>>> hitting zero concurrently with the last remote CPU using it lazily
>>> exiting lazy TLB.  Perhaps this could be resolved by having mm_count
>>> == 1 mean "mm_cpumask() might contain bits and, if so, it owns the
>>> mm" and mm_count == 0 meaning "now it's dead" and using some careful
>>> cmpxchg or dec_return to make sure that only one CPU frees it.
>>> 
>>> Or maybe you'd need a lock or RCU for this, but the idea would be to
>>> only ever take the lock after mm_users goes to zero.
>> 
>> I don't think it's nonsense, it could be a good way to avoid IPIs.
>> 
>> I haven't seen much problem here that made me too concerned about IPIs 
>> yet, so I think the simple patch may be good enough to start with
>> for powerpc. I'm looking at avoiding/reducing the IPIs by combining the
>> unlazying with the exit TLB flush without doing anything fancy with
>> ref counting, but we'll see.
> 
> I would be cautious with benchmarking here. I would expect that the
> nasty cases may affect power consumption more than performance — the 
> specific issue is IPIs hitting idle cores, and the main effects are to 
> slow down exit() a bit but also to kick the idle core out of idle. 
> Although, if the idle core is in a deep sleep, that IPI could be 
> *very* slow.

It will tend to be self-limiting to some degree (deeper idle cores
would tend to have less chance of IPI) but we have bigger issues on
powerpc with that, like broadcast IPIs to the mm cpumask for THP
management. Power hasn't really shown up as an issue but powerpc
CPUs may have their own requirements and issues there, shall we say.

> So I think it’s worth at least giving this a try.

To be clear it's not a complete solution itself. The problem is of 
course that mm cpumask gives you false negatives, so the bits
won't always clean up after themselves as CPUs switch away from their
lazy tlb mms.

I would suspect it _may_ help with garbage collecting some remainders
nicely after exit, but only with somewhat of a different accounting
system than powerpc uses -- we tie mm_cpumask to TLB valids, so it can
become spread over CPUs that don't (and even have never) used that mm
as a lazy mm. I don't know that the self-culling trick would help
a great deal within that scheme.

So powerpc needs a bit more work on that side of things too, hence
looking at doing more of this in the final TLB shootdown.

There's actually a lot of other things we can do as well to reduce
IPIs, batching being a simple hammer, some kind of quiescing, testing
the remote CPU to check what active mm it is using, doing the un-lazy
at certain defined points etc, so I'm actually not worried about IPIs
suddenly popping up and rendering the whole concept unworkable. At
some point (unless we do something pretty complex like a SRCU type
thing, or adding extra locking, e.g., to use_mm()), then at least
sometimes an IPI will be required so I think it's reasonable to
start here and introduce complexity more slowly if it's justified.

Thanks,
Nick

^ permalink raw reply

* Re: [PATCH v3 4/6] powerpc/pseries/iommu: Remove default DMA window before creating DDW
From: Alexey Kardashevskiy @ 2020-07-14  4:52 UTC (permalink / raw)
  To: Leonardo Bras, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <8c29be499e8741e7d77d53ca005034a2ca0179ac.camel@gmail.com>



On 14/07/2020 12:40, Leonardo Bras wrote:
> Thank you for this feedback Alexey!
> 
> On Mon, 2020-07-13 at 17:33 +1000, Alexey Kardashevskiy wrote:
>> [...]
>>> -	int len, ret;
>>> +	int len, ret, reset_win_ext;
>>
>> Make it "reset_token".
> 
> Oh, it's not a token here, it just checks if the reset_win extension
> exists. The token would be returned in *value, but since we did not
> need it here, it's not copied.

ah right, so it is a bool actually.


> 
>>> [...]
>>> -out_failed:
>>> +out_restore_defwin:
>>> +	if (default_win && reset_win_ext == 0)
>>
>> reset_win_ext potentially may be uninitialized here. Yeah I know it is
>> tied to default_win but still.
> 
> I can't see it being used uninitialized here, as you said it's tied to
> default_win. 

Where it is declared - it is not initialized so in theory it can skip
"if (query.windows_available == 0)".


> Could you please tell me how it can be used uninitialized here, or what
> is bad by doing this way?
> 
>> After looking at this function for a few minutes, it could use some
>> refactoring (way too many gotos)  such as:
> 
> Yes, I agree.
> 
>> 1. move (query.page_size & xx) checks before "if
>> (query.windows_available == 0)"
> 
> Moving 'page_size selection' above 'checking windows available' will
> need us to duplicate the 'page_size selection' after the new query,
> inside the if.

page_size selection is not going to change, why?


> I mean, as query will be done again, it will need to get the (new) page
> size.
> 
>> 2. move "win64 = kzalloc(sizeof(struct property), GFP_KERNEL)" before
>> "if (query.windows_available == 0)"
> 
>> 3. call "reset_dma_window(dev, pdn)" inside the "if
>> (query.windows_available == 0)" branch.
> 
>> Then you can drop all "goto out_restore_defwin" and move default_win and
>> reset_win_ext inside "if (query.windows_available == 0)".
> 
> I did all changes suggested locally and did some analysis in the
> result:
> 
> I did not see a way to put default_win and reset_win_ext inside 
> "if (query.windows_available == 0)", because we still need a way to
> know if the default window was removed, and if so, restore in case
> anything ever fails ahead (like creating the node property). 

Ah, I missed that new out_restore_defwin label is between other exit
labels. Sorry :-/


> But from that analysis I noted it's possible to remove all the new
> "goto out_restore_defwin", if we do default_win = NULL if
> ddw_read_ext() fails. 
> 
> So testing only default_win should always be enough to say if the
> default window was deleted, and reset_win_ext could be moved inside "if
> (query.windows_available == 0)".
> Also, it would avoid reset_win_ext being 'used uninitialized' and
> "out_restore_defwin:" would not be needed.
> 
> Against the current patch, we would have something like this:
> 
> #####
> 
>  static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>  {
> -       int len, ret, reset_win_ext;
> +       int len, ret;
>         struct ddw_query_response query;
>         struct ddw_create_response create;
>         int page_shift;
> @@ -1173,25 +1173,28 @@ static u64 enable_ddw(struct pci_dev *dev,
> struct device_node *pdn)
>          * for extensions presence.
>          */
>         if (query.windows_available == 0) {
> +               int reset_win_ext;
>                 default_win = of_find_property(pdn, "ibm,dma-window",
> NULL);
>                 if (!default_win)
>                         goto out_failed;
>  
>                 reset_win_ext = ddw_read_ext(pdn,
> DDW_EXT_RESET_DMA_WIN, NULL);
> -               if (reset_win_ext)
> +               if (reset_win_ext){
> +                       default_win = NULL;
>                         goto out_failed;
> +               }


This says "if we can reset, then we fail", no?

>                 remove_dma_window(pdn, ddw_avail, default_win);


I think you can do "default_win=NULL" here and later at
out_restore_defwin check if it is NULL - then call reset.

>                 /* Query again, to check if the window is available */
>                 ret = query_ddw(dev, ddw_avail, &query, pdn);
>                 if (ret != 0)
> -                       goto out_restore_defwin;
> +                       goto out_failed;
>  
>                 if (query.windows_available == 0) {
>                         /* no windows are available for this device. */
>                         dev_dbg(&dev->dev, "no free dynamic windows");
> -                       goto out_restore_defwin;
> +                       goto out_failed;
>                 }
>         }
>         if (query.page_size & 4) {
> @@ -1203,7 +1206,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct
> device_node *pdn)
>         } else {
>                 dev_dbg(&dev->dev, "no supported direct page size in
> mask %x",
>                           query.page_size);
> -               goto out_restore_defwin;
> +               goto out_failed;
>         }
>         /* verify the window * number of ptes will map the partition */
>         /* check largest block * page size > max memory hotplug addr */
> @@ -1212,14 +1215,14 @@ static u64 enable_ddw(struct pci_dev *dev,
> struct device_node *pdn)
>                 dev_dbg(&dev->dev, "can't map partition max 0x%llx with
> %llu "
>                           "%llu-sized pages\n",
> max_addr,  query.largest_available_block,
>                           1ULL << page_shift);
> -               goto out_restore_defwin;
> +               goto out_failed;
>         }
>         len = order_base_2(max_addr);
>         win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
>         if (!win64) {
>                 dev_info(&dev->dev,
>                         "couldn't allocate property for 64bit dma
> window\n");
> -               goto out_restore_defwin;
> +               goto out_failed;
>         }
>         win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
>         win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
> @@ -1282,11 +1285,10 @@ static u64 enable_ddw(struct pci_dev *dev,
> struct device_node *pdn)
>         kfree(win64->value);
>         kfree(win64);
>  
> -out_restore_defwin:
> -       if (default_win && reset_win_ext == 0)
> +out_failed:
> +       if (default_win)
>                 reset_dma_window(dev, pdn);
>  
> -out_failed:
>         fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
>         if (!fpdn)
>                 goto out_unlock;
> 
> #####
> 
> What do you think?
> 
> 
> 
>> The rest of the series is good as it is,
> 
> Thank you :)
> 
>>  however it may conflict with
>> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20200713062348.100552-1-aik@ozlabs.ru/
>> and the patchset it is made on top of -
>> https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=188385 .
> 
> <From the message of the first link>
>> (do not rush, let me finish reviewing this first) 
> 
> Ok, I have no problem rebasing on top of those patchsets, but what
> would you suggest to be done?

Polish this patch one more time and if, by the time you repost it,
the other patchset is not upstream yet, I'll ask Michael to take yours first.


> Would it be ok doing a big multi-author patchset, so we guarantee it
> being applied in the correct order?
>> (You probably want me to rebase my patchset on top of Hellwig + yours,
> right?) 

Nah, at least not yet.


-- 
Alexey

^ permalink raw reply

* [PATCH 02/11] powerpc/smp: Merge Power9 topology with Power topology
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

A new sched_domain_topology_level was added just for Power9. However the
same can be achieved by merging powerpc_topology with power9_topology,
which makes the code simpler, especially when adding a new sched
domain.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 33 ++++++++++-----------------------
 1 file changed, 10 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 680c0edcc59d..069ea4b21c6d 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1315,7 +1315,7 @@ int setup_profiling_timer(unsigned int multiplier)
 }
 
 #ifdef CONFIG_SCHED_SMT
-/* cpumask of CPUs with asymetric SMT dependancy */
+/* cpumask of CPUs with asymmetric SMT dependency */
 static int powerpc_smt_flags(void)
 {
 	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
@@ -1328,14 +1328,6 @@ static int powerpc_smt_flags(void)
 }
 #endif
 
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
-	{ NULL, },
-};
-
 /*
  * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
  * This topology makes it *much* cheaper to migrate tasks between adjacent cores
@@ -1353,7 +1345,13 @@ static int powerpc_shared_cache_flags(void)
  */
 static const struct cpumask *shared_cache_mask(int cpu)
 {
-	return cpu_l2_cache_mask(cpu);
+	if (shared_caches)
+		return cpu_l2_cache_mask(cpu);
+
+	if (has_big_cores)
+		return cpu_smallcore_mask(cpu);
+
+	return cpu_smt_mask(cpu);
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -1363,7 +1361,7 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
-static struct sched_domain_topology_level power9_topology[] = {
+static struct sched_domain_topology_level powerpc_topology[] = {
 #ifdef CONFIG_SCHED_SMT
 	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
@@ -1388,21 +1386,10 @@ void __init smp_cpus_done(unsigned int max_cpus)
 #ifdef CONFIG_SCHED_SMT
 	if (has_big_cores) {
 		pr_info("Big cores detected but using small core scheduling\n");
-		power9_topology[0].mask = smallcore_smt_mask;
 		powerpc_topology[0].mask = smallcore_smt_mask;
 	}
 #endif
-	/*
-	 * If any CPU detects that it's sharing a cache with another CPU then
-	 * use the deeper topology that is aware of this sharing.
-	 */
-	if (shared_caches) {
-		pr_info("Using shared cache scheduler topology\n");
-		set_sched_topology(power9_topology);
-	} else {
-		pr_info("Using standard scheduler topology\n");
-		set_sched_topology(powerpc_topology);
-	}
+	set_sched_topology(powerpc_topology);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
2.17.1


^ permalink raw reply related

* [PATCH 11/11] powerpc/smp: Provide an ability to disable coregroup
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

If user wants to enable coregroup sched_domain then they can boot with
kernel parameter "coregroup_support=on"

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index bb25c13bbb79..c43909e6e8e9 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -118,6 +118,23 @@ struct smp_ops_t *smp_ops;
 volatile unsigned int cpu_callin_map[NR_CPUS];
 
 int smt_enabled_at_boot = 1;
+int coregroup_support;
+
+static int __init early_coregroup(char *p)
+{
+	if (!p)
+		return 0;
+
+	if (strstr(p, "on"))
+		coregroup_support = 1;
+
+	if (strstr(p, "1"))
+		coregroup_support = 1;
+
+	return 0;
+}
+
+early_param("coregroup_support", early_coregroup);
 
 /*
  * Returns 1 if the specified cpu should be brought up during boot.
@@ -878,7 +895,7 @@ static struct cpumask *cpu_coregroup_mask(int cpu)
 
 static bool has_coregroup_support(void)
 {
-	return coregroup_enabled;
+	return coregroup_enabled && coregroup_support;
 }
 
 static const struct cpumask *cpu_mc_mask(int cpu)
-- 
2.17.1


^ permalink raw reply related

* [PATCH 10/11] powerpc/smp: Implement cpu_to_coregroup_id
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

Lookup the coregroup id from the associativity array.

If unable to detect the coregroup id, fallback on the core id.
This way, ensure sched_domain degenerates and an extra sched domain is
not created.

Ideally this function should have been implemented in
arch/powerpc/kernel/smp.c. However if it's implemented in mm/numa.c, we
don't need to find the primary domain again.

If the device-tree mentions more than one coregroup, then kernel
implements only the last or the smallest coregroup, which currently
corresponds to the penultimate domain in the device-tree.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/mm/numa.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index d9ab9da85eab..4e85564ef62a 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1697,6 +1697,23 @@ static const struct proc_ops topology_proc_ops = {
 
 int cpu_to_coregroup_id(int cpu)
 {
+	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+	int index;
+
+	if (cpu < 0 || cpu > nr_cpu_ids)
+		return -1;
+
+	if (!firmware_has_feature(FW_FEATURE_VPHN))
+		goto out;
+
+	if (vphn_get_associativity(cpu, associativity))
+		goto out;
+
+	index = of_read_number(associativity, 1);
+	if ((index > min_common_depth + 1) && coregroup_enabled)
+		return of_read_number(&associativity[index - 1], 1);
+
+out:
 	return cpu_to_core_id(cpu);
 }
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH 09/11] Powerpc/smp: Create coregroup domain
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

Add percpu coregroup maps and masks to create a coregroup domain.
If a coregroup doesn't exist, the coregroup domain will be degenerated
in favour of the SMT/CACHE domain.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/topology.h | 10 ++++++++
 arch/powerpc/kernel/smp.c           | 37 +++++++++++++++++++++++++++++
 arch/powerpc/mm/numa.c              |  5 ++++
 3 files changed, 52 insertions(+)

diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h
index 2db7ba789720..34812c35018e 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -98,6 +98,7 @@ extern int stop_topology_update(void);
 extern int prrn_is_enabled(void);
 extern int find_and_online_cpu_nid(int cpu);
 extern int timed_topology_update(int nsecs);
+extern int cpu_to_coregroup_id(int cpu);
 #else
 static inline int start_topology_update(void)
 {
@@ -120,6 +121,15 @@ static inline int timed_topology_update(int nsecs)
 	return 0;
 }
 
+static inline int cpu_to_coregroup_id(int cpu)
+{
+#ifdef CONFIG_SMP
+	return cpu_to_core_id(cpu);
+#else
+	return 0;
+#endif
+}
+
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include <asm-generic/topology.h>
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index ef19eeccd21e..bb25c13bbb79 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -80,6 +80,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map);
 
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
@@ -91,6 +92,7 @@ enum {
 	smt_idx,
 #endif
 	bigcore_idx,
+	mc_idx,
 	die_idx,
 };
 
@@ -869,6 +871,21 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
+static struct cpumask *cpu_coregroup_mask(int cpu)
+{
+	return per_cpu(cpu_coregroup_map, cpu);
+}
+
+static bool has_coregroup_support(void)
+{
+	return coregroup_enabled;
+}
+
+static const struct cpumask *cpu_mc_mask(int cpu)
+{
+	return cpu_coregroup_mask(cpu);
+}
+
 static const struct cpumask *cpu_bigcore_mask(int cpu)
 {
 	return cpu_core_mask(cpu);
@@ -879,6 +896,7 @@ static struct sched_domain_topology_level powerpc_topology[] = {
 	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
 	{ cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
+	{ cpu_mc_mask, SD_INIT_NAME(MC) },
 	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 	{ NULL, },
 };
@@ -933,6 +951,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 					GFP_KERNEL, node);
 		zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
 					GFP_KERNEL, node);
+		if (has_coregroup_support())
+			zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, cpu),
+						GFP_KERNEL, node);
+
 #ifdef CONFIG_NEED_MULTIPLE_NODES
 		/*
 		 * numa_node_id() works after this.
@@ -950,6 +972,11 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
 	cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
+	if (has_coregroup_support())
+		cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
+	else
+		powerpc_topology[mc_idx].mask = cpu_bigcore_mask;
+
 	init_big_cores();
 	if (has_big_cores) {
 		cpumask_set_cpu(boot_cpuid,
@@ -1241,6 +1268,8 @@ static void remove_cpu_from_masks(int cpu)
 		set_cpus_unrelated(cpu, i, cpu_sibling_mask);
 		if (has_big_cores)
 			set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
+		if (has_coregroup_support())
+			set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
 	}
 }
 #endif
@@ -1301,6 +1330,14 @@ static void add_cpu_to_masks(int cpu)
 	add_cpu_to_smallcore_masks(cpu);
 	update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
+	if (has_coregroup_support()) {
+		cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
+		for_each_cpu(i, cpu_online_mask) {
+			if (cpu_to_coregroup_id(cpu) == cpu_to_coregroup_id(i))
+				set_cpus_related(cpu, i, cpu_coregroup_mask);
+		}
+	}
+
 	if (pkg_id == -1) {
 		struct cpumask *(*mask)(int) = cpu_sibling_mask;
 
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index a43eab455be4..d9ab9da85eab 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1695,6 +1695,11 @@ static const struct proc_ops topology_proc_ops = {
 	.proc_release	= single_release,
 };
 
+int cpu_to_coregroup_id(int cpu)
+{
+	return cpu_to_core_id(cpu);
+}
+
 static int topology_update_init(void)
 {
 	start_topology_update();
-- 
2.17.1


^ permalink raw reply related

* [PATCH 08/11] powerpc/smp: Allocate cpumask only after searching thread group
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

If the cpumask is allocated earlier and the search fails, then the
cpumask needs to be freed. However, cpu_l1_cache_map can instead be
allocated after we search the thread group.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 96e47450d9b3..ef19eeccd21e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -797,10 +797,6 @@ static int init_cpu_l1_cache_map(int cpu)
 	if (err)
 		goto out;
 
-	zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
-				GFP_KERNEL,
-				cpu_to_node(cpu));
-
 	cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
 
 	if (unlikely(cpu_group_start == -1)) {
@@ -809,6 +805,9 @@ static int init_cpu_l1_cache_map(int cpu)
 		goto out;
 	}
 
+	zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
+				GFP_KERNEL, cpu_to_node(cpu));
+
 	for (i = first_thread; i < first_thread + threads_per_core; i++) {
 		int i_group_start = get_cpu_thread_group_start(i, &tg);
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH 07/11] Powerpc/numa: Detect support for coregroup
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

Add support for grouping cores based on the device-tree classification.
- The last domain in the associativity domains always refers to the
core.
- If the primary reference domain happens to be the penultimate domain
in the associativity domains device-tree property, then there are no
coregroups. However, if it is not the penultimate domain, then there
are coregroups. There can be more than one coregroup. For now we are
interested only in the last or the smallest coregroup.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/smp.h |  1 +
 arch/powerpc/kernel/smp.c      |  1 +
 arch/powerpc/mm/numa.c         | 34 +++++++++++++++++++++-------------
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 49a25e2400f2..5bdc17a7049f 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -28,6 +28,7 @@
 extern int boot_cpuid;
 extern int spinning_secondaries;
 extern u32 *cpu_to_phys_id;
+extern bool coregroup_enabled;
 
 extern void cpu_die(void);
 extern int cpu_to_chip_id(int cpu);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index f8faf75135af..96e47450d9b3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -74,6 +74,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
 struct task_struct *secondary_current;
 bool has_big_cores;
+bool coregroup_enabled;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index fc7b0505bdd8..a43eab455be4 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -887,7 +887,9 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
 static void __init find_possible_nodes(void)
 {
 	struct device_node *rtas;
-	u32 numnodes, i;
+	const __be32 *domains;
+	int prop_length, max_nodes;
+	u32 i;
 
 	if (!numa_enabled)
 		return;
@@ -896,25 +898,31 @@ static void __init find_possible_nodes(void)
 	if (!rtas)
 		return;
 
-	if (of_property_read_u32_index(rtas, "ibm,current-associativity-domains",
-				min_common_depth, &numnodes)) {
-		/*
-		 * ibm,current-associativity-domains is a fairly recent
-		 * property. If it doesn't exist, then fallback on
-		 * ibm,max-associativity-domains. Current denotes what the
-		 * platform can support compared to max which denotes what the
-		 * Hypervisor can support.
-		 */
-		if (of_property_read_u32_index(rtas, "ibm,max-associativity-domains",
-				min_common_depth, &numnodes))
+	/*
+	 * ibm,current-associativity-domains is a fairly recent property. If
+	 * it doesn't exist, then fallback on ibm,max-associativity-domains.
+	 * Current denotes what the platform can support compared to max
+	 * which denotes what the Hypervisor can support.
+	 */
+	domains = of_get_property(rtas, "ibm,current-associativity-domains",
+					&prop_length);
+	if (!domains) {
+		domains = of_get_property(rtas, "ibm,max-associativity-domains",
+					&prop_length);
+		if (!domains)
 			goto out;
 	}
 
-	for (i = 0; i < numnodes; i++) {
+	max_nodes = of_read_number(&domains[min_common_depth], 1);
+	for (i = 0; i < max_nodes; i++) {
 		if (!node_possible(i))
 			node_set(i, node_possible_map);
 	}
 
+	prop_length /= sizeof(int);
+	if (prop_length > min_common_depth + 2)
+		coregroup_enabled = 1;
+
 out:
 	of_node_put(rtas);
 }
-- 
2.17.1


^ permalink raw reply related

* [PATCH 06/11] powerpc/smp: Generalize 2nd sched domain
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

Currently the "CACHE" domain happens to be the 2nd sched domain as per
powerpc_topology. This domain will collapse if the cpumask of l2-cache
is the same as the SMT domain. However, we could generalize this domain
such that it could be either a "CACHE" domain or a "BIGCORE" domain.

While setting up the "CACHE" domain, check if shared_caches is already
set.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 48 +++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 875f57e41355..f8faf75135af 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -85,6 +85,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 EXPORT_SYMBOL_GPL(has_big_cores);
 
+enum {
+#ifdef CONFIG_SCHED_SMT
+	smt_idx,
+#endif
+	bigcore_idx,
+	die_idx,
+};
+
 #define MAX_THREAD_LIST_SIZE	8
 #define THREAD_GROUP_SHARE_L1   1
 struct thread_groups {
@@ -851,13 +859,7 @@ static int powerpc_shared_cache_flags(void)
  */
 static const struct cpumask *shared_cache_mask(int cpu)
 {
-	if (shared_caches)
-		return cpu_l2_cache_mask(cpu);
-
-	if (has_big_cores)
-		return cpu_smallcore_mask(cpu);
-
-	return cpu_smt_mask(cpu);
+	return per_cpu(cpu_l2_cache_map, cpu);
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -867,11 +869,16 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
+static const struct cpumask *cpu_bigcore_mask(int cpu)
+{
+	return cpu_core_mask(cpu);
+}
+
 static struct sched_domain_topology_level powerpc_topology[] = {
 #ifdef CONFIG_SCHED_SMT
 	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
-	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+	{ cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
 	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 	{ NULL, },
 };
@@ -895,7 +902,7 @@ static int init_big_cores(void)
 
 #ifdef CONFIG_SCHED_SMT
 	pr_info("Big cores detected. Using small core scheduling\n");
-	powerpc_topology[0].mask = smallcore_smt_mask;
+	powerpc_topology[smt_idx].mask = smallcore_smt_mask;
 #endif
 
 	return 0;
@@ -1319,7 +1326,6 @@ static void add_cpu_to_masks(int cpu)
 void start_secondary(void *unused)
 {
 	unsigned int cpu = smp_processor_id();
-	struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
 
 	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
@@ -1345,14 +1351,20 @@ void start_secondary(void *unused)
 	/* Update topology CPU masks */
 	add_cpu_to_masks(cpu);
 
-	if (has_big_cores)
-		sibling_mask = cpu_smallcore_mask;
 	/*
 	 * Check for any shared caches. Note that this must be done on a
 	 * per-core basis because one core in the pair might be disabled.
 	 */
-	if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
-		shared_caches = true;
+	if (!shared_caches) {
+		struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
+		struct cpumask *mask = cpu_l2_cache_mask(cpu);
+
+		if (has_big_cores)
+			sibling_mask = cpu_smallcore_mask;
+
+		if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
+			shared_caches = true;
+	}
 
 	set_numa_node(numa_cpu_lookup_table[cpu]);
 	set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
@@ -1390,6 +1402,14 @@ void __init smp_cpus_done(unsigned int max_cpus)
 		smp_ops->bringup_done();
 
 	dump_numa_cpu_topology();
+	if (shared_caches) {
+		pr_info("Using shared cache scheduler topology\n");
+		powerpc_topology[bigcore_idx].mask = shared_cache_mask;
+#ifdef CONFIG_SCHED_DEBUG
+		powerpc_topology[bigcore_idx].name = "CACHE";
+#endif
+		powerpc_topology[bigcore_idx].sd_flags = powerpc_shared_cache_flags;
+	}
 
 	set_sched_topology(powerpc_topology);
 }
-- 
2.17.1


^ permalink raw reply related

* [PATCH 05/11] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

Current code assumes that the cpumask of CPUs sharing an l2-cache will
always be a superset of cpu_sibling_mask.

Let's stop making that assumption.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 7d430fc536cc..875f57e41355 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1198,6 +1198,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
 	struct device_node *l2_cache, *np;
 	int i;
 
+	cpumask_set_cpu(cpu, mask_fn(cpu));
 	l2_cache = cpu_to_l2cache(cpu);
 	if (!l2_cache)
 		return false;
@@ -1284,29 +1285,30 @@ static void add_cpu_to_masks(int cpu)
 	 * add it to it's own thread sibling mask.
 	 */
 	cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+	cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 
 	for (i = first_thread; i < first_thread + threads_per_core; i++)
 		if (cpu_online(i))
 			set_cpus_related(i, cpu, cpu_sibling_mask);
 
 	add_cpu_to_smallcore_masks(cpu);
-	/*
-	 * Copy the thread sibling mask into the cache sibling mask
-	 * and mark any CPUs that share an L2 with this CPU.
-	 */
-	for_each_cpu(i, cpu_sibling_mask(cpu))
-		set_cpus_related(cpu, i, cpu_l2_cache_mask);
 	update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
-	/*
-	 * Copy the cache sibling mask into core sibling mask and mark
-	 * any CPUs on the same chip as this CPU.
-	 */
-	for_each_cpu(i, cpu_l2_cache_mask(cpu))
-		set_cpus_related(cpu, i, cpu_core_mask);
+	if (pkg_id == -1) {
+		struct cpumask *(*mask)(int) = cpu_sibling_mask;
+
+		/*
+		 * Copy the sibling mask into core sibling mask and
+		 * mark any CPUs on the same chip as this CPU.
+		 */
+		if (shared_caches)
+			mask = cpu_l2_cache_mask;
+
+		for_each_cpu(i, mask(cpu))
+			set_cpus_related(cpu, i, cpu_core_mask);
 
-	if (pkg_id == -1)
 		return;
+	}
 
 	for_each_cpu(i, cpu_online_mask)
 		if (get_physical_package_id(i) == pkg_id)
-- 
2.17.1


^ permalink raw reply related

* [PATCH 04/11] powerpc/smp: Enable small core scheduling sooner
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

Enable small core scheduling as soon as we detect that we are in a
system that supports thread groups. Doing so avoids a redundant
check.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 24529f6134aa..7d430fc536cc 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -892,6 +892,12 @@ static int init_big_cores(void)
 	}
 
 	has_big_cores = true;
+
+#ifdef CONFIG_SCHED_SMT
+	pr_info("Big cores detected. Using small core scheduling\n");
+	powerpc_topology[0].mask = smallcore_smt_mask;
+#endif
+
 	return 0;
 }
 
@@ -1383,12 +1389,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
 	dump_numa_cpu_topology();
 
-#ifdef CONFIG_SCHED_SMT
-	if (has_big_cores) {
-		pr_info("Big cores detected but using small core scheduling\n");
-		powerpc_topology[0].mask = smallcore_smt_mask;
-	}
-#endif
 	set_sched_topology(powerpc_topology);
 }
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH 03/11] powerpc/smp: Move powerpc_topology above
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

Just move the powerpc_topology description further up in the file.
This will help in using functions in this file and avoid forward
declarations.

No other functional changes.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 116 +++++++++++++++++++-------------------
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 069ea4b21c6d..24529f6134aa 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -818,6 +818,64 @@ static int init_cpu_l1_cache_map(int cpu)
 	return err;
 }
 
+static bool shared_caches;
+
+#ifdef CONFIG_SCHED_SMT
+/* cpumask of CPUs with asymmetric SMT dependency */
+static int powerpc_smt_flags(void)
+{
+	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+
+	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+		flags |= SD_ASYM_PACKING;
+	}
+	return flags;
+}
+#endif
+
+/*
+ * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
+ * This topology makes it *much* cheaper to migrate tasks between adjacent cores
+ * since the migrated task remains cache hot. We want to take advantage of this
+ * at the scheduler level so an extra topology level is required.
+ */
+static int powerpc_shared_cache_flags(void)
+{
+	return SD_SHARE_PKG_RESOURCES;
+}
+
+/*
+ * We can't just pass cpu_l2_cache_mask() directly because
+ * returns a non-const pointer and the compiler barfs on that.
+ */
+static const struct cpumask *shared_cache_mask(int cpu)
+{
+	if (shared_caches)
+		return cpu_l2_cache_mask(cpu);
+
+	if (has_big_cores)
+		return cpu_smallcore_mask(cpu);
+
+	return cpu_smt_mask(cpu);
+}
+
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *smallcore_smt_mask(int cpu)
+{
+	return cpu_smallcore_mask(cpu);
+}
+#endif
+
+static struct sched_domain_topology_level powerpc_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
 static int init_big_cores(void)
 {
 	int cpu;
@@ -1249,8 +1307,6 @@ static void add_cpu_to_masks(int cpu)
 			set_cpus_related(cpu, i, cpu_core_mask);
 }
 
-static bool shared_caches;
-
 /* Activate a secondary processor. */
 void start_secondary(void *unused)
 {
@@ -1314,62 +1370,6 @@ int setup_profiling_timer(unsigned int multiplier)
 	return 0;
 }
 
-#ifdef CONFIG_SCHED_SMT
-/* cpumask of CPUs with asymmetric SMT dependency */
-static int powerpc_smt_flags(void)
-{
-	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
-
-	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
-		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
-		flags |= SD_ASYM_PACKING;
-	}
-	return flags;
-}
-#endif
-
-/*
- * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
- * This topology makes it *much* cheaper to migrate tasks between adjacent cores
- * since the migrated task remains cache hot. We want to take advantage of this
- * at the scheduler level so an extra topology level is required.
- */
-static int powerpc_shared_cache_flags(void)
-{
-	return SD_SHARE_PKG_RESOURCES;
-}
-
-/*
- * We can't just pass cpu_l2_cache_mask() directly because
- * returns a non-const pointer and the compiler barfs on that.
- */
-static const struct cpumask *shared_cache_mask(int cpu)
-{
-	if (shared_caches)
-		return cpu_l2_cache_mask(cpu);
-
-	if (has_big_cores)
-		return cpu_smallcore_mask(cpu);
-
-	return cpu_smt_mask(cpu);
-}
-
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *smallcore_smt_mask(int cpu)
-{
-	return cpu_smallcore_mask(cpu);
-}
-#endif
-
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
-	{ NULL, },
-};
-
 void __init smp_cpus_done(unsigned int max_cpus)
 {
 	/*
-- 
2.17.1


^ permalink raw reply related

* [PATCH 01/11] powerpc/smp: Cache node for reuse
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

cpu_to_node() is an inline function that accesses a per-CPU variable.
However, when it is used repeatedly, it may be cleaner to cache its
result in a local variable.

Also fix a build error seen in some weird configs:
"error: _numa_cpu_lookup_table_ undeclared"

No functional change

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 73199470c265..680c0edcc59d 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -843,7 +843,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 
 	DBG("smp_prepare_cpus\n");
 
-	/* 
+	/*
 	 * setup_cpu may need to be called on the boot cpu. We havent
 	 * spun any cpus up but lets be paranoid.
 	 */
@@ -854,20 +854,24 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	cpu_callin_map[boot_cpuid] = 1;
 
 	for_each_possible_cpu(cpu) {
+		int node = cpu_to_node(cpu);
+
 		zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu),
-					GFP_KERNEL, cpu_to_node(cpu));
+					GFP_KERNEL, node);
 		zalloc_cpumask_var_node(&per_cpu(cpu_l2_cache_map, cpu),
-					GFP_KERNEL, cpu_to_node(cpu));
+					GFP_KERNEL, node);
 		zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
-					GFP_KERNEL, cpu_to_node(cpu));
+					GFP_KERNEL, node);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
 		/*
 		 * numa_node_id() works after this.
 		 */
 		if (cpu_present(cpu)) {
-			set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]);
-			set_cpu_numa_mem(cpu,
-				local_memory_node(numa_cpu_lookup_table[cpu]));
+			node = numa_cpu_lookup_table[cpu];
+			set_cpu_numa_node(cpu, node);
+			set_cpu_numa_mem(cpu, local_memory_node(node));
 		}
+#endif
 	}
 
 	/* Init the cpumasks so the boot CPU is related to itself */
-- 
2.17.1


^ permalink raw reply related

* [PATCH 00/11] Support for grouping cores
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin

Clean up the existing powerpc topologies and add support for grouping cores.

Patch 7 of this patch series: "Powerpc/numa: Detect support for coregroup"
depends on
https://lore.kernel.org/linuxppc-dev/20200707140644.7241-1-srikar@linux.vnet.ibm.com/t/#u
However it should be easy to rebase the patch without the above patch.

On Power 8 Systems
------------------
$ tail /proc/cpuinfo
processor	: 255
cpu		: POWER8 (architected), altivec supported
clock		: 3724.000000MHz
revision	: 2.1 (pvr 004b 0201)

timebase	: 512000000
platform	: pSeries
model		: IBM,8408-E8E
machine		: CHRP IBM,8408-E8E
MMU		: Hash

Before the patchset
-------------------
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
DIE
NUMA
NUMA
$ head /proc/schedstat
version 15
timestamp 4295534931
cpu0 0 0 0 0 0 0 41389823338 17682779896 14117
domain0 00000000,00000000,00000000,00000000,00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,00000000,00000000,00000000,00000000,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 00000000,00000000,00000000,00000000,00000000,00000000,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 27087859050 152273672 10396
domain0 00000000,00000000,00000000,00000000,00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,00000000,00000000,00000000,00000000,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

After the patchset
------------------
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
DIE
NUMA
NUMA
$ head /proc/schedstat
version 15
timestamp 4295534931
cpu0 0 0 0 0 0 0 41389823338 17682779896 14117
domain0 00000000,00000000,00000000,00000000,00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,00000000,00000000,00000000,00000000,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 00000000,00000000,00000000,00000000,00000000,00000000,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 27087859050 152273672 10396
domain0 00000000,00000000,00000000,00000000,00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,00000000,00000000,00000000,00000000,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

On Power 9 (with device-tree enablement to show coregroups).
-----------------------------------------------------------
$ tail /proc/cpuinfo
processor	: 127
cpu		: POWER9 (architected), altivec supported
clock		: 3000.000000MHz
revision	: 2.2 (pvr 004e 0202)

timebase	: 512000000
platform	: pSeries
model		: IBM,9008-22L
machine		: CHRP IBM,9008-22L
MMU		: Hash

Before patchset
--------------
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
CACHE
DIE
NUMA

$ head /proc/schedstat
version 15
timestamp 4318242208
cpu0 0 0 0 0 0 0 28077107004 4773387362 78205
domain0 00000000,00000000,00000000,00000055 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 00000000,00000000,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ffffffff,ffffffff,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 24177439200 413887604 75393
domain0 00000000,00000000,00000000,000000aa 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

After patchset
--------------
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
CACHE
MC
DIE
NUMA

$ head /proc/schedstat
version 15
timestamp 4318242208
cpu0 0 0 0 0 0 0 28077107004 4773387362 78205
domain0 00000000,00000000,00000000,00000055 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 00000000,00000000,00000000,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 00000000,00000000,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain4 ffffffff,ffffffff,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 24177439200 413887604 75393
domain0 00000000,00000000,00000000,000000aa 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>

Srikar Dronamraju (11):
  powerpc/smp: Cache node for reuse
  powerpc/smp: Merge Power9 topology with Power topology
  powerpc/smp: Move powerpc_topology above
  powerpc/smp: Enable small core scheduling sooner
  powerpc/smp: Dont assume l2-cache to be superset of sibling
  powerpc/smp: Generalize 2nd sched domain
  Powerpc/numa: Detect support for coregroup
  powerpc/smp: Allocate cpumask only after searching thread group
  Powerpc/smp: Create coregroup domain
  powerpc/smp: Implement cpu_to_coregroup_id
  powerpc/smp: Provide an ability to disable coregroup

 arch/powerpc/include/asm/smp.h      |   1 +
 arch/powerpc/include/asm/topology.h |  10 +
 arch/powerpc/kernel/smp.c           | 277 +++++++++++++++++-----------
 arch/powerpc/mm/numa.c              |  56 ++++--
 4 files changed, 226 insertions(+), 118 deletions(-)

-- 
2.17.1


^ permalink raw reply

* Re: [PATCH 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper
From: Steven Rostedt @ 2020-07-14  2:04 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Catalin Marinas, Kefeng Wang, Paul Mackerras, Zong Li, Andi Kleen,
	Paul Burton, Vincent Whitchurch, Petr Mladek, Brian Gerst,
	Andy Lutomirski, Yonghong Song, Thomas Gleixner, Jiri Kosina,
	Anup Patel, Linux Kernel Mailing List, Philipp Rudo, Torsten Duwe,
	Masami Hiramatsu, Andrew Morton, Mark Rutland,
	James E.J. Bottomley, Vincent Chen, open list:S390, Joe Lawrence,
	Helge Deller, John Fastabend, Anil S Keshavamurthy,
	Andrey Ryabinin, Iurii Zaikin, Andrii Nakryiko, Vasily Gorbik,
	moderated list:ARM PORT, Daniel Axtens, Damien Le Moal,
	Peter Oberparleiter, Sean Christopherson, Martin KaFai Lau,
	Song Liu, Paul Walmsley, Heiko Carstens, Alexei Starovoitov,
	Jarkko Sakkinen, Atish Patra, Will Deacon, Daniel Borkmann,
	Masahiro Yamada, Nayna Jain, Ley Foon Tan, Christian Borntraeger,
	Dmitry Vyukov, Sami Tolvanen, Naveen N. Rao, Mao Han, Marco Elver,
	Babu Moger, Borislav Petkov, Greentime Hu, Ben Dooks, Guan Xuetao,
	Thomas Bogendoerfer, open list:PARISC ARCHITECTURE, Jessica Yu,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT), David S. Miller,
	Thiago Jung Bauermann, Peter Zijlstra,
	open list:SPARC + UltraSPARC (sparc/sparc64), Sandipan Das,
	H. Peter Anvin, Amit Daniel Kachhap, Tiezhu Yang, Miroslav Benes,
	Jiri Olsa, open list:RISC-V ARCHITECTURE, Vincenzo Frascino,
	Anders Roxell, Sven Schnelle,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT), Russell King,
	Mike Rapoport, Ingo Molnar, Albert Ou, Paul E. McKenney,
	Josh Poimboeuf, KP Singh, Gerald Schaefer, Nick Hu,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT), open list:MIPS,
	Sergey Senozhatsky, Palmer Dabbelt,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT)
In-Reply-To: <CAMj1kXGhZYxjZTP+_PGdBy4hZgdeeTNUkuaE_eQKwB4pPAYNXA@mail.gmail.com>

On Mon, 13 Jul 2020 22:49:48 +0300
Ard Biesheuvel <ardb@kernel.org> wrote:

> On arm64, we no longer use module_alloc for bpf or kprobes, to avoid
> wasting va space on code that does not need to be loaded close to the
> kernel. Also, module_alloc() allocates kasan shadow, which is
> unnecessary for kprobes or bpf programs, which don't have kasan
> instrumentation.
> 
> This patch suggests that there are other reasons why conflating
> allocation of module space and allocating  text pages for other uses
> is a bad idea, but switching all users to text_alloc() is a step in
> the wrong direction. It would be better to stop using module_alloc()
> in core code except in the module loader, and have a generic
> text_alloc() that can be overridden by the arch if necessary. Note
> that x86  and s390 are the only architectures that use module_alloc()
> in ftrace code.
> 
> Please have a look at alloc_insn_page() or bpf_jit_alloc_exec() in the
> arm64 tree to see what I mean.

Hmm, so you have another method for allocating memory for trampolines?
(I haven't looked at those functions you pointed out, out of sheer
laziness ;-)

It would be nice to implement the trampoline optimization in arm, which
x86 has (see arch_ftrace_update_trampoline() and
arch_ftrace_trampoline_func()).

It helps when you have two different callbacks for different functions
(like having live patching enabled and function tracing enabled, or
kprobes using ftrace). Each callback will get its own allocated
trampoline to jump to instead of jumping to a trampoline that calls
a looping function that tests to see which callback wants to be called
by the traced function.

-- Steve

^ permalink raw reply

* Re: [PATCH 14/14] powerpc/eeh: Move PE tree setup into the platform
From: Oliver O'Halloran @ 2020-07-14  3:08 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev, Mahesh J Salgaonkar
In-Reply-To: <d2ce45e8-92e6-f875-fd38-c3fa82bf8f4e@ozlabs.ru>

On Tue, Jul 14, 2020 at 11:50 AM Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>
> On 06/07/2020 11:36, Oliver O'Halloran wrote:
> >  /**
> >   * eeh_pe_tree_insert - Add EEH device to parent PE
> >   * @edev: EEH device
> > + * @new_pe_parent: PE to create additional PEs under
> >   *
> > - * Add EEH device to the parent PE. If the parent PE already
> > - * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
> > - * we have to create new PE to hold the EEH device and the new
> > - * PE will be linked to its parent PE as well.
> > + * Add EEH device to the PE in edev->pe_config_addr. If a PE already
> > + * exists with that address then @edev is added to that PE. Otherwise
> > + * a new PE is created and inserted into the PE tree as a child of
> > + * @new_pe_parent.
> > + *
> > + * If @new_pe_parent is NULL then the new PE will be inserted under
> > + * directly under the the PHB.
> >   */
> > -int eeh_pe_tree_insert(struct eeh_dev *edev)
> > +int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent)
> >  {
> >       struct pci_controller *hose = edev->controller;
> >       struct eeh_pe *pe, *parent;
>
>
> We can ditch this "parent" in that single place now and use "pe"
> instead. And name the new parameter simply "parent". Dunno if it
> improves things though.

I did this at some point and then decided not to. It added a bunch of
noise to the diff and calling the parameter "parent" ended up being
pretty unreadable. The parameter is "the parent of the PE that will be
created to contain edev", or "parent of the parent PE". It's pretty
unwieldy.

> > @@ -399,7 +366,7 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
> >                       }
> >
> >                       eeh_edev_dbg(edev,
> > -                                  "Added to device PE (parent: PE#%x)\n",
> > +                                  "Added to existing PE (parent: PE#%x)\n",
> >                                    pe->parent->addr);
> >               } else {
> >                       /* Mark the PE as type of PCI bus */
> > @@ -431,10 +398,9 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
> >        * to PHB directly. Otherwise, we have to associate the
> >        * PE with its parent.
> >        */
> > -     parent = eeh_pe_get_parent(edev);
> > -     if (!parent) {
> > -             parent = eeh_phb_pe_get(hose);
> > -             if (!parent) {
> > +     if (!new_pe_parent) {
> > +             new_pe_parent = eeh_phb_pe_get(hose);
> > +             if (!new_pe_parent) {
>
>
>
> afaict only pseries can realisticly pass new_pe_parent==NULL so this
> chunk could go to pseries_eeh_pe_get_parent.

pnv_eeh_get_upstream_pe() will never return the PHB PE so
new_pe_parent will be NULL for the first PE created under a PowerNV
PHB. I guess we could move the PHB PE handling into the platform too,
but I think that just results in having to special case PHB PEs in two
places rather than one.

> > +static struct eeh_pe *pseries_eeh_pe_get_parent(struct eeh_dev *edev)
> > +{
> > +     struct eeh_dev *parent;
> > +     struct pci_dn *pdn = eeh_dev_to_pdn(edev);
> > +
> > +     /*
> > +      * It might have the case for the indirect parent
> > +      * EEH device already having associated PE, but
> > +      * the direct parent EEH device doesn't have yet.
> > +      */
> > +     if (edev->physfn)
> > +             pdn = pci_get_pdn(edev->physfn);
> > +     else
> > +             pdn = pdn ? pdn->parent : NULL;
> > +     while (pdn) {
> > +             /* We're poking out of PCI territory */
>
>
> We are traversing up PCI hierarchy here - pci_dn->parent, how is this
> out of PCI territory? Or I understand "out of" incorrectly?
>
>
> > +             parent = pdn_to_eeh_dev(pdn);
> > +             if (!parent)
> > +                     return NULL;

If there's no eeh dev then the node we're looking at is a PHB rather
than an actual PCI device so it stops looking. I think. The comment
was copied over from the existing code and I haven't spent a whole lot
of time parsing its meaning.



> > @@ -301,6 +343,8 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
> >       if (ret) {
> >               eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret);
> >       } else {
> > +             struct eeh_pe *parent;
> > +
> >               /* Retrieve PE address */
> >               edev->pe_config_addr = pseries_eeh_get_pe_addr(pdn);
> >               pe.addr = edev->pe_config_addr;
> > @@ -313,16 +357,23 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
> >               if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
> >                       enable = 1;
> >
> > -             if (enable) {
> > +             /* This device doesn't support EEH, but it may have an
> > +              * EEH parent, in which case we mark it as supported.
> > +              */
> > +             parent = pseries_eeh_pe_get_parent(edev);
> > +             if (parent && !enable)
> > +                     edev->pe_config_addr = parent->addr;
>
>
> What if pseries_eeh_pe_get_parent() returned NULL - we won't write
> edev->pe_config_addr so it remains 0 which is fine just by accident? :)

edev->pe_config_addr is set above when we call
pseries_eeh_get_pe_addr(). The check there is mainly to cover the case
where pseries_eeh_get_pe_addr() fails because the device is on a
subordinate bus rather than the root bus of the PE. PAPR says the
get-pe-addr-info RTAS call can fail in that situation and that you're
supposed to traverse up the DT to find the pe_config_addr, which is
what pe_get_parent() does. Yeah it's confusing, but that's what it
does today too.

> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> > +
> > +             if (enable || parent) {
> >                       eeh_add_flag(EEH_ENABLED);
> > -                     eeh_pe_tree_insert(edev);
> > +                     eeh_pe_tree_insert(edev, parent);

> >               } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) &&
> >                          (pdn_to_eeh_dev(pdn->parent))->pe) {
> >                       /* This device doesn't support EEH, but it may have an
> >                        * EEH parent, in which case we mark it as supported.
> >                        */
> >                       edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr;
> > -                     eeh_pe_tree_insert(edev);
> > +                     eeh_pe_tree_insert(edev, parent);

I think I was supposed to delete this hunk and then forgot to since it
handles the same case mentioned above.

> >               }
> >               eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n",
> >                            (enable ? "enabled" : "unsupported"), ret);
> >
>
> --
> Alexey

^ permalink raw reply

* Re: [PATCH v3 4/6] powerpc/pseries/iommu: Remove default DMA window before creating DDW
From: Leonardo Bras @ 2020-07-14  2:40 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cc15a81d-04d9-3ee4-4fdb-093618f6e635@ozlabs.ru>

Thank you for this feedback Alexey!

On Mon, 2020-07-13 at 17:33 +1000, Alexey Kardashevskiy wrote:
> [...]
> > -	int len, ret;
> > +	int len, ret, reset_win_ext;
> 
> Make it "reset_token".

Oh, it's not a token here, it just checks if the reset_win extension
exists. The token would be returned in *value, but since we did not
need it here, it's not copied.

> > [...]
> > -out_failed:
> > +out_restore_defwin:
> > +	if (default_win && reset_win_ext == 0)
> 
> reset_win_ext potentially may be uninitialized here. Yeah I know it is
> tied to default_win but still.

I can't see it being used uninitialized here, as you said it's tied to
default_win. 
Could you please tell me how it can be used uninitialized here, or what
is bad about doing it this way?

> After looking at this function for a few minutes, it could use some
> refactoring (way too many gotos)  such as:

Yes, I agree.

> 1. move (query.page_size & xx) checks before "if
> (query.windows_available == 0)"

Moving 'page_size selection' above 'checking windows available' will
need us to duplicate the 'page_size selection' after the new query,
inside the if.
I mean, as query will be done again, it will need to get the (new) page
size.

> 2. move "win64 = kzalloc(sizeof(struct property), GFP_KERNEL)" before
> "if (query.windows_available == 0)"

> 3. call "reset_dma_window(dev, pdn)" inside the "if
> (query.windows_available == 0)" branch.

> Then you can drop all "goto out_restore_defwin" and move default_win and
> reset_win_ext inside "if (query.windows_available == 0)".

I did all changes suggested locally and did some analysis in the
result:

I did not see a way to put default_win and reset_win_ext inside
"if (query.windows_available == 0)", because we still need a way to
know whether the default window was removed and, if so, to restore it in
case anything fails later on (like creating the node property).

But from that analysis I noticed it's possible to remove all the new
"goto out_restore_defwin" statements if we set default_win = NULL when
ddw_read_ext() fails.

So testing only default_win should always be enough to say if the
default window was deleted, and reset_win_ext could be moved inside "if
(query.windows_available == 0)".
Also, it would avoid reset_win_ext being 'used uninitialized' and
"out_restore_defwin:" would not be needed.

Against the current patch, we would have something like this:

#####

 static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
-       int len, ret, reset_win_ext;
+       int len, ret;
        struct ddw_query_response query;
        struct ddw_create_response create;
        int page_shift;
@@ -1173,25 +1173,28 @@ static u64 enable_ddw(struct pci_dev *dev,
struct device_node *pdn)
         * for extensions presence.
         */
        if (query.windows_available == 0) {
+               int reset_win_ext;
                default_win = of_find_property(pdn, "ibm,dma-window",
NULL);
                if (!default_win)
                        goto out_failed;
 
                reset_win_ext = ddw_read_ext(pdn,
DDW_EXT_RESET_DMA_WIN, NULL);
-               if (reset_win_ext)
+               if (reset_win_ext){
+                       default_win = NULL;
                        goto out_failed;
+               }
 
                remove_dma_window(pdn, ddw_avail, default_win);
 
                /* Query again, to check if the window is available */
                ret = query_ddw(dev, ddw_avail, &query, pdn);
                if (ret != 0)
-                       goto out_restore_defwin;
+                       goto out_failed;
 
                if (query.windows_available == 0) {
                        /* no windows are available for this device. */
                        dev_dbg(&dev->dev, "no free dynamic windows");
-                       goto out_restore_defwin;
+                       goto out_failed;
                }
        }
        if (query.page_size & 4) {
@@ -1203,7 +1206,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct
device_node *pdn)
        } else {
                dev_dbg(&dev->dev, "no supported direct page size in
mask %x",
                          query.page_size);
-               goto out_restore_defwin;
+               goto out_failed;
        }
        /* verify the window * number of ptes will map the partition */
        /* check largest block * page size > max memory hotplug addr */
@@ -1212,14 +1215,14 @@ static u64 enable_ddw(struct pci_dev *dev,
struct device_node *pdn)
                dev_dbg(&dev->dev, "can't map partition max 0x%llx with
%llu "
                          "%llu-sized pages\n",
max_addr,  query.largest_available_block,
                          1ULL << page_shift);
-               goto out_restore_defwin;
+               goto out_failed;
        }
        len = order_base_2(max_addr);
        win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
        if (!win64) {
                dev_info(&dev->dev,
                        "couldn't allocate property for 64bit dma
window\n");
-               goto out_restore_defwin;
+               goto out_failed;
        }
        win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
        win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
@@ -1282,11 +1285,10 @@ static u64 enable_ddw(struct pci_dev *dev,
struct device_node *pdn)
        kfree(win64->value);
        kfree(win64);
 
-out_restore_defwin:
-       if (default_win && reset_win_ext == 0)
+out_failed:
+       if (default_win)
                reset_dma_window(dev, pdn);
 
-out_failed:
        fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
        if (!fpdn)
                goto out_unlock;

#####

What do you think?



> The rest of the series is good as it is,

Thank you :)

>  however it may conflict with
> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20200713062348.100552-1-aik@ozlabs.ru/
> and the patchset it is made on top of -
> https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=188385 .

<From the message of the first link>
> (do not rush, let me finish reviewing this first) 

Ok, I have no problem rebasing on top of those patchsets, but what
would you suggest to be done?

Would it be ok doing a big multi-author patchset, so we guarantee it
being applied in the correct order?

(You probably want me to rebase my patchset on top of Hellwig + yours,
right?) 


> thanks,
Thank you!




^ permalink raw reply

* Re: [PATCH 14/14] powerpc/eeh: Move PE tree setup into the platform
From: Alexey Kardashevskiy @ 2020-07-14  1:50 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev; +Cc: mahesh
In-Reply-To: <20200706013619.459420-15-oohall@gmail.com>



On 06/07/2020 11:36, Oliver O'Halloran wrote:
> The EEH core has a concept of a "PE tree" to support PowerNV. The PE tree
> follows the PCI bus structures because a reset asserted on an upstream
> bridge will be propagated to the downstream bridges. On pseries there's a
> 1-1 correspondence between what the guest sees as a PHB and a PE so the
> "tree" is really just a single node.
> 
> Currently the EEH core is responsible for setting up this PE tree which it
> does by traversing the pci_dn tree. The structure of the pci_dn tree
> matches the bus tree on PowerNV which leads to the PE tree being "correct",
> but this setup method doesn't make a whole lot of sense and it's actively
> confusing for the pseries case where it doesn't really do anything.
> 
> We want to remove the dependence on pci_dn anyway so this patch moves
> choosing where to insert a new PE into the platform code rather than
> being part of the generic EEH code. For PowerNV this simplifies the
> tree building logic and removes the use of pci_dn. For pseries we
> keep the existing logic. I'm not really convinced it does anything
> due to the 1-1 PE-to-PHB correspondence so every device under that
> PHB should be in the same PE, but I'd rather not remove it entirely
> until we've had a chance to look at it more deeply.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
>  arch/powerpc/include/asm/eeh.h               |  2 +-
>  arch/powerpc/kernel/eeh_pe.c                 | 70 ++++++--------------
>  arch/powerpc/platforms/powernv/eeh-powernv.c | 27 +++++++-
>  arch/powerpc/platforms/pseries/eeh_pseries.c | 59 +++++++++++++++--
>  4 files changed, 101 insertions(+), 57 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index 8d34e5b790c2..1cab629dbc74 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -283,7 +283,7 @@ struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
>  struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root);
>  struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
>  			  int pe_no, int config_addr);
> -int eeh_pe_tree_insert(struct eeh_dev *edev);
> +int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent);
>  int eeh_pe_tree_remove(struct eeh_dev *edev);
>  void eeh_pe_update_time_stamp(struct eeh_pe *pe);
>  void *eeh_pe_traverse(struct eeh_pe *root,
> diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
> index 898205829a8f..ea2f8b362d18 100644
> --- a/arch/powerpc/kernel/eeh_pe.c
> +++ b/arch/powerpc/kernel/eeh_pe.c
> @@ -318,53 +318,20 @@ struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
>  	return pe;
>  }
>  
> -/**
> - * eeh_pe_get_parent - Retrieve the parent PE
> - * @edev: EEH device
> - *
> - * The whole PEs existing in the system are organized as hierarchy
> - * tree. The function is used to retrieve the parent PE according
> - * to the parent EEH device.
> - */
> -static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
> -{
> -	struct eeh_dev *parent;
> -	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
> -
> -	/*
> -	 * It might have the case for the indirect parent
> -	 * EEH device already having associated PE, but
> -	 * the direct parent EEH device doesn't have yet.
> -	 */
> -	if (edev->physfn)
> -		pdn = pci_get_pdn(edev->physfn);
> -	else
> -		pdn = pdn ? pdn->parent : NULL;
> -	while (pdn) {
> -		/* We're poking out of PCI territory */
> -		parent = pdn_to_eeh_dev(pdn);
> -		if (!parent)
> -			return NULL;
> -
> -		if (parent->pe)
> -			return parent->pe;
> -
> -		pdn = pdn->parent;
> -	}
> -
> -	return NULL;
> -}
> -
>  /**
>   * eeh_pe_tree_insert - Add EEH device to parent PE
>   * @edev: EEH device
> + * @new_pe_parent: PE to create additional PEs under
>   *
> - * Add EEH device to the parent PE. If the parent PE already
> - * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
> - * we have to create new PE to hold the EEH device and the new
> - * PE will be linked to its parent PE as well.
> + * Add EEH device to the PE in edev->pe_config_addr. If a PE already
> + * exists with that address then @edev is added to that PE. Otherwise
> + * a new PE is created and inserted into the PE tree as a child of
> + * @new_pe_parent.
> + *
> + * If @new_pe_parent is NULL then the new PE will be inserted under
> + * directly under the the PHB.
>   */
> -int eeh_pe_tree_insert(struct eeh_dev *edev)
> +int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent)
>  {
>  	struct pci_controller *hose = edev->controller;
>  	struct eeh_pe *pe, *parent;


We can ditch this "parent" in that single place now and use "pe"
instead. And name the new parameter simply "parent". Dunno if it
improves things though.



> @@ -399,7 +366,7 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
>  			}
>  
>  			eeh_edev_dbg(edev,
> -				     "Added to device PE (parent: PE#%x)\n",
> +				     "Added to existing PE (parent: PE#%x)\n",
>  				     pe->parent->addr);
>  		} else {
>  			/* Mark the PE as type of PCI bus */
> @@ -431,10 +398,9 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
>  	 * to PHB directly. Otherwise, we have to associate the
>  	 * PE with its parent.
>  	 */
> -	parent = eeh_pe_get_parent(edev);
> -	if (!parent) {
> -		parent = eeh_phb_pe_get(hose);
> -		if (!parent) {
> +	if (!new_pe_parent) {
> +		new_pe_parent = eeh_phb_pe_get(hose);
> +		if (!new_pe_parent) {



afaict only pseries can realistically pass new_pe_parent==NULL so this
chunk could go to pseries_eeh_pe_get_parent.


>  			pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
>  				__func__, hose->global_number);
>  			edev->pe = NULL;
> @@ -442,17 +408,19 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
>  			return -EEXIST;
>  		}
>  	}
> -	pe->parent = parent;
> +
> +	/* link new PE into the tree */
> +	pe->parent = new_pe_parent;
> +	list_add_tail(&pe->child, &new_pe_parent->child_list);
>  
>  	/*
>  	 * Put the newly created PE into the child list and
>  	 * link the EEH device accordingly.
>  	 */
> -	list_add_tail(&pe->child, &parent->child_list);
>  	list_add_tail(&edev->entry, &pe->edevs);
>  	edev->pe = pe;
> -	eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n",
> -		     pe->parent->addr);
> +	eeh_edev_dbg(edev, "Added to new (parent: PE#%x)\n",
> +		     new_pe_parent->addr);
>  
>  	return 0;
>  }
> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
> index 8c9fca773692..9af8c3b98853 100644
> --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
> +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
> @@ -338,6 +338,28 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap)
>  	return 0;
>  }
>  
> +static struct eeh_pe *pnv_eeh_get_upstream_pe(struct pci_dev *pdev)
> +{
> +	struct pci_controller *hose = pdev->bus->sysdata;
> +	struct pnv_phb *phb = hose->private_data;
> +	struct pci_dev *parent = pdev->bus->self;
> +
> +#ifdef CONFIG_PCI_IOV
> +	/* for VFs we use the PF's PE as the upstream PE */
> +	if (pdev->is_virtfn)
> +		parent = pdev->physfn;
> +#endif
> +
> +	/* otherwise use the PE of our parent bridge */
> +	if (parent) {
> +		struct pnv_ioda_pe *ioda_pe = pnv_ioda_get_pe(parent);
> +
> +		return eeh_pe_get(phb->hose, ioda_pe->pe_number, 0);
> +	}
> +
> +	return NULL;
> +}
> +
>  /**
>   * pnv_eeh_probe - Do probe on PCI device
>   * @pdev: pci_dev to probe
> @@ -350,6 +372,7 @@ static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev)
>  	struct pci_controller *hose = pdn->phb;
>  	struct pnv_phb *phb = hose->private_data;
>  	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
> +	struct eeh_pe *upstream_pe;
>  	uint32_t pcie_flags;
>  	int ret;
>  	int config_addr = (pdn->busno << 8) | (pdn->devfn);
> @@ -398,8 +421,10 @@ static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev)
>  
>  	edev->pe_config_addr = phb->ioda.pe_rmap[config_addr];
>  
> +	upstream_pe = pnv_eeh_get_upstream_pe(pdev);
> +
>  	/* Create PE */
> -	ret = eeh_pe_tree_insert(edev);
> +	ret = eeh_pe_tree_insert(edev, upstream_pe);
>  	if (ret) {
>  		eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret);
>  		return NULL;
> diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
> index 72556f4436a8..98f9a1b269be 100644
> --- a/arch/powerpc/platforms/pseries/eeh_pseries.c
> +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
> @@ -68,11 +68,16 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
>  	pseries_eeh_init_edev(pdn);
>  #ifdef CONFIG_PCI_IOV
>  	if (pdev->is_virtfn) {
> +		/*
> +		 * FIXME: This really should be handled by choosing the right
> +		 *        parent PE in in pseries_eeh_init_edev().
> +		 */
> +		struct eeh_pe *physfn_pe = pci_dev_to_eeh_dev(pdev->physfn)->pe;
>  		struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
>  
>  		edev->pe_config_addr =  (pdn->busno << 16) | (pdn->devfn << 8);
>  		eeh_pe_tree_remove(edev); /* Remove as it is adding to bus pe */
> -		eeh_pe_tree_insert(edev);   /* Add as VF PE type */
> +		eeh_pe_tree_insert(edev, physfn_pe);   /* Add as VF PE type */
>  	}
>  #endif
>  	eeh_probe_device(pdev);
> @@ -218,6 +223,43 @@ static int pseries_eeh_find_ecap(struct pci_dn *pdn, int cap)
>  	return 0;
>  }
>  
> +/**
> + * pseries_eeh_pe_get_parent - Retrieve the parent PE
> + * @edev: EEH device
> + *
> + * The whole PEs existing in the system are organized as hierarchy
> + * tree. The function is used to retrieve the parent PE according
> + * to the parent EEH device.
> + */
> +static struct eeh_pe *pseries_eeh_pe_get_parent(struct eeh_dev *edev)
> +{
> +	struct eeh_dev *parent;
> +	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
> +
> +	/*
> +	 * It might have the case for the indirect parent
> +	 * EEH device already having associated PE, but
> +	 * the direct parent EEH device doesn't have yet.
> +	 */
> +	if (edev->physfn)
> +		pdn = pci_get_pdn(edev->physfn);
> +	else
> +		pdn = pdn ? pdn->parent : NULL;
> +	while (pdn) {
> +		/* We're poking out of PCI territory */


We are traversing up the PCI hierarchy here - pci_dn->parent, how is this
out of PCI territory? Or am I understanding "out of" incorrectly?


> +		parent = pdn_to_eeh_dev(pdn);
> +		if (!parent)
> +			return NULL;
> +
> +		if (parent->pe)
> +			return parent->pe;
> +
> +		pdn = pdn->parent;
> +	}
> +
> +	return NULL;
> +}
> +
>  /**
>   * pseries_eeh_init_edev - initialise the eeh_dev and eeh_pe for a pci_dn
>   *
> @@ -301,6 +343,8 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
>  	if (ret) {
>  		eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret);
>  	} else {
> +		struct eeh_pe *parent;
> +
>  		/* Retrieve PE address */
>  		edev->pe_config_addr = pseries_eeh_get_pe_addr(pdn);
>  		pe.addr = edev->pe_config_addr;
> @@ -313,16 +357,23 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
>  		if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
>  			enable = 1;
>  
> -		if (enable) {
> +		/* This device doesn't support EEH, but it may have an
> +		 * EEH parent, in which case we mark it as supported.
> +		 */
> +		parent = pseries_eeh_pe_get_parent(edev);
> +		if (parent && !enable)
> +			edev->pe_config_addr = parent->addr;


What if pseries_eeh_pe_get_parent() returned NULL - we won't write
edev->pe_config_addr so it remains 0 which is fine just by accident? :)

I'd make pseries_eeh_pe_get_parent() always return not NULL (a parent or
a hose) and simplify the block below.


I mean the change is alright but part of the exercise is making the
code more readable, and that can also go on forever :) so

Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>



> +
> +		if (enable || parent) {
>  			eeh_add_flag(EEH_ENABLED);
> -			eeh_pe_tree_insert(edev);
> +			eeh_pe_tree_insert(edev, parent);
>  		} else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) &&
>  			   (pdn_to_eeh_dev(pdn->parent))->pe) {
>  			/* This device doesn't support EEH, but it may have an
>  			 * EEH parent, in which case we mark it as supported.
>  			 */
>  			edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr;
> -			eeh_pe_tree_insert(edev);
> +			eeh_pe_tree_insert(edev, parent);
>  		}
>  		eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n",
>  			     (enable ? "enabled" : "unsupported"), ret);
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH] ocxl: Replace HTTP links with HTTPS ones
From: Andrew Donnellan @ 2020-07-13 22:55 UTC (permalink / raw)
  To: Alexander A. Klimov, fbarrat, arnd, gregkh, linuxppc-dev,
	linux-kernel
In-Reply-To: <20200713175506.36676-1-grandmaster@al2klimov.de>

On 14/7/20 3:55 am, Alexander A. Klimov wrote:
> Rationale:
> Reduces attack surface on kernel devs opening the links for MITM
> as HTTPS traffic is much harder to manipulate.
> 
> Deterministic algorithm:
> For each file:
>    If not .svg:
>      For each line:
>        If doesn't contain `\bxmlns\b`:
>          For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`:
> 	  If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`:
>              If both the HTTP and HTTPS versions
>              return 200 OK and serve the same content:
>                Replace HTTP with HTTPS.
> 
> Signed-off-by: Alexander A. Klimov <grandmaster@al2klimov.de>

Thanks.

Acked-by: Andrew Donnellan <ajd@linux.ibm.com>

-- 
Andrew Donnellan              OzLabs, ADL Canberra
ajd@linux.ibm.com             IBM Australia Limited

^ permalink raw reply

* Re: [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86
From: Bjorn Helgaas @ 2020-07-13 22:01 UTC (permalink / raw)
  To: Saheed O. Bolarinwa
  Cc: Rich Felker, Martin K. Petersen, linux-sh, linux-pci, linux-nvme,
	Yicong Yang, Keith Busch, Realtek linux nic maintainers,
	Paul Mackerras, linux-i2c, bcm-kernel-feedback-list, sparclinux,
	rfi, Toan Le, Greg Ungerer, Marek Vasut, Rob Herring,
	Stefano Stabellini, Sagi Grimberg, Yoshinori Sato, linux-scsi,
	Greg Kroah-Hartman, linux-atm-general, Russell King, Ley Foon Tan,
	Christoph Hellwig, Geert Uytterhoeven, Rafał Miłecki,
	Chas Williams, xen-devel, Matt Turner, linux-mips,
	linux-kernel-mentees, Kevin Hilman, Guenter Roeck, linux-hwmon,
	Jean Delvare, Andrew Donnellan, Arnd Bergmann, Ray Jui,
	James E.J. Bottomley, Yue Wang, Jens Axboe, Jakub Kicinski,
	linux-m68k, Lorenzo Pieralisi, Ivan Kokshaysky, Michael Buesch,
	skhan, bjorn, linux-amlogic, Boris Ostrovsky, Guan Xuetao,
	linux-arm-kernel, Richard Henderson, Juergen Gross, Michal Simek,
	Thomas Bogendoerfer, Scott Branden, Bjorn Helgaas, Jingoo Han,
	netdev, Yoshihiro Shimoda, linux-wireless, linux-kernel,
	linux-renesas-soc, Brian King, Philipp Zabel, linux-alpha,
	Frederic Barrat, Gustavo Pimentel, linuxppc-dev, David S. Miller,
	Heiner Kallweit
In-Reply-To: <20200713122247.10985-1-refactormyself@gmail.com>

On Mon, Jul 13, 2020 at 02:22:12PM +0200, Saheed O. Bolarinwa wrote:
> This goal of these series is to move the definition of *all* PCIBIOS* from
> include/linux/pci.h to arch/x86 and limit their use within there.
> All other tree specific definition will be left for intact. Maybe they can
> be renamed.

More comments later, but a few trivial whitespace issues you can clean
up in the meantime.  Don't repost for at least a few days to avoid
spamming everybody.  I found these with:

  $ b4 am -om/ 20200713122247.10985-1-refactormyself@gmail.com
  $ git am m/20200713_refactormyself_move_all_pcibios_definitions_into_arch_x86.mbx

  Applying: atm: Change PCIBIOS_SUCCESSFUL to 0
  .git/rebase-apply/patch:11: trailing whitespace.
	  iadev = INPH_IA_DEV(dev);
  .git/rebase-apply/patch:12: trailing whitespace.
	  for(i=0; i<64; i++)
  .git/rebase-apply/patch:13: trailing whitespace.
	    if ((error = pci_read_config_dword(iadev->pci,
  .git/rebase-apply/patch:16: trailing whitespace, space before tab in indent.
		return error;
  .git/rebase-apply/patch:17: trailing whitespace.
	  writel(0, iadev->reg+IPHASE5575_EXT_RESET);
  warning: squelched 5 whitespace errors
  warning: 10 lines add whitespace errors.
  Applying: atm: Tidy Success/Failure checks
  .git/rebase-apply/patch:13: trailing whitespace.

  .git/rebase-apply/patch:14: trailing whitespace.
	  iadev = INPH_IA_DEV(dev);
  .git/rebase-apply/patch:15: trailing whitespace.
	  for(i=0; i<64; i++)
  .git/rebase-apply/patch:21: trailing whitespace.
	  writel(0, iadev->reg+IPHASE5575_EXT_RESET);
  .git/rebase-apply/patch:22: trailing whitespace.
	  for(i=0; i<64; i++)
  warning: squelched 3 whitespace errors
  warning: 8 lines add whitespace errors.
  Applying: atm: Fix Style ERROR- assignment in if condition
  .git/rebase-apply/patch:12: trailing whitespace.
	  unsigned int pci[64];
  .git/rebase-apply/patch:13: trailing whitespace.

  .git/rebase-apply/patch:14: trailing whitespace.
	  iadev = INPH_IA_DEV(dev);
  .git/rebase-apply/patch:23: trailing whitespace.
	  writel(0, iadev->reg+IPHASE5575_EXT_RESET);
  .git/rebase-apply/patch:32: trailing whitespace.
	  udelay(5);
  warning: squelched 2 whitespace errors
  warning: 7 lines add whitespace errors.
  Applying: PCI: Change PCIBIOS_SUCCESSFUL to 0
  .git/rebase-apply/patch:37: trailing whitespace.
  struct pci_ops apecs_pci_ops =
  .git/rebase-apply/patch:50: trailing whitespace.
  static int
  .git/rebase-apply/patch:59: trailing whitespace.
  struct pci_ops cia_pci_ops =
  .git/rebase-apply/patch:94: trailing whitespace.
  static int
  .git/rebase-apply/patch:103: trailing whitespace.
  struct pci_ops lca_pci_ops =
  warning: squelched 10 whitespace errors
  warning: 15 lines add whitespace errors.

^ permalink raw reply

* [PATCH] selftests: Replace HTTP links with HTTPS ones
From: Alexander A. Klimov @ 2020-07-13 19:57 UTC (permalink / raw)
  To: shuah, mpe, benh, paulus, ravi.bangoria, broonie, linux-kselftest,
	linux-kernel, linuxppc-dev
  Cc: Alexander A. Klimov

Rationale:
Reduces attack surface on kernel devs opening the links for MITM
as HTTPS traffic is much harder to manipulate.

Deterministic algorithm:
For each file:
  If not .svg:
    For each line:
      If doesn't contain `\bxmlns\b`:
        For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`:
	  If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`:
            If both the HTTP and HTTPS versions
            return 200 OK and serve the same content:
              Replace HTTP with HTTPS.

Signed-off-by: Alexander A. Klimov <grandmaster@al2klimov.de>
---
 Continuing my work started at 93431e0607e5.
 See also: git log --oneline '--author=Alexander A. Klimov <grandmaster@al2klimov.de>' v5.7..master
 (Actually letting a shell for loop submit all this stuff for me.)

 If there are any URLs to be removed completely or at least not just HTTPSified:
 Just clearly say so and I'll *undo my change*.
 See also: https://lkml.org/lkml/2020/6/27/64

 If there are any valid, but yet not changed URLs:
 See: https://lkml.org/lkml/2020/6/26/837

 If you apply the patch, please let me know.

 Sorry again to all maintainers who complained about subject lines.
 Now I realize that you want actually perfect prefixes,
 not just subsystem ones.
 I tried my best...
 And yes, *I could* (at least half-)automate it.
 Impossible is nothing! :)


 tools/testing/selftests/futex/include/atomic.h        | 2 +-
 tools/testing/selftests/futex/include/futextest.h     | 2 +-
 tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c | 2 +-
 tools/testing/selftests/vDSO/parse_vdso.c             | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/futex/include/atomic.h b/tools/testing/selftests/futex/include/atomic.h
index 428bcd921bb5..23703ecfcd68 100644
--- a/tools/testing/selftests/futex/include/atomic.h
+++ b/tools/testing/selftests/futex/include/atomic.h
@@ -5,7 +5,7 @@
  *
  * DESCRIPTION
  *      GCC atomic builtin wrappers
- *      http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
+ *      https://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
  *
  * AUTHOR
  *      Darren Hart <dvhart@linux.intel.com>
diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h
index ddbcfc9b7bac..2a210c482f7b 100644
--- a/tools/testing/selftests/futex/include/futextest.h
+++ b/tools/testing/selftests/futex/include/futextest.h
@@ -211,7 +211,7 @@ futex_cmp_requeue_pi(futex_t *uaddr, futex_t val, futex_t *uaddr2, int nr_wake,
  * @newval:	The new value to try and assign the futex
  *
  * Implement cmpxchg using gcc atomic builtins.
- * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
+ * https://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
  *
  * Return the old futex value.
  */
diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
index c1f324afdbf3..946c52e1f327 100644
--- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
@@ -12,7 +12,7 @@
  * times. Then check the output count from perf is as expected.
  *
  * Based on:
- *   http://ozlabs.org/~anton/junkcode/perf_events_example1.c
+ *   https://ozlabs.org/~anton/junkcode/perf_events_example1.c
  *
  * Copyright (C) 2018 Michael Neuling, IBM Corporation.
  */
diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c
index 413f75620a35..3413fc00c835 100644
--- a/tools/testing/selftests/vDSO/parse_vdso.c
+++ b/tools/testing/selftests/vDSO/parse_vdso.c
@@ -5,7 +5,7 @@
  * This code is meant to be linked in to various programs that run on Linux.
  * As such, it is available with as few restrictions as possible.  This file
  * is licensed under the Creative Commons Zero License, version 1.0,
- * available at http://creativecommons.org/publicdomain/zero/1.0/legalcode
+ * available at https://creativecommons.org/publicdomain/zero/1.0/legalcode
  *
  * The vDSO is a regular ELF DSO that the kernel maps into user space when
  * it starts a program.  It works equally well in statically and dynamically
-- 
2.27.0


^ permalink raw reply related

* Re: [PATCH 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper
From: Ard Biesheuvel @ 2020-07-13 19:49 UTC (permalink / raw)
  To: Jarkko Sakkinen
  Cc: Catalin Marinas, Kefeng Wang, Paul Mackerras, Zong Li, Andi Kleen,
	Paul Burton, Vincent Whitchurch, Petr Mladek, Brian Gerst,
	Andy Lutomirski, Yonghong Song, Thomas Gleixner, Jiri Kosina,
	Anup Patel, Linux Kernel Mailing List, Philipp Rudo, Torsten Duwe,
	Masami Hiramatsu, Andrew Morton, Mark Rutland,
	James E.J. Bottomley, Vincent Chen, open list:S390, Joe Lawrence,
	Helge Deller, John Fastabend, Anil S Keshavamurthy,
	Andrey Ryabinin, Iurii Zaikin, Andrii Nakryiko, Vasily Gorbik,
	moderated list:ARM PORT, Daniel Axtens, Damien Le Moal,
	Peter Oberparleiter, Sean Christopherson, Martin KaFai Lau,
	Song Liu, Paul Walmsley, Heiko Carstens, Alexei Starovoitov,
	Atish Patra, Will Deacon, Daniel Borkmann, Masahiro Yamada,
	Nayna Jain, Ley Foon Tan, Christian Borntraeger, Dmitry Vyukov,
	Sami Tolvanen, Naveen N. Rao, Mao Han, Marco Elver,
	Steven Rostedt, Babu Moger, Borislav Petkov, Greentime Hu,
	Ben Dooks, Guan Xuetao, Thomas Bogendoerfer,
	open list:PARISC ARCHITECTURE, Jessica Yu,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT), David S. Miller,
	Thiago Jung Bauermann, Peter Zijlstra,
	open list:SPARC + UltraSPARC (sparc/sparc64), Sandipan Das,
	H. Peter Anvin, Amit Daniel Kachhap, Tiezhu Yang, Miroslav Benes,
	Jiri Olsa, open list:RISC-V ARCHITECTURE, Vincenzo Frascino,
	Anders Roxell, Sven Schnelle,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT), Russell King,
	Mike Rapoport, Ingo Molnar, Albert Ou, Paul E. McKenney,
	Josh Poimboeuf, KP Singh, Gerald Schaefer, Nick Hu,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT), open list:MIPS,
	Sergey Senozhatsky, Palmer Dabbelt,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT)
In-Reply-To: <20200713182030.1418041-1-jarkko.sakkinen@linux.intel.com>

On Mon, 13 Jul 2020 at 21:21, Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> Rename module_alloc() to text_alloc() and module_memfree() to
> text_memfree(), and move them to kernel/text.c, which is unconditionally
> compiled to the kernel proper. This allows kprobes, ftrace and bpf to
> allocate space for executable code without requiring to compile the modules
> support (CONFIG_MODULES=y) in.
>

On arm64, we no longer use module_alloc for bpf or kprobes, to avoid
wasting va space on code that does not need to be loaded close to the
kernel. Also, module_alloc() allocates kasan shadow, which is
unnecessary for kprobes or bpf programs, which don't have kasan
instrumentation.

This patch suggests that there are other reasons why conflating
allocation of module space and allocating text pages for other uses
is a bad idea, but switching all users to text_alloc() is a step in
the wrong direction. It would be better to stop using module_alloc()
in core code except in the module loader, and have a generic
text_alloc() that can be overridden by the arch if necessary. Note
that x86 and s390 are the only architectures that use module_alloc()
in ftrace code.

Please have a look at alloc_insn_page() or bpf_jit_alloc_exec() in the
arm64 tree to see what I mean.



> Cc: Andi Kleen <ak@linux.intel.com>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> ---
>  arch/arm/kernel/Makefile         |  3 +-
>  arch/arm/kernel/module.c         | 21 -----------
>  arch/arm/kernel/text.c           | 33 ++++++++++++++++++
>  arch/arm64/kernel/Makefile       |  2 +-
>  arch/arm64/kernel/module.c       | 42 ----------------------
>  arch/arm64/kernel/text.c         | 54 ++++++++++++++++++++++++++++
>  arch/mips/kernel/Makefile        |  2 +-
>  arch/mips/kernel/module.c        |  9 -----
>  arch/mips/kernel/text.c          | 19 ++++++++++
>  arch/mips/net/bpf_jit.c          |  4 +--
>  arch/nds32/kernel/Makefile       |  2 +-
>  arch/nds32/kernel/module.c       |  7 ----
>  arch/nds32/kernel/text.c         | 12 +++++++
>  arch/nios2/kernel/Makefile       |  1 +
>  arch/nios2/kernel/module.c       | 19 ----------
>  arch/nios2/kernel/text.c         | 34 ++++++++++++++++++
>  arch/parisc/kernel/Makefile      |  2 +-
>  arch/parisc/kernel/module.c      | 11 ------
>  arch/parisc/kernel/text.c        | 22 ++++++++++++
>  arch/powerpc/net/bpf_jit_comp.c  |  4 +--
>  arch/riscv/kernel/Makefile       |  1 +
>  arch/riscv/kernel/module.c       | 12 -------
>  arch/riscv/kernel/text.c         | 20 +++++++++++
>  arch/s390/kernel/Makefile        |  2 +-
>  arch/s390/kernel/ftrace.c        |  2 +-
>  arch/s390/kernel/module.c        | 16 ---------
>  arch/s390/kernel/text.c          | 23 ++++++++++++
>  arch/sparc/kernel/Makefile       |  1 +
>  arch/sparc/kernel/module.c       | 30 ----------------
>  arch/sparc/kernel/text.c         | 39 +++++++++++++++++++++
>  arch/sparc/net/bpf_jit_comp_32.c |  6 ++--
>  arch/unicore32/kernel/Makefile   |  1 +
>  arch/unicore32/kernel/module.c   |  7 ----
>  arch/unicore32/kernel/text.c     | 18 ++++++++++
>  arch/x86/kernel/Makefile         |  1 +
>  arch/x86/kernel/ftrace.c         |  4 +--
>  arch/x86/kernel/kprobes/core.c   |  4 +--
>  arch/x86/kernel/module.c         | 49 --------------------------
>  arch/x86/kernel/text.c           | 60 ++++++++++++++++++++++++++++++++
>  include/linux/moduleloader.h     |  4 +--
>  kernel/Makefile                  |  2 +-
>  kernel/bpf/core.c                |  4 +--
>  kernel/kprobes.c                 |  4 +--
>  kernel/module.c                  | 37 ++++++--------------
>  kernel/text.c                    | 25 +++++++++++++
>  45 files changed, 400 insertions(+), 275 deletions(-)
>  create mode 100644 arch/arm/kernel/text.c
>  create mode 100644 arch/arm64/kernel/text.c
>  create mode 100644 arch/mips/kernel/text.c
>  create mode 100644 arch/nds32/kernel/text.c
>  create mode 100644 arch/nios2/kernel/text.c
>  create mode 100644 arch/parisc/kernel/text.c
>  create mode 100644 arch/riscv/kernel/text.c
>  create mode 100644 arch/s390/kernel/text.c
>  create mode 100644 arch/sparc/kernel/text.c
>  create mode 100644 arch/unicore32/kernel/text.c
>  create mode 100644 arch/x86/kernel/text.c
>  create mode 100644 kernel/text.c
>
> diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
> index 89e5d864e923..69bfacfd60ef 100644
> --- a/arch/arm/kernel/Makefile
> +++ b/arch/arm/kernel/Makefile
> @@ -19,7 +19,8 @@ CFLAGS_REMOVE_return_address.o = -pg
>  obj-y          := elf.o entry-common.o irq.o opcodes.o \
>                    process.o ptrace.o reboot.o \
>                    setup.o signal.o sigreturn_codes.o \
> -                  stacktrace.o sys_arm.o time.o traps.o
> +                  stacktrace.o sys_arm.o time.o traps.o \
> +                  text.o
>
>  ifneq ($(CONFIG_ARM_UNWIND),y)
>  obj-$(CONFIG_FRAME_POINTER)    += return_address.o
> diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
> index e15444b25ca0..13e3442a6b9f 100644
> --- a/arch/arm/kernel/module.c
> +++ b/arch/arm/kernel/module.c
> @@ -33,27 +33,6 @@
>  #define MODULES_VADDR  (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK)
>  #endif
>
> -#ifdef CONFIG_MMU
> -void *module_alloc(unsigned long size)
> -{
> -       gfp_t gfp_mask = GFP_KERNEL;
> -       void *p;
> -
> -       /* Silence the initial allocation */
> -       if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
> -               gfp_mask |= __GFP_NOWARN;
> -
> -       p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> -                               gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> -                               __builtin_return_address(0));
> -       if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
> -               return p;
> -       return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
> -                               GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> -                               __builtin_return_address(0));
> -}
> -#endif
> -
>  bool module_init_section(const char *name)
>  {
>         return strstarts(name, ".init") ||
> diff --git a/arch/arm/kernel/text.c b/arch/arm/kernel/text.c
> new file mode 100644
> index 000000000000..600143fb909d
> --- /dev/null
> +++ b/arch/arm/kernel/text.c
> @@ -0,0 +1,33 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + *  linux/arch/arm/kernel/module.c
> + *
> + *  Copyright (C) 2002 Russell King.
> + *  Modified for nommu by Hyok S. Choi
> + *
> + * Module allocation method suggested by Andi Kleen.
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +#ifdef CONFIG_MMU
> +void *text_alloc(unsigned long size)
> +{
> +       gfp_t gfp_mask = GFP_KERNEL;
> +       void *p;
> +
> +       /* Silence the initial allocation */
> +       if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
> +               gfp_mask |= __GFP_NOWARN;
> +
> +       p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> +                               gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> +                               __builtin_return_address(0));
> +       if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
> +               return p;
> +       return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
> +                               GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> +                               __builtin_return_address(0));
> +}
> +#endif
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index a561cbb91d4d..7765a45d71b5 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -19,7 +19,7 @@ obj-y                 := debug-monitors.o entry.o irq.o fpsimd.o              \
>                            return_address.o cpuinfo.o cpu_errata.o              \
>                            cpufeature.o alternative.o cacheinfo.o               \
>                            smp.o smp_spin_table.o topology.o smccc-call.o       \
> -                          syscall.o
> +                          syscall.o text.o
>
>  targets                        += efi-entry.o
>
> diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
> index 1cd1a4d0ed30..adde022f703c 100644
> --- a/arch/arm64/kernel/module.c
> +++ b/arch/arm64/kernel/module.c
> @@ -20,48 +20,6 @@
>  #include <asm/insn.h>
>  #include <asm/sections.h>
>
> -void *module_alloc(unsigned long size)
> -{
> -       u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
> -       gfp_t gfp_mask = GFP_KERNEL;
> -       void *p;
> -
> -       /* Silence the initial allocation */
> -       if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
> -               gfp_mask |= __GFP_NOWARN;
> -
> -       if (IS_ENABLED(CONFIG_KASAN))
> -               /* don't exceed the static module region - see below */
> -               module_alloc_end = MODULES_END;
> -
> -       p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> -                               module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
> -                               NUMA_NO_NODE, __builtin_return_address(0));
> -
> -       if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
> -           !IS_ENABLED(CONFIG_KASAN))
> -               /*
> -                * KASAN can only deal with module allocations being served
> -                * from the reserved module region, since the remainder of
> -                * the vmalloc region is already backed by zero shadow pages,
> -                * and punching holes into it is non-trivial. Since the module
> -                * region is not randomized when KASAN is enabled, it is even
> -                * less likely that the module region gets exhausted, so we
> -                * can simply omit this fallback in that case.
> -                */
> -               p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> -                               module_alloc_base + SZ_2G, GFP_KERNEL,
> -                               PAGE_KERNEL, 0, NUMA_NO_NODE,
> -                               __builtin_return_address(0));
> -
> -       if (p && (kasan_module_alloc(p, size) < 0)) {
> -               vfree(p);
> -               return NULL;
> -       }
> -
> -       return p;
> -}
> -
>  enum aarch64_reloc_op {
>         RELOC_OP_NONE,
>         RELOC_OP_ABS,
> diff --git a/arch/arm64/kernel/text.c b/arch/arm64/kernel/text.c
> new file mode 100644
> index 000000000000..64fc7e2d85df
> --- /dev/null
> +++ b/arch/arm64/kernel/text.c
> @@ -0,0 +1,54 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * AArch64 loadable module support.
> + *
> + * Copyright (C) 2012 ARM Limited
> + *
> + * Author: Will Deacon <will.deacon@arm.com>
> + */
> +#include <linux/kasan.h>
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *text_alloc(unsigned long size)
> +{
> +       u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
> +       gfp_t gfp_mask = GFP_KERNEL;
> +       void *p;
> +
> +       /* Silence the initial allocation */
> +       if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
> +               gfp_mask |= __GFP_NOWARN;
> +
> +       if (IS_ENABLED(CONFIG_KASAN))
> +               /* don't exceed the static module region - see below */
> +               module_alloc_end = MODULES_END;
> +
> +       p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> +                               module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
> +                               NUMA_NO_NODE, __builtin_return_address(0));
> +
> +       if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
> +           !IS_ENABLED(CONFIG_KASAN))
> +               /*
> +                * KASAN can only deal with module allocations being served
> +                * from the reserved module region, since the remainder of
> +                * the vmalloc region is already backed by zero shadow pages,
> +                * and punching holes into it is non-trivial. Since the module
> +                * region is not randomized when KASAN is enabled, it is even
> +                * less likely that the module region gets exhausted, so we
> +                * can simply omit this fallback in that case.
> +                */
> +               p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> +                               module_alloc_base + SZ_2G, GFP_KERNEL,
> +                               PAGE_KERNEL, 0, NUMA_NO_NODE,
> +                               __builtin_return_address(0));
> +
> +       if (p && (kasan_module_alloc(p, size) < 0)) {
> +               vfree(p);
> +               return NULL;
> +       }
> +
> +       return p;
> +}
> diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile
> index 8c7a043295ed..37ebf9a7f259 100644
> --- a/arch/mips/kernel/Makefile
> +++ b/arch/mips/kernel/Makefile
> @@ -8,7 +8,7 @@ extra-y         := head.o vmlinux.lds
>  obj-y          += cmpxchg.o cpu-probe.o branch.o elf.o entry.o genex.o idle.o irq.o \
>                    process.o prom.o ptrace.o reset.o setup.o signal.o \
>                    syscall.o time.o topology.o traps.o unaligned.o watch.o \
> -                  vdso.o cacheinfo.o
> +                  vdso.o cacheinfo.o text.o
>
>  ifdef CONFIG_FUNCTION_TRACER
>  CFLAGS_REMOVE_ftrace.o = -pg
> diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
> index 3c0c3d1260c1..f5ac4ebc4bad 100644
> --- a/arch/mips/kernel/module.c
> +++ b/arch/mips/kernel/module.c
> @@ -31,15 +31,6 @@ struct mips_hi16 {
>  static LIST_HEAD(dbe_list);
>  static DEFINE_SPINLOCK(dbe_lock);
>
> -#ifdef MODULE_START
> -void *module_alloc(unsigned long size)
> -{
> -       return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
> -                               GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> -                               __builtin_return_address(0));
> -}
> -#endif
> -
>  static int apply_r_mips_none(struct module *me, u32 *location,
>                              u32 base, Elf_Addr v, bool rela)
>  {
> diff --git a/arch/mips/kernel/text.c b/arch/mips/kernel/text.c
> new file mode 100644
> index 000000000000..55ca87a421c3
> --- /dev/null
> +++ b/arch/mips/kernel/text.c
> @@ -0,0 +1,19 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *
> + *  Copyright (C) 2001 Rusty Russell.
> + *  Copyright (C) 2003, 2004 Ralf Baechle (ralf@linux-mips.org)
> + *  Copyright (C) 2005 Thiemo Seufer
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +#ifdef MODULE_START
> +void *text_alloc(unsigned long size)
> +{
> +       return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
> +                               GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> +                               __builtin_return_address(0));
> +}
> +#endif
> diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c
> index 0af88622c619..2b03f7128809 100644
> --- a/arch/mips/net/bpf_jit.c
> +++ b/arch/mips/net/bpf_jit.c
> @@ -1233,7 +1233,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
>         build_epilogue(&ctx);
>
>         alloc_size = 4 * ctx.idx;
> -       ctx.target = module_alloc(alloc_size);
> +       ctx.target = text_alloc(alloc_size);
>         if (ctx.target == NULL)
>                 goto out;
>
> @@ -1264,7 +1264,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
>  void bpf_jit_free(struct bpf_prog *fp)
>  {
>         if (fp->jited)
> -               module_memfree(fp->bpf_func);
> +               text_memfree(fp->bpf_func);
>
>         bpf_prog_unlock_free(fp);
>  }
> diff --git a/arch/nds32/kernel/Makefile b/arch/nds32/kernel/Makefile
> index 394df3f6442c..fc15778c59d1 100644
> --- a/arch/nds32/kernel/Makefile
> +++ b/arch/nds32/kernel/Makefile
> @@ -10,7 +10,7 @@ AFLAGS_head.o         := -DTEXTADDR=$(TEXTADDR)
>  obj-y                  := ex-entry.o ex-exit.o ex-scall.o irq.o \
>                         process.o ptrace.o setup.o signal.o \
>                         sys_nds32.o time.o traps.o cacheinfo.o \
> -                       dma.o syscall_table.o vdso.o
> +                       dma.o syscall_table.o vdso.o text.o
>
>  obj-$(CONFIG_MODULES)          += nds32_ksyms.o module.o
>  obj-$(CONFIG_STACKTRACE)       += stacktrace.o
> diff --git a/arch/nds32/kernel/module.c b/arch/nds32/kernel/module.c
> index 3897fd14a21d..3d23a12ed535 100644
> --- a/arch/nds32/kernel/module.c
> +++ b/arch/nds32/kernel/module.c
> @@ -7,13 +7,6 @@
>  #include <linux/moduleloader.h>
>  #include <linux/pgtable.h>
>
> -void *module_alloc(unsigned long size)
> -{
> -       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> -                                   GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> -                                   __builtin_return_address(0));
> -}
> -
>  void module_free(struct module *module, void *region)
>  {
>         vfree(region);
> diff --git a/arch/nds32/kernel/text.c b/arch/nds32/kernel/text.c
> new file mode 100644
> index 000000000000..6e86eff9aaf0
> --- /dev/null
> +++ b/arch/nds32/kernel/text.c
> @@ -0,0 +1,12 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (C) 2005-2017 Andes Technology Corporation
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *text_alloc(unsigned long size)
> +{
> +       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> +                                   GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> +                                   __builtin_return_address(0));
> +}
> diff --git a/arch/nios2/kernel/Makefile b/arch/nios2/kernel/Makefile
> index 0b645e1e3158..5476fc749f37 100644
> --- a/arch/nios2/kernel/Makefile
> +++ b/arch/nios2/kernel/Makefile
> @@ -18,6 +18,7 @@ obj-y += setup.o
>  obj-y  += signal.o
>  obj-y  += sys_nios2.o
>  obj-y  += syscall_table.o
> +obj-y  += text.o
>  obj-y  += time.o
>  obj-y  += traps.o
>
> diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c
> index 76e0a42d6e36..20a0faf64e38 100644
> --- a/arch/nios2/kernel/module.c
> +++ b/arch/nios2/kernel/module.c
> @@ -21,25 +21,6 @@
>
>  #include <asm/cacheflush.h>
>
> -/*
> - * Modules should NOT be allocated with kmalloc for (obvious) reasons.
> - * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach
> - * from 0x80000000 (vmalloc area) to 0xc00000000 (kernel) (kmalloc returns
> - * addresses in 0xc0000000)
> - */
> -void *module_alloc(unsigned long size)
> -{
> -       if (size == 0)
> -               return NULL;
> -       return kmalloc(size, GFP_KERNEL);
> -}
> -
> -/* Free memory returned from module_alloc */
> -void module_memfree(void *module_region)
> -{
> -       kfree(module_region);
> -}
> -
>  int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
>                         unsigned int symindex, unsigned int relsec,
>                         struct module *mod)
> diff --git a/arch/nios2/kernel/text.c b/arch/nios2/kernel/text.c
> new file mode 100644
> index 000000000000..af424174442f
> --- /dev/null
> +++ b/arch/nios2/kernel/text.c
> @@ -0,0 +1,34 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Kernel module support for Nios II.
> + *
> + * Copyright (C) 2004 Microtronix Datacom Ltd.
> + *   Written by Wentao Xu <xuwentao@microtronix.com>
> + * Copyright (C) 2001, 2003 Rusty Russell
> + *
> + * This file is subject to the terms and conditions of the GNU General
> + * Public License.  See the file COPYING in the main directory of this
> + * archive for more details.
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +/*
> + * Modules should NOT be allocated with kmalloc for (obvious) reasons.
> + * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach
> + * from 0x80000000 (vmalloc area) to 0xc0000000 (kernel) (kmalloc returns
> + * addresses in 0xc0000000)
> + */
> +void *text_alloc(unsigned long size)
> +{
> +       if (size == 0)
> +               return NULL;
> +       return kmalloc(size, GFP_KERNEL);
> +}
> +
> +/* Free memory returned from text_alloc */
> +void text_memfree(void *module_region)
> +{
> +       kfree(module_region);
> +}
> diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile
> index 068d90950d93..f71f7ffdae2e 100644
> --- a/arch/parisc/kernel/Makefile
> +++ b/arch/parisc/kernel/Makefile
> @@ -10,7 +10,7 @@ obj-y         := cache.o pacache.o setup.o pdt.o traps.o time.o irq.o \
>                    ptrace.o hardware.o inventory.o drivers.o alternative.o \
>                    signal.o hpmc.o real2.o parisc_ksyms.o unaligned.o \
>                    process.o processor.o pdc_cons.o pdc_chassis.o unwind.o \
> -                  patch.o
> +                  patch.o text.o
>
>  ifdef CONFIG_FUNCTION_TRACER
>  # Do not profile debug and lowlevel utilities
> diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
> index 7df140545b22..c81e63e2549b 100644
> --- a/arch/parisc/kernel/module.c
> +++ b/arch/parisc/kernel/module.c
> @@ -192,17 +192,6 @@ static inline int reassemble_22(int as22)
>                 ((as22 & 0x0003ff) << 3));
>  }
>
> -void *module_alloc(unsigned long size)
> -{
> -       /* using RWX means less protection for modules, but it's
> -        * easier than trying to map the text, data, init_text and
> -        * init_data correctly */
> -       return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> -                                   GFP_KERNEL,
> -                                   PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
> -                                   __builtin_return_address(0));
> -}
> -
>  #ifndef CONFIG_64BIT
>  static inline unsigned long count_gots(const Elf_Rela *rela, unsigned long n)
>  {
> diff --git a/arch/parisc/kernel/text.c b/arch/parisc/kernel/text.c
> new file mode 100644
> index 000000000000..9ff503084191
> --- /dev/null
> +++ b/arch/parisc/kernel/text.c
> @@ -0,0 +1,22 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *    Linux/PA-RISC Project
> + *    Copyright (C) 2003 Randolph Chung <tausq at debian . org>
> + *    Copyright (C) 2008 Helge Deller <deller@gmx.de>
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *text_alloc(unsigned long size)
> +{
> +       /*
> +        * Using RWX means less protection for modules, but it's
> +        * easier than trying to map the text, data, init_text and
> +        * init_data correctly.
> +        */
> +       return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> +                                   GFP_KERNEL,
> +                                   PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
> +                                   __builtin_return_address(0));
> +}
> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> index 0acc9d5fb19e..ba1cef7a812d 100644
> --- a/arch/powerpc/net/bpf_jit_comp.c
> +++ b/arch/powerpc/net/bpf_jit_comp.c
> @@ -634,7 +634,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
>
>         proglen = cgctx.idx * 4;
>         alloclen = proglen + FUNCTION_DESCR_SIZE;
> -       image = module_alloc(alloclen);
> +       image = text_alloc(alloclen);
>         if (!image)
>                 goto out;
>
> @@ -678,7 +678,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
>  void bpf_jit_free(struct bpf_prog *fp)
>  {
>         if (fp->jited)
> -               module_memfree(fp->bpf_func);
> +               text_memfree(fp->bpf_func);
>
>         bpf_prog_unlock_free(fp);
>  }
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index b355cf485671..d0b30f286ce6 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -29,6 +29,7 @@ obj-y += riscv_ksyms.o
>  obj-y  += stacktrace.o
>  obj-y  += cacheinfo.o
>  obj-y  += patch.o
> +obj-y  += text.o
>  obj-$(CONFIG_MMU) += vdso.o vdso/
>
>  obj-$(CONFIG_RISCV_M_MODE)     += clint.o traps_misaligned.o
> diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
> index 7191342c54da..f6aa66431c9e 100644
> --- a/arch/riscv/kernel/module.c
> +++ b/arch/riscv/kernel/module.c
> @@ -390,15 +390,3 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
>
>         return 0;
>  }
> -
> -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
> -#define VMALLOC_MODULE_START \
> -        max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
> -void *module_alloc(unsigned long size)
> -{
> -       return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
> -                                   VMALLOC_END, GFP_KERNEL,
> -                                   PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> -                                   __builtin_return_address(0));
> -}
> -#endif
> diff --git a/arch/riscv/kernel/text.c b/arch/riscv/kernel/text.c
> new file mode 100644
> index 000000000000..201608a25641
> --- /dev/null
> +++ b/arch/riscv/kernel/text.c
> @@ -0,0 +1,20 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *
> + *  Copyright (C) 2017 Zihao Yu
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
> +#define VMALLOC_MODULE_START \
> +        max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
> +void *text_alloc(unsigned long size)
> +{
> +       return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
> +                                   VMALLOC_END, GFP_KERNEL,
> +                                   PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> +                                   __builtin_return_address(0));
> +}
> +#endif
> diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
> index a8f136943deb..9f00c320b938 100644
> --- a/arch/s390/kernel/Makefile
> +++ b/arch/s390/kernel/Makefile
> @@ -40,7 +40,7 @@ obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o
>  obj-y  += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
>  obj-y  += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
>  obj-y  += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
> -obj-y  += smp.o
> +obj-y  += smp.o text.o
>
>  extra-y                                += head64.o vmlinux.lds
>
> diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
> index b388e87a08bf..a752b1442846 100644
> --- a/arch/s390/kernel/ftrace.c
> +++ b/arch/s390/kernel/ftrace.c
> @@ -134,7 +134,7 @@ static int __init ftrace_plt_init(void)
>  {
>         unsigned int *ip;
>
> -       ftrace_plt = (unsigned long) module_alloc(PAGE_SIZE);
> +       ftrace_plt = (unsigned long) text_alloc(PAGE_SIZE);
>         if (!ftrace_plt)
>                 panic("cannot allocate ftrace plt\n");
>         ip = (unsigned int *) ftrace_plt;
> diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
> index 4055f1c49814..087cb5951de6 100644
> --- a/arch/s390/kernel/module.c
> +++ b/arch/s390/kernel/module.c
> @@ -32,22 +32,6 @@
>
>  #define PLT_ENTRY_SIZE 20
>
> -void *module_alloc(unsigned long size)
> -{
> -       void *p;
> -
> -       if (PAGE_ALIGN(size) > MODULES_LEN)
> -               return NULL;
> -       p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
> -                                GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> -                                __builtin_return_address(0));
> -       if (p && (kasan_module_alloc(p, size) < 0)) {
> -               vfree(p);
> -               return NULL;
> -       }
> -       return p;
> -}
> -
>  void module_arch_freeing_init(struct module *mod)
>  {
>         if (is_livepatch_module(mod) &&
> diff --git a/arch/s390/kernel/text.c b/arch/s390/kernel/text.c
> new file mode 100644
> index 000000000000..63aaa1ab727b
> --- /dev/null
> +++ b/arch/s390/kernel/text.c
> @@ -0,0 +1,23 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + *  Kernel module help for s390.
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *text_alloc(unsigned long size)
> +{
> +       void *p;
> +
> +       if (PAGE_ALIGN(size) > MODULES_LEN)
> +               return NULL;
> +       p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
> +                                GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> +                                __builtin_return_address(0));
> +       if (p && (kasan_module_alloc(p, size) < 0)) {
> +               vfree(p);
> +               return NULL;
> +       }
> +       return p;
> +}
> diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
> index 97c0e19263d1..e025f9e1db4a 100644
> --- a/arch/sparc/kernel/Makefile
> +++ b/arch/sparc/kernel/Makefile
> @@ -52,6 +52,7 @@ obj-y                   += prom_common.o
>  obj-y                   += prom_$(BITS).o
>  obj-y                   += of_device_common.o
>  obj-y                   += of_device_$(BITS).o
> +obj-y                  += text.o
>  obj-$(CONFIG_SPARC64)   += prom_irqtrans.o
>
>  obj-$(CONFIG_SPARC32)   += leon_kernel.o
> diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
> index df39580f398d..f2babc69f189 100644
> --- a/arch/sparc/kernel/module.c
> +++ b/arch/sparc/kernel/module.c
> @@ -21,36 +21,6 @@
>
>  #include "entry.h"
>
> -#ifdef CONFIG_SPARC64
> -
> -#include <linux/jump_label.h>
> -
> -static void *module_map(unsigned long size)
> -{
> -       if (PAGE_ALIGN(size) > MODULES_LEN)
> -               return NULL;
> -       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> -                               GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> -                               __builtin_return_address(0));
> -}
> -#else
> -static void *module_map(unsigned long size)
> -{
> -       return vmalloc(size);
> -}
> -#endif /* CONFIG_SPARC64 */
> -
> -void *module_alloc(unsigned long size)
> -{
> -       void *ret;
> -
> -       ret = module_map(size);
> -       if (ret)
> -               memset(ret, 0, size);
> -
> -       return ret;
> -}
> -
>  /* Make generic code ignore STT_REGISTER dummy undefined symbols.  */
>  int module_frob_arch_sections(Elf_Ehdr *hdr,
>                               Elf_Shdr *sechdrs,
> diff --git a/arch/sparc/kernel/text.c b/arch/sparc/kernel/text.c
> new file mode 100644
> index 000000000000..d16663f2c6ba
> --- /dev/null
> +++ b/arch/sparc/kernel/text.c
> @@ -0,0 +1,39 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Kernel module help for sparc64.
> + *
> + * Copyright (C) 2001 Rusty Russell.
> + * Copyright (C) 2002 David S. Miller.
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +#ifdef CONFIG_SPARC64
> +
> +#include <linux/jump_label.h>
> +
> +static void *module_map(unsigned long size)
> +{
> +       if (PAGE_ALIGN(size) > MODULES_LEN)
> +               return NULL;
> +       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> +                               GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> +                               __builtin_return_address(0));
> +}
> +#else
> +static void *module_map(unsigned long size)
> +{
> +       return vmalloc(size);
> +}
> +#endif /* CONFIG_SPARC64 */
> +
> +void *text_alloc(unsigned long size)
> +{
> +       void *ret;
> +
> +       ret = module_map(size);
> +       if (ret)
> +               memset(ret, 0, size);
> +
> +       return ret;
> +}
> diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c
> index c8eabb973b86..d9dd513b27b2 100644
> --- a/arch/sparc/net/bpf_jit_comp_32.c
> +++ b/arch/sparc/net/bpf_jit_comp_32.c
> @@ -713,7 +713,7 @@ cond_branch:                        f_offset = addrs[i + filter[i].jf];
>                                 if (unlikely(proglen + ilen > oldproglen)) {
>                                         pr_err("bpb_jit_compile fatal error\n");
>                                         kfree(addrs);
> -                                       module_memfree(image);
> +                                       text_memfree(image);
>                                         return;
>                                 }
>                                 memcpy(image + proglen, temp, ilen);
> @@ -736,7 +736,7 @@ cond_branch:                        f_offset = addrs[i + filter[i].jf];
>                         break;
>                 }
>                 if (proglen == oldproglen) {
> -                       image = module_alloc(proglen);
> +                       image = text_alloc(proglen);
>                         if (!image)
>                                 goto out;
>                 }
> @@ -758,7 +758,7 @@ cond_branch:                        f_offset = addrs[i + filter[i].jf];
>  void bpf_jit_free(struct bpf_prog *fp)
>  {
>         if (fp->jited)
> -               module_memfree(fp->bpf_func);
> +               text_memfree(fp->bpf_func);
>
>         bpf_prog_unlock_free(fp);
>  }
> diff --git a/arch/unicore32/kernel/Makefile b/arch/unicore32/kernel/Makefile
> index 2f79aa56735b..96eb8cfc8b1e 100644
> --- a/arch/unicore32/kernel/Makefile
> +++ b/arch/unicore32/kernel/Makefile
> @@ -6,6 +6,7 @@
>  # Object file lists.
>  obj-y                          := dma.o elf.o entry.o process.o ptrace.o
>  obj-y                          += setup.o signal.o sys.o stacktrace.o traps.o
> +obj-y                          += text.o
>
>  obj-$(CONFIG_MODULES)          += ksyms.o module.o
>  obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
> diff --git a/arch/unicore32/kernel/module.c b/arch/unicore32/kernel/module.c
> index 67c89ef2d6ee..e1e703c02379 100644
> --- a/arch/unicore32/kernel/module.c
> +++ b/arch/unicore32/kernel/module.c
> @@ -18,13 +18,6 @@
>
>  #include <asm/sections.h>
>
> -void *module_alloc(unsigned long size)
> -{
> -       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> -                               GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> -                               __builtin_return_address(0));
> -}
> -
>  int
>  apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
>                unsigned int relindex, struct module *module)
> diff --git a/arch/unicore32/kernel/text.c b/arch/unicore32/kernel/text.c
> new file mode 100644
> index 000000000000..b94aac824bb8
> --- /dev/null
> +++ b/arch/unicore32/kernel/text.c
> @@ -0,0 +1,18 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * linux/arch/unicore32/kernel/text.c
> + *
> + * Code specific to PKUnity SoC and UniCore ISA
> + *
> + * Copyright (C) 2001-2010 GUAN Xue-tao
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *text_alloc(unsigned long size)
> +{
> +       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> +                               GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> +                               __builtin_return_address(0));
> +}
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index e77261db2391..2878e4b753a0 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -68,6 +68,7 @@ obj-y                 += tsc.o tsc_msr.o io_delay.o rtc.o
>  obj-y                  += pci-iommu_table.o
>  obj-y                  += resource.o
>  obj-y                  += irqflags.o
> +obj-y                  += text.o
>
>  obj-y                          += process.o
>  obj-y                          += fpu/
> diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
> index 51504566b3a6..f76703ee96f2 100644
> --- a/arch/x86/kernel/ftrace.c
> +++ b/arch/x86/kernel/ftrace.c
> @@ -265,11 +265,11 @@ int __init ftrace_dyn_arch_init(void)
>  /* Module allocation simplifies allocating memory for code */
>  static inline void *alloc_tramp(unsigned long size)
>  {
> -       return module_alloc(size);
> +       return text_alloc(size);
>  }
>  static inline void tramp_free(void *tramp)
>  {
> -       module_memfree(tramp);
> +       text_memfree(tramp);
>  }
>  #else
>  /* Trampolines can only be created if modules are supported */
> diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
> index ada39ddbc922..e9ac7d3c658e 100644
> --- a/arch/x86/kernel/kprobes/core.c
> +++ b/arch/x86/kernel/kprobes/core.c
> @@ -423,7 +423,7 @@ void *alloc_insn_page(void)
>  {
>         void *page;
>
> -       page = module_alloc(PAGE_SIZE);
> +       page = text_alloc(PAGE_SIZE);
>         if (!page)
>                 return NULL;
>
> @@ -446,7 +446,7 @@ void *alloc_insn_page(void)
>  /* Recover page to RW mode before releasing it */
>  void free_insn_page(void *page)
>  {
> -       module_memfree(page);
> +       text_memfree(page);
>  }
>
>  static int arch_copy_kprobe(struct kprobe *p)
> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> index 34b153cbd4ac..261df078f127 100644
> --- a/arch/x86/kernel/module.c
> +++ b/arch/x86/kernel/module.c
> @@ -36,55 +36,6 @@ do {                                                 \
>  } while (0)
>  #endif
>
> -#ifdef CONFIG_RANDOMIZE_BASE
> -static unsigned long module_load_offset;
> -
> -/* Mutex protects the module_load_offset. */
> -static DEFINE_MUTEX(module_kaslr_mutex);
> -
> -static unsigned long int get_module_load_offset(void)
> -{
> -       if (kaslr_enabled()) {
> -               mutex_lock(&module_kaslr_mutex);
> -               /*
> -                * Calculate the module_load_offset the first time this
> -                * code is called. Once calculated it stays the same until
> -                * reboot.
> -                */
> -               if (module_load_offset == 0)
> -                       module_load_offset =
> -                               (get_random_int() % 1024 + 1) * PAGE_SIZE;
> -               mutex_unlock(&module_kaslr_mutex);
> -       }
> -       return module_load_offset;
> -}
> -#else
> -static unsigned long int get_module_load_offset(void)
> -{
> -       return 0;
> -}
> -#endif
> -
> -void *module_alloc(unsigned long size)
> -{
> -       void *p;
> -
> -       if (PAGE_ALIGN(size) > MODULES_LEN)
> -               return NULL;
> -
> -       p = __vmalloc_node_range(size, MODULE_ALIGN,
> -                                   MODULES_VADDR + get_module_load_offset(),
> -                                   MODULES_END, GFP_KERNEL,
> -                                   PAGE_KERNEL, 0, NUMA_NO_NODE,
> -                                   __builtin_return_address(0));
> -       if (p && (kasan_module_alloc(p, size) < 0)) {
> -               vfree(p);
> -               return NULL;
> -       }
> -
> -       return p;
> -}
> -
>  #ifdef CONFIG_X86_32
>  int apply_relocate(Elf32_Shdr *sechdrs,
>                    const char *strtab,
> diff --git a/arch/x86/kernel/text.c b/arch/x86/kernel/text.c
> new file mode 100644
> index 000000000000..724ab2d93ac5
> --- /dev/null
> +++ b/arch/x86/kernel/text.c
> @@ -0,0 +1,60 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *  Kernel module help for x86.
> + *  Copyright (C) 2001 Rusty Russell.
> + */
> +#include <linux/kasan.h>
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/random.h>
> +#include <linux/vmalloc.h>
> +#include <asm/setup.h>
> +
> +#ifdef CONFIG_RANDOMIZE_BASE
> +static unsigned long module_load_offset;
> +
> +/* Mutex protects the module_load_offset. */
> +static DEFINE_MUTEX(module_kaslr_mutex);
> +
> +static unsigned long get_module_load_offset(void)
> +{
> +       if (kaslr_enabled()) {
> +               mutex_lock(&module_kaslr_mutex);
> +               /*
> +                * Calculate the module_load_offset the first time this
> +                * code is called. Once calculated it stays the same until
> +                * reboot.
> +                */
> +               if (module_load_offset == 0)
> +                       module_load_offset =
> +                               (get_random_int() % 1024 + 1) * PAGE_SIZE;
> +               mutex_unlock(&module_kaslr_mutex);
> +       }
> +       return module_load_offset;
> +}
> +#else
> +static unsigned long get_module_load_offset(void)
> +{
> +       return 0;
> +}
> +#endif
> +
> +void *text_alloc(unsigned long size)
> +{
> +       void *p;
> +
> +       if (PAGE_ALIGN(size) > MODULES_LEN)
> +               return NULL;
> +
> +       p = __vmalloc_node_range(size, MODULE_ALIGN,
> +                                   MODULES_VADDR + get_module_load_offset(),
> +                                   MODULES_END, GFP_KERNEL,
> +                                   PAGE_KERNEL, 0, NUMA_NO_NODE,
> +                                   __builtin_return_address(0));
> +       if (p && (kasan_module_alloc(p, size) < 0)) {
> +               vfree(p);
> +               return NULL;
> +       }
> +
> +       return p;
> +}
> diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
> index 4fa67a8b2265..4e8b9ba431ee 100644
> --- a/include/linux/moduleloader.h
> +++ b/include/linux/moduleloader.h
> @@ -24,10 +24,10 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section);
>
>  /* Allocator used for allocating struct module, core sections and init
>     sections.  Returns NULL on failure. */
> -void *module_alloc(unsigned long size);
> +void *text_alloc(unsigned long size);
>
>  /* Free memory returned from module_alloc. */
> -void module_memfree(void *module_region);
> +void text_memfree(void *module_region);
>
>  /* Determines if the section name is an init section (that is only used during
>   * module loading).
> diff --git a/kernel/Makefile b/kernel/Makefile
> index f3218bc5ec69..9e88e81f68ef 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o \
>             extable.o params.o \
>             kthread.o sys_ni.o nsproxy.o \
>             notifier.o ksysfs.o cred.o reboot.o \
> -           async.o range.o smpboot.o ucount.o
> +           async.o range.o smpboot.o ucount.o text.o
>
>  obj-$(CONFIG_MODULES) += kmod.o
>  obj-$(CONFIG_MULTIUSER) += groups.o
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 9df4cc9a2907..febd55019a8a 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -840,12 +840,12 @@ static void bpf_jit_uncharge_modmem(u32 pages)
>
>  void *__weak bpf_jit_alloc_exec(unsigned long size)
>  {
> -       return module_alloc(size);
> +       return text_alloc(size);
>  }
>
>  void __weak bpf_jit_free_exec(void *addr)
>  {
> -       module_memfree(addr);
> +       text_memfree(addr);
>  }
>
>  struct bpf_binary_header *
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index 4a904cc56d68..d1c354ec89de 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -111,12 +111,12 @@ enum kprobe_slot_state {
>
>  void __weak *alloc_insn_page(void)
>  {
> -       return module_alloc(PAGE_SIZE);
> +       return text_alloc(PAGE_SIZE);
>  }
>
>  void __weak free_insn_page(void *page)
>  {
> -       module_memfree(page);
> +       text_memfree(page);
>  }
>
>  struct kprobe_insn_cache kprobe_insn_slots = {
> diff --git a/kernel/module.c b/kernel/module.c
> index bee1c25ca5c5..bdb3773f3668 100644
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -2141,16 +2141,6 @@ static void free_module_elf(struct module *mod)
>  }
>  #endif /* CONFIG_LIVEPATCH */
>
> -void __weak module_memfree(void *module_region)
> -{
> -       /*
> -        * This memory may be RO, and freeing RO memory in an interrupt is not
> -        * supported by vmalloc.
> -        */
> -       WARN_ON(in_interrupt());
> -       vfree(module_region);
> -}
> -
>  void __weak module_arch_cleanup(struct module *mod)
>  {
>  }
> @@ -2200,7 +2190,7 @@ static void free_module(struct module *mod)
>
>         /* This may be empty, but that's OK */
>         module_arch_freeing_init(mod);
> -       module_memfree(mod->init_layout.base);
> +       text_memfree(mod->init_layout.base);
>         kfree(mod->args);
>         percpu_modfree(mod);
>
> @@ -2208,7 +2198,7 @@ static void free_module(struct module *mod)
>         lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
>
>         /* Finally, free the core (containing the module structure) */
> -       module_memfree(mod->core_layout.base);
> +       text_memfree(mod->core_layout.base);
>  }
>
>  void *__symbol_get(const char *symbol)
> @@ -2781,13 +2771,6 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
>                 ddebug_remove_module(mod->name);
>  }
>
> -void * __weak module_alloc(unsigned long size)
> -{
> -       return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> -                       GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> -                       NUMA_NO_NODE, __builtin_return_address(0));
> -}
> -
>  bool __weak module_init_section(const char *name)
>  {
>         return strstarts(name, ".init");
> @@ -3246,7 +3229,7 @@ static int move_module(struct module *mod, struct load_info *info)
>         void *ptr;
>
>         /* Do the allocs. */
> -       ptr = module_alloc(mod->core_layout.size);
> +       ptr = text_alloc(mod->core_layout.size);
>         /*
>          * The pointer to this block is stored in the module structure
>          * which is inside the block. Just mark it as not being a
> @@ -3260,7 +3243,7 @@ static int move_module(struct module *mod, struct load_info *info)
>         mod->core_layout.base = ptr;
>
>         if (mod->init_layout.size) {
> -               ptr = module_alloc(mod->init_layout.size);
> +               ptr = text_alloc(mod->init_layout.size);
>                 /*
>                  * The pointer to this block is stored in the module structure
>                  * which is inside the block. This block doesn't need to be
> @@ -3269,7 +3252,7 @@ static int move_module(struct module *mod, struct load_info *info)
>                  */
>                 kmemleak_ignore(ptr);
>                 if (!ptr) {
> -                       module_memfree(mod->core_layout.base);
> +                       text_memfree(mod->core_layout.base);
>                         return -ENOMEM;
>                 }
>                 memset(ptr, 0, mod->init_layout.size);
> @@ -3452,8 +3435,8 @@ static void module_deallocate(struct module *mod, struct load_info *info)
>  {
>         percpu_modfree(mod);
>         module_arch_freeing_init(mod);
> -       module_memfree(mod->init_layout.base);
> -       module_memfree(mod->core_layout.base);
> +       text_memfree(mod->init_layout.base);
> +       text_memfree(mod->core_layout.base);
>  }
>
>  int __weak module_finalize(const Elf_Ehdr *hdr,
> @@ -3527,7 +3510,7 @@ static void do_free_init(struct work_struct *w)
>
>         llist_for_each_safe(pos, n, list) {
>                 initfree = container_of(pos, struct mod_initfree, node);
> -               module_memfree(initfree->module_init);
> +               text_memfree(initfree->module_init);
>                 kfree(initfree);
>         }
>  }
> @@ -3626,10 +3609,10 @@ static noinline int do_init_module(struct module *mod)
>          * We want to free module_init, but be aware that kallsyms may be
>          * walking this with preempt disabled.  In all the failure paths, we
>          * call synchronize_rcu(), but we don't want to slow down the success
> -        * path. module_memfree() cannot be called in an interrupt, so do the
> +        * path. text_memfree() cannot be called in an interrupt, so do the
>          * work and call synchronize_rcu() in a work queue.
>          *
> -        * Note that module_alloc() on most architectures creates W+X page
> +        * Note that text_alloc() on most architectures creates W+X page
>          * mappings which won't be cleaned up until do_free_init() runs.  Any
>          * code such as mark_rodata_ro() which depends on those mappings to
>          * be cleaned up needs to sync with the queued work - ie
> diff --git a/kernel/text.c b/kernel/text.c
> new file mode 100644
> index 000000000000..9a12c508ded5
> --- /dev/null
> +++ b/kernel/text.c
> @@ -0,0 +1,25 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *  Copyright (C) 2002 Richard Henderson
> + *  Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void __weak text_memfree(void *module_region)
> +{
> +       /*
> +        * This memory may be RO, and freeing RO memory in an interrupt is not
> +        * supported by vmalloc.
> +        */
> +       WARN_ON(in_interrupt());
> +       vfree(module_region);
> +}
> +
> +void * __weak text_alloc(unsigned long size)
> +{
> +       return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> +                       GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> +                       NUMA_NO_NODE, __builtin_return_address(0));
> +}
> --
> 2.25.1
>

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox