LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 11/12] ppc64/kexec_file: add appropriate regions for memory reserve map
From: Hari Bathini @ 2020-07-13 17:23 UTC (permalink / raw)
  To: Michael Ellerman, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
	Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <159466074408.24747.10036072269371204890.stgit@hbathini.in.ibm.com>

While initrd, elfcorehdr and backup regions are already added to the
reserve map, there are a few missing regions that need to be added to
the memory reserve map. Add them here. And now that all the changes
to load panic kernel are in place, claim likewise.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
---

v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.

v1 -> v2:
* Updated add_rtas_mem_range() & add_opal_mem_range() callsites based on
  the new prototype for these functions.


 arch/powerpc/kexec/file_load_64.c |   58 ++++++++++++++++++++++++++++++++++---
 1 file changed, 53 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index 2531bb5..29e5d11 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -193,6 +193,34 @@ static int get_crash_memory_ranges(struct crash_mem **mem_ranges)
 }
 
 /**
+ * get_reserved_memory_ranges - Get reserve memory ranges. This list includes
+ *                              memory regions that should be added to the
+ *                              memory reserve map to ensure the region is
+ *                              protected from any mischeif.
+ * @mem_ranges:                 Range list to add the memory ranges to.
+ *
+ * Returns 0 on success, negative errno on error.
+ */
+static int get_reserved_memory_ranges(struct crash_mem **mem_ranges)
+{
+	int ret;
+
+	ret = add_rtas_mem_range(mem_ranges);
+	if (ret)
+		goto out;
+
+	ret = add_tce_mem_ranges(mem_ranges);
+	if (ret)
+		goto out;
+
+	ret = add_reserved_ranges(mem_ranges);
+out:
+	if (ret)
+		pr_err("Failed to setup reserved memory ranges\n");
+	return ret;
+}
+
+/**
  * __locate_mem_hole_top_down - Looks top down for a large enough memory hole
  *                              in the memory regions between buf_min & buf_max
  *                              for the buffer. If found, sets kbuf->mem.
@@ -1202,8 +1230,8 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
 			unsigned long initrd_load_addr,
 			unsigned long initrd_len, const char *cmdline)
 {
-	struct crash_mem *umem = NULL;
-	int chosen_node, ret;
+	struct crash_mem *umem = NULL, *rmem = NULL;
+	int i, nr_ranges, chosen_node, ret;
 
 	/* Remove memory reservation for the current device tree. */
 	ret = delete_fdt_mem_rsv(fdt, __pa(initial_boot_params),
@@ -1249,6 +1277,25 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
 		}
 	}
 
+	/* Update memory reserve map */
+	ret = get_reserved_memory_ranges(&rmem);
+	if (ret)
+		goto out;
+
+	nr_ranges = rmem ? rmem->nr_ranges : 0;
+	for (i = 0; i < nr_ranges; i++) {
+		u64 base, size;
+
+		base = rmem->ranges[i].start;
+		size = rmem->ranges[i].end - base + 1;
+		ret = fdt_add_mem_rsv(fdt, base, size);
+		if (ret) {
+			pr_err("Error updating memory reserve map: %s\n",
+			       fdt_strerror(ret));
+			goto out;
+		}
+	}
+
 	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len,
 			    cmdline, &chosen_node);
 	if (ret)
@@ -1259,6 +1306,7 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt,
 		pr_err("Failed to update device-tree with linux,booted-from-kexec\n");
 out:
 	kfree(umem);
+	kfree(rmem);
 	return ret;
 }
 
@@ -1433,10 +1481,10 @@ int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
 
 		/* Get exclude memory ranges needed for setting up kdump segments */
 		ret = get_exclude_memory_ranges(&(image->arch.exclude_ranges));
-		if (ret)
+		if (ret) {
 			pr_err("Failed to setup exclude memory ranges for buffer lookup\n");
-		/* Return this until all changes for panic kernel are in */
-		return -EOPNOTSUPP;
+			return ret;
+		}
 	}
 
 	return kexec_image_probe_default(image, buf, buf_len);


^ permalink raw reply related

* [PATCH v3 12/12] ppc64/kexec_file: fix kexec load failure with lack of memory hole
From: Hari Bathini @ 2020-07-13 17:23 UTC (permalink / raw)
  To: Michael Ellerman, Andrew Morton
  Cc: Pingfan Liu, Kexec-ml, Nayna Jain, Petr Tesarik,
	Mahesh J Salgaonkar, Mimi Zohar, lkml, linuxppc-dev, Sourabh Jain,
	Vivek Goyal, Dave Young, Thiago Jung Bauermann, Eric Biederman
In-Reply-To: <159466074408.24747.10036072269371204890.stgit@hbathini.in.ibm.com>

The kexec purgatory has to run in real mode. Only the first memory
block maybe accessible in real mode. And, unlike the case with panic
kernel, no memory is set aside for regular kexec load. Another thing
to note is, the memory for crashkernel is reserved at an offset of
128MB. So, when crashkernel memory is reserved, the memory ranges to
load kexec segments shrink further as the generic code only looks for
memblock free memory ranges and in all likelihood only a tiny bit of
memory from 0 to 128MB would be available to load kexec segments.

With kdump being used by default in general, kexec file load is likely
to fail almost always. This can be fixed by changing the memory hole
lookup logic for regular kexec to use the same method as kdump. This
would mean that most kexec segments will overlap with crashkernel
memory region. That should still be ok as the pages, whose destination
address isn't available while loading, are placed in an intermediate
location till a flush to the actual destination address happens during
kexec boot sequence.

Signed-off-by: Hari Bathini <hbathini@linux.ibm.com>
Tested-by: Pingfan Liu <piliu@redhat.com>
---

v2 -> v3:
* Unchanged. Added Tested-by tag from Pingfan.

v1 -> v2:
* New patch to fix locating memory hole for kexec_file_load (kexec -s -l)
  when memory is reserved for crashkernel.


 arch/powerpc/kexec/file_load_64.c |   33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c
index 29e5d11..b79d62bc 100644
--- a/arch/powerpc/kexec/file_load_64.c
+++ b/arch/powerpc/kexec/file_load_64.c
@@ -1328,13 +1328,6 @@ int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
 	u64 buf_min, buf_max;
 	int ret;
 
-	/*
-	 * Use the generic kexec_locate_mem_hole for regular
-	 * kexec_file_load syscall
-	 */
-	if (kbuf->image->type != KEXEC_TYPE_CRASH)
-		return kexec_locate_mem_hole(kbuf);
-
 	/* Look up the exclude ranges list while locating the memory hole */
 	emem = &(kbuf->image->arch.exclude_ranges);
 	if (!(*emem) || ((*emem)->nr_ranges == 0)) {
@@ -1342,11 +1335,15 @@ int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf)
 		return kexec_locate_mem_hole(kbuf);
 	}
 
+	buf_min = kbuf->buf_min;
+	buf_max = kbuf->buf_max;
 	/* Segments for kdump kernel should be within crashkernel region */
-	buf_min = (kbuf->buf_min < crashk_res.start ?
-		   crashk_res.start : kbuf->buf_min);
-	buf_max = (kbuf->buf_max > crashk_res.end ?
-		   crashk_res.end : kbuf->buf_max);
+	if (kbuf->image->type == KEXEC_TYPE_CRASH) {
+		buf_min = (buf_min < crashk_res.start ?
+			   crashk_res.start : buf_min);
+		buf_max = (buf_max > crashk_res.end ?
+			   crashk_res.end : buf_max);
+	}
 
 	if (buf_min > buf_max) {
 		pr_err("Invalid buffer min and/or max values\n");
@@ -1476,15 +1473,13 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi,
 int arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
 				  unsigned long buf_len)
 {
-	if (image->type == KEXEC_TYPE_CRASH) {
-		int ret;
+	int ret;
 
-		/* Get exclude memory ranges needed for setting up kdump segments */
-		ret = get_exclude_memory_ranges(&(image->arch.exclude_ranges));
-		if (ret) {
-			pr_err("Failed to setup exclude memory ranges for buffer lookup\n");
-			return ret;
-		}
+	/* Get exclude memory ranges needed for setting up kexec segments */
+	ret = get_exclude_memory_ranges(&(image->arch.exclude_ranges));
+	if (ret) {
+		pr_err("Failed to setup exclude memory ranges for buffer lookup\n");
+		return ret;
 	}
 
 	return kexec_image_probe_default(image, buf, buf_len);


^ permalink raw reply related

* Re: [RFC PATCH 7/7] lazy tlb: shoot lazies, a non-refcounting lazy tlb option
From: Andy Lutomirski @ 2020-07-13 18:18 UTC (permalink / raw)
  To: Nicholas Piggin
  Cc: linux-arch, Arnd Bergmann, Peter Zijlstra, X86 ML, LKML, Linux-MM,
	Mathieu Desnoyers, Andy Lutomirski, linuxppc-dev
In-Reply-To: <1594658283.qabzoxga67.astroid@bobo.none>


> On Jul 13, 2020, at 9:48 AM, Nicholas Piggin <npiggin@gmail.com> wrote:
> 
> Excerpts from Andy Lutomirski's message of July 14, 2020 1:59 am:
>>> On Thu, Jul 9, 2020 at 6:57 PM Nicholas Piggin <npiggin@gmail.com> wrote:
>>> 
>>> On big systems, the mm refcount can become highly contented when doing
>>> a lot of context switching with threaded applications (particularly
>>> switching between the idle thread and an application thread).
>>> 
>>> Abandoning lazy tlb slows switching down quite a bit in the important
>>> user->idle->user cases, so so instead implement a non-refcounted scheme
>>> that causes __mmdrop() to IPI all CPUs in the mm_cpumask and shoot down
>>> any remaining lazy ones.
>>> 
>>> On a 16-socket 192-core POWER8 system, a context switching benchmark
>>> with as many software threads as CPUs (so each switch will go in and
>>> out of idle), upstream can achieve a rate of about 1 million context
>>> switches per second. After this patch it goes up to 118 million.
>>> 
>> 
>> I read the patch a couple of times, and I have a suggestion that could
>> be nonsense.  You are, effectively, using mm_cpumask() as a sort of
>> refcount.  You're saying "hey, this mm has no more references, but it
>> still has nonempty mm_cpumask(), so let's send an IPI and shoot down
>> those references too."  I'm wondering whether you actually need the
>> IPI.  What if, instead, you actually treated mm_cpumask as a refcount
>> for real?  Roughly, in __mmdrop(), you would only free the page tables
>> if mm_cpumask() is empty.  And, in the code that removes a CPU from
>> mm_cpumask(), you would check if mm_users == 0 and, if so, check if
>> you just removed the last bit from mm_cpumask and potentially free the
>> mm.
>> 
>> Getting the locking right here could be a bit tricky -- you need to
>> avoid two CPUs simultaneously exiting lazy TLB and thinking they
>> should free the mm, and you also need to avoid an mm with mm_users
>> hitting zero concurrently with the last remote CPU using it lazily
>> exiting lazy TLB.  Perhaps this could be resolved by having mm_count
>> == 1 mean "mm_cpumask() is might contain bits and, if so, it owns the
>> mm" and mm_count == 0 meaning "now it's dead" and using some careful
>> cmpxchg or dec_return to make sure that only one CPU frees it.
>> 
>> Or maybe you'd need a lock or RCU for this, but the idea would be to
>> only ever take the lock after mm_users goes to zero.
> 
> I don't think it's nonsense, it could be a good way to avoid IPIs.
> 
> I haven't seen much problem here that made me too concerned about IPIs 
> yet, so I think the simple patch may be good enough to start with
> for powerpc. I'm looking at avoiding/reducing the IPIs by combining the
> unlazying with the exit TLB flush without doing anything fancy with
> ref counting, but we'll see.

I would be cautious with benchmarking here. I would expect that the nasty cases may affect power consumption more than performance — the specific issue is IPIs hitting idle cores, and the main effects are to slow down exit() a bit but also to kick the idle core out of idle. Although, if the idle core is in a deep sleep, that IPI could be *very* slow.

So I think it’s worth at least giving this a try.

> 
> Thanks,
> Nick

^ permalink raw reply

* Re: [PATCH v2 0/3] Power10 basic energy management
From: Pratik Sampat @ 2020-07-13 18:27 UTC (permalink / raw)
  To: Nicholas Piggin, benh, ego, linux-kernel, linuxppc-dev, mikey,
	mpe, paulus, pratik.r.sampat, ravi.bangoria, svaidy
In-Reply-To: <1594658947.97ndhsx6xh.astroid@bobo.none>



On 13/07/20 10:20 pm, Nicholas Piggin wrote:
> Excerpts from Pratik Sampat's message of July 13, 2020 8:02 pm:
>> Thank you for your comments,
>>
>> On 13/07/20 10:53 am, Nicholas Piggin wrote:
>>> Excerpts from Pratik Rajesh Sampat's message of July 10, 2020 3:22 pm:
>>>> Changelog v1 --> v2:
>>>> 1. Save-restore DAWR and DAWRX unconditionally as they are lost in
>>>> shallow idle states too
>>>> 2. Rename pnv_first_spr_loss_level to pnv_first_fullstate_loss_level to
>>>> correct naming terminology
>>>>
>>>> Pratik Rajesh Sampat (3):
>>>>     powerpc/powernv/idle: Exclude mfspr on HID1,4,5 on P9 and above
>>>>     powerpc/powernv/idle: save-restore DAWR0,DAWRX0 for P10
>>>>     powerpc/powernv/idle: Rename pnv_first_spr_loss_level variable
>>>>
>>>>    arch/powerpc/platforms/powernv/idle.c | 34 +++++++++++++++++----------
>>>>    1 file changed, 22 insertions(+), 12 deletions(-)
>>> These look okay to me, but the CPU_FTR_ARCH_300 test for
>>> pnv_power9_idle_init() is actually wrong, it should be a PVR test
>>> because idle is not completely architected (not even shallow stop
>>> states, unfortunately).
>>>
>>> It doesn't look like we support POWER10 idle correctly yet, and on older
>>> kernels it wouldn't work even if we fixed newer, so ideally the PVR
>>> check would be backported as a fix in the front of the series.
>>>
>>> Sadly, we have no OPAL idle driver yet. Hopefully we will before the
>>> next processor shows up :P
>>>
>>> Thanks,
>>> Nick
>> So if I understand this correctly, in powernv/idle.c where we check for
>> CPU_FTR_ARCH_300, we should rather be making a pvr_version_is(PVR_POWER9)
>> check instead?
>>
>> Of course, the P10 PVR and its relevant checks will have to be added then too.
> Yes I think so, unfortunately.
>
> Thanks,
> Nick

Sure, I'll add these checks in.

Thanks,
Pratik


^ permalink raw reply

* Re: /sys/kernel/debug/kmemleak empty despite kmemleak reports
From: Catalin Marinas @ 2020-07-13 18:27 UTC (permalink / raw)
  To: Paul Menzel; +Cc: linuxppc-dev
In-Reply-To: <276e4bce-3e21-8020-9a1c-729a6cafcdd3@molgen.mpg.de>

On Thu, Jul 09, 2020 at 11:08:52PM +0200, Paul Menzel wrote:
> Am 09.07.20 um 19:57 schrieb Catalin Marinas:
> > On Thu, Jul 09, 2020 at 04:37:10PM +0200, Paul Menzel wrote:
> > > Despite Linux 5.8-rc4 reporting memory leaks on the IBM POWER 8 S822LC, the
> > > file does not contain more information.
> > > 
> > > > $ dmesg
> > > > […] > [48662.953323] perf: interrupt took too long (2570 > 2500), lowering kernel.perf_event_max_sample_rate to 77750
> > > > [48854.810636] perf: interrupt took too long (3216 > 3212), lowering kernel.perf_event_max_sample_rate to 62000
> > > > [52300.044518] perf: interrupt took too long (4244 > 4020), lowering kernel.perf_event_max_sample_rate to 47000
> > > > [52751.373083] perf: interrupt took too long (5373 > 5305), lowering kernel.perf_event_max_sample_rate to 37000
> > > > [53354.000363] perf: interrupt took too long (6793 > 6716), lowering kernel.perf_event_max_sample_rate to 29250
> > > > [53850.215606] perf: interrupt took too long (8672 > 8491), lowering kernel.perf_event_max_sample_rate to 23000
> > > > [57542.266099] perf: interrupt took too long (10940 > 10840), lowering kernel.perf_event_max_sample_rate to 18250
> > > > [57559.645404] perf: interrupt took too long (13714 > 13675), lowering kernel.perf_event_max_sample_rate to 14500
> > > > [61608.697728] Can't find PMC that caused IRQ
> > > > [71774.463111] kmemleak: 12 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> > > > [92372.044785] process '@/usr/bin/gnatmake-5' started with executable stack
> > > > [92849.380672] FS-Cache: Loaded
> > > > [92849.417269] FS-Cache: Netfs 'nfs' registered for caching
> > > > [92849.595974] NFS: Registering the id_resolver key type
> > > > [92849.596000] Key type id_resolver registered
> > > > [92849.596000] Key type id_legacy registered
> > > > [101808.079143] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> > > > [106904.323471] Can't find PMC that caused IRQ
> > > > [129416.391456] kmemleak: 1 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> > > > [158171.604221] kmemleak: 34 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
> > > > $ sudo cat /sys/kernel/debug/kmemleak
> > 
> > When they are no longer present, they are most likely false positives.
> 
> How can this be? Shouldn’t the false positive also be logged in
> `/sys/kernel/debug/kmemleak`?

Sorry, I wasn't clear. It can be a transient false positive. At a
subsequent scan, kmemleak found pointer referring the previously
reported objects and no longer shows them.

> > Was this triggered during boot? Or under some workload?
> 
> From the timestamps it looks like under some load.

Was it during boot? I put a delay of 60s to avoid this but, depending on
the platform, it can still trigger.

-- 
Catalin

^ permalink raw reply

* Re: [V2 PATCH 1/3] Refactoring powerpc code for carrying over IMA measurement logs, to move non architecture specific code to security/ima.
From: Prakhar Srivastava @ 2020-07-13 20:30 UTC (permalink / raw)
  To: Thiago Jung Bauermann
  Cc: kstewart, mark.rutland, gregkh, bhsharma, tao.li, zohar, paulus,
	vincenzo.frascino, will, nramas, frowand.list, masahiroy, jmorris,
	takahiro.akashi, linux-arm-kernel, catalin.marinas, serge,
	devicetree, pasha.tatashin, robh+dt, hsinyi, tusharsu, tglx,
	allison, christophe.leroy, mbrugger, balajib, dmitry.kasatkin,
	linux-kernel, linux-security-module, james.morse, linux-integrity,
	linuxppc-dev
In-Reply-To: <87o8per3m0.fsf@morokweng.localdomain>



On 6/19/20 5:19 PM, Thiago Jung Bauermann wrote:
> 
> Prakhar Srivastava <prsriva@linux.microsoft.com> writes:
> 
>> Powerpc has support to carry over the IMA measurement logs. Refatoring the
>> non-architecture specific code out of arch/powerpc and into security/ima.
>>
>> The code adds support for reserving and freeing up of memory for IMA measurement
>> logs.
> 
> Last week, Mimi provided this feedback:
> 
> "From your patch description, this patch should be broken up.  Moving
> the non-architecture specific code out of powerpc should be one patch.
>   Additional support should be in another patch.  After each patch, the
> code should work properly."
> 
> That's not what you do here. You move the code, but you also make other
> changes at the same time. This has two problems:
> 
> 1. It makes the patch harder to review, because it's very easy to miss a
>     change.
> 
> 2. If in the future a git bisect later points to this patch, it's not
>     clear whether the problem is because of the code movement, or because
>     of the other changes.
> 
> When you move code, ideally the patch should only make the changes
> necessary to make the code work at its new location. The patch which
> does code movement should not cause any change in behavior.
> 
> Other changes should go in separate patches, either before or after the
> one moving the code.
> 
> More comments below.
> 
Hi Thiago,

Apologies for the delayed response i was away for a few days.
I am working on breaking up the changes so that its easier to review and 
update as well.

Thanks,
Prakhar Srivastava

>>
>> ---
>>   arch/powerpc/include/asm/ima.h     |  10 ---
>>   arch/powerpc/kexec/ima.c           | 126 ++---------------------------
>>   security/integrity/ima/ima_kexec.c | 116 ++++++++++++++++++++++++++
>>   3 files changed, 124 insertions(+), 128 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/ima.h b/arch/powerpc/include/asm/ima.h
>> index ead488cf3981..c29ec86498f8 100644
>> --- a/arch/powerpc/include/asm/ima.h
>> +++ b/arch/powerpc/include/asm/ima.h
>> @@ -4,15 +4,6 @@
>>
>>   struct kimage;
>>
>> -int ima_get_kexec_buffer(void **addr, size_t *size);
>> -int ima_free_kexec_buffer(void);
>> -
>> -#ifdef CONFIG_IMA
>> -void remove_ima_buffer(void *fdt, int chosen_node);
>> -#else
>> -static inline void remove_ima_buffer(void *fdt, int chosen_node) {}
>> -#endif
>> -
>>   #ifdef CONFIG_IMA_KEXEC
>>   int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
>>   			      size_t size);
>> @@ -22,7 +13,6 @@ int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node);
>>   static inline int setup_ima_buffer(const struct kimage *image, void *fdt,
>>   				   int chosen_node)
>>   {
>> -	remove_ima_buffer(fdt, chosen_node);
>>   	return 0;
>>   }
> 
> This is wrong. Even if the currently running kernel doesn't have
> CONFIG_IMA_KEXEC, it should remove the IMA buffer property and memory
> reservation from the FDT that is being prepared for the next kernel.
> 
> This is because the IMA kexec buffer is useless for the next kernel,
> regardless of whether the current kernel supports CONFIG_IMA_KEXEC or
> not. Keeping it around would be a waste of memory.
> 
I will keep it in my next revision.
My understanding was the reserved memory is freed and property removed 
when IMA loads the logs on init. During setup_fdt in kexec, a duplicate 
copy of the dt is used, but memory still needs to be allocated, thus the 
property itself indicats presence of reserved memory.


>> @@ -179,13 +64,18 @@ int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node)
>>   	int ret, addr_cells, size_cells, entry_size;
>>   	u8 value[16];
>>
>> -	remove_ima_buffer(fdt, chosen_node);
> 
> This is wrong, for the same reason stated above.
> 
>>   	if (!image->arch.ima_buffer_size)
>>   		return 0;
>>
>> -	ret = get_addr_size_cells(&addr_cells, &size_cells);
>> -	if (ret)
>> +	ret = fdt_address_cells(fdt, chosen_node);
>> +	if (ret < 0)
>> +		return ret;
>> +	addr_cells = ret;
>> +
>> +	ret = fdt_size_cells(fdt, chosen_node);
>> +	if (ret < 0)
>>   		return ret;
>> +	size_cells = ret;
>>
>>   	entry_size = 4 * (addr_cells + size_cells);
>>
> 
> I liked this change. Thanks! I agree it's better to use
> fdt_address_cells() and fdt_size_cells() here.
> 
> But it should be in a separate patch. Either before or after the one
> moving the code.
> 
>> diff --git a/security/integrity/ima/ima_kexec.c b/security/integrity/ima/ima_kexec.c
>> index 121de3e04af2..e1e6d6154015 100644
>> --- a/security/integrity/ima/ima_kexec.c
>> +++ b/security/integrity/ima/ima_kexec.c
>> @@ -10,8 +10,124 @@
>>   #include <linux/seq_file.h>
>>   #include <linux/vmalloc.h>
>>   #include <linux/kexec.h>
>> +#include <linux/of.h>
>> +#include <linux/memblock.h>
>> +#include <linux/libfdt.h>
>>   #include "ima.h"
>>
>> +static int get_addr_size_cells(int *addr_cells, int *size_cells)
>> +{
>> +	struct device_node *root;
>> +
>> +	root = of_find_node_by_path("/");
>> +	if (!root)
>> +		return -EINVAL;
>> +
>> +	*addr_cells = of_n_addr_cells(root);
>> +	*size_cells = of_n_size_cells(root);
>> +
>> +	of_node_put(root);
>> +
>> +	return 0;
>> +}
>> +
>> +static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
>> +			       size_t *size)
>> +{
>> +	int ret, addr_cells, size_cells;
>> +
>> +	ret = get_addr_size_cells(&addr_cells, &size_cells);
>> +	if (ret)
>> +		return ret;
>> +
>> +	if (len < 4 * (addr_cells + size_cells))
>> +		return -ENOENT;
>> +
>> +	*addr = of_read_number(prop, addr_cells);
>> +	*size = of_read_number(prop + 4 * addr_cells, size_cells);
>> +
>> +	return 0;
>> +}
>> +
>> +/**
>> + * ima_get_kexec_buffer - get IMA buffer from the previous kernel
>> + * @addr:	On successful return, set to point to the buffer contents.
>> + * @size:	On successful return, set to the buffer size.
>> + *
>> + * Return: 0 on success, negative errno on error.
>> + */
>> +int ima_get_kexec_buffer(void **addr, size_t *size)
>> +{
>> +	int ret, len;
>> +	unsigned long tmp_addr;
>> +	size_t tmp_size;
>> +	const void *prop;
>> +
>> +	prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);
>> +	if (!prop)
>> +		return -ENOENT;
>> +
>> +	ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size);
>> +	if (ret)
>> +		return ret;
>> +
>> +	*addr = __va(tmp_addr);
>> +	*size = tmp_size;
>> +
>> +	return 0;
>> +}
> 
> The functions above were moved without being changed. Good.
> 
>> +/**
>> + * ima_free_kexec_buffer - free memory used by the IMA buffer
>> + */
>> +int ima_free_kexec_buffer(void)
>> +{
>> +	int ret;
>> +	unsigned long addr;
>> +	size_t size;
>> +	struct property *prop;
>> +
>> +	prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL);
>> +	if (!prop)
>> +		return -ENOENT;
>> +
>> +	ret = do_get_kexec_buffer(prop->value, prop->length, &addr, &size);
>> +	if (ret)
>> +		return ret;
>> +
>> +	ret = of_remove_property(of_chosen, prop);
>> +	if (ret)
>> +		return ret;
>> +
>> +	return memblock_free(__pa(addr), size);
> 
> Here you added a __pa() call. Do you store a virtual address in
> linux,ima-kexec-buffer property? Doesn't it make more sense to store a
> physical address?
> 
trying to minimize the changes here as do_get_kexec_buffer return the va.
I will refactor this to remove the double translation.

> Even if making this change is the correct thing to do, it should be a
> separate patch, unless it can't be avoided. And if that is the case,
> then it should be explained in the commit message.
> 
>> +
>> +}
>> +
>> +/**
>> + * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt
>> + *
>> + * The IMA measurement buffer is of no use to a subsequent kernel, so we always
>> + * remove it from the device tree.
>> + */
>> +void remove_ima_buffer(void *fdt, int chosen_node)
>> +{
>> +	int ret, len;
>> +	unsigned long addr;
>> +	size_t size;
>> +	const void *prop;
>> +
>> +	prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len);
>> +	if (!prop)
>> +		return;
>> +
>> +	do_get_kexec_buffer(prop, len, &addr, &size);
>> +	ret = fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer");
>> +	if (ret < 0)
>> +		return;
>> +
>> +	memblock_free(addr, size);
>> +}
> 
> Here is another function that changed when moved. This one I know to be
> wrong. You're confusing the purposes of remove_ima_buffer() and
> ima_free_kexec_buffer().
> 
> You did send me a question about them nearly three weeks ago which I
> only answered today, so I apologize. Also, their names could more
> clearly reflect their differences, so it's bad naming on my part.
> 
> With IMA kexec buffers, there are two kernels (and thus their two
> respective, separate device trees) to be concerned about:
> 
> 1. the currently running kernel, which uses the live device tree
> (accessed with the of_* functions) and the memblock subsystem;
> 
> 2. the kernel which is being loaded by kexec, which will use the FDT
> blob being passed around as argument to these functions, and the memory
> reservations in the memory reservation table of the FDT blob.
> 
> ima_free_kexec_buffer() is used by IMA in the currently running kernel.
> Therefore the device tree it is concerned about is the live one, and
> thus uses the of_* functions to access it. And uses memblock to change
> the memory reservation.
> 
> remove_ima_buffer() on the other hand is used by the kexec code to
> prepare an FDT blob for the kernel that is being loaded. It should not
> make any changes to live device tree, nor to memblock allocations. It
> should only make changes to the FDT blob.
> 
Thank you for this, greatly appreciate clearing my misunderstandings.
> --
> Thiago Jung Bauermann
> IBM Linux Technology Center
> 

^ permalink raw reply

* Re: [V2 PATCH 2/3] dt-bindings: chosen: Document ima-kexec-buffer
From: Prakhar Srivastava @ 2020-07-13 20:32 UTC (permalink / raw)
  To: Thiago Jung Bauermann
  Cc: kstewart, mark.rutland, gregkh, bhsharma, tao.li, zohar, paulus,
	vincenzo.frascino, will, nramas, frowand.list, masahiroy, jmorris,
	takahiro.akashi, linux-arm-kernel, catalin.marinas, serge,
	devicetree, pasha.tatashin, robh+dt, hsinyi, tusharsu, tglx,
	allison, christophe.leroy, mbrugger, balajib, dmitry.kasatkin,
	linux-kernel, linux-security-module, james.morse, linux-integrity,
	linuxppc-dev
In-Reply-To: <87mu4yr2k2.fsf@morokweng.localdomain>



On 6/19/20 5:41 PM, Thiago Jung Bauermann wrote:
> 
> Prakhar Srivastava <prsriva@linux.microsoft.com> writes:
> 
>> Integrity measurement architecture(IMA) validates if files
>> have been accidentally or maliciously altered, both remotely and
>> locally, appraise a file's measurement against a "good" value stored
>> as an extended attribute, and enforce local file integrity.
>>
>> IMA also measures singatures of kernel and initrd during kexec along with
>> the command line used for kexec.
>> These measurements are critical to verify the seccurity posture of the OS.
>>
>> Resering memory and adding the memory information to a device tree node
>> acts as the mechanism to carry over IMA measurement logs.
>>
>> Update devicetree documentation to reflect the addition of new property
>> under the chosen node.
> 
> Thank you for writing this documentation patch. It's something I should
> have done when I added the powerpc IMA kexec support.
> 
> You addressed Rob Herring's comments regarding the commit message, but
> not the ones regarding the patch contents.
> 
> When posting a new version of the patches, make sure to address all
> comments made so far. Addressing a comment doesn't necessarily mean
> implementing the requested change. If you don't then you should at least
> explain why you chose a different path.
> 
> I mention it because this has occurred before with this patch series,
> and it's hard to make forward progress if review comments get ignored.
> 
>> ---
>>   Documentation/devicetree/bindings/chosen.txt | 17 +++++++++++++++++
>>   1 file changed, 17 insertions(+)
>>
>> diff --git a/Documentation/devicetree/bindings/chosen.txt b/Documentation/devicetree/bindings/chosen.txt
>> index 45e79172a646..a15f70c007ef 100644
>> --- a/Documentation/devicetree/bindings/chosen.txt
>> +++ b/Documentation/devicetree/bindings/chosen.txt
>> @@ -135,3 +135,20 @@ e.g.
>>   		linux,initrd-end = <0x82800000>;
>>   	};
>>   };
>> +
>> +linux,ima-kexec-buffer
>> +----------------------
>> +
>> +This property(currently used by powerpc, arm64) holds the memory range,
> 
> space before the parenthesis.
> 
>> +the address and the size, of the IMA measurement logs that are being carried
> 
> Maybe it's because English isn't my first language, but IMHO it's
> clearer if "the address and the size" is between parentheses rather than
> commas.
> 
>> +over to the kexec session.
> 
> I don't think there's a "kexec session", but I'm not sure what a good
> term would be. "linux,booted-from-kexec" uses "new kernel" so perhaps
> that's a good option to use instead of "kexec session".
> 
>> +
>> +/ {
>> +	chosen {
>> +		linux,ima-kexec-buffer = <0x9 0x82000000 0x0 0x00008000>;
>> +	};
>> +};
>> +
>> +This porperty does not represent real hardware, but the memory allocated for
>> +carrying the IMA measurement logs. The address and the suze are expressed in
>> +#address-cells and #size-cells, respectively of the root node.
> 
> 
I will update the descriptions and ack the comments/changes in the 
patches as well.

Thankyou,
Prakhar Srivastava
> --
> Thiago Jung Bauermann
> IBM Linux Technology Center
> 

^ permalink raw reply

* [PATCH] ocxl: Replace HTTP links with HTTPS ones
From: Alexander A. Klimov @ 2020-07-13 17:55 UTC (permalink / raw)
  To: fbarrat, ajd, arnd, gregkh, linuxppc-dev, linux-kernel
  Cc: Alexander A. Klimov

Rationale:
Reduces attack surface on kernel devs opening the links for MITM
as HTTPS traffic is much harder to manipulate.

Deterministic algorithm:
For each file:
  If not .svg:
    For each line:
      If doesn't contain `\bxmlns\b`:
        For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`:
	  If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`:
            If both the HTTP and HTTPS versions
            return 200 OK and serve the same content:
              Replace HTTP with HTTPS.

Signed-off-by: Alexander A. Klimov <grandmaster@al2klimov.de>
---
 Continuing my work started at 93431e0607e5.
 See also: git log --oneline '--author=Alexander A. Klimov <grandmaster@al2klimov.de>' v5.7..master
 (Actually letting a shell for loop submit all this stuff for me.)

 If there are any URLs to be removed completely or at least not just HTTPSified:
 Just clearly say so and I'll *undo my change*.
 See also: https://lkml.org/lkml/2020/6/27/64

 If there are any valid, but yet not changed URLs:
 See: https://lkml.org/lkml/2020/6/26/837

 If you apply the patch, please let me know.

 Sorry again to all maintainers who complained about subject lines.
 Now I realized that you want an actually perfect prefixes,
 not just subsystem ones.
 I tried my best...
 And yes, *I could* (at least half-)automate it.
 Impossible is nothing! :)


 drivers/misc/ocxl/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/misc/ocxl/Kconfig b/drivers/misc/ocxl/Kconfig
index 2d2266c1439e..6551007a066c 100644
--- a/drivers/misc/ocxl/Kconfig
+++ b/drivers/misc/ocxl/Kconfig
@@ -23,7 +23,7 @@ config OCXL
 	  The ocxl driver enables userspace programs to access these
 	  accelerators through devices in /dev/ocxl/.
 
-	  For more information, see http://opencapi.org.
+	  For more information, see https://opencapi.org.
 
 	  This is not to be confused with the support for IBM CAPI
 	  accelerators (CONFIG_CXL), which are PCI-based instead of a
-- 
2.27.0


^ permalink raw reply related

* [PATCH 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper
From: Jarkko Sakkinen @ 2020-07-13 18:19 UTC (permalink / raw)
  To: linux-kernel
  Cc: Catalin Marinas, Kefeng Wang, Paul Mackerras, Zong Li, Andi Kleen,
	Paul Burton, Vincent Whitchurch, Petr Mladek, Brian Gerst,
	Andy Lutomirski, Yonghong Song, Thomas Gleixner, Jiri Kosina,
	Anup Patel, Philipp Rudo, Torsten Duwe, Masami Hiramatsu,
	Andrew Morton, Mark Rutland, James E.J. Bottomley, Vincent Chen,
	open list:S390, Joe Lawrence, Helge Deller, John Fastabend,
	Anil S Keshavamurthy, Andrey Ryabinin, Iurii Zaikin,
	Andrii Nakryiko, Vasily Gorbik, moderated list:ARM PORT,
	Daniel Axtens, Damien Le Moal, Peter Oberparleiter,
	Sean Christopherson, Martin KaFai Lau, Song Liu, Paul Walmsley,
	Heiko Carstens, Alexei Starovoitov, Jarkko Sakkinen, Atish Patra,
	Will Deacon, Daniel Borkmann, Masahiro Yamada, Nayna Jain,
	Ley Foon Tan, Christian Borntraeger, Dmitry Vyukov, Sami Tolvanen,
	Naveen N. Rao, Mao Han, Marco Elver, Steven Rostedt, Babu Moger,
	Borislav Petkov, Greentime Hu, Ben Dooks, Guan Xuetao,
	Thomas Bogendoerfer, open list:PARISC ARCHITECTURE, Jessica Yu,
	open list:BPF JIT for MIPS 32-BIT AND 64-BIT, David S. Miller,
	Thiago Jung Bauermann, Peter Zijlstra,
	open list:SPARC + UltraSPARC sparc/sparc64, Sandipan Das,
	H. Peter Anvin, Amit Daniel Kachhap, Tiezhu Yang, Miroslav Benes,
	Jiri Olsa, Ard Biesheuvel, Vincenzo Frascino, Anders Roxell,
	Sven Schnelle, maintainer:X86 ARCHITECTURE 32-BIT AND 64-BIT,
	Russell King, open list:RISC-V ARCHITECTURE, Mike Rapoport,
	Ingo Molnar, Albert Ou, Paul E. McKenney, Josh Poimboeuf,
	KP Singh, Gerald Schaefer, Nick Hu,
	open list:BPF JIT for MIPS 32-BIT AND 64-BIT, open list:MIPS,
	Sergey Senozhatsky, Palmer Dabbelt,
	open list:LINUX FOR POWERPC 32-BIT AND 64-BIT

Rename module_alloc() to text_alloc() and module_memfree() to
text_memfree(), and move them to kernel/text.c, which is unconditionally
compiled to the kernel proper. This allows kprobes, ftrace and bpf to
allocate space for executable code without requiring to compile the modules
support (CONFIG_MODULES=y) in.

Cc: Andi Kleen <ak@linux.intel.com>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
---
 arch/arm/kernel/Makefile         |  3 +-
 arch/arm/kernel/module.c         | 21 -----------
 arch/arm/kernel/text.c           | 33 ++++++++++++++++++
 arch/arm64/kernel/Makefile       |  2 +-
 arch/arm64/kernel/module.c       | 42 ----------------------
 arch/arm64/kernel/text.c         | 54 ++++++++++++++++++++++++++++
 arch/mips/kernel/Makefile        |  2 +-
 arch/mips/kernel/module.c        |  9 -----
 arch/mips/kernel/text.c          | 19 ++++++++++
 arch/mips/net/bpf_jit.c          |  4 +--
 arch/nds32/kernel/Makefile       |  2 +-
 arch/nds32/kernel/module.c       |  7 ----
 arch/nds32/kernel/text.c         | 12 +++++++
 arch/nios2/kernel/Makefile       |  1 +
 arch/nios2/kernel/module.c       | 19 ----------
 arch/nios2/kernel/text.c         | 34 ++++++++++++++++++
 arch/parisc/kernel/Makefile      |  2 +-
 arch/parisc/kernel/module.c      | 11 ------
 arch/parisc/kernel/text.c        | 22 ++++++++++++
 arch/powerpc/net/bpf_jit_comp.c  |  4 +--
 arch/riscv/kernel/Makefile       |  1 +
 arch/riscv/kernel/module.c       | 12 -------
 arch/riscv/kernel/text.c         | 20 +++++++++++
 arch/s390/kernel/Makefile        |  2 +-
 arch/s390/kernel/ftrace.c        |  2 +-
 arch/s390/kernel/module.c        | 16 ---------
 arch/s390/kernel/text.c          | 23 ++++++++++++
 arch/sparc/kernel/Makefile       |  1 +
 arch/sparc/kernel/module.c       | 30 ----------------
 arch/sparc/kernel/text.c         | 39 +++++++++++++++++++++
 arch/sparc/net/bpf_jit_comp_32.c |  6 ++--
 arch/unicore32/kernel/Makefile   |  1 +
 arch/unicore32/kernel/module.c   |  7 ----
 arch/unicore32/kernel/text.c     | 18 ++++++++++
 arch/x86/kernel/Makefile         |  1 +
 arch/x86/kernel/ftrace.c         |  4 +--
 arch/x86/kernel/kprobes/core.c   |  4 +--
 arch/x86/kernel/module.c         | 49 --------------------------
 arch/x86/kernel/text.c           | 60 ++++++++++++++++++++++++++++++++
 include/linux/moduleloader.h     |  4 +--
 kernel/Makefile                  |  2 +-
 kernel/bpf/core.c                |  4 +--
 kernel/kprobes.c                 |  4 +--
 kernel/module.c                  | 37 ++++++--------------
 kernel/text.c                    | 25 +++++++++++++
 45 files changed, 400 insertions(+), 275 deletions(-)
 create mode 100644 arch/arm/kernel/text.c
 create mode 100644 arch/arm64/kernel/text.c
 create mode 100644 arch/mips/kernel/text.c
 create mode 100644 arch/nds32/kernel/text.c
 create mode 100644 arch/nios2/kernel/text.c
 create mode 100644 arch/parisc/kernel/text.c
 create mode 100644 arch/riscv/kernel/text.c
 create mode 100644 arch/s390/kernel/text.c
 create mode 100644 arch/sparc/kernel/text.c
 create mode 100644 arch/unicore32/kernel/text.c
 create mode 100644 arch/x86/kernel/text.c
 create mode 100644 kernel/text.c

diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
index 89e5d864e923..69bfacfd60ef 100644
--- a/arch/arm/kernel/Makefile
+++ b/arch/arm/kernel/Makefile
@@ -19,7 +19,8 @@ CFLAGS_REMOVE_return_address.o = -pg
 obj-y		:= elf.o entry-common.o irq.o opcodes.o \
 		   process.o ptrace.o reboot.o \
 		   setup.o signal.o sigreturn_codes.o \
-		   stacktrace.o sys_arm.o time.o traps.o
+		   stacktrace.o sys_arm.o time.o traps.o \
+		   text.o
 
 ifneq ($(CONFIG_ARM_UNWIND),y)
 obj-$(CONFIG_FRAME_POINTER)	+= return_address.o
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index e15444b25ca0..13e3442a6b9f 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -33,27 +33,6 @@
 #define MODULES_VADDR	(((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK)
 #endif
 
-#ifdef CONFIG_MMU
-void *module_alloc(unsigned long size)
-{
-	gfp_t gfp_mask = GFP_KERNEL;
-	void *p;
-
-	/* Silence the initial allocation */
-	if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
-		gfp_mask |= __GFP_NOWARN;
-
-	p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-	if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
-		return p;
-	return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
-				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-}
-#endif
-
 bool module_init_section(const char *name)
 {
 	return strstarts(name, ".init") ||
diff --git a/arch/arm/kernel/text.c b/arch/arm/kernel/text.c
new file mode 100644
index 000000000000..600143fb909d
--- /dev/null
+++ b/arch/arm/kernel/text.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  linux/arch/arm/kernel/module.c
+ *
+ *  Copyright (C) 2002 Russell King.
+ *  Modified for nommu by Hyok S. Choi
+ *
+ * Module allocation method suggested by Andi Kleen.
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+#ifdef CONFIG_MMU
+void *text_alloc(unsigned long size)
+{
+	gfp_t gfp_mask = GFP_KERNEL;
+	void *p;
+
+	/* Silence the initial allocation */
+	if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
+		gfp_mask |= __GFP_NOWARN;
+
+	p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+				gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+	if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
+		return p;
+	return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
+				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
+#endif
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index a561cbb91d4d..7765a45d71b5 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -19,7 +19,7 @@ obj-y			:= debug-monitors.o entry.o irq.o fpsimd.o		\
 			   return_address.o cpuinfo.o cpu_errata.o		\
 			   cpufeature.o alternative.o cacheinfo.o		\
 			   smp.o smp_spin_table.o topology.o smccc-call.o	\
-			   syscall.o
+			   syscall.o text.o
 
 targets			+= efi-entry.o
 
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
index 1cd1a4d0ed30..adde022f703c 100644
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -20,48 +20,6 @@
 #include <asm/insn.h>
 #include <asm/sections.h>
 
-void *module_alloc(unsigned long size)
-{
-	u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
-	gfp_t gfp_mask = GFP_KERNEL;
-	void *p;
-
-	/* Silence the initial allocation */
-	if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
-		gfp_mask |= __GFP_NOWARN;
-
-	if (IS_ENABLED(CONFIG_KASAN))
-		/* don't exceed the static module region - see below */
-		module_alloc_end = MODULES_END;
-
-	p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
-				module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
-				NUMA_NO_NODE, __builtin_return_address(0));
-
-	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
-	    !IS_ENABLED(CONFIG_KASAN))
-		/*
-		 * KASAN can only deal with module allocations being served
-		 * from the reserved module region, since the remainder of
-		 * the vmalloc region is already backed by zero shadow pages,
-		 * and punching holes into it is non-trivial. Since the module
-		 * region is not randomized when KASAN is enabled, it is even
-		 * less likely that the module region gets exhausted, so we
-		 * can simply omit this fallback in that case.
-		 */
-		p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
-				module_alloc_base + SZ_2G, GFP_KERNEL,
-				PAGE_KERNEL, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-
-	if (p && (kasan_module_alloc(p, size) < 0)) {
-		vfree(p);
-		return NULL;
-	}
-
-	return p;
-}
-
 enum aarch64_reloc_op {
 	RELOC_OP_NONE,
 	RELOC_OP_ABS,
diff --git a/arch/arm64/kernel/text.c b/arch/arm64/kernel/text.c
new file mode 100644
index 000000000000..64fc7e2d85df
--- /dev/null
+++ b/arch/arm64/kernel/text.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * AArch64 loadable module support.
+ *
+ * Copyright (C) 2012 ARM Limited
+ *
+ * Author: Will Deacon <will.deacon@arm.com>
+ */
+#include <linux/kasan.h>
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *text_alloc(unsigned long size)
+{
+	u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
+	gfp_t gfp_mask = GFP_KERNEL;
+	void *p;
+
+	/* Silence the initial allocation */
+	if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
+		gfp_mask |= __GFP_NOWARN;
+
+	if (IS_ENABLED(CONFIG_KASAN))
+		/* don't exceed the static module region - see below */
+		module_alloc_end = MODULES_END;
+
+	p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
+				module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
+				NUMA_NO_NODE, __builtin_return_address(0));
+
+	if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
+	    !IS_ENABLED(CONFIG_KASAN))
+		/*
+		 * KASAN can only deal with module allocations being served
+		 * from the reserved module region, since the remainder of
+		 * the vmalloc region is already backed by zero shadow pages,
+		 * and punching holes into it is non-trivial. Since the module
+		 * region is not randomized when KASAN is enabled, it is even
+		 * less likely that the module region gets exhausted, so we
+		 * can simply omit this fallback in that case.
+		 */
+		p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
+				module_alloc_base + SZ_2G, GFP_KERNEL,
+				PAGE_KERNEL, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+
+	if (p && (kasan_module_alloc(p, size) < 0)) {
+		vfree(p);
+		return NULL;
+	}
+
+	return p;
+}
diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile
index 8c7a043295ed..37ebf9a7f259 100644
--- a/arch/mips/kernel/Makefile
+++ b/arch/mips/kernel/Makefile
@@ -8,7 +8,7 @@ extra-y		:= head.o vmlinux.lds
 obj-y		+= cmpxchg.o cpu-probe.o branch.o elf.o entry.o genex.o idle.o irq.o \
 		   process.o prom.o ptrace.o reset.o setup.o signal.o \
 		   syscall.o time.o topology.o traps.o unaligned.o watch.o \
-		   vdso.o cacheinfo.o
+		   vdso.o cacheinfo.o text.o
 
 ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_ftrace.o = -pg
diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
index 3c0c3d1260c1..f5ac4ebc4bad 100644
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -31,15 +31,6 @@ struct mips_hi16 {
 static LIST_HEAD(dbe_list);
 static DEFINE_SPINLOCK(dbe_lock);
 
-#ifdef MODULE_START
-void *module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
-				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-}
-#endif
-
 static int apply_r_mips_none(struct module *me, u32 *location,
 			     u32 base, Elf_Addr v, bool rela)
 {
diff --git a/arch/mips/kernel/text.c b/arch/mips/kernel/text.c
new file mode 100644
index 000000000000..55ca87a421c3
--- /dev/null
+++ b/arch/mips/kernel/text.c
@@ -0,0 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *
+ *  Copyright (C) 2001 Rusty Russell.
+ *  Copyright (C) 2003, 2004 Ralf Baechle (ralf@linux-mips.org)
+ *  Copyright (C) 2005 Thiemo Seufer
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+#ifdef MODULE_START
+void *text_alloc(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
+				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
+#endif
diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c
index 0af88622c619..2b03f7128809 100644
--- a/arch/mips/net/bpf_jit.c
+++ b/arch/mips/net/bpf_jit.c
@@ -1233,7 +1233,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
 	build_epilogue(&ctx);
 
 	alloc_size = 4 * ctx.idx;
-	ctx.target = module_alloc(alloc_size);
+	ctx.target = text_alloc(alloc_size);
 	if (ctx.target == NULL)
 		goto out;
 
@@ -1264,7 +1264,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
 void bpf_jit_free(struct bpf_prog *fp)
 {
 	if (fp->jited)
-		module_memfree(fp->bpf_func);
+		text_memfree(fp->bpf_func);
 
 	bpf_prog_unlock_free(fp);
 }
diff --git a/arch/nds32/kernel/Makefile b/arch/nds32/kernel/Makefile
index 394df3f6442c..fc15778c59d1 100644
--- a/arch/nds32/kernel/Makefile
+++ b/arch/nds32/kernel/Makefile
@@ -10,7 +10,7 @@ AFLAGS_head.o		:= -DTEXTADDR=$(TEXTADDR)
 obj-y			:= ex-entry.o ex-exit.o ex-scall.o irq.o \
 			process.o ptrace.o setup.o signal.o \
 			sys_nds32.o time.o traps.o cacheinfo.o \
-			dma.o syscall_table.o vdso.o
+			dma.o syscall_table.o vdso.o text.o
 
 obj-$(CONFIG_MODULES)		+= nds32_ksyms.o module.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
diff --git a/arch/nds32/kernel/module.c b/arch/nds32/kernel/module.c
index 3897fd14a21d..3d23a12ed535 100644
--- a/arch/nds32/kernel/module.c
+++ b/arch/nds32/kernel/module.c
@@ -7,13 +7,6 @@
 #include <linux/moduleloader.h>
 #include <linux/pgtable.h>
 
-void *module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				    GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
-				    __builtin_return_address(0));
-}
-
 void module_free(struct module *module, void *region)
 {
 	vfree(region);
diff --git a/arch/nds32/kernel/text.c b/arch/nds32/kernel/text.c
new file mode 100644
index 000000000000..6e86eff9aaf0
--- /dev/null
+++ b/arch/nds32/kernel/text.c
@@ -0,0 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (C) 2005-2017 Andes Technology Corporation
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *text_alloc(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+				    GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+}
diff --git a/arch/nios2/kernel/Makefile b/arch/nios2/kernel/Makefile
index 0b645e1e3158..5476fc749f37 100644
--- a/arch/nios2/kernel/Makefile
+++ b/arch/nios2/kernel/Makefile
@@ -18,6 +18,7 @@ obj-y	+= setup.o
 obj-y	+= signal.o
 obj-y	+= sys_nios2.o
 obj-y	+= syscall_table.o
+obj-y	+= text.o
 obj-y	+= time.o
 obj-y	+= traps.o
 
diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c
index 76e0a42d6e36..20a0faf64e38 100644
--- a/arch/nios2/kernel/module.c
+++ b/arch/nios2/kernel/module.c
@@ -21,25 +21,6 @@
 
 #include <asm/cacheflush.h>
 
-/*
- * Modules should NOT be allocated with kmalloc for (obvious) reasons.
- * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach
- * from 0x80000000 (vmalloc area) to 0xc00000000 (kernel) (kmalloc returns
- * addresses in 0xc0000000)
- */
-void *module_alloc(unsigned long size)
-{
-	if (size == 0)
-		return NULL;
-	return kmalloc(size, GFP_KERNEL);
-}
-
-/* Free memory returned from module_alloc */
-void module_memfree(void *module_region)
-{
-	kfree(module_region);
-}
-
 int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
 			unsigned int symindex, unsigned int relsec,
 			struct module *mod)
diff --git a/arch/nios2/kernel/text.c b/arch/nios2/kernel/text.c
new file mode 100644
index 000000000000..af424174442f
--- /dev/null
+++ b/arch/nios2/kernel/text.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Kernel module support for Nios II.
+ *
+ * Copyright (C) 2004 Microtronix Datacom Ltd.
+ *   Written by Wentao Xu <xuwentao@microtronix.com>
+ * Copyright (C) 2001, 2003 Rusty Russell
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file COPYING in the main directory of this
+ * archive for more details.
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+/*
+ * Modules should NOT be allocated with kmalloc for (obvious) reasons.
+ * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach
+ * from 0x80000000 (vmalloc area) to 0xc00000000 (kernel) (kmalloc returns
+ * addresses in 0xc0000000)
+ */
+void *text_alloc(unsigned long size)
+{
+	if (size == 0)
+		return NULL;
+	return kmalloc(size, GFP_KERNEL);
+}
+
+/* Free memory returned from module_alloc */
+void text_memfree(void *module_region)
+{
+	kfree(module_region);
+}
diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile
index 068d90950d93..f71f7ffdae2e 100644
--- a/arch/parisc/kernel/Makefile
+++ b/arch/parisc/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y	     	:= cache.o pacache.o setup.o pdt.o traps.o time.o irq.o \
 		   ptrace.o hardware.o inventory.o drivers.o alternative.o \
 		   signal.o hpmc.o real2.o parisc_ksyms.o unaligned.o \
 		   process.o processor.o pdc_cons.o pdc_chassis.o unwind.o \
-		   patch.o
+		   patch.o text.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not profile debug and lowlevel utilities
diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
index 7df140545b22..c81e63e2549b 100644
--- a/arch/parisc/kernel/module.c
+++ b/arch/parisc/kernel/module.c
@@ -192,17 +192,6 @@ static inline int reassemble_22(int as22)
 		((as22 & 0x0003ff) << 3));
 }
 
-void *module_alloc(unsigned long size)
-{
-	/* using RWX means less protection for modules, but it's
-	 * easier than trying to map the text, data, init_text and
-	 * init_data correctly */
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-				    GFP_KERNEL,
-				    PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
-				    __builtin_return_address(0));
-}
-
 #ifndef CONFIG_64BIT
 static inline unsigned long count_gots(const Elf_Rela *rela, unsigned long n)
 {
diff --git a/arch/parisc/kernel/text.c b/arch/parisc/kernel/text.c
new file mode 100644
index 000000000000..9ff503084191
--- /dev/null
+++ b/arch/parisc/kernel/text.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *    Linux/PA-RISC Project
+ *    Copyright (C) 2003 Randolph Chung <tausq at debian . org>
+ *    Copyright (C) 2008 Helge Deller <deller@gmx.de>
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *text_alloc(unsigned long size)
+{
+	/*
+	 * Using RWX means less protection for modules, but it's
+	 * easier than trying to map the text, data, init_text and
+	 * init_data correctly.
+	 */
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+				    GFP_KERNEL,
+				    PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+}
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 0acc9d5fb19e..ba1cef7a812d 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -634,7 +634,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
 
 	proglen = cgctx.idx * 4;
 	alloclen = proglen + FUNCTION_DESCR_SIZE;
-	image = module_alloc(alloclen);
+	image = text_alloc(alloclen);
 	if (!image)
 		goto out;
 
@@ -678,7 +678,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
 void bpf_jit_free(struct bpf_prog *fp)
 {
 	if (fp->jited)
-		module_memfree(fp->bpf_func);
+		text_memfree(fp->bpf_func);
 
 	bpf_prog_unlock_free(fp);
 }
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index b355cf485671..d0b30f286ce6 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -29,6 +29,7 @@ obj-y	+= riscv_ksyms.o
 obj-y	+= stacktrace.o
 obj-y	+= cacheinfo.o
 obj-y	+= patch.o
+obj-y	+= text.o
 obj-$(CONFIG_MMU) += vdso.o vdso/
 
 obj-$(CONFIG_RISCV_M_MODE)	+= clint.o traps_misaligned.o
diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
index 7191342c54da..f6aa66431c9e 100644
--- a/arch/riscv/kernel/module.c
+++ b/arch/riscv/kernel/module.c
@@ -390,15 +390,3 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 
 	return 0;
 }
-
-#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
-#define VMALLOC_MODULE_START \
-	 max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
-void *module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
-				    VMALLOC_END, GFP_KERNEL,
-				    PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				    __builtin_return_address(0));
-}
-#endif
diff --git a/arch/riscv/kernel/text.c b/arch/riscv/kernel/text.c
new file mode 100644
index 000000000000..201608a25641
--- /dev/null
+++ b/arch/riscv/kernel/text.c
@@ -0,0 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *
+ *  Copyright (C) 2017 Zihao Yu
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
+#define VMALLOC_MODULE_START \
+	 max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
+void *text_alloc(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
+				    VMALLOC_END, GFP_KERNEL,
+				    PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+}
+#endif
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index a8f136943deb..9f00c320b938 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -40,7 +40,7 @@ obj-y	+= sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o
 obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
 obj-y	+= entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
 obj-y	+= nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
-obj-y	+= smp.o
+obj-y	+= smp.o text.o
 
 extra-y				+= head64.o vmlinux.lds
 
diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
index b388e87a08bf..a752b1442846 100644
--- a/arch/s390/kernel/ftrace.c
+++ b/arch/s390/kernel/ftrace.c
@@ -134,7 +134,7 @@ static int __init ftrace_plt_init(void)
 {
 	unsigned int *ip;
 
-	ftrace_plt = (unsigned long) module_alloc(PAGE_SIZE);
+	ftrace_plt = (unsigned long) text_alloc(PAGE_SIZE);
 	if (!ftrace_plt)
 		panic("cannot allocate ftrace plt\n");
 	ip = (unsigned int *) ftrace_plt;
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index 4055f1c49814..087cb5951de6 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -32,22 +32,6 @@
 
 #define PLT_ENTRY_SIZE 20
 
-void *module_alloc(unsigned long size)
-{
-	void *p;
-
-	if (PAGE_ALIGN(size) > MODULES_LEN)
-		return NULL;
-	p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
-				 GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				 __builtin_return_address(0));
-	if (p && (kasan_module_alloc(p, size) < 0)) {
-		vfree(p);
-		return NULL;
-	}
-	return p;
-}
-
 void module_arch_freeing_init(struct module *mod)
 {
 	if (is_livepatch_module(mod) &&
diff --git a/arch/s390/kernel/text.c b/arch/s390/kernel/text.c
new file mode 100644
index 000000000000..63aaa1ab727b
--- /dev/null
+++ b/arch/s390/kernel/text.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ *  Kernel module help for s390.
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *text_alloc(unsigned long size)
+{
+	void *p;
+
+	if (PAGE_ALIGN(size) > MODULES_LEN)
+		return NULL;
+	p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
+				 GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				 __builtin_return_address(0));
+	if (p && (kasan_module_alloc(p, size) < 0)) {
+		vfree(p);
+		return NULL;
+	}
+	return p;
+}
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 97c0e19263d1..e025f9e1db4a 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -52,6 +52,7 @@ obj-y                   += prom_common.o
 obj-y                   += prom_$(BITS).o
 obj-y                   += of_device_common.o
 obj-y                   += of_device_$(BITS).o
+obj-y			+= text.o
 obj-$(CONFIG_SPARC64)   += prom_irqtrans.o
 
 obj-$(CONFIG_SPARC32)   += leon_kernel.o
diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
index df39580f398d..f2babc69f189 100644
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -21,36 +21,6 @@
 
 #include "entry.h"
 
-#ifdef CONFIG_SPARC64
-
-#include <linux/jump_label.h>
-
-static void *module_map(unsigned long size)
-{
-	if (PAGE_ALIGN(size) > MODULES_LEN)
-		return NULL;
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-}
-#else
-static void *module_map(unsigned long size)
-{
-	return vmalloc(size);
-}
-#endif /* CONFIG_SPARC64 */
-
-void *module_alloc(unsigned long size)
-{
-	void *ret;
-
-	ret = module_map(size);
-	if (ret)
-		memset(ret, 0, size);
-
-	return ret;
-}
-
 /* Make generic code ignore STT_REGISTER dummy undefined symbols.  */
 int module_frob_arch_sections(Elf_Ehdr *hdr,
 			      Elf_Shdr *sechdrs,
diff --git a/arch/sparc/kernel/text.c b/arch/sparc/kernel/text.c
new file mode 100644
index 000000000000..d16663f2c6ba
--- /dev/null
+++ b/arch/sparc/kernel/text.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Kernel module help for sparc64.
+ *
+ * Copyright (C) 2001 Rusty Russell.
+ * Copyright (C) 2002 David S. Miller.
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+#ifdef CONFIG_SPARC64
+
+#include <linux/jump_label.h>
+
+static void *module_map(unsigned long size)
+{
+	if (PAGE_ALIGN(size) > MODULES_LEN)
+		return NULL;
+	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
+#else
+static void *module_map(unsigned long size)
+{
+	return vmalloc(size);
+}
+#endif /* CONFIG_SPARC64 */
+
+void *text_alloc(unsigned long size)
+{
+	void *ret;
+
+	ret = module_map(size);
+	if (ret)
+		memset(ret, 0, size);
+
+	return ret;
+}
diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c
index c8eabb973b86..d9dd513b27b2 100644
--- a/arch/sparc/net/bpf_jit_comp_32.c
+++ b/arch/sparc/net/bpf_jit_comp_32.c
@@ -713,7 +713,7 @@ cond_branch:			f_offset = addrs[i + filter[i].jf];
 				if (unlikely(proglen + ilen > oldproglen)) {
 					pr_err("bpb_jit_compile fatal error\n");
 					kfree(addrs);
-					module_memfree(image);
+					text_memfree(image);
 					return;
 				}
 				memcpy(image + proglen, temp, ilen);
@@ -736,7 +736,7 @@ cond_branch:			f_offset = addrs[i + filter[i].jf];
 			break;
 		}
 		if (proglen == oldproglen) {
-			image = module_alloc(proglen);
+			image = text_alloc(proglen);
 			if (!image)
 				goto out;
 		}
@@ -758,7 +758,7 @@ cond_branch:			f_offset = addrs[i + filter[i].jf];
 void bpf_jit_free(struct bpf_prog *fp)
 {
 	if (fp->jited)
-		module_memfree(fp->bpf_func);
+		text_memfree(fp->bpf_func);
 
 	bpf_prog_unlock_free(fp);
 }
diff --git a/arch/unicore32/kernel/Makefile b/arch/unicore32/kernel/Makefile
index 2f79aa56735b..96eb8cfc8b1e 100644
--- a/arch/unicore32/kernel/Makefile
+++ b/arch/unicore32/kernel/Makefile
@@ -6,6 +6,7 @@
 # Object file lists.
 obj-y				:= dma.o elf.o entry.o process.o ptrace.o
 obj-y				+= setup.o signal.o sys.o stacktrace.o traps.o
+obj-y				+= text.o
 
 obj-$(CONFIG_MODULES)		+= ksyms.o module.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
diff --git a/arch/unicore32/kernel/module.c b/arch/unicore32/kernel/module.c
index 67c89ef2d6ee..e1e703c02379 100644
--- a/arch/unicore32/kernel/module.c
+++ b/arch/unicore32/kernel/module.c
@@ -18,13 +18,6 @@
 
 #include <asm/sections.h>
 
-void *module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-}
-
 int
 apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
 	       unsigned int relindex, struct module *module)
diff --git a/arch/unicore32/kernel/text.c b/arch/unicore32/kernel/text.c
new file mode 100644
index 000000000000..b94aac824bb8
--- /dev/null
+++ b/arch/unicore32/kernel/text.c
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/arch/unicore32/kernel/module.c
+ *
+ * Code specific to PKUnity SoC and UniCore ISA
+ *
+ * Copyright (C) 2001-2010 GUAN Xue-tao
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void *text_alloc(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
+				__builtin_return_address(0));
+}
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index e77261db2391..2878e4b753a0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -68,6 +68,7 @@ obj-y			+= tsc.o tsc_msr.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 obj-y			+= irqflags.o
+obj-y			+= text.o
 
 obj-y				+= process.o
 obj-y				+= fpu/
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 51504566b3a6..f76703ee96f2 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -265,11 +265,11 @@ int __init ftrace_dyn_arch_init(void)
 /* Module allocation simplifies allocating memory for code */
 static inline void *alloc_tramp(unsigned long size)
 {
-	return module_alloc(size);
+	return text_alloc(size);
 }
 static inline void tramp_free(void *tramp)
 {
-	module_memfree(tramp);
+	text_memfree(tramp);
 }
 #else
 /* Trampolines can only be created if modules are supported */
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index ada39ddbc922..e9ac7d3c658e 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -423,7 +423,7 @@ void *alloc_insn_page(void)
 {
 	void *page;
 
-	page = module_alloc(PAGE_SIZE);
+	page = text_alloc(PAGE_SIZE);
 	if (!page)
 		return NULL;
 
@@ -446,7 +446,7 @@ void *alloc_insn_page(void)
 /* Recover page to RW mode before releasing it */
 void free_insn_page(void *page)
 {
-	module_memfree(page);
+	text_memfree(page);
 }
 
 static int arch_copy_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 34b153cbd4ac..261df078f127 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -36,55 +36,6 @@ do {							\
 } while (0)
 #endif
 
-#ifdef CONFIG_RANDOMIZE_BASE
-static unsigned long module_load_offset;
-
-/* Mutex protects the module_load_offset. */
-static DEFINE_MUTEX(module_kaslr_mutex);
-
-static unsigned long int get_module_load_offset(void)
-{
-	if (kaslr_enabled()) {
-		mutex_lock(&module_kaslr_mutex);
-		/*
-		 * Calculate the module_load_offset the first time this
-		 * code is called. Once calculated it stays the same until
-		 * reboot.
-		 */
-		if (module_load_offset == 0)
-			module_load_offset =
-				(get_random_int() % 1024 + 1) * PAGE_SIZE;
-		mutex_unlock(&module_kaslr_mutex);
-	}
-	return module_load_offset;
-}
-#else
-static unsigned long int get_module_load_offset(void)
-{
-	return 0;
-}
-#endif
-
-void *module_alloc(unsigned long size)
-{
-	void *p;
-
-	if (PAGE_ALIGN(size) > MODULES_LEN)
-		return NULL;
-
-	p = __vmalloc_node_range(size, MODULE_ALIGN,
-				    MODULES_VADDR + get_module_load_offset(),
-				    MODULES_END, GFP_KERNEL,
-				    PAGE_KERNEL, 0, NUMA_NO_NODE,
-				    __builtin_return_address(0));
-	if (p && (kasan_module_alloc(p, size) < 0)) {
-		vfree(p);
-		return NULL;
-	}
-
-	return p;
-}
-
 #ifdef CONFIG_X86_32
 int apply_relocate(Elf32_Shdr *sechdrs,
 		   const char *strtab,
diff --git a/arch/x86/kernel/text.c b/arch/x86/kernel/text.c
new file mode 100644
index 000000000000..724ab2d93ac5
--- /dev/null
+++ b/arch/x86/kernel/text.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *  Kernel module help for x86.
+ *  Copyright (C) 2001 Rusty Russell.
+ */
+#include <linux/kasan.h>
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/random.h>
+#include <linux/vmalloc.h>
+#include <asm/setup.h>
+
+#ifdef CONFIG_RANDOMIZE_BASE
+static unsigned long module_load_offset;
+
+/* Mutex protects the module_load_offset. */
+static DEFINE_MUTEX(module_kaslr_mutex);
+
+static unsigned long get_module_load_offset(void)
+{
+	if (kaslr_enabled()) {
+		mutex_lock(&module_kaslr_mutex);
+		/*
+		 * Calculate the module_load_offset the first time this
+		 * code is called. Once calculated it stays the same until
+		 * reboot.
+		 */
+		if (module_load_offset == 0)
+			module_load_offset =
+				(get_random_int() % 1024 + 1) * PAGE_SIZE;
+		mutex_unlock(&module_kaslr_mutex);
+	}
+	return module_load_offset;
+}
+#else
+static unsigned long get_module_load_offset(void)
+{
+	return 0;
+}
+#endif
+
+void *text_alloc(unsigned long size)
+{
+	void *p;
+
+	if (PAGE_ALIGN(size) > MODULES_LEN)
+		return NULL;
+
+	p = __vmalloc_node_range(size, MODULE_ALIGN,
+				    MODULES_VADDR + get_module_load_offset(),
+				    MODULES_END, GFP_KERNEL,
+				    PAGE_KERNEL, 0, NUMA_NO_NODE,
+				    __builtin_return_address(0));
+	if (p && (kasan_module_alloc(p, size) < 0)) {
+		vfree(p);
+		return NULL;
+	}
+
+	return p;
+}
diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
index 4fa67a8b2265..4e8b9ba431ee 100644
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -24,10 +24,10 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section);
 
 /* Allocator used for allocating struct module, core sections and init
    sections.  Returns NULL on failure. */
-void *module_alloc(unsigned long size);
+void *text_alloc(unsigned long size);
 
 /* Free memory returned from module_alloc. */
-void module_memfree(void *module_region);
+void text_memfree(void *module_region);
 
 /* Determines if the section name is an init section (that is only used during
  * module loading).
diff --git a/kernel/Makefile b/kernel/Makefile
index f3218bc5ec69..9e88e81f68ef 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    extable.o params.o \
 	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
-	    async.o range.o smpboot.o ucount.o
+	    async.o range.o smpboot.o ucount.o text.o
 
 obj-$(CONFIG_MODULES) += kmod.o
 obj-$(CONFIG_MULTIUSER) += groups.o
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9df4cc9a2907..febd55019a8a 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -840,12 +840,12 @@ static void bpf_jit_uncharge_modmem(u32 pages)
 
 void *__weak bpf_jit_alloc_exec(unsigned long size)
 {
-	return module_alloc(size);
+	return text_alloc(size);
 }
 
 void __weak bpf_jit_free_exec(void *addr)
 {
-	module_memfree(addr);
+	text_memfree(addr);
 }
 
 struct bpf_binary_header *
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 4a904cc56d68..d1c354ec89de 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -111,12 +111,12 @@ enum kprobe_slot_state {
 
 void __weak *alloc_insn_page(void)
 {
-	return module_alloc(PAGE_SIZE);
+	return text_alloc(PAGE_SIZE);
 }
 
 void __weak free_insn_page(void *page)
 {
-	module_memfree(page);
+	text_memfree(page);
 }
 
 struct kprobe_insn_cache kprobe_insn_slots = {
diff --git a/kernel/module.c b/kernel/module.c
index bee1c25ca5c5..bdb3773f3668 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2141,16 +2141,6 @@ static void free_module_elf(struct module *mod)
 }
 #endif /* CONFIG_LIVEPATCH */
 
-void __weak module_memfree(void *module_region)
-{
-	/*
-	 * This memory may be RO, and freeing RO memory in an interrupt is not
-	 * supported by vmalloc.
-	 */
-	WARN_ON(in_interrupt());
-	vfree(module_region);
-}
-
 void __weak module_arch_cleanup(struct module *mod)
 {
 }
@@ -2200,7 +2190,7 @@ static void free_module(struct module *mod)
 
 	/* This may be empty, but that's OK */
 	module_arch_freeing_init(mod);
-	module_memfree(mod->init_layout.base);
+	text_memfree(mod->init_layout.base);
 	kfree(mod->args);
 	percpu_modfree(mod);
 
@@ -2208,7 +2198,7 @@ static void free_module(struct module *mod)
 	lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
 
 	/* Finally, free the core (containing the module structure) */
-	module_memfree(mod->core_layout.base);
+	text_memfree(mod->core_layout.base);
 }
 
 void *__symbol_get(const char *symbol)
@@ -2781,13 +2771,6 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
 		ddebug_remove_module(mod->name);
 }
 
-void * __weak module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
-			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
-			NUMA_NO_NODE, __builtin_return_address(0));
-}
-
 bool __weak module_init_section(const char *name)
 {
 	return strstarts(name, ".init");
@@ -3246,7 +3229,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	void *ptr;
 
 	/* Do the allocs. */
-	ptr = module_alloc(mod->core_layout.size);
+	ptr = text_alloc(mod->core_layout.size);
 	/*
 	 * The pointer to this block is stored in the module structure
 	 * which is inside the block. Just mark it as not being a
@@ -3260,7 +3243,7 @@ static int move_module(struct module *mod, struct load_info *info)
 	mod->core_layout.base = ptr;
 
 	if (mod->init_layout.size) {
-		ptr = module_alloc(mod->init_layout.size);
+		ptr = text_alloc(mod->init_layout.size);
 		/*
 		 * The pointer to this block is stored in the module structure
 		 * which is inside the block. This block doesn't need to be
@@ -3269,7 +3252,7 @@ static int move_module(struct module *mod, struct load_info *info)
 		 */
 		kmemleak_ignore(ptr);
 		if (!ptr) {
-			module_memfree(mod->core_layout.base);
+			text_memfree(mod->core_layout.base);
 			return -ENOMEM;
 		}
 		memset(ptr, 0, mod->init_layout.size);
@@ -3452,8 +3435,8 @@ static void module_deallocate(struct module *mod, struct load_info *info)
 {
 	percpu_modfree(mod);
 	module_arch_freeing_init(mod);
-	module_memfree(mod->init_layout.base);
-	module_memfree(mod->core_layout.base);
+	text_memfree(mod->init_layout.base);
+	text_memfree(mod->core_layout.base);
 }
 
 int __weak module_finalize(const Elf_Ehdr *hdr,
@@ -3527,7 +3510,7 @@ static void do_free_init(struct work_struct *w)
 
 	llist_for_each_safe(pos, n, list) {
 		initfree = container_of(pos, struct mod_initfree, node);
-		module_memfree(initfree->module_init);
+		text_memfree(initfree->module_init);
 		kfree(initfree);
 	}
 }
@@ -3626,10 +3609,10 @@ static noinline int do_init_module(struct module *mod)
 	 * We want to free module_init, but be aware that kallsyms may be
 	 * walking this with preempt disabled.  In all the failure paths, we
 	 * call synchronize_rcu(), but we don't want to slow down the success
-	 * path. module_memfree() cannot be called in an interrupt, so do the
+	 * path. text_memfree() cannot be called in an interrupt, so do the
 	 * work and call synchronize_rcu() in a work queue.
 	 *
-	 * Note that module_alloc() on most architectures creates W+X page
+	 * Note that text_alloc() on most architectures creates W+X page
 	 * mappings which won't be cleaned up until do_free_init() runs.  Any
 	 * code such as mark_rodata_ro() which depends on those mappings to
 	 * be cleaned up needs to sync with the queued work - ie
diff --git a/kernel/text.c b/kernel/text.c
new file mode 100644
index 000000000000..9a12c508ded5
--- /dev/null
+++ b/kernel/text.c
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *  Copyright (C) 2002 Richard Henderson
+ *  Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
+ */
+#include <linux/mm.h>
+#include <linux/moduleloader.h>
+#include <linux/vmalloc.h>
+
+void __weak text_memfree(void *module_region)
+{
+	/*
+	 * This memory may be RO, and freeing RO memory in an interrupt is not
+	 * supported by vmalloc.
+	 */
+	WARN_ON(in_interrupt());
+	vfree(module_region);
+}
+
+void * __weak text_alloc(unsigned long size)
+{
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+			NUMA_NO_NODE, __builtin_return_address(0));
+}
-- 
2.25.1


^ permalink raw reply related

* Re: [PATCH 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper
From: Russell King - ARM Linux admin @ 2020-07-13 18:34 UTC (permalink / raw)
  To: Jarkko Sakkinen
  Cc: Catalin Marinas, Kefeng Wang, Paul Mackerras, Zong Li, Andi Kleen,
	Paul Burton, Vincent Whitchurch, Petr Mladek, Brian Gerst,
	Andy Lutomirski, Yonghong Song, Thomas Gleixner, Jiri Kosina,
	Anup Patel, linux-kernel, Philipp Rudo, Torsten Duwe,
	Masami Hiramatsu, Andrew Morton, Mark Rutland,
	James E.J. Bottomley, Vincent Chen, open list:S390, Joe Lawrence,
	Helge Deller, John Fastabend, Anil S Keshavamurthy,
	Andrey Ryabinin, Iurii Zaikin, Andrii Nakryiko, Vasily Gorbik,
	moderated list:ARM PORT, Daniel Axtens, Damien Le Moal,
	Peter Oberparleiter, Sean Christopherson, Martin KaFai Lau,
	Song Liu, Paul Walmsley, Heiko Carstens, Alexei Starovoitov,
	Atish Patra, Will Deacon, Daniel Borkmann, Masahiro Yamada,
	Nayna Jain, Ley Foon Tan, Christian Borntraeger, Dmitry Vyukov,
	Sami Tolvanen, Naveen N. Rao, Mao Han, Marco Elver,
	Steven Rostedt, Babu Moger, Borislav Petkov, Greentime Hu,
	Ben Dooks, Guan Xuetao, Thomas Bogendoerfer,
	open list:PARISC ARCHITECTURE, Jessica Yu,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT), David S. Miller,
	Thiago Jung Bauermann, Peter Zijlstra,
	open list:SPARC + UltraSPARC (sparc/sparc64), Sandipan Das,
	H. Peter Anvin, Amit Daniel Kachhap, Tiezhu Yang, Miroslav Benes,
	Jiri Olsa, Ard Biesheuvel, Vincenzo Frascino, Anders Roxell,
	Sven Schnelle, maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT),
	open list:RISC-V ARCHITECTURE, Mike Rapoport, Ingo Molnar,
	Albert Ou, Paul E. McKenney, Josh Poimboeuf, KP Singh,
	Gerald Schaefer, Nick Hu,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT), open list:MIPS,
	Sergey Senozhatsky, Palmer Dabbelt,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT)
In-Reply-To: <20200713182030.1418041-1-jarkko.sakkinen@linux.intel.com>

On Mon, Jul 13, 2020 at 09:19:37PM +0300, Jarkko Sakkinen wrote:
> Rename module_alloc() to text_alloc() and module_memfree() to
> text_memfree(), and move them to kernel/text.c, which is unconditionally
> compiled to the kernel proper. This allows kprobes, ftrace and bpf to
> allocate space for executable code without requiring to compile the modules
> support (CONFIG_MODULES=y) in.

I'm not sure this is a good idea for 32-bit ARM.  The code you are
moving for 32-bit ARM is quite specific to module use in that it also
supports allocating from the vmalloc area, where the module code
knows to add PLT entries.

If the other proposed users of this text_alloc() do not have the logic
to add PLT entries when branches between kernel code and this
allocation are not reachable by, e.g. a 26-bit signed offset for 32-bit
ARM code, then this code is not suitable for that use.

-- 
RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!

^ permalink raw reply

* [PATCH] powerpc/Kconfig: Replace HTTP links with HTTPS ones
From: Alexander A. Klimov @ 2020-07-13 19:26 UTC (permalink / raw)
  To: oss, mpe, benh, paulus, linuxppc-dev, linux-kernel; +Cc: Alexander A. Klimov

Rationale:
Reduces attack surface on kernel devs opening the links for MITM
as HTTPS traffic is much harder to manipulate.

Deterministic algorithm:
For each file:
  If not .svg:
    For each line:
      If doesn't contain `\bxmlns\b`:
        For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`:
	  If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`:
            If both the HTTP and HTTPS versions
            return 200 OK and serve the same content:
              Replace HTTP with HTTPS.

Signed-off-by: Alexander A. Klimov <grandmaster@al2klimov.de>
---
 Continuing my work started at 93431e0607e5.
 See also: git log --oneline '--author=Alexander A. Klimov <grandmaster@al2klimov.de>' v5.7..master
 (Actually letting a shell for loop submit all this stuff for me.)

 If there are any URLs to be removed completely or at least not just HTTPSified:
 Just clearly say so and I'll *undo my change*.
 See also: https://lkml.org/lkml/2020/6/27/64

 If there are any valid, but yet not changed URLs:
 See: https://lkml.org/lkml/2020/6/26/837

 If you apply the patch, please let me know.

 Sorry again to all maintainers who complained about subject lines.
 Now I realized that you want an actually perfect prefixes,
 not just subsystem ones.
 I tried my best...
 And yes, *I could* (at least half-)automate it.
 Impossible is nothing! :)


 arch/powerpc/platforms/85xx/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig
index fa3d29dcb57e..b77cbb0a50e1 100644
--- a/arch/powerpc/platforms/85xx/Kconfig
+++ b/arch/powerpc/platforms/85xx/Kconfig
@@ -160,7 +160,7 @@ config XES_MPC85xx
 	  computers from Extreme Engineering Solutions (X-ES) based on
 	  Freescale MPC85xx processors.
 	  Manufacturer: Extreme Engineering Solutions, Inc.
-	  URL: <http://www.xes-inc.com/>
+	  URL: <https://www.xes-inc.com/>
 
 config STX_GP3
 	bool "Silicon Turnkey Express GP3"
-- 
2.27.0


^ permalink raw reply related

* Re: [PATCH 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper
From: Ard Biesheuvel @ 2020-07-13 19:49 UTC (permalink / raw)
  To: Jarkko Sakkinen
  Cc: Catalin Marinas, Kefeng Wang, Paul Mackerras, Zong Li, Andi Kleen,
	Paul Burton, Vincent Whitchurch, Petr Mladek, Brian Gerst,
	Andy Lutomirski, Yonghong Song, Thomas Gleixner, Jiri Kosina,
	Anup Patel, Linux Kernel Mailing List, Philipp Rudo, Torsten Duwe,
	Masami Hiramatsu, Andrew Morton, Mark Rutland,
	James E.J. Bottomley, Vincent Chen, open list:S390, Joe Lawrence,
	Helge Deller, John Fastabend, Anil S Keshavamurthy,
	Andrey Ryabinin, Iurii Zaikin, Andrii Nakryiko, Vasily Gorbik,
	moderated list:ARM PORT, Daniel Axtens, Damien Le Moal,
	Peter Oberparleiter, Sean Christopherson, Martin KaFai Lau,
	Song Liu, Paul Walmsley, Heiko Carstens, Alexei Starovoitov,
	Atish Patra, Will Deacon, Daniel Borkmann, Masahiro Yamada,
	Nayna Jain, Ley Foon Tan, Christian Borntraeger, Dmitry Vyukov,
	Sami Tolvanen, Naveen N. Rao, Mao Han, Marco Elver,
	Steven Rostedt, Babu Moger, Borislav Petkov, Greentime Hu,
	Ben Dooks, Guan Xuetao, Thomas Bogendoerfer,
	open list:PARISC ARCHITECTURE, Jessica Yu,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT), David S. Miller,
	Thiago Jung Bauermann, Peter Zijlstra,
	open list:SPARC + UltraSPARC (sparc/sparc64), Sandipan Das,
	H. Peter Anvin, Amit Daniel Kachhap, Tiezhu Yang, Miroslav Benes,
	Jiri Olsa, open list:RISC-V ARCHITECTURE, Vincenzo Frascino,
	Anders Roxell, Sven Schnelle,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT), Russell King,
	Mike Rapoport, Ingo Molnar, Albert Ou, Paul E. McKenney,
	Josh Poimboeuf, KP Singh, Gerald Schaefer, Nick Hu,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT), open list:MIPS,
	Sergey Senozhatsky, Palmer Dabbelt,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT)
In-Reply-To: <20200713182030.1418041-1-jarkko.sakkinen@linux.intel.com>

On Mon, 13 Jul 2020 at 21:21, Jarkko Sakkinen
<jarkko.sakkinen@linux.intel.com> wrote:
>
> Rename module_alloc() to text_alloc() and module_memfree() to
> text_memfree(), and move them to kernel/text.c, which is unconditionally
> compiled to the kernel proper. This allows kprobes, ftrace and bpf to
> allocate space for executable code without requiring to compile the modules
> support (CONFIG_MODULES=y) in.
>

On arm64, we no longer use module_alloc for bpf or kprobes, to avoid
wasting va space on code that does not need to be loaded close to the
kernel. Also, module_alloc() allocates kasan shadow, which is
unnecessary for kprobes or bpf programs, which don't have kasan
instrumentation.

This patch suggests that there are other reasons why conflating
allocation of module space and allocating  text pages for other uses
is a bad idea, but switching all users to text_alloc() is a step in
the wrong direction. It would be better to stop using module_alloc()
in core code except in the module loader, and have a generic
text_alloc() that can be overridden by the arch if necessary. Note
that x86  and s390 are the only architectures that use module_alloc()
in ftrace code.

Please have a look at alloc_insn_page() or bpf_jit_alloc_exec() in the
arm64 tree to see what I mean.



> Cc: Andi Kleen <ak@linux.intel.com>
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
> ---
>  arch/arm/kernel/Makefile         |  3 +-
>  arch/arm/kernel/module.c         | 21 -----------
>  arch/arm/kernel/text.c           | 33 ++++++++++++++++++
>  arch/arm64/kernel/Makefile       |  2 +-
>  arch/arm64/kernel/module.c       | 42 ----------------------
>  arch/arm64/kernel/text.c         | 54 ++++++++++++++++++++++++++++
>  arch/mips/kernel/Makefile        |  2 +-
>  arch/mips/kernel/module.c        |  9 -----
>  arch/mips/kernel/text.c          | 19 ++++++++++
>  arch/mips/net/bpf_jit.c          |  4 +--
>  arch/nds32/kernel/Makefile       |  2 +-
>  arch/nds32/kernel/module.c       |  7 ----
>  arch/nds32/kernel/text.c         | 12 +++++++
>  arch/nios2/kernel/Makefile       |  1 +
>  arch/nios2/kernel/module.c       | 19 ----------
>  arch/nios2/kernel/text.c         | 34 ++++++++++++++++++
>  arch/parisc/kernel/Makefile      |  2 +-
>  arch/parisc/kernel/module.c      | 11 ------
>  arch/parisc/kernel/text.c        | 22 ++++++++++++
>  arch/powerpc/net/bpf_jit_comp.c  |  4 +--
>  arch/riscv/kernel/Makefile       |  1 +
>  arch/riscv/kernel/module.c       | 12 -------
>  arch/riscv/kernel/text.c         | 20 +++++++++++
>  arch/s390/kernel/Makefile        |  2 +-
>  arch/s390/kernel/ftrace.c        |  2 +-
>  arch/s390/kernel/module.c        | 16 ---------
>  arch/s390/kernel/text.c          | 23 ++++++++++++
>  arch/sparc/kernel/Makefile       |  1 +
>  arch/sparc/kernel/module.c       | 30 ----------------
>  arch/sparc/kernel/text.c         | 39 +++++++++++++++++++++
>  arch/sparc/net/bpf_jit_comp_32.c |  6 ++--
>  arch/unicore32/kernel/Makefile   |  1 +
>  arch/unicore32/kernel/module.c   |  7 ----
>  arch/unicore32/kernel/text.c     | 18 ++++++++++
>  arch/x86/kernel/Makefile         |  1 +
>  arch/x86/kernel/ftrace.c         |  4 +--
>  arch/x86/kernel/kprobes/core.c   |  4 +--
>  arch/x86/kernel/module.c         | 49 --------------------------
>  arch/x86/kernel/text.c           | 60 ++++++++++++++++++++++++++++++++
>  include/linux/moduleloader.h     |  4 +--
>  kernel/Makefile                  |  2 +-
>  kernel/bpf/core.c                |  4 +--
>  kernel/kprobes.c                 |  4 +--
>  kernel/module.c                  | 37 ++++++--------------
>  kernel/text.c                    | 25 +++++++++++++
>  45 files changed, 400 insertions(+), 275 deletions(-)
>  create mode 100644 arch/arm/kernel/text.c
>  create mode 100644 arch/arm64/kernel/text.c
>  create mode 100644 arch/mips/kernel/text.c
>  create mode 100644 arch/nds32/kernel/text.c
>  create mode 100644 arch/nios2/kernel/text.c
>  create mode 100644 arch/parisc/kernel/text.c
>  create mode 100644 arch/riscv/kernel/text.c
>  create mode 100644 arch/s390/kernel/text.c
>  create mode 100644 arch/sparc/kernel/text.c
>  create mode 100644 arch/unicore32/kernel/text.c
>  create mode 100644 arch/x86/kernel/text.c
>  create mode 100644 kernel/text.c
>
> diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
> index 89e5d864e923..69bfacfd60ef 100644
> --- a/arch/arm/kernel/Makefile
> +++ b/arch/arm/kernel/Makefile
> @@ -19,7 +19,8 @@ CFLAGS_REMOVE_return_address.o = -pg
>  obj-y          := elf.o entry-common.o irq.o opcodes.o \
>                    process.o ptrace.o reboot.o \
>                    setup.o signal.o sigreturn_codes.o \
> -                  stacktrace.o sys_arm.o time.o traps.o
> +                  stacktrace.o sys_arm.o time.o traps.o \
> +                  text.o
>
>  ifneq ($(CONFIG_ARM_UNWIND),y)
>  obj-$(CONFIG_FRAME_POINTER)    += return_address.o
> diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
> index e15444b25ca0..13e3442a6b9f 100644
> --- a/arch/arm/kernel/module.c
> +++ b/arch/arm/kernel/module.c
> @@ -33,27 +33,6 @@
>  #define MODULES_VADDR  (((unsigned long)_exiprom + ~PMD_MASK) & PMD_MASK)
>  #endif
>
> -#ifdef CONFIG_MMU
> -void *module_alloc(unsigned long size)
> -{
> -       gfp_t gfp_mask = GFP_KERNEL;
> -       void *p;
> -
> -       /* Silence the initial allocation */
> -       if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
> -               gfp_mask |= __GFP_NOWARN;
> -
> -       p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> -                               gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> -                               __builtin_return_address(0));
> -       if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
> -               return p;
> -       return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
> -                               GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> -                               __builtin_return_address(0));
> -}
> -#endif
> -
>  bool module_init_section(const char *name)
>  {
>         return strstarts(name, ".init") ||
> diff --git a/arch/arm/kernel/text.c b/arch/arm/kernel/text.c
> new file mode 100644
> index 000000000000..600143fb909d
> --- /dev/null
> +++ b/arch/arm/kernel/text.c
> @@ -0,0 +1,33 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + *  linux/arch/arm/kernel/module.c
> + *
> + *  Copyright (C) 2002 Russell King.
> + *  Modified for nommu by Hyok S. Choi
> + *
> + * Module allocation method suggested by Andi Kleen.
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +#ifdef CONFIG_MMU
> +void *text_alloc(unsigned long size)
> +{
> +       gfp_t gfp_mask = GFP_KERNEL;
> +       void *p;
> +
> +       /* Silence the initial allocation */
> +       if (IS_ENABLED(CONFIG_ARM_MODULE_PLTS))
> +               gfp_mask |= __GFP_NOWARN;
> +
> +       p = __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> +                               gfp_mask, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> +                               __builtin_return_address(0));
> +       if (!IS_ENABLED(CONFIG_ARM_MODULE_PLTS) || p)
> +               return p;
> +       return __vmalloc_node_range(size, 1,  VMALLOC_START, VMALLOC_END,
> +                               GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> +                               __builtin_return_address(0));
> +}
> +#endif
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index a561cbb91d4d..7765a45d71b5 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -19,7 +19,7 @@ obj-y                 := debug-monitors.o entry.o irq.o fpsimd.o              \
>                            return_address.o cpuinfo.o cpu_errata.o              \
>                            cpufeature.o alternative.o cacheinfo.o               \
>                            smp.o smp_spin_table.o topology.o smccc-call.o       \
> -                          syscall.o
> +                          syscall.o text.o
>
>  targets                        += efi-entry.o
>
> diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c
> index 1cd1a4d0ed30..adde022f703c 100644
> --- a/arch/arm64/kernel/module.c
> +++ b/arch/arm64/kernel/module.c
> @@ -20,48 +20,6 @@
>  #include <asm/insn.h>
>  #include <asm/sections.h>
>
> -void *module_alloc(unsigned long size)
> -{
> -       u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
> -       gfp_t gfp_mask = GFP_KERNEL;
> -       void *p;
> -
> -       /* Silence the initial allocation */
> -       if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
> -               gfp_mask |= __GFP_NOWARN;
> -
> -       if (IS_ENABLED(CONFIG_KASAN))
> -               /* don't exceed the static module region - see below */
> -               module_alloc_end = MODULES_END;
> -
> -       p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> -                               module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
> -                               NUMA_NO_NODE, __builtin_return_address(0));
> -
> -       if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
> -           !IS_ENABLED(CONFIG_KASAN))
> -               /*
> -                * KASAN can only deal with module allocations being served
> -                * from the reserved module region, since the remainder of
> -                * the vmalloc region is already backed by zero shadow pages,
> -                * and punching holes into it is non-trivial. Since the module
> -                * region is not randomized when KASAN is enabled, it is even
> -                * less likely that the module region gets exhausted, so we
> -                * can simply omit this fallback in that case.
> -                */
> -               p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> -                               module_alloc_base + SZ_2G, GFP_KERNEL,
> -                               PAGE_KERNEL, 0, NUMA_NO_NODE,
> -                               __builtin_return_address(0));
> -
> -       if (p && (kasan_module_alloc(p, size) < 0)) {
> -               vfree(p);
> -               return NULL;
> -       }
> -
> -       return p;
> -}
> -
>  enum aarch64_reloc_op {
>         RELOC_OP_NONE,
>         RELOC_OP_ABS,
> diff --git a/arch/arm64/kernel/text.c b/arch/arm64/kernel/text.c
> new file mode 100644
> index 000000000000..64fc7e2d85df
> --- /dev/null
> +++ b/arch/arm64/kernel/text.c
> @@ -0,0 +1,54 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * AArch64 loadable module support.
> + *
> + * Copyright (C) 2012 ARM Limited
> + *
> + * Author: Will Deacon <will.deacon@arm.com>
> + */
> +#include <linux/kasan.h>
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *text_alloc(unsigned long size)
> +{
> +       u64 module_alloc_end = module_alloc_base + MODULES_VSIZE;
> +       gfp_t gfp_mask = GFP_KERNEL;
> +       void *p;
> +
> +       /* Silence the initial allocation */
> +       if (IS_ENABLED(CONFIG_ARM64_MODULE_PLTS))
> +               gfp_mask |= __GFP_NOWARN;
> +
> +       if (IS_ENABLED(CONFIG_KASAN))
> +               /* don't exceed the static module region - see below */
> +               module_alloc_end = MODULES_END;
> +
> +       p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> +                               module_alloc_end, gfp_mask, PAGE_KERNEL, 0,
> +                               NUMA_NO_NODE, __builtin_return_address(0));
> +
> +       if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) &&
> +           !IS_ENABLED(CONFIG_KASAN))
> +               /*
> +                * KASAN can only deal with module allocations being served
> +                * from the reserved module region, since the remainder of
> +                * the vmalloc region is already backed by zero shadow pages,
> +                * and punching holes into it is non-trivial. Since the module
> +                * region is not randomized when KASAN is enabled, it is even
> +                * less likely that the module region gets exhausted, so we
> +                * can simply omit this fallback in that case.
> +                */
> +               p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base,
> +                               module_alloc_base + SZ_2G, GFP_KERNEL,
> +                               PAGE_KERNEL, 0, NUMA_NO_NODE,
> +                               __builtin_return_address(0));
> +
> +       if (p && (kasan_module_alloc(p, size) < 0)) {
> +               vfree(p);
> +               return NULL;
> +       }
> +
> +       return p;
> +}
> diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile
> index 8c7a043295ed..37ebf9a7f259 100644
> --- a/arch/mips/kernel/Makefile
> +++ b/arch/mips/kernel/Makefile
> @@ -8,7 +8,7 @@ extra-y         := head.o vmlinux.lds
>  obj-y          += cmpxchg.o cpu-probe.o branch.o elf.o entry.o genex.o idle.o irq.o \
>                    process.o prom.o ptrace.o reset.o setup.o signal.o \
>                    syscall.o time.o topology.o traps.o unaligned.o watch.o \
> -                  vdso.o cacheinfo.o
> +                  vdso.o cacheinfo.o text.o
>
>  ifdef CONFIG_FUNCTION_TRACER
>  CFLAGS_REMOVE_ftrace.o = -pg
> diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c
> index 3c0c3d1260c1..f5ac4ebc4bad 100644
> --- a/arch/mips/kernel/module.c
> +++ b/arch/mips/kernel/module.c
> @@ -31,15 +31,6 @@ struct mips_hi16 {
>  static LIST_HEAD(dbe_list);
>  static DEFINE_SPINLOCK(dbe_lock);
>
> -#ifdef MODULE_START
> -void *module_alloc(unsigned long size)
> -{
> -       return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
> -                               GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> -                               __builtin_return_address(0));
> -}
> -#endif
> -
>  static int apply_r_mips_none(struct module *me, u32 *location,
>                              u32 base, Elf_Addr v, bool rela)
>  {
> diff --git a/arch/mips/kernel/text.c b/arch/mips/kernel/text.c
> new file mode 100644
> index 000000000000..55ca87a421c3
> --- /dev/null
> +++ b/arch/mips/kernel/text.c
> @@ -0,0 +1,19 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *
> + *  Copyright (C) 2001 Rusty Russell.
> + *  Copyright (C) 2003, 2004 Ralf Baechle (ralf@linux-mips.org)
> + *  Copyright (C) 2005 Thiemo Seufer
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +#ifdef MODULE_START
> +void *text_alloc(unsigned long size)
> +{
> +       return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
> +                               GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> +                               __builtin_return_address(0));
> +}
> +#endif
> diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c
> index 0af88622c619..2b03f7128809 100644
> --- a/arch/mips/net/bpf_jit.c
> +++ b/arch/mips/net/bpf_jit.c
> @@ -1233,7 +1233,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
>         build_epilogue(&ctx);
>
>         alloc_size = 4 * ctx.idx;
> -       ctx.target = module_alloc(alloc_size);
> +       ctx.target = text_alloc(alloc_size);
>         if (ctx.target == NULL)
>                 goto out;
>
> @@ -1264,7 +1264,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
>  void bpf_jit_free(struct bpf_prog *fp)
>  {
>         if (fp->jited)
> -               module_memfree(fp->bpf_func);
> +               text_memfree(fp->bpf_func);
>
>         bpf_prog_unlock_free(fp);
>  }
> diff --git a/arch/nds32/kernel/Makefile b/arch/nds32/kernel/Makefile
> index 394df3f6442c..fc15778c59d1 100644
> --- a/arch/nds32/kernel/Makefile
> +++ b/arch/nds32/kernel/Makefile
> @@ -10,7 +10,7 @@ AFLAGS_head.o         := -DTEXTADDR=$(TEXTADDR)
>  obj-y                  := ex-entry.o ex-exit.o ex-scall.o irq.o \
>                         process.o ptrace.o setup.o signal.o \
>                         sys_nds32.o time.o traps.o cacheinfo.o \
> -                       dma.o syscall_table.o vdso.o
> +                       dma.o syscall_table.o vdso.o text.o
>
>  obj-$(CONFIG_MODULES)          += nds32_ksyms.o module.o
>  obj-$(CONFIG_STACKTRACE)       += stacktrace.o
> diff --git a/arch/nds32/kernel/module.c b/arch/nds32/kernel/module.c
> index 3897fd14a21d..3d23a12ed535 100644
> --- a/arch/nds32/kernel/module.c
> +++ b/arch/nds32/kernel/module.c
> @@ -7,13 +7,6 @@
>  #include <linux/moduleloader.h>
>  #include <linux/pgtable.h>
>
> -void *module_alloc(unsigned long size)
> -{
> -       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> -                                   GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> -                                   __builtin_return_address(0));
> -}
> -
>  void module_free(struct module *module, void *region)
>  {
>         vfree(region);
> diff --git a/arch/nds32/kernel/text.c b/arch/nds32/kernel/text.c
> new file mode 100644
> index 000000000000..6e86eff9aaf0
> --- /dev/null
> +++ b/arch/nds32/kernel/text.c
> @@ -0,0 +1,12 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (C) 2005-2017 Andes Technology Corporation
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *text_alloc(unsigned long size)
> +{
> +       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> +                                   GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> +                                   __builtin_return_address(0));
> +}
> diff --git a/arch/nios2/kernel/Makefile b/arch/nios2/kernel/Makefile
> index 0b645e1e3158..5476fc749f37 100644
> --- a/arch/nios2/kernel/Makefile
> +++ b/arch/nios2/kernel/Makefile
> @@ -18,6 +18,7 @@ obj-y += setup.o
>  obj-y  += signal.o
>  obj-y  += sys_nios2.o
>  obj-y  += syscall_table.o
> +obj-y  += text.o
>  obj-y  += time.o
>  obj-y  += traps.o
>
> diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c
> index 76e0a42d6e36..20a0faf64e38 100644
> --- a/arch/nios2/kernel/module.c
> +++ b/arch/nios2/kernel/module.c
> @@ -21,25 +21,6 @@
>
>  #include <asm/cacheflush.h>
>
> -/*
> - * Modules should NOT be allocated with kmalloc for (obvious) reasons.
> - * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach
> - * from 0x80000000 (vmalloc area) to 0xc00000000 (kernel) (kmalloc returns
> - * addresses in 0xc0000000)
> - */
> -void *module_alloc(unsigned long size)
> -{
> -       if (size == 0)
> -               return NULL;
> -       return kmalloc(size, GFP_KERNEL);
> -}
> -
> -/* Free memory returned from module_alloc */
> -void module_memfree(void *module_region)
> -{
> -       kfree(module_region);
> -}
> -
>  int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab,
>                         unsigned int symindex, unsigned int relsec,
>                         struct module *mod)
> diff --git a/arch/nios2/kernel/text.c b/arch/nios2/kernel/text.c
> new file mode 100644
> index 000000000000..af424174442f
> --- /dev/null
> +++ b/arch/nios2/kernel/text.c
> @@ -0,0 +1,34 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Kernel module support for Nios II.
> + *
> + * Copyright (C) 2004 Microtronix Datacom Ltd.
> + *   Written by Wentao Xu <xuwentao@microtronix.com>
> + * Copyright (C) 2001, 2003 Rusty Russell
> + *
> + * This file is subject to the terms and conditions of the GNU General
> + * Public License.  See the file COPYING in the main directory of this
> + * archive for more details.
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +/*
> + * Modules should NOT be allocated with kmalloc for (obvious) reasons.
> + * But we do it for now to avoid relocation issues. CALL26/PCREL26 cannot reach
> + * from 0x80000000 (vmalloc area) to 0xc00000000 (kernel) (kmalloc returns
> + * addresses in 0xc0000000)
> + */
> +void *text_alloc(unsigned long size)
> +{
> +       if (size == 0)
> +               return NULL;
> +       return kmalloc(size, GFP_KERNEL);
> +}
> +
> +/* Free memory returned from module_alloc */
> +void text_memfree(void *module_region)
> +{
> +       kfree(module_region);
> +}
> diff --git a/arch/parisc/kernel/Makefile b/arch/parisc/kernel/Makefile
> index 068d90950d93..f71f7ffdae2e 100644
> --- a/arch/parisc/kernel/Makefile
> +++ b/arch/parisc/kernel/Makefile
> @@ -10,7 +10,7 @@ obj-y         := cache.o pacache.o setup.o pdt.o traps.o time.o irq.o \
>                    ptrace.o hardware.o inventory.o drivers.o alternative.o \
>                    signal.o hpmc.o real2.o parisc_ksyms.o unaligned.o \
>                    process.o processor.o pdc_cons.o pdc_chassis.o unwind.o \
> -                  patch.o
> +                  patch.o text.o
>
>  ifdef CONFIG_FUNCTION_TRACER
>  # Do not profile debug and lowlevel utilities
> diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c
> index 7df140545b22..c81e63e2549b 100644
> --- a/arch/parisc/kernel/module.c
> +++ b/arch/parisc/kernel/module.c
> @@ -192,17 +192,6 @@ static inline int reassemble_22(int as22)
>                 ((as22 & 0x0003ff) << 3));
>  }
>
> -void *module_alloc(unsigned long size)
> -{
> -       /* using RWX means less protection for modules, but it's
> -        * easier than trying to map the text, data, init_text and
> -        * init_data correctly */
> -       return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> -                                   GFP_KERNEL,
> -                                   PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
> -                                   __builtin_return_address(0));
> -}
> -
>  #ifndef CONFIG_64BIT
>  static inline unsigned long count_gots(const Elf_Rela *rela, unsigned long n)
>  {
> diff --git a/arch/parisc/kernel/text.c b/arch/parisc/kernel/text.c
> new file mode 100644
> index 000000000000..9ff503084191
> --- /dev/null
> +++ b/arch/parisc/kernel/text.c
> @@ -0,0 +1,22 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *    Linux/PA-RISC Project
> + *    Copyright (C) 2003 Randolph Chung <tausq at debian . org>
> + *    Copyright (C) 2008 Helge Deller <deller@gmx.de>
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *text_alloc(unsigned long size)
> +{
> +       /*
> +        * Using RWX means less protection for modules, but it's
> +        * easier than trying to map the text, data, init_text and
> +        * init_data correctly.
> +        */
> +       return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> +                                   GFP_KERNEL,
> +                                   PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
> +                                   __builtin_return_address(0));
> +}
> diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
> index 0acc9d5fb19e..ba1cef7a812d 100644
> --- a/arch/powerpc/net/bpf_jit_comp.c
> +++ b/arch/powerpc/net/bpf_jit_comp.c
> @@ -634,7 +634,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
>
>         proglen = cgctx.idx * 4;
>         alloclen = proglen + FUNCTION_DESCR_SIZE;
> -       image = module_alloc(alloclen);
> +       image = text_alloc(alloclen);
>         if (!image)
>                 goto out;
>
> @@ -678,7 +678,7 @@ void bpf_jit_compile(struct bpf_prog *fp)
>  void bpf_jit_free(struct bpf_prog *fp)
>  {
>         if (fp->jited)
> -               module_memfree(fp->bpf_func);
> +               text_memfree(fp->bpf_func);
>
>         bpf_prog_unlock_free(fp);
>  }
> diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
> index b355cf485671..d0b30f286ce6 100644
> --- a/arch/riscv/kernel/Makefile
> +++ b/arch/riscv/kernel/Makefile
> @@ -29,6 +29,7 @@ obj-y += riscv_ksyms.o
>  obj-y  += stacktrace.o
>  obj-y  += cacheinfo.o
>  obj-y  += patch.o
> +obj-y  += text.o
>  obj-$(CONFIG_MMU) += vdso.o vdso/
>
>  obj-$(CONFIG_RISCV_M_MODE)     += clint.o traps_misaligned.o
> diff --git a/arch/riscv/kernel/module.c b/arch/riscv/kernel/module.c
> index 7191342c54da..f6aa66431c9e 100644
> --- a/arch/riscv/kernel/module.c
> +++ b/arch/riscv/kernel/module.c
> @@ -390,15 +390,3 @@ int apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
>
>         return 0;
>  }
> -
> -#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
> -#define VMALLOC_MODULE_START \
> -        max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
> -void *module_alloc(unsigned long size)
> -{
> -       return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
> -                                   VMALLOC_END, GFP_KERNEL,
> -                                   PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> -                                   __builtin_return_address(0));
> -}
> -#endif
> diff --git a/arch/riscv/kernel/text.c b/arch/riscv/kernel/text.c
> new file mode 100644
> index 000000000000..201608a25641
> --- /dev/null
> +++ b/arch/riscv/kernel/text.c
> @@ -0,0 +1,20 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *
> + *  Copyright (C) 2017 Zihao Yu
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +#if defined(CONFIG_MMU) && defined(CONFIG_64BIT)
> +#define VMALLOC_MODULE_START \
> +        max(PFN_ALIGN((unsigned long)&_end - SZ_2G), VMALLOC_START)
> +void *text_alloc(unsigned long size)
> +{
> +       return __vmalloc_node_range(size, 1, VMALLOC_MODULE_START,
> +                                   VMALLOC_END, GFP_KERNEL,
> +                                   PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> +                                   __builtin_return_address(0));
> +}
> +#endif
> diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
> index a8f136943deb..9f00c320b938 100644
> --- a/arch/s390/kernel/Makefile
> +++ b/arch/s390/kernel/Makefile
> @@ -40,7 +40,7 @@ obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o
>  obj-y  += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
>  obj-y  += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
>  obj-y  += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
> -obj-y  += smp.o
> +obj-y  += smp.o text.o
>
>  extra-y                                += head64.o vmlinux.lds
>
> diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c
> index b388e87a08bf..a752b1442846 100644
> --- a/arch/s390/kernel/ftrace.c
> +++ b/arch/s390/kernel/ftrace.c
> @@ -134,7 +134,7 @@ static int __init ftrace_plt_init(void)
>  {
>         unsigned int *ip;
>
> -       ftrace_plt = (unsigned long) module_alloc(PAGE_SIZE);
> +       ftrace_plt = (unsigned long) text_alloc(PAGE_SIZE);
>         if (!ftrace_plt)
>                 panic("cannot allocate ftrace plt\n");
>         ip = (unsigned int *) ftrace_plt;
> diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
> index 4055f1c49814..087cb5951de6 100644
> --- a/arch/s390/kernel/module.c
> +++ b/arch/s390/kernel/module.c
> @@ -32,22 +32,6 @@
>
>  #define PLT_ENTRY_SIZE 20
>
> -void *module_alloc(unsigned long size)
> -{
> -       void *p;
> -
> -       if (PAGE_ALIGN(size) > MODULES_LEN)
> -               return NULL;
> -       p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
> -                                GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> -                                __builtin_return_address(0));
> -       if (p && (kasan_module_alloc(p, size) < 0)) {
> -               vfree(p);
> -               return NULL;
> -       }
> -       return p;
> -}
> -
>  void module_arch_freeing_init(struct module *mod)
>  {
>         if (is_livepatch_module(mod) &&
> diff --git a/arch/s390/kernel/text.c b/arch/s390/kernel/text.c
> new file mode 100644
> index 000000000000..63aaa1ab727b
> --- /dev/null
> +++ b/arch/s390/kernel/text.c
> @@ -0,0 +1,23 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + *  Kernel module help for s390.
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *text_alloc(unsigned long size)
> +{
> +       void *p;
> +
> +       if (PAGE_ALIGN(size) > MODULES_LEN)
> +               return NULL;
> +       p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
> +                                GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> +                                __builtin_return_address(0));
> +       if (p && (kasan_module_alloc(p, size) < 0)) {
> +               vfree(p);
> +               return NULL;
> +       }
> +       return p;
> +}
> diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
> index 97c0e19263d1..e025f9e1db4a 100644
> --- a/arch/sparc/kernel/Makefile
> +++ b/arch/sparc/kernel/Makefile
> @@ -52,6 +52,7 @@ obj-y                   += prom_common.o
>  obj-y                   += prom_$(BITS).o
>  obj-y                   += of_device_common.o
>  obj-y                   += of_device_$(BITS).o
> +obj-y                  += text.o
>  obj-$(CONFIG_SPARC64)   += prom_irqtrans.o
>
>  obj-$(CONFIG_SPARC32)   += leon_kernel.o
> diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c
> index df39580f398d..f2babc69f189 100644
> --- a/arch/sparc/kernel/module.c
> +++ b/arch/sparc/kernel/module.c
> @@ -21,36 +21,6 @@
>
>  #include "entry.h"
>
> -#ifdef CONFIG_SPARC64
> -
> -#include <linux/jump_label.h>
> -
> -static void *module_map(unsigned long size)
> -{
> -       if (PAGE_ALIGN(size) > MODULES_LEN)
> -               return NULL;
> -       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> -                               GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> -                               __builtin_return_address(0));
> -}
> -#else
> -static void *module_map(unsigned long size)
> -{
> -       return vmalloc(size);
> -}
> -#endif /* CONFIG_SPARC64 */
> -
> -void *module_alloc(unsigned long size)
> -{
> -       void *ret;
> -
> -       ret = module_map(size);
> -       if (ret)
> -               memset(ret, 0, size);
> -
> -       return ret;
> -}
> -
>  /* Make generic code ignore STT_REGISTER dummy undefined symbols.  */
>  int module_frob_arch_sections(Elf_Ehdr *hdr,
>                               Elf_Shdr *sechdrs,
> diff --git a/arch/sparc/kernel/text.c b/arch/sparc/kernel/text.c
> new file mode 100644
> index 000000000000..d16663f2c6ba
> --- /dev/null
> +++ b/arch/sparc/kernel/text.c
> @@ -0,0 +1,39 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Kernel module help for sparc64.
> + *
> + * Copyright (C) 2001 Rusty Russell.
> + * Copyright (C) 2002 David S. Miller.
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +#ifdef CONFIG_SPARC64
> +
> +#include <linux/jump_label.h>
> +
> +static void *module_map(unsigned long size)
> +{
> +       if (PAGE_ALIGN(size) > MODULES_LEN)
> +               return NULL;
> +       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> +                               GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
> +                               __builtin_return_address(0));
> +}
> +#else
> +static void *module_map(unsigned long size)
> +{
> +       return vmalloc(size);
> +}
> +#endif /* CONFIG_SPARC64 */
> +
> +void *text_alloc(unsigned long size)
> +{
> +       void *ret;
> +
> +       ret = module_map(size);
> +       if (ret)
> +               memset(ret, 0, size);
> +
> +       return ret;
> +}
> diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c
> index c8eabb973b86..d9dd513b27b2 100644
> --- a/arch/sparc/net/bpf_jit_comp_32.c
> +++ b/arch/sparc/net/bpf_jit_comp_32.c
> @@ -713,7 +713,7 @@ cond_branch:                        f_offset = addrs[i + filter[i].jf];
>                                 if (unlikely(proglen + ilen > oldproglen)) {
>                                         pr_err("bpb_jit_compile fatal error\n");
>                                         kfree(addrs);
> -                                       module_memfree(image);
> +                                       text_memfree(image);
>                                         return;
>                                 }
>                                 memcpy(image + proglen, temp, ilen);
> @@ -736,7 +736,7 @@ cond_branch:                        f_offset = addrs[i + filter[i].jf];
>                         break;
>                 }
>                 if (proglen == oldproglen) {
> -                       image = module_alloc(proglen);
> +                       image = text_alloc(proglen);
>                         if (!image)
>                                 goto out;
>                 }
> @@ -758,7 +758,7 @@ cond_branch:                        f_offset = addrs[i + filter[i].jf];
>  void bpf_jit_free(struct bpf_prog *fp)
>  {
>         if (fp->jited)
> -               module_memfree(fp->bpf_func);
> +               text_memfree(fp->bpf_func);
>
>         bpf_prog_unlock_free(fp);
>  }
> diff --git a/arch/unicore32/kernel/Makefile b/arch/unicore32/kernel/Makefile
> index 2f79aa56735b..96eb8cfc8b1e 100644
> --- a/arch/unicore32/kernel/Makefile
> +++ b/arch/unicore32/kernel/Makefile
> @@ -6,6 +6,7 @@
>  # Object file lists.
>  obj-y                          := dma.o elf.o entry.o process.o ptrace.o
>  obj-y                          += setup.o signal.o sys.o stacktrace.o traps.o
> +obj-y                          += text.o
>
>  obj-$(CONFIG_MODULES)          += ksyms.o module.o
>  obj-$(CONFIG_EARLY_PRINTK)     += early_printk.o
> diff --git a/arch/unicore32/kernel/module.c b/arch/unicore32/kernel/module.c
> index 67c89ef2d6ee..e1e703c02379 100644
> --- a/arch/unicore32/kernel/module.c
> +++ b/arch/unicore32/kernel/module.c
> @@ -18,13 +18,6 @@
>
>  #include <asm/sections.h>
>
> -void *module_alloc(unsigned long size)
> -{
> -       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> -                               GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> -                               __builtin_return_address(0));
> -}
> -
>  int
>  apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
>                unsigned int relindex, struct module *module)
> diff --git a/arch/unicore32/kernel/text.c b/arch/unicore32/kernel/text.c
> new file mode 100644
> index 000000000000..b94aac824bb8
> --- /dev/null
> +++ b/arch/unicore32/kernel/text.c
> @@ -0,0 +1,18 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * linux/arch/unicore32/kernel/module.c
> + *
> + * Code specific to PKUnity SoC and UniCore ISA
> + *
> + * Copyright (C) 2001-2010 GUAN Xue-tao
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void *text_alloc(unsigned long size)
> +{
> +       return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
> +                               GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
> +                               __builtin_return_address(0));
> +}
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index e77261db2391..2878e4b753a0 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -68,6 +68,7 @@ obj-y                 += tsc.o tsc_msr.o io_delay.o rtc.o
>  obj-y                  += pci-iommu_table.o
>  obj-y                  += resource.o
>  obj-y                  += irqflags.o
> +obj-y                  += text.o
>
>  obj-y                          += process.o
>  obj-y                          += fpu/
> diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
> index 51504566b3a6..f76703ee96f2 100644
> --- a/arch/x86/kernel/ftrace.c
> +++ b/arch/x86/kernel/ftrace.c
> @@ -265,11 +265,11 @@ int __init ftrace_dyn_arch_init(void)
>  /* Module allocation simplifies allocating memory for code */
>  static inline void *alloc_tramp(unsigned long size)
>  {
> -       return module_alloc(size);
> +       return text_alloc(size);
>  }
>  static inline void tramp_free(void *tramp)
>  {
> -       module_memfree(tramp);
> +       text_memfree(tramp);
>  }
>  #else
>  /* Trampolines can only be created if modules are supported */
> diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
> index ada39ddbc922..e9ac7d3c658e 100644
> --- a/arch/x86/kernel/kprobes/core.c
> +++ b/arch/x86/kernel/kprobes/core.c
> @@ -423,7 +423,7 @@ void *alloc_insn_page(void)
>  {
>         void *page;
>
> -       page = module_alloc(PAGE_SIZE);
> +       page = text_alloc(PAGE_SIZE);
>         if (!page)
>                 return NULL;
>
> @@ -446,7 +446,7 @@ void *alloc_insn_page(void)
>  /* Recover page to RW mode before releasing it */
>  void free_insn_page(void *page)
>  {
> -       module_memfree(page);
> +       text_memfree(page);
>  }
>
>  static int arch_copy_kprobe(struct kprobe *p)
> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
> index 34b153cbd4ac..261df078f127 100644
> --- a/arch/x86/kernel/module.c
> +++ b/arch/x86/kernel/module.c
> @@ -36,55 +36,6 @@ do {                                                 \
>  } while (0)
>  #endif
>
> -#ifdef CONFIG_RANDOMIZE_BASE
> -static unsigned long module_load_offset;
> -
> -/* Mutex protects the module_load_offset. */
> -static DEFINE_MUTEX(module_kaslr_mutex);
> -
> -static unsigned long int get_module_load_offset(void)
> -{
> -       if (kaslr_enabled()) {
> -               mutex_lock(&module_kaslr_mutex);
> -               /*
> -                * Calculate the module_load_offset the first time this
> -                * code is called. Once calculated it stays the same until
> -                * reboot.
> -                */
> -               if (module_load_offset == 0)
> -                       module_load_offset =
> -                               (get_random_int() % 1024 + 1) * PAGE_SIZE;
> -               mutex_unlock(&module_kaslr_mutex);
> -       }
> -       return module_load_offset;
> -}
> -#else
> -static unsigned long int get_module_load_offset(void)
> -{
> -       return 0;
> -}
> -#endif
> -
> -void *module_alloc(unsigned long size)
> -{
> -       void *p;
> -
> -       if (PAGE_ALIGN(size) > MODULES_LEN)
> -               return NULL;
> -
> -       p = __vmalloc_node_range(size, MODULE_ALIGN,
> -                                   MODULES_VADDR + get_module_load_offset(),
> -                                   MODULES_END, GFP_KERNEL,
> -                                   PAGE_KERNEL, 0, NUMA_NO_NODE,
> -                                   __builtin_return_address(0));
> -       if (p && (kasan_module_alloc(p, size) < 0)) {
> -               vfree(p);
> -               return NULL;
> -       }
> -
> -       return p;
> -}
> -
>  #ifdef CONFIG_X86_32
>  int apply_relocate(Elf32_Shdr *sechdrs,
>                    const char *strtab,
> diff --git a/arch/x86/kernel/text.c b/arch/x86/kernel/text.c
> new file mode 100644
> index 000000000000..724ab2d93ac5
> --- /dev/null
> +++ b/arch/x86/kernel/text.c
> @@ -0,0 +1,60 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *  Kernel module help for x86.
> + *  Copyright (C) 2001 Rusty Russell.
> + */
> +#include <linux/kasan.h>
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/random.h>
> +#include <linux/vmalloc.h>
> +#include <asm/setup.h>
> +
> +#ifdef CONFIG_RANDOMIZE_BASE
> +static unsigned long module_load_offset;
> +
> +/* Mutex protects the module_load_offset. */
> +static DEFINE_MUTEX(module_kaslr_mutex);
> +
> +static unsigned long get_module_load_offset(void)
> +{
> +       if (kaslr_enabled()) {
> +               mutex_lock(&module_kaslr_mutex);
> +               /*
> +                * Calculate the module_load_offset the first time this
> +                * code is called. Once calculated it stays the same until
> +                * reboot.
> +                */
> +               if (module_load_offset == 0)
> +                       module_load_offset =
> +                               (get_random_int() % 1024 + 1) * PAGE_SIZE;
> +               mutex_unlock(&module_kaslr_mutex);
> +       }
> +       return module_load_offset;
> +}
> +#else
> +static unsigned long get_module_load_offset(void)
> +{
> +       return 0;
> +}
> +#endif
> +
> +void *text_alloc(unsigned long size)
> +{
> +       void *p;
> +
> +       if (PAGE_ALIGN(size) > MODULES_LEN)
> +               return NULL;
> +
> +       p = __vmalloc_node_range(size, MODULE_ALIGN,
> +                                   MODULES_VADDR + get_module_load_offset(),
> +                                   MODULES_END, GFP_KERNEL,
> +                                   PAGE_KERNEL, 0, NUMA_NO_NODE,
> +                                   __builtin_return_address(0));
> +       if (p && (kasan_module_alloc(p, size) < 0)) {
> +               vfree(p);
> +               return NULL;
> +       }
> +
> +       return p;
> +}
> diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h
> index 4fa67a8b2265..4e8b9ba431ee 100644
> --- a/include/linux/moduleloader.h
> +++ b/include/linux/moduleloader.h
> @@ -24,10 +24,10 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section);
>
>  /* Allocator used for allocating struct module, core sections and init
>     sections.  Returns NULL on failure. */
> -void *module_alloc(unsigned long size);
> +void *text_alloc(unsigned long size);
>
>  /* Free memory returned from module_alloc. */
> -void module_memfree(void *module_region);
> +void text_memfree(void *module_region);
>
>  /* Determines if the section name is an init section (that is only used during
>   * module loading).
> diff --git a/kernel/Makefile b/kernel/Makefile
> index f3218bc5ec69..9e88e81f68ef 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o \
>             extable.o params.o \
>             kthread.o sys_ni.o nsproxy.o \
>             notifier.o ksysfs.o cred.o reboot.o \
> -           async.o range.o smpboot.o ucount.o
> +           async.o range.o smpboot.o ucount.o text.o
>
>  obj-$(CONFIG_MODULES) += kmod.o
>  obj-$(CONFIG_MULTIUSER) += groups.o
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 9df4cc9a2907..febd55019a8a 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -840,12 +840,12 @@ static void bpf_jit_uncharge_modmem(u32 pages)
>
>  void *__weak bpf_jit_alloc_exec(unsigned long size)
>  {
> -       return module_alloc(size);
> +       return text_alloc(size);
>  }
>
>  void __weak bpf_jit_free_exec(void *addr)
>  {
> -       module_memfree(addr);
> +       text_memfree(addr);
>  }
>
>  struct bpf_binary_header *
> diff --git a/kernel/kprobes.c b/kernel/kprobes.c
> index 4a904cc56d68..d1c354ec89de 100644
> --- a/kernel/kprobes.c
> +++ b/kernel/kprobes.c
> @@ -111,12 +111,12 @@ enum kprobe_slot_state {
>
>  void __weak *alloc_insn_page(void)
>  {
> -       return module_alloc(PAGE_SIZE);
> +       return text_alloc(PAGE_SIZE);
>  }
>
>  void __weak free_insn_page(void *page)
>  {
> -       module_memfree(page);
> +       text_memfree(page);
>  }
>
>  struct kprobe_insn_cache kprobe_insn_slots = {
> diff --git a/kernel/module.c b/kernel/module.c
> index bee1c25ca5c5..bdb3773f3668 100644
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -2141,16 +2141,6 @@ static void free_module_elf(struct module *mod)
>  }
>  #endif /* CONFIG_LIVEPATCH */
>
> -void __weak module_memfree(void *module_region)
> -{
> -       /*
> -        * This memory may be RO, and freeing RO memory in an interrupt is not
> -        * supported by vmalloc.
> -        */
> -       WARN_ON(in_interrupt());
> -       vfree(module_region);
> -}
> -
>  void __weak module_arch_cleanup(struct module *mod)
>  {
>  }
> @@ -2200,7 +2190,7 @@ static void free_module(struct module *mod)
>
>         /* This may be empty, but that's OK */
>         module_arch_freeing_init(mod);
> -       module_memfree(mod->init_layout.base);
> +       text_memfree(mod->init_layout.base);
>         kfree(mod->args);
>         percpu_modfree(mod);
>
> @@ -2208,7 +2198,7 @@ static void free_module(struct module *mod)
>         lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
>
>         /* Finally, free the core (containing the module structure) */
> -       module_memfree(mod->core_layout.base);
> +       text_memfree(mod->core_layout.base);
>  }
>
>  void *__symbol_get(const char *symbol)
> @@ -2781,13 +2771,6 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
>                 ddebug_remove_module(mod->name);
>  }
>
> -void * __weak module_alloc(unsigned long size)
> -{
> -       return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> -                       GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> -                       NUMA_NO_NODE, __builtin_return_address(0));
> -}
> -
>  bool __weak module_init_section(const char *name)
>  {
>         return strstarts(name, ".init");
> @@ -3246,7 +3229,7 @@ static int move_module(struct module *mod, struct load_info *info)
>         void *ptr;
>
>         /* Do the allocs. */
> -       ptr = module_alloc(mod->core_layout.size);
> +       ptr = text_alloc(mod->core_layout.size);
>         /*
>          * The pointer to this block is stored in the module structure
>          * which is inside the block. Just mark it as not being a
> @@ -3260,7 +3243,7 @@ static int move_module(struct module *mod, struct load_info *info)
>         mod->core_layout.base = ptr;
>
>         if (mod->init_layout.size) {
> -               ptr = module_alloc(mod->init_layout.size);
> +               ptr = text_alloc(mod->init_layout.size);
>                 /*
>                  * The pointer to this block is stored in the module structure
>                  * which is inside the block. This block doesn't need to be
> @@ -3269,7 +3252,7 @@ static int move_module(struct module *mod, struct load_info *info)
>                  */
>                 kmemleak_ignore(ptr);
>                 if (!ptr) {
> -                       module_memfree(mod->core_layout.base);
> +                       text_memfree(mod->core_layout.base);
>                         return -ENOMEM;
>                 }
>                 memset(ptr, 0, mod->init_layout.size);
> @@ -3452,8 +3435,8 @@ static void module_deallocate(struct module *mod, struct load_info *info)
>  {
>         percpu_modfree(mod);
>         module_arch_freeing_init(mod);
> -       module_memfree(mod->init_layout.base);
> -       module_memfree(mod->core_layout.base);
> +       text_memfree(mod->init_layout.base);
> +       text_memfree(mod->core_layout.base);
>  }
>
>  int __weak module_finalize(const Elf_Ehdr *hdr,
> @@ -3527,7 +3510,7 @@ static void do_free_init(struct work_struct *w)
>
>         llist_for_each_safe(pos, n, list) {
>                 initfree = container_of(pos, struct mod_initfree, node);
> -               module_memfree(initfree->module_init);
> +               text_memfree(initfree->module_init);
>                 kfree(initfree);
>         }
>  }
> @@ -3626,10 +3609,10 @@ static noinline int do_init_module(struct module *mod)
>          * We want to free module_init, but be aware that kallsyms may be
>          * walking this with preempt disabled.  In all the failure paths, we
>          * call synchronize_rcu(), but we don't want to slow down the success
> -        * path. module_memfree() cannot be called in an interrupt, so do the
> +        * path. text_memfree() cannot be called in an interrupt, so do the
>          * work and call synchronize_rcu() in a work queue.
>          *
> -        * Note that module_alloc() on most architectures creates W+X page
> +        * Note that text_alloc() on most architectures creates W+X page
>          * mappings which won't be cleaned up until do_free_init() runs.  Any
>          * code such as mark_rodata_ro() which depends on those mappings to
>          * be cleaned up needs to sync with the queued work - ie
> diff --git a/kernel/text.c b/kernel/text.c
> new file mode 100644
> index 000000000000..9a12c508ded5
> --- /dev/null
> +++ b/kernel/text.c
> @@ -0,0 +1,25 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *  Copyright (C) 2002 Richard Henderson
> + *  Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM.
> + */
> +#include <linux/mm.h>
> +#include <linux/moduleloader.h>
> +#include <linux/vmalloc.h>
> +
> +void __weak text_memfree(void *module_region)
> +{
> +       /*
> +        * This memory may be RO, and freeing RO memory in an interrupt is not
> +        * supported by vmalloc.
> +        */
> +       WARN_ON(in_interrupt());
> +       vfree(module_region);
> +}
> +
> +void * __weak text_alloc(unsigned long size)
> +{
> +       return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
> +                       GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
> +                       NUMA_NO_NODE, __builtin_return_address(0));
> +}
> --
> 2.25.1
>

^ permalink raw reply

* [PATCH] selftests: Replace HTTP links with HTTPS ones
From: Alexander A. Klimov @ 2020-07-13 19:57 UTC (permalink / raw)
  To: shuah, mpe, benh, paulus, ravi.bangoria, broonie, linux-kselftest,
	linux-kernel, linuxppc-dev
  Cc: Alexander A. Klimov

Rationale:
Reduces attack surface on kernel devs opening the links for MITM
as HTTPS traffic is much harder to manipulate.

Deterministic algorithm:
For each file:
  If not .svg:
    For each line:
      If doesn't contain `\bxmlns\b`:
        For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`:
	  If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`:
            If both the HTTP and HTTPS versions
            return 200 OK and serve the same content:
              Replace HTTP with HTTPS.

Signed-off-by: Alexander A. Klimov <grandmaster@al2klimov.de>
---
 Continuing my work started at 93431e0607e5.
 See also: git log --oneline '--author=Alexander A. Klimov <grandmaster@al2klimov.de>' v5.7..master
 (Actually letting a shell for loop submit all this stuff for me.)

 If there are any URLs to be removed completely or at least not just HTTPSified:
 Just clearly say so and I'll *undo my change*.
 See also: https://lkml.org/lkml/2020/6/27/64

 If there are any valid, but yet not changed URLs:
 See: https://lkml.org/lkml/2020/6/26/837

 If you apply the patch, please let me know.

 Sorry again to all maintainers who complained about subject lines.
 Now I realized that you want an actually perfect prefixes,
 not just subsystem ones.
 I tried my best...
 And yes, *I could* (at least half-)automate it.
 Impossible is nothing! :)


 tools/testing/selftests/futex/include/atomic.h        | 2 +-
 tools/testing/selftests/futex/include/futextest.h     | 2 +-
 tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c | 2 +-
 tools/testing/selftests/vDSO/parse_vdso.c             | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/testing/selftests/futex/include/atomic.h b/tools/testing/selftests/futex/include/atomic.h
index 428bcd921bb5..23703ecfcd68 100644
--- a/tools/testing/selftests/futex/include/atomic.h
+++ b/tools/testing/selftests/futex/include/atomic.h
@@ -5,7 +5,7 @@
  *
  * DESCRIPTION
  *      GCC atomic builtin wrappers
- *      http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
+ *      https://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
  *
  * AUTHOR
  *      Darren Hart <dvhart@linux.intel.com>
diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h
index ddbcfc9b7bac..2a210c482f7b 100644
--- a/tools/testing/selftests/futex/include/futextest.h
+++ b/tools/testing/selftests/futex/include/futextest.h
@@ -211,7 +211,7 @@ futex_cmp_requeue_pi(futex_t *uaddr, futex_t val, futex_t *uaddr2, int nr_wake,
  * @newval:	The new value to try and assign the futex
  *
  * Implement cmpxchg using gcc atomic builtins.
- * http://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
+ * https://gcc.gnu.org/onlinedocs/gcc-4.1.0/gcc/Atomic-Builtins.html
  *
  * Return the old futex value.
  */
diff --git a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
index c1f324afdbf3..946c52e1f327 100644
--- a/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
+++ b/tools/testing/selftests/powerpc/ptrace/perf-hwbreak.c
@@ -12,7 +12,7 @@
  * times. Then check the output count from perf is as expected.
  *
  * Based on:
- *   http://ozlabs.org/~anton/junkcode/perf_events_example1.c
+ *   https://ozlabs.org/~anton/junkcode/perf_events_example1.c
  *
  * Copyright (C) 2018 Michael Neuling, IBM Corporation.
  */
diff --git a/tools/testing/selftests/vDSO/parse_vdso.c b/tools/testing/selftests/vDSO/parse_vdso.c
index 413f75620a35..3413fc00c835 100644
--- a/tools/testing/selftests/vDSO/parse_vdso.c
+++ b/tools/testing/selftests/vDSO/parse_vdso.c
@@ -5,7 +5,7 @@
  * This code is meant to be linked in to various programs that run on Linux.
  * As such, it is available with as few restrictions as possible.  This file
  * is licensed under the Creative Commons Zero License, version 1.0,
- * available at http://creativecommons.org/publicdomain/zero/1.0/legalcode
+ * available at https://creativecommons.org/publicdomain/zero/1.0/legalcode
  *
  * The vDSO is a regular ELF DSO that the kernel maps into user space when
  * it starts a program.  It works equally well in statically and dynamically
-- 
2.27.0


^ permalink raw reply related

* Re: [RFC PATCH 00/35] Move all PCIBIOS* definitions into arch/x86
From: Bjorn Helgaas @ 2020-07-13 22:01 UTC (permalink / raw)
  To: Saheed O. Bolarinwa
  Cc: Rich Felker, Martin K. Petersen, linux-sh, linux-pci, linux-nvme,
	Yicong Yang, Keith Busch, Realtek linux nic maintainers,
	Paul Mackerras, linux-i2c, bcm-kernel-feedback-list, sparclinux,
	rfi, Toan Le, Greg Ungerer, Marek Vasut, Rob Herring,
	Stefano Stabellini, Sagi Grimberg, Yoshinori Sato, linux-scsi,
	Greg Kroah-Hartman, linux-atm-general, Russell King, Ley Foon Tan,
	Christoph Hellwig, Geert Uytterhoeven, Rafał Miłecki,
	Chas Williams, xen-devel, Matt Turner, linux-mips,
	linux-kernel-mentees, Kevin Hilman, Guenter Roeck, linux-hwmon,
	Jean Delvare, Andrew Donnellan, Arnd Bergmann, Ray Jui,
	James E.J. Bottomley, Yue Wang, Jens Axboe, Jakub Kicinski,
	linux-m68k, Lorenzo Pieralisi, Ivan Kokshaysky, Michael Buesch,
	skhan, bjorn, linux-amlogic, Boris Ostrovsky, Guan Xuetao,
	linux-arm-kernel, Richard Henderson, Juergen Gross, Michal Simek,
	Thomas Bogendoerfer, Scott Branden, Bjorn Helgaas, Jingoo Han,
	netdev, Yoshihiro Shimoda, linux-wireless, linux-kernel,
	linux-renesas-soc, Brian King, Philipp Zabel, linux-alpha,
	Frederic Barrat, Gustavo Pimentel, linuxppc-dev, David S. Miller,
	Heiner Kallweit
In-Reply-To: <20200713122247.10985-1-refactormyself@gmail.com>

On Mon, Jul 13, 2020 at 02:22:12PM +0200, Saheed O. Bolarinwa wrote:
> This goal of these series is to move the definition of *all* PCIBIOS* from
> include/linux/pci.h to arch/x86 and limit their use within there.
> All other tree specific definition will be left for intact. Maybe they can
> be renamed.

More comments later, but a few trivial whitespace issues you can clean
up in the meantime.  Don't repost for at least a few days to avoid
spamming everybody.  I found these with:

  $ b4 am -om/ 20200713122247.10985-1-refactormyself@gmail.com
  $ git am m/20200713_refactormyself_move_all_pcibios_definitions_into_arch_x86.mbx

  Applying: atm: Change PCIBIOS_SUCCESSFUL to 0
  .git/rebase-apply/patch:11: trailing whitespace.
	  iadev = INPH_IA_DEV(dev);
  .git/rebase-apply/patch:12: trailing whitespace.
	  for(i=0; i<64; i++)
  .git/rebase-apply/patch:13: trailing whitespace.
	    if ((error = pci_read_config_dword(iadev->pci,
  .git/rebase-apply/patch:16: trailing whitespace, space before tab in indent.
		return error;
  .git/rebase-apply/patch:17: trailing whitespace.
	  writel(0, iadev->reg+IPHASE5575_EXT_RESET);
  warning: squelched 5 whitespace errors
  warning: 10 lines add whitespace errors.
  Applying: atm: Tidy Success/Failure checks
  .git/rebase-apply/patch:13: trailing whitespace.

  .git/rebase-apply/patch:14: trailing whitespace.
	  iadev = INPH_IA_DEV(dev);
  .git/rebase-apply/patch:15: trailing whitespace.
	  for(i=0; i<64; i++)
  .git/rebase-apply/patch:21: trailing whitespace.
	  writel(0, iadev->reg+IPHASE5575_EXT_RESET);
  .git/rebase-apply/patch:22: trailing whitespace.
	  for(i=0; i<64; i++)
  warning: squelched 3 whitespace errors
  warning: 8 lines add whitespace errors.
  Applying: atm: Fix Style ERROR- assignment in if condition
  .git/rebase-apply/patch:12: trailing whitespace.
	  unsigned int pci[64];
  .git/rebase-apply/patch:13: trailing whitespace.

  .git/rebase-apply/patch:14: trailing whitespace.
	  iadev = INPH_IA_DEV(dev);
  .git/rebase-apply/patch:23: trailing whitespace.
	  writel(0, iadev->reg+IPHASE5575_EXT_RESET);
  .git/rebase-apply/patch:32: trailing whitespace.
	  udelay(5);
  warning: squelched 2 whitespace errors
  warning: 7 lines add whitespace errors.
  Applying: PCI: Change PCIBIOS_SUCCESSFUL to 0
  .git/rebase-apply/patch:37: trailing whitespace.
  struct pci_ops apecs_pci_ops =
  .git/rebase-apply/patch:50: trailing whitespace.
  static int
  .git/rebase-apply/patch:59: trailing whitespace.
  struct pci_ops cia_pci_ops =
  .git/rebase-apply/patch:94: trailing whitespace.
  static int
  .git/rebase-apply/patch:103: trailing whitespace.
  struct pci_ops lca_pci_ops =
  warning: squelched 10 whitespace errors
  warning: 15 lines add whitespace errors.

^ permalink raw reply

* Re: [PATCH] ocxl: Replace HTTP links with HTTPS ones
From: Andrew Donnellan @ 2020-07-13 22:55 UTC (permalink / raw)
  To: Alexander A. Klimov, fbarrat, arnd, gregkh, linuxppc-dev,
	linux-kernel
In-Reply-To: <20200713175506.36676-1-grandmaster@al2klimov.de>

On 14/7/20 3:55 am, Alexander A. Klimov wrote:
> Rationale:
> Reduces attack surface on kernel devs opening the links for MITM
> as HTTPS traffic is much harder to manipulate.
> 
> Deterministic algorithm:
> For each file:
>    If not .svg:
>      For each line:
>        If doesn't contain `\bxmlns\b`:
>          For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`:
> 	  If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`:
>              If both the HTTP and HTTPS versions
>              return 200 OK and serve the same content:
>                Replace HTTP with HTTPS.
> 
> Signed-off-by: Alexander A. Klimov <grandmaster@al2klimov.de>

Thanks.

Acked-by: Andrew Donnellan <ajd@linux.ibm.com>

-- 
Andrew Donnellan              OzLabs, ADL Canberra
ajd@linux.ibm.com             IBM Australia Limited

^ permalink raw reply

* Re: [PATCH 14/14] powerpc/eeh: Move PE tree setup into the platform
From: Alexey Kardashevskiy @ 2020-07-14  1:50 UTC (permalink / raw)
  To: Oliver O'Halloran, linuxppc-dev; +Cc: mahesh
In-Reply-To: <20200706013619.459420-15-oohall@gmail.com>



On 06/07/2020 11:36, Oliver O'Halloran wrote:
> The EEH core has a concept of a "PE tree" to support PowerNV. The PE tree
> follows the PCI bus structures because a reset asserted on an upstream
> bridge will be propagated to the downstream bridges. On pseries there's a
> 1-1 correspondence between what the guest sees are a PHB and a PE so the
> "tree" is really just a single node.
> 
> Current the EEH core is reponsible for setting up this PE tree which it
> does by traversing the pci_dn tree. The structure of the pci_dn tree
> matches the bus tree on PowerNV which leads to the PE tree being "correct"
> this setup method doesn't make a whole lot of sense and it's actively
> confusing for the pseries case where it doesn't really do anything.
> 
> We want to remove the dependence on pci_dn anyway so this patch move
> choosing where to insert a new PE into the platform code rather than
> being part of the generic EEH code. For PowerNV this simplifies the
> tree building logic and removes the use of pci_dn. For pseries we
> keep the existing logic. I'm not really convinced it does anything
> due to the 1-1 PE-to-PHB correspondence so every device under that
> PHB should be in the same PE, but I'd rather not remove it entirely
> until we've had a chance to look at it more deeply.
> 
> Signed-off-by: Oliver O'Halloran <oohall@gmail.com>
> ---
>  arch/powerpc/include/asm/eeh.h               |  2 +-
>  arch/powerpc/kernel/eeh_pe.c                 | 70 ++++++--------------
>  arch/powerpc/platforms/powernv/eeh-powernv.c | 27 +++++++-
>  arch/powerpc/platforms/pseries/eeh_pseries.c | 59 +++++++++++++++--
>  4 files changed, 101 insertions(+), 57 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index 8d34e5b790c2..1cab629dbc74 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -283,7 +283,7 @@ struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
>  struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root);
>  struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
>  			  int pe_no, int config_addr);
> -int eeh_pe_tree_insert(struct eeh_dev *edev);
> +int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent);
>  int eeh_pe_tree_remove(struct eeh_dev *edev);
>  void eeh_pe_update_time_stamp(struct eeh_pe *pe);
>  void *eeh_pe_traverse(struct eeh_pe *root,
> diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
> index 898205829a8f..ea2f8b362d18 100644
> --- a/arch/powerpc/kernel/eeh_pe.c
> +++ b/arch/powerpc/kernel/eeh_pe.c
> @@ -318,53 +318,20 @@ struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
>  	return pe;
>  }
>  
> -/**
> - * eeh_pe_get_parent - Retrieve the parent PE
> - * @edev: EEH device
> - *
> - * The whole PEs existing in the system are organized as hierarchy
> - * tree. The function is used to retrieve the parent PE according
> - * to the parent EEH device.
> - */
> -static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
> -{
> -	struct eeh_dev *parent;
> -	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
> -
> -	/*
> -	 * It might have the case for the indirect parent
> -	 * EEH device already having associated PE, but
> -	 * the direct parent EEH device doesn't have yet.
> -	 */
> -	if (edev->physfn)
> -		pdn = pci_get_pdn(edev->physfn);
> -	else
> -		pdn = pdn ? pdn->parent : NULL;
> -	while (pdn) {
> -		/* We're poking out of PCI territory */
> -		parent = pdn_to_eeh_dev(pdn);
> -		if (!parent)
> -			return NULL;
> -
> -		if (parent->pe)
> -			return parent->pe;
> -
> -		pdn = pdn->parent;
> -	}
> -
> -	return NULL;
> -}
> -
>  /**
>   * eeh_pe_tree_insert - Add EEH device to parent PE
>   * @edev: EEH device
> + * @new_pe_parent: PE to create additional PEs under
>   *
> - * Add EEH device to the parent PE. If the parent PE already
> - * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
> - * we have to create new PE to hold the EEH device and the new
> - * PE will be linked to its parent PE as well.
> + * Add EEH device to the PE in edev->pe_config_addr. If a PE already
> + * exists with that address then @edev is added to that PE. Otherwise
> + * a new PE is created and inserted into the PE tree as a child of
> + * @new_pe_parent.
> + *
> + * If @new_pe_parent is NULL then the new PE will be inserted under
> + * directly under the the PHB.
>   */
> -int eeh_pe_tree_insert(struct eeh_dev *edev)
> +int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent)
>  {
>  	struct pci_controller *hose = edev->controller;
>  	struct eeh_pe *pe, *parent;


We can ditch this "parent" in that single place now and use "pe"
instead. And name the new parameter simply "parent". Dunno if it
improves things though.



> @@ -399,7 +366,7 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
>  			}
>  
>  			eeh_edev_dbg(edev,
> -				     "Added to device PE (parent: PE#%x)\n",
> +				     "Added to existing PE (parent: PE#%x)\n",
>  				     pe->parent->addr);
>  		} else {
>  			/* Mark the PE as type of PCI bus */
> @@ -431,10 +398,9 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
>  	 * to PHB directly. Otherwise, we have to associate the
>  	 * PE with its parent.
>  	 */
> -	parent = eeh_pe_get_parent(edev);
> -	if (!parent) {
> -		parent = eeh_phb_pe_get(hose);
> -		if (!parent) {
> +	if (!new_pe_parent) {
> +		new_pe_parent = eeh_phb_pe_get(hose);
> +		if (!new_pe_parent) {



afaict only pseries can realisticly pass new_pe_parent==NULL so this
chunk could go to pseries_eeh_pe_get_parent.


>  			pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
>  				__func__, hose->global_number);
>  			edev->pe = NULL;
> @@ -442,17 +408,19 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
>  			return -EEXIST;
>  		}
>  	}
> -	pe->parent = parent;
> +
> +	/* link new PE into the tree */
> +	pe->parent = new_pe_parent;
> +	list_add_tail(&pe->child, &new_pe_parent->child_list);
>  
>  	/*
>  	 * Put the newly created PE into the child list and
>  	 * link the EEH device accordingly.
>  	 */
> -	list_add_tail(&pe->child, &parent->child_list);
>  	list_add_tail(&edev->entry, &pe->edevs);
>  	edev->pe = pe;
> -	eeh_edev_dbg(edev, "Added to device PE (parent: PE#%x)\n",
> -		     pe->parent->addr);
> +	eeh_edev_dbg(edev, "Added to new (parent: PE#%x)\n",
> +		     new_pe_parent->addr);
>  
>  	return 0;
>  }
> diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
> index 8c9fca773692..9af8c3b98853 100644
> --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
> +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
> @@ -338,6 +338,28 @@ static int pnv_eeh_find_ecap(struct pci_dn *pdn, int cap)
>  	return 0;
>  }
>  
> +static struct eeh_pe *pnv_eeh_get_upstream_pe(struct pci_dev *pdev)
> +{
> +	struct pci_controller *hose = pdev->bus->sysdata;
> +	struct pnv_phb *phb = hose->private_data;
> +	struct pci_dev *parent = pdev->bus->self;
> +
> +#ifdef CONFIG_PCI_IOV
> +	/* for VFs we use the PF's PE as the upstream PE */
> +	if (pdev->is_virtfn)
> +		parent = pdev->physfn;
> +#endif
> +
> +	/* otherwise use the PE of our parent bridge */
> +	if (parent) {
> +		struct pnv_ioda_pe *ioda_pe = pnv_ioda_get_pe(parent);
> +
> +		return eeh_pe_get(phb->hose, ioda_pe->pe_number, 0);
> +	}
> +
> +	return NULL;
> +}
> +
>  /**
>   * pnv_eeh_probe - Do probe on PCI device
>   * @pdev: pci_dev to probe
> @@ -350,6 +372,7 @@ static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev)
>  	struct pci_controller *hose = pdn->phb;
>  	struct pnv_phb *phb = hose->private_data;
>  	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
> +	struct eeh_pe *upstream_pe;
>  	uint32_t pcie_flags;
>  	int ret;
>  	int config_addr = (pdn->busno << 8) | (pdn->devfn);
> @@ -398,8 +421,10 @@ static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev)
>  
>  	edev->pe_config_addr = phb->ioda.pe_rmap[config_addr];
>  
> +	upstream_pe = pnv_eeh_get_upstream_pe(pdev);
> +
>  	/* Create PE */
> -	ret = eeh_pe_tree_insert(edev);
> +	ret = eeh_pe_tree_insert(edev, upstream_pe);
>  	if (ret) {
>  		eeh_edev_warn(edev, "Failed to add device to PE (code %d)\n", ret);
>  		return NULL;
> diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
> index 72556f4436a8..98f9a1b269be 100644
> --- a/arch/powerpc/platforms/pseries/eeh_pseries.c
> +++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
> @@ -68,11 +68,16 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
>  	pseries_eeh_init_edev(pdn);
>  #ifdef CONFIG_PCI_IOV
>  	if (pdev->is_virtfn) {
> +		/*
> +		 * FIXME: This really should be handled by choosing the right
> +		 *        parent PE in in pseries_eeh_init_edev().
> +		 */
> +		struct eeh_pe *physfn_pe = pci_dev_to_eeh_dev(pdev->physfn)->pe;
>  		struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
>  
>  		edev->pe_config_addr =  (pdn->busno << 16) | (pdn->devfn << 8);
>  		eeh_pe_tree_remove(edev); /* Remove as it is adding to bus pe */
> -		eeh_pe_tree_insert(edev);   /* Add as VF PE type */
> +		eeh_pe_tree_insert(edev, physfn_pe);   /* Add as VF PE type */
>  	}
>  #endif
>  	eeh_probe_device(pdev);
> @@ -218,6 +223,43 @@ static int pseries_eeh_find_ecap(struct pci_dn *pdn, int cap)
>  	return 0;
>  }
>  
> +/**
> + * pseries_eeh_pe_get_parent - Retrieve the parent PE
> + * @edev: EEH device
> + *
> + * The whole PEs existing in the system are organized as hierarchy
> + * tree. The function is used to retrieve the parent PE according
> + * to the parent EEH device.
> + */
> +static struct eeh_pe *pseries_eeh_pe_get_parent(struct eeh_dev *edev)
> +{
> +	struct eeh_dev *parent;
> +	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
> +
> +	/*
> +	 * It might have the case for the indirect parent
> +	 * EEH device already having associated PE, but
> +	 * the direct parent EEH device doesn't have yet.
> +	 */
> +	if (edev->physfn)
> +		pdn = pci_get_pdn(edev->physfn);
> +	else
> +		pdn = pdn ? pdn->parent : NULL;
> +	while (pdn) {
> +		/* We're poking out of PCI territory */


We are traversing up PCI hierarchy here - pci_dn->parent, how is this
out of PCI territory? Or I understand "out of" incorrectly?


> +		parent = pdn_to_eeh_dev(pdn);
> +		if (!parent)
> +			return NULL;
> +
> +		if (parent->pe)
> +			return parent->pe;
> +
> +		pdn = pdn->parent;
> +	}
> +
> +	return NULL;
> +}
> +
>  /**
>   * pseries_eeh_init_edev - initialise the eeh_dev and eeh_pe for a pci_dn
>   *
> @@ -301,6 +343,8 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
>  	if (ret) {
>  		eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret);
>  	} else {
> +		struct eeh_pe *parent;
> +
>  		/* Retrieve PE address */
>  		edev->pe_config_addr = pseries_eeh_get_pe_addr(pdn);
>  		pe.addr = edev->pe_config_addr;
> @@ -313,16 +357,23 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
>  		if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
>  			enable = 1;
>  
> -		if (enable) {
> +		/* This device doesn't support EEH, but it may have an
> +		 * EEH parent, in which case we mark it as supported.
> +		 */
> +		parent = pseries_eeh_pe_get_parent(edev);
> +		if (parent && !enable)
> +			edev->pe_config_addr = parent->addr;


What if pseries_eeh_pe_get_parent() returned NULL - we won't write
edev->pe_config_addr so it remains 0 which is fine just by accident? :)

I'd make pseries_eeh_pe_get_parent() always return not NULL (a parent or
a hose) and simplify the block below.


I mean the change is alright but part of the excersise is making the
code more readable but there also can go forever :) so

Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>



> +
> +		if (enable || parent) {
>  			eeh_add_flag(EEH_ENABLED);
> -			eeh_pe_tree_insert(edev);
> +			eeh_pe_tree_insert(edev, parent);
>  		} else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) &&
>  			   (pdn_to_eeh_dev(pdn->parent))->pe) {
>  			/* This device doesn't support EEH, but it may have an
>  			 * EEH parent, in which case we mark it as supported.
>  			 */
>  			edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr;
> -			eeh_pe_tree_insert(edev);
> +			eeh_pe_tree_insert(edev, parent);
>  		}
>  		eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n",
>  			     (enable ? "enabled" : "unsupported"), ret);
> 

-- 
Alexey

^ permalink raw reply

* Re: [PATCH v3 4/6] powerpc/pseries/iommu: Remove default DMA window before creating DDW
From: Leonardo Bras @ 2020-07-14  2:40 UTC (permalink / raw)
  To: Alexey Kardashevskiy, Michael Ellerman, Benjamin Herrenschmidt,
	Paul Mackerras, Thiago Jung Bauermann, Ram Pai
  Cc: linuxppc-dev, linux-kernel
In-Reply-To: <cc15a81d-04d9-3ee4-4fdb-093618f6e635@ozlabs.ru>

Thank you for this feedback Alexey!

On Mon, 2020-07-13 at 17:33 +1000, Alexey Kardashevskiy wrote:
> [...]
> > -	int len, ret;
> > +	int len, ret, reset_win_ext;
> 
> Make it "reset_token".

Oh, it's not a token here, it just checks if the reset_win extension
exists. The token would be returned in *value, but since we did not
need it here, it's not copied.

> > [...]
> > -out_failed:
> > +out_restore_defwin:
> > +	if (default_win && reset_win_ext == 0)
> 
> reset_win_ext potentially may be uninitialized here. Yeah I know it is
> tied to default_win but still.

I can't see it being used uninitialized here, as you said it's tied to
default_win. 
Could you please tell me how it can be used uninitialized here, or what
is bad by doing this way?

> After looking at this function for a few minutes, it could use some
> refactoring (way too many gotos)  such as:

Yes, I agree.

> 1. move (query.page_size & xx) checks before "if
> (query.windows_available == 0)"

Moving 'page_size selection' above 'checking windows available' will
need us to duplicate the 'page_size selection' after the new query,
inside the if.
I mean, as query will be done again, it will need to get the (new) page
size.

> 2. move "win64 = kzalloc(sizeof(struct property), GFP_KERNEL)" before
> "if (query.windows_available == 0)"

> 3. call "reset_dma_window(dev, pdn)" inside the "if
> (query.windows_available == 0)" branch.

> Then you can drop all "goto out_restore_defwin" and move default_win and
> reset_win_ext inside "if (query.windows_available == 0)".

I did all changes suggested locally and did some analysis in the
result:

I did not see a way to put default_win and reset_win_ext inside 
"if (query.windows_available == 0)", because if we still need a way to
know if the default window was removed, and if so, restore in case
anything ever fails ahead (like creating the node property). 

But from that analysis I noted it's possible to remove all the new
"goto out_restore_defwin", if we do default_win = NULL if
ddw_read_ext() fails. 

So testing only default_win should always be enough to say if the
default window was deleted, and reset_win_ext could be moved inside "if
(query.windows_available == 0)".
Also, it would avoid reset_win_ext being 'used uninitialized' and
"out_restore_defwin:" would not be needed.

Against the current patch, we would have something like this:

#####

 static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 {
-       int len, ret, reset_win_ext;
+       int len, ret;
        struct ddw_query_response query;
        struct ddw_create_response create;
        int page_shift;
@@ -1173,25 +1173,28 @@ static u64 enable_ddw(struct pci_dev *dev,
struct device_node *pdn)
         * for extensions presence.
         */
        if (query.windows_available == 0) {
+               int reset_win_ext;
                default_win = of_find_property(pdn, "ibm,dma-window",
NULL);
                if (!default_win)
                        goto out_failed;
 
                reset_win_ext = ddw_read_ext(pdn,
DDW_EXT_RESET_DMA_WIN, NULL);
-               if (reset_win_ext)
+               if (reset_win_ext){
+                       default_win = NULL;
                        goto out_failed;
+               }
 
                remove_dma_window(pdn, ddw_avail, default_win);
 
                /* Query again, to check if the window is available */
                ret = query_ddw(dev, ddw_avail, &query, pdn);
                if (ret != 0)
-                       goto out_restore_defwin;
+                       goto out_failed;
 
                if (query.windows_available == 0) {
                        /* no windows are available for this device. */
                        dev_dbg(&dev->dev, "no free dynamic windows");
-                       goto out_restore_defwin;
+                       goto out_failed;
                }
        }
        if (query.page_size & 4) {
@@ -1203,7 +1206,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct
device_node *pdn)
        } else {
                dev_dbg(&dev->dev, "no supported direct page size in
mask %x",
                          query.page_size);
-               goto out_restore_defwin;
+               goto out_failed;
        }
        /* verify the window * number of ptes will map the partition */
        /* check largest block * page size > max memory hotplug addr */
@@ -1212,14 +1215,14 @@ static u64 enable_ddw(struct pci_dev *dev,
struct device_node *pdn)
                dev_dbg(&dev->dev, "can't map partition max 0x%llx with
%llu "
                          "%llu-sized pages\n",
max_addr,  query.largest_available_block,
                          1ULL << page_shift);
-               goto out_restore_defwin;
+               goto out_failed;
        }
        len = order_base_2(max_addr);
        win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
        if (!win64) {
                dev_info(&dev->dev,
                        "couldn't allocate property for 64bit dma
window\n");
-               goto out_restore_defwin;
+               goto out_failed;
        }
        win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
        win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
@@ -1282,11 +1285,10 @@ static u64 enable_ddw(struct pci_dev *dev,
struct device_node *pdn)
        kfree(win64->value);
        kfree(win64);
 
-out_restore_defwin:
-       if (default_win && reset_win_ext == 0)
+out_failed:
+       if (default_win)
                reset_dma_window(dev, pdn);
 
-out_failed:
        fpdn = kzalloc(sizeof(*fpdn), GFP_KERNEL);
        if (!fpdn)
                goto out_unlock;

#####

What do you think?



> The rest of the series is good as it is,

Thank you :)

>  however it may conflict with
> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20200713062348.100552-1-aik@ozlabs.ru/
> and the patchset it is made on top of -
> https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=188385 .

<From the message of the first link>
> (do not rush, let me finish reviewing this first) 

Ok, I have no problem rebasing on top of those patchsets, but what
would you suggest to be done?

Would it be ok doing a big multi-author patchset, so we guarantee it
being applied in the correct order?

(You probably want me to rebase my patchset on top of Hellwig + yours,
right?) 


> thanks,
Thank you!




^ permalink raw reply

* Re: [PATCH 14/14] powerpc/eeh: Move PE tree setup into the platform
From: Oliver O'Halloran @ 2020-07-14  3:08 UTC (permalink / raw)
  To: Alexey Kardashevskiy; +Cc: linuxppc-dev, Mahesh J Salgaonkar
In-Reply-To: <d2ce45e8-92e6-f875-fd38-c3fa82bf8f4e@ozlabs.ru>

On Tue, Jul 14, 2020 at 11:50 AM Alexey Kardashevskiy <aik@ozlabs.ru> wrote:
>
> On 06/07/2020 11:36, Oliver O'Halloran wrote:
> >  /**
> >   * eeh_pe_tree_insert - Add EEH device to parent PE
> >   * @edev: EEH device
> > + * @new_pe_parent: PE to create additional PEs under
> >   *
> > - * Add EEH device to the parent PE. If the parent PE already
> > - * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
> > - * we have to create new PE to hold the EEH device and the new
> > - * PE will be linked to its parent PE as well.
> > + * Add EEH device to the PE in edev->pe_config_addr. If a PE already
> > + * exists with that address then @edev is added to that PE. Otherwise
> > + * a new PE is created and inserted into the PE tree as a child of
> > + * @new_pe_parent.
> > + *
> > + * If @new_pe_parent is NULL then the new PE will be inserted under
> > + * directly under the the PHB.
> >   */
> > -int eeh_pe_tree_insert(struct eeh_dev *edev)
> > +int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent)
> >  {
> >       struct pci_controller *hose = edev->controller;
> >       struct eeh_pe *pe, *parent;
>
>
> We can ditch this "parent" in that single place now and use "pe"
> instead. And name the new parameter simply "parent". Dunno if it
> improves things though.

I did this at some point and then decided not to. It added a bunch of
noise to the diff and calling the parameter "parent" ended up being
pretty unreadable. The parameter is "the parent of the PE that will be
created to contain edev", or "parent of the parent PE". It's pretty
unwieldy.

> > @@ -399,7 +366,7 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
> >                       }
> >
> >                       eeh_edev_dbg(edev,
> > -                                  "Added to device PE (parent: PE#%x)\n",
> > +                                  "Added to existing PE (parent: PE#%x)\n",
> >                                    pe->parent->addr);
> >               } else {
> >                       /* Mark the PE as type of PCI bus */
> > @@ -431,10 +398,9 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
> >        * to PHB directly. Otherwise, we have to associate the
> >        * PE with its parent.
> >        */
> > -     parent = eeh_pe_get_parent(edev);
> > -     if (!parent) {
> > -             parent = eeh_phb_pe_get(hose);
> > -             if (!parent) {
> > +     if (!new_pe_parent) {
> > +             new_pe_parent = eeh_phb_pe_get(hose);
> > +             if (!new_pe_parent) {
>
>
>
> afaict only pseries can realisticly pass new_pe_parent==NULL so this
> chunk could go to pseries_eeh_pe_get_parent.

pnv_eeh_get_upstream_pe() will never return the PHB PE so
new_pe_parent will be NULL for the first PE created under a PowerNV
PHB. I guess we could move the PHB PE handling into the platform too,
but I think that just results in having to special case PHB PEs in two
places rather than one.

> > +static struct eeh_pe *pseries_eeh_pe_get_parent(struct eeh_dev *edev)
> > +{
> > +     struct eeh_dev *parent;
> > +     struct pci_dn *pdn = eeh_dev_to_pdn(edev);
> > +
> > +     /*
> > +      * It might have the case for the indirect parent
> > +      * EEH device already having associated PE, but
> > +      * the direct parent EEH device doesn't have yet.
> > +      */
> > +     if (edev->physfn)
> > +             pdn = pci_get_pdn(edev->physfn);
> > +     else
> > +             pdn = pdn ? pdn->parent : NULL;
> > +     while (pdn) {
> > +             /* We're poking out of PCI territory */
>
>
> We are traversing up PCI hierarchy here - pci_dn->parent, how is this
> out of PCI territory? Or I understand "out of" incorrectly?
>
>
> > +             parent = pdn_to_eeh_dev(pdn);
> > +             if (!parent)
> > +                     return NULL;

If there's no eeh dev then the node we're looking at is a PHB rather
than an actual PCI device so it stops looking. I think. The comment
was copied over from the existing code and I haven't spent a whole lot
of time parsing it's meaning.



> > @@ -301,6 +343,8 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
> >       if (ret) {
> >               eeh_edev_dbg(edev, "EEH failed to enable on device (code %d)\n", ret);
> >       } else {
> > +             struct eeh_pe *parent;
> > +
> >               /* Retrieve PE address */
> >               edev->pe_config_addr = pseries_eeh_get_pe_addr(pdn);
> >               pe.addr = edev->pe_config_addr;
> > @@ -313,16 +357,23 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
> >               if (ret > 0 && ret != EEH_STATE_NOT_SUPPORT)
> >                       enable = 1;
> >
> > -             if (enable) {
> > +             /* This device doesn't support EEH, but it may have an
> > +              * EEH parent, in which case we mark it as supported.
> > +              */
> > +             parent = pseries_eeh_pe_get_parent(edev);
> > +             if (parent && !enable)
> > +                     edev->pe_config_addr = parent->addr;
>
>
> What if pseries_eeh_pe_get_parent() returned NULL - we won't write
> edev->pe_config_addr so it remains 0 which is fine just by accident? :)

edev->pe_config_addr is set above when we call
pseries_eeh_get_pe_addr(). The check there is mainly to cover the case
where pseries_eeh_get_pe_addr() fails because the device is on a
subordinate bus rather than the root bus of the PE. PAPR says the
get-pe-addr-info RTAS call can fail in that situation and that you're
supposed to traverse up the DT to find the pe_config_addr, which is
what pe_get_parent() does. Yeah it's confusing, but that's what it
does today too.

> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> > +
> > +             if (enable || parent) {
> >                       eeh_add_flag(EEH_ENABLED);
> > -                     eeh_pe_tree_insert(edev);
> > +                     eeh_pe_tree_insert(edev, parent);

> >               } else if (pdn->parent && pdn_to_eeh_dev(pdn->parent) &&
> >                          (pdn_to_eeh_dev(pdn->parent))->pe) {
> >                       /* This device doesn't support EEH, but it may have an
> >                        * EEH parent, in which case we mark it as supported.
> >                        */
> >                       edev->pe_config_addr = pdn_to_eeh_dev(pdn->parent)->pe_config_addr;
> > -                     eeh_pe_tree_insert(edev);
> > +                     eeh_pe_tree_insert(edev, parent);

I think I was supposed to delete this hunk and then forgot to since it
handles the same case mentioned above.

> >               }
> >               eeh_edev_dbg(edev, "EEH is %s on device (code %d)\n",
> >                            (enable ? "enabled" : "unsupported"), ret);
> >
>
> --
> Alexey

^ permalink raw reply

* Re: [PATCH 1/3] module: Rename module_alloc() to text_alloc() and move to kernel proper
From: Steven Rostedt @ 2020-07-14  2:04 UTC (permalink / raw)
  To: Ard Biesheuvel
  Cc: Catalin Marinas, Kefeng Wang, Paul Mackerras, Zong Li, Andi Kleen,
	Paul Burton, Vincent Whitchurch, Petr Mladek, Brian Gerst,
	Andy Lutomirski, Yonghong Song, Thomas Gleixner, Jiri Kosina,
	Anup Patel, Linux Kernel Mailing List, Philipp Rudo, Torsten Duwe,
	Masami Hiramatsu, Andrew Morton, Mark Rutland,
	James E.J. Bottomley, Vincent Chen, open list:S390, Joe Lawrence,
	Helge Deller, John Fastabend, Anil S Keshavamurthy,
	Andrey Ryabinin, Iurii Zaikin, Andrii Nakryiko, Vasily Gorbik,
	moderated list:ARM PORT, Daniel Axtens, Damien Le Moal,
	Peter Oberparleiter, Sean Christopherson, Martin KaFai Lau,
	Song Liu, Paul Walmsley, Heiko Carstens, Alexei Starovoitov,
	Jarkko Sakkinen, Atish Patra, Will Deacon, Daniel Borkmann,
	Masahiro Yamada, Nayna Jain, Ley Foon Tan, Christian Borntraeger,
	Dmitry Vyukov, Sami Tolvanen, Naveen N. Rao, Mao Han, Marco Elver,
	Babu Moger, Borislav Petkov, Greentime Hu, Ben Dooks, Guan Xuetao,
	Thomas Bogendoerfer, open list:PARISC ARCHITECTURE, Jessica Yu,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT), David S. Miller,
	Thiago Jung Bauermann, Peter Zijlstra,
	open list:SPARC + UltraSPARC (sparc/sparc64), Sandipan Das,
	H. Peter Anvin, Amit Daniel Kachhap, Tiezhu Yang, Miroslav Benes,
	Jiri Olsa, open list:RISC-V ARCHITECTURE, Vincenzo Frascino,
	Anders Roxell, Sven Schnelle,
	maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT), Russell King,
	Mike Rapoport, Ingo Molnar, Albert Ou, Paul E. McKenney,
	Josh Poimboeuf, KP Singh, Gerald Schaefer, Nick Hu,
	open list:BPF JIT for MIPS (32-BIT AND 64-BIT), open list:MIPS,
	Sergey Senozhatsky, Palmer Dabbelt,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT)
In-Reply-To: <CAMj1kXGhZYxjZTP+_PGdBy4hZgdeeTNUkuaE_eQKwB4pPAYNXA@mail.gmail.com>

On Mon, 13 Jul 2020 22:49:48 +0300
Ard Biesheuvel <ardb@kernel.org> wrote:

> On arm64, we no longer use module_alloc for bpf or kprobes, to avoid
> wasting va space on code that does not need to be loaded close to the
> kernel. Also, module_alloc() allocates kasan shadow, which is
> unnecessary for kprobes or bpf programs, which don't have kasan
> instrumentation.
> 
> This patch suggests that there are other reasons why conflating
> allocation of module space and allocating  text pages for other uses
> is a bad idea, but switching all users to text_alloc() is a step in
> the wrong direction. It would be better to stop using module_alloc()
> in core code except in the module loader, and have a generic
> text_alloc() that can be overridden by the arch if necessary. Note
> that x86  and s390 are the only architectures that use module_alloc()
> in ftrace code.
> 
> Please have a look at alloc_insn_page() or bpf_jit_alloc_exec() in the
> arm64 tree to see what I mean.

Hmm, so you have another method for allocating memory for trampolines?
(I haven't looked at those functions you pointed out, out of sheer
laziness ;-)

It would be nice to implement the trampoline optimization in arm, which
x86 has (see arch_ftrace_update_trampoline() and
arch_ftrace_trampoline_func()).

It helps when you have two different callbacks for different functions
(like having live patching enabled and function tracing enabled, or
kprobes using ftrace). Each callback will get its own allocated
trampoline to jump to instead of jumping to the a trampoline that calls
a looping function that tests to see which callback wants to be called
by the traced function.

-- Steve

^ permalink raw reply

* [PATCH 00/11] Support for grouping cores
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin

Cleanup of existing powerpc topologies and add support for grouping cores.

Patch 7 of this patch series: "Powerpc/numa: Detect support for coregroup"
depends on
https://lore.kernel.org/linuxppc-dev/20200707140644.7241-1-srikar@linux.vnet.ibm.com/t/#u
However it should be easy to rebase the patch without the above patch.

On Power 8 Systems
------------------
$ tail /proc/cpuinfo
processor	: 255
cpu		: POWER8 (architected), altivec supported
clock		: 3724.000000MHz
revision	: 2.1 (pvr 004b 0201)

timebase	: 512000000
platform	: pSeries
model		: IBM,8408-E8E
machine		: CHRP IBM,8408-E8E
MMU		: Hash

Before the patchset
-------------------
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
DIE
NUMA
NUMA
$ head /proc/schedstat
version 15
timestamp 4295534931
cpu0 0 0 0 0 0 0 41389823338 17682779896 14117
domain0 00000000,00000000,00000000,00000000,00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,00000000,00000000,00000000,00000000,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 00000000,00000000,00000000,00000000,00000000,00000000,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 27087859050 152273672 10396
domain0 00000000,00000000,00000000,00000000,00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,00000000,00000000,00000000,00000000,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

After the patchset
------------------
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
DIE
NUMA
NUMA
$ head /proc/schedstat
version 15
timestamp 4295534931
cpu0 0 0 0 0 0 0 41389823338 17682779896 14117
domain0 00000000,00000000,00000000,00000000,00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,00000000,00000000,00000000,00000000,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 00000000,00000000,00000000,00000000,00000000,00000000,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 27087859050 152273672 10396
domain0 00000000,00000000,00000000,00000000,00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,00000000,00000000,00000000,00000000,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

On Power 9 (with device-tree enablement to show coregroups).
-----------------------------------------------------------
$ tail /proc/cpuinfo
processor	: 127
cpu		: POWER9 (architected), altivec supported
clock		: 3000.000000MHz
revision	: 2.2 (pvr 004e 0202)

timebase	: 512000000
platform	: pSeries
model		: IBM,9008-22L
machine		: CHRP IBM,9008-22L
MMU		: Hash

Before patchset
--------------
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
CACHE
DIE
NUMA

$ head /proc/schedstat
version 15
timestamp 4318242208
cpu0 0 0 0 0 0 0 28077107004 4773387362 78205
domain0 00000000,00000000,00000000,00000055 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 00000000,00000000,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ffffffff,ffffffff,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 24177439200 413887604 75393
domain0 00000000,00000000,00000000,000000aa 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

After patchset
--------------
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
CACHE
MC
DIE
NUMA

$ head /proc/schedstat
version 15
timestamp 4318242208
cpu0 0 0 0 0 0 0 28077107004 4773387362 78205
domain0 00000000,00000000,00000000,00000055 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 00000000,00000000,00000000,000000ff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 00000000,00000000,00000000,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 00000000,00000000,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain4 ffffffff,ffffffff,ffffffff,ffffffff 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 24177439200 413887604 75393
domain0 00000000,00000000,00000000,000000aa 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>

Srikar Dronamraju (11):
  powerpc/smp: Cache node for reuse
  powerpc/smp: Merge Power9 topology with Power topology
  powerpc/smp: Move powerpc_topology above
  powerpc/smp: Enable small core scheduling sooner
  powerpc/smp: Dont assume l2-cache to be superset of sibling
  powerpc/smp: Generalize 2nd sched domain
  Powerpc/numa: Detect support for coregroup
  powerpc/smp: Allocate cpumask only after searching thread group
  Powerpc/smp: Create coregroup domain
  powerpc/smp: Implement cpu_to_coregroup_id
  powerpc/smp: Provide an ability to disable coregroup

 arch/powerpc/include/asm/smp.h      |   1 +
 arch/powerpc/include/asm/topology.h |  10 +
 arch/powerpc/kernel/smp.c           | 277 +++++++++++++++++-----------
 arch/powerpc/mm/numa.c              |  56 ++++--
 4 files changed, 226 insertions(+), 118 deletions(-)

-- 
2.17.1


^ permalink raw reply

* [PATCH 01/11] powerpc/smp: Cache node for reuse
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

While cpu_to_node is inline function with access to per_cpu variable.
However when using repeatedly, it may be cleaner to cache it in a local
variable.

Also fix a build error in a some weird config.
"error: _numa_cpu_lookup_table_ undeclared"

No functional change

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 73199470c265..680c0edcc59d 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -843,7 +843,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 
 	DBG("smp_prepare_cpus\n");
 
-	/* 
+	/*
 	 * setup_cpu may need to be called on the boot cpu. We havent
 	 * spun any cpus up but lets be paranoid.
 	 */
@@ -854,20 +854,24 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	cpu_callin_map[boot_cpuid] = 1;
 
 	for_each_possible_cpu(cpu) {
+		int node = cpu_to_node(cpu);
+
 		zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu),
-					GFP_KERNEL, cpu_to_node(cpu));
+					GFP_KERNEL, node);
 		zalloc_cpumask_var_node(&per_cpu(cpu_l2_cache_map, cpu),
-					GFP_KERNEL, cpu_to_node(cpu));
+					GFP_KERNEL, node);
 		zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
-					GFP_KERNEL, cpu_to_node(cpu));
+					GFP_KERNEL, node);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
 		/*
 		 * numa_node_id() works after this.
 		 */
 		if (cpu_present(cpu)) {
-			set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]);
-			set_cpu_numa_mem(cpu,
-				local_memory_node(numa_cpu_lookup_table[cpu]));
+			node = numa_cpu_lookup_table[cpu];
+			set_cpu_numa_node(cpu, node);
+			set_cpu_numa_mem(cpu, local_memory_node(node));
 		}
+#endif
 	}
 
 	/* Init the cpumasks so the boot CPU is related to itself */
-- 
2.17.1


^ permalink raw reply related

* [PATCH 03/11] powerpc/smp: Move powerpc_topology above
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

Just moving the powerpc_topology description above.
This will help in using functions in this file and avoid declarations.

No other functional changes

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 116 +++++++++++++++++++-------------------
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 069ea4b21c6d..24529f6134aa 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -818,6 +818,64 @@ static int init_cpu_l1_cache_map(int cpu)
 	return err;
 }
 
+static bool shared_caches;
+
+#ifdef CONFIG_SCHED_SMT
+/* cpumask of CPUs with asymmetric SMT dependency */
+static int powerpc_smt_flags(void)
+{
+	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+
+	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+		flags |= SD_ASYM_PACKING;
+	}
+	return flags;
+}
+#endif
+
+/*
+ * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
+ * This topology makes it *much* cheaper to migrate tasks between adjacent cores
+ * since the migrated task remains cache hot. We want to take advantage of this
+ * at the scheduler level so an extra topology level is required.
+ */
+static int powerpc_shared_cache_flags(void)
+{
+	return SD_SHARE_PKG_RESOURCES;
+}
+
+/*
+ * We can't just pass cpu_l2_cache_mask() directly because
+ * returns a non-const pointer and the compiler barfs on that.
+ */
+static const struct cpumask *shared_cache_mask(int cpu)
+{
+	if (shared_caches)
+		return cpu_l2_cache_mask(cpu);
+
+	if (has_big_cores)
+		return cpu_smallcore_mask(cpu);
+
+	return cpu_smt_mask(cpu);
+}
+
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *smallcore_smt_mask(int cpu)
+{
+	return cpu_smallcore_mask(cpu);
+}
+#endif
+
+static struct sched_domain_topology_level powerpc_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
 static int init_big_cores(void)
 {
 	int cpu;
@@ -1249,8 +1307,6 @@ static void add_cpu_to_masks(int cpu)
 			set_cpus_related(cpu, i, cpu_core_mask);
 }
 
-static bool shared_caches;
-
 /* Activate a secondary processor. */
 void start_secondary(void *unused)
 {
@@ -1314,62 +1370,6 @@ int setup_profiling_timer(unsigned int multiplier)
 	return 0;
 }
 
-#ifdef CONFIG_SCHED_SMT
-/* cpumask of CPUs with asymmetric SMT dependency */
-static int powerpc_smt_flags(void)
-{
-	int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
-
-	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
-		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
-		flags |= SD_ASYM_PACKING;
-	}
-	return flags;
-}
-#endif
-
-/*
- * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
- * This topology makes it *much* cheaper to migrate tasks between adjacent cores
- * since the migrated task remains cache hot. We want to take advantage of this
- * at the scheduler level so an extra topology level is required.
- */
-static int powerpc_shared_cache_flags(void)
-{
-	return SD_SHARE_PKG_RESOURCES;
-}
-
-/*
- * We can't just pass cpu_l2_cache_mask() directly because
- * returns a non-const pointer and the compiler barfs on that.
- */
-static const struct cpumask *shared_cache_mask(int cpu)
-{
-	if (shared_caches)
-		return cpu_l2_cache_mask(cpu);
-
-	if (has_big_cores)
-		return cpu_smallcore_mask(cpu);
-
-	return cpu_smt_mask(cpu);
-}
-
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *smallcore_smt_mask(int cpu)
-{
-	return cpu_smallcore_mask(cpu);
-}
-#endif
-
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
-	{ NULL, },
-};
-
 void __init smp_cpus_done(unsigned int max_cpus)
 {
 	/*
-- 
2.17.1


^ permalink raw reply related

* [PATCH 04/11] powerpc/smp: Enable small core scheduling sooner
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

Enable small core scheduling as soon as we detect that we are in a
system that supports thread group. Doing so would avoid a redundant
check.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 24529f6134aa..7d430fc536cc 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -892,6 +892,12 @@ static int init_big_cores(void)
 	}
 
 	has_big_cores = true;
+
+#ifdef CONFIG_SCHED_SMT
+	pr_info("Big cores detected. Using small core scheduling\n");
+	powerpc_topology[0].mask = smallcore_smt_mask;
+#endif
+
 	return 0;
 }
 
@@ -1383,12 +1389,6 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
 	dump_numa_cpu_topology();
 
-#ifdef CONFIG_SCHED_SMT
-	if (has_big_cores) {
-		pr_info("Big cores detected but using small core scheduling\n");
-		powerpc_topology[0].mask = smallcore_smt_mask;
-	}
-#endif
 	set_sched_topology(powerpc_topology);
 }
 
-- 
2.17.1


^ permalink raw reply related

* [PATCH 05/11] powerpc/smp: Dont assume l2-cache to be superset of sibling
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

Current code assumes that cpumask of cpus sharing a l2-cache mask will
always be a superset of cpu_sibling_mask.

Lets stop that assumption.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 7d430fc536cc..875f57e41355 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1198,6 +1198,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
 	struct device_node *l2_cache, *np;
 	int i;
 
+	cpumask_set_cpu(cpu, mask_fn(cpu));
 	l2_cache = cpu_to_l2cache(cpu);
 	if (!l2_cache)
 		return false;
@@ -1284,29 +1285,30 @@ static void add_cpu_to_masks(int cpu)
 	 * add it to it's own thread sibling mask.
 	 */
 	cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+	cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 
 	for (i = first_thread; i < first_thread + threads_per_core; i++)
 		if (cpu_online(i))
 			set_cpus_related(i, cpu, cpu_sibling_mask);
 
 	add_cpu_to_smallcore_masks(cpu);
-	/*
-	 * Copy the thread sibling mask into the cache sibling mask
-	 * and mark any CPUs that share an L2 with this CPU.
-	 */
-	for_each_cpu(i, cpu_sibling_mask(cpu))
-		set_cpus_related(cpu, i, cpu_l2_cache_mask);
 	update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
-	/*
-	 * Copy the cache sibling mask into core sibling mask and mark
-	 * any CPUs on the same chip as this CPU.
-	 */
-	for_each_cpu(i, cpu_l2_cache_mask(cpu))
-		set_cpus_related(cpu, i, cpu_core_mask);
+	if (pkg_id == -1) {
+		struct cpumask *(*mask)(int) = cpu_sibling_mask;
+
+		/*
+		 * Copy the sibling mask into core sibling mask and
+		 * mark any CPUs on the same chip as this CPU.
+		 */
+		if (shared_caches)
+			mask = cpu_l2_cache_mask;
+
+		for_each_cpu(i, mask(cpu))
+			set_cpus_related(cpu, i, cpu_core_mask);
 
-	if (pkg_id == -1)
 		return;
+	}
 
 	for_each_cpu(i, cpu_online_mask)
 		if (get_physical_package_id(i) == pkg_id)
-- 
2.17.1


^ permalink raw reply related

* [PATCH 06/11] powerpc/smp: Generalize 2nd sched domain
From: Srikar Dronamraju @ 2020-07-14  4:36 UTC (permalink / raw)
  To: Michael Ellerman
  Cc: Nathan Lynch, Gautham R Shenoy, Oliver OHalloran, Michael Neuling,
	Srikar Dronamraju, Michael Ellerman, Anton Blanchard,
	linuxppc-dev, Nick Piggin
In-Reply-To: <20200714043624.5648-1-srikar@linux.vnet.ibm.com>

Currently "CACHE" domain happens to be the 2nd sched domain as per
powerpc_topology. This domain will collapse if cpumask of l2-cache is
same as SMT domain. However we could generalize this domain such that it
could mean either be a "CACHE" domain or a "BIGCORE" domain.

While setting up the "CACHE" domain, check if shared_cache is already
set.

Cc: linuxppc-dev <linuxppc-dev@lists.ozlabs.org>
Cc: Michael Ellerman <michaele@au1.ibm.com>
Cc: Nick Piggin <npiggin@au1.ibm.com>
Cc: Oliver OHalloran <oliveroh@au1.ibm.com>
Cc: Nathan Lynch <nathanl@linux.ibm.com>
Cc: Michael Neuling <mikey@linux.ibm.com>
Cc: Anton Blanchard <anton@au1.ibm.com>
Cc: Gautham R Shenoy <ego@linux.vnet.ibm.com>
Cc: Vaidyanathan Srinivasan <svaidy@linux.ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
---
 arch/powerpc/kernel/smp.c | 48 +++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 875f57e41355..f8faf75135af 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -85,6 +85,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 EXPORT_SYMBOL_GPL(has_big_cores);
 
+enum {
+#ifdef CONFIG_SCHED_SMT
+	smt_idx,
+#endif
+	bigcore_idx,
+	die_idx,
+};
+
 #define MAX_THREAD_LIST_SIZE	8
 #define THREAD_GROUP_SHARE_L1   1
 struct thread_groups {
@@ -851,13 +859,7 @@ static int powerpc_shared_cache_flags(void)
  */
 static const struct cpumask *shared_cache_mask(int cpu)
 {
-	if (shared_caches)
-		return cpu_l2_cache_mask(cpu);
-
-	if (has_big_cores)
-		return cpu_smallcore_mask(cpu);
-
-	return cpu_smt_mask(cpu);
+	return per_cpu(cpu_l2_cache_map, cpu);
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -867,11 +869,16 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
+static const struct cpumask *cpu_bigcore_mask(int cpu)
+{
+	return cpu_core_mask(cpu);
+}
+
 static struct sched_domain_topology_level powerpc_topology[] = {
 #ifdef CONFIG_SCHED_SMT
 	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
-	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+	{ cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
 	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 	{ NULL, },
 };
@@ -895,7 +902,7 @@ static int init_big_cores(void)
 
 #ifdef CONFIG_SCHED_SMT
 	pr_info("Big cores detected. Using small core scheduling\n");
-	powerpc_topology[0].mask = smallcore_smt_mask;
+	powerpc_topology[smt_idx].mask = smallcore_smt_mask;
 #endif
 
 	return 0;
@@ -1319,7 +1326,6 @@ static void add_cpu_to_masks(int cpu)
 void start_secondary(void *unused)
 {
 	unsigned int cpu = smp_processor_id();
-	struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
 
 	mmgrab(&init_mm);
 	current->active_mm = &init_mm;
@@ -1345,14 +1351,20 @@ void start_secondary(void *unused)
 	/* Update topology CPU masks */
 	add_cpu_to_masks(cpu);
 
-	if (has_big_cores)
-		sibling_mask = cpu_smallcore_mask;
 	/*
 	 * Check for any shared caches. Note that this must be done on a
 	 * per-core basis because one core in the pair might be disabled.
 	 */
-	if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
-		shared_caches = true;
+	if (!shared_caches) {
+		struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
+		struct cpumask *mask = cpu_l2_cache_mask(cpu);
+
+		if (has_big_cores)
+			sibling_mask = cpu_smallcore_mask;
+
+		if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
+			shared_caches = true;
+	}
 
 	set_numa_node(numa_cpu_lookup_table[cpu]);
 	set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
@@ -1390,6 +1402,14 @@ void __init smp_cpus_done(unsigned int max_cpus)
 		smp_ops->bringup_done();
 
 	dump_numa_cpu_topology();
+	if (shared_caches) {
+		pr_info("Using shared cache scheduler topology\n");
+		powerpc_topology[bigcore_idx].mask = shared_cache_mask;
+#ifdef CONFIG_SCHED_DEBUG
+		powerpc_topology[bigcore_idx].name = "CACHE";
+#endif
+		powerpc_topology[bigcore_idx].sd_flags = powerpc_shared_cache_flags;
+	}
 
 	set_sched_topology(powerpc_topology);
 }
-- 
2.17.1


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox