* Re: [PATCH v2 2/2] ima: Free IMA measurement buffer after kexec syscall
From: Greg KH @ 2021-02-05 10:05 UTC (permalink / raw)
To: Lakshmi Ramasubramanian
Cc: sashal, dmitry.kasatkin, linux-kernel, zohar, tyhicks, ebiederm,
linux-integrity, linuxppc-dev, bauerman
In-Reply-To: <20210204174951.25771-2-nramas@linux.microsoft.com>
On Thu, Feb 04, 2021 at 09:49:51AM -0800, Lakshmi Ramasubramanian wrote:
> IMA allocates kernel virtual memory to carry forward the measurement
> list, from the current kernel to the next kernel on kexec system call,
> in ima_add_kexec_buffer() function. This buffer is not freed before
> completing the kexec system call resulting in memory leak.
>
> Add ima_buffer field in "struct kimage" to store the virtual address
> of the buffer allocated for the IMA measurement list.
> Free the memory allocated for the IMA measurement list in
> kimage_file_post_load_cleanup() function.
>
> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
> Suggested-by: Tyler Hicks <tyhicks@linux.microsoft.com>
> Reviewed-by: Thiago Jung Bauermann <bauerman@linux.ibm.com>
> Reviewed-by: Tyler Hicks <tyhicks@linux.microsoft.com>
> Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")
> ---
> include/linux/kexec.h | 5 +++++
> kernel/kexec_file.c | 5 +++++
> security/integrity/ima/ima_kexec.c | 2 ++
> 3 files changed, 12 insertions(+)
<formletter>
This is not the correct way to submit patches for inclusion in the
stable kernel tree. Please read:
https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
for how to do this properly.
</formletter>
^ permalink raw reply
* Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.
From: Fabiano Rosas @ 2021-02-05 13:09 UTC (permalink / raw)
To: Leonardo Bras, Paul Mackerras, Michael Ellerman,
Benjamin Herrenschmidt, Christophe Leroy, Athira Rajeev,
Aneesh Kumar K.V, Leonardo Bras, Jordan Niethe, Nicholas Piggin,
Frederic Weisbecker, Thomas Gleixner, Geert Uytterhoeven
Cc: linuxppc-dev, linux-kernel, kvm-ppc
In-Reply-To: <20210205060643.233481-1-leobras.c@gmail.com>
Leonardo Bras <leobras.c@gmail.com> writes:
> Before guest entry, TBU40 register is changed to reflect guest timebase.
> After exitting guest, the register is reverted to it's original value.
>
> If one tries to get the timestamp from host between those changes, it
> will present an incorrect value.
>
> An example would be trying to add a tracepoint in
> kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> acquired could actually cause the host to crash.
>
> Save the Timebase Offset to PACA and use it on sched_clock() to always
> get the correct timestamp.
>
> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> Suggested-by: Paul Mackerras <paulus@ozlabs.org>
> ---
> Changes since v1:
> - Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
> CONFIG_PPC_BOOK3S_64 are defined.
> ---
> arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
> arch/powerpc/kernel/asm-offsets.c | 1 +
> arch/powerpc/kernel/time.c | 8 +++++++-
> arch/powerpc/kvm/book3s_hv.c | 2 ++
> arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 ++
> 5 files changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
> index 078f4648ea27..e2c12a10eed2 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> @@ -131,6 +131,7 @@ struct kvmppc_host_state {
> u64 cfar;
> u64 ppr;
> u64 host_fscr;
> + u64 tb_offset; /* Timebase offset: keeps correct
> timebase while on guest */
Couldn't you use the vc->tb_offset_applied for this? We have a reference
for the vcore in the hstate already.
> #endif
> };
>
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index b12d7c049bfe..0beb8fdc6352 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -706,6 +706,7 @@ int main(void)
> HSTATE_FIELD(HSTATE_CFAR, cfar);
> HSTATE_FIELD(HSTATE_PPR, ppr);
> HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
> + HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
> #endif /* CONFIG_PPC_BOOK3S_64 */
>
> #else /* CONFIG_PPC_BOOK3S */
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index 67feb3524460..f27f0163792b 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -699,7 +699,13 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
> */
> notrace unsigned long long sched_clock(void)
> {
> - return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
> + u64 tb = get_tb() - boot_tb;
> +
> +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
> + tb -= local_paca->kvm_hstate.tb_offset;
> +#endif
> +
> + return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
> }
>
>
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index b3731572295e..c08593c63353 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
> if ((tb & 0xffffff) < (new_tb & 0xffffff))
> mtspr(SPRN_TBU40, new_tb + 0x1000000);
> vc->tb_offset_applied = vc->tb_offset;
> + local_paca->kvm_hstate.tb_offset = vc->tb_offset;
> }
>
> if (vc->pcr)
> @@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
> if ((tb & 0xffffff) < (new_tb & 0xffffff))
> mtspr(SPRN_TBU40, new_tb + 0x1000000);
> vc->tb_offset_applied = 0;
> + local_paca->kvm_hstate.tb_offset = 0;
> }
>
> mtspr(SPRN_HDEC, 0x7fffffff);
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index b73140607875..8f7a9f7f4ee6 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -632,6 +632,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
> cmpdi r8,0
> beq 37f
> std r8, VCORE_TB_OFFSET_APPL(r5)
> + std r8, HSTATE_TB_OFFSET(r13)
> mftb r6 /* current host timebase */
> add r8,r8,r6
> mtspr SPRN_TBU40,r8 /* update upper 40 bits */
> @@ -1907,6 +1908,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> beq 17f
> li r0, 0
> std r0, VCORE_TB_OFFSET_APPL(r5)
> + std r0, HSTATE_TB_OFFSET(r13)
> mftb r6 /* current guest timebase */
> subf r8,r8,r6
> mtspr SPRN_TBU40,r8 /* update upper 40 bits */
^ permalink raw reply
* Re: [PATCH] powerpc/pseries/dlpar: handle ibm, configure-connector delay status
From: Nathan Lynch @ 2021-02-05 13:15 UTC (permalink / raw)
To: linuxppc-dev; +Cc: tyreld, brking, Laurent Dufour, Scott Cheloha
In-Reply-To: <20210107025900.410369-1-nathanl@linux.ibm.com>
Nathan Lynch <nathanl@linux.ibm.com> writes:
> dlpar_configure_connector() has two problems in its handling of
> ibm,configure-connector's return status:
>
> 1. When the status is -2 (busy, call again), we call
> ibm,configure-connector again immediately without checking whether
> to schedule, which can result in monopolizing the CPU.
> 2. Extended delay status (9900..9905) goes completely unhandled,
> causing the configuration to unnecessarily terminate.
>
> Fix both of these issues by using rtas_busy_delay().
>
> Fixes: ab519a011caa ("powerpc/pseries: Kernel DLPAR Infrastructure")
> Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
Just following up and adding some people to cc in hopes of getting some
review for this.
> ---
> arch/powerpc/platforms/pseries/dlpar.c | 7 +++----
> 1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
> index 16e86ba8aa20..f6b7749d6ada 100644
> --- a/arch/powerpc/platforms/pseries/dlpar.c
> +++ b/arch/powerpc/platforms/pseries/dlpar.c
> @@ -127,7 +127,6 @@ void dlpar_free_cc_nodes(struct device_node *dn)
> #define NEXT_PROPERTY 3
> #define PREV_PARENT 4
> #define MORE_MEMORY 5
> -#define CALL_AGAIN -2
> #define ERR_CFG_USE -9003
>
> struct device_node *dlpar_configure_connector(__be32 drc_index,
> @@ -168,6 +167,9 @@ struct device_node *dlpar_configure_connector(__be32 drc_index,
>
> spin_unlock(&rtas_data_buf_lock);
>
> + if (rtas_busy_delay(rc))
> + continue;
> +
> switch (rc) {
> case COMPLETE:
> break;
> @@ -216,9 +218,6 @@ struct device_node *dlpar_configure_connector(__be32 drc_index,
> last_dn = last_dn->parent;
> break;
>
> - case CALL_AGAIN:
> - break;
> -
> case MORE_MEMORY:
> case ERR_CFG_USE:
> default:
> --
> 2.29.2
^ permalink raw reply
* Re: [PATCH] powerpc/kuap: Allow kernel thread to access userspace after kthread_use_mm
From: Aneesh Kumar K.V @ 2021-02-05 13:49 UTC (permalink / raw)
To: Zorro Lang; +Cc: Jens Axboe, linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210205095820.GI14354@localhost.localdomain>
Zorro Lang <zlang@redhat.com> writes:
....
> ...
> [ 530.180466] run fstests generic/617 at 2021-02-05 03:41:10
> [ 530.707969] ------------[ cut here ]------------
> [ 530.708006] kernel BUG at arch/powerpc/include/asm/book3s/64/kup.h:207!
> [ 530.708013] Oops: Exception in kernel mode, sig: 5 [#1]
> [ 530.708018] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
> [ 530.708022] Modules linked in: bonding rfkill sunrpc uio_pdrv_genirq pseries_rng uio drm fuse drm_panel_orientation_quirks ip_tables xfs libcrc32c sd_mod t10_pi ibmvscsi ibmveth scsi_trans
> port_srp xts vmx_crypto
> [ 530.708049] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Not tainted 5.11.0-r
ok so we call current_thread_amr() with kthread.
commit ae33fb7b069ebb41e32f55ae397c887031e47472
Author: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Date: Fri Feb 5 19:11:49 2021 +0530
The other stack that matters is
...
[ 530.710838] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Tainted: G D 5.11.0-rc6+ #3
....
NIP [c0000000000aa0c8] pkey_access_permitted+0x28/0x90
LR [c0000000004b9278] gup_pte_range+0x188/0x420
--- interrupt: 700
[c00000001c4ef3f0] [0000000000000000] 0x0 (unreliable)
[c00000001c4ef490] [c0000000004bd39c] gup_pgd_range+0x3ac/0xa20
[c00000001c4ef5a0] [c0000000004bdd44] internal_get_user_pages_fast+0x334/0x410
[c00000001c4ef620] [c000000000852028] iov_iter_get_pages+0xf8/0x5c0
[c00000001c4ef6a0] [c0000000007da44c] bio_iov_iter_get_pages+0xec/0x700
[c00000001c4ef770] [c0000000006a325c] iomap_dio_bio_actor+0x2ac/0x4f0
[c00000001c4ef810] [c00000000069cd94] iomap_apply+0x2b4/0x740
[c00000001c4ef920] [c0000000006a38b8] __iomap_dio_rw+0x238/0x5c0
[c00000001c4ef9d0] [c0000000006a3c60] iomap_dio_rw+0x20/0x80
[c00000001c4ef9f0] [c008000001927a30] xfs_file_dio_aio_write+0x1f8/0x650 [xfs]
[c00000001c4efa60] [c0080000019284dc] xfs_file_write_iter+0xc4/0x130 [xfs]
[c00000001c4efa90] [c000000000669984] io_write+0x104/0x4b0
[c00000001c4efbb0] [c00000000066cea4] io_issue_sqe+0x3d4/0xf50
[c00000001c4efc60] [c000000000670200] io_wq_submit_work+0xb0/0x2f0
[c00000001c4efcb0] [c000000000674268] io_worker_handle_work+0x248/0x4a0
[c00000001c4efd30] [c0000000006746e8] io_wqe_worker+0x228/0x2a0
[c00000001c4efda0] [c00000000019d994] kthread+0x1b4/0x1c0
diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
index 2064621ae7b6..21e59c1f0d67 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -204,14 +204,16 @@ DECLARE_STATIC_KEY_FALSE(uaccess_flush_key);
static inline u64 current_thread_amr(void)
{
- VM_BUG_ON(!current->thread.regs);
- return current->thread.regs->amr;
+ if (current->thread.regs)
+ return current->thread.regs->amr;
+ return 0;
}
static inline u64 current_thread_iamr(void)
{
- VM_BUG_ON(!current->thread.regs);
- return current->thread.regs->iamr;
+ if (current->thread.regs)
+ return current->thread.regs->iamr;
+ return 0;
}
#endif /* CONFIG_PPC_PKEY */
^ permalink raw reply related
* Re: [PATCH 2/7] ASoC: fsl_rpmsg: Add CPU DAI driver for audio base on rpmsg
From: Mark Brown @ 2021-02-05 14:02 UTC (permalink / raw)
To: Shengjiu Wang
Cc: devicetree, alsa-devel, timur, lgirdwood, linuxppc-dev, Xiubo.Lee,
linux-kernel, tiwai, nicoleotsuka, robh+dt, perex, festevam
In-Reply-To: <1612508250-10586-3-git-send-email-shengjiu.wang@nxp.com>
[-- Attachment #1: Type: text/plain, Size: 362 bytes --]
On Fri, Feb 05, 2021 at 02:57:25PM +0800, Shengjiu Wang wrote:
> This is a dummy cpu dai driver for rpmsg audio use case,
> which is mainly used for getting the user's configuration
This is actually doing stuff, it's not a dummy driver.
> +static int fsl_rpmsg_remove(struct platform_device *pdev)
> +{
> + return 0;
> +}
If this isn't needed just remove it.
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
* Re: [PATCH 4/7] ASoC: imx-audio-rpmsg: Add rpmsg_driver for audio channel
From: Mark Brown @ 2021-02-05 14:25 UTC (permalink / raw)
To: Shengjiu Wang
Cc: devicetree, alsa-devel, timur, lgirdwood, linuxppc-dev, Xiubo.Lee,
linux-kernel, tiwai, nicoleotsuka, robh+dt, perex, festevam
In-Reply-To: <1612508250-10586-5-git-send-email-shengjiu.wang@nxp.com>
[-- Attachment #1: Type: text/plain, Size: 584 bytes --]
On Fri, Feb 05, 2021 at 02:57:27PM +0800, Shengjiu Wang wrote:
> + /* TYPE C is notification from M core */
> + if (r_msg->header.type == MSG_TYPE_C) {
> + if (r_msg->header.cmd == TX_PERIOD_DONE) {
> + } else if (r_msg->header.cmd == RX_PERIOD_DONE) {
A switch statement would be clearer and more extensible...
> + /* TYPE B is response msg */
> + if (r_msg->header.type == MSG_TYPE_B) {
> + memcpy(&info->r_msg, r_msg, sizeof(struct rpmsg_r_msg));
> + complete(&info->cmd_complete);
> + }
...and make this flow clearer for example. Do we need to warn on
unknown messages?
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
* Re: [PATCH 5/7] ASoC: imx-pcm-rpmsg: Add platform driver for audio base on rpmsg
From: Mark Brown @ 2021-02-05 14:58 UTC (permalink / raw)
To: Shengjiu Wang
Cc: devicetree, alsa-devel, timur, lgirdwood, linuxppc-dev, Xiubo.Lee,
linux-kernel, tiwai, nicoleotsuka, robh+dt, perex, festevam
In-Reply-To: <1612508250-10586-6-git-send-email-shengjiu.wang@nxp.com>
[-- Attachment #1: Type: text/plain, Size: 1431 bytes --]
On Fri, Feb 05, 2021 at 02:57:28PM +0800, Shengjiu Wang wrote:
> + if (params_format(params) == SNDRV_PCM_FORMAT_S16_LE)
> + msg->s_msg.param.format = RPMSG_S16_LE;
> + else if (params_format(params) == SNDRV_PCM_FORMAT_S24_LE)
Again this should be a switch statement.
> + if (params_channels(params) == 1)
> + msg->s_msg.param.channels = RPMSG_CH_LEFT;
> + else
> + msg->s_msg.param.channels = RPMSG_CH_STEREO;
Shouldn't this be reporting an error if the number of channels is more
than 2?
> + /*
> + * if the data in the buffer is less than one period
> + * send message immediately.
> + * if there is more than one period data, delay one
> + * period (timer) to send the message.
> + */
> + if ((avail - writen_num * period_size) <= period_size) {
> + imx_rpmsg_insert_workqueue(substream, msg, info);
> + } else if (rpmsg->force_lpa && !timer_pending(timer)) {
> + int time_msec;
> +
> + time_msec = (int)(runtime->period_size * 1000 / runtime->rate);
> + mod_timer(timer, jiffies + msecs_to_jiffies(time_msec));
> + }
The comment here is at least confusing - why would we not send a full
buffer immediately if we have one? This sounds like it's the opposite
way round to what we'd do if we were trying to cut down the number of
messages. It might help to say which buffer and where?
> + /**
> + * Every work in the work queue, first we check if there
/** comments are only for kerneldoc.
[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply
* Re: [PATCH 1/2] PCI/AER: Disable AER interrupt during suspend
From: Kai-Heng Feng @ 2021-02-05 15:17 UTC (permalink / raw)
To: Bjorn Helgaas
Cc: Joerg Roedel,
open list:PCI ENHANCED ERROR HANDLING (EEH) FOR POWERPC,
open list:PCI SUBSYSTEM, open list, Lalithambika Krishnakumar,
Alex Williamson, Oliver O'Halloran, Bjorn Helgaas,
Mika Westerberg, Lu Baolu
In-Reply-To: <20210204232758.GA125392@bjorn-Precision-5520>
On Fri, Feb 5, 2021 at 7:28 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
>
> [+cc Alex]
>
> On Thu, Jan 28, 2021 at 12:09:37PM +0800, Kai-Heng Feng wrote:
> > On Thu, Jan 28, 2021 at 4:51 AM Bjorn Helgaas <helgaas@kernel.org> wrote:
> > > On Thu, Jan 28, 2021 at 01:31:00AM +0800, Kai-Heng Feng wrote:
> > > > Commit 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in
> > > > hint") enables ACS, and some platforms lose its NVMe after resume from
> > > > firmware:
> > > > [ 50.947816] pcieport 0000:00:1b.0: DPC: containment event, status:0x1f01 source:0x0000
> > > > [ 50.947817] pcieport 0000:00:1b.0: DPC: unmasked uncorrectable error detected
> > > > [ 50.947829] pcieport 0000:00:1b.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Receiver ID)
> > > > [ 50.947830] pcieport 0000:00:1b.0: device [8086:06ac] error status/mask=00200000/00010000
> > > > [ 50.947831] pcieport 0000:00:1b.0: [21] ACSViol (First)
> > > > [ 50.947841] pcieport 0000:00:1b.0: AER: broadcast error_detected message
> > > > [ 50.947843] nvme nvme0: frozen state error detected, reset controller
> > > >
> > > > It happens right after ACS gets enabled during resume.
> > > >
> > > > To prevent that from happening, disable AER interrupt and enable it on
> > > > system suspend and resume, respectively.
> > >
> > > Lots of questions here. Maybe this is what we'll end up doing, but I
> > > am curious about why the error is reported in the first place.
> > >
> > > Is this a consequence of the link going down and back up?
> >
> > Could be. From the observations, it only happens when firmware suspend
> > (S3) is used.
> > Maybe it happens when it's gets powered up, but I don't have equipment
> > to debug at hardware level.
> >
> > If we use non-firmware suspend method, enabling ACS after resume won't
> > trip AER and DPC.
> >
> > > Is it consequence of the device doing a DMA when it shouldn't?
> >
> > If it's doing DMA while suspending, the same error should also happen
> > after NVMe is suspended and before PCIe port suspending.
> > Furthermore, if non-firmware suspend method is used, there's so such
> > issue, so less likely to be any DMA operation.
> >
> > > Are we doing something in the wrong order during suspend? Or maybe
> > > resume, since I assume the error is reported during resume?
> >
> > Yes the error is reported during resume. The suspend/resume order
> > seems fine as non-firmware suspend doesn't have this issue.
>
> I really feel like we need a better understanding of what's going on
> here. Disabling the AER interrupt is like closing our eyes and
> pretending that because we don't see it, it didn't happen.
>
> An ACS error is triggered by a DMA, right? I'm assuming an MMIO
> access from the CPU wouldn't trigger this error. And it sounds like
> the error is triggered before we even start running the driver after
> resume.
>
> If we're powering up an NVMe device from D3cold and it DMAs before the
> driver touches it, something would be seriously broken. I doubt
> that's what's happening. Maybe a device could resume some previously
> programmed DMA after powering up from D3hot.
I am not that familiar with PCIe ACS/AER/DPC, so I can't really answer
questions you raised.
PCIe spec doesn't say the suspend/resume order is also not helping here.
However, I really think it's a system firmware issue.
I've seen some suspend-to-idle platforms with NVMe can reach D3cold,
those are unaffected.
>
> Or maybe the error occurred on suspend, like if the device wasn't
> quiesced or something, but we didn't notice it until resume? The
> AER error status bits are RW1CS, which means they can be preserved
> across hot/warm/cold resets.
>
> Can you instrument the code to see whether the AER error status bit is
> set before enabling ACS? I'm not sure that merely enabling ACS (I
> assume you mean pci_std_enable_acs(), where we write PCI_ACS_CTRL)
> should cause an interrupt for a previously-logged error. I suspect
> that could happen when enabling *AER*, but I wouldn't think it would
> happen when enabling *ACS*.
Diff to print AER status:
https://bugzilla.kernel.org/show_bug.cgi?id=209149#c11
And dmesg:
https://bugzilla.kernel.org/show_bug.cgi?id=209149#c12
Looks like the read before suspend and after resume are both fine.
>
> Does this error happen on multiple machines from different vendors?
> Wondering if it could be a BIOS issue, e.g., BIOS not cleaning up
> after it did something to cause an error.
AFAIK, systems from both HP and Dell are affected.
I was told that the reference platform from Intel is using
suspend-to-idle, but vendors changed the sleep method to S3 to have
lower power consumption to pass regulation.
Kai-Heng
>
> > > If we *do* take the error, why doesn't DPC recovery work?
> >
> > It works for the root port, but not for the NVMe drive:
> > [ 50.947816] pcieport 0000:00:1b.0: DPC: containment event,
> > status:0x1f01 source:0x0000
> > [ 50.947817] pcieport 0000:00:1b.0: DPC: unmasked uncorrectable error detected
> > [ 50.947829] pcieport 0000:00:1b.0: PCIe Bus Error:
> > severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Receiver
> > ID)
> > [ 50.947830] pcieport 0000:00:1b.0: device [8086:06ac] error
> > status/mask=00200000/00010000
> > [ 50.947831] pcieport 0000:00:1b.0: [21] ACSViol (First)
> > [ 50.947841] pcieport 0000:00:1b.0: AER: broadcast error_detected message
> > [ 50.947843] nvme nvme0: frozen state error detected, reset controller
> > [ 50.948400] ACPI: EC: event unblocked
> > [ 50.948432] xhci_hcd 0000:00:14.0: PME# disabled
> > [ 50.948444] xhci_hcd 0000:00:14.0: enabling bus mastering
> > [ 50.949056] pcieport 0000:00:1b.0: PME# disabled
> > [ 50.949068] pcieport 0000:00:1c.0: PME# disabled
> > [ 50.949416] e1000e 0000:00:1f.6: PME# disabled
> > [ 50.949463] e1000e 0000:00:1f.6: enabling bus mastering
> > [ 50.951606] sd 0:0:0:0: [sda] Starting disk
> > [ 50.951610] nvme 0000:01:00.0: can't change power state from D3hot
> > to D0 (config space inaccessible)
> > [ 50.951730] nvme nvme0: Removing after probe failure status: -19
> > [ 50.952360] nvme nvme0: failed to set APST feature (-19)
> > [ 50.971136] snd_hda_intel 0000:00:1f.3: PME# disabled
> > [ 51.089330] pcieport 0000:00:1b.0: AER: broadcast resume message
> > [ 51.089345] pcieport 0000:00:1b.0: AER: device recovery successful
> >
> > But I think why recovery doesn't work for NVMe is for another discussion...
> >
> > Kai-Heng
> >
> > >
> > > > Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=209149
> > > > Fixes: 50310600ebda ("iommu/vt-d: Enable PCI ACS for platform opt in hint")
> > > > Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
> > > > ---
> > > > drivers/pci/pcie/aer.c | 18 ++++++++++++++++++
> > > > 1 file changed, 18 insertions(+)
> > > >
> > > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> > > > index 77b0f2c45bc0..0e9a85530ae6 100644
> > > > --- a/drivers/pci/pcie/aer.c
> > > > +++ b/drivers/pci/pcie/aer.c
> > > > @@ -1365,6 +1365,22 @@ static int aer_probe(struct pcie_device *dev)
> > > > return 0;
> > > > }
> > > >
> > > > +static int aer_suspend(struct pcie_device *dev)
> > > > +{
> > > > + struct aer_rpc *rpc = get_service_data(dev);
> > > > +
> > > > + aer_disable_rootport(rpc);
> > > > + return 0;
> > > > +}
> > > > +
> > > > +static int aer_resume(struct pcie_device *dev)
> > > > +{
> > > > + struct aer_rpc *rpc = get_service_data(dev);
> > > > +
> > > > + aer_enable_rootport(rpc);
> > > > + return 0;
> > > > +}
> > > > +
> > > > /**
> > > > * aer_root_reset - reset Root Port hierarchy, RCEC, or RCiEP
> > > > * @dev: pointer to Root Port, RCEC, or RCiEP
> > > > @@ -1437,6 +1453,8 @@ static struct pcie_port_service_driver aerdriver = {
> > > > .service = PCIE_PORT_SERVICE_AER,
> > > >
> > > > .probe = aer_probe,
> > > > + .suspend = aer_suspend,
> > > > + .resume = aer_resume,
> > > > .remove = aer_remove,
> > > > };
> > > >
> > > > --
> > > > 2.29.2
> > > >
^ permalink raw reply
* Re: [PATCH] powerpc/kuap: Allow kernel thread to access userspace after kthread_use_mm
From: Zorro Lang @ 2021-02-05 16:12 UTC (permalink / raw)
To: Aneesh Kumar K.V; +Cc: Jens Axboe, linuxppc-dev, Nicholas Piggin
In-Reply-To: <871rdur5e7.fsf@linux.ibm.com>
On Fri, Feb 05, 2021 at 07:19:36PM +0530, Aneesh Kumar K.V wrote:
> Zorro Lang <zlang@redhat.com> writes:
>
> ....
>
> > ...
> > [ 530.180466] run fstests generic/617 at 2021-02-05 03:41:10
> > [ 530.707969] ------------[ cut here ]------------
> > [ 530.708006] kernel BUG at arch/powerpc/include/asm/book3s/64/kup.h:207!
> > [ 530.708013] Oops: Exception in kernel mode, sig: 5 [#1]
> > [ 530.708018] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
> > [ 530.708022] Modules linked in: bonding rfkill sunrpc uio_pdrv_genirq pseries_rng uio drm fuse drm_panel_orientation_quirks ip_tables xfs libcrc32c sd_mod t10_pi ibmvscsi ibmveth scsi_trans
> > port_srp xts vmx_crypto
> > [ 530.708049] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Not tainted 5.11.0-r
>
> ok so we call current_thread_amr() with kthread.
>
> commit ae33fb7b069ebb41e32f55ae397c887031e47472
> Author: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> Date: Fri Feb 5 19:11:49 2021 +0530
>
>
> The other stack that matters is
> ...
> [ 530.710838] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Tainted: G D 5.11.0-rc6+ #3
> ....
>
> NIP [c0000000000aa0c8] pkey_access_permitted+0x28/0x90
> LR [c0000000004b9278] gup_pte_range+0x188/0x420
> --- interrupt: 700
> [c00000001c4ef3f0] [0000000000000000] 0x0 (unreliable)
> [c00000001c4ef490] [c0000000004bd39c] gup_pgd_range+0x3ac/0xa20
> [c00000001c4ef5a0] [c0000000004bdd44] internal_get_user_pages_fast+0x334/0x410
> [c00000001c4ef620] [c000000000852028] iov_iter_get_pages+0xf8/0x5c0
> [c00000001c4ef6a0] [c0000000007da44c] bio_iov_iter_get_pages+0xec/0x700
> [c00000001c4ef770] [c0000000006a325c] iomap_dio_bio_actor+0x2ac/0x4f0
> [c00000001c4ef810] [c00000000069cd94] iomap_apply+0x2b4/0x740
> [c00000001c4ef920] [c0000000006a38b8] __iomap_dio_rw+0x238/0x5c0
> [c00000001c4ef9d0] [c0000000006a3c60] iomap_dio_rw+0x20/0x80
> [c00000001c4ef9f0] [c008000001927a30] xfs_file_dio_aio_write+0x1f8/0x650 [xfs]
> [c00000001c4efa60] [c0080000019284dc] xfs_file_write_iter+0xc4/0x130 [xfs]
> [c00000001c4efa90] [c000000000669984] io_write+0x104/0x4b0
> [c00000001c4efbb0] [c00000000066cea4] io_issue_sqe+0x3d4/0xf50
> [c00000001c4efc60] [c000000000670200] io_wq_submit_work+0xb0/0x2f0
> [c00000001c4efcb0] [c000000000674268] io_worker_handle_work+0x248/0x4a0
> [c00000001c4efd30] [c0000000006746e8] io_wqe_worker+0x228/0x2a0
> [c00000001c4efda0] [c00000000019d994] kthread+0x1b4/0x1c0
>
> diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
> index 2064621ae7b6..21e59c1f0d67 100644
> --- a/arch/powerpc/include/asm/book3s/64/kup.h
> +++ b/arch/powerpc/include/asm/book3s/64/kup.h
> @@ -204,14 +204,16 @@ DECLARE_STATIC_KEY_FALSE(uaccess_flush_key);
>
> static inline u64 current_thread_amr(void)
> {
> - VM_BUG_ON(!current->thread.regs);
> - return current->thread.regs->amr;
> + if (current->thread.regs)
> + return current->thread.regs->amr;
> + return 0;
> }
>
> static inline u64 current_thread_iamr(void)
> {
> - VM_BUG_ON(!current->thread.regs);
> - return current->thread.regs->iamr;
> + if (current->thread.regs)
> + return current->thread.regs->iamr;
> + return 0;
> }
> #endif /* CONFIG_PPC_PKEY */
This change can help to avoid above regression issue:
# ./check generic/013 generic/616 generic/617
FSTYP -- xfs (debug)
PLATFORM -- Linux/ppc64le ibm-p9z-xx-xxx 5.11.0-rc6+ #4 SMP Fri Feb 5 10:22:14 EST 2021
MKFS_OPTIONS -- -f -m crc=1,finobt=1,rmapbt=1,reflink=1,inobtcount=1,bigtime=1 /dev/sda3
MOUNT_OPTIONS -- -o context=system_u:object_r:root_t:s0 /dev/sda3 /mnt/xfstests/scratch
generic/013 37s ... 42s
generic/616 166s
generic/617 16s ... 20s
Ran: generic/013 generic/616 generic/617
Passed all 3 tests
>
>
^ permalink raw reply
* [PATCH] KVM: PPC: Don't always report hash MMU capability for P9 < DD2.2
From: Fabiano Rosas @ 2021-02-05 16:41 UTC (permalink / raw)
To: kvm-ppc; +Cc: linuxppc-dev, npiggin
In-Reply-To: <20210118062809.1430920-2-npiggin@gmail.com>
These machines don't support running both MMU types at the same time,
so remove the KVM_CAP_PPC_MMU_HASH_V3 capability when the host is
using Radix MMU.
Signed-off-by: Fabiano Rosas <farosas@linux.ibm.com>
---
arch/powerpc/include/asm/kvm_ppc.h | 1 +
arch/powerpc/kvm/book3s_hv.c | 10 ++++++++++
arch/powerpc/kvm/powerpc.c | 3 +--
3 files changed, 12 insertions(+), 2 deletions(-)
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0a056c64c317..b36abc89baf3 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -314,6 +314,7 @@ struct kvmppc_ops {
int size);
int (*enable_svm)(struct kvm *kvm);
int (*svm_off)(struct kvm *kvm);
+ bool (*hash_v3_possible)(void);
};
extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6f612d240392..d20c0682cae5 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5599,6 +5599,15 @@ static int kvmhv_svm_off(struct kvm *kvm)
return ret;
}
+static bool kvmppc_hash_v3_possible(void)
+{
+ if (radix_enabled() && no_mixing_hpt_and_radix)
+ return false;
+
+ return cpu_has_feature(CPU_FTR_ARCH_300) &&
+ cpu_has_feature(CPU_FTR_HVMODE);
+}
+
static struct kvmppc_ops kvm_ops_hv = {
.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -5642,6 +5651,7 @@ static struct kvmppc_ops kvm_ops_hv = {
.store_to_eaddr = kvmhv_store_to_eaddr,
.enable_svm = kvmhv_enable_svm,
.svm_off = kvmhv_svm_off,
+ .hash_v3_possible = kvmppc_hash_v3_possible,
};
static int kvm_init_subcore_bitmap(void)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index cf52d26f49cd..b9fb2f20f879 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -611,8 +611,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = !!(hv_enabled && radix_enabled());
break;
case KVM_CAP_PPC_MMU_HASH_V3:
- r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
- cpu_has_feature(CPU_FTR_HVMODE));
+ r = !!(hv_enabled && kvmppc_hv_ops->hash_v3_possible());
break;
case KVM_CAP_PPC_NESTED_HV:
r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&
--
2.29.2
^ permalink raw reply related
* Re: [PATCH v2 1/2] ima: Free IMA measurement buffer on error
From: Lakshmi Ramasubramanian @ 2021-02-05 17:39 UTC (permalink / raw)
To: Greg KH
Cc: sashal, dmitry.kasatkin, linux-kernel, zohar, tyhicks, ebiederm,
linux-integrity, linuxppc-dev, bauerman
In-Reply-To: <YB0YdqbbdAdbEOQw@kroah.com>
On 2/5/21 2:05 AM, Greg KH wrote:
> On Thu, Feb 04, 2021 at 09:49:50AM -0800, Lakshmi Ramasubramanian wrote:
>> IMA allocates kernel virtual memory to carry forward the measurement
>> list, from the current kernel to the next kernel on kexec system call,
>> in ima_add_kexec_buffer() function. In error code paths this memory
>> is not freed resulting in memory leak.
>>
>> Free the memory allocated for the IMA measurement list in
>> the error code paths in ima_add_kexec_buffer() function.
>>
>> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
>> Suggested-by: Tyler Hicks <tyhicks@linux.microsoft.com>
>> Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")
>> ---
>> security/integrity/ima/ima_kexec.c | 1 +
>> 1 file changed, 1 insertion(+)
>
> <formletter>
>
> This is not the correct way to submit patches for inclusion in the
> stable kernel tree. Please read:
> https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
> for how to do this properly.
>
> </formletter>
>
Thanks for the info Greg.
I will re-submit the two patches in the proper format.
-lakshmi
^ permalink raw reply
* Re: [PATCH] mm/pmem: Avoid inserting hugepage PTE entry with fsdax if hugepage support is disabled
From: Dan Williams @ 2021-02-05 17:47 UTC (permalink / raw)
To: Aneesh Kumar K.V
Cc: Jan Kara, linux-nvdimm, Linux MM, Andrew Morton, linuxppc-dev,
Kirill A . Shutemov
In-Reply-To: <20210205023956.417587-1-aneesh.kumar@linux.ibm.com>
[ add Andrew ]
On Thu, Feb 4, 2021 at 6:40 PM Aneesh Kumar K.V
<aneesh.kumar@linux.ibm.com> wrote:
>
> Differentiate between hardware not supporting hugepages and user disabling THP
> via 'echo never > /sys/kernel/mm/transparent_hugepage/enabled'
>
> For the devdax namespace, the kernel handles the above via the
> supported_alignment attribute and failing to initialize the namespace
> if the namespace align value is not supported on the platform.
>
> For the fsdax namespace, the kernel will continue to initialize
> the namespace. This can result in the kernel creating a huge pte
> entry even though the hardware don't support the same.
>
> We do want hugepage support with pmem even if the end-user disabled THP
> via sysfs file (/sys/kernel/mm/transparent_hugepage/enabled). Hence
> differentiate between hardware/firmware lacking support vs user-controlled
> disable of THP and prevent a huge fault if the hardware lacks hugepage
> support.
Looks good to me.
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
I assume this will go through Andrew.
^ permalink raw reply
* Re: [PATCH v2 1/2] ima: Free IMA measurement buffer on error
From: Mimi Zohar @ 2021-02-05 17:49 UTC (permalink / raw)
To: Lakshmi Ramasubramanian, Greg KH
Cc: sashal, dmitry.kasatkin, linux-kernel, tyhicks, ebiederm,
linux-integrity, linuxppc-dev, bauerman
In-Reply-To: <7000d128-272e-3654-8480-e46bf7dfad74@linux.microsoft.com>
On Fri, 2021-02-05 at 09:39 -0800, Lakshmi Ramasubramanian wrote:
> On 2/5/21 2:05 AM, Greg KH wrote:
> > On Thu, Feb 04, 2021 at 09:49:50AM -0800, Lakshmi Ramasubramanian wrote:
> >> IMA allocates kernel virtual memory to carry forward the measurement
> >> list, from the current kernel to the next kernel on kexec system call,
> >> in ima_add_kexec_buffer() function. In error code paths this memory
> >> is not freed resulting in memory leak.
> >>
> >> Free the memory allocated for the IMA measurement list in
> >> the error code paths in ima_add_kexec_buffer() function.
> >>
> >> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
> >> Suggested-by: Tyler Hicks <tyhicks@linux.microsoft.com>
> >> Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")
> >> ---
> >> security/integrity/ima/ima_kexec.c | 1 +
> >> 1 file changed, 1 insertion(+)
> >
> > <formletter>
> >
> > This is not the correct way to submit patches for inclusion in the
> > stable kernel tree. Please read:
> > https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
> > for how to do this properly.
> >
> > </formletter>
> >
>
> Thanks for the info Greg.
>
> I will re-submit the two patches in the proper format.
No need. I'm testing these patches now. I'm not exactly sure what the
problem is. Stable wasn't Cc'ed. Is it that you sent the patch
directly to Greg or added "Fixes"?
thanks,
Mimi
^ permalink raw reply
* Re: [PATCH v2 1/2] ima: Free IMA measurement buffer on error
From: Lakshmi Ramasubramanian @ 2021-02-05 17:59 UTC (permalink / raw)
To: Mimi Zohar, Greg KH
Cc: sashal, dmitry.kasatkin, linux-kernel, tyhicks, ebiederm,
linux-integrity, linuxppc-dev, bauerman
In-Reply-To: <6a5b7a1767265122d21f185c81399692d12191f4.camel@linux.ibm.com>
On 2/5/21 9:49 AM, Mimi Zohar wrote:
Hi Mimi,
> On Fri, 2021-02-05 at 09:39 -0800, Lakshmi Ramasubramanian wrote:
>> On 2/5/21 2:05 AM, Greg KH wrote:
>>> On Thu, Feb 04, 2021 at 09:49:50AM -0800, Lakshmi Ramasubramanian wrote:
>>>> IMA allocates kernel virtual memory to carry forward the measurement
>>>> list, from the current kernel to the next kernel on kexec system call,
>>>> in ima_add_kexec_buffer() function. In error code paths this memory
>>>> is not freed resulting in memory leak.
>>>>
>>>> Free the memory allocated for the IMA measurement list in
>>>> the error code paths in ima_add_kexec_buffer() function.
>>>>
>>>> Signed-off-by: Lakshmi Ramasubramanian <nramas@linux.microsoft.com>
>>>> Suggested-by: Tyler Hicks <tyhicks@linux.microsoft.com>
>>>> Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")
>>>> ---
>>>> security/integrity/ima/ima_kexec.c | 1 +
>>>> 1 file changed, 1 insertion(+)
>>>
>>> <formletter>
>>>
>>> This is not the correct way to submit patches for inclusion in the
>>> stable kernel tree. Please read:
>>> https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
>>> for how to do this properly.
>>>
>>> </formletter>
>>>
>>
>> Thanks for the info Greg.
>>
>> I will re-submit the two patches in the proper format.
>
> No need. I'm testing these patches now. I'm not exactly sure what the
> problem is. Stable wasn't Cc'ed. Is it that you sent the patch
> directly to Greg or added "Fixes"?
>
I had not Cced stable, but had "Fixes" tag in the patch.
Fixes: 7b8589cc29e7 ("ima: on soft reboot, save the measurement list")
The problem is that the buffer allocated for forwarding the IMA
measurement list is not freed - at the end of the kexec call and also in
an error path. Please see the patch description for
[PATCH v2 2/2] ima: Free IMA measurement buffer after kexec syscall
IMA allocates kernel virtual memory to carry forward the measurement
list, from the current kernel to the next kernel on kexec system call,
in ima_add_kexec_buffer() function. This buffer is not freed before
completing the kexec system call resulting in memory leak.
thanks,
-lakshmi
^ permalink raw reply
* [PATCH v2] powerpc/kuap: Allow kernel thread to access userspace after kthread_use_mm
From: Aneesh Kumar K.V @ 2021-02-05 18:17 UTC (permalink / raw)
To: linuxppc-dev, mpe
Cc: Jens Axboe, Aneesh Kumar K.V, Zorro Lang, Nicholas Piggin
This fix the bad fault reported by KUAP when io_wqe_worker access userspace.
Bug: Read fault blocked by KUAP!
WARNING: CPU: 1 PID: 101841 at arch/powerpc/mm/fault.c:229 __do_page_fault+0x6b4/0xcd0
NIP [c00000000009e7e4] __do_page_fault+0x6b4/0xcd0
LR [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0
..........
Call Trace:
[c000000016367330] [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0 (unreliable)
[c0000000163673e0] [c00000000009ee3c] do_page_fault+0x3c/0x120
[c000000016367430] [c00000000000c848] handle_page_fault+0x10/0x2c
--- interrupt: 300 at iov_iter_fault_in_readable+0x148/0x6f0
..........
NIP [c0000000008e8228] iov_iter_fault_in_readable+0x148/0x6f0
LR [c0000000008e834c] iov_iter_fault_in_readable+0x26c/0x6f0
interrupt: 300
[c0000000163677e0] [c0000000007154a0] iomap_write_actor+0xc0/0x280
[c000000016367880] [c00000000070fc94] iomap_apply+0x1c4/0x780
[c000000016367990] [c000000000710330] iomap_file_buffered_write+0xa0/0x120
[c0000000163679e0] [c00800000040791c] xfs_file_buffered_aio_write+0x314/0x5e0 [xfs]
[c000000016367a90] [c0000000006d74bc] io_write+0x10c/0x460
[c000000016367bb0] [c0000000006d80e4] io_issue_sqe+0x8d4/0x1200
[c000000016367c70] [c0000000006d8ad0] io_wq_submit_work+0xc0/0x250
[c000000016367cb0] [c0000000006e2578] io_worker_handle_work+0x498/0x800
[c000000016367d40] [c0000000006e2cdc] io_wqe_worker+0x3fc/0x4f0
[c000000016367da0] [c0000000001cb0a4] kthread+0x1c4/0x1d0
[c000000016367e10] [c00000000000dbf0] ret_from_kernel_thread+0x5c/0x6c
The kernel consider thread AMR value for kernel thread to be
AMR_KUAP_BLOCKED. Hence access to userspace is denied. This
of course not correct and we should allow userspace access after
kthread_use_mm(). To be precise, kthread_use_mm() should inherit the
AMR value of the operating address space. But, the AMR value is
thread-specific and we inherit the address space and not thread
access restrictions. Because of this ignore AMR value when accessing
userspace via kernel thread.
current_thread_amr/iamr() are also updated, because we use them in the
below stack.
....
[ 530.710838] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Tainted: G D 5.11.0-rc6+ #3
....
NIP [c0000000000aa0c8] pkey_access_permitted+0x28/0x90
LR [c0000000004b9278] gup_pte_range+0x188/0x420
--- interrupt: 700
[c00000001c4ef3f0] [0000000000000000] 0x0 (unreliable)
[c00000001c4ef490] [c0000000004bd39c] gup_pgd_range+0x3ac/0xa20
[c00000001c4ef5a0] [c0000000004bdd44] internal_get_user_pages_fast+0x334/0x410
[c00000001c4ef620] [c000000000852028] iov_iter_get_pages+0xf8/0x5c0
[c00000001c4ef6a0] [c0000000007da44c] bio_iov_iter_get_pages+0xec/0x700
[c00000001c4ef770] [c0000000006a325c] iomap_dio_bio_actor+0x2ac/0x4f0
[c00000001c4ef810] [c00000000069cd94] iomap_apply+0x2b4/0x740
[c00000001c4ef920] [c0000000006a38b8] __iomap_dio_rw+0x238/0x5c0
[c00000001c4ef9d0] [c0000000006a3c60] iomap_dio_rw+0x20/0x80
[c00000001c4ef9f0] [c008000001927a30] xfs_file_dio_aio_write+0x1f8/0x650 [xfs]
[c00000001c4efa60] [c0080000019284dc] xfs_file_write_iter+0xc4/0x130 [xfs]
[c00000001c4efa90] [c000000000669984] io_write+0x104/0x4b0
[c00000001c4efbb0] [c00000000066cea4] io_issue_sqe+0x3d4/0xf50
[c00000001c4efc60] [c000000000670200] io_wq_submit_work+0xb0/0x2f0
[c00000001c4efcb0] [c000000000674268] io_worker_handle_work+0x248/0x4a0
[c00000001c4efd30] [c0000000006746e8] io_wqe_worker+0x228/0x2a0
[c00000001c4efda0] [c00000000019d994] kthread+0x1b4/0x1c0
Cc: Zorro Lang <zlang@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
arch/powerpc/include/asm/book3s/64/kup.h | 16 +++++++++++-----
arch/powerpc/include/asm/book3s/64/pkeys.h | 4 ----
2 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
index f50f72e535aa..7d1ef7b9754e 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -199,25 +199,31 @@ DECLARE_STATIC_KEY_FALSE(uaccess_flush_key);
#ifdef CONFIG_PPC_PKEY
+extern u64 __ro_after_init default_uamor;
+extern u64 __ro_after_init default_amr;
+extern u64 __ro_after_init default_iamr;
+
#include <asm/mmu.h>
#include <asm/ptrace.h>
-/*
- * For kernel thread that doesn't have thread.regs return
- * default AMR/IAMR values.
+/* usage of kthread_use_mm() should inherit the
+ * AMR value of the operating address space. But, the AMR value is
+ * thread-specific and we inherit the address space and not thread
+ * access restrictions. Because of this ignore AMR value when accessing
+ * userspace via kernel thread.
*/
static inline u64 current_thread_amr(void)
{
if (current->thread.regs)
return current->thread.regs->amr;
- return AMR_KUAP_BLOCKED;
+ return default_amr;
}
static inline u64 current_thread_iamr(void)
{
if (current->thread.regs)
return current->thread.regs->iamr;
- return AMR_KUEP_BLOCKED;
+ return default_iamr;
}
#endif /* CONFIG_PPC_PKEY */
diff --git a/arch/powerpc/include/asm/book3s/64/pkeys.h b/arch/powerpc/include/asm/book3s/64/pkeys.h
index 3b8640498f5b..5b178139f3c0 100644
--- a/arch/powerpc/include/asm/book3s/64/pkeys.h
+++ b/arch/powerpc/include/asm/book3s/64/pkeys.h
@@ -5,10 +5,6 @@
#include <asm/book3s/64/hash-pkey.h>
-extern u64 __ro_after_init default_uamor;
-extern u64 __ro_after_init default_amr;
-extern u64 __ro_after_init default_iamr;
-
static inline u64 vmflag_to_pte_pkey_bits(u64 vm_flags)
{
if (!mmu_has_feature(MMU_FTR_PKEY))
--
2.29.2
^ permalink raw reply related
* Re: [PATCH] powerpc/pseries/dlpar: handle ibm,configure-connector delay status
From: Tyrel Datwyler @ 2021-02-05 19:46 UTC (permalink / raw)
To: Nathan Lynch, linuxppc-dev; +Cc: brking
In-Reply-To: <20210107025900.410369-1-nathanl@linux.ibm.com>
On 1/6/21 6:59 PM, Nathan Lynch wrote:
> dlpar_configure_connector() has two problems in its handling of
> ibm,configure-connector's return status:
>
> 1. When the status is -2 (busy, call again), we call
> ibm,configure-connector again immediately without checking whether
> to schedule, which can result in monopolizing the CPU.
> 2. Extended delay status (9900..9905) goes completely unhandled,
> causing the configuration to unnecessarily terminate.
>
> Fix both of these issues by using rtas_busy_delay().
>
> Fixes: ab519a011caa ("powerpc/pseries: Kernel DLPAR Infrastructure")
> Signed-off-by: Nathan Lynch <nathanl@linux.ibm.com>
Reviewed-by: Tyrel Datwyler <tyreld@linux.ibm.com>
^ permalink raw reply
* Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.
From: Leonardo Bras @ 2021-02-05 20:35 UTC (permalink / raw)
To: Fabiano Rosas, Paul Mackerras, Michael Ellerman,
Benjamin Herrenschmidt, Christophe Leroy, Athira Rajeev,
Aneesh Kumar K.V, Jordan Niethe, Nicholas Piggin,
Frederic Weisbecker, Thomas Gleixner, Geert Uytterhoeven
Cc: linuxppc-dev, linux-kernel, kvm-ppc
In-Reply-To: <874kiqy82t.fsf@linux.ibm.com>
Hello Fabiano,
Thanks for reviewing!
(answers inline)
On Fri, 2021-02-05 at 10:09 -0300, Fabiano Rosas wrote:
> Leonardo Bras <leobras.c@gmail.com> writes:
>
> > Before guest entry, TBU40 register is changed to reflect guest timebase.
> > After exitting guest, the register is reverted to it's original value.
> >
> > If one tries to get the timestamp from host between those changes, it
> > will present an incorrect value.
> >
> > An example would be trying to add a tracepoint in
> > kvmppc_guest_entry_inject_int(), which depending on last tracepoint
> > acquired could actually cause the host to crash.
> >
> > Save the Timebase Offset to PACA and use it on sched_clock() to always
> > get the correct timestamp.
> >
> > Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> > Suggested-by: Paul Mackerras <paulus@ozlabs.org>
> > ---
> > Changes since v1:
> > - Subtracts offset only when CONFIG_KVM_BOOK3S_HANDLER and
> > CONFIG_PPC_BOOK3S_64 are defined.
> > ---
> > arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
> > arch/powerpc/kernel/asm-offsets.c | 1 +
> > arch/powerpc/kernel/time.c | 8 +++++++-
> > arch/powerpc/kvm/book3s_hv.c | 2 ++
> > arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 ++
> > 5 files changed, 13 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > index 078f4648ea27..e2c12a10eed2 100644
> > --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> > +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> > @@ -131,6 +131,7 @@ struct kvmppc_host_state {
> > u64 cfar;
> > u64 ppr;
> > u64 host_fscr;
> > + u64 tb_offset; /* Timebase offset: keeps correct
> > timebase while on guest */
>
> Couldn't you use the vc->tb_offset_applied for this? We have a reference
> for the vcore in the hstate already.
But it's a pointer, which means we would have to keep checking for NULL
every time we need sched_clock().
Potentially it would cost a cache miss for PACA memory region that
contain vc, another for getting the part of *vc that contains the
tb_offset_applied, instead of only one for PACA struct region that
contains tb_offset.
On the other hand, it got me thinking: If the offset is applied per
cpu, why don't we get this info only in PACA, instead of in vc?
It could be a general way to get an offset applied for any purpose and
still get the sched_clock() right.
(Not that I have any idea of any other purpose we could use it)
Best regards!
Leonardo Bras
>
> > #endif
> > };
> >
> > diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> > index b12d7c049bfe..0beb8fdc6352 100644
> > --- a/arch/powerpc/kernel/asm-offsets.c
> > +++ b/arch/powerpc/kernel/asm-offsets.c
> > @@ -706,6 +706,7 @@ int main(void)
> > HSTATE_FIELD(HSTATE_CFAR, cfar);
> > HSTATE_FIELD(HSTATE_PPR, ppr);
> > HSTATE_FIELD(HSTATE_HOST_FSCR, host_fscr);
> > + HSTATE_FIELD(HSTATE_TB_OFFSET, tb_offset);
> > #endif /* CONFIG_PPC_BOOK3S_64 */
> >
> > #else /* CONFIG_PPC_BOOK3S */
> > diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> > index 67feb3524460..f27f0163792b 100644
> > --- a/arch/powerpc/kernel/time.c
> > +++ b/arch/powerpc/kernel/time.c
> > @@ -699,7 +699,13 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
> > */
> > notrace unsigned long long sched_clock(void)
> > {
> > - return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift;
> > + u64 tb = get_tb() - boot_tb;
> > +
> > +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_HANDLER)
> > + tb -= local_paca->kvm_hstate.tb_offset;
> > +#endif
> > +
> > + return mulhdu(tb, tb_to_ns_scale) << tb_to_ns_shift;
> > }
> >
> >
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index b3731572295e..c08593c63353 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -3491,6 +3491,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
> > if ((tb & 0xffffff) < (new_tb & 0xffffff))
> > mtspr(SPRN_TBU40, new_tb + 0x1000000);
> > vc->tb_offset_applied = vc->tb_offset;
> > + local_paca->kvm_hstate.tb_offset = vc->tb_offset;
> > }
> >
> > if (vc->pcr)
> > @@ -3594,6 +3595,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
> > if ((tb & 0xffffff) < (new_tb & 0xffffff))
> > mtspr(SPRN_TBU40, new_tb + 0x1000000);
> > vc->tb_offset_applied = 0;
> > + local_paca->kvm_hstate.tb_offset = 0;
> > }
> >
> > mtspr(SPRN_HDEC, 0x7fffffff);
> > diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > index b73140607875..8f7a9f7f4ee6 100644
> > --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> > @@ -632,6 +632,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
> > cmpdi r8,0
> > beq 37f
> > std r8, VCORE_TB_OFFSET_APPL(r5)
> > + std r8, HSTATE_TB_OFFSET(r13)
> > mftb r6 /* current host timebase */
> > add r8,r8,r6
> > mtspr SPRN_TBU40,r8 /* update upper 40 bits */
> > @@ -1907,6 +1908,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
> > beq 17f
> > li r0, 0
> > std r0, VCORE_TB_OFFSET_APPL(r5)
> > + std r0, HSTATE_TB_OFFSET(r13)
> > mftb r6 /* current guest timebase */
> > subf r8,r8,r6
> > mtspr SPRN_TBU40,r8 /* update upper 40 bits */
^ permalink raw reply
* Re: [RFC PATCH 1/9] KVM: PPC: Book3S 64: move KVM interrupt entry to a common entry point
From: Fabiano Rosas @ 2021-02-05 21:28 UTC (permalink / raw)
To: Nicholas Piggin, kvm-ppc; +Cc: linuxppc-dev, Nicholas Piggin
In-Reply-To: <20210202030313.3509446-2-npiggin@gmail.com>
Nicholas Piggin <npiggin@gmail.com> writes:
> Rather than bifurcate the call depending on whether or not HV is
> possible, and have the HV entry test for PR, just make a single
> common point which does the demultiplexing. This makes it simpler
> to add another type of exit handler.
>
> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Reviewed-by: Fabiano Rosas <farosas@linux.ibm.com>
> ---
> arch/powerpc/kernel/exceptions-64s.S | 8 +-----
> arch/powerpc/kvm/Makefile | 3 +++
> arch/powerpc/kvm/book3s_64_entry.S | 34 +++++++++++++++++++++++++
> arch/powerpc/kvm/book3s_hv_rmhandlers.S | 11 ++------
> 4 files changed, 40 insertions(+), 16 deletions(-)
> create mode 100644 arch/powerpc/kvm/book3s_64_entry.S
>
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index e02ad6fefa46..65659ea3cec4 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -212,7 +212,6 @@ do_define_int n
> .endm
>
> #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> /*
> * All interrupts which set HSRR registers, as well as SRESET and MCE and
> * syscall when invoked with "sc 1" switch to MSR[HV]=1 (HVMODE) to be taken,
> @@ -242,13 +241,8 @@ do_define_int n
>
> /*
> * If an interrupt is taken while a guest is running, it is immediately routed
> - * to KVM to handle. If both HV and PR KVM arepossible, KVM interrupts go first
> - * to kvmppc_interrupt_hv, which handles the PR guest case.
> + * to KVM to handle.
> */
> -#define kvmppc_interrupt kvmppc_interrupt_hv
> -#else
> -#define kvmppc_interrupt kvmppc_interrupt_pr
> -#endif
>
> .macro KVMTEST name
> lbz r10,HSTATE_IN_GUEST(r13)
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index 2bfeaa13befb..cdd119028f64 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -59,6 +59,9 @@ kvm-pr-y := \
> kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
> tm.o
>
> +kvm-book3s_64-builtin-objs-y += \
> + book3s_64_entry.o
> +
> ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
> kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
> book3s_rmhandlers.o
> diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S
> new file mode 100644
> index 000000000000..22e34b95f478
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_64_entry.S
> @@ -0,0 +1,34 @@
> +#include <asm/cache.h>
> +#include <asm/ppc_asm.h>
> +#include <asm/kvm_asm.h>
> +#include <asm/reg.h>
> +#include <asm/asm-offsets.h>
> +#include <asm/kvm_book3s_asm.h>
> +
> +/*
> + * We come here from the first-level interrupt handlers.
> + */
> +.global kvmppc_interrupt
> +.balign IFETCH_ALIGN_BYTES
> +kvmppc_interrupt:
> + /*
> + * Register contents:
> + * R12 = (guest CR << 32) | interrupt vector
> + * R13 = PACA
> + * guest R12 saved in shadow VCPU SCRATCH0
> + * guest R13 saved in SPRN_SCRATCH0
> + */
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> + std r9, HSTATE_SCRATCH2(r13)
> + lbz r9, HSTATE_IN_GUEST(r13)
> + cmpwi r9, KVM_GUEST_MODE_HOST_HV
> + beq kvmppc_bad_host_intr
> +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
> + cmpwi r9, KVM_GUEST_MODE_GUEST
> + ld r9, HSTATE_SCRATCH2(r13)
> + beq kvmppc_interrupt_pr
> +#endif
> + b kvmppc_interrupt_hv
> +#else
> + b kvmppc_interrupt_pr
> +#endif
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 8cf1f69f442e..b9c4acd747f7 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1255,16 +1255,8 @@ kvmppc_interrupt_hv:
> * R13 = PACA
> * guest R12 saved in shadow VCPU SCRATCH0
> * guest R13 saved in SPRN_SCRATCH0
> + * guest R9 saved in HSTATE_SCRATCH2
> */
> - std r9, HSTATE_SCRATCH2(r13)
> - lbz r9, HSTATE_IN_GUEST(r13)
> - cmpwi r9, KVM_GUEST_MODE_HOST_HV
> - beq kvmppc_bad_host_intr
> -#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
> - cmpwi r9, KVM_GUEST_MODE_GUEST
> - ld r9, HSTATE_SCRATCH2(r13)
> - beq kvmppc_interrupt_pr
> -#endif
> /* We're now back in the host but in guest MMU context */
> li r9, KVM_GUEST_MODE_HOST_HV
> stb r9, HSTATE_IN_GUEST(r13)
> @@ -3253,6 +3245,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_P9_TM_HV_ASSIST)
> * cfar is saved in HSTATE_CFAR(r13)
> * ppr is saved in HSTATE_PPR(r13)
> */
> +.global kvmppc_bad_host_intr
> kvmppc_bad_host_intr:
> /*
> * Switch to the emergency stack, but start half-way down in
^ permalink raw reply
* Re: [PATCH v2 1/1] powerpc/kvm: Save Timebase Offset to fix sched_clock() while running guest code.
From: Michael Ellerman @ 2021-02-05 23:32 UTC (permalink / raw)
To: Nicholas Piggin, Aneesh Kumar K.V, Athira Rajeev,
Benjamin Herrenschmidt, Christophe Leroy, Frederic Weisbecker,
Geert Uytterhoeven, Jordan Niethe, Leonardo Bras, Paul Mackerras,
Thomas Gleixner
Cc: linuxppc-dev, linux-kernel, kvm-ppc
In-Reply-To: <1612506268.6rrvx34gzu.astroid@bobo.none>
Nicholas Piggin <npiggin@gmail.com> writes:
> Excerpts from Leonardo Bras's message of February 5, 2021 4:06 pm:
>> Before guest entry, TBU40 register is changed to reflect guest timebase.
>> After exitting guest, the register is reverted to it's original value.
>>
>> If one tries to get the timestamp from host between those changes, it
>> will present an incorrect value.
>>
>> An example would be trying to add a tracepoint in
>> kvmppc_guest_entry_inject_int(), which depending on last tracepoint
>> acquired could actually cause the host to crash.
>>
>> Save the Timebase Offset to PACA and use it on sched_clock() to always
>> get the correct timestamp.
>
> Ouch. Not sure how reasonable it is to half switch into guest registers
> and expect to call into the wider kernel, fixing things up as we go.
Yeah it's not.
We need to disable tracing on those routines that are called in that
half-exited state.
cheers
^ permalink raw reply
* Re: [PATCH v7 39/42] powerpc: move NMI entry/exit code into wrapper
From: Michael Ellerman @ 2021-02-05 23:38 UTC (permalink / raw)
To: Nicholas Piggin, linuxppc-dev; +Cc: Athira Rajeev
In-Reply-To: <1612438069.44myr3nzfs.astroid@bobo.none>
Nicholas Piggin <npiggin@gmail.com> writes:
> Excerpts from Michael Ellerman's message of February 4, 2021 8:15 pm:
>> Nicholas Piggin <npiggin@gmail.com> writes:
>>> This moves the common NMI entry and exit code into the interrupt handler
>>> wrappers.
>>>
>>> This changes the behaviour of soft-NMI (watchdog) and HMI interrupts, and
>>> also MCE interrupts on 64e, by adding missing parts of the NMI entry to
>>> them.
>>>
>>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>>> ---
>>> arch/powerpc/include/asm/interrupt.h | 28 ++++++++++++++++++++++
>>> arch/powerpc/kernel/mce.c | 11 ---------
>>> arch/powerpc/kernel/traps.c | 35 +++++-----------------------
>>> arch/powerpc/kernel/watchdog.c | 10 ++++----
>>> 4 files changed, 38 insertions(+), 46 deletions(-)
>>
>> This is unhappy when injecting SLB multi-hits:
>>
>> root@p86-2:~# echo PPC_SLB_MULTIHIT > /sys/kernel/debug/provoke-crash/DIRECT
>> [ 312.496026][ T1344] kernel BUG at arch/powerpc/include/asm/interrupt.h:152!
>> [ 312.496037][ T1344] Oops: Exception in kernel mode, sig: 5 [#1]
>> [ 312.496045][ T1344] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
>
> pseries hash. Blast!
The worst kind.
>> 147 static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct interrupt_nmi_state *state)
>> 148 {
>> 149 if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) ||
>> 150 !firmware_has_feature(FW_FEATURE_LPAR) ||
>> 151 radix_enabled() || (mfmsr() & MSR_DR))
>> 152 nmi_exit();
>>
>>
>> So presumably it's:
>>
>> #define __nmi_exit() \
>> do { \
>> BUG_ON(!in_nmi()); \
>
> Yes that would be it, pseries machine check enables MMU half way through
> so only one side of this triggers.
>
> The MSR_DR check is supposed to catch the other NMIs that run with MMU
> on (perf, watchdog, etc). Suppose it could test TRAP(regs) explicitly
> although I wonder if we should also do this to keep things balanced
Yeah I think I like that. I'll give it a test.
cheers
> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
> index 149cec2212e6..f57ca0c570be 100644
> --- a/arch/powerpc/platforms/pseries/ras.c
> +++ b/arch/powerpc/platforms/pseries/ras.c
> @@ -719,6 +719,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>
> static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
> {
> + unsigned long msr;
> struct pseries_errorlog *pseries_log;
> struct pseries_mc_errorlog *mce_log = NULL;
> int disposition = rtas_error_disposition(errp);
> @@ -747,9 +748,12 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
> * SLB multihit is done by now.
> */
> out:
> - mtmsr(mfmsr() | MSR_IR | MSR_DR);
> + msr = mfmsr();
> + mtmsr(msr | MSR_IR | MSR_DR);
> disposition = mce_handle_err_virtmode(regs, errp, mce_log,
> disposition);
> + mtmsr(msr);
> +
> return disposition;
> }
>
^ permalink raw reply
* Re: [PATCH v3 28/32] powerpc/64s: interrupt implement exit logic in C
From: Nicholas Piggin @ 2021-02-06 2:28 UTC (permalink / raw)
To: Christophe Leroy, linuxppc-dev, Michael Ellerman; +Cc: Michal Suchanek
In-Reply-To: <d8e6b971-c783-fb5f-f9f2-24e7d8d0726d@csgroup.eu>
Excerpts from Christophe Leroy's message of February 5, 2021 4:04 pm:
>
>
> Le 05/02/2021 à 03:16, Nicholas Piggin a écrit :
>> Excerpts from Michael Ellerman's message of February 5, 2021 10:22 am:
>>> Nicholas Piggin <npiggin@gmail.com> writes:
>>>> Excerpts from Christophe Leroy's message of February 4, 2021 6:03 pm:
>>>>> Le 04/02/2021 à 04:27, Nicholas Piggin a écrit :
>>>>>> Excerpts from Christophe Leroy's message of February 4, 2021 2:25 am:
>>>>>>> Le 25/02/2020 à 18:35, Nicholas Piggin a écrit :
>>> ...
>>>>>>>> +
>>>>>>>> + /*
>>>>>>>> + * We don't need to restore AMR on the way back to userspace for KUAP.
>>>>>>>> + * The value of AMR only matters while we're in the kernel.
>>>>>>>> + */
>>>>>>>> + kuap_restore_amr(regs);
>>>>>>>
>>>>>>> Is that correct to restore KUAP state here ? Shouldn't we have it at lower level in assembly ?
>>>>>>>
>>>>>>> Isn't there a risk that someone manages to call interrupt_exit_kernel_prepare() or the end of it in
>>>>>>> a way or another, and get the previous KUAP state restored by this way ?
>>>>>>
>>>>>> I'm not sure if there much more risk if it's here rather than the
>>>>>> instruction being in another place in the code.
>>>>>>
>>>>>> There's a lot of user access around the kernel too if you want to find a
>>>>>> gadget to unlock KUAP then I suppose there is a pretty large attack
>>>>>> surface.
>>>>>
>>>>> My understanding is that user access scope is strictly limited, for instance we enforce the
>>>>> begin/end of user access to be in the same function, and we refrain from calling any other function
>>>>> inside the user access window. x86 even have 'objtool' to enforce it at build time. So in theory
>>>>> there is no way to get out of the function while user access is open.
>>>>>
>>>>> Here with the interrupt exit function it is free beer. You have a place where you re-open user
>>>>> access and return with a simple blr. So that's open bar. If someone manages to just call the
>>>>> interrupt exit function, then user access remains open
>>>>
>>>> Hmm okay maybe that's a good point.
>>>
>>> I don't think it's a very attractive gadget, it's not just a plain blr,
>>> it does a full stack frame tear down before the return. And there's no
>>> LR reloads anywhere very close.
>>>
>>> Obviously it depends on what the compiler decides to do, it's possible
>>> it could be a usable gadget. But there are other places that are more
>>> attractive I think, eg:
>>>
>>> c00000000061d768: a6 03 3d 7d mtspr 29,r9
>>> c00000000061d76c: 2c 01 00 4c isync
>>> c00000000061d770: 00 00 00 60 nop
>>> c00000000061d774: 30 00 21 38 addi r1,r1,48
>>> c00000000061d778: 20 00 80 4e blr
>>>
>>>
>>> So I don't think we should redesign the code *purely* because we're
>>> worried about interrupt_exit_kernel_prepare() being a useful gadget. If
>>> we can come up with a way to restructure it that reads well and is
>>> maintainable, and also reduces the chance of it being a good gadget then
>>> sure.
>>
>> Okay. That would be good if we can keep it in C, the pkeys + kuap combo
>> is fairly complicated and we might want to something cleverer with it,
>> so that would make it even more difficult in asm.
>>
>
> Ok.
>
> For ppc32, I prefer to keep it in assembly for the time being and move everything from ASM to C at
> once after porting syscall and interrupts to C and wrappers.
>
> Hope this is OK for you.
I don't see a problem with that.
Thanks,
Nick
^ permalink raw reply
* Re: [PATCH] powerpc/8xx: Fix software emulation interrupt
From: Nicholas Piggin @ 2021-02-06 2:30 UTC (permalink / raw)
To: Benjamin Herrenschmidt, Christophe Leroy, Michael Ellerman,
Paul Mackerras
Cc: linuxppc-dev, linux-kernel
In-Reply-To: <ad782af87a222efc79cfb06079b0fd23d4224eaf.1612515180.git.christophe.leroy@csgroup.eu>
Excerpts from Christophe Leroy's message of February 5, 2021 6:56 pm:
> For unimplemented instructions or unimplemented SPRs, the 8xx triggers
> a "Software Emulation Exception" (0x1000). That interrupt doesn't set
> reason bits in SRR1 as the "Program Check Exception" does.
>
> Go through emulation_assist_interrupt() to set REASON_ILLEGAL.
>
> Fixes: fbbcc3bb139e ("powerpc/8xx: Remove SoftwareEmulation()")
> Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
> ---
> I'm wondering whether it wouldn't be better to set REASON_ILLEGAL
> in the exception prolog and still call program_check_exception.
> And do the same in book3s/64 to avoid the nightmare of an
> INTERRUPT_HANDLER calling another INTERRUPT_HANDLER.
Hmm, I missed this. We just change program_check_exception to
a common function which is called by both.
Thanks,
Nick
^ permalink raw reply
* Re: [PATCH v7 28/42] powerpc: convert interrupt handlers to use wrappers
From: Nicholas Piggin @ 2021-02-06 2:43 UTC (permalink / raw)
To: Christophe Leroy, linuxppc-dev; +Cc: Athira Rajeev
In-Reply-To: <0e319d85-9fa0-ff97-03b2-93637ad89a99@csgroup.eu>
Excerpts from Christophe Leroy's message of February 5, 2021 6:09 pm:
>
>
> Le 30/01/2021 à 14:08, Nicholas Piggin a écrit :
>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>> ---
>
>> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
>> index f70d3f6174c8..7ff915aae8ec 100644
>> --- a/arch/powerpc/kernel/traps.c
>> +++ b/arch/powerpc/kernel/traps.c
>
>> @@ -1462,7 +1474,7 @@ static int emulate_math(struct pt_regs *regs)
>> static inline int emulate_math(struct pt_regs *regs) { return -1; }
>> #endif
>>
>> -void program_check_exception(struct pt_regs *regs)
>> +DEFINE_INTERRUPT_HANDLER(program_check_exception)
>> {
>> enum ctx_state prev_state = exception_enter();
>> unsigned int reason = get_reason(regs);
>> @@ -1587,14 +1599,14 @@ NOKPROBE_SYMBOL(program_check_exception);
>> * This occurs when running in hypervisor mode on POWER6 or later
>> * and an illegal instruction is encountered.
>> */
>> -void emulation_assist_interrupt(struct pt_regs *regs)
>> +DEFINE_INTERRUPT_HANDLER(emulation_assist_interrupt)
>> {
>> regs->msr |= REASON_ILLEGAL;
>> program_check_exception(regs);
>
> Is it correct that an INTERRUPT_HANDLER calls another INTERRUPT_HANDLER ?
No you're right, I'll have to send a patch.
Thanks,
Nick
^ permalink raw reply
* Re: [PATCH v7 39/42] powerpc: move NMI entry/exit code into wrapper
From: Nicholas Piggin @ 2021-02-06 2:46 UTC (permalink / raw)
To: linuxppc-dev, Michael Ellerman; +Cc: Athira Rajeev
In-Reply-To: <875z36ozkq.fsf@mpe.ellerman.id.au>
Excerpts from Michael Ellerman's message of February 6, 2021 9:38 am:
> Nicholas Piggin <npiggin@gmail.com> writes:
>> Excerpts from Michael Ellerman's message of February 4, 2021 8:15 pm:
>>> Nicholas Piggin <npiggin@gmail.com> writes:
>>>> This moves the common NMI entry and exit code into the interrupt handler
>>>> wrappers.
>>>>
>>>> This changes the behaviour of soft-NMI (watchdog) and HMI interrupts, and
>>>> also MCE interrupts on 64e, by adding missing parts of the NMI entry to
>>>> them.
>>>>
>>>> Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
>>>> ---
>>>> arch/powerpc/include/asm/interrupt.h | 28 ++++++++++++++++++++++
>>>> arch/powerpc/kernel/mce.c | 11 ---------
>>>> arch/powerpc/kernel/traps.c | 35 +++++-----------------------
>>>> arch/powerpc/kernel/watchdog.c | 10 ++++----
>>>> 4 files changed, 38 insertions(+), 46 deletions(-)
>>>
>>> This is unhappy when injecting SLB multi-hits:
>>>
>>> root@p86-2:~# echo PPC_SLB_MULTIHIT > /sys/kernel/debug/provoke-crash/DIRECT
>>> [ 312.496026][ T1344] kernel BUG at arch/powerpc/include/asm/interrupt.h:152!
>>> [ 312.496037][ T1344] Oops: Exception in kernel mode, sig: 5 [#1]
>>> [ 312.496045][ T1344] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
>>
>> pseries hash. Blast!
>
> The worst kind.
>
>>> 147 static inline void interrupt_nmi_exit_prepare(struct pt_regs *regs, struct interrupt_nmi_state *state)
>>> 148 {
>>> 149 if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) ||
>>> 150 !firmware_has_feature(FW_FEATURE_LPAR) ||
>>> 151 radix_enabled() || (mfmsr() & MSR_DR))
>>> 152 nmi_exit();
>>>
>>>
>>> So presumably it's:
>>>
>>> #define __nmi_exit() \
>>> do { \
>>> BUG_ON(!in_nmi()); \
>>
>> Yes that would be it, pseries machine check enables MMU half way through
>> so only one side of this triggers.
>>
>> The MSR_DR check is supposed to catch the other NMIs that run with MMU
>> on (perf, watchdog, etc). Suppose it could test TRAP(regs) explicitly
>> although I wonder if we should also do this to keep things balanced
>
> Yeah I think I like that. I'll give it a test.
The msr restore? Looking closer, pseries_machine_check_realmode may have
expected mce_handle_error to enable the MMU, because irq_work_queue uses
some per-cpu variables I think.
So the code might have to be rearranged a bit more than the patch below.
Thanks,
Nick
>
> cheers
>
>
>> diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
>> index 149cec2212e6..f57ca0c570be 100644
>> --- a/arch/powerpc/platforms/pseries/ras.c
>> +++ b/arch/powerpc/platforms/pseries/ras.c
>> @@ -719,6 +719,7 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
>>
>> static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
>> {
>> + unsigned long msr;
>> struct pseries_errorlog *pseries_log;
>> struct pseries_mc_errorlog *mce_log = NULL;
>> int disposition = rtas_error_disposition(errp);
>> @@ -747,9 +748,12 @@ static int mce_handle_error(struct pt_regs *regs, struct rtas_error_log *errp)
>> * SLB multihit is done by now.
>> */
>> out:
>> - mtmsr(mfmsr() | MSR_IR | MSR_DR);
>> + msr = mfmsr();
>> + mtmsr(msr | MSR_IR | MSR_DR);
>> disposition = mce_handle_err_virtmode(regs, errp, mce_log,
>> disposition);
>> + mtmsr(msr);
>> +
>> return disposition;
>> }
>>
>
^ permalink raw reply
* [PATCH v3] powerpc/kuap: Allow kernel thread to access userspace after kthread_use_mm
From: Aneesh Kumar K.V @ 2021-02-06 2:56 UTC (permalink / raw)
To: linuxppc-dev, mpe
Cc: Jens Axboe, Aneesh Kumar K.V, Zorro Lang, Nicholas Piggin
This fix the bad fault reported by KUAP when io_wqe_worker access userspace.
Bug: Read fault blocked by KUAP!
WARNING: CPU: 1 PID: 101841 at arch/powerpc/mm/fault.c:229 __do_page_fault+0x6b4/0xcd0
NIP [c00000000009e7e4] __do_page_fault+0x6b4/0xcd0
LR [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0
..........
Call Trace:
[c000000016367330] [c00000000009e7e0] __do_page_fault+0x6b0/0xcd0 (unreliable)
[c0000000163673e0] [c00000000009ee3c] do_page_fault+0x3c/0x120
[c000000016367430] [c00000000000c848] handle_page_fault+0x10/0x2c
--- interrupt: 300 at iov_iter_fault_in_readable+0x148/0x6f0
..........
NIP [c0000000008e8228] iov_iter_fault_in_readable+0x148/0x6f0
LR [c0000000008e834c] iov_iter_fault_in_readable+0x26c/0x6f0
interrupt: 300
[c0000000163677e0] [c0000000007154a0] iomap_write_actor+0xc0/0x280
[c000000016367880] [c00000000070fc94] iomap_apply+0x1c4/0x780
[c000000016367990] [c000000000710330] iomap_file_buffered_write+0xa0/0x120
[c0000000163679e0] [c00800000040791c] xfs_file_buffered_aio_write+0x314/0x5e0 [xfs]
[c000000016367a90] [c0000000006d74bc] io_write+0x10c/0x460
[c000000016367bb0] [c0000000006d80e4] io_issue_sqe+0x8d4/0x1200
[c000000016367c70] [c0000000006d8ad0] io_wq_submit_work+0xc0/0x250
[c000000016367cb0] [c0000000006e2578] io_worker_handle_work+0x498/0x800
[c000000016367d40] [c0000000006e2cdc] io_wqe_worker+0x3fc/0x4f0
[c000000016367da0] [c0000000001cb0a4] kthread+0x1c4/0x1d0
[c000000016367e10] [c00000000000dbf0] ret_from_kernel_thread+0x5c/0x6c
The kernel consider thread AMR value for kernel thread to be
AMR_KUAP_BLOCKED. Hence access to userspace is denied. This
of course not correct and we should allow userspace access after
kthread_use_mm(). To be precise, kthread_use_mm() should inherit the
AMR value of the operating address space. But, the AMR value is
thread-specific and we inherit the address space and not thread
access restrictions. Because of this ignore AMR value when accessing
userspace via kernel thread.
current_thread_amr/iamr() are updated, because we use them in the
below stack.
....
[ 530.710838] CPU: 13 PID: 5587 Comm: io_wqe_worker-0 Tainted: G D 5.11.0-rc6+ #3
....
NIP [c0000000000aa0c8] pkey_access_permitted+0x28/0x90
LR [c0000000004b9278] gup_pte_range+0x188/0x420
--- interrupt: 700
[c00000001c4ef3f0] [0000000000000000] 0x0 (unreliable)
[c00000001c4ef490] [c0000000004bd39c] gup_pgd_range+0x3ac/0xa20
[c00000001c4ef5a0] [c0000000004bdd44] internal_get_user_pages_fast+0x334/0x410
[c00000001c4ef620] [c000000000852028] iov_iter_get_pages+0xf8/0x5c0
[c00000001c4ef6a0] [c0000000007da44c] bio_iov_iter_get_pages+0xec/0x700
[c00000001c4ef770] [c0000000006a325c] iomap_dio_bio_actor+0x2ac/0x4f0
[c00000001c4ef810] [c00000000069cd94] iomap_apply+0x2b4/0x740
[c00000001c4ef920] [c0000000006a38b8] __iomap_dio_rw+0x238/0x5c0
[c00000001c4ef9d0] [c0000000006a3c60] iomap_dio_rw+0x20/0x80
[c00000001c4ef9f0] [c008000001927a30] xfs_file_dio_aio_write+0x1f8/0x650 [xfs]
[c00000001c4efa60] [c0080000019284dc] xfs_file_write_iter+0xc4/0x130 [xfs]
[c00000001c4efa90] [c000000000669984] io_write+0x104/0x4b0
[c00000001c4efbb0] [c00000000066cea4] io_issue_sqe+0x3d4/0xf50
[c00000001c4efc60] [c000000000670200] io_wq_submit_work+0xb0/0x2f0
[c00000001c4efcb0] [c000000000674268] io_worker_handle_work+0x248/0x4a0
[c00000001c4efd30] [c0000000006746e8] io_wqe_worker+0x228/0x2a0
[c00000001c4efda0] [c00000000019d994] kthread+0x1b4/0x1c0
Cc: Zorro Lang <zlang@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
Changes from v2:
* Fix build error
Changes from v1:
* use default_amr/default_iamr for kthread.
arch/powerpc/include/asm/book3s/64/kup.h | 16 +++++++++++-----
arch/powerpc/include/asm/book3s/64/pkeys.h | 4 ----
arch/powerpc/mm/book3s64/pkeys.c | 1 +
3 files changed, 12 insertions(+), 9 deletions(-)
diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h
index f50f72e535aa..7d1ef7b9754e 100644
--- a/arch/powerpc/include/asm/book3s/64/kup.h
+++ b/arch/powerpc/include/asm/book3s/64/kup.h
@@ -199,25 +199,31 @@ DECLARE_STATIC_KEY_FALSE(uaccess_flush_key);
#ifdef CONFIG_PPC_PKEY
+extern u64 __ro_after_init default_uamor;
+extern u64 __ro_after_init default_amr;
+extern u64 __ro_after_init default_iamr;
+
#include <asm/mmu.h>
#include <asm/ptrace.h>
-/*
- * For kernel thread that doesn't have thread.regs return
- * default AMR/IAMR values.
+/* usage of kthread_use_mm() should inherit the
+ * AMR value of the operating address space. But, the AMR value is
+ * thread-specific and we inherit the address space and not thread
+ * access restrictions. Because of this ignore AMR value when accessing
+ * userspace via kernel thread.
*/
static inline u64 current_thread_amr(void)
{
if (current->thread.regs)
return current->thread.regs->amr;
- return AMR_KUAP_BLOCKED;
+ return default_amr;
}
static inline u64 current_thread_iamr(void)
{
if (current->thread.regs)
return current->thread.regs->iamr;
- return AMR_KUEP_BLOCKED;
+ return default_iamr;
}
#endif /* CONFIG_PPC_PKEY */
diff --git a/arch/powerpc/include/asm/book3s/64/pkeys.h b/arch/powerpc/include/asm/book3s/64/pkeys.h
index 3b8640498f5b..5b178139f3c0 100644
--- a/arch/powerpc/include/asm/book3s/64/pkeys.h
+++ b/arch/powerpc/include/asm/book3s/64/pkeys.h
@@ -5,10 +5,6 @@
#include <asm/book3s/64/hash-pkey.h>
-extern u64 __ro_after_init default_uamor;
-extern u64 __ro_after_init default_amr;
-extern u64 __ro_after_init default_iamr;
-
static inline u64 vmflag_to_pte_pkey_bits(u64 vm_flags)
{
if (!mmu_has_feature(MMU_FTR_PKEY))
diff --git a/arch/powerpc/mm/book3s64/pkeys.c b/arch/powerpc/mm/book3s64/pkeys.c
index f1c6f264ed91..15dcc5ad91c5 100644
--- a/arch/powerpc/mm/book3s64/pkeys.c
+++ b/arch/powerpc/mm/book3s64/pkeys.c
@@ -31,6 +31,7 @@ static u32 initial_allocation_mask __ro_after_init;
u64 default_amr __ro_after_init = ~0x0UL;
u64 default_iamr __ro_after_init = 0x5555555555555555UL;
u64 default_uamor __ro_after_init;
+EXPORT_SYMBOL(default_amr);
/*
* Key used to implement PROT_EXEC mmap. Denies READ/WRITE
* We pick key 2 because 0 is special key and 1 is reserved as per ISA.
--
2.29.2
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox