* [PATCH 06/23] Add 970 highmem asm code
@ 2009-07-07 14:17 Alexander Graf
2009-07-08 4:30 ` Benjamin Herrenschmidt
` (2 more replies)
0 siblings, 3 replies; 4+ messages in thread
From: Alexander Graf @ 2009-07-07 14:17 UTC (permalink / raw)
To: kvm-ppc
This is the entry / exit code. In order to switch between host and guest
context, we need to switch register state and call the exit code handler on
exit.
This assembly file does exactly that. To finally enter the guest it calls
into 970_slb.S. On exit it gets jumped to from 970_slb.S too.
Add header definition for highmem handler
Signed-off-by: Alexander Graf <agraf@suse.de>
---
arch/powerpc/include/asm/kvm_ppc.h | 1 +
arch/powerpc/kvm/970_interrupts.S | 422 ++++++++++++++++++++++++++++++++++++
2 files changed, 423 insertions(+), 0 deletions(-)
create mode 100644 arch/powerpc/kvm/970_interrupts.S
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 2c6ee34..269ee46 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -39,6 +39,7 @@ enum emulation_result {
extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
extern char kvmppc_handlers_start[];
extern unsigned long kvmppc_handler_len;
+extern void kvmppc_handler_highmem(void);
extern void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu);
extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
diff --git a/arch/powerpc/kvm/970_interrupts.S b/arch/powerpc/kvm/970_interrupts.S
new file mode 100644
index 0000000..14f4112
--- /dev/null
+++ b/arch/powerpc/kvm/970_interrupts.S
@@ -0,0 +1,422 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+#include <asm/kvm_970_asm.h>
+
+/*****************************************************************************
+ * *
+ * Guest entry / exit code that is in kernel module memory (highmem) *
+ * *
+ ****************************************************************************/
+
+/* Registers:
+ * r3: kvm_run pointer
+ * r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcpu_entry)
+
+kvm_start_entry:
+
+ /* Save host state to the stack */
+ stdu r1, -HOST_STACK_SIZE(r1)
+ std r3, HOST_STACK_RUN(r1)
+ std r4, HOST_STACK_VCPU(r1)
+
+ std r14, HOST_STACK_R14(r1)
+ std r15, HOST_STACK_R15(r1)
+ std r16, HOST_STACK_R16(r1)
+ std r17, HOST_STACK_R17(r1)
+ std r18, HOST_STACK_R18(r1)
+ std r19, HOST_STACK_R19(r1)
+ std r20, HOST_STACK_R20(r1)
+ std r21, HOST_STACK_R21(r1)
+ std r22, HOST_STACK_R22(r1)
+ std r23, HOST_STACK_R23(r1)
+ std r24, HOST_STACK_R24(r1)
+ std r25, HOST_STACK_R25(r1)
+ std r26, HOST_STACK_R26(r1)
+ std r27, HOST_STACK_R27(r1)
+ std r28, HOST_STACK_R28(r1)
+ std r29, HOST_STACK_R29(r1)
+ std r30, HOST_STACK_R30(r1)
+ std r31, HOST_STACK_R31(r1)
+ mflr r14
+ std r14, HOST_STACK_LR(r1)
+
+/* XXX optimize non-volatile loading away */
+kvm_start_lightweight:
+
+ DISABLE_INTERRUPTS
+
+ /* Save R1/R2 in the PACA */
+ std r1, PACAR1(r13)
+ std r2, (PACA_EXMC+EX_SRR0)(r13)
+ ld r3, VCPU_HIGHMEM_HANDLER(r4)
+ std r3, PACASAVEDMSR(r13)
+
+ /* Load non-volatile guest state from the vcpu */
+ ld r14, VCPU_GPR(r14)(r4)
+ ld r15, VCPU_GPR(r15)(r4)
+ ld r16, VCPU_GPR(r16)(r4)
+ ld r17, VCPU_GPR(r17)(r4)
+ ld r18, VCPU_GPR(r18)(r4)
+ ld r19, VCPU_GPR(r19)(r4)
+ ld r20, VCPU_GPR(r20)(r4)
+ ld r21, VCPU_GPR(r21)(r4)
+ ld r22, VCPU_GPR(r22)(r4)
+ ld r23, VCPU_GPR(r23)(r4)
+ ld r24, VCPU_GPR(r24)(r4)
+ ld r25, VCPU_GPR(r25)(r4)
+ ld r26, VCPU_GPR(r26)(r4)
+ ld r27, VCPU_GPR(r27)(r4)
+ ld r28, VCPU_GPR(r28)(r4)
+ ld r29, VCPU_GPR(r29)(r4)
+ ld r30, VCPU_GPR(r30)(r4)
+ ld r31, VCPU_GPR(r31)(r4)
+
+ ld r9, VCPU_PC(r4) /* r9 = vcpu->arch.pc */
+ ld r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */
+
+ ld r3, VCPU_TRAMPOLINE_ENTER(r4)
+ mtsrr0 r3
+
+ loadimm r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)
+ mtsrr1 r3
+
+ /* Load guest state in the respective registers */
+ lwz r3, VCPU_CR(r4) /* r3 = vcpu->arch.cr */
+ stw r3, (PACA_EXMC + EX_CCR)(r13)
+
+ ld r3, VCPU_CTR(r4) /* r3 = vcpu->arch.ctr */
+ mtctr r3 /* CTR = r3 */
+
+ ld r3, VCPU_LR(r4) /* r3 = vcpu->arch.lr */
+ mtlr r3 /* LR = r3 */
+
+ ld r3, VCPU_XER(r4) /* r3 = vcpu->arch.xer */
+ std r3, (PACA_EXMC + EX_R3)(r13)
+
+ /* This sets the Magic value for the trampoline:
+ *
+ * PPC64: SPRG3 |= 1
+ */
+ setmagc r3
+
+ /* Some guests may need to have dcbz set to 32 byte length.
+ *
+ * Usually we ensure that by patching the guest's instructions
+ * to trap on dcbz and emulate it in the hypervisor.
+ *
+ * If we can, we should tell the CPU to use 32 byte dcbz though,
+ * because that's a lot faster.
+ */
+
+ ld r3, VCPU_HFLAGS(r4)
+ rldicl. r3, r3, 0, 63 /* CR = ((r3 & 1) = 0) */
+ beq no_dcbz32_on
+
+ mfspr r3,SPRN_HID5
+ ori r3, r3, 0x80 /* XXX HID5_dcbz32 = 0x80 */
+ mtspr SPRN_HID5,r3
+
+no_dcbz32_on:
+ /* Load guest GPRs */
+
+ ld r3, VCPU_GPR(r9)(r4)
+ std r3, (PACA_EXMC + EX_R9)(r13)
+ ld r3, VCPU_GPR(r10)(r4)
+ std r3, (PACA_EXMC + EX_R10)(r13)
+ ld r3, VCPU_GPR(r11)(r4)
+ std r3, (PACA_EXMC + EX_R11)(r13)
+ ld r3, VCPU_GPR(r12)(r4)
+ std r3, (PACA_EXMC + EX_R12)(r13)
+ ld r3, VCPU_GPR(r13)(r4)
+ std r3, (PACA_EXMC + EX_R13)(r13)
+
+ ld r0, VCPU_GPR(r0)(r4)
+ mtspr SPRN_SPRG1, r0
+
+ ld r1, VCPU_GPR(r1)(r4)
+ ld r2, VCPU_GPR(r2)(r4)
+ ld r3, VCPU_GPR(r3)(r4)
+ ld r5, VCPU_GPR(r5)(r4)
+ ld r6, VCPU_GPR(r6)(r4)
+ ld r7, VCPU_GPR(r7)(r4)
+ ld r8, VCPU_GPR(r8)(r4)
+ ld r4, VCPU_GPR(r4)(r4)
+
+ /* Jump to SLB patching handler and into our guest */
+ RFI
+
+/*
+ * This is the handler in module memory. It gets jumped at from the
+ * lowmem trampoline code, so it's basically the guest exit code.
+ *
+ */
+
+.global kvmppc_handler_highmem
+kvmppc_handler_highmem:
+
+ /* SPRG usage at this point:
+ *
+ * SPRG0 = reserved
+ * SPRG1 = guest R13
+ * SPRG2 = guest CR
+ * SPRG3 = virt. PACA
+ * R01 = host R1
+ * R02 = host R2
+ * R10 = guest PC
+ * R11 = guest MSR
+ * R12 = exit handler id
+ * R13 = PACA
+ * PACA.exgen.R9 = guest R1
+ * PACA.exgen.R10 = guest R10
+ * PACA.exgen.R11 = guest R11
+ * PACA.exgen.R12 = guest R12
+ * PACA.exgen.R13 = guest R2
+ * PACA.exgen.LR = guest instruction
+ *
+ */
+
+ std r3, (PACA_EXMC+EX_R3)(r13)
+
+ /* save the exit id in R3 */
+ mr r3, r12
+
+ /* R12 = vcpu */
+ ld r12, HOST_STACK_VCPU(r1)
+
+ /* Now save the guest state */
+
+ std r0, VCPU_GPR(r0)(r12)
+ std r4, VCPU_GPR(r4)(r12)
+ std r5, VCPU_GPR(r5)(r12)
+ std r6, VCPU_GPR(r6)(r12)
+ std r7, VCPU_GPR(r7)(r12)
+ std r8, VCPU_GPR(r8)(r12)
+ std r9, VCPU_GPR(r9)(r12)
+
+ /* R13 is in SPRG1 */
+ mfspr r5, SPRN_SPRG1
+ std r5, VCPU_GPR(r13)(r12)
+
+ /* get registers from PACA */
+ mfpaca r5, r3, EX_R3, r12
+ mfpaca r5, r1, EX_R9, r12
+ mfpaca r5, r10, EX_R10, r12
+ mfpaca r5, r11, EX_R11, r12
+ mfpaca r5, r12, EX_R12, r12
+ mfpaca r5, r2, EX_R13, r12
+
+ lwz r5, (PACA_EXMC+EX_LR)(r13)
+ stw r5, VCPU_LAST_INST(r12)
+
+ ld r5, VCPU_HFLAGS(r12)
+ rldicl. r5, r5, 0, 63 /* CR = ((r5 & 1) = 0) */
+ beq no_dcbz32_off
+
+ mfspr r5,SPRN_HID5
+ rldimi r5,r5,6,56
+ mtspr SPRN_HID5,r5
+
+no_dcbz32_off:
+
+ /* XXX maybe skip on lightweight? */
+ std r14, VCPU_GPR(r14)(r12)
+ std r15, VCPU_GPR(r15)(r12)
+ std r16, VCPU_GPR(r16)(r12)
+ std r17, VCPU_GPR(r17)(r12)
+ std r18, VCPU_GPR(r18)(r12)
+ std r19, VCPU_GPR(r19)(r12)
+ std r20, VCPU_GPR(r20)(r12)
+ std r21, VCPU_GPR(r21)(r12)
+ std r22, VCPU_GPR(r22)(r12)
+ std r23, VCPU_GPR(r23)(r12)
+ std r24, VCPU_GPR(r24)(r12)
+ std r25, VCPU_GPR(r25)(r12)
+ std r26, VCPU_GPR(r26)(r12)
+ std r27, VCPU_GPR(r27)(r12)
+ std r28, VCPU_GPR(r28)(r12)
+ std r29, VCPU_GPR(r29)(r12)
+ std r30, VCPU_GPR(r30)(r12)
+ std r31, VCPU_GPR(r31)(r12)
+
+ /* Restore non-volatile host registers */
+ ld r14, HOST_STACK_R14(r1)
+ ld r15, HOST_STACK_R15(r1)
+ ld r16, HOST_STACK_R16(r1)
+ ld r17, HOST_STACK_R17(r1)
+ ld r18, HOST_STACK_R18(r1)
+ ld r19, HOST_STACK_R19(r1)
+ ld r20, HOST_STACK_R20(r1)
+ ld r21, HOST_STACK_R21(r1)
+ ld r22, HOST_STACK_R22(r1)
+ ld r23, HOST_STACK_R23(r1)
+ ld r24, HOST_STACK_R24(r1)
+ ld r25, HOST_STACK_R25(r1)
+ ld r26, HOST_STACK_R26(r1)
+ ld r27, HOST_STACK_R27(r1)
+ ld r28, HOST_STACK_R28(r1)
+ ld r29, HOST_STACK_R29(r1)
+ ld r30, HOST_STACK_R30(r1)
+ ld r31, HOST_STACK_R31(r1)
+
+ /* Save guest PC (R10) */
+ std r10, VCPU_PC(r12)
+
+ /* Save guest msr (R11) */
+ std r11, VCPU_SHADOW_MSR(r12)
+
+ /* Save guest CR (SPRG2) */
+ mfspr r5, SPRN_SPRG2
+ stw r5, VCPU_CR(r12)
+
+ /* Save guest CTR */
+ mfctr r5
+ std r5, VCPU_CTR(r12)
+
+ /* Save guest LR */
+ mflr r5
+ std r5, VCPU_LR(r12)
+
+ /* Save guest XER */
+ mfxer r5
+ std r5, VCPU_XER(r12)
+
+ /* Save guest DAR */
+ mfdar r5
+ std r5, VCPU_FAULT_DEAR(r12)
+
+ /* Save guest DSISR */
+ mfdsisr r5
+ std r5, VCPU_FAULT_DSISR(r12)
+
+ /* Restore host msr -> SRR1 */
+ ld r7, VCPU_HOST_MSR(r12)
+ mtsrr1 r7
+
+ /* Restore host IP -> SRR0 */
+ ld r6, VCPU_HOST_RETIP(r12)
+ mtsrr0 r6
+
+ /* For some interrupts, we need to call the real Linux */
+ /* handler, so it can do work for us. This has to happen */
+ /* as if the interrupt arrived from the kernel though, */
+ /* so let's fake it here where most state is restored. */
+
+ /* Call Linux for hardware interrupts/decrementer */
+ /* r3 = address of interrupt handler (exit reason) */
+
+ cmpwi r3, PPC970_INTERRUPT_EXTERNAL
+ beq call_linux_handler
+ cmpwi r3, PPC970_INTERRUPT_DECREMENTER
+ beq call_linux_handler
+
+ /* Back to Paged Mode! (goto kvm_return_point) with interrupts enabled */
+ RFI
+
+call_linux_handler:
+
+ /* If we land here we need to jump back to the handler we */
+ /* came from. */
+
+ /* We have a page that we can access from real mode, so let's */
+ /* jump back to that and use it as a trampoline to get back into the */
+ /* interrupt handler! */
+
+ /* Enable soft interrupts again, so the handler acts */
+ li r5, 1
+ stb r5, PACASOFTIRQEN(r13)
+
+ /* R3 still contains the exit code, */
+ /* R6 VCPU_HOST_RETIP and */
+ /* R7 VCPU_HOST_MSR */
+
+ mtlr r3
+
+ ld r5, VCPU_TRAMPOLINE_LOWMEM(r12)
+ mtsrr0 r5
+ loadimm r5, MSR_KERNEL & ~(MSR_IR | MSR_DR)
+ mtsrr1 r5
+
+ RFI
+
+.global kvm_return_point
+kvm_return_point:
+
+ /* Jump back to lightweight entry if we're supposed to */
+ /* go back into the guest */
+ mr r5, r3
+ ld r3, HOST_STACK_RUN(r1)
+ ld r4, HOST_STACK_VCPU(r1)
+ bl KVMPPC_HANDLE_EXIT
+
+#if 0 /* XXX get lightweight exits back */
+ cmpwi r3, RESUME_GUEST
+ bne kvm_exit_heavyweight
+
+ /* put VCPU and KVM_RUN back into place and roll again! */
+ ld r3, HOST_STACK_RUN(r1)
+ ld r4, HOST_STACK_VCPU(r1)
+ b kvm_start_lightweight
+
+kvm_exit_heavyweight:
+ /* Restore non-volatile host registers */
+ ld r14, HOST_STACK_LR(r1)
+ mtlr r14
+ ld r14, HOST_STACK_R14(r1)
+ ld r15, HOST_STACK_R15(r1)
+ ld r16, HOST_STACK_R16(r1)
+ ld r17, HOST_STACK_R17(r1)
+ ld r18, HOST_STACK_R18(r1)
+ ld r19, HOST_STACK_R19(r1)
+ ld r20, HOST_STACK_R20(r1)
+ ld r21, HOST_STACK_R21(r1)
+ ld r22, HOST_STACK_R22(r1)
+ ld r23, HOST_STACK_R23(r1)
+ ld r24, HOST_STACK_R24(r1)
+ ld r25, HOST_STACK_R25(r1)
+ ld r26, HOST_STACK_R26(r1)
+ ld r27, HOST_STACK_R27(r1)
+ ld r28, HOST_STACK_R28(r1)
+ ld r29, HOST_STACK_R29(r1)
+ ld r30, HOST_STACK_R30(r1)
+ ld r31, HOST_STACK_R31(r1)
+
+ addi r1, r1, HOST_STACK_SIZE
+#else
+ ld r4, HOST_STACK_LR(r1)
+ mtlr r4
+
+ cmpwi r3, RESUME_GUEST
+ bne kvm_exit_heavyweight
+
+ ld r3, HOST_STACK_RUN(r1)
+ ld r4, HOST_STACK_VCPU(r1)
+
+ addi r1, r1, HOST_STACK_SIZE
+
+ b kvm_start_entry
+
+kvm_exit_heavyweight:
+
+ addi r1, r1, HOST_STACK_SIZE
+#endif
+
+ blr
--
1.6.0.2
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH 06/23] Add 970 highmem asm code
2009-07-07 14:17 [PATCH 06/23] Add 970 highmem asm code Alexander Graf
@ 2009-07-08 4:30 ` Benjamin Herrenschmidt
2009-07-08 7:14 ` Alexander Graf
2009-07-08 7:37 ` Benjamin Herrenschmidt
2 siblings, 0 replies; 4+ messages in thread
From: Benjamin Herrenschmidt @ 2009-07-08 4:30 UTC (permalink / raw)
To: kvm-ppc
On Tue, 2009-07-07 at 16:17 +0200, Alexander Graf wrote:
> This is the of entry / exit code. In order to switch between host and guest
> context, we need to switch register state and call the exit code handler on
> exit.
>
> This assembly file does exactly that. To finally enter the guest it calls
> into 970_slb.S. On exit it gets jumped at from 970_slb.S too.
>
> Add header definition for highmem handler
>
> Signed-off-by: Alexander Graf <agraf@suse.de>
> ---
Why "highmem" ? IE, That terminology usually means something completely
different in Linux :-) (Aka, memory beyond the linear mapping which is
a concept that does not exist on ppc64).
I suppose you mean code that runs outside of the RMA ? (AKA Real Memory
Area, which is the memory that can be accessed while in real mode).
I have a few comments, this is in no way an in-depth review, I don't yet
totally see the big picture of your implementation but a few things I
spotted along the way:
> arch/powerpc/include/asm/kvm_ppc.h | 1 +
.../...
> +
> +/*****************************************************************************
> + * *
> + * Guest entry / exit code that is in kernel module memory (highmem) *
> + * *
> + ****************************************************************************/
> +
> +/* Registers:
> + * r3: kvm_run pointer
> + * r4: vcpu pointer
> + */
> +_GLOBAL(__kvmppc_vcpu_entry)
> +
> +kvm_start_entry:
> +
> + /* Save host state to the stack */
> + stdu r1, -HOST_STACK_SIZE(r1)
> + std r3, HOST_STACK_RUN(r1)
> + std r4, HOST_STACK_VCPU(r1)
> +
> + std r14, HOST_STACK_R14(r1)
> + std r15, HOST_STACK_R15(r1)
> + std r16, HOST_STACK_R16(r1)
> + std r17, HOST_STACK_R17(r1)
> + std r18, HOST_STACK_R18(r1)
> + std r19, HOST_STACK_R19(r1)
> + std r20, HOST_STACK_R20(r1)
> + std r21, HOST_STACK_R21(r1)
> + std r22, HOST_STACK_R22(r1)
> + std r23, HOST_STACK_R23(r1)
> + std r24, HOST_STACK_R24(r1)
> + std r25, HOST_STACK_R25(r1)
> + std r26, HOST_STACK_R26(r1)
> + std r27, HOST_STACK_R27(r1)
> + std r28, HOST_STACK_R28(r1)
> + std r29, HOST_STACK_R29(r1)
> + std r30, HOST_STACK_R30(r1)
> + std r31, HOST_STACK_R31(r1)
> + mflr r14
> + std r14, HOST_STACK_LR(r1)
Can we make that look closer to a pt_regs maybe or is that not
worth it ?
> +/* XXX optimize non-volatile loading away */
> +kvm_start_lightweight:
> +
> + DISABLE_INTERRUPTS
> +
> + /* Save R1/R2 in the PACA */
> + std r1, PACAR1(r13)
> + std r2, (PACA_EXMC+EX_SRR0)(r13)
> + ld r3, VCPU_HIGHMEM_HANDLER(r4)
> + std r3, PACASAVEDMSR(r13)
> +
> + /* Load non-volatile guest state from the vcpu */
> + ld r14, VCPU_GPR(r14)(r4)
> + ld r15, VCPU_GPR(r15)(r4)
> + ld r16, VCPU_GPR(r16)(r4)
> + ld r17, VCPU_GPR(r17)(r4)
> + ld r18, VCPU_GPR(r18)(r4)
> + ld r19, VCPU_GPR(r19)(r4)
> + ld r20, VCPU_GPR(r20)(r4)
> + ld r21, VCPU_GPR(r21)(r4)
> + ld r22, VCPU_GPR(r22)(r4)
> + ld r23, VCPU_GPR(r23)(r4)
> + ld r24, VCPU_GPR(r24)(r4)
> + ld r25, VCPU_GPR(r25)(r4)
> + ld r26, VCPU_GPR(r26)(r4)
> + ld r27, VCPU_GPR(r27)(r4)
> + ld r28, VCPU_GPR(r28)(r4)
> + ld r29, VCPU_GPR(r29)(r4)
> + ld r30, VCPU_GPR(r30)(r4)
> + ld r31, VCPU_GPR(r31)(r4)
> +
> + ld r9, VCPU_PC(r4) /* r9 = vcpu->arch.pc */
> + ld r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */
> +
> + ld r3, VCPU_TRAMPOLINE_ENTER(r4)
> + mtsrr0 r3
> +
> + loadimm r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)
> + mtsrr1 r3
> +
> + /* Load guest state in the respective registers */
> + lwz r3, VCPU_CR(r4) /* r3 = vcpu->arch.cr */
> + stw r3, (PACA_EXMC + EX_CCR)(r13)
> +
> + ld r3, VCPU_CTR(r4) /* r3 = vcpu->arch.ctr */
> + mtctr r3 /* CTR = r3 */
> +
> + ld r3, VCPU_LR(r4) /* r3 = vcpu->arch.lr */
> + mtlr r3 /* LR = r3 */
> +
> + ld r3, VCPU_XER(r4) /* r3 = vcpu->arch.xer */
> + std r3, (PACA_EXMC + EX_R3)(r13)
> +
> + /* This sets the Magic value for the trampoline:
> + *
> + * PPC64: SPRG3 |= 1
> + */
> + setmagc r3
> +
> + /* Some guests may need to have dcbz set to 32 byte length.
> + *
> + * Usually we ensure that by patching the guest's instructions
> + * to trap on dcbz and emulate it in the hypervisor.
> + *
> + * If we can, we should tell the CPU to use 32 byte dcbz though,
> + * because that's a lot faster.
> + */
> +
> + ld r3, VCPU_HFLAGS(r4)
> + rldicl. r3, r3, 0, 63 /* CR = ((r3 & 1) = 0) */
> + beq no_dcbz32_on
> +
> + mfspr r3,SPRN_HID5
> + ori r3, r3, 0x80 /* XXX HID5_dcbz32 = 0x80 */
> + mtspr SPRN_HID5,r3
> +
> +no_dcbz32_on:
The whole dcbz stuff could probably be a cpufeature block so it
gets nop'ed out when running on other processors than 970 since
they don't all support that magic dcbz trick. Also, I think HID5
is a HV reserved register thus you won't be able to do that trick
when running yourself with MSR:HV=0, for example when running on
a js2x blade.
> + /* Save guest DAR */
> + mfdar r5
> + std r5, VCPU_FAULT_DEAR(r12)
The guest is running with MSR:PR set to 0 or 1 ? If 1, it doesn't have
access to DAR or DSISR so I don't quite see the point of
saving/restoring them here, you can just hand out the register straight
off your shadow when taking the protection faults as the guest tries
to access them. If the guest is running with PR:0 then there is no
protection of the host against the guest which sucks :-)
Or do I miss something ?
> + /* Save guest DSISR */
> + mfdsisr r5
> + std r5, VCPU_FAULT_DSISR(r12)
> +
> + /* Restore host msr -> SRR1 */
> + ld r7, VCPU_HOST_MSR(r12)
> + mtsrr1 r7
> +
> + /* Restore host IP -> SRR0 */
> + ld r6, VCPU_HOST_RETIP(r12)
> + mtsrr0 r6
> +
> + /* For some interrupts, we need to call the real Linux */
> + /* handler, so it can do work for us. This has to happen */
> + /* as if the interrupt arrived from the kernel though, */
> + /* so let's fake it here where most state is restored. */
> +
> + /* Call Linux for hardware interrupts/decrementer */
> + /* r3 = address of interrupt handler (exit reason) */
> +
> + cmpwi r3, PPC970_INTERRUPT_EXTERNAL
> + beq call_linux_handler
> + cmpwi r3, PPC970_INTERRUPT_DECREMENTER
> + beq call_linux_handler
> +
> + /* Back to Paged Mode! (goto kvm_return_point) with interrupts enabled */
> + RFI
Ok so I need to understand better the whole model... ie how you get
in/out of the guest etc... I would have thought you wanted to call into
kernel interrupts such as DEC or EE as if coming from userspace
actually...
> +call_linux_handler:
> +
> + /* If we land here we need to jump back to the handler we */
> + /* came from. */
> +
> + /* We have a page that we can access from real mode, so let's */
> + /* jump back to that and use it as a trampoline to get back into the */
> + /* interrupt handler! */
> +
> + /* Enable soft interrupts again, so the handler acts */
> + li r5, 1
> + stb r5, PACASOFTIRQEN(r13)
But we aren't supposed to enter the timer or EE with softirq enabled...
BTW we probably also need to record some of that stuff with lockdep
but we can look at that later.
> + /* R3 still contains the exit code, */
> + /* R6 VCPU_HOST_RETIP and */
> + /* R7 VCPU_HOST_MSR */
> +
> + mtlr r3
> +
> + ld r5, VCPU_TRAMPOLINE_LOWMEM(r12)
> + mtsrr0 r5
> + loadimm r5, MSR_KERNEL & ~(MSR_IR | MSR_DR)
> + mtsrr1 r5
> +
> + RFI
> +
> +.global kvm_return_point
> +kvm_return_point:
> +
> + /* Jump back to lightweight entry if we're supposed to */
> + /* go back into the guest */
> + mr r5, r3
> + ld r3, HOST_STACK_RUN(r1)
> + ld r4, HOST_STACK_VCPU(r1)
> + bl KVMPPC_HANDLE_EXIT
> +
> +#if 0 /* XXX get lightweight exits back */
> + cmpwi r3, RESUME_GUEST
> + bne kvm_exit_heavyweight
> +
> + /* put VCPU and KVM_RUN back into place and roll again! */
> + ld r3, HOST_STACK_RUN(r1)
> + ld r4, HOST_STACK_VCPU(r1)
> + b kvm_start_lightweight
> +
> +kvm_exit_heavyweight:
> + /* Restore non-volatile host registers */
> + ld r14, HOST_STACK_LR(r1)
> + mtlr r14
> + ld r14, HOST_STACK_R14(r1)
> + ld r15, HOST_STACK_R15(r1)
> + ld r16, HOST_STACK_R16(r1)
> + ld r17, HOST_STACK_R17(r1)
> + ld r18, HOST_STACK_R18(r1)
> + ld r19, HOST_STACK_R19(r1)
> + ld r20, HOST_STACK_R20(r1)
> + ld r21, HOST_STACK_R21(r1)
> + ld r22, HOST_STACK_R22(r1)
> + ld r23, HOST_STACK_R23(r1)
> + ld r24, HOST_STACK_R24(r1)
> + ld r25, HOST_STACK_R25(r1)
> + ld r26, HOST_STACK_R26(r1)
> + ld r27, HOST_STACK_R27(r1)
> + ld r28, HOST_STACK_R28(r1)
> + ld r29, HOST_STACK_R29(r1)
> + ld r30, HOST_STACK_R30(r1)
> + ld r31, HOST_STACK_R31(r1)
> +
> + addi r1, r1, HOST_STACK_SIZE
> +#else
> + ld r4, HOST_STACK_LR(r1)
> + mtlr r4
> +
> + cmpwi r3, RESUME_GUEST
> + bne kvm_exit_heavyweight
> +
> + ld r3, HOST_STACK_RUN(r1)
> + ld r4, HOST_STACK_VCPU(r1)
> +
> + addi r1, r1, HOST_STACK_SIZE
> +
> + b kvm_start_entry
> +
> +kvm_exit_heavyweight:
> +
> + addi r1, r1, HOST_STACK_SIZE
> +#endif
> +
> + blr
> --
> 1.6.0.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH 06/23] Add 970 highmem asm code
2009-07-07 14:17 [PATCH 06/23] Add 970 highmem asm code Alexander Graf
2009-07-08 4:30 ` Benjamin Herrenschmidt
@ 2009-07-08 7:14 ` Alexander Graf
2009-07-08 7:37 ` Benjamin Herrenschmidt
2 siblings, 0 replies; 4+ messages in thread
From: Alexander Graf @ 2009-07-08 7:14 UTC (permalink / raw)
To: kvm-ppc
On 08.07.2009, at 06:30, Benjamin Herrenschmidt wrote:
> On Tue, 2009-07-07 at 16:17 +0200, Alexander Graf wrote:
>> This is the of entry / exit code. In order to switch between host
>> and guest
>> context, we need to switch register state and call the exit code
>> handler on
>> exit.
>>
>> This assembly file does exactly that. To finally enter the guest it
>> calls
>> into 970_slb.S. On exit it gets jumped at from 970_slb.S too.
>>
>> Add header definition for highmem handler
>>
>> Signed-off-by: Alexander Graf <agraf@suse.de>
>> ---
>
> Why "highmem" ? IE, That terminology usually means something
> completely
> different in Linux :-) (Aka, memory beyond the linear mapping which is
> a concept that does not exist on ppc64).
>
> I suppose you mean code that runs outside of the RMA ? (AKA Real
> Memory
> Area, which is the memory that can be accessed while in real mode).
Yes, I'm open to naming convention suggestions :-).
> I have a few comments, this is in no way an in-depth review, I don't
> yet
> totally see the big picture of your implementation but a few things I
> spotted along the way:
>
>> arch/powerpc/include/asm/kvm_ppc.h | 1 +
>
> .../...
mh?
>
>> +
>> +/
>> *****************************************************************************
>> +
>> * *
>> + * Guest entry / exit code that is in kernel module memory
>> (highmem) *
>> +
>> * *
>> +
>> ****************************************************************************/
>> +
>> +/* Registers:
>> + * r3: kvm_run pointer
>> + * r4: vcpu pointer
>> + */
>> +_GLOBAL(__kvmppc_vcpu_entry)
>> +
>> +kvm_start_entry:
>> +
>> + /* Save host state to the stack */
>> + stdu r1, -HOST_STACK_SIZE(r1)
>> + std r3, HOST_STACK_RUN(r1)
>> + std r4, HOST_STACK_VCPU(r1)
>> +
>> + std r14, HOST_STACK_R14(r1)
>> + std r15, HOST_STACK_R15(r1)
>> + std r16, HOST_STACK_R16(r1)
>> + std r17, HOST_STACK_R17(r1)
>> + std r18, HOST_STACK_R18(r1)
>> + std r19, HOST_STACK_R19(r1)
>> + std r20, HOST_STACK_R20(r1)
>> + std r21, HOST_STACK_R21(r1)
>> + std r22, HOST_STACK_R22(r1)
>> + std r23, HOST_STACK_R23(r1)
>> + std r24, HOST_STACK_R24(r1)
>> + std r25, HOST_STACK_R25(r1)
>> + std r26, HOST_STACK_R26(r1)
>> + std r27, HOST_STACK_R27(r1)
>> + std r28, HOST_STACK_R28(r1)
>> + std r29, HOST_STACK_R29(r1)
>> + std r30, HOST_STACK_R30(r1)
>> + std r31, HOST_STACK_R31(r1)
>> + mflr r14
>> + std r14, HOST_STACK_LR(r1)
>
> Can we make that look closer to a pt_regs maybe or is that not
> worth it ?
Yeah, that should be definitely possible. While it's not really
necessary it makes the code smaller, so it's probably worth it ;-).
>
>> +/* XXX optimize non-volatile loading away */
>> +kvm_start_lightweight:
>> +
>> + DISABLE_INTERRUPTS
>> +
>> + /* Save R1/R2 in the PACA */
>> + std r1, PACAR1(r13)
>> + std r2, (PACA_EXMC+EX_SRR0)(r13)
>> + ld r3, VCPU_HIGHMEM_HANDLER(r4)
>> + std r3, PACASAVEDMSR(r13)
>> +
>> + /* Load non-volatile guest state from the vcpu */
>> + ld r14, VCPU_GPR(r14)(r4)
>> + ld r15, VCPU_GPR(r15)(r4)
>> + ld r16, VCPU_GPR(r16)(r4)
>> + ld r17, VCPU_GPR(r17)(r4)
>> + ld r18, VCPU_GPR(r18)(r4)
>> + ld r19, VCPU_GPR(r19)(r4)
>> + ld r20, VCPU_GPR(r20)(r4)
>> + ld r21, VCPU_GPR(r21)(r4)
>> + ld r22, VCPU_GPR(r22)(r4)
>> + ld r23, VCPU_GPR(r23)(r4)
>> + ld r24, VCPU_GPR(r24)(r4)
>> + ld r25, VCPU_GPR(r25)(r4)
>> + ld r26, VCPU_GPR(r26)(r4)
>> + ld r27, VCPU_GPR(r27)(r4)
>> + ld r28, VCPU_GPR(r28)(r4)
>> + ld r29, VCPU_GPR(r29)(r4)
>> + ld r30, VCPU_GPR(r30)(r4)
>> + ld r31, VCPU_GPR(r31)(r4)
>> +
>> + ld r9, VCPU_PC(r4) /* r9 = vcpu->arch.pc */
>> + ld r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */
>> +
>> + ld r3, VCPU_TRAMPOLINE_ENTER(r4)
>> + mtsrr0 r3
>> +
>> + loadimm r3, MSR_KERNEL & ~(MSR_IR | MSR_DR)
>> + mtsrr1 r3
>> +
>> + /* Load guest state in the respective registers */
>> + lwz r3, VCPU_CR(r4) /* r3 = vcpu->arch.cr */
>> + stw r3, (PACA_EXMC + EX_CCR)(r13)
>> +
>> + ld r3, VCPU_CTR(r4) /* r3 = vcpu->arch.ctr */
>> + mtctr r3 /* CTR = r3 */
>> +
>> + ld r3, VCPU_LR(r4) /* r3 = vcpu->arch.lr */
>> + mtlr r3 /* LR = r3 */
>> +
>> + ld r3, VCPU_XER(r4) /* r3 = vcpu->arch.xer */
>> + std r3, (PACA_EXMC + EX_R3)(r13)
>> +
>> + /* This sets the Magic value for the trampoline:
>> + *
>> + * PPC64: SPRG3 |= 1
>> + */
>> + setmagc r3
>> +
>> + /* Some guests may need to have dcbz set to 32 byte length.
>> + *
>> + * Usually we ensure that by patching the guest's instructions
>> + * to trap on dcbz and emulate it in the hypervisor.
>> + *
>> + * If we can, we should tell the CPU to use 32 byte dcbz though,
>> + * because that's a lot faster.
>> + */
>> +
>> + ld r3, VCPU_HFLAGS(r4)
>> + rldicl. r3, r3, 0, 63 /* CR = ((r3 & 1) = 0) */
>> + beq no_dcbz32_on
>> +
>> + mfspr r3,SPRN_HID5
>> + ori r3, r3, 0x80 /* XXX HID5_dcbz32 = 0x80 */
>> + mtspr SPRN_HID5,r3
>> +
>> +no_dcbz32_on:
>
> The whole dcbz stuff could probably be a cpufeature block so it
> gets nop'ed out when running on other processors than 970 since
> they don't all support that magic dcbz trick.
Yeah, I never really understood those cpufeature blocks ...
> Also, I think HID5
> is a HV reserved register thus you won't be able to do that trick
> when running yourself with MSR:HV=0, for example when running on
> a js2x blade.
Yes, it is. That's why the HFLAGS bit is only set when HV=1 :-).
>
>> + /* Save guest DAR */
>> + mfdar r5
>> + std r5, VCPU_FAULT_DEAR(r12)
>
> The guest is running with MSR:PR set to 0 or 1 ? If 1, it doesn't have
> access to DAR or DSISR so I don't quite see the point of
> saving/restoring them here, you can just hand out the register
> straight
> off your shadow when taking the protection faults as the guest tries
> to access them. If the guest is running with PR:0 then there is no
> protection of the host against the guest which sucks :-)
>
> Or do I miss something ?
FAULT_* are basically the registers that store where the guest
faulted. So if the guest triggers a data store interrupt, the
corresponding dar gets stored to a vcpu field, so we don't clobber it
later.
Yes, the guest runs with PR=1 :-).
>
>> + /* Save guest DSISR */
>> + mfdsisr r5
>> + std r5, VCPU_FAULT_DSISR(r12)
>> +
>> + /* Restore host msr -> SRR1 */
>> + ld r7, VCPU_HOST_MSR(r12)
>> + mtsrr1 r7
>> +
>> + /* Restore host IP -> SRR0 */
>> + ld r6, VCPU_HOST_RETIP(r12)
>> + mtsrr0 r6
>> +
>> + /* For some interrupts, we need to call the real Linux */
>> + /* handler, so it can do work for us. This has to happen */
>> + /* as if the interrupt arrived from the kernel though, */
>> + /* so let's fake it here where most state is restored. */
>> +
>> + /* Call Linux for hardware interrupts/decrementer */
>> + /* r3 = address of interrupt handler (exit reason) */
>> +
>> + cmpwi r3, PPC970_INTERRUPT_EXTERNAL
>> + beq call_linux_handler
>> + cmpwi r3, PPC970_INTERRUPT_DECREMENTER
>> + beq call_linux_handler
>> +
>> + /* Back to Paged Mode! (goto kvm_return_point) with interrupts
>> enabled */
>> + RFI
>
> Ok so I need to understand better the whole model... ie how you get
> in/out of the guest etc... I would have thought you wanted to call
> into
> kernel interrupts such as DEC or EE as if coming from userspace
> actually...
I don't think we can easily have Linux running while we're in the
guest context. What if the DEC invokes the scheduler, which schedules us
off and back on again? How would it know where to resume the guest? And
who'd set the magic bit in SPRG3?
When running a PPC64 guest things get even worse, as we have to switch
the SLB as well, which is actually the slow part of the entry/exit
code atm.
Maybe we could work around those problems by integrating things a bit
more, but I doubt it's necessary. Host DEC and EE interrupts shouldn't
really hurt performance that much.
What we do here is do a full guest exit cycle and go back to the Linux
handler we came from, so it can handle the interrupt we intercepted.
That way we're in normal kernel code from the point of view of every
other part of Linux.
>
>> +call_linux_handler:
>> +
>> + /* If we land here we need to jump back to the handler we */
>> + /* came from. */
>> +
>> + /* We have a page that we can access from real mode, so let's */
>> + /* jump back to that and use it as a trampoline to get back into
>> the */
>> + /* interrupt handler! */
>> +
>> + /* Enable soft interrupts again, so the handler acts */
>> + li r5, 1
>> + stb r5, PACASOFTIRQEN(r13)
>
> But we aren't supposed to enter the timer or EE with softirq
> enabled...
> BTW we probably also need to record some of that stuff with lockdep
> but we can look at that later.
Maybe I'm calling it wrong? Basically, I want Linux to handle
interrupts :-). And I did a local_irq_disable() before, so this is the
asm equivalent of _enable, no?
Alex
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH 06/23] Add 970 highmem asm code
2009-07-07 14:17 [PATCH 06/23] Add 970 highmem asm code Alexander Graf
2009-07-08 4:30 ` Benjamin Herrenschmidt
2009-07-08 7:14 ` Alexander Graf
@ 2009-07-08 7:37 ` Benjamin Herrenschmidt
2 siblings, 0 replies; 4+ messages in thread
From: Benjamin Herrenschmidt @ 2009-07-08 7:37 UTC (permalink / raw)
To: kvm-ppc
On Wed, 2009-07-08 at 09:14 +0200, Alexander Graf wrote:
> >> arch/powerpc/include/asm/kvm_ppc.h | 1 +
> >
> > .../...
>
> mh?
Just a standard way to say I snipped some of the quote :-)
> Yeah, that should be definitely possible. While it's not really
> necessary it makes the code smaller, so it's probably worth it ;-).
Could also make debugging easier. In fact you should make the whole
thing look like an interrupt frame (aka pt_regs + STACK_FRAME_OVERHEAD)
and stick in a signature similar to the one we put in our interrupt frames
(see the exception common macro) so we properly see them for what they
are in xmon etc...
> >> +/* XXX optimize non-volatile loading away */
> >> +kvm_start_lightweight:
> >> +
> >> + DISABLE_INTERRUPTS
BTW. If this is coming from C code, I'd rather have a hard_irq_disable()
call in the C code before calling into the asm.
> >> + /* This sets the Magic value for the trampoline:
> >> + *
> >> + * PPC64: SPRG3 |= 1
> >> + */
> >> + setmagc r3
> >> +
From the moment we do that, we must not take an exception until we
actually end up in the guest right ? So the code below must not
take an SLB miss.
However, I don't think it is guaranteed that your VCPU thingy pointed to
by r4 is currently in a bolted SLB entry. On some P5 or later machines,
the SLB is effectively volatile: the underlying pHyp hypervisor can crap
on it, though it will restore bits of it via the shadow SLB data
structure in main memory. However, unless you arrange for the VCPU
structure to be in the first 256M of memory, it won't be covered by that
shadow. You may want to modify the SLB code when using KVM to also
"bolt" the VCPU or delay the flicking of SPRG3 if you can get away with
clobbering a GPR ...
> > The whole dcbz stuff could probably be a cpufeature block so it
> > gets nop'ed out when running on other processors than 970 since
> > they don't all support that magic dcbz trick.
>
> Yeah, I never really understood those cpufeature blocks ...
Hehehe :-) There's also the MMU features and FW features btw :-) The
base principle is that we stick references to the start and end of the
block into an ELF section along with a mask & value of CPU feature bits
to compare against. At boot time, if it doesn't match we NOP out
everything between start and stop. Recently, Michael Ellerman also
improved on it by allowing an "alternate", i.e. two implementations
of the block of code: the first one is in place by default, the second
one lives in a separate ELF section and is copied over the first one
(padded with NOPs, with branches fixed up too) if the CPU features
don't match. That allows "alternate" implementations of perf.
critical asm code (of course, the "default" implementation needs to be
larger or equal in size to the "alternate" one).
> > Also, I think HID5
> > is a HV reserved register thus you won't be able to do that trick
> > when running yourself with MSR:HV=0, for example when running on
> > a js2x blade.
>
> Yes, it is. That's why the HFLAGS bit is only set when HV=1 :-).
Ok. This is also something that should only be done on a real 970, 970FX
or 970MP processor as others don't have that bit in HID5 afaik.
> FAULT_* are basically the registers that store where the guest
> faulted. So if the guest triggers a data store interrupt, the
> corresponding dar gets stored to a vcpu field, so we don't clobber it
> later.
Ok.
> Yes, the guest runs with PR=1 :-).
Right, that was my understanding too but heh, better being sure :-)
> I don't think we can easily have Linux running while we're in the
> guest context. What if the DEC issues the scheduler, which schedules
> off and back again? How would it know where to resume the guest? And
> who'd set the magic bit in SPRG3?
No, you misunderstood me. But then, I need to better "get" what you are
doing. For example, with MOL, the guest is split in two... the part that
is in the virtual machine, but also the parts that run as a normal linux
process (which do the device emulation etc...). The trick when we take
any exception is we context switch back to make it look like we are
coming from that part, basically from the magic syscall where the
"linux" part of the guest called into the kernel to switch into
emulation.
I have to get more familiar with how KVM does these things though to
provide a more useful feedback.
> When running a PPC64 guest things get even worse, as we have to switch
> the SLB as well, which is actually the slow part of the entry/exit
> code atm.
I'm not totally sure we really have to, I need to better understand what
you do with the SLB, and that with my own knowledge of what Linux needs,
we can probably simplify things quite a bit. For example, most of the
Linux host side SLB entries can just be ditched.
> Maybe we could work around those problems by integrating things a bit
> more, but I doubt it's necessary. Host DEC and EE interrupts shouldn't
> really hurt performance that much.
Right. Beware that MacOS 9, if you ever want to run that, will trigger
shitloads of guest DEC interrupts tho.
> What we do here is do a full guest exit cycle and go back to the Linux
> handler we came from, so it can handle the interrupt we intercepted.
> That way we're in normal kernel code from the point of view of every
> other part of Linux.
But don't we do that for any interrupt ? I don't quite get why DEC and
EE are "special" here...
What about machine checks, for example ? Or system reset ? I understand
that you want synchronous interrupts such as FP, altivec, etc... to be
routed back to the guest but DEC and EE aren't the only ones that need
to be reflected back to Linux are they ?
> Maybe I'm calling it wrong? Basically, I want Linux to handle
> interrupts :-). And I did a irq_local_disable before, so this is the
> asm equivalent of _enable, no?
Well, no, if you were to do that you should call raw_local_irq_restore()
since we may need to do some "fixups" for example if an interrupt did
happen while we were soft-disabled.
But then, you should not call into the linux EE or decrementer handler
with interrupts enabled in the first place. You should really just make
it look like you took the interrupt from the underlying userland process
in which the guest runs...
Catch me on IRC, I need to better understand your model, and we can sort
that out.
Cheers,
Ben.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2009-07-08 7:37 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-07-07 14:17 [PATCH 06/23] Add 970 highmem asm code Alexander Graf
2009-07-08 4:30 ` Benjamin Herrenschmidt
2009-07-08 7:14 ` Alexander Graf
2009-07-08 7:37 ` Benjamin Herrenschmidt
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.