All of lore.kernel.org
 help / color / mirror / Atom feed
From: Christoffer Dall <christoffer.dall@linaro.org>
To: Zhichao Huang <zhichao.huang@linaro.org>
Cc: kvm@vger.kernel.org, linux-arm-kernel@lists.infradead.org,
	kvmarm@lists.cs.columbia.edu, marc.zyngier@arm.com,
	alex.bennee@linaro.org, will.deacon@arm.com,
	huangzhichao@huawei.com
Subject: Re: [PATCH v3 09/11] KVM: arm: implement lazy world switch for debug registers
Date: Tue, 30 Jun 2015 15:15:22 +0200	[thread overview]
Message-ID: <20150630131522.GQ11332@cbox> (raw)
In-Reply-To: <1434969694-7432-10-git-send-email-zhichao.huang@linaro.org>

On Mon, Jun 22, 2015 at 06:41:32PM +0800, Zhichao Huang wrote:
> Implement switching of the debug registers. While the number
> of registers is massive, CPUs usually don't implement them all
> (A15 has 6 breakpoints and 4 watchpoints, which gives us a total
> of 22 registers "only").
> 
> Notice that, for ARMv7, if the CONFIG_HAVE_HW_BREAKPOINT is set in
> the guest, debug is always actively in use (ARM_DSCR_MDBGEN set).
> 
> We have to do the save/restore dance in this case, because the host
> and the guest might use their respective debug registers at any moment.

this sounds expensive, and I suggested an alternative approach in the
previsou patch.  In any case, measuring the impact on this on hardware
would be a great idea...

> 
> If the CONFIG_HAVE_HW_BREAKPOINT is not set, and if no one flagged
> the debug registers as dirty, we only save/resotre DBGDSCR.

restore

> 
> Signed-off-by: Zhichao Huang <zhichao.huang@linaro.org>
> ---
>  arch/arm/kvm/interrupts.S      |  16 +++
>  arch/arm/kvm/interrupts_head.S | 249 ++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 263 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
> index 79caf79..d626275 100644
> --- a/arch/arm/kvm/interrupts.S
> +++ b/arch/arm/kvm/interrupts.S
> @@ -116,6 +116,12 @@ ENTRY(__kvm_vcpu_run)
>  	read_cp15_state store_to_vcpu = 0
>  	write_cp15_state read_from_vcpu = 1
>  
> +	@ Store hardware CP14 state and load guest state
> +	compute_debug_state 1f
> +	bl __save_host_debug_regs
> +	bl __restore_guest_debug_regs
> +
> +1:
>  	@ If the host kernel has not been configured with VFPv3 support,
>  	@ then it is safer if we deny guests from using it as well.
>  #ifdef CONFIG_VFPv3
> @@ -201,6 +207,16 @@ after_vfp_restore:
>  	mrc	p15, 0, r2, c0, c0, 5
>  	mcr	p15, 4, r2, c0, c0, 5
>  
> +	@ Store guest CP14 state and restore host state
> +	skip_debug_state 1f
> +	bl __save_guest_debug_regs
> +	bl __restore_host_debug_regs
> +	/* Clear the dirty flag for the next run, as all the state has
> +	 * already been saved. Note that we nuke the whole 32bit word.
> +	 * If we ever add more flags, we'll have to be more careful...
> +	 */
> +	clear_debug_dirty_bit
> +1:
>  	@ Store guest CP15 state and restore host state
>  	read_cp15_state store_to_vcpu = 1
>  	write_cp15_state read_from_vcpu = 0
> diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
> index 5662c39..ed406be 100644
> --- a/arch/arm/kvm/interrupts_head.S
> +++ b/arch/arm/kvm/interrupts_head.S
> @@ -7,6 +7,7 @@
>  #define VCPU_USR_SP		(VCPU_USR_REG(13))
>  #define VCPU_USR_LR		(VCPU_USR_REG(14))
>  #define CP15_OFFSET(_cp15_reg_idx) (VCPU_CP15 + (_cp15_reg_idx * 4))
> +#define CP14_OFFSET(_cp14_reg_idx) (VCPU_CP14 + ((_cp14_reg_idx) * 4))
>  
>  /*
>   * Many of these macros need to access the VCPU structure, which is always
> @@ -168,8 +169,7 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>   * Clobbers *all* registers.
>   */
>  .macro restore_guest_regs
> -	/* reset DBGDSCR to disable debug mode */
> -	mov	r2, #0
> +	ldr	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
>  	mcr	p14, 0, r2, c0, c2, 2
>  
>  	restore_guest_regs_mode svc, #VCPU_SVC_REGS
> @@ -250,6 +250,10 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>  	save_guest_regs_mode abt, #VCPU_ABT_REGS
>  	save_guest_regs_mode und, #VCPU_UND_REGS
>  	save_guest_regs_mode irq, #VCPU_IRQ_REGS
> +
> +	/* DBGDSCR reg */
> +	mrc	p14, 0, r2, c0, c1, 0
> +	str	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
>  .endm
>  
>  /* Reads cp15 registers from hardware and stores them in memory
> @@ -449,6 +453,231 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>  	str	r5, [vcpu, #VCPU_DEBUG_FLAGS]
>  .endm
>  
> +/* Assume r11/r12 in used, clobbers r2-r10 */
> +.macro cp14_read_and_push Op2 skip_num
> +	cmp	\skip_num, #8
> +	// if (skip_num >= 8) then skip c8-c15 directly
> +	bge	1f
> +	adr	r2, 9998f
> +	add	r2, r2, \skip_num, lsl #2
> +	bx	r2
> +1:
> +	adr	r2, 9999f
> +	sub	r3, \skip_num, #8
> +	add	r2, r2, r3, lsl #2
> +	bx	r2
> +9998:
> +	mrc	p14, 0, r10, c0, c15, \Op2
> +	mrc	p14, 0, r9, c0, c14, \Op2
> +	mrc	p14, 0, r8, c0, c13, \Op2
> +	mrc	p14, 0, r7, c0, c12, \Op2
> +	mrc	p14, 0, r6, c0, c11, \Op2
> +	mrc	p14, 0, r5, c0, c10, \Op2
> +	mrc	p14, 0, r4, c0, c9, \Op2
> +	mrc	p14, 0, r3, c0, c8, \Op2
> +	push	{r3-r10}

you probably don't want to do more stores to memory than required

> +9999:
> +	mrc	p14, 0, r10, c0, c7, \Op2
> +	mrc	p14, 0, r9, c0, c6, \Op2
> +	mrc	p14, 0, r8, c0, c5, \Op2
> +	mrc	p14, 0, r7, c0, c4, \Op2
> +	mrc	p14, 0, r6, c0, c3, \Op2
> +	mrc	p14, 0, r5, c0, c2, \Op2
> +	mrc	p14, 0, r4, c0, c1, \Op2
> +	mrc	p14, 0, r3, c0, c0, \Op2
> +	push	{r3-r10}

same

> +.endm
> +
> +/* Assume r11/r12 in used, clobbers r2-r10 */
> +.macro cp14_pop_and_write Op2 skip_num
> +	cmp	\skip_num, #8
> +	// if (skip_num >= 8) then skip c8-c15 directly
> +	bge	1f
> +	adr	r2, 9998f
> +	add	r2, r2, \skip_num, lsl #2
> +	pop	{r3-r10}

you probably don't want to do more loads from memory than required

> +	bx	r2
> +1:
> +	adr	r2, 9999f
> +	sub	r3, \skip_num, #8
> +	add	r2, r2, r3, lsl #2
> +	pop	{r3-r10}

same

> +	bx	r2
> +
> +9998:
> +	mcr	p14, 0, r10, c0, c15, \Op2
> +	mcr	p14, 0, r9, c0, c14, \Op2
> +	mcr	p14, 0, r8, c0, c13, \Op2
> +	mcr	p14, 0, r7, c0, c12, \Op2
> +	mcr	p14, 0, r6, c0, c11, \Op2
> +	mcr	p14, 0, r5, c0, c10, \Op2
> +	mcr	p14, 0, r4, c0, c9, \Op2
> +	mcr	p14, 0, r3, c0, c8, \Op2
> +
> +	pop	{r3-r10}
> +9999:
> +	mcr	p14, 0, r10, c0, c7, \Op2
> +	mcr	p14, 0, r9, c0, c6, \Op2
> +	mcr	p14, 0, r8, c0, c5, \Op2
> +	mcr	p14, 0, r7, c0, c4, \Op2
> +	mcr	p14, 0, r6, c0, c3, \Op2
> +	mcr	p14, 0, r5, c0, c2, \Op2
> +	mcr	p14, 0, r4, c0, c1, \Op2
> +	mcr	p14, 0, r3, c0, c0, \Op2
> +.endm
> +
> +/* Assume r11/r12 in used, clobbers r2-r3 */
> +.macro cp14_read_and_str Op2 cp14_reg0 skip_num
> +	adr	r3, 1f
> +	add	r3, r3, \skip_num, lsl #3
> +	bx	r3
> +1:
> +	mrc	p14, 0, r2, c0, c15, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
> +	mrc	p14, 0, r2, c0, c14, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
> +	mrc	p14, 0, r2, c0, c13, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
> +	mrc	p14, 0, r2, c0, c12, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
> +	mrc	p14, 0, r2, c0, c11, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
> +	mrc	p14, 0, r2, c0, c10, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
> +	mrc	p14, 0, r2, c0, c9, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
> +	mrc	p14, 0, r2, c0, c8, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
> +	mrc	p14, 0, r2, c0, c7, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
> +	mrc	p14, 0, r2, c0, c6, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
> +	mrc	p14, 0, r2, c0, c5, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
> +	mrc	p14, 0, r2, c0, c4, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
> +	mrc	p14, 0, r2, c0, c3, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
> +	mrc	p14, 0, r2, c0, c2, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
> +	mrc	p14, 0, r2, c0, c1, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
> +	mrc	p14, 0, r2, c0, c0, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
> +.endm
> +
> +/* Assume r11/r12 in used, clobbers r2-r3 */
> +.macro cp14_ldr_and_write Op2 cp14_reg0 skip_num
> +	adr	r3, 1f
> +	add	r3, r3, \skip_num, lsl #3
> +	bx	r3
> +1:
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
> +	mcr	p14, 0, r2, c0, c15, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
> +	mcr	p14, 0, r2, c0, c14, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
> +	mcr	p14, 0, r2, c0, c13, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
> +	mcr	p14, 0, r2, c0, c12, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
> +	mcr	p14, 0, r2, c0, c11, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
> +	mcr	p14, 0, r2, c0, c10, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
> +	mcr	p14, 0, r2, c0, c9, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
> +	mcr	p14, 0, r2, c0, c8, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
> +	mcr	p14, 0, r2, c0, c7, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
> +	mcr	p14, 0, r2, c0, c6, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
> +	mcr	p14, 0, r2, c0, c5, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
> +	mcr	p14, 0, r2, c0, c4, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
> +	mcr	p14, 0, r2, c0, c3, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
> +	mcr	p14, 0, r2, c0, c2, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
> +	mcr	p14, 0, r2, c0, c1, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
> +	mcr	p14, 0, r2, c0, c0, \Op2
> +.endm

can you not find some way of unifying cp14_pop_and_write with
cp14_ldr_and_write and cp14_read_and_push with cp14_read_and_str ?

Probably having two separate structs for the VFP state on the vcpu struct
for both the guest and the host state is one possible way of doing so.

> +
> +/* Get extract number of BRPs and WRPs. Saved in r11/r12 */
> +.macro read_hw_dbg_num
> +	mrc	p14, 0, r2, c0, c0, 0
> +	ubfx	r11, r2, #24, #4
> +	add	r11, r11, #1		// Extract BRPs
> +	ubfx	r12, r2, #28, #4
> +	add	r12, r12, #1		// Extract WRPs
> +	mov	r2, #16
> +	sub	r11, r2, r11		// How many BPs to skip
> +	sub	r12, r2, r12		// How many WPs to skip
> +.endm
> +
> +/* Reads cp14 registers from hardware.

You have a lot of multi-line comments in these patches which don't start
with a separate '/*' line, as dictated by the Linux kernel coding style.
So far, I've ignored this, but please fix all these throughout the
series when you respin.

> + * Writes cp14 registers in-order to the stack.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro save_host_debug_regs
> +	read_hw_dbg_num
> +	cp14_read_and_push #4, r11	@ DBGBVR
> +	cp14_read_and_push #5, r11	@ DBGBCR
> +	cp14_read_and_push #6, r12	@ DBGWVR
> +	cp14_read_and_push #7, r12	@ DBGWCR
> +.endm
> +
> +/* Reads cp14 registers from hardware.
> + * Writes cp14 registers in-order to the VCPU struct pointed to by vcpup.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro save_guest_debug_regs
> +	read_hw_dbg_num
> +	cp14_read_and_str #4, cp14_DBGBVR0, r11

why do you need the has before the op2 field?

> +	cp14_read_and_str #5, cp14_DBGBCR0, r11
> +	cp14_read_and_str #6, cp14_DBGWVR0, r12
> +	cp14_read_and_str #7, cp14_DBGWCR0, r12
> +.endm
> +
> +/* Reads cp14 registers in-order from the stack.
> + * Writes cp14 registers to hardware.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro restore_host_debug_regs
> +	read_hw_dbg_num
> +	cp14_pop_and_write #4, r11	@ DBGBVR
> +	cp14_pop_and_write #5, r11	@ DBGBCR
> +	cp14_pop_and_write #6, r12	@ DBGWVR
> +	cp14_pop_and_write #7, r12	@ DBGWCR
> +.endm
> +
> +/* Reads cp14 registers in-order from the VCPU struct pointed to by vcpup
> + * Writes cp14 registers to hardware.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro restore_guest_debug_regs
> +	read_hw_dbg_num
> +	cp14_ldr_and_write #4, cp14_DBGBVR0, r11
> +	cp14_ldr_and_write #5, cp14_DBGBCR0, r11
> +	cp14_ldr_and_write #6, cp14_DBGWVR0, r12
> +	cp14_ldr_and_write #7, cp14_DBGWCR0, r12
> +.endm
> +
>  /*
>   * Save the VGIC CPU state into memory
>   *
> @@ -684,3 +913,19 @@ ARM_BE8(rev	r6, r6  )
>  .macro load_vcpu
>  	mrc	p15, 4, vcpu, c13, c0, 2	@ HTPIDR
>  .endm
> +
> +__save_host_debug_regs:
> +	save_host_debug_regs
> +	bx	lr
> +
> +__save_guest_debug_regs:
> +	save_guest_debug_regs
> +	bx	lr
> +
> +__restore_host_debug_regs:
> +	restore_host_debug_regs
> +	bx	lr
> +
> +__restore_guest_debug_regs:
> +	restore_guest_debug_regs
> +	bx	lr
> -- 
> 1.7.12.4
> 

Thanks,
-Christoffer

WARNING: multiple messages have this Message-ID (diff)
From: christoffer.dall@linaro.org (Christoffer Dall)
To: linux-arm-kernel@lists.infradead.org
Subject: [PATCH v3 09/11] KVM: arm: implement lazy world switch for debug registers
Date: Tue, 30 Jun 2015 15:15:22 +0200	[thread overview]
Message-ID: <20150630131522.GQ11332@cbox> (raw)
In-Reply-To: <1434969694-7432-10-git-send-email-zhichao.huang@linaro.org>

On Mon, Jun 22, 2015 at 06:41:32PM +0800, Zhichao Huang wrote:
> Implement switching of the debug registers. While the number
> of registers is massive, CPUs usually don't implement them all
> (A15 has 6 breakpoints and 4 watchpoints, which gives us a total
> of 22 registers "only").
> 
> Notice that, for ARMv7, if the CONFIG_HAVE_HW_BREAKPOINT is set in
> the guest, debug is always actively in use (ARM_DSCR_MDBGEN set).
> 
> We have to do the save/restore dance in this case, because the host
> and the guest might use their respective debug registers at any moment.

this sounds expensive, and I suggested an alternative approach in the
previsou patch.  In any case, measuring the impact on this on hardware
would be a great idea...

> 
> If the CONFIG_HAVE_HW_BREAKPOINT is not set, and if no one flagged
> the debug registers as dirty, we only save/resotre DBGDSCR.

restore

> 
> Signed-off-by: Zhichao Huang <zhichao.huang@linaro.org>
> ---
>  arch/arm/kvm/interrupts.S      |  16 +++
>  arch/arm/kvm/interrupts_head.S | 249 ++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 263 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
> index 79caf79..d626275 100644
> --- a/arch/arm/kvm/interrupts.S
> +++ b/arch/arm/kvm/interrupts.S
> @@ -116,6 +116,12 @@ ENTRY(__kvm_vcpu_run)
>  	read_cp15_state store_to_vcpu = 0
>  	write_cp15_state read_from_vcpu = 1
>  
> +	@ Store hardware CP14 state and load guest state
> +	compute_debug_state 1f
> +	bl __save_host_debug_regs
> +	bl __restore_guest_debug_regs
> +
> +1:
>  	@ If the host kernel has not been configured with VFPv3 support,
>  	@ then it is safer if we deny guests from using it as well.
>  #ifdef CONFIG_VFPv3
> @@ -201,6 +207,16 @@ after_vfp_restore:
>  	mrc	p15, 0, r2, c0, c0, 5
>  	mcr	p15, 4, r2, c0, c0, 5
>  
> +	@ Store guest CP14 state and restore host state
> +	skip_debug_state 1f
> +	bl __save_guest_debug_regs
> +	bl __restore_host_debug_regs
> +	/* Clear the dirty flag for the next run, as all the state has
> +	 * already been saved. Note that we nuke the whole 32bit word.
> +	 * If we ever add more flags, we'll have to be more careful...
> +	 */
> +	clear_debug_dirty_bit
> +1:
>  	@ Store guest CP15 state and restore host state
>  	read_cp15_state store_to_vcpu = 1
>  	write_cp15_state read_from_vcpu = 0
> diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
> index 5662c39..ed406be 100644
> --- a/arch/arm/kvm/interrupts_head.S
> +++ b/arch/arm/kvm/interrupts_head.S
> @@ -7,6 +7,7 @@
>  #define VCPU_USR_SP		(VCPU_USR_REG(13))
>  #define VCPU_USR_LR		(VCPU_USR_REG(14))
>  #define CP15_OFFSET(_cp15_reg_idx) (VCPU_CP15 + (_cp15_reg_idx * 4))
> +#define CP14_OFFSET(_cp14_reg_idx) (VCPU_CP14 + ((_cp14_reg_idx) * 4))
>  
>  /*
>   * Many of these macros need to access the VCPU structure, which is always
> @@ -168,8 +169,7 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>   * Clobbers *all* registers.
>   */
>  .macro restore_guest_regs
> -	/* reset DBGDSCR to disable debug mode */
> -	mov	r2, #0
> +	ldr	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
>  	mcr	p14, 0, r2, c0, c2, 2
>  
>  	restore_guest_regs_mode svc, #VCPU_SVC_REGS
> @@ -250,6 +250,10 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>  	save_guest_regs_mode abt, #VCPU_ABT_REGS
>  	save_guest_regs_mode und, #VCPU_UND_REGS
>  	save_guest_regs_mode irq, #VCPU_IRQ_REGS
> +
> +	/* DBGDSCR reg */
> +	mrc	p14, 0, r2, c0, c1, 0
> +	str	r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
>  .endm
>  
>  /* Reads cp15 registers from hardware and stores them in memory
> @@ -449,6 +453,231 @@ vcpu	.req	r0		@ vcpu pointer always in r0
>  	str	r5, [vcpu, #VCPU_DEBUG_FLAGS]
>  .endm
>  
> +/* Assume r11/r12 in used, clobbers r2-r10 */
> +.macro cp14_read_and_push Op2 skip_num
> +	cmp	\skip_num, #8
> +	// if (skip_num >= 8) then skip c8-c15 directly
> +	bge	1f
> +	adr	r2, 9998f
> +	add	r2, r2, \skip_num, lsl #2
> +	bx	r2
> +1:
> +	adr	r2, 9999f
> +	sub	r3, \skip_num, #8
> +	add	r2, r2, r3, lsl #2
> +	bx	r2
> +9998:
> +	mrc	p14, 0, r10, c0, c15, \Op2
> +	mrc	p14, 0, r9, c0, c14, \Op2
> +	mrc	p14, 0, r8, c0, c13, \Op2
> +	mrc	p14, 0, r7, c0, c12, \Op2
> +	mrc	p14, 0, r6, c0, c11, \Op2
> +	mrc	p14, 0, r5, c0, c10, \Op2
> +	mrc	p14, 0, r4, c0, c9, \Op2
> +	mrc	p14, 0, r3, c0, c8, \Op2
> +	push	{r3-r10}

you probably don't want to do more stores to memory than required

> +9999:
> +	mrc	p14, 0, r10, c0, c7, \Op2
> +	mrc	p14, 0, r9, c0, c6, \Op2
> +	mrc	p14, 0, r8, c0, c5, \Op2
> +	mrc	p14, 0, r7, c0, c4, \Op2
> +	mrc	p14, 0, r6, c0, c3, \Op2
> +	mrc	p14, 0, r5, c0, c2, \Op2
> +	mrc	p14, 0, r4, c0, c1, \Op2
> +	mrc	p14, 0, r3, c0, c0, \Op2
> +	push	{r3-r10}

same

> +.endm
> +
> +/* Assume r11/r12 in used, clobbers r2-r10 */
> +.macro cp14_pop_and_write Op2 skip_num
> +	cmp	\skip_num, #8
> +	// if (skip_num >= 8) then skip c8-c15 directly
> +	bge	1f
> +	adr	r2, 9998f
> +	add	r2, r2, \skip_num, lsl #2
> +	pop	{r3-r10}

you probably don't want to do more loads from memory than required

> +	bx	r2
> +1:
> +	adr	r2, 9999f
> +	sub	r3, \skip_num, #8
> +	add	r2, r2, r3, lsl #2
> +	pop	{r3-r10}

same

> +	bx	r2
> +
> +9998:
> +	mcr	p14, 0, r10, c0, c15, \Op2
> +	mcr	p14, 0, r9, c0, c14, \Op2
> +	mcr	p14, 0, r8, c0, c13, \Op2
> +	mcr	p14, 0, r7, c0, c12, \Op2
> +	mcr	p14, 0, r6, c0, c11, \Op2
> +	mcr	p14, 0, r5, c0, c10, \Op2
> +	mcr	p14, 0, r4, c0, c9, \Op2
> +	mcr	p14, 0, r3, c0, c8, \Op2
> +
> +	pop	{r3-r10}
> +9999:
> +	mcr	p14, 0, r10, c0, c7, \Op2
> +	mcr	p14, 0, r9, c0, c6, \Op2
> +	mcr	p14, 0, r8, c0, c5, \Op2
> +	mcr	p14, 0, r7, c0, c4, \Op2
> +	mcr	p14, 0, r6, c0, c3, \Op2
> +	mcr	p14, 0, r5, c0, c2, \Op2
> +	mcr	p14, 0, r4, c0, c1, \Op2
> +	mcr	p14, 0, r3, c0, c0, \Op2
> +.endm
> +
> +/* Assume r11/r12 in used, clobbers r2-r3 */
> +.macro cp14_read_and_str Op2 cp14_reg0 skip_num
> +	adr	r3, 1f
> +	add	r3, r3, \skip_num, lsl #3
> +	bx	r3
> +1:
> +	mrc	p14, 0, r2, c0, c15, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
> +	mrc	p14, 0, r2, c0, c14, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
> +	mrc	p14, 0, r2, c0, c13, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
> +	mrc	p14, 0, r2, c0, c12, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
> +	mrc	p14, 0, r2, c0, c11, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
> +	mrc	p14, 0, r2, c0, c10, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
> +	mrc	p14, 0, r2, c0, c9, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
> +	mrc	p14, 0, r2, c0, c8, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
> +	mrc	p14, 0, r2, c0, c7, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
> +	mrc	p14, 0, r2, c0, c6, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
> +	mrc	p14, 0, r2, c0, c5, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
> +	mrc	p14, 0, r2, c0, c4, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
> +	mrc	p14, 0, r2, c0, c3, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
> +	mrc	p14, 0, r2, c0, c2, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
> +	mrc	p14, 0, r2, c0, c1, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
> +	mrc	p14, 0, r2, c0, c0, \Op2
> +	str     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
> +.endm
> +
> +/* Assume r11/r12 in used, clobbers r2-r3 */
> +.macro cp14_ldr_and_write Op2 cp14_reg0 skip_num
> +	adr	r3, 1f
> +	add	r3, r3, \skip_num, lsl #3
> +	bx	r3
> +1:
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
> +	mcr	p14, 0, r2, c0, c15, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
> +	mcr	p14, 0, r2, c0, c14, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
> +	mcr	p14, 0, r2, c0, c13, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
> +	mcr	p14, 0, r2, c0, c12, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
> +	mcr	p14, 0, r2, c0, c11, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
> +	mcr	p14, 0, r2, c0, c10, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
> +	mcr	p14, 0, r2, c0, c9, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
> +	mcr	p14, 0, r2, c0, c8, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
> +	mcr	p14, 0, r2, c0, c7, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
> +	mcr	p14, 0, r2, c0, c6, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
> +	mcr	p14, 0, r2, c0, c5, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
> +	mcr	p14, 0, r2, c0, c4, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
> +	mcr	p14, 0, r2, c0, c3, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
> +	mcr	p14, 0, r2, c0, c2, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
> +	mcr	p14, 0, r2, c0, c1, \Op2
> +	ldr     r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
> +	mcr	p14, 0, r2, c0, c0, \Op2
> +.endm

can you not find some way of unifying cp14_pop_and_write with
cp14_ldr_and_write and cp14_read_and_push with cp14_read_and_str ?

Probably having two separate structs for the VFP state on the vcpu struct
for both the guest and the host state is one possible way of doing so.

> +
> +/* Get extract number of BRPs and WRPs. Saved in r11/r12 */
> +.macro read_hw_dbg_num
> +	mrc	p14, 0, r2, c0, c0, 0
> +	ubfx	r11, r2, #24, #4
> +	add	r11, r11, #1		// Extract BRPs
> +	ubfx	r12, r2, #28, #4
> +	add	r12, r12, #1		// Extract WRPs
> +	mov	r2, #16
> +	sub	r11, r2, r11		// How many BPs to skip
> +	sub	r12, r2, r12		// How many WPs to skip
> +.endm
> +
> +/* Reads cp14 registers from hardware.

You have a lot of multi-line comments in these patches which don't start
with a separate '/*' line, as dictated by the Linux kernel coding style.
So far, I've ignored this, but please fix all these throughout the
series when you respin.

> + * Writes cp14 registers in-order to the stack.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro save_host_debug_regs
> +	read_hw_dbg_num
> +	cp14_read_and_push #4, r11	@ DBGBVR
> +	cp14_read_and_push #5, r11	@ DBGBCR
> +	cp14_read_and_push #6, r12	@ DBGWVR
> +	cp14_read_and_push #7, r12	@ DBGWCR
> +.endm
> +
> +/* Reads cp14 registers from hardware.
> + * Writes cp14 registers in-order to the VCPU struct pointed to by vcpup.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro save_guest_debug_regs
> +	read_hw_dbg_num
> +	cp14_read_and_str #4, cp14_DBGBVR0, r11

why do you need the has before the op2 field?

> +	cp14_read_and_str #5, cp14_DBGBCR0, r11
> +	cp14_read_and_str #6, cp14_DBGWVR0, r12
> +	cp14_read_and_str #7, cp14_DBGWCR0, r12
> +.endm
> +
> +/* Reads cp14 registers in-order from the stack.
> + * Writes cp14 registers to hardware.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro restore_host_debug_regs
> +	read_hw_dbg_num
> +	cp14_pop_and_write #4, r11	@ DBGBVR
> +	cp14_pop_and_write #5, r11	@ DBGBCR
> +	cp14_pop_and_write #6, r12	@ DBGWVR
> +	cp14_pop_and_write #7, r12	@ DBGWCR
> +.endm
> +
> +/* Reads cp14 registers in-order from the VCPU struct pointed to by vcpup
> + * Writes cp14 registers to hardware.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro restore_guest_debug_regs
> +	read_hw_dbg_num
> +	cp14_ldr_and_write #4, cp14_DBGBVR0, r11
> +	cp14_ldr_and_write #5, cp14_DBGBCR0, r11
> +	cp14_ldr_and_write #6, cp14_DBGWVR0, r12
> +	cp14_ldr_and_write #7, cp14_DBGWCR0, r12
> +.endm
> +
>  /*
>   * Save the VGIC CPU state into memory
>   *
> @@ -684,3 +913,19 @@ ARM_BE8(rev	r6, r6  )
>  .macro load_vcpu
>  	mrc	p15, 4, vcpu, c13, c0, 2	@ HTPIDR
>  .endm
> +
> +__save_host_debug_regs:
> +	save_host_debug_regs
> +	bx	lr
> +
> +__save_guest_debug_regs:
> +	save_guest_debug_regs
> +	bx	lr
> +
> +__restore_host_debug_regs:
> +	restore_host_debug_regs
> +	bx	lr
> +
> +__restore_guest_debug_regs:
> +	restore_guest_debug_regs
> +	bx	lr
> -- 
> 1.7.12.4
> 

Thanks,
-Christoffer

  reply	other threads:[~2015-06-30 13:15 UTC|newest]

Thread overview: 82+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-06-22 10:41 [PATCH v3 00/11] KVM: arm: debug infrastructure support Zhichao Huang
2015-06-22 10:41 ` Zhichao Huang
2015-06-22 10:41 ` [PATCH v3 01/11] KVM: arm: plug guest debug exploit Zhichao Huang
2015-06-22 10:41   ` Zhichao Huang
2015-06-22 10:41   ` Zhichao Huang
2015-06-29 15:49   ` Christoffer Dall
2015-06-29 15:49     ` Christoffer Dall
2015-07-01  7:04     ` zichao
2015-07-01  7:04       ` zichao
2015-07-01  9:00       ` Christoffer Dall
2015-07-01  9:00         ` Christoffer Dall
2015-07-01  9:00         ` Christoffer Dall
2015-06-22 10:41 ` [PATCH v3 02/11] KVM: arm: rename pm_fake handler to trap_raz_wi Zhichao Huang
2015-06-22 10:41   ` Zhichao Huang
2015-06-30 13:20   ` Christoffer Dall
2015-06-30 13:20     ` Christoffer Dall
2015-06-22 10:41 ` [PATCH v3 03/11] KVM: arm: enable to use the ARM_DSCR_MDBGEN macro from KVM assembly code Zhichao Huang
2015-06-22 10:41   ` Zhichao Huang
2015-06-30 13:20   ` Christoffer Dall
2015-06-30 13:20     ` Christoffer Dall
2015-06-22 10:41 ` [PATCH v3 04/11] KVM: arm: common infrastructure for handling AArch32 CP14/CP15 Zhichao Huang
2015-06-22 10:41   ` Zhichao Huang
2015-06-29 19:43   ` Christoffer Dall
2015-06-29 19:43     ` Christoffer Dall
2015-07-01  7:09     ` zichao
2015-07-01  7:09       ` zichao
2015-07-01  9:00       ` Christoffer Dall
2015-07-01  9:00         ` Christoffer Dall
2015-06-22 10:41 ` [PATCH v3 05/11] KVM: arm: check ordering of all system register tables Zhichao Huang
2015-06-22 10:41   ` Zhichao Huang
2015-06-30 13:20   ` Christoffer Dall
2015-06-30 13:20     ` Christoffer Dall
2015-06-22 10:41 ` [PATCH v3 06/11] KVM: arm: add trap handlers for 32-bit debug registers Zhichao Huang
2015-06-22 10:41   ` Zhichao Huang
2015-06-29 21:16   ` Christoffer Dall
2015-06-29 21:16     ` Christoffer Dall
2015-07-01  7:14     ` zichao
2015-07-01  7:14       ` zichao
2015-06-22 10:41 ` [PATCH v3 07/11] KVM: arm: add trap handlers for 64-bit " Zhichao Huang
2015-06-22 10:41   ` Zhichao Huang
2015-06-30 13:20   ` Christoffer Dall
2015-06-30 13:20     ` Christoffer Dall
2015-07-01  7:43     ` Zhichao Huang
2015-07-01  7:43       ` Zhichao Huang
2015-06-22 10:41 ` [PATCH v3 08/11] KVM: arm: implement dirty bit mechanism for " Zhichao Huang
2015-06-22 10:41   ` Zhichao Huang
2015-06-30  9:20   ` Christoffer Dall
2015-06-30  9:20     ` Christoffer Dall
2015-07-03  9:54     ` Zhichao Huang
2015-07-03  9:54       ` Zhichao Huang
2015-07-03 11:56       ` Christoffer Dall
2015-07-03 11:56         ` Christoffer Dall
2015-07-07 10:06         ` Zhichao Huang
2015-07-07 10:06           ` Zhichao Huang
2015-07-07 10:24           ` Will Deacon
2015-07-07 10:24             ` Will Deacon
2015-07-08 10:50             ` Zhichao Huang
2015-07-08 10:50               ` Zhichao Huang
2015-07-08 17:08               ` Will Deacon
2015-07-08 17:08                 ` Will Deacon
2015-07-09 12:54                 ` Zhichao Huang
2015-07-09 12:54                   ` Zhichao Huang
2015-07-09 11:50             ` Christoffer Dall
2015-07-09 11:50               ` Christoffer Dall
2015-07-13 12:12               ` zichao
2015-07-13 12:12                 ` zichao
2015-06-22 10:41 ` [PATCH v3 09/11] KVM: arm: implement lazy world switch " Zhichao Huang
2015-06-22 10:41   ` Zhichao Huang
2015-06-30 13:15   ` Christoffer Dall [this message]
2015-06-30 13:15     ` Christoffer Dall
2015-07-03 10:06     ` Zhichao Huang
2015-07-03 10:06       ` Zhichao Huang
2015-07-03 21:05       ` Christoffer Dall
2015-07-03 21:05         ` Christoffer Dall
2015-06-22 10:41 ` [PATCH v3 10/11] KVM: arm: add a trace event for cp14 traps Zhichao Huang
2015-06-22 10:41   ` Zhichao Huang
2015-06-30 13:20   ` Christoffer Dall
2015-06-30 13:20     ` Christoffer Dall
2015-06-22 10:41 ` [PATCH v3 11/11] KVM: arm: enable trapping of all debug registers Zhichao Huang
2015-06-22 10:41   ` Zhichao Huang
2015-06-30 13:19   ` Christoffer Dall
2015-06-30 13:19     ` Christoffer Dall

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20150630131522.GQ11332@cbox \
    --to=christoffer.dall@linaro.org \
    --cc=alex.bennee@linaro.org \
    --cc=huangzhichao@huawei.com \
    --cc=kvm@vger.kernel.org \
    --cc=kvmarm@lists.cs.columbia.edu \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=marc.zyngier@arm.com \
    --cc=will.deacon@arm.com \
    --cc=zhichao.huang@linaro.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.