LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: Oops while running fs_racer test on a POWER6 box against latest git
From: Michael Neuling @ 2010-07-02  1:36 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Latchesar Ionkov, Jens Axboe, LKML, linuxppc-dev, Ron Minnich,
	Christoph Hellwig, divya
In-Reply-To: <20100701105907.GK22976@laptop>

In message <20100701105907.GK22976@laptop> you wrote:
> On Thu, Jul 01, 2010 at 03:04:54PM +1000, Michael Neuling wrote:
> > > While running fs_racer test from LTP on a POWER6 box against latest git(2.6.35-rc3-git4 - commitid 984bc9601f64fd)
> > > came across the following warning followed by multiple oops.
> > > 
> > > ------------[ cut here ]------------
> > > 
> > > Badness at kernel/mutex-debug.c:64
> > > NIP: c0000000000be9e8 LR: c0000000000be9cc CTR: 0000000000000000
> > > REGS: c00000010be8f6f0 TRAP: 0700   Not tainted  (2.6.35-rc3-git4-autotest)
> > > MSR: 8000000000029032<EE,ME,CE,IR,DR>    CR: 24224422  XER: 00000012
> > > TASK = c00000010727cf00[8211] 'fs_racer_file_c' THREAD: c00000010be8bb50 CPU: 2
> > > GPR00: 0000000000000000 c00000010be8f970 c000000000d3d798 0000000000000001
> > > GPR04: c00000010be8fa70 c00000010be8c000 c00000010727d9f8 0000000000000000
> > > GPR08: c0000000043042f0 c0000000016534e8 000000000000017a c000000000c29a1c
> > > GPR12: 0000000028228424 c00000000f600500 c00000010be8fc40 0000000020000000
> > > GPR16: fffffffffffff000 c000000109c73000 c00000010be8fc30 0000000000010442
> > > GPR20: 0000000000000000 0000000000000000 00000000000001b6 c00000010dd12250
> > > GPR24: c00000000017c08c c00000010727cf00 c00000010dd12278 c00000010dd12210
> > > GPR28: 0000000000000001 c00000010be8c000 c000000000ca2008 c00000010be8fa70
> > > NIP [c0000000000be9e8] .mutex_remove_waiter+0xa4/0x130
> > > LR [c0000000000be9cc] .mutex_remove_waiter+0x88/0x130
> > > Call Trace:
> > > [c00000010be8f970] [c00000010be8fa00] 0xc00000010be8fa00 (unreliable)
> > > [c00000010be8fa00] [c00000000064a9f0] .mutex_lock_nested+0x384/0x430
> > > Instruction dump:
> > > e81f0010 e93d0000 7fa04800 41fe0028 482e96e5 60000000 2fa30000 419e0018
> > > e93e8008 80090000 2f800000 409e0008<0fe00000>   e93e8000 80090000 2f800000
> > > Unable to handle kernel paging request for unknown fault
> > > Faulting instruction address: 0xc00000000008d0f4
> > > Oops: Kernel access of bad area, sig: 7 [#1]
> > > SMP NR_CPUS=1024 NUMA
> > > Unrecoverable FP Unavailable Exception 800 at c000000000648ed4
> > > pSeries
> > > last sysfs file: /sys/devices/system/cpu/cpu19/cache/index1/shared_cpu_map
> > > Modules linked in: ipv6 fuse loop dm_mod sr_mod cdrom ibmveth sg
> > > sd_mod crc_t10dif ibmvscsic scsi_transport_srp scsi_tgt scsi_mod
> > > NIP: c00000000008d0f4 LR: c00000000008d0d0 CTR: 0000000000000000
> > > REGS: c00000010978f900 TRAP: 0600   Tainted: G        W    (2.6.35-rc3-git4-autotest)
> > > MSR: 8000000000009032
> > > Unrecoverable FP Unavailable Exception 800 at c000000000648ed4
> > > Unrecoverable FP Unavailable Exception 800 at c000000000648ed4
> > > Unrecoverable FP Unavailable Exception 800 at c000000000648ed4
> > > Unrecoverable FP Unavailable Exception 800 at c000000000648ed4
> > > Unrecoverable FP Unavailable Exception 800 at c000000000648ed4
> > > EE,ME,IR,DR>    CR: 24022442  XER: 00000012
> > > DAR: c000000000648f54, DSISR: 0000000040010000
> > > TASK = c0000001096e4900[7353] 'fs_racer_file_s' THREAD: c00000010978c000 CPU: 10
> > > GPR00: 0000000000004000 c00000010978fb80 c000000000d3d798 0000000000000001
> > > GPR04: c00000000083539e c000000001610228 0000000000000000 c0000000054c6880
> > > GPR08: 00000000000006a5 c000000000648f54 0000000000000007 00000000049b0000
> > > GPR12: 0000000000000000 c00000000f601900 00000000ffffffff ffffffffffffffff
> > > GPR16: 000000004b7dc520 0000000000000000 0000000000000000 c00000010978fea0
> > > GPR20: 00000fffcca7e7a0 00000fffcca7e7a0 00000fffabf7dfd0 00000fffabf7dfd0
> > > GPR24: 0000000000000000 0000000001200011 c000000000e1c0a8 c000000000648ed4
> > > GPR28: 0000000000000000 c0000001096e4900 c000000000ca0458 c00000010725d400
> > > NIP [c00000000008d0f4] .copy_process+0x310/0xf40
> > > LR [c00000000008d0d0] .copy_process+0x2ec/0xf40
> > > Call Trace:
> > > [c00000010978fb80] [c00000000008d0d0] .copy_process+0x2ec/0xf40 (unreliable)
> > > [c00000010978fc80] [c00000000008deb4] .do_fork+0x190/0x3cc
> > > [c00000010978fdc0] [c000000000011ef4] .sys_clone+0x58/0x70
> > > [c00000010978fe30] [c0000000000087f0] .ppc_clone+0x8/0xc
> > > Instruction dump:
> > > 419e0010 7fe3fb78 480774cd 60000000 801f0014 e93f0008 7800b842 39290080
> > > 78004800 60000042 901f0014 38004000<7d6048a8>   7d6b0078 7d6049ad 40c2fff4
> > > 
> > > Kernel version 2.6.34-rc3-git3 works fine.
> > 
> > Should this read 2.6.35-rc3-git3?
> > 
> > If so, there's only about 20 commits in:
> > 5904b3b81d2516..984bc9601f64fd
> > 
> > The likely fs related candidates are from Christoph and Nick Piggin
> > (added to CC)
> > 
> > No commits relating to POWER6 or PPC.
> 
> Not sure what's happening here. The first warning looks like some mutex
> corruption, but it doesn't have a stack trace (these are 2 separate
> dumps, right? ie. the copy_process stack doesn't relate to the mutex
> warning?) So I don't have much idea.
> 
> If it is reproducible, can you try getting a better stack trace, or
> better yet, even bisecting if there is just a small window?

I can't reproduce the bug here on POWER6 or POWER7.

Divya, can you bisect this?

Mikey

^ permalink raw reply

* Re: CONFIG_NO_HZ causing poor console responsiveness
From: Timur Tabi @ 2010-07-01 21:55 UTC (permalink / raw)
  To: Linuxppc-dev Development; +Cc: Mike Galbraith
In-Reply-To: <AANLkTilMzfwgYvoFhxhcVQVGV-EkMLVHI2TeQ29SYFCH@mail.gmail.com>

On Tue, Jun 29, 2010 at 2:54 PM, Timur Tabi <timur@freescale.com> wrote:
> I'm adding support for a new e500-based board (the P1022DS), and in
> the process I've discovered that enabling CONFIG_NO_HZ (Tickless
> System / Dynamic Ticks) causes significant responsiveness problems on
> the serial console.  When I type on the console, I see delays of up to
> a half-second for almost every character.  It acts as if there's a
> background process eating all the CPU.

I finally finished my git-bisect, and it wasn't that helpful.  I had
to skip several commits because the kernel just wouldn't boot:

There are only 'skip'ped commits left to test.
The first bad commit could be any of:
6bc6cf2b61336ed0c55a615eb4c0c8ed5daf3f08
8b911acdf08477c059d1c36c21113ab1696c612b
21406928afe43f1db6acab4931bb8c886f4d04ce
5ca9880c6f4ba4c84b517bc2fed5366adf63d191
a64692a3afd85fe048551ab89142fd5ca99a0dbd
f2e74eeac03ffb779d64b66a643c5e598145a28b
c6ee36c423c3ed1fb86bb3eabba9fc256a300d16
e12f31d3e5d36328c7fbd0fce40a95e70b59152c
13814d42e45dfbe845a0bbe5184565d9236896ae
b42e0c41a422a212ddea0666d5a3a0e3c35206db
39c0cbe2150cbd848a25ba6cdb271d1ad46818ad
beac4c7e4a1cc6d57801f690e5e82fa2c9c245c8
41acab8851a0408c1d5ad6c21a07456f88b54d40
6427462bfa50f50dc6c088c07037264fcc73eca1
c9494727cf293ae2ec66af57547a3e79c724fec2
We cannot bisect more!

These correspond to a batch of scheduler patches, most from Mike Galbraith.

I don't know what to do now.  I can't test any of these commits.  Even
if I could, they look like they're all part of one set, so I doubt I
could narrow it down to one commit anyway.

-- 
Timur Tabi
Linux kernel developer at Freescale

^ permalink raw reply

* Re: machine check in kernel for a mpc870 board
From: Scott Wood @ 2010-07-01 20:25 UTC (permalink / raw)
  To: Shawn Jin; +Cc: ppcdev
In-Reply-To: <AANLkTikuI49rzT6j9glIB-Vmp6kZSTVT5ri3Hb-JHgKO@mail.gmail.com>

On 07/01/2010 03:17 PM, Shawn Jin wrote:
>>>> How do I find the address, reg, and range for nodes like localbus,
>>>> soc, eth0, cpm, serial etc.?
>>
>> If your CCSRBAR is 0xfa200000, then pretty much anywhere you see 0xff0xxxxx
>> change it to 0xfa2xxxxx.
>
> I'm not sure about the range settings of 0xfe000000. How do you get this?
>
>         localbus@fa200100 {
>                 compatible = "fsl,mpc885-localbus", "fsl,pq1-localbus",
>                              "simple-bus";
>                 #address-cells =<2>;
>                 #size-cells =<1>;
>                 reg =<0xfa200100 0x40>;
>
>                 ranges =<
>                         0 0 0xfe000000 0x01000000    // I'm not sure about this?
>                 >;
>         };

Change 0xfe000000 to wherever u-boot maps your flash, and 0x01000000 to 
whatever the size of the flash localbus mapping is.

Or more generally update this section to hold whatever is connected to 
the localbus on your board.  The first cell is the chipselect.

>> Make sure that you've got Linux platform code enabled that matches the
>> top-level compatible of your device tree.  Try enabling PPC_EARLY_DEBUG_CPM,
>> making sure to update PPC_EARLY_DEBUG_CPM_ADDR to 0xfa202008.
>
> I enabled this early debug feature but don't know this address change.

The address change is for the different IMMR base, only this use is too 
early/hacky to get it from the device tree.

-Scott

^ permalink raw reply

* Re: machine check in kernel for a mpc870 board
From: Shawn Jin @ 2010-07-01 20:17 UTC (permalink / raw)
  To: Scott Wood; +Cc: ppcdev
In-Reply-To: <4C2CD395.90409@freescale.com>

>>> How do I find the address, reg, and range for nodes like localbus,
>>> soc, eth0, cpm, serial etc.?
>
> If your CCSRBAR is 0xfa200000, then pretty much anywhere you see 0xff0xxxxx
> change it to 0xfa2xxxxx.

I'm not sure about the range settings of 0xfe000000. How do you get this?

       localbus@fa200100 {
               compatible = "fsl,mpc885-localbus", "fsl,pq1-localbus",
                            "simple-bus";
               #address-cells = <2>;
               #size-cells = <1>;
               reg = <0xfa200100 0x40>;

               ranges = <
                       0 0 0xfe000000 0x01000000    // I'm not sure about this?
               >;
       };


>>     Linux/PowerPC load: root=/dev/ram
>>     Finalizing device tree... flat tree at 0x59e300
>>
>> The gdb showed deadbeef.
>>     (gdb) target remote ppcbdi:2001
>>     Remote debugging using ppcbdi:2001
>>     0xdeadbeef in ?? ()
>>     (gdb)
>>
>> The kernel doesn't seem to start. What could go wrong here?
>
> Pretty much anything. :-)

I realized that. :-P The kernel booting was able to stop at
start_kernel(). I'm going to trace further.

> Make sure that you've got Linux platform code enabled that matches the
> top-level compatible of your device tree.  Try enabling PPC_EARLY_DEBUG_CPM,
> making sure to update PPC_EARLY_DEBUG_CPM_ADDR to 0xfa202008.

I enabled this early debug feature but don't know this address change.
I'll try it later.

Thanks a lot, Scott.

-Shawn.

^ permalink raw reply

* Re: Oops while running fs_racer test on a POWER6 box against latest git
From: Maciej Rutecki @ 2010-07-01 18:25 UTC (permalink / raw)
  To: divya; +Cc: Latchesar Ionkov, linuxppc-dev, LKML, Ron Minnich
In-Reply-To: <4C2B28F3.7000006@linux.vnet.ibm.com>

On środa, 30 czerwca 2010 o 13:22:27 divya wrote:
> While running fs_racer test from LTP on a POWER6 box against latest
> git(2.6.35-rc3-git4 - commitid 984bc9601f64fd) came across the following
> warning followed by multiple oops.
> 

I created a Bugzilla entry at 
https://bugzilla.kernel.org/show_bug.cgi?id=16324
for your bug report, please add your address to the CC list in there, thanks!


-- 
Maciej Rutecki
http://www.maciek.unixy.pl

^ permalink raw reply

* RE: [PATCH v1]460EX on-chip SATA driver<resubmisison>
From: Rupjyoti Sarmah @ 2010-07-01 17:44 UTC (permalink / raw)
  To: Wolfgang Denk, Josh Boyer, Jeff Garzik
  Cc: linux-ide, sr, linux-kernel, linuxppc-dev
In-Reply-To: <20100630225516.8F9D3152442@gemini.denx.de>

Dear All,

The Synopsis design ware core is task file orientated so the driver would
still need CONFIG_ATA_SFF.
I would be fixing the Kconfig file to make it dependent on the
CONFIG_ATA_SFF.

Regards,
Rup



-----Original Message-----
From: Wolfgang Denk [mailto:wd@denx.de]
Sent: Thursday, July 01, 2010 4:25 AM
To: Josh Boyer
Cc: Jeff Garzik; linux-ide@vger.kernel.org; sr@denx.de; Rupjyoti Sarmah;
linux-kernel@vger.kernel.org; linuxppc-dev@ozlabs.org
Subject: Re: [PATCH v1]460EX on-chip SATA driver<resubmisison>

Dear Josh Boyer,

In message <20100630200325.GD7756@zod.rchland.ibm.com> you wrote:
>
> The driver doesn't depend on CONFIG_ATA_SFF in its Kconfig file, but seems to
> require it at build time.  Isn't that something that needs fixing in the
> driver?

Right.  Next question is if this is really needed for this driver.

Best regards,

Wolfgang Denk

-- 
DENX Software Engineering GmbH,     MD: Wolfgang Denk & Detlev Zundel
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: wd@denx.de
Copy from one, it's plagiarism; copy from two, it's research.

^ permalink raw reply

* Re: machine check in kernel for a mpc870 board
From: Scott Wood @ 2010-07-01 17:42 UTC (permalink / raw)
  To: Shawn Jin; +Cc: ppcdev
In-Reply-To: <AANLkTilZm-TRkjgYBc0xj9iFYUJl7bb19Zvrcj2R-wJd@mail.gmail.com>

On 07/01/2010 02:50 AM, Shawn Jin wrote:
> Hi Scott,
>
>> How do I find the address, reg, and range for nodes like localbus,
>> soc, eth0, cpm, serial etc.?

If your CCSRBAR is 0xfa200000, then pretty much anywhere you see 
0xff0xxxxx change it to 0xfa2xxxxx.

> I managed to proceed a little bit further.
>      Memory<-<0x0 0x8000000>  (128MB)
>      ENET0: local-mac-address<- 00:09:9b:01:58:64
>      CPU clock-frequency<- 0x7270e00 (120MHz)
>      CPU timebase-frequency<- 0x393870 (4MHz)
>      CPU bus-frequency<- 0x3938700 (60MHz)
>
>      zImage starting: loaded at 0x00400000 (sp: 0x07d1ccd0)
>      Allocating 0x186bdd bytes for kernel ...
>      gunzipping (0x00000000<- 0x0040c000:0x00591c30)...done 0x173b18 bytes
>
>      Linux/PowerPC load: root=/dev/ram
>      Finalizing device tree... flat tree at 0x59e300
>
> The gdb showed deadbeef.
>      (gdb) target remote ppcbdi:2001
>      Remote debugging using ppcbdi:2001
>      0xdeadbeef in ?? ()
>      (gdb)
>
> The kernel doesn't seem to start. What could go wrong here?

Pretty much anything. :-)

Make sure that you've got Linux platform code enabled that matches the 
top-level compatible of your device tree.  Try enabling 
PPC_EARLY_DEBUG_CPM, making sure to update PPC_EARLY_DEBUG_CPM_ADDR to 
0xfa202008.

-Scott

^ permalink raw reply

* Re: [PATCH] KVM: PPC: Book3S_32 MMU debug compile fixes
From: Alexander Graf @ 2010-07-01 16:40 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, KVM list
In-Reply-To: <1277507511-413-1-git-send-email-agraf@suse.de>

Alexander Graf wrote:
> Due to previous changes, the Book3S_32 guest MMU code didn't compile properly
> when enabling debugging.
>
> This patch repairs the broken code paths, making it possible to define DEBUG_MMU
> and friends again.
>
> Signed-off-by: Alexander Graf <agraf@suse.de>
>   

Please also don't forget this patch :)


Alex

^ permalink raw reply

* Re: [PATCH 0/2] Faster MMU lookups for Book3s v3
From: Marcelo Tosatti @ 2010-07-01 15:40 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, KVM list, kvm-ppc
In-Reply-To: <1277903926-12786-1-git-send-email-agraf@suse.de>

On Wed, Jun 30, 2010 at 03:18:44PM +0200, Alexander Graf wrote:
> Book3s suffered from my really bad shadow MMU implementation so far. So
> I finally got around to implement a combined hash and list mechanism that
> allows for much faster lookup of mapped pages.
> 
> To show that it really is faster, I tried to run simple process spawning
> code inside the guest with and without these patches:
> 
> [without]
> 
> debian-powerpc:~# time for i in {1..1000}; do /bin/echo hello > /dev/null; done
> 
> real    0m20.235s
> user    0m10.418s
> sys     0m9.766s
> 
> [with]
> 
> debian-powerpc:~# time for i in {1..1000}; do /bin/echo hello > /dev/null; done
> 
> real    0m14.659s
> user    0m8.967s
> sys     0m5.688s
> 
> So as you can see, performance improved significantly.
> 
> v2 -> v3:
> 
>   - use hlist
>   - use global slab cache
> 
> Alexander Graf (2):
>   KVM: PPC: Add generic hpte management functions
>   KVM: PPC: Make use of hash based Shadow MMU
> 
>  arch/powerpc/include/asm/kvm_book3s.h |    9 +
>  arch/powerpc/include/asm/kvm_host.h   |   17 ++-
>  arch/powerpc/kvm/Makefile             |    2 +
>  arch/powerpc/kvm/book3s.c             |   14 ++-
>  arch/powerpc/kvm/book3s_32_mmu_host.c |  104 ++-----------
>  arch/powerpc/kvm/book3s_64_mmu_host.c |   98 +-----------
>  arch/powerpc/kvm/book3s_mmu_hpte.c    |  277 +++++++++++++++++++++++++++++++++
>  7 files changed, 331 insertions(+), 190 deletions(-)
>  create mode 100644 arch/powerpc/kvm/book3s_mmu_hpte.c

Applied, thanks.

^ permalink raw reply

* Re: [PATCH 0/2] Faster MMU lookups for Book3s v3
From: Avi Kivity @ 2010-07-01 13:42 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, KVM list, kvm-ppc
In-Reply-To: <4C2C8FA8.1030702@suse.de>

On 07/01/2010 03:52 PM, Alexander Graf wrote:
>>
>>> Don't you use lazy spte updates?
>>>
>>>        
>> We do, but given enough time, the guest will touch its entire memory.
>>      
> Oh, so that's the major difference. On PPC we have the HTAB with a
> fraction of all the mapped pages in it. We don't have a notion of a full
> page table for a guest process. We always only have a snapshot of some
> mappings and shadow those lazily.
>
> So at worst, we have HPTEG_CACHE_NUM shadow pages mapped, which would be
> (1<<  15) * 4k which again would be at most 128MB of guest memory. We
> can't hold more mappings than that anyways, so chances are low we have a
> mapping for each hva.
>    

Doesn't that seriously impact performance?  A guest that recycles pages 
from its lru will touch pages at random from its entire address space.  
On bare metal that isn't a problem (I imagine) due to large tlbs.  But 
virtualized on 4K pages that means the htlb will be thrashed.

>>> But then again I probably do need an rmap for the mmu_notifier magic,
>>> right? But I'd rather prefer to have that code path be slow and the
>>> dirty bitmap invalidation fast than the other way around. Swapping is
>>> slow either way.
>>>
>>>        
>> It's not just swapping, it's also page ageing.  That needs to be
>> fast.  Does ppc have a hardware-set referenced bit?  If so, you need a
>> fast rmap for mmu notifiers.
>>      
> Page ageing is difficult. The HTAB has a hardware set referenced bit,
> but we don't have a guarantee that the entry is still there when we look
> for it. Something else could have overwritten it by then, but the entry
> could still be lingering around in the TLB.
>    

Whoever's dropping the HTAB needs to update the host struct page, and 
also reflect the bit into the guest's HTAB, no?

In fact, on x86 shadow, we don't have an spte for a gpte that is not 
accessed, precisely so we know the exact point in time when the accessed 
bit is set.

> So I think the only reasonable way to implement page ageing is to unmap
> pages. And that's slow, because it means we have to map them again on
> access. Bleks. Or we could look for the HTAB entry and only unmap them
> if the entry is moot.
>    

I think it works out if you update struct page when you clear out an HTAB.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply

* Re: [PATCH 0/2] Faster MMU lookups for Book3s v3
From: Alexander Graf @ 2010-07-01 12:52 UTC (permalink / raw)
  To: Avi Kivity; +Cc: linuxppc-dev, KVM list, kvm-ppc
In-Reply-To: <4C2C8D8A.7080103@redhat.com>

Avi Kivity wrote:
> On 07/01/2010 03:28 PM, Alexander Graf wrote:
>>
>>>
>>>>    Wouldn't it speed up dirty bitmap flushing
>>>> a lot if we'd just have a simple linked list of all sPTEs belonging to
>>>> that memslot?
>>>>
>>>>        
>>> The complexity is O(pages_in_slot) + O(sptes_for_slot).
>>>
>>> Usually, every page is mapped at least once, so sptes_for_slot
>>> dominates.  Even when it isn't so, iterating the rmap base pointers is
>>> very fast since they are linear in memory, while sptes are scattered
>>> around, causing cache misses.
>>>      
>> Why would pages be mapped often?
>
> It's not a question of how often they are mapped (shadow: very often;
> tdp: very rarely) but what percentage of pages are mapped.  It's
> usually 100%.
>
>> Don't you use lazy spte updates?
>>    
>
> We do, but given enough time, the guest will touch its entire memory.

Oh, so that's the major difference. On PPC we have the HTAB with a
fraction of all the mapped pages in it. We don't have a notion of a full
page table for a guest process. We always only have a snapshot of some
mappings and shadow those lazily.

So at worst, we have HPTEG_CACHE_NUM shadow pages mapped, which would be
(1 << 15) * 4k which again would be at most 128MB of guest memory. We
can't hold more mappings than that anyways, so chances are low we have a
mapping for each hva.

>
>
>>> Another consideration is that on x86, an spte occupies just 64 bits
>>> (for the hardware pte); if there are multiple sptes per page (rare on
>>> modern hardware), there is also extra memory for rmap chains;
>>> sometimes we also allocate 64 bits for the gfn.  Having an extra
>>> linked list would require more memory to be allocated and maintained.
>>>      
>> Hrm. I was thinking of not having an rmap but only using the chain. The
>> only slots that would require such a chain would be the ones with dirty
>> bitmapping enabled, so no penalty for normal RAM (unless you use kemari
>> or live migration of course).
>>    
>
> You could also only chain writeable ptes.

Very true. Probably even more useful :).

>
>> But then again I probably do need an rmap for the mmu_notifier magic,
>> right? But I'd rather prefer to have that code path be slow and the
>> dirty bitmap invalidation fast than the other way around. Swapping is
>> slow either way.
>>    
>
> It's not just swapping, it's also page ageing.  That needs to be
> fast.  Does ppc have a hardware-set referenced bit?  If so, you need a
> fast rmap for mmu notifiers.

Page ageing is difficult. The HTAB has a hardware set referenced bit,
but we don't have a guarantee that the entry is still there when we look
for it. Something else could have overwritten it by then, but the entry
could still be lingering around in the TLB.

So I think the only reasonable way to implement page ageing is to unmap
pages. And that's slow, because it means we have to map them again on
access. Bleks. Or we could look for the HTAB entry and only unmap them
if the entry is moot.


Alex

^ permalink raw reply

* Re: [PATCH 0/2] Faster MMU lookups for Book3s v3
From: Avi Kivity @ 2010-07-01 12:43 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, KVM list, kvm-ppc
In-Reply-To: <4C2C89D6.3090401@suse.de>

On 07/01/2010 03:28 PM, Alexander Graf wrote:
>
>>
>>>    Wouldn't it speed up dirty bitmap flushing
>>> a lot if we'd just have a simple linked list of all sPTEs belonging to
>>> that memslot?
>>>
>>>        
>> The complexity is O(pages_in_slot) + O(sptes_for_slot).
>>
>> Usually, every page is mapped at least once, so sptes_for_slot
>> dominates.  Even when it isn't so, iterating the rmap base pointers is
>> very fast since they are linear in memory, while sptes are scattered
>> around, causing cache misses.
>>      
> Why would pages be mapped often?

It's not a question of how often they are mapped (shadow: very often; 
tdp: very rarely) but what percentage of pages are mapped.  It's usually 
100%.

> Don't you use lazy spte updates?
>    

We do, but given enough time, the guest will touch its entire memory.


>> Another consideration is that on x86, an spte occupies just 64 bits
>> (for the hardware pte); if there are multiple sptes per page (rare on
>> modern hardware), there is also extra memory for rmap chains;
>> sometimes we also allocate 64 bits for the gfn.  Having an extra
>> linked list would require more memory to be allocated and maintained.
>>      
> Hrm. I was thinking of not having an rmap but only using the chain. The
> only slots that would require such a chain would be the ones with dirty
> bitmapping enabled, so no penalty for normal RAM (unless you use kemari
> or live migration of course).
>    

You could also only chain writeable ptes.

> But then again I probably do need an rmap for the mmu_notifier magic,
> right? But I'd rather prefer to have that code path be slow and the
> dirty bitmap invalidation fast than the other way around. Swapping is
> slow either way.
>    

It's not just swapping, it's also page ageing.  That needs to be fast.  
Does ppc have a hardware-set referenced bit?  If so, you need a fast 
rmap for mmu notifiers.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply

* Re: [PATCH 0/2] Faster MMU lookups for Book3s v3
From: Alexander Graf @ 2010-07-01 12:28 UTC (permalink / raw)
  To: Avi Kivity; +Cc: linuxppc-dev, KVM list, kvm-ppc
In-Reply-To: <4C2C78AC.3070605@redhat.com>

Avi Kivity wrote:
> On 07/01/2010 01:00 PM, Alexander Graf wrote:
>>
>> But doesn't that mean that you still need to loop through all the hvas
>> that you want to invalidate?
>
> It does.
>
>>   Wouldn't it speed up dirty bitmap flushing
>> a lot if we'd just have a simple linked list of all sPTEs belonging to
>> that memslot?
>>    
>
> The complexity is O(pages_in_slot) + O(sptes_for_slot).
>
> Usually, every page is mapped at least once, so sptes_for_slot
> dominates.  Even when it isn't so, iterating the rmap base pointers is
> very fast since they are linear in memory, while sptes are scattered
> around, causing cache misses.

Why would pages be mapped often? Don't you use lazy spte updates?

>
> Another consideration is that on x86, an spte occupies just 64 bits
> (for the hardware pte); if there are multiple sptes per page (rare on
> modern hardware), there is also extra memory for rmap chains;
> sometimes we also allocate 64 bits for the gfn.  Having an extra
> linked list would require more memory to be allocated and maintained.

Hrm. I was thinking of not having an rmap but only using the chain. The
only slots that would require such a chain would be the ones with dirty
bitmapping enabled, so no penalty for normal RAM (unless you use kemari
or live migration of course).

But then again I probably do need an rmap for the mmu_notifier magic,
right? But I'd rather prefer to have that code path be slow and the
dirty bitmap invalidation fast than the other way around. Swapping is
slow either way.


Alex

^ permalink raw reply

* Re: [PATCH 14/27] KVM: PPC: Magic Page BookE support
From: Alexander Graf @ 2010-07-01 12:25 UTC (permalink / raw)
  To: Josh Boyer; +Cc: linuxppc-dev, KVM list, kvm-ppc
In-Reply-To: <20100701111823.GE7756@zod.rchland.ibm.com>

Josh Boyer wrote:
> On Thu, Jul 01, 2010 at 12:42:49PM +0200, Alexander Graf wrote:
>   
>> As we now have Book3s support for the magic page, we also need BookE to
>> join in on the party.
>>
>> This patch implements generic magic page logic for BookE and specific
>> TLB logic for e500. I didn't have any 440 around, so I didn't dare to
>> blindly try and write up broken code.
>>     
>
> Is this the only patch in the series that needs 440 specific code?  Also,
> does 440 KVM still work after this series is applied even without the code
> not present in this patch?
>   

Yes, pretty much. The rest of the code is generic. But 440 should easily
just work with this patch set. If you have one to try it out, please
give it a try. I can even prepare a 440 enabling patch so you could
verify if it works.

Alex

^ permalink raw reply

* Re: [PATCH 14/27] KVM: PPC: Magic Page BookE support
From: Josh Boyer @ 2010-07-01 11:18 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, KVM list, kvm-ppc
In-Reply-To: <1277980982-12433-15-git-send-email-agraf@suse.de>

On Thu, Jul 01, 2010 at 12:42:49PM +0200, Alexander Graf wrote:
>As we now have Book3s support for the magic page, we also need BookE to
>join in on the party.
>
>This patch implements generic magic page logic for BookE and specific
>TLB logic for e500. I didn't have any 440 around, so I didn't dare to
>blindly try and write up broken code.

Is this the only patch in the series that needs 440 specific code?  Also,
does 440 KVM still work after this series is applied even without the code
not present in this patch?

josh

^ permalink raw reply

* Re: [PATCH 0/2] Faster MMU lookups for Book3s v3
From: Avi Kivity @ 2010-07-01 11:14 UTC (permalink / raw)
  To: Alexander Graf; +Cc: linuxppc-dev, KVM list, kvm-ppc
In-Reply-To: <4C2C6745.8040001@suse.de>

On 07/01/2010 01:00 PM, Alexander Graf wrote:
>
> But doesn't that mean that you still need to loop through all the hvas
> that you want to invalidate?

It does.

>   Wouldn't it speed up dirty bitmap flushing
> a lot if we'd just have a simple linked list of all sPTEs belonging to
> that memslot?
>    

The complexity is O(pages_in_slot) + O(sptes_for_slot).

Usually, every page is mapped at least once, so sptes_for_slot 
dominates.  Even when it isn't so, iterating the rmap base pointers is 
very fast since they are linear in memory, while sptes are scattered 
around, causing cache misses.

Another consideration is that on x86, an spte occupies just 64 bits (for 
the hardware pte); if there are multiple sptes per page (rare on modern 
hardware), there is also extra memory for rmap chains; sometimes we also 
allocate 64 bits for the gfn.  Having an extra linked list would require 
more memory to be allocated and maintained.

-- 
error compiling committee.c: too many arguments to function

^ permalink raw reply

* Re: Oops while running fs_racer test on a POWER6 box against latest git
From: Nick Piggin @ 2010-07-01 10:59 UTC (permalink / raw)
  To: Michael Neuling
  Cc: Latchesar Ionkov, Jens Axboe, LKML, linuxppc-dev, Ron Minnich,
	Christoph Hellwig, divya
In-Reply-To: <7381.1277960694@neuling.org>

On Thu, Jul 01, 2010 at 03:04:54PM +1000, Michael Neuling wrote:
> > While running fs_racer test from LTP on a POWER6 box against latest git(2.6.35-rc3-git4 - commitid 984bc9601f64fd)
> > came across the following warning followed by multiple oops.
> > 
> > ------------[ cut here ]------------
> > 
> > Badness at kernel/mutex-debug.c:64
> > NIP: c0000000000be9e8 LR: c0000000000be9cc CTR: 0000000000000000
> > REGS: c00000010be8f6f0 TRAP: 0700   Not tainted  (2.6.35-rc3-git4-autotest)
> > MSR: 8000000000029032<EE,ME,CE,IR,DR>    CR: 24224422  XER: 00000012
> > TASK = c00000010727cf00[8211] 'fs_racer_file_c' THREAD: c00000010be8bb50 CPU: 2
> > GPR00: 0000000000000000 c00000010be8f970 c000000000d3d798 0000000000000001
> > GPR04: c00000010be8fa70 c00000010be8c000 c00000010727d9f8 0000000000000000
> > GPR08: c0000000043042f0 c0000000016534e8 000000000000017a c000000000c29a1c
> > GPR12: 0000000028228424 c00000000f600500 c00000010be8fc40 0000000020000000
> > GPR16: fffffffffffff000 c000000109c73000 c00000010be8fc30 0000000000010442
> > GPR20: 0000000000000000 0000000000000000 00000000000001b6 c00000010dd12250
> > GPR24: c00000000017c08c c00000010727cf00 c00000010dd12278 c00000010dd12210
> > GPR28: 0000000000000001 c00000010be8c000 c000000000ca2008 c00000010be8fa70
> > NIP [c0000000000be9e8] .mutex_remove_waiter+0xa4/0x130
> > LR [c0000000000be9cc] .mutex_remove_waiter+0x88/0x130
> > Call Trace:
> > [c00000010be8f970] [c00000010be8fa00] 0xc00000010be8fa00 (unreliable)
> > [c00000010be8fa00] [c00000000064a9f0] .mutex_lock_nested+0x384/0x430
> > Instruction dump:
> > e81f0010 e93d0000 7fa04800 41fe0028 482e96e5 60000000 2fa30000 419e0018
> > e93e8008 80090000 2f800000 409e0008<0fe00000>   e93e8000 80090000 2f800000
> > Unable to handle kernel paging request for unknown fault
> > Faulting instruction address: 0xc00000000008d0f4
> > Oops: Kernel access of bad area, sig: 7 [#1]
> > SMP NR_CPUS=1024 NUMA
> > Unrecoverable FP Unavailable Exception 800 at c000000000648ed4
> > pSeries
> > last sysfs file: /sys/devices/system/cpu/cpu19/cache/index1/shared_cpu_map
> > Modules linked in: ipv6 fuse loop dm_mod sr_mod cdrom ibmveth sg
> > sd_mod crc_t10dif ibmvscsic scsi_transport_srp scsi_tgt scsi_mod
> > NIP: c00000000008d0f4 LR: c00000000008d0d0 CTR: 0000000000000000
> > REGS: c00000010978f900 TRAP: 0600   Tainted: G        W    (2.6.35-rc3-git4-autotest)
> > MSR: 8000000000009032
> > Unrecoverable FP Unavailable Exception 800 at c000000000648ed4
> > Unrecoverable FP Unavailable Exception 800 at c000000000648ed4
> > Unrecoverable FP Unavailable Exception 800 at c000000000648ed4
> > Unrecoverable FP Unavailable Exception 800 at c000000000648ed4
> > Unrecoverable FP Unavailable Exception 800 at c000000000648ed4
> > EE,ME,IR,DR>    CR: 24022442  XER: 00000012
> > DAR: c000000000648f54, DSISR: 0000000040010000
> > TASK = c0000001096e4900[7353] 'fs_racer_file_s' THREAD: c00000010978c000 CPU: 10
> > GPR00: 0000000000004000 c00000010978fb80 c000000000d3d798 0000000000000001
> > GPR04: c00000000083539e c000000001610228 0000000000000000 c0000000054c6880
> > GPR08: 00000000000006a5 c000000000648f54 0000000000000007 00000000049b0000
> > GPR12: 0000000000000000 c00000000f601900 00000000ffffffff ffffffffffffffff
> > GPR16: 000000004b7dc520 0000000000000000 0000000000000000 c00000010978fea0
> > GPR20: 00000fffcca7e7a0 00000fffcca7e7a0 00000fffabf7dfd0 00000fffabf7dfd0
> > GPR24: 0000000000000000 0000000001200011 c000000000e1c0a8 c000000000648ed4
> > GPR28: 0000000000000000 c0000001096e4900 c000000000ca0458 c00000010725d400
> > NIP [c00000000008d0f4] .copy_process+0x310/0xf40
> > LR [c00000000008d0d0] .copy_process+0x2ec/0xf40
> > Call Trace:
> > [c00000010978fb80] [c00000000008d0d0] .copy_process+0x2ec/0xf40 (unreliable)
> > [c00000010978fc80] [c00000000008deb4] .do_fork+0x190/0x3cc
> > [c00000010978fdc0] [c000000000011ef4] .sys_clone+0x58/0x70
> > [c00000010978fe30] [c0000000000087f0] .ppc_clone+0x8/0xc
> > Instruction dump:
> > 419e0010 7fe3fb78 480774cd 60000000 801f0014 e93f0008 7800b842 39290080
> > 78004800 60000042 901f0014 38004000<7d6048a8>   7d6b0078 7d6049ad 40c2fff4
> > 
> > Kernel version 2.6.34-rc3-git3 works fine.
> 
> Should this read 2.6.35-rc3-git3?
> 
> If so, there's only about 20 commits in:
> 5904b3b81d2516..984bc9601f64fd
> 
> The likely fs related candidates are from Christoph and Nick Piggin
> (added to CC)
> 
> No commits relating to POWER6 or PPC.

Not sure what's happening here. The first warning looks like some mutex
corruption, but it doesn't have a stack trace (these are 2 separate
dumps, right? ie. the copy_process stack doesn't relate to the mutex
warning?) So I don't have much idea.

If it is reproducible, can you try getting a better stack trace, or
better yet, even bisecting if there is just a small window?

Thanks,
Nick

^ permalink raw reply

* [PATCH 14/27] KVM: PPC: Magic Page BookE support
From: Alexander Graf @ 2010-07-01 10:42 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, KVM list
In-Reply-To: <1277980982-12433-1-git-send-email-agraf@suse.de>

As we now have Book3s support for the magic page, we also need BookE to
join in on the party.

This patch implements generic magic page logic for BookE and specific
TLB logic for e500. I didn't have any 440 around, so I didn't dare to
blindly try and write up broken code.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/booke.c    |   29 +++++++++++++++++++++++++++++
 arch/powerpc/kvm/e500_tlb.c |   19 +++++++++++++++++--
 2 files changed, 46 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 0f8ff9d..9609207 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -244,6 +244,31 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 		vcpu->arch.shared->int_pending = 0;
 }
 
+/* Check if a DTLB miss was on the magic page. Returns !0 if so. */
+int kvmppc_dtlb_magic_page(struct kvm_vcpu *vcpu, ulong eaddr)
+{
+	ulong mp_ea = vcpu->arch.magic_page_ea;
+	ulong gpaddr = vcpu->arch.magic_page_pa;
+	int gtlb_index = 11 | (1 << 16); /* Random number in TLB1 */
+
+	/* Check for existence of magic page */
+	if(likely(!mp_ea))
+		return 0;
+
+	/* Check if we're on the magic page */
+	if(likely((eaddr >> 12) != (mp_ea >> 12)))
+		return 0;
+
+	/* Don't map in user mode */
+	if(vcpu->arch.shared->msr & MSR_PR)
+		return 0;
+
+	kvmppc_mmu_map(vcpu, vcpu->arch.magic_page_ea, gpaddr, gtlb_index);
+	kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
+
+	return 1;
+}
+
 /**
  * kvmppc_handle_exit
  *
@@ -311,6 +336,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			r = RESUME_HOST;
 			break;
 		case EMULATE_FAIL:
+		case EMULATE_DO_MMIO:
 			/* XXX Deliver Program interrupt to guest. */
 			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
 			       __func__, vcpu->arch.pc, vcpu->arch.last_inst);
@@ -380,6 +406,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		gpa_t gpaddr;
 		gfn_t gfn;
 
+		if (kvmppc_dtlb_magic_page(vcpu, eaddr))
+			break;
+
 		/* Check the guest TLB. */
 		gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr);
 		if (gtlb_index < 0) {
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 66845a5..f5582ca 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -295,9 +295,22 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	struct page *new_page;
 	struct tlbe *stlbe;
 	hpa_t hpaddr;
+	u32 mas2 = gtlbe->mas2;
+	u32 mas3 = gtlbe->mas3;
 
 	stlbe = &vcpu_e500->shadow_tlb[tlbsel][esel];
 
+	if ((vcpu_e500->vcpu.arch.magic_page_ea) &&
+	    ((vcpu_e500->vcpu.arch.magic_page_pa >> PAGE_SHIFT) == gfn) &&
+	    !(vcpu_e500->vcpu.arch.shared->msr & MSR_PR)) {
+		mas2 = 0;
+		mas3 = E500_TLB_SUPER_PERM_MASK;
+		hpaddr = virt_to_phys(vcpu_e500->vcpu.arch.shared);
+		new_page = pfn_to_page(hpaddr >> PAGE_SHIFT);
+		get_page(new_page);
+		goto mapped;
+	}
+
 	/* Get reference to new page. */
 	new_page = gfn_to_page(vcpu_e500->vcpu.kvm, gfn);
 	if (is_error_page(new_page)) {
@@ -305,6 +318,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 		kvm_release_page_clean(new_page);
 		return;
 	}
+
+mapped:
 	hpaddr = page_to_phys(new_page);
 
 	/* Drop reference to old page. */
@@ -316,10 +331,10 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	stlbe->mas1 = MAS1_TSIZE(BOOK3E_PAGESZ_4K)
 		| MAS1_TID(get_tlb_tid(gtlbe)) | MAS1_TS | MAS1_VALID;
 	stlbe->mas2 = (gvaddr & MAS2_EPN)
-		| e500_shadow_mas2_attrib(gtlbe->mas2,
+		| e500_shadow_mas2_attrib(mas2,
 				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
 	stlbe->mas3 = (hpaddr & MAS3_RPN)
-		| e500_shadow_mas3_attrib(gtlbe->mas3,
+		| e500_shadow_mas3_attrib(mas3,
 				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
 	stlbe->mas7 = (hpaddr >> 32) & MAS7_RPN;
 
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 25/27] KVM: PPC: PV mtmsrd L=0 and mtmsr
From: Alexander Graf @ 2010-07-01 10:43 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, KVM list
In-Reply-To: <1277980982-12433-1-git-send-email-agraf@suse.de>

There is also a form of mtmsr where all bits need to be addressed. While the
PPC64 Linux kernel behaves reasonably well here, on PPC32 we do not have an
L=1 form. It does mtmsr even for simple things like only changing EE.

So we need to hook into that one as well and check for a mask of bits that we
deem safe to change from within guest context.

Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - use kvm_patch_ins_b
---
 arch/powerpc/kernel/kvm.c      |   51 ++++++++++++++++++++++++
 arch/powerpc/kernel/kvm_emul.S |   84 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 1e32298..2541736 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -62,7 +62,9 @@
 #define KVM_INST_MTSPR_DSISR	0x7c1203a6
 
 #define KVM_INST_TLBSYNC	0x7c00046c
+#define KVM_INST_MTMSRD_L0	0x7c000164
 #define KVM_INST_MTMSRD_L1	0x7c010164
+#define KVM_INST_MTMSR		0x7c000124
 
 static bool kvm_patching_worked = true;
 static char kvm_tmp[1024 * 1024];
@@ -166,6 +168,49 @@ static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
 	kvm_patch_ins_b(inst, distance_start);
 }
 
+extern u32 kvm_emulate_mtmsr_branch_offs;
+extern u32 kvm_emulate_mtmsr_reg1_offs;
+extern u32 kvm_emulate_mtmsr_reg2_offs;
+extern u32 kvm_emulate_mtmsr_reg3_offs;
+extern u32 kvm_emulate_mtmsr_orig_ins_offs;
+extern u32 kvm_emulate_mtmsr_len;
+extern u32 kvm_emulate_mtmsr[];
+
+static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_mtmsr_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsr_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_mtmsr, kvm_emulate_mtmsr_len * 4);
+	p[kvm_emulate_mtmsr_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	p[kvm_emulate_mtmsr_reg1_offs] |= rt;
+	p[kvm_emulate_mtmsr_reg2_offs] |= rt;
+	p[kvm_emulate_mtmsr_reg3_offs] |= rt;
+	p[kvm_emulate_mtmsr_orig_ins_offs] = *inst;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsr_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
 static void kvm_map_magic_page(void *data)
 {
 	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
@@ -246,6 +291,12 @@ static void kvm_check_ins(u32 *inst)
 		if (get_rt(inst_rt) < 30)
 			kvm_patch_ins_mtmsrd(inst, inst_rt);
 		break;
+	case KVM_INST_MTMSR:
+	case KVM_INST_MTMSRD_L0:
+		/* We use r30 and r31 during the hook */
+		if (get_rt(inst_rt) < 30)
+			kvm_patch_ins_mtmsr(inst, inst_rt);
+		break;
 	}
 
 	switch (_inst) {
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index 25e6683..ccf5a42 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -110,3 +110,87 @@ kvm_emulate_mtmsrd_reg_offs:
 .global kvm_emulate_mtmsrd_len
 kvm_emulate_mtmsrd_len:
 	.long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
+
+
+#define MSR_SAFE_BITS (MSR_EE | MSR_CE | MSR_ME | MSR_RI)
+#define MSR_CRITICAL_BITS ~MSR_SAFE_BITS
+
+.global kvm_emulate_mtmsr
+kvm_emulate_mtmsr:
+
+	SCRATCH_SAVE
+
+	/* Fetch old MSR in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Find the changed bits between old and new MSR */
+kvm_emulate_mtmsr_reg1:
+	xor	r31, r0, r31
+
+	/* Check if we need to really do mtmsr */
+	LOAD_REG_IMMEDIATE(r30, MSR_CRITICAL_BITS)
+	and.	r31, r31, r30
+
+	/* No critical bits changed? Maybe we can stay in the guest. */
+	beq	maybe_stay_in_guest
+
+do_mtmsr:
+
+	SCRATCH_RESTORE
+
+	/* Just fire off the mtmsr if it's critical */
+kvm_emulate_mtmsr_orig_ins:
+	mtmsr	r0
+
+	b	kvm_emulate_mtmsr_branch
+
+maybe_stay_in_guest:
+
+	/* Check if we have to fetch an interrupt */
+	lwz	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
+	cmpwi	r31, 0
+	beq+	no_mtmsr
+
+	/* Check if we may trigger an interrupt */
+kvm_emulate_mtmsr_reg2:
+	andi.	r31, r0, MSR_EE
+	beq	no_mtmsr
+
+	b	do_mtmsr
+
+no_mtmsr:
+
+	/* Put MSR into magic page because we don't call mtmsr */
+kvm_emulate_mtmsr_reg3:
+	STL64(r0, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_mtmsr_branch:
+	b	.
+kvm_emulate_mtmsr_end:
+
+.global kvm_emulate_mtmsr_branch_offs
+kvm_emulate_mtmsr_branch_offs:
+	.long (kvm_emulate_mtmsr_branch - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_reg1_offs
+kvm_emulate_mtmsr_reg1_offs:
+	.long (kvm_emulate_mtmsr_reg1 - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_reg2_offs
+kvm_emulate_mtmsr_reg2_offs:
+	.long (kvm_emulate_mtmsr_reg2 - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_reg3_offs
+kvm_emulate_mtmsr_reg3_offs:
+	.long (kvm_emulate_mtmsr_reg3 - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_orig_ins_offs
+kvm_emulate_mtmsr_orig_ins_offs:
+	.long (kvm_emulate_mtmsr_orig_ins - kvm_emulate_mtmsr) / 4
+
+.global kvm_emulate_mtmsr_len
+kvm_emulate_mtmsr_len:
+	.long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 19/27] KVM: PPC: PV instructions to loads and stores
From: Alexander Graf @ 2010-07-01 10:42 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, KVM list
In-Reply-To: <1277980982-12433-1-git-send-email-agraf@suse.de>

Some instructions can simply be replaced by load and store instructions to
or from the magic page.

This patch replaces often called instructions that fall into the above category.

Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - use kvm_patch_ins
---
 arch/powerpc/kernel/kvm.c |  111 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 111 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 1f328d5..7094ee4 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -32,6 +32,35 @@
 #define KVM_MAGIC_PAGE		(-4096L)
 #define magic_var(x) KVM_MAGIC_PAGE + offsetof(struct kvm_vcpu_arch_shared, x)
 
+#define KVM_INST_LWZ		0x80000000
+#define KVM_INST_STW		0x90000000
+#define KVM_INST_LD		0xe8000000
+#define KVM_INST_STD		0xf8000000
+#define KVM_INST_NOP		0x60000000
+#define KVM_INST_B		0x48000000
+#define KVM_INST_B_MASK		0x03ffffff
+#define KVM_INST_B_MAX		0x01ffffff
+
+#define KVM_MASK_RT		0x03e00000
+#define KVM_INST_MFMSR		0x7c0000a6
+#define KVM_INST_MFSPR_SPRG0	0x7c1042a6
+#define KVM_INST_MFSPR_SPRG1	0x7c1142a6
+#define KVM_INST_MFSPR_SPRG2	0x7c1242a6
+#define KVM_INST_MFSPR_SPRG3	0x7c1342a6
+#define KVM_INST_MFSPR_SRR0	0x7c1a02a6
+#define KVM_INST_MFSPR_SRR1	0x7c1b02a6
+#define KVM_INST_MFSPR_DAR	0x7c1302a6
+#define KVM_INST_MFSPR_DSISR	0x7c1202a6
+
+#define KVM_INST_MTSPR_SPRG0	0x7c1043a6
+#define KVM_INST_MTSPR_SPRG1	0x7c1143a6
+#define KVM_INST_MTSPR_SPRG2	0x7c1243a6
+#define KVM_INST_MTSPR_SPRG3	0x7c1343a6
+#define KVM_INST_MTSPR_SRR0	0x7c1a03a6
+#define KVM_INST_MTSPR_SRR1	0x7c1b03a6
+#define KVM_INST_MTSPR_DAR	0x7c1303a6
+#define KVM_INST_MTSPR_DSISR	0x7c1203a6
+
 static bool kvm_patching_worked = true;
 
 static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
@@ -40,6 +69,34 @@ static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
 	flush_icache_range((ulong)inst, (ulong)inst + 4);
 }
 
+static void kvm_patch_ins_ld(u32 *inst, long addr, u32 rt)
+{
+#ifdef CONFIG_64BIT
+	kvm_patch_ins(inst, KVM_INST_LD | rt | (addr & 0x0000fffc));
+#else
+	kvm_patch_ins(inst, KVM_INST_LWZ | rt | ((addr + 4) & 0x0000fffc));
+#endif
+}
+
+static void kvm_patch_ins_lwz(u32 *inst, long addr, u32 rt)
+{
+	kvm_patch_ins(inst, KVM_INST_LWZ | rt | (addr & 0x0000ffff));
+}
+
+static void kvm_patch_ins_std(u32 *inst, long addr, u32 rt)
+{
+#ifdef CONFIG_64BIT
+	kvm_patch_ins(inst, KVM_INST_STD | rt | (addr & 0x0000fffc));
+#else
+	kvm_patch_ins(inst, KVM_INST_STW | rt | ((addr + 4) & 0x0000fffc));
+#endif
+}
+
+static void kvm_patch_ins_stw(u32 *inst, long addr, u32 rt)
+{
+	kvm_patch_ins(inst, KVM_INST_STW | rt | (addr & 0x0000fffc));
+}
+
 static void kvm_map_magic_page(void *data)
 {
 	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
@@ -54,6 +111,60 @@ static void kvm_check_ins(u32 *inst)
 	u32 inst_rt = _inst & KVM_MASK_RT;
 
 	switch (inst_no_rt) {
+	/* Loads */
+	case KVM_INST_MFMSR:
+		kvm_patch_ins_ld(inst, magic_var(msr), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SPRG0:
+		kvm_patch_ins_ld(inst, magic_var(sprg0), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SPRG1:
+		kvm_patch_ins_ld(inst, magic_var(sprg1), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SPRG2:
+		kvm_patch_ins_ld(inst, magic_var(sprg2), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SPRG3:
+		kvm_patch_ins_ld(inst, magic_var(sprg3), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SRR0:
+		kvm_patch_ins_ld(inst, magic_var(srr0), inst_rt);
+		break;
+	case KVM_INST_MFSPR_SRR1:
+		kvm_patch_ins_ld(inst, magic_var(srr1), inst_rt);
+		break;
+	case KVM_INST_MFSPR_DAR:
+		kvm_patch_ins_ld(inst, magic_var(dar), inst_rt);
+		break;
+	case KVM_INST_MFSPR_DSISR:
+		kvm_patch_ins_lwz(inst, magic_var(dsisr), inst_rt);
+		break;
+
+	/* Stores */
+	case KVM_INST_MTSPR_SPRG0:
+		kvm_patch_ins_std(inst, magic_var(sprg0), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SPRG1:
+		kvm_patch_ins_std(inst, magic_var(sprg1), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SPRG2:
+		kvm_patch_ins_std(inst, magic_var(sprg2), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SPRG3:
+		kvm_patch_ins_std(inst, magic_var(sprg3), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SRR0:
+		kvm_patch_ins_std(inst, magic_var(srr0), inst_rt);
+		break;
+	case KVM_INST_MTSPR_SRR1:
+		kvm_patch_ins_std(inst, magic_var(srr1), inst_rt);
+		break;
+	case KVM_INST_MTSPR_DAR:
+		kvm_patch_ins_std(inst, magic_var(dar), inst_rt);
+		break;
+	case KVM_INST_MTSPR_DSISR:
+		kvm_patch_ins_stw(inst, magic_var(dsisr), inst_rt);
+		break;
 	}
 
 	switch (_inst) {
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 24/27] KVM: PPC: PV mtmsrd L=1
From: Alexander Graf @ 2010-07-01 10:42 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, KVM list
In-Reply-To: <1277980982-12433-1-git-send-email-agraf@suse.de>

The PowerPC ISA has a special instruction for mtmsr that only changes the EE
and RI bits, namely the L=1 form.

Since that one is reasonably often occurring and simple to implement, let's
go with this first. Writing EE=0 is always just a store. Doing EE=1 also
requires us to check for pending interrupts and if necessary exit back to the
hypervisor.

Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - use kvm_patch_ins_b
---
 arch/powerpc/kernel/kvm.c      |   45 ++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/kvm_emul.S |   56 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 337e3e5..1e32298 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -62,6 +62,7 @@
 #define KVM_INST_MTSPR_DSISR	0x7c1203a6
 
 #define KVM_INST_TLBSYNC	0x7c00046c
+#define KVM_INST_MTMSRD_L1	0x7c010164
 
 static bool kvm_patching_worked = true;
 static char kvm_tmp[1024 * 1024];
@@ -128,6 +129,43 @@ static u32 *kvm_alloc(int len)
 	return p;
 }
 
+extern u32 kvm_emulate_mtmsrd_branch_offs;
+extern u32 kvm_emulate_mtmsrd_reg_offs;
+extern u32 kvm_emulate_mtmsrd_len;
+extern u32 kvm_emulate_mtmsrd[];
+
+static void kvm_patch_ins_mtmsrd(u32 *inst, u32 rt)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_mtmsrd_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_mtmsrd_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_mtmsrd, kvm_emulate_mtmsrd_len * 4);
+	p[kvm_emulate_mtmsrd_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	p[kvm_emulate_mtmsrd_reg_offs] |= rt;
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_mtmsrd_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
 static void kvm_map_magic_page(void *data)
 {
 	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
@@ -201,6 +239,13 @@ static void kvm_check_ins(u32 *inst)
 	case KVM_INST_TLBSYNC:
 		kvm_patch_ins_nop(inst);
 		break;
+
+	/* Rewrites */
+	case KVM_INST_MTMSRD_L1:
+		/* We use r30 and r31 during the hook */
+		if (get_rt(inst_rt) < 30)
+			kvm_patch_ins_mtmsrd(inst, inst_rt);
+		break;
 	}
 
 	switch (_inst) {
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index 7da835a..25e6683 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -54,3 +54,59 @@
 	/* Disable critical section. We are critical if			\
 	   shared->critical == r1 and r2 is always != r1 */		\
 	STL64(r2, KVM_MAGIC_PAGE + KVM_MAGIC_CRITICAL, 0);
+
+.global kvm_emulate_mtmsrd
+kvm_emulate_mtmsrd:
+
+	SCRATCH_SAVE
+
+	/* Put MSR & ~(MSR_EE|MSR_RI) in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+	lis	r30, (~(MSR_EE | MSR_RI))@h
+	ori	r30, r30, (~(MSR_EE | MSR_RI))@l
+	and	r31, r31, r30
+
+	/* OR the register's (MSR_EE|MSR_RI) on MSR */
+kvm_emulate_mtmsrd_reg:
+	andi.	r30, r0, (MSR_EE|MSR_RI)
+	or	r31, r31, r30
+
+	/* Put MSR back into magic page */
+	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Check if we have to fetch an interrupt */
+	lwz	r31, (KVM_MAGIC_PAGE + KVM_MAGIC_INT)(0)
+	cmpwi	r31, 0
+	beq+	no_check
+
+	/* Check if we may trigger an interrupt */
+	andi.	r30, r30, MSR_EE
+	beq	no_check
+
+	SCRATCH_RESTORE
+
+	/* Nag hypervisor */
+	tlbsync
+
+	b	kvm_emulate_mtmsrd_branch
+
+no_check:
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_mtmsrd_branch:
+	b	.
+kvm_emulate_mtmsrd_end:
+
+.global kvm_emulate_mtmsrd_branch_offs
+kvm_emulate_mtmsrd_branch_offs:
+	.long (kvm_emulate_mtmsrd_branch - kvm_emulate_mtmsrd) / 4
+
+.global kvm_emulate_mtmsrd_reg_offs
+kvm_emulate_mtmsrd_reg_offs:
+	.long (kvm_emulate_mtmsrd_reg - kvm_emulate_mtmsrd) / 4
+
+.global kvm_emulate_mtmsrd_len
+kvm_emulate_mtmsrd_len:
+	.long (kvm_emulate_mtmsrd_end - kvm_emulate_mtmsrd) / 4
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 27/27] KVM: PPC: Add Documentation about PV interface
From: Alexander Graf @ 2010-07-01 10:43 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, KVM list
In-Reply-To: <1277980982-12433-1-git-send-email-agraf@suse.de>

We just introduced a new PV interface that screams for documentation. So here
it is - a shiny new and awesome text file describing the internal works of
the PPC KVM paravirtual interface.

Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - clarify guest implementation
  - clarify that privileged instructions still work
  - explain safe MSR bits
  - Fix dsisr patch description
  - change hypervisor calls to use new register values
---
 Documentation/kvm/ppc-pv.txt |  185 ++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 185 insertions(+), 0 deletions(-)
 create mode 100644 Documentation/kvm/ppc-pv.txt

diff --git a/Documentation/kvm/ppc-pv.txt b/Documentation/kvm/ppc-pv.txt
new file mode 100644
index 0000000..82de6c6
--- /dev/null
+++ b/Documentation/kvm/ppc-pv.txt
@@ -0,0 +1,185 @@
+The PPC KVM paravirtual interface
+=================================
+
+The basic execution principle by which KVM on PowerPC works is to run all kernel
+space code in PR=1 which is user space. This way we trap all privileged
+instructions and can emulate them accordingly.
+
+Unfortunately that is also the downfall. There are quite a few privileged
+instructions that needlessly return us to the hypervisor even though they
+could be handled differently.
+
+This is what the PPC PV interface helps with. It takes privileged instructions
+and transforms them into unprivileged ones with some help from the hypervisor.
+This cuts down virtualization costs by about 50% on some of my benchmarks.
+
+The code for that interface can be found in arch/powerpc/kernel/kvm*
+
+Querying for existence
+======================
+
+To find out if we're running on KVM or not, we overlay the PVR register. Usually
+the PVR register contains an id that identifies your CPU type. If, however, you
+pass KVM_PVR_PARA in the register that you want the PVR result in, the register
+still contains KVM_PVR_PARA after the mfpvr call.
+
+	LOAD_REG_IMM(r5, KVM_PVR_PARA)
+	mfpvr	r5
+	[r5 still contains KVM_PVR_PARA]
+
+Once determined to run under a PV capable KVM, you can now use hypercalls as
+described below.
+
+PPC hypercalls
+==============
+
+The only viable ways to reliably get from guest context to host context are:
+
+	1) Call an invalid instruction
+	2) Call the "sc" instruction with a parameter to "sc"
+	3) Call the "sc" instruction with parameters in GPRs
+
+Method 1 is always a bad idea. Invalid instructions can be replaced later on
+by valid instructions, rendering the interface broken.
+
+Method 2 also has downfalls. If the parameter to "sc" is != 0 the spec is
+rather unclear if the sc is targeted directly for the hypervisor or the
+supervisor. It would also require that we read the syscall issuing instruction
+every time a syscall is issued, slowing down guest syscalls.
+
+Method 3 is what KVM uses. We pass magic constants (KVM_SC_MAGIC_R0 and
+KVM_SC_MAGIC_R3) in r0 and r3 respectively. If a syscall instruction with these
+magic values arrives from the guest's kernel mode, we take the syscall as a
+hypercall.
+
+The parameters are as follows:
+
+	r0		KVM_SC_MAGIC_R0
+	r3		KVM_SC_MAGIC_R3		Return code
+	r4		Hypercall number
+	r5		First parameter
+	r6		Second parameter
+	r7		Third parameter
+	r8		Fourth parameter
+
+Hypercall definitions are shared in generic code, so the same hypercall numbers
+apply for x86 and powerpc alike.
+
+The magic page
+==============
+
+To enable communication between the hypervisor and guest there is a new shared
+page that contains parts of supervisor visible register state. The guest can
+map this shared page using the KVM hypercall KVM_HC_PPC_MAP_MAGIC_PAGE.
+
+With this hypercall issued the guest always gets the magic page mapped at the
+desired location in effective and physical address space. For now, we always
+map the page to -4096. This way we can access it using absolute load and store
+functions. The following instruction reads the first field of the magic page:
+
+	ld	rX, -4096(0)
+
+The interface is designed to be extensible should there be need later to add
+additional registers to the magic page. If you add fields to the magic page,
+also define a new hypercall feature to indicate that the host can give you more
+registers. Only if the host supports the additional features, make use of them.
+
+The magic page has the following layout as described in
+arch/powerpc/include/asm/kvm_para.h:
+
+struct kvm_vcpu_arch_shared {
+	__u64 scratch1;
+	__u64 scratch2;
+	__u64 scratch3;
+	__u64 critical;		/* Guest may not get interrupts if == r1 */
+	__u64 sprg0;
+	__u64 sprg1;
+	__u64 sprg2;
+	__u64 sprg3;
+	__u64 srr0;
+	__u64 srr1;
+	__u64 dar;
+	__u64 msr;
+	__u32 dsisr;
+	__u32 int_pending;	/* Tells the guest if we have an interrupt */
+};
+
+Additions to the page must only occur at the end. Struct fields are always 32
+bit aligned.
+
+MSR bits
+========
+
+The MSR contains bits that require hypervisor intervention and bits that do
+not require direct hypervisor intervention because they only get interpreted
+when entering the guest or don't have any impact on the hypervisor's behavior.
+
+The following bits are safe to be set inside the guest:
+
+  MSR_EE
+  MSR_RI
+  MSR_CE
+  MSR_ME
+
+If any other bit changes in the MSR, please still use mtmsr(d).
+
+Patched instructions
+====================
+
+The "ld" and "std" instructions are transformed to "lwz" and "stw" instructions
+respectively on 32 bit systems with an added offset of 4 to accommodate big
+endianness.
+
+The following is a list of mappings the Linux kernel performs when running as
+guest. Implementing any of those mappings is optional, as the instruction traps
+also act on the shared page. So calling privileged instructions still works as
+before.
+
+From			To
+====			==
+
+mfmsr	rX		ld	rX, magic_page->msr
+mfsprg	rX, 0		ld	rX, magic_page->sprg0
+mfsprg	rX, 1		ld	rX, magic_page->sprg1
+mfsprg	rX, 2		ld	rX, magic_page->sprg2
+mfsprg	rX, 3		ld	rX, magic_page->sprg3
+mfsrr0	rX		ld	rX, magic_page->srr0
+mfsrr1	rX		ld	rX, magic_page->srr1
+mfdar	rX		ld	rX, magic_page->dar
+mfdsisr	rX		lwz	rX, magic_page->dsisr
+
+mtmsr	rX		std	rX, magic_page->msr
+mtsprg	0, rX		std	rX, magic_page->sprg0
+mtsprg	1, rX		std	rX, magic_page->sprg1
+mtsprg	2, rX		std	rX, magic_page->sprg2
+mtsprg	3, rX		std	rX, magic_page->sprg3
+mtsrr0	rX		std	rX, magic_page->srr0
+mtsrr1	rX		std	rX, magic_page->srr1
+mtdar	rX		std	rX, magic_page->dar
+mtdsisr	rX		stw	rX, magic_page->dsisr
+
+tlbsync			nop
+
+mtmsrd	rX, 0		b	<special mtmsr section>
+mtmsr	rX		b	<special mtmsr section>
+
+mtmsrd	rX, 1		b	<special mtmsrd section>
+
+[BookE only]
+wrteei	[0|1]		b	<special wrteei section>
+
+
+Some instructions require more logic to determine what's going on than a load
+or store instruction can deliver. To enable patching of those, we keep some
+RAM around where we can live translate instructions to. What happens is the
+following:
+
+	1) copy emulation code to memory
+	2) patch that code to fit the emulated instruction
+	3) patch that code to return to the original pc + 4
+	4) patch the original instruction to branch to the new code
+
+That way we can inject an arbitrary amount of code as replacement for a single
+instruction. This allows us to check for pending interrupts when setting EE=1
+for example.
+
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 21/27] KVM: PPC: Introduce kvm_tmp framework
From: Alexander Graf @ 2010-07-01 10:42 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, KVM list
In-Reply-To: <1277980982-12433-1-git-send-email-agraf@suse.de>

We will soon require more sophisticated methods to replace single instructions
with multiple instructions. We do that by branching to a memory region where we
write replacement code for the instruction to.

This region needs to be within 32 MB of the patched instruction though, because
that's the furthest we can jump with immediate branches.

So we keep 1MB of free space around in bss. After we're done initing we can just
tell the mm system that the unused pages are free, but until then we have enough
space to fit all our code in.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kernel/kvm.c |   41 +++++++++++++++++++++++++++++++++++++++--
 1 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 3a49de5..75c9e0b 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -64,6 +64,8 @@
 #define KVM_INST_TLBSYNC	0x7c00046c
 
 static bool kvm_patching_worked = true;
+static char kvm_tmp[1024 * 1024];
+static int kvm_tmp_index;
 
 static inline void kvm_patch_ins(u32 *inst, u32 new_inst)
 {
@@ -104,6 +106,23 @@ static void kvm_patch_ins_nop(u32 *inst)
 	kvm_patch_ins(inst, KVM_INST_NOP);
 }
 
+static u32 *kvm_alloc(int len)
+{
+	u32 *p;
+
+	if ((kvm_tmp_index + len) > ARRAY_SIZE(kvm_tmp)) {
+		printk(KERN_ERR "KVM: No more space (%d + %d)\n",
+				kvm_tmp_index, len);
+		kvm_patching_worked = false;
+		return NULL;
+	}
+
+	p = (void*)&kvm_tmp[kvm_tmp_index];
+	kvm_tmp_index += len;
+
+	return p;
+}
+
 static void kvm_map_magic_page(void *data)
 {
 	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
@@ -201,12 +220,27 @@ static void kvm_use_magic_page(void)
 		kvm_check_ins(p);
 }
 
+static void kvm_free_tmp(void)
+{
+	unsigned long start, end;
+
+	start = (ulong)&kvm_tmp[kvm_tmp_index + (PAGE_SIZE - 1)] & PAGE_MASK;
+	end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK;
+
+	/* Free the tmp space we don't need */
+	for (; start < end; start += PAGE_SIZE) {
+		ClearPageReserved(virt_to_page(start));
+		init_page_count(virt_to_page(start));
+		free_page(start);
+		totalram_pages++;
+	}
+}
+
 static int __init kvm_guest_init(void)
 {
-	char *p;
 
 	if (!kvm_para_available())
-		return 0;
+		goto free_tmp;
 
 	if (kvm_para_has_feature(KVM_FEATURE_MAGIC_PAGE))
 		kvm_use_magic_page();
@@ -214,6 +248,9 @@ static int __init kvm_guest_init(void)
 	printk(KERN_INFO "KVM: Live patching for a fast VM %s\n",
 			 kvm_patching_worked ? "worked" : "failed");
 
+free_tmp:
+	kvm_free_tmp();
+
 	return 0;
 }
 
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 22/27] KVM: PPC: Introduce branch patching helper
From: Alexander Graf @ 2010-07-01 10:42 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, KVM list
In-Reply-To: <1277980982-12433-1-git-send-email-agraf@suse.de>

We will need to patch several instruction streams over to a different
code path, so we need a way to patch a single instruction with a branch
somewhere else.

This patch adds a helper to facilitate this patching.

Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kernel/kvm.c |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 75c9e0b..337e3e5 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -106,6 +106,11 @@ static void kvm_patch_ins_nop(u32 *inst)
 	kvm_patch_ins(inst, KVM_INST_NOP);
 }
 
+static void kvm_patch_ins_b(u32 *inst, int addr)
+{
+	kvm_patch_ins(inst, KVM_INST_B | (addr & KVM_INST_B_MASK));
+}
+
 static u32 *kvm_alloc(int len)
 {
 	u32 *p;
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 26/27] KVM: PPC: PV wrteei
From: Alexander Graf @ 2010-07-01 10:43 UTC (permalink / raw)
  To: kvm-ppc; +Cc: linuxppc-dev, KVM list
In-Reply-To: <1277980982-12433-1-git-send-email-agraf@suse.de>

On BookE the preferred way to write the EE bit is the wrteei instruction. It
already encodes the EE bit in the instruction.

So in order to get BookE some speedups as well, let's also PV'nize that
instruction.

Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - use kvm_patch_ins_b
---
 arch/powerpc/kernel/kvm.c      |   50 ++++++++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/kvm_emul.S |   41 ++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 2541736..995fadd 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -66,6 +66,9 @@
 #define KVM_INST_MTMSRD_L1	0x7c010164
 #define KVM_INST_MTMSR		0x7c000124
 
+#define KVM_INST_WRTEEI_0	0x7c000146
+#define KVM_INST_WRTEEI_1	0x7c008146
+
 static bool kvm_patching_worked = true;
 static char kvm_tmp[1024 * 1024];
 static int kvm_tmp_index;
@@ -211,6 +214,47 @@ static void kvm_patch_ins_mtmsr(u32 *inst, u32 rt)
 	kvm_patch_ins_b(inst, distance_start);
 }
 
+#ifdef CONFIG_BOOKE
+
+extern u32 kvm_emulate_wrteei_branch_offs;
+extern u32 kvm_emulate_wrteei_ee_offs;
+extern u32 kvm_emulate_wrteei_len;
+extern u32 kvm_emulate_wrteei[];
+
+static void kvm_patch_ins_wrteei(u32 *inst)
+{
+	u32 *p;
+	int distance_start;
+	int distance_end;
+	ulong next_inst;
+
+	p = kvm_alloc(kvm_emulate_wrteei_len * 4);
+	if (!p)
+		return;
+
+	/* Find out where we are and put everything there */
+	distance_start = (ulong)p - (ulong)inst;
+	next_inst = ((ulong)inst + 4);
+	distance_end = next_inst - (ulong)&p[kvm_emulate_wrteei_branch_offs];
+
+	/* Make sure we only write valid b instructions */
+	if (distance_start > KVM_INST_B_MAX) {
+		kvm_patching_worked = false;
+		return;
+	}
+
+	/* Modify the chunk to fit the invocation */
+	memcpy(p, kvm_emulate_wrteei, kvm_emulate_wrteei_len * 4);
+	p[kvm_emulate_wrteei_branch_offs] |= distance_end & KVM_INST_B_MASK;
+	p[kvm_emulate_wrteei_ee_offs] |= (*inst & MSR_EE);
+	flush_icache_range((ulong)p, (ulong)p + kvm_emulate_wrteei_len * 4);
+
+	/* Patch the invocation */
+	kvm_patch_ins_b(inst, distance_start);
+}
+
+#endif
+
 static void kvm_map_magic_page(void *data)
 {
 	kvm_hypercall2(KVM_HC_PPC_MAP_MAGIC_PAGE,
@@ -300,6 +344,12 @@ static void kvm_check_ins(u32 *inst)
 	}
 
 	switch (_inst) {
+#ifdef CONFIG_BOOKE
+	case KVM_INST_WRTEEI_0:
+	case KVM_INST_WRTEEI_1:
+		kvm_patch_ins_wrteei(inst);
+		break;
+#endif
 	}
 }
 
diff --git a/arch/powerpc/kernel/kvm_emul.S b/arch/powerpc/kernel/kvm_emul.S
index ccf5a42..b79b9de 100644
--- a/arch/powerpc/kernel/kvm_emul.S
+++ b/arch/powerpc/kernel/kvm_emul.S
@@ -194,3 +194,44 @@ kvm_emulate_mtmsr_orig_ins_offs:
 .global kvm_emulate_mtmsr_len
 kvm_emulate_mtmsr_len:
 	.long (kvm_emulate_mtmsr_end - kvm_emulate_mtmsr) / 4
+
+
+
+.global kvm_emulate_wrteei
+kvm_emulate_wrteei:
+
+	SCRATCH_SAVE
+
+	/* Fetch old MSR in r31 */
+	LL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	/* Remove MSR_EE from old MSR */
+	li	r30, 0
+	ori	r30, r30, MSR_EE
+	andc	r31, r31, r30
+
+	/* OR new MSR_EE onto the old MSR */
+kvm_emulate_wrteei_ee:
+	ori	r31, r31, 0
+
+	/* Write new MSR value back */
+	STL64(r31, KVM_MAGIC_PAGE + KVM_MAGIC_MSR, 0)
+
+	SCRATCH_RESTORE
+
+	/* Go back to caller */
+kvm_emulate_wrteei_branch:
+	b	.
+kvm_emulate_wrteei_end:
+
+.global kvm_emulate_wrteei_branch_offs
+kvm_emulate_wrteei_branch_offs:
+	.long (kvm_emulate_wrteei_branch - kvm_emulate_wrteei) / 4
+
+.global kvm_emulate_wrteei_ee_offs
+kvm_emulate_wrteei_ee_offs:
+	.long (kvm_emulate_wrteei_ee - kvm_emulate_wrteei) / 4
+
+.global kvm_emulate_wrteei_len
+kvm_emulate_wrteei_len:
+	.long (kvm_emulate_wrteei_end - kvm_emulate_wrteei) / 4
-- 
1.6.0.2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox