All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 01/30] powerpc/booke: Set CPU_FTR_DEBUG_LVL_EXC on 32-bit
From: Alexander Graf @ 2012-02-17 17:13 UTC (permalink / raw)
  To: kvm-ppc; +Cc: kvm, linuxppc-dev, Scott Wood
In-Reply-To: <1329498837-11717-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

Currently 32-bit only cares about this for choice of exception
vector, which is done in core-specific code.  However, KVM will
want to distinguish as well.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/cputable.h |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index ad55a1c..6a034a2 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -376,7 +376,8 @@ extern const char *powerpc_base_platform;
 #define CPU_FTRS_47X	(CPU_FTRS_440x6)
 #define CPU_FTRS_E200	(CPU_FTR_USE_TB | CPU_FTR_SPE_COMP | \
 	    CPU_FTR_NODSISRALIGN | CPU_FTR_COHERENT_ICACHE | \
-	    CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE)
+	    CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE | \
+	    CPU_FTR_DEBUG_LVL_EXC)
 #define CPU_FTRS_E500	(CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | \
 	    CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_NOEXECUTE)
@@ -385,7 +386,7 @@ extern const char *powerpc_base_platform;
 	    CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_E500MC	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
-	    CPU_FTR_DBELL)
+	    CPU_FTR_DBELL | CPU_FTR_DEBUG_LVL_EXC)
 #define CPU_FTRS_E5500	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
 	    CPU_FTR_DBELL | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-- 
1.6.0.2


^ permalink raw reply related

* [PATCH 00/30] KVM: PPC: e500mc support
From: Alexander Graf @ 2012-02-17 17:13 UTC (permalink / raw)
  To: kvm-ppc; +Cc: kvm, linuxppc-dev, Scott Wood

(resending with fixed email list - somehow I screwed that one up the
 first time)

This is Scott's e500mc RFC patch set rebased, berobbed of its pt_regs
parts and fixed for bisectability. On top of them, I addressed all the
comments that I had on the code and that came up in his code as FIXMEs.

I verified that this patch set works just fine on e500mc and doesn't
break e500v2, so I would say it's good to go as it is, unless someone
has strong objections to how things are done. Everything hereafter
I would prefer to do based on a working upstream version rather than
a downstream fork, as that way exposure is a lot higher.

Alex

Alexander Graf (15):
  KVM: PPC: e500mc: Add doorbell emulation support
  KVM: PPC: e500mc: implicitly set MSR_GS
  KVM: PPC: e500mc: Move r1/r2 restoration very early
  KVM: PPC: e500mc: add load inst fixup
  KVM: PPC: rename CONFIG_KVM_E500 -> CONFIG_KVM_E500V2
  KVM: PPC: make e500v2 and e500mc mutually exclusive
  KVM: PPC: booke: remove leftover debugging
  KVM: PPC: booke: deliver program int on emulation failure
  KVM: PPC: booke: call resched after every exit
  KVM: PPC: booke: BOOKE_IRQPRIO_MAX is n+1
  KVM: PPC: bookehv: fix exit timing
  KVM: PPC: bookehv: remove negation for CONFIG_64BIT
  KVM: PPC: bookehv: remove SET_VCPU
  KVM: PPC: bookehv: disable MAS register updates early
  KVM: PPC: bookehv: add comment about shadow_msr

Scott Wood (15):
  powerpc/booke: Set CPU_FTR_DEBUG_LVL_EXC on 32-bit
  powerpc/e500: split CPU_FTRS_ALWAYS/CPU_FTRS_POSSIBLE
  KVM: PPC: factor out lpid allocator from book3s_64_mmu_hv
  KVM: PPC: booke: add booke-level vcpu load/put
  KVM: PPC: booke: Move vm core init/destroy out of booke.c
  KVM: PPC: e500: rename e500_tlb.h to e500.h
  KVM: PPC: e500: merge <asm/kvm_e500.h> into arch/powerpc/kvm/e500.h
  KVM: PPC: e500: clean up arch/powerpc/kvm/e500.h
  KVM: PPC: e500: refactor core-specific TLB code
  KVM: PPC: e500: Track TLB1 entries with a bitmap
  KVM: PPC: e500: emulate tlbilx
  powerpc/booke: Provide exception macros with interrupt name
  KVM: PPC: booke: category E.HV (GS-mode) support
  KVM: PPC: booke: standard PPC floating point support
  KVM: PPC: e500mc support

 arch/powerpc/include/asm/cputable.h         |   21 +-
 arch/powerpc/include/asm/dbell.h            |    1 +
 arch/powerpc/include/asm/kvm.h              |    1 +
 arch/powerpc/include/asm/kvm_asm.h          |    8 +
 arch/powerpc/include/asm/kvm_book3s.h       |    3 +
 arch/powerpc/include/asm/kvm_booke.h        |    3 +
 arch/powerpc/include/asm/kvm_booke_hv_asm.h |   49 +++
 arch/powerpc/include/asm/kvm_e500.h         |   96 -----
 arch/powerpc/include/asm/kvm_host.h         |   22 +-
 arch/powerpc/include/asm/kvm_ppc.h          |    8 +
 arch/powerpc/include/asm/mmu-book3e.h       |    6 +
 arch/powerpc/include/asm/processor.h        |    3 +
 arch/powerpc/include/asm/reg.h              |    2 +
 arch/powerpc/include/asm/reg_booke.h        |   34 ++
 arch/powerpc/include/asm/system.h           |    1 +
 arch/powerpc/kernel/asm-offsets.c           |   15 +-
 arch/powerpc/kernel/cpu_setup_fsl_booke.S   |    1 +
 arch/powerpc/kernel/head_44x.S              |   23 +-
 arch/powerpc/kernel/head_booke.h            |   69 ++-
 arch/powerpc/kernel/head_fsl_booke.S        |   98 ++++-
 arch/powerpc/kvm/44x.c                      |   12 +
 arch/powerpc/kvm/Kconfig                    |   26 +-
 arch/powerpc/kvm/Makefile                   |   15 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c         |   26 +-
 arch/powerpc/kvm/booke.c                    |  379 ++++++++++++++---
 arch/powerpc/kvm/booke.h                    |   57 +++-
 arch/powerpc/kvm/booke_emulate.c            |   23 +-
 arch/powerpc/kvm/bookehv_interrupts.S       |  609 +++++++++++++++++++++++++++
 arch/powerpc/kvm/e500.c                     |  372 ++++++++++++++---
 arch/powerpc/kvm/e500.h                     |  302 +++++++++++++
 arch/powerpc/kvm/e500_emulate.c             |  110 +++++-
 arch/powerpc/kvm/e500_tlb.c                 |  588 +++++++++++---------------
 arch/powerpc/kvm/e500_tlb.h                 |  174 --------
 arch/powerpc/kvm/e500mc.c                   |  342 +++++++++++++++
 arch/powerpc/kvm/powerpc.c                  |   47 ++-
 arch/powerpc/kvm/timing.h                   |    6 +
 36 files changed, 2717 insertions(+), 835 deletions(-)
 create mode 100644 arch/powerpc/include/asm/kvm_booke_hv_asm.h
 delete mode 100644 arch/powerpc/include/asm/kvm_e500.h
 create mode 100644 arch/powerpc/kvm/bookehv_interrupts.S
 create mode 100644 arch/powerpc/kvm/e500.h
 delete mode 100644 arch/powerpc/kvm/e500_tlb.h
 create mode 100644 arch/powerpc/kvm/e500mc.c


^ permalink raw reply

* [U-Boot] [PATCH 2/4] arm: add %function attribute to assembly functions
From: Mike Frysinger @ 2012-02-17 17:13 UTC (permalink / raw)
  To: u-boot
In-Reply-To: <1329314253-4596-3-git-send-email-aneesh@ti.com>

On Wednesday 15 February 2012 08:57:31 Aneesh V wrote:
> This is done using the following directive preceding
> each function definition:
> 
> .type <func-name>, %function
> 
> This marks the symbol as a function in the object
> header which in turn helps the linker in some cases.
> 
> In particular this was found needed for resolving ARM/Thumb
> calls correctly in a build with Thumb interworking enabled.

ideally arm would use the new linkage.h header and then these would change to 
doing what Linux does:
ENTRY(foo)
	<asm>
ENDPROC(foo)
-mike
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 836 bytes
Desc: This is a digitally signed message part.
URL: <http://lists.denx.de/pipermail/u-boot/attachments/20120217/55f51e87/attachment.pgp>

^ permalink raw reply

* Re: [PATCH 4/8] arm: link a device tree blob into the xen image
From: Ian Campbell @ 2012-02-17 17:13 UTC (permalink / raw)
  To: David Vrabel; +Cc: xen-devel@lists.xensource.com
In-Reply-To: <1329139131-27554-5-git-send-email-david.vrabel@citrix.com>

On Mon, 2012-02-13 at 13:18 +0000, David Vrabel wrote:
> diff --git a/config/arm.mk b/config/arm.mk
> index f64f0c1..f20fd2d 100644
> --- a/config/arm.mk
> +++ b/config/arm.mk
> @@ -16,3 +16,9 @@ LDFLAGS_DIRECT_Linux = _linux
>  LDFLAGS_DIRECT += -marmelf$(LDFLAGS_DIRECT_$(XEN_OS))_eabi
>  
>  CONFIG_LOAD_ADDRESS ?= 0x80000000
> +
> +# XXX: When running on the model there is no bootloader to provide a
> +# device tree.  It must be linked into Xen.
> +ifndef CONFIG_DTB_FILE
> +$(error CONFIG_DTB_FILE must be set to the absolute filename of a
> DTB)
> +endif 

This turns out to be a little aggressive -- it also triggers when you
are building the tools. Not a big deal, but a bit annoying, is there
some way we can avoid this? Put it in xen/arch/arm/Foo perhaps?

Ian.

^ permalink raw reply

* Re: [BUG] EHCI build dependencies broken?
From: Russell King - ARM Linux @ 2012-02-17 17:12 UTC (permalink / raw)
  To: Alan Stern; +Cc: linux-omap, linux-usb
In-Reply-To: <Pine.LNX.4.44L0.1202171025550.1350-100000@iolanthe.rowland.org>

On Fri, Feb 17, 2012 at 10:30:33AM -0500, Alan Stern wrote:
> On Fri, 17 Feb 2012, Russell King - ARM Linux wrote:
> 
> > The latest randconfig build for OMAP4430SDP produced this build failure:
> > 
> > drivers/usb/host/ehci-hcd.c:1382:2: error: #error "missing bus glue for ehci-hcd"
> > 
> > This appears to be because the OMAP EHCI glue was not selected in the
> > configuration, but ehci-hcd was:
> > 
> > CONFIG_USB_ARCH_HAS_EHCI=y
> > CONFIG_USB_EHCI_HCD=y
> > CONFIG_USB_EHCI_ROOT_HUB_TT=y
> > # CONFIG_USB_EHCI_TT_NEWSCHED is not set
> > # CONFIG_USB_EHCI_HCD_OMAP is not set
> > # CONFIG_USB_EHCI_MV is not set
> > 
> > Looking at the Kconfig file, it seems that it's entirely possible to
> > select EHCI_HCD without any glue selected.  It's also possible to build
> > EHCI with two (or more) bus glues selected - eg, it's possible for
> > both these to be selected:
> > 
> > Kconfig:
> > config USB_EHCI_HCD_OMAP
> >         bool "EHCI support for OMAP3 and later chips"
> >         depends on USB_EHCI_HCD && ARCH_OMAP
> > 
> > config USB_EHCI_MV
> >         bool "EHCI support for Marvell on-chip controller"
> >         depends on USB_EHCI_HCD
> 
> I'd say the "depends" line for USB_EHCI_MV is broken.  It should allow 
> this option only on systems using the Marvell platform.
> 
> Of course, that doesn't solve the issue of not having any bus glue
> selected at all.  I don't know enough about Kbuild to fix this --
> suggestions are welcome.
> 
> The same problem exists with OHCI and UHCI too.

A better solution would be to sort out the OHCI/UHCI/EHCI drivers so that
they can support more than one bus glue at any one time.  I think that's
been discussed in the past, and is certainly something that's going to be
required for the 'single zImage' project on ARM.

I suspect that's something for later though.

^ permalink raw reply

* Re: [PATCH 2/9] blkcg: drop unnecessary RCU locking
From: Tejun Heo @ 2012-02-17 17:11 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: axboe, ctalbott, rni, linux-kernel
In-Reply-To: <20120217164749.GC26620@redhat.com>

On Fri, Feb 17, 2012 at 11:47:49AM -0500, Vivek Goyal wrote:
> So now in some cases we call blkg_lookup_create() with both queue and rcu
> read lock held (cfq_lookup_create_cfqg()) and in this case hold only queue
> lock.

So, this should be okay.  It's currently not because blkg_alloc() is
broken due to percpu allocation but other than that calling both w/
and w/o RCU read lock should be fine.

> blkg_lookup_create() calls blkg_lookup() which expects a rcu_read_lock()
> to be held and we will be travesing that list without rcu_read_lock()
> held. Isn't that a problem?

No, why would it be a problem?

> We might be examining a blkg belonging to a different queue and it
> might be being freed parallely.

How?

Thanks.

-- 
tejun

^ permalink raw reply

* 100元幫您查屋主電話,查到才計費0975-784602
From: w55688pohan @ 2012-02-17 17:10 UTC (permalink / raw)
  To: 1, 2, 3, httpwww.591.com.tw, 4, 3, 1, httpwww.591.com.tw, 2, 4





^ permalink raw reply

* Re: [RFC PATCH v0 1/2] net: bridge: propagate FDB table into hardware
From: John Fastabend @ 2012-02-17 17:10 UTC (permalink / raw)
  To: jhs
  Cc: jamal, Stephen Hemminger, bhutchings, roprabhu, netdev, mst,
	chrisw, davem, gregory.v.rose, kvm, sri
In-Reply-To: <1329488932.2272.19.camel@mojatatu>

On 2/17/2012 6:28 AM, jamal wrote:
> On Wed, 2012-02-15 at 17:26 -0800, John Fastabend wrote:
>> On 2/15/2012 6:10 AM, Jamal Hadi Salim wrote:
>>> On Tue, 2012-02-14 at 10:57 -0800, John Fastabend wrote:
>>>
>>>> Roopa was likely on the right track here,
>>>>
>>>> http://patchwork.ozlabs.org/patch/123064/
>>>
>>> Doesnt seem related to the bridging stuff - the modeling looks
>>> reasonable however.
>>>
>>
>> The operations are really the same ADD/DEL/GET additional MAC
>> addresses to a port, in this case a macvlan type port. The
>> difference is the  macvlan port type drops any packet with an
>> address not in the FDB where the bridge type floods these.
> 
> Ok.
> [the vlan piece really should have been an integrated part of bridging;
> in the early days this was the case]
> 
> 
>> [root@jf-dev1-dcblab src]# br fdb help
>> Usage: br fdb { add | del | replace } ADDR dev DEV
>>        br fdb {show} [ dev DEV ]
>>
>> In my example I just dumped all bridge devices,
>>
> 
> Ok, makes sense.
> 
> 
>> Seems we need both a synchronize and a { add | del | replace } option.
> 
> I am conflicted on this.
> Not sure if that is a command line thing or something built into a user
> space daemon. It may be useful to have the command line variant but i
> feel having a daemon take care of things helps in faster
> synchronization.
> I think user space is a good spot to add such functionality (as opposed
> to the kernel). That way user space can work with h/ware switching such
> as yours as well as a standalone switching chips (from sillicon vendors
> like Marvel etc).
> IMO, the average user doesnt need to be aware of such low level stuff;
> so the default should be for the user not to be responsible for
> configuration of synchronization. IOW, I want to just run well
> understood user interface tools things like ifconfig, ip link etc, the
> new br tool and not even need to be aware that we are offloading.
> So as long as s/w br0 is mapping to the bridge on ixgb-0 i dont need
> to know ixgb0 h/w bridge exists.
> 

Yes I agree that is the goal.

> One last comment:
> With synchronization there are other challenges when the entry in the
> hardware conflicts with the entry in software when you intend the
> behavior to be the same. This is not such a big deal with bridging but
> becomes more apparent when you start offloading ACLs etc.
> 

OK and these sorts of conflicts certainly don't need to be resolved
by kernel code. So I think this is a reasonable reason to drive the
synchronization into a user space daemon.

> 
>> So I think what your saying is a per port bit to disable learning...
>> hmm but if you start tweaking it too much it looks less and less like a
>> 802.1D bridge and more like something you would want to build with tc or
>> openvswitch or tc+bridge or tc+macvlan.
> 
> These are pretty commodity features in most silicon switching chips ive
> come across. You have a knob to control learning and another to control
> flooding.
> 

All right this looks like a follow up patch to me. First build the interface
to configure the HW FDB. Then a second series to add a flooding knob which
works for both embedded switches and software switches.

> cheers,
> jamal
> 

^ permalink raw reply

* [U-Boot] [PATCH v2 3/6] cm-t35: add EEPROM module and pass Linux a serial number
From: Tom Rini @ 2012-02-17 17:10 UTC (permalink / raw)
  To: u-boot
In-Reply-To: <4F3E22D4.6080409@aribaud.net>

On 02/17/2012 02:50 AM, Albert ARIBAUD wrote:
> Hi Igor,
>
> Le 07/02/2012 09:01, Igor Grinberg a ?crit :
>> Hi Albert,
>>
>> On 02/07/12 00:56, Albert ARIBAUD wrote:
>>> Le 09/01/2012 09:30, Nikita Kiryanov a ?crit :
>>>> On 01/05/2012 04:56 PM, Wolfgang Denk wrote:
>>>>> Dear Igor Grinberg,
>>>>>
>>>>> In message<1325764937-7342-1-git-send-email-grinberg@compulab.co.il>
>>>>> you wrote:
>>>>>> From: Nikita Kiryanov<nikita@compulab.co.il>
>>>>>>
>>>>>> Add board specific EEPROM handling module,
>>>>>> read the serial number from the EEPROM and pass it to Linux.
>>>>> ...
>>>>>
>>>>>> * Fix strange linker warning: ".bss section overlaps previous
>>>>>> sections"
>>>>>> by changing the type of the eeprom_layout static global variable
>>>>>> to int
>>>>>> (probably this is a compiler bug).
>>>>> Probably it is now. Did you inspect the linke rmap?
>>>>
>>>> u-boot.map shows the bss section aligning perfectly with the start of
>>>> rel.dyn.
>>>>
>>>> The difference between the original "working" version and the version
>>>> with the warning
>>>> was an additional byte added by uchar eeprom_layout to the size of
>>>> libcm_t35.o.
>>>> This shouldn't be a problem because the bss section is followed by an
>>>> ALIGN(4), but
>>>> we decided to try changing eeprom_layout to an int and the problem went
>>>> away.
>>>> When we tried to define 4 uchars the problem reappeared.
>>>>
>>>> This suggests that this might be a compiler bug.
>>>>
>>>> There's been some discussion about this in the following threads:
>>>> http://comments.gmane.org/gmane.comp.boot-loaders.u-boot/114646
>>>> http://comments.gmane.org/gmane.comp.boot-loaders.u-boot/90723
>>>> and we're not aware of any fix to the issue.
>>>
>>> Which prompted me to test --no-check-sections with CS 2009q1.
>>> Adding it to LDFLAGS_u-boot does reduce the annoyance from errors to
>>> a warning,
>>> but there is no way to completely make it disappear.
>>
>> So the conclusion is still a tool chain bug, right?
>> Probably it doesn't hurt besides the annoying warning.
>
> Sorry for being slow.
>
> Yes, it seems to be a toolchain bug and limited to a spurious warning.
> However, spurious warnings are always a pain when doing a MAKEALL arm,
> because the board is listed among the unclean and failed builds and then
> one must go through these individually and remember that this specific
> warning is both unimportant and unavoidable.

Have we gone and bugged any toolchain folks about what caused / fixed 
this problem so that perhaps we can option around it or otherwise squash 
this?

-- 
Tom

^ permalink raw reply

* Re: [PATCH] ARM: OMAP: hsmmc: add max_freq field
From: Cousson, Benoit @ 2012-02-17 17:09 UTC (permalink / raw)
  To: Daniel Mack
  Cc: T Krishnamoorthy, Balaji, linux-arm-kernel, linux-mmc, linux-omap,
	Tony Lindgren
In-Reply-To: <CACTFLAMCoKfQbTe2RLTvi1_CH0nKzAOthAwrifNe2+F2sf3uPA@mail.gmail.com>

On 2/17/2012 2:44 PM, Daniel Mack wrote:
> ping? Could anyone care for queueing this please?

There is no OMAP HSMMC dedicated maintainer anymore so I guess you should ping 
Chris Ball <cjb@laptop.org>.

Regards,
Benoit

^ permalink raw reply

* Re: [Qemu-devel] [PATCH 6/6] qmp: add balloon-get-memory-stats & event
From: Michael Roth @ 2012-02-17 17:09 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: armbru, agl, eblake, qemu-devel, Luiz Capitulino
In-Reply-To: <4F3E868D.8020107@us.ibm.com>

On Fri, Feb 17, 2012 at 10:55:41AM -0600, Anthony Liguori wrote:
> On 02/08/2012 02:30 PM, Luiz Capitulino wrote:
> >This commit adds a QMP API for the guest provided memory statistics
> >(long disabled by commit 07b0403dfc2b2ac179ae5b48105096cc2d03375a).
> >
> >The approach taken by the original commit
> >(625a5befc2e3200b396594f002218d235e375da5) was to extend the
> >query-balloon command. It introduced a severe bug though: query-balloon
> >would hang if the guest didn't respond.
> >
> >The approach taken by this commit is asynchronous and thus avoids
> >any QMP hangs.
> >
> >First, a client has to issue the balloon-get-memory-stats command.
> >That command gets the process started by only sending a request to
> >the guest, it doesn't block. When the memory stats are made available
> >by the guest, they are returned to the client as an QMP event.
> >
> >Signed-off-by: Luiz Capitulino<lcapitulino@redhat.com>
> 
> Do we need this to be stable in 1.1?
> 
> We can do this pretty nicely through QOM.  We can have a polling
> property in the virtio-balloon driver, that when set, will enable
> the virtio-balloon device to poll the guest for statistics.
> 
> We can also have properties for each of the memory statistics and a
> timestamp for when the last update was.
> 
> I think this is a friendlier approach for clients, and a cleaner
> approach from a QEMU perspective.
> 
> There's nothing generic about this functionality.  It's extremely
> specific to virtio-balloon.  We just lacked ways to expose device
> specific function pre-QOM.

I'm not so sure, I think proxying guest agent commands through QMP
would hit very similar snags, for instance.

> 
> Regards,
> 
> Anthony Liguori
> 
> 

^ permalink raw reply

* [PATCH] ARM: OMAP: hsmmc: add max_freq field
From: Cousson, Benoit @ 2012-02-17 17:09 UTC (permalink / raw)
  To: linux-arm-kernel
In-Reply-To: <CACTFLAMCoKfQbTe2RLTvi1_CH0nKzAOthAwrifNe2+F2sf3uPA@mail.gmail.com>

On 2/17/2012 2:44 PM, Daniel Mack wrote:
> ping? Could anyone care for queueing this please?

There is no OMAP HSMMC dedicated maintainer anymore so I guess you should ping 
Chris Ball <cjb@laptop.org>.

Regards,
Benoit

^ permalink raw reply

* [Qemu-devel] [PATCH V7 06/11] pci.c: Add pci_check_bar_overlap
From: Anthony PERARD @ 2012-02-17 17:08 UTC (permalink / raw)
  To: QEMU-devel
  Cc: Anthony PERARD, Yuji Shimada, Xen Devel, Michael S . Tsirkin,
	Stefano Stabellini
In-Reply-To: <1329498525-8454-1-git-send-email-anthony.perard@citrix.com>

From: Yuji Shimada <shimada-yxb@necst.nec.co.jp>

This function helps Xen PCI Passthrough device to check for overlap.

Signed-off-by: Yuji Shimada <shimada-yxb@necst.nec.co.jp>
Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 hw/pci.c |   47 +++++++++++++++++++++++++++++++++++++++++++++++
 hw/pci.h |    3 +++
 2 files changed, 50 insertions(+), 0 deletions(-)

diff --git a/hw/pci.c b/hw/pci.c
index 678a8c1..75d6529 100644
--- a/hw/pci.c
+++ b/hw/pci.c
@@ -1985,6 +1985,53 @@ MemoryRegion *pci_address_space_io(PCIDevice *dev)
     return dev->bus->address_space_io;
 }
 
+int pci_check_bar_overlap(PCIDevice *dev,
+                          pcibus_t addr, pcibus_t size, uint8_t type)
+{
+    PCIBus *bus = dev->bus;
+    PCIDevice *devices = NULL;
+    PCIIORegion *r;
+    int i, j;
+    int rc = 0;
+
+    /* check Overlapped to Base Address */
+    for (i = 0; i < ARRAY_SIZE(bus->devices); i++) {
+        devices = bus->devices[i];
+        if (!devices) {
+            continue;
+        }
+
+        /* skip itself */
+        if (devices->devfn == dev->devfn) {
+            continue;
+        }
+
+        for (j = 0; j < PCI_NUM_REGIONS; j++) {
+            r = &devices->io_regions[j];
+
+            /* skip different resource type, but don't skip when
+             * prefetch and non-prefetch memory are compared.
+             */
+            if (type != r->type) {
+                if (type & PCI_BASE_ADDRESS_SPACE_IO ||
+                    r->type & PCI_BASE_ADDRESS_SPACE_IO) {
+                    continue;
+                }
+            }
+
+            if ((addr < (r->addr + r->size)) && ((addr + size) > r->addr)) {
+                printf("Overlapped to device[%02x:%02x.%x][Region:%d]"
+                       "[Address:%"PRIx64"h][Size:%"PRIx64"h]\n",
+                       pci_bus_num(bus), PCI_SLOT(devices->devfn),
+                       PCI_FUNC(devices->devfn), j, r->addr, r->size);
+                rc = 1;
+            }
+        }
+    }
+
+    return rc;
+}
+
 static void pci_device_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *k = DEVICE_CLASS(klass);
diff --git a/hw/pci.h b/hw/pci.h
index 33b0b18..f05fda5 100644
--- a/hw/pci.h
+++ b/hw/pci.h
@@ -566,4 +566,7 @@ extern const VMStateDescription vmstate_pci_device;
     .offset     = vmstate_offset_pointer(_state, _field, PCIDevice), \
 }
 
+int pci_check_bar_overlap(PCIDevice *dev,
+                          pcibus_t addr, pcibus_t size, uint8_t type);
+
 #endif
-- 
Anthony PERARD

^ permalink raw reply related

* [Qemu-devel] [PATCH V7 08/11] Introduce Xen PCI Passthrough, PCI config space helpers (2/3)
From: Anthony PERARD @ 2012-02-17 17:08 UTC (permalink / raw)
  To: QEMU-devel
  Cc: Anthony PERARD, Guy Zana, Xen Devel, Allen Kay,
	Stefano Stabellini
In-Reply-To: <1329498525-8454-1-git-send-email-anthony.perard@citrix.com>

From: Allen Kay <allen.m.kay@intel.com>

A more complete history can be found here:
git://xenbits.xensource.com/qemu-xen-unstable.git

Signed-off-by: Allen Kay <allen.m.kay@intel.com>
Signed-off-by: Guy Zana <guy@neocleus.com>
Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 hw/xen_pci_passthrough.c             |   10 +
 hw/xen_pci_passthrough.h             |    2 +
 hw/xen_pci_passthrough_config_init.c | 1431 ++++++++++++++++++++++++++++++++++
 3 files changed, 1443 insertions(+), 0 deletions(-)

diff --git a/hw/xen_pci_passthrough.c b/hw/xen_pci_passthrough.c
index 3f305dd..26a3bdd 100644
--- a/hw/xen_pci_passthrough.c
+++ b/hw/xen_pci_passthrough.c
@@ -676,6 +676,13 @@ static int pt_initfn(PCIDevice *d)
     /* Handle real device's MMIO/PIO BARs */
     pt_register_regions(s);
 
+    /* reinitialize each config register to be emulated */
+    if (pt_config_init(s)) {
+        PT_ERR(d, "PCI Config space initialisation failed.\n");
+        host_pci_device_put(s->real_device);
+        return -1;
+    }
+
     /* Bind interrupt */
     if (!s->dev.config[PCI_INTERRUPT_PIN]) {
         PT_LOG(d, "no pin interrupt\n");
@@ -773,6 +780,9 @@ static int pt_unregister_device(PCIDevice *d)
         }
     }
 
+    /* delete all emulated config registers */
+    pt_config_delete(s);
+
     /* unregister real device's MMIO/PIO BARs */
     pt_unregister_regions(s);
 
diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
index ea6719c..7ebc793 100644
--- a/hw/xen_pci_passthrough.h
+++ b/hw/xen_pci_passthrough.h
@@ -61,6 +61,8 @@ typedef int (*conf_byte_read)
 #define PT_BAR_ALLF         0xFFFFFFFF  /* BAR ALLF value */
 #define PT_PCI_BAR_UNMAPPED (-1)
 
+#define PCI_CAP_MAX 48
+
 
 typedef enum {
     GRP_TYPE_HARDWIRED = 0,                     /* 0 Hardwired reg group */
diff --git a/hw/xen_pci_passthrough_config_init.c b/hw/xen_pci_passthrough_config_init.c
index 1e9de64..f1fffd1 100644
--- a/hw/xen_pci_passthrough_config_init.c
+++ b/hw/xen_pci_passthrough_config_init.c
@@ -1,11 +1,1442 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Alex Novik <alex@neocleus.com>
+ * Allen Kay <allen.m.kay@intel.com>
+ * Guy Zana <guy@neocleus.com>
+ *
+ * This file implements direct PCI assignment to a HVM guest
+ */
+
+#include "qemu-timer.h"
+#include "xen_backend.h"
 #include "xen_pci_passthrough.h"
 
+#define PT_MERGE_VALUE(value, data, val_mask) \
+    (((value) & (val_mask)) | ((data) & ~(val_mask)))
+
+#define PT_INVALID_REG          0xFFFFFFFF      /* invalid register value */
+
+/* prototype */
+
+static int pt_ptr_reg_init(XenPCIPassthroughState *s, XenPTRegInfo *reg,
+                           uint32_t real_offset, uint32_t *data);
+
+
+/* helper */
+
+/* A return value of 1 means the capability should NOT be exposed to guest. */
+static int pt_hide_dev_cap(const HostPCIDevice *d, uint8_t grp_id)
+{
+    switch (grp_id) {
+    case PCI_CAP_ID_EXP:
+        /* The PCI Express Capability Structure of the VF of Intel 82599 10GbE
+         * Controller looks trivial, e.g., the PCI Express Capabilities
+         * Register is 0. We should not try to expose it to guest.
+         *
+         * The datasheet is available at
+         * http://download.intel.com/design/network/datashts/82599_datasheet.pdf
+         *
+         * See 'Table 9.7. VF PCIe Configuration Space' of the datasheet, the
+         * PCI Express Capability Structure of the VF of Intel 82599 10GbE
+         * Controller looks trivial, e.g., the PCI Express Capabilities
+         * Register is 0, so the Capability Version is 0 and
+         * pt_pcie_size_init() would fail.
+         */
+        if (d->vendor_id == PCI_VENDOR_ID_INTEL &&
+            d->device_id == PCI_DEVICE_ID_INTEL_82599_VF) {
+            return 1;
+        }
+        break;
+    }
+    return 0;
+}
+
+/*   find emulate register group entry */
 XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address)
 {
+    XenPTRegGroup *entry = NULL;
+
+    /* find register group entry */
+    QLIST_FOREACH(entry, &s->reg_grp_tbl, entries) {
+        /* check address */
+        if ((entry->base_offset <= address)
+            && ((entry->base_offset + entry->size) > address)) {
+            return entry;
+        }
+    }
+
+    /* group entry not found */
     return NULL;
 }
 
+/* find emulate register entry */
 XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address)
 {
+    XenPTReg *reg_entry = NULL;
+    XenPTRegInfo *reg = NULL;
+    uint32_t real_offset = 0;
+
+    /* find register entry */
+    QLIST_FOREACH(reg_entry, &reg_grp->reg_tbl_list, entries) {
+        reg = reg_entry->reg;
+        real_offset = reg_grp->base_offset + reg->offset;
+        /* check address */
+        if ((real_offset <= address)
+            && ((real_offset + reg->size) > address)) {
+            return reg_entry;
+        }
+    }
+
     return NULL;
 }
+
+/* parse BAR */
+static PTBarFlag pt_bar_reg_parse(XenPCIPassthroughState *s, XenPTRegInfo *reg)
+{
+    PCIDevice *d = &s->dev;
+    XenPTRegion *region = NULL;
+    PCIIORegion *r;
+    int index = 0;
+
+    /* check 64bit BAR */
+    index = pt_bar_offset_to_index(reg->offset);
+    if ((0 < index) && (index < PCI_ROM_SLOT)) {
+        int flags = s->real_device->io_regions[index - 1].flags;
+
+        if ((flags & IORESOURCE_MEM) && (flags & IORESOURCE_MEM_64)) {
+            region = &s->bases[index - 1];
+            if (region->bar_flag != PT_BAR_FLAG_UPPER) {
+                return PT_BAR_FLAG_UPPER;
+            }
+        }
+    }
+
+    /* check unused BAR */
+    r = &d->io_regions[index];
+    if (r->size == 0) {
+        return PT_BAR_FLAG_UNUSED;
+    }
+
+    /* for ExpROM BAR */
+    if (index == PCI_ROM_SLOT) {
+        return PT_BAR_FLAG_MEM;
+    }
+
+    /* check BAR I/O indicator */
+    if (s->real_device->io_regions[index].flags & IORESOURCE_IO) {
+        return PT_BAR_FLAG_IO;
+    } else {
+        return PT_BAR_FLAG_MEM;
+    }
+}
+
+
+/****************
+ * general register functions
+ */
+
+/* register initialization function */
+
+static int pt_common_reg_init(XenPCIPassthroughState *s,
+                              XenPTRegInfo *reg, uint32_t real_offset,
+                              uint32_t *data)
+{
+    *data = reg->init_val;
+    return 0;
+}
+
+/* Read register functions */
+
+static int pt_byte_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                            uint8_t *value, uint8_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint8_t valid_emu_mask = 0;
+
+    /* emulate byte register */
+    valid_emu_mask = reg->emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
+
+    return 0;
+}
+static int pt_word_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                            uint16_t *value, uint16_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint16_t valid_emu_mask = 0;
+
+    /* emulate word register */
+    valid_emu_mask = reg->emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
+
+    return 0;
+}
+static int pt_long_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                            uint32_t *value, uint32_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint32_t valid_emu_mask = 0;
+
+    /* emulate long register */
+    valid_emu_mask = reg->emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
+
+   return 0;
+}
+
+/* Write register functions */
+
+static int pt_byte_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                             uint8_t *value, uint8_t dev_value,
+                             uint8_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint8_t writable_mask = 0;
+    uint8_t throughable_mask = 0;
+
+    /* modify emulate register */
+    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~reg->emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    return 0;
+}
+static int pt_word_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                             uint16_t *value, uint16_t dev_value,
+                             uint16_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint16_t writable_mask = 0;
+    uint16_t throughable_mask = 0;
+
+    /* modify emulate register */
+    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~reg->emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    return 0;
+}
+static int pt_long_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                             uint32_t *value, uint32_t dev_value,
+                             uint32_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint32_t writable_mask = 0;
+    uint32_t throughable_mask = 0;
+
+    /* modify emulate register */
+    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~reg->emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    return 0;
+}
+
+
+/* XenPTRegInfo declaration
+ * - only for emulated register (either a part or whole bit).
+ * - for passthrough register that need special behavior (like interacting with
+ *   other component), set emu_mask to all 0 and specify r/w func properly.
+ * - do NOT use ALL F for init_val, otherwise the tbl will not be registered.
+ */
+
+/********************
+ * Header Type0
+ */
+
+static int pt_vendor_reg_init(XenPCIPassthroughState *s,
+                              XenPTRegInfo *reg, uint32_t real_offset,
+                              uint32_t *data)
+{
+    *data = s->real_device->vendor_id;
+    return 0;
+}
+static int pt_device_reg_init(XenPCIPassthroughState *s,
+                              XenPTRegInfo *reg, uint32_t real_offset,
+                              uint32_t *data)
+{
+    *data = s->real_device->device_id;
+    return 0;
+}
+static int pt_status_reg_init(XenPCIPassthroughState *s,
+                              XenPTRegInfo *reg, uint32_t real_offset,
+                              uint32_t *data)
+{
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    uint32_t reg_field = 0;
+
+    /* find Header register group */
+    reg_grp_entry = pt_find_reg_grp(s, PCI_CAPABILITY_LIST);
+    if (reg_grp_entry) {
+        /* find Capabilities Pointer register */
+        reg_entry = pt_find_reg(reg_grp_entry, PCI_CAPABILITY_LIST);
+        if (reg_entry) {
+            /* check Capabilities Pointer register */
+            if (reg_entry->data) {
+                reg_field |= PCI_STATUS_CAP_LIST;
+            } else {
+                reg_field &= ~PCI_STATUS_CAP_LIST;
+            }
+        } else {
+            xen_shutdown_fatal_error("Internal error: Couldn't find XenPTReg*"
+                                     " for Capabilities Pointer register."
+                                     " (%s)\n", __func__);
+            return -1;
+        }
+    } else {
+        xen_shutdown_fatal_error("Internal error: Couldn't find XenPTRegGroup"
+                                 " for Header. (%s)\n", __func__);
+        return -1;
+    }
+
+    *data = reg_field;
+    return 0;
+}
+static int pt_header_type_reg_init(XenPCIPassthroughState *s,
+                                   XenPTRegInfo *reg, uint32_t real_offset,
+                                   uint32_t *data)
+{
+    /* read PCI_HEADER_TYPE */
+    *data = reg->init_val | 0x80;
+    return 0;
+}
+
+/* initialize Interrupt Pin register */
+static int pt_irqpin_reg_init(XenPCIPassthroughState *s,
+                              XenPTRegInfo *reg, uint32_t real_offset,
+                              uint32_t *data)
+{
+    *data = pci_read_intx(s);
+    return 0;
+}
+
+/* Command register */
+static int pt_cmd_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                           uint16_t *value, uint16_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint16_t valid_emu_mask = 0;
+    uint16_t emu_mask = reg->emu_mask;
+
+    if (s->is_virtfn) {
+        emu_mask |= PCI_COMMAND_MEMORY;
+    }
+
+    /* emulate word register */
+    valid_emu_mask = emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
+
+    return 0;
+}
+static int pt_cmd_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                            uint16_t *value, uint16_t dev_value,
+                            uint16_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint16_t writable_mask = 0;
+    uint16_t throughable_mask = 0;
+    uint16_t wr_value = *value;
+    uint16_t emu_mask = reg->emu_mask;
+
+    if (s->is_virtfn) {
+        emu_mask |= PCI_COMMAND_MEMORY;
+    }
+
+    /* modify emulate register */
+    writable_mask = ~reg->ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~emu_mask & valid_mask;
+
+    if (*value & PCI_COMMAND_INTX_DISABLE) {
+        throughable_mask |= PCI_COMMAND_INTX_DISABLE;
+    } else {
+        if (s->machine_irq) {
+            throughable_mask |= PCI_COMMAND_INTX_DISABLE;
+        }
+    }
+
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    pt_bar_mapping(s, wr_value & PCI_COMMAND_IO,
+                   wr_value & PCI_COMMAND_MEMORY);
+
+    return 0;
+}
+
+/* BAR */
+#define PT_BAR_MEM_RO_MASK      0x0000000F      /* BAR ReadOnly mask(Memory) */
+#define PT_BAR_MEM_EMU_MASK     0xFFFFFFF0      /* BAR emul mask(Memory) */
+#define PT_BAR_IO_RO_MASK       0x00000003      /* BAR ReadOnly mask(I/O) */
+#define PT_BAR_IO_EMU_MASK      0xFFFFFFFC      /* BAR emul mask(I/O) */
+
+static inline uint32_t base_address_with_flags(HostPCIIORegion *hr)
+{
+    if ((hr->flags & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) {
+        return hr->base_addr | (hr->flags & ~PCI_BASE_ADDRESS_IO_MASK);
+    } else {
+        return hr->base_addr | (hr->flags & ~PCI_BASE_ADDRESS_MEM_MASK);
+    }
+}
+
+static int pt_bar_reg_init(XenPCIPassthroughState *s, XenPTRegInfo *reg,
+                           uint32_t real_offset, uint32_t *data)
+{
+    uint32_t reg_field = 0;
+    int index;
+
+    index = pt_bar_offset_to_index(reg->offset);
+    if (index < 0 || index >= PCI_NUM_REGIONS) {
+        PT_ERR(&s->dev, "Internal error: Invalid BAR index [%d].\n", index);
+        return -1;
+    }
+
+    s->bases[index].e_physbase = PT_PCI_BAR_UNMAPPED;
+
+    /* set BAR flag */
+    s->bases[index].bar_flag = pt_bar_reg_parse(s, reg);
+    if (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED) {
+        reg_field = PT_INVALID_REG;
+    }
+
+    *data = reg_field;
+    return 0;
+}
+static int pt_bar_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                           uint32_t *value, uint32_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint32_t valid_emu_mask = 0;
+    uint32_t bar_emu_mask = 0;
+    int index;
+
+    /* get BAR index */
+    index = pt_bar_offset_to_index(reg->offset);
+    if (index < 0 || index >= PCI_NUM_REGIONS) {
+        PT_ERR(&s->dev, "Internal error: Invalid BAR index [%d].\n", index);
+        return -1;
+    }
+
+    /* use fixed-up value from kernel sysfs */
+    *value = base_address_with_flags(&s->real_device->io_regions[index]);
+
+    /* set emulate mask depend on BAR flag */
+    switch (s->bases[index].bar_flag) {
+    case PT_BAR_FLAG_MEM:
+        bar_emu_mask = PT_BAR_MEM_EMU_MASK;
+        break;
+    case PT_BAR_FLAG_IO:
+        bar_emu_mask = PT_BAR_IO_EMU_MASK;
+        break;
+    case PT_BAR_FLAG_UPPER:
+        bar_emu_mask = PT_BAR_ALLF;
+        break;
+    default:
+        break;
+    }
+
+    /* emulate BAR */
+    valid_emu_mask = bar_emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
+
+   return 0;
+}
+static int pt_bar_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                            uint32_t *value, uint32_t dev_value,
+                            uint32_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    XenPTRegion *base = NULL;
+    PCIDevice *d = &s->dev;
+    const PCIIORegion *r;
+    uint32_t writable_mask = 0;
+    uint32_t throughable_mask = 0;
+    uint32_t bar_emu_mask = 0;
+    uint32_t bar_ro_mask = 0;
+    uint32_t r_size = 0;
+    int index = 0;
+
+    index = pt_bar_offset_to_index(reg->offset);
+    if (index < 0 || index >= PCI_NUM_REGIONS) {
+        PT_ERR(d, "Internal error: Invalid BAR index [%d].\n", index);
+        return -1;
+    }
+
+    r = &d->io_regions[index];
+    base = &s->bases[index];
+    r_size = pt_get_emul_size(base->bar_flag, r->size);
+
+    /* set emulate mask and read-only mask values depend on the BAR flag */
+    switch (s->bases[index].bar_flag) {
+    case PT_BAR_FLAG_MEM:
+        bar_emu_mask = PT_BAR_MEM_EMU_MASK;
+        bar_ro_mask = PT_BAR_MEM_RO_MASK | (r_size - 1);
+        break;
+    case PT_BAR_FLAG_IO:
+        bar_emu_mask = PT_BAR_IO_EMU_MASK;
+        bar_ro_mask = PT_BAR_IO_RO_MASK | (r_size - 1);
+        break;
+    case PT_BAR_FLAG_UPPER:
+        bar_emu_mask = PT_BAR_ALLF;
+        bar_ro_mask = 0;    /* all upper 32bit are R/W */
+        break;
+    default:
+        break;
+    }
+
+    /* modify emulate register */
+    writable_mask = bar_emu_mask & ~bar_ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* check whether we need to update the virtual region address or not */
+    switch (s->bases[index].bar_flag) {
+    case PT_BAR_FLAG_MEM:
+        /* nothing to do */
+        break;
+    case PT_BAR_FLAG_IO:
+        /* nothing to do */
+        break;
+    case PT_BAR_FLAG_UPPER:
+        if (cfg_entry->data) {
+            if (cfg_entry->data != (PT_BAR_ALLF & ~bar_ro_mask)) {
+                PT_WARN(d, "Guest attempt to set high MMIO Base Address. "
+                        "Ignore mapping. "
+                        "(offset: 0x%02x, high address: 0x%08x)\n",
+                        reg->offset, cfg_entry->data);
+            }
+        }
+        break;
+    default:
+        break;
+    }
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~bar_emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    /* After BAR reg update, we need to remap BAR */
+    reg_grp_entry = pt_find_reg_grp(s, PCI_COMMAND);
+    if (reg_grp_entry) {
+        reg_entry = pt_find_reg(reg_grp_entry, PCI_COMMAND);
+        if (reg_entry) {
+            pt_bar_mapping_one(s, index, reg_entry->data & PCI_COMMAND_IO,
+                               reg_entry->data & PCI_COMMAND_MEMORY);
+        }
+    }
+
+    return 0;
+}
+
+/* write Exp ROM BAR */
+static int pt_exp_rom_bar_reg_write(XenPCIPassthroughState *s,
+                                    XenPTReg *cfg_entry, uint32_t *value,
+                                    uint32_t dev_value, uint32_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    XenPTRegion *base = NULL;
+    PCIDevice *d = (PCIDevice *)&s->dev;
+    PCIIORegion *r;
+    uint32_t writable_mask = 0;
+    uint32_t throughable_mask = 0;
+    pcibus_t r_size = 0;
+    uint32_t bar_emu_mask = 0;
+    uint32_t bar_ro_mask = 0;
+
+    r = &d->io_regions[PCI_ROM_SLOT];
+    r_size = r->size;
+    base = &s->bases[PCI_ROM_SLOT];
+    /* align memory type resource size */
+    pt_get_emul_size(base->bar_flag, r_size);
+
+    /* set emulate mask and read-only mask */
+    bar_emu_mask = reg->emu_mask;
+    bar_ro_mask = (reg->ro_mask | (r_size - 1)) & ~PCI_ROM_ADDRESS_ENABLE;
+
+    /* modify emulate register */
+    writable_mask = ~bar_ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* update the corresponding virtual region address */
+    /*
+     * When guest code tries to get block size of mmio, it will write all "1"s
+     * into pci bar register. In this case, cfg_entry->data == writable_mask.
+     * Especially for devices with large mmio, the value of writable_mask
+     * is likely to be a guest physical address that has been mapped to ram
+     * rather than mmio. Remapping this value to mmio should be prevented.
+     */
+
+    if (cfg_entry->data != writable_mask) {
+        r->addr = cfg_entry->data;
+    }
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~bar_emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    /* After BAR reg update, we need to remap BAR*/
+    reg_grp_entry = pt_find_reg_grp(s, PCI_COMMAND);
+    if (reg_grp_entry) {
+        reg_entry = pt_find_reg(reg_grp_entry, PCI_COMMAND);
+        if (reg_entry) {
+            pt_bar_mapping_one(s, PCI_ROM_SLOT,
+                               reg_entry->data & PCI_COMMAND_IO,
+                               reg_entry->data & PCI_COMMAND_MEMORY);
+        }
+    }
+
+    return 0;
+}
+
+/* Header Type0 reg static infomation table */
+static XenPTRegInfo pt_emu_reg_header0_tbl[] = {
+    /* Vendor ID reg */
+    {
+        .offset     = PCI_VENDOR_ID,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xFFFF,
+        .emu_mask   = 0xFFFF,
+        .init       = pt_vendor_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+    },
+    /* Device ID reg */
+    {
+        .offset     = PCI_DEVICE_ID,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xFFFF,
+        .emu_mask   = 0xFFFF,
+        .init       = pt_device_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+    },
+    /* Command reg */
+    {
+        .offset     = PCI_COMMAND,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xF880,
+        .emu_mask   = 0x0740,
+        .init       = pt_common_reg_init,
+        .u.w.read   = pt_cmd_reg_read,
+        .u.w.write  = pt_cmd_reg_write,
+    },
+    /* Capabilities Pointer reg */
+    {
+        .offset     = PCI_CAPABILITY_LIST,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0xFF,
+        .init       = pt_ptr_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+    },
+    /* Status reg */
+    /* use emulated Cap Ptr value to initialize,
+     * so need to be declared after Cap Ptr reg
+     */
+    {
+        .offset     = PCI_STATUS,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0x06FF,
+        .emu_mask   = 0x0010,
+        .init       = pt_status_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+    },
+    /* Cache Line Size reg */
+    {
+        .offset     = PCI_CACHE_LINE_SIZE,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0x00,
+        .emu_mask   = 0xFF,
+        .init       = pt_common_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+    },
+    /* Latency Timer reg */
+    {
+        .offset     = PCI_LATENCY_TIMER,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0x00,
+        .emu_mask   = 0xFF,
+        .init       = pt_common_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+    },
+    /* Header Type reg */
+    {
+        .offset     = PCI_HEADER_TYPE,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0x00,
+        .init       = pt_header_type_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+    },
+    /* Interrupt Line reg */
+    {
+        .offset     = PCI_INTERRUPT_LINE,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0x00,
+        .emu_mask   = 0xFF,
+        .init       = pt_common_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+    },
+    /* Interrupt Pin reg */
+    {
+        .offset     = PCI_INTERRUPT_PIN,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0xFF,
+        .init       = pt_irqpin_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+    },
+    /* BAR 0 reg */
+    /* mask of BAR need to be decided later, depends on IO/MEM type */
+    {
+        .offset     = PCI_BASE_ADDRESS_0,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_bar_reg_read,
+        .u.dw.write = pt_bar_reg_write,
+    },
+    /* BAR 1 reg */
+    {
+        .offset     = PCI_BASE_ADDRESS_1,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_bar_reg_read,
+        .u.dw.write = pt_bar_reg_write,
+    },
+    /* BAR 2 reg */
+    {
+        .offset     = PCI_BASE_ADDRESS_2,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_bar_reg_read,
+        .u.dw.write = pt_bar_reg_write,
+    },
+    /* BAR 3 reg */
+    {
+        .offset     = PCI_BASE_ADDRESS_3,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_bar_reg_read,
+        .u.dw.write = pt_bar_reg_write,
+    },
+    /* BAR 4 reg */
+    {
+        .offset     = PCI_BASE_ADDRESS_4,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_bar_reg_read,
+        .u.dw.write = pt_bar_reg_write,
+    },
+    /* BAR 5 reg */
+    {
+        .offset     = PCI_BASE_ADDRESS_5,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_bar_reg_read,
+        .u.dw.write = pt_bar_reg_write,
+    },
+    /* Expansion ROM BAR reg */
+    {
+        .offset     = PCI_ROM_ADDRESS,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .ro_mask    = 0x000007FE,
+        .emu_mask   = 0xFFFFF800,
+        .init       = pt_bar_reg_init,
+        .u.dw.read  = pt_long_reg_read,
+        .u.dw.write = pt_exp_rom_bar_reg_write,
+    },
+    {
+        .size = 0,
+    },
+};
+
+
+/*********************************
+ * Vital Product Data Capability
+ */
+
+/* Vital Product Data Capability Structure reg static infomation table */
+static XenPTRegInfo pt_emu_reg_vpd_tbl[] = {
+    {
+        .offset     = PCI_CAP_LIST_NEXT,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0xFF,
+        .init       = pt_ptr_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+    },
+    {
+        .size = 0,
+    },
+};
+
+
+/**************************************
+ * Vendor Specific Capability
+ */
+
+/* Vendor Specific Capability Structure reg static infomation table */
+static XenPTRegInfo pt_emu_reg_vendor_tbl[] = {
+    {
+        .offset     = PCI_CAP_LIST_NEXT,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0xFF,
+        .init       = pt_ptr_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+    },
+    {
+        .size = 0,
+    },
+};
+
+
+/*****************************
+ * PCI Express Capability
+ */
+
+static inline uint8_t get_capability_version(XenPCIPassthroughState *s,
+                                             uint32_t offset)
+{
+    uint8_t flags = pci_get_byte(s->dev.config + offset + PCI_EXP_FLAGS);
+    return flags & PCI_EXP_FLAGS_VERS;
+}
+
+static inline uint8_t get_device_type(XenPCIPassthroughState *s,
+                                      uint32_t offset)
+{
+    uint8_t flags = pci_get_byte(s->dev.config + offset + PCI_EXP_FLAGS);
+    return (flags & PCI_EXP_FLAGS_TYPE) >> 4;
+}
+
+/* initialize Link Control register */
+static int pt_linkctrl_reg_init(XenPCIPassthroughState *s,
+                                XenPTRegInfo *reg, uint32_t real_offset,
+                                uint32_t *data)
+{
+    uint8_t cap_ver = get_capability_version(s, real_offset - reg->offset);
+    uint8_t dev_type = get_device_type(s, real_offset - reg->offset);
+
+    /* no need to initialize in case of Root Complex Integrated Endpoint
+     * with cap_ver 1.x
+     */
+    if ((dev_type == PCI_EXP_TYPE_RC_END) && (cap_ver == 1)) {
+        *data = PT_INVALID_REG;
+    }
+
+    *data = reg->init_val;
+    return 0;
+}
+/* initialize Device Control 2 register */
+static int pt_devctrl2_reg_init(XenPCIPassthroughState *s,
+                                XenPTRegInfo *reg, uint32_t real_offset,
+                                uint32_t *data)
+{
+    uint8_t cap_ver = get_capability_version(s, real_offset - reg->offset);
+
+    /* no need to initialize in case of cap_ver 1.x */
+    if (cap_ver == 1) {
+        *data = PT_INVALID_REG;
+    }
+
+    *data = reg->init_val;
+    return 0;
+}
+/* initialize Link Control 2 register */
+static int pt_linkctrl2_reg_init(XenPCIPassthroughState *s,
+                                 XenPTRegInfo *reg, uint32_t real_offset,
+                                 uint32_t *data)
+{
+    uint8_t cap_ver = get_capability_version(s, real_offset - reg->offset);
+    uint32_t reg_field = 0;
+
+    /* no need to initialize in case of cap_ver 1.x */
+    if (cap_ver == 1) {
+        reg_field = PT_INVALID_REG;
+    } else {
+        /* set Supported Link Speed */
+        uint8_t lnkcap = pci_get_byte(s->dev.config + real_offset - reg->offset
+                                      + PCI_EXP_LNKCAP);
+        reg_field |= PCI_EXP_LNKCAP_SLS & lnkcap;
+    }
+
+    *data = reg_field;
+    return 0;
+}
+
+/* PCI Express Capability Structure reg static infomation table */
+static XenPTRegInfo pt_emu_reg_pcie_tbl[] = {
+    /* Next Pointer reg */
+    {
+        .offset     = PCI_CAP_LIST_NEXT,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0xFF,
+        .init       = pt_ptr_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+    },
+    /* Device Capabilities reg */
+    {
+        .offset     = PCI_EXP_DEVCAP,
+        .size       = 4,
+        .init_val   = 0x00000000,
+        .ro_mask    = 0x1FFCFFFF,
+        .emu_mask   = 0x10000000,
+        .init       = pt_common_reg_init,
+        .u.dw.read  = pt_long_reg_read,
+        .u.dw.write = pt_long_reg_write,
+    },
+    /* Device Control reg */
+    {
+        .offset     = PCI_EXP_DEVCTL,
+        .size       = 2,
+        .init_val   = 0x2810,
+        .ro_mask    = 0x8400,
+        .emu_mask   = 0xFFFF,
+        .init       = pt_common_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+    },
+    /* Link Control reg */
+    {
+        .offset     = PCI_EXP_LNKCTL,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xFC34,
+        .emu_mask   = 0xFFFF,
+        .init       = pt_linkctrl_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+    },
+    /* Device Control 2 reg */
+    {
+        .offset     = 0x28,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xFFE0,
+        .emu_mask   = 0xFFFF,
+        .init       = pt_devctrl2_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+    },
+    /* Link Control 2 reg */
+    {
+        .offset     = 0x30,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xE040,
+        .emu_mask   = 0xFFFF,
+        .init       = pt_linkctrl2_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+    },
+    {
+        .size = 0,
+    },
+};
+
+
+/*********************************
+ * Power Management Capability
+ */
+
+/* read Power Management Control/Status register */
+static int pt_pmcsr_reg_read(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                             uint16_t *value, uint16_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint16_t valid_emu_mask = reg->emu_mask;
+
+    valid_emu_mask |= PCI_PM_CTRL_STATE_MASK | PCI_PM_CTRL_NO_SOFT_RESET;
+
+    valid_emu_mask = valid_emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, cfg_entry->data, ~valid_emu_mask);
+
+    return 0;
+}
+/* write Power Management Control/Status register */
+static int pt_pmcsr_reg_write(XenPCIPassthroughState *s, XenPTReg *cfg_entry,
+                              uint16_t *value, uint16_t dev_value,
+                              uint16_t valid_mask)
+{
+    XenPTRegInfo *reg = cfg_entry->reg;
+    uint16_t emu_mask = reg->emu_mask;
+    uint16_t writable_mask = 0;
+    uint16_t throughable_mask = 0;
+
+    emu_mask |= PCI_PM_CTRL_STATE_MASK | PCI_PM_CTRL_NO_SOFT_RESET;
+
+    /* modify emulate register */
+    writable_mask = emu_mask & ~reg->ro_mask & valid_mask;
+    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
+
+    /* create value for writing to I/O device register */
+    throughable_mask = ~emu_mask & valid_mask;
+    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
+
+    return 0;
+}
+
+/* Power Management Capability reg static infomation table */
+static XenPTRegInfo pt_emu_reg_pm_tbl[] = {
+    /* Next Pointer reg */
+    {
+        .offset     = PCI_CAP_LIST_NEXT,
+        .size       = 1,
+        .init_val   = 0x00,
+        .ro_mask    = 0xFF,
+        .emu_mask   = 0xFF,
+        .init       = pt_ptr_reg_init,
+        .u.b.read   = pt_byte_reg_read,
+        .u.b.write  = pt_byte_reg_write,
+    },
+    /* Power Management Capabilities reg */
+    {
+        .offset     = PCI_CAP_FLAGS,
+        .size       = 2,
+        .init_val   = 0x0000,
+        .ro_mask    = 0xFFFF,
+        .emu_mask   = 0xF9C8,
+        .init       = pt_common_reg_init,
+        .u.w.read   = pt_word_reg_read,
+        .u.w.write  = pt_word_reg_write,
+    },
+    /* PCI Power Management Control/Status reg */
+    {
+        .offset     = PCI_PM_CTRL,
+        .size       = 2,
+        .init_val   = 0x0008,
+        .ro_mask    = 0xE1FC,
+        .emu_mask   = 0x8100,
+        .init       = pt_common_reg_init,
+        .u.w.read   = pt_pmcsr_reg_read,
+        .u.w.write  = pt_pmcsr_reg_write,
+    },
+    {
+        .size = 0,
+    },
+};
+
+
+/****************************
+ * Capabilities
+ */
+
+/* capability structure register group size functions */
+
+static int pt_reg_grp_size_init(XenPCIPassthroughState *s,
+                                const XenPTRegGroupInfo *grp_reg,
+                                uint32_t base_offset, uint8_t *size)
+{
+    *size = grp_reg->grp_size;
+    return 0;
+}
+/* get Vendor Specific Capability Structure register group size */
+static int pt_vendor_size_init(XenPCIPassthroughState *s,
+                               const XenPTRegGroupInfo *grp_reg,
+                               uint32_t base_offset, uint8_t *size)
+{
+    *size = pci_get_byte(s->dev.config + base_offset + 0x02);
+    return 0;
+}
+/* get PCI Express Capability Structure register group size */
+static int pt_pcie_size_init(XenPCIPassthroughState *s,
+                             const XenPTRegGroupInfo *grp_reg,
+                             uint32_t base_offset, uint8_t *size)
+{
+    PCIDevice *d = &s->dev;
+    uint8_t version = get_capability_version(s, base_offset);
+    uint8_t type = get_device_type(s, base_offset);
+    uint8_t pcie_size = 0;
+
+
+    /* calculate size depend on capability version and device/port type */
+    /* in case of PCI Express Base Specification Rev 1.x */
+    if (version == 1) {
+        /* The PCI Express Capabilities, Device Capabilities, and Device
+         * Status/Control registers are required for all PCI Express devices.
+         * The Link Capabilities and Link Status/Control are required for all
+         * Endpoints that are not Root Complex Integrated Endpoints. Endpoints
+         * are not required to implement registers other than those listed
+         * above and terminate the capability structure.
+         */
+        switch (type) {
+        case PCI_EXP_TYPE_ENDPOINT:
+        case PCI_EXP_TYPE_LEG_END:
+            pcie_size = 0x14;
+            break;
+        case PCI_EXP_TYPE_RC_END:
+            /* has no link */
+            pcie_size = 0x0C;
+            break;
+        /* only EndPoint passthrough is supported */
+        case PCI_EXP_TYPE_ROOT_PORT:
+        case PCI_EXP_TYPE_UPSTREAM:
+        case PCI_EXP_TYPE_DOWNSTREAM:
+        case PCI_EXP_TYPE_PCI_BRIDGE:
+        case PCI_EXP_TYPE_PCIE_BRIDGE:
+        case PCI_EXP_TYPE_RC_EC:
+        default:
+            PT_ERR(d, "Unsupported device/port type %#x.\n", type);
+            return -1;
+        }
+    }
+    /* in case of PCI Express Base Specification Rev 2.0 */
+    else if (version == 2) {
+        switch (type) {
+        case PCI_EXP_TYPE_ENDPOINT:
+        case PCI_EXP_TYPE_LEG_END:
+        case PCI_EXP_TYPE_RC_END:
+            /* For Functions that do not implement the registers,
+             * these spaces must be hardwired to 0b.
+             */
+            pcie_size = 0x3C;
+            break;
+        /* only EndPoint passthrough is supported */
+        case PCI_EXP_TYPE_ROOT_PORT:
+        case PCI_EXP_TYPE_UPSTREAM:
+        case PCI_EXP_TYPE_DOWNSTREAM:
+        case PCI_EXP_TYPE_PCI_BRIDGE:
+        case PCI_EXP_TYPE_PCIE_BRIDGE:
+        case PCI_EXP_TYPE_RC_EC:
+        default:
+            PT_ERR(d, "Unsupported device/port type %#x.\n", type);
+            return -1;
+        }
+    } else {
+        PT_ERR(d, "Unsupported capability version %#x.\n", version);
+        return -1;
+    }
+
+    *size = pcie_size;
+    return 0;
+}
+
+static const XenPTRegGroupInfo pt_emu_reg_grp_tbl[] = {
+    /* Header Type0 reg group */
+    {
+        .grp_id      = 0xFF,
+        .grp_type    = GRP_TYPE_EMU,
+        .grp_size    = 0x40,
+        .size_init   = pt_reg_grp_size_init,
+        .emu_reg_tbl = pt_emu_reg_header0_tbl,
+    },
+    /* PCI PowerManagement Capability reg group */
+    {
+        .grp_id      = PCI_CAP_ID_PM,
+        .grp_type    = GRP_TYPE_EMU,
+        .grp_size    = PCI_PM_SIZEOF,
+        .size_init   = pt_reg_grp_size_init,
+        .emu_reg_tbl = pt_emu_reg_pm_tbl,
+    },
+    /* AGP Capability Structure reg group */
+    {
+        .grp_id     = PCI_CAP_ID_AGP,
+        .grp_type   = GRP_TYPE_HARDWIRED,
+        .grp_size   = 0x30,
+        .size_init  = pt_reg_grp_size_init,
+    },
+    /* Vital Product Data Capability Structure reg group */
+    {
+        .grp_id      = PCI_CAP_ID_VPD,
+        .grp_type    = GRP_TYPE_EMU,
+        .grp_size    = 0x08,
+        .size_init   = pt_reg_grp_size_init,
+        .emu_reg_tbl = pt_emu_reg_vpd_tbl,
+    },
+    /* Slot Identification reg group */
+    {
+        .grp_id     = PCI_CAP_ID_SLOTID,
+        .grp_type   = GRP_TYPE_HARDWIRED,
+        .grp_size   = 0x04,
+        .size_init  = pt_reg_grp_size_init,
+    },
+    /* PCI-X Capabilities List Item reg group */
+    {
+        .grp_id     = PCI_CAP_ID_PCIX,
+        .grp_type   = GRP_TYPE_HARDWIRED,
+        .grp_size   = 0x18,
+        .size_init  = pt_reg_grp_size_init,
+    },
+    /* Vendor Specific Capability Structure reg group */
+    {
+        .grp_id      = PCI_CAP_ID_VNDR,
+        .grp_type    = GRP_TYPE_EMU,
+        .grp_size    = 0xFF,
+        .size_init   = pt_vendor_size_init,
+        .emu_reg_tbl = pt_emu_reg_vendor_tbl,
+    },
+    /* SHPC Capability List Item reg group */
+    {
+        .grp_id     = PCI_CAP_ID_SHPC,
+        .grp_type   = GRP_TYPE_HARDWIRED,
+        .grp_size   = 0x08,
+        .size_init  = pt_reg_grp_size_init,
+    },
+    /* Subsystem ID and Subsystem Vendor ID Capability List Item reg group */
+    {
+        .grp_id     = PCI_CAP_ID_SSVID,
+        .grp_type   = GRP_TYPE_HARDWIRED,
+        .grp_size   = 0x08,
+        .size_init  = pt_reg_grp_size_init,
+    },
+    /* AGP 8x Capability Structure reg group */
+    {
+        .grp_id     = PCI_CAP_ID_AGP3,
+        .grp_type   = GRP_TYPE_HARDWIRED,
+        .grp_size   = 0x30,
+        .size_init  = pt_reg_grp_size_init,
+    },
+    /* PCI Express Capability Structure reg group */
+    {
+        .grp_id      = PCI_CAP_ID_EXP,
+        .grp_type    = GRP_TYPE_EMU,
+        .grp_size    = 0xFF,
+        .size_init   = pt_pcie_size_init,
+        .emu_reg_tbl = pt_emu_reg_pcie_tbl,
+    },
+    {
+        .grp_size = 0,
+    },
+};
+
+/* initialize Capabilities Pointer or Next Pointer register */
+static int pt_ptr_reg_init(XenPCIPassthroughState *s,
+                           XenPTRegInfo *reg, uint32_t real_offset,
+                           uint32_t *data)
+{
+    int i;
+    uint8_t *config = s->dev.config;
+    uint32_t reg_field = pci_get_byte(config + real_offset);
+    uint8_t cap_id = 0;
+
+    /* find capability offset */
+    while (reg_field) {
+        for (i = 0; pt_emu_reg_grp_tbl[i].grp_size != 0; i++) {
+            if (pt_hide_dev_cap(s->real_device,
+                                pt_emu_reg_grp_tbl[i].grp_id)) {
+                continue;
+            }
+
+            cap_id = pci_get_byte(config + reg_field + PCI_CAP_LIST_ID);
+            if (pt_emu_reg_grp_tbl[i].grp_id == cap_id) {
+                if (pt_emu_reg_grp_tbl[i].grp_type == GRP_TYPE_EMU) {
+                    goto out;
+                }
+                /* ignore the 0 hardwired capability, find next one */
+                break;
+            }
+        }
+
+        /* next capability */
+        reg_field = pci_get_byte(config + reg_field + PCI_CAP_LIST_NEXT);
+    }
+
+out:
+    *data = reg_field;
+    return 0;
+}
+
+
+/*************
+ * Main
+ */
+
+static uint8_t find_cap_offset(XenPCIPassthroughState *s, uint8_t cap)
+{
+    uint8_t id;
+    unsigned max_cap = PCI_CAP_MAX;
+    uint8_t pos = PCI_CAPABILITY_LIST;
+    uint8_t status = 0;
+
+    if (host_pci_get_byte(s->real_device, PCI_STATUS, &status)) {
+        return 0;
+    }
+    if ((status & PCI_STATUS_CAP_LIST) == 0) {
+        return 0;
+    }
+
+    while (max_cap--) {
+        if (host_pci_get_byte(s->real_device, pos, &pos)) {
+            break;
+        }
+        if (pos < PCI_CONFIG_HEADER_SIZE) {
+            break;
+        }
+
+        pos &= ~3;
+        if (host_pci_get_byte(s->real_device, pos + PCI_CAP_LIST_ID, &id)) {
+            break;
+        }
+
+        if (id == 0xff) {
+            break;
+        }
+        if (id == cap) {
+            return pos;
+        }
+
+        pos += PCI_CAP_LIST_NEXT;
+    }
+    return 0;
+}
+
+static int pt_config_reg_init(XenPCIPassthroughState *s,
+                              XenPTRegGroup *reg_grp, XenPTRegInfo *reg)
+{
+    XenPTReg *reg_entry;
+    uint32_t data = 0;
+    int rc = 0;
+
+    reg_entry = g_new0(XenPTReg, 1);
+    reg_entry->reg = reg;
+
+    if (reg->init) {
+        /* initialize emulate register */
+        rc = reg->init(s, reg_entry->reg,
+                       reg_grp->base_offset + reg->offset, &data);
+        if (rc < 0) {
+            free(reg_entry);
+            return rc;
+        }
+        if (data == PT_INVALID_REG) {
+            /* free unused BAR register entry */
+            free(reg_entry);
+            return 0;
+        }
+        /* set register value */
+        reg_entry->data = data;
+    }
+    /* list add register entry */
+    QLIST_INSERT_HEAD(&reg_grp->reg_tbl_list, reg_entry, entries);
+
+    return 0;
+}
+
+int pt_config_init(XenPCIPassthroughState *s)
+{
+    int i, rc;
+
+    QLIST_INIT(&s->reg_grp_tbl);
+
+    for (i = 0; pt_emu_reg_grp_tbl[i].grp_size != 0; i++) {
+        uint32_t reg_grp_offset = 0;
+        XenPTRegGroup *reg_grp_entry = NULL;
+
+        if (pt_emu_reg_grp_tbl[i].grp_id != 0xFF) {
+            if (pt_hide_dev_cap(s->real_device,
+                                pt_emu_reg_grp_tbl[i].grp_id)) {
+                continue;
+            }
+
+            reg_grp_offset = find_cap_offset(s, pt_emu_reg_grp_tbl[i].grp_id);
+
+            if (!reg_grp_offset) {
+                continue;
+            }
+        }
+
+        reg_grp_entry = g_new0(XenPTRegGroup, 1);
+        QLIST_INIT(&reg_grp_entry->reg_tbl_list);
+        QLIST_INSERT_HEAD(&s->reg_grp_tbl, reg_grp_entry, entries);
+
+        reg_grp_entry->base_offset = reg_grp_offset;
+        reg_grp_entry->reg_grp = pt_emu_reg_grp_tbl + i;
+        if (pt_emu_reg_grp_tbl[i].size_init) {
+            /* get register group size */
+            rc = pt_emu_reg_grp_tbl[i].size_init(s, reg_grp_entry->reg_grp,
+                                                 reg_grp_offset,
+                                                 &reg_grp_entry->size);
+            if (rc < 0) {
+                pt_config_delete(s);
+                return rc;
+            }
+        }
+
+        if (pt_emu_reg_grp_tbl[i].grp_type == GRP_TYPE_EMU) {
+            if (pt_emu_reg_grp_tbl[i].emu_reg_tbl) {
+                int j = 0;
+                XenPTRegInfo *reg_tbl = pt_emu_reg_grp_tbl[i].emu_reg_tbl;
+                /* initialize capability register */
+                for (j = 0; reg_tbl->size != 0; j++, reg_tbl++) {
+                    /* initialize capability register */
+                    rc = pt_config_reg_init(s, reg_grp_entry, reg_tbl);
+                    if (rc < 0) {
+                        pt_config_delete(s);
+                        return rc;
+                    }
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+/* delete all emulate register */
+void pt_config_delete(XenPCIPassthroughState *s)
+{
+    struct XenPTRegGroup *reg_group, *next_grp;
+    struct XenPTReg *reg, *next_reg;
+
+    /* free all register group entry */
+    QLIST_FOREACH_SAFE(reg_group, &s->reg_grp_tbl, entries, next_grp) {
+        /* free all register entry */
+        QLIST_FOREACH_SAFE(reg, &reg_group->reg_tbl_list, entries, next_reg) {
+            QLIST_REMOVE(reg, entries);
+            g_free(reg);
+        }
+
+        QLIST_REMOVE(reg_group, entries);
+        g_free(reg_group);
+    }
+}
-- 
Anthony PERARD

^ permalink raw reply related

* [Qemu-devel] [PATCH V7 07/11] Introduce Xen PCI Passthrough, qdevice (1/3)
From: Anthony PERARD @ 2012-02-17 17:08 UTC (permalink / raw)
  To: QEMU-devel
  Cc: Anthony PERARD, Guy Zana, Xen Devel, Allen Kay,
	Stefano Stabellini
In-Reply-To: <1329498525-8454-1-git-send-email-anthony.perard@citrix.com>

From: Allen Kay <allen.m.kay@intel.com>

A more complete history can be found here:
git://xenbits.xensource.com/qemu-xen-unstable.git

Signed-off-by: Allen Kay <allen.m.kay@intel.com>
Signed-off-by: Guy Zana <guy@neocleus.com>
Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 Makefile.target                      |    2 +
 hw/xen_common.h                      |    3 +
 hw/xen_pci_passthrough.c             |  814 ++++++++++++++++++++++++++++++++++
 hw/xen_pci_passthrough.h             |  251 +++++++++++
 hw/xen_pci_passthrough_config_init.c |   11 +
 xen-all.c                            |   12 +
 6 files changed, 1093 insertions(+), 0 deletions(-)
 create mode 100644 hw/xen_pci_passthrough.c
 create mode 100644 hw/xen_pci_passthrough.h
 create mode 100644 hw/xen_pci_passthrough_config_init.c

diff --git a/Makefile.target b/Makefile.target
index 5098299..f7c5ce7 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -224,6 +224,8 @@ obj-i386-$(CONFIG_XEN) += xen_platform.o
 
 # Xen PCI Passthrough
 obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += host-pci-device.o
+obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough.o
+obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_config_init.o
 
 # Inter-VM PCI shared memory
 CONFIG_IVSHMEM =
diff --git a/hw/xen_common.h b/hw/xen_common.h
index 0409ac7..48916fd 100644
--- a/hw/xen_common.h
+++ b/hw/xen_common.h
@@ -135,4 +135,7 @@ static inline int xc_fd(xc_interface *xen_xc)
 
 void destroy_hvm_domain(void);
 
+/* shutdown/destroy current domain because of an error */
+void xen_shutdown_fatal_error(const char *fmt, ...) GCC_FMT_ATTR(1, 2);
+
 #endif /* QEMU_HW_XEN_COMMON_H */
diff --git a/hw/xen_pci_passthrough.c b/hw/xen_pci_passthrough.c
new file mode 100644
index 0000000..3f305dd
--- /dev/null
+++ b/hw/xen_pci_passthrough.c
@@ -0,0 +1,814 @@
+/*
+ * Copyright (c) 2007, Neocleus Corporation.
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Alex Novik <alex@neocleus.com>
+ * Allen Kay <allen.m.kay@intel.com>
+ * Guy Zana <guy@neocleus.com>
+ *
+ * This file implements direct PCI assignment to a HVM guest
+ */
+
+/*
+ * Interrupt Disable policy:
+ *
+ * INTx interrupt:
+ *   Initialize(register_real_device)
+ *     Map INTx(xc_physdev_map_pirq):
+ *       <fail>
+ *         - Set real Interrupt Disable bit to '1'.
+ *         - Set machine_irq and assigned_device->machine_irq to '0'.
+ *         * Don't bind INTx.
+ *
+ *     Bind INTx(xc_domain_bind_pt_pci_irq):
+ *       <fail>
+ *         - Set real Interrupt Disable bit to '1'.
+ *         - Unmap INTx.
+ *         - Decrement mapped_machine_irq[machine_irq]
+ *         - Set assigned_device->machine_irq to '0'.
+ *
+ *   Write to Interrupt Disable bit by guest software(pt_cmd_reg_write)
+ *     Write '0'
+ *       - Set real bit to '0' if assigned_device->machine_irq isn't '0'.
+ *
+ *     Write '1'
+ *       - Set real bit to '1'.
+ */
+
+#include <sys/ioctl.h>
+
+#include "pci.h"
+#include "xen.h"
+#include "xen_backend.h"
+#include "xen_pci_passthrough.h"
+
+#define PCI_BAR_ENTRIES (6)
+
+#define PT_NR_IRQS          (256)
+uint8_t mapped_machine_irq[PT_NR_IRQS] = {0};
+
+void pt_log(const PCIDevice *d, const char *f, ...)
+{
+    va_list ap;
+
+    va_start(ap, f);
+    if (d) {
+        fprintf(stderr, "[%02x:%02x.%x] ", pci_bus_num(d->bus),
+                PCI_SLOT(d->devfn), PCI_FUNC(d->devfn));
+    }
+    vfprintf(stderr, f, ap);
+    va_end(ap);
+}
+
+
+/* Config Space */
+static int pt_pci_config_access_check(PCIDevice *d, uint32_t address, int len)
+{
+    /* check offset range */
+    if (address >= 0xFF) {
+        PT_ERR(d, "Failed to access register with offset exceeding 0xFF. "
+               "(addr: 0x%02x, len: %d)\n", address, len);
+        return -1;
+    }
+
+    /* check read size */
+    if ((len != 1) && (len != 2) && (len != 4)) {
+        PT_ERR(d, "Failed to access register with invalid access length. "
+               "(addr: 0x%02x, len: %d)\n", address, len);
+        return -1;
+    }
+
+    /* check offset alignment */
+    if (address & (len - 1)) {
+        PT_ERR(d, "Failed to access register with invalid access size "
+               "alignment. (addr: 0x%02x, len: %d)\n", address, len);
+        return -1;
+    }
+
+    return 0;
+}
+
+int pt_bar_offset_to_index(uint32_t offset)
+{
+    int index = 0;
+
+    /* check Exp ROM BAR */
+    if (offset == PCI_ROM_ADDRESS) {
+        return PCI_ROM_SLOT;
+    }
+
+    /* calculate BAR index */
+    index = (offset - PCI_BASE_ADDRESS_0) >> 2;
+    if (index >= PCI_NUM_REGIONS) {
+        return -1;
+    }
+
+    return index;
+}
+
+static uint32_t pt_pci_read_config(PCIDevice *d, uint32_t addr, int len)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
+    uint32_t val = 0;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    int rc = 0;
+    int emul_len = 0;
+    uint32_t find_addr = addr;
+
+    if (pt_pci_config_access_check(d, addr, len)) {
+        goto exit;
+    }
+
+    /* find register group entry */
+    reg_grp_entry = pt_find_reg_grp(s, addr);
+    if (reg_grp_entry) {
+        /* check 0-Hardwired register group */
+        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
+            /* no need to emulate, just return 0 */
+            val = 0;
+            goto exit;
+        }
+    }
+
+    /* read I/O device register value */
+    rc = host_pci_get_block(s->real_device, addr, (uint8_t *)&val, len);
+    if (rc < 0) {
+        PT_ERR(d, "pci_read_block failed. return value: %d.\n", rc);
+        memset(&val, 0xff, len);
+    }
+
+    /* just return the I/O device register value for
+     * passthrough type register group */
+    if (reg_grp_entry == NULL) {
+        goto exit;
+    }
+
+    /* adjust the read value to appropriate CFC-CFF window */
+    val <<= (addr & 3) << 3;
+    emul_len = len;
+
+    /* loop around the guest requested size */
+    while (emul_len > 0) {
+        /* find register entry to be emulated */
+        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
+        if (reg_entry) {
+            XenPTRegInfo *reg = reg_entry->reg;
+            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
+            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
+            uint8_t *ptr_val = NULL;
+
+            valid_mask <<= (find_addr - real_offset) << 3;
+            ptr_val = (uint8_t *)&val + (real_offset & 3);
+
+            /* do emulation based on register size */
+            switch (reg->size) {
+            case 1:
+                if (reg->u.b.read) {
+                    rc = reg->u.b.read(s, reg_entry, ptr_val, valid_mask);
+                }
+                break;
+            case 2:
+                if (reg->u.w.read) {
+                    rc = reg->u.w.read(s, reg_entry,
+                                       (uint16_t *)ptr_val, valid_mask);
+                }
+                break;
+            case 4:
+                if (reg->u.dw.read) {
+                    rc = reg->u.dw.read(s, reg_entry,
+                                        (uint32_t *)ptr_val, valid_mask);
+                }
+                break;
+            }
+
+            if (rc < 0) {
+                xen_shutdown_fatal_error("Internal error: Invalid read "
+                                         "emulation. (%s, rc: %d)\n",
+                                         __func__, rc);
+                return 0;
+            }
+
+            /* calculate next address to find */
+            emul_len -= reg->size;
+            if (emul_len > 0) {
+                find_addr = real_offset + reg->size;
+            }
+        } else {
+            /* nothing to do with passthrough type register,
+             * continue to find next byte */
+            emul_len--;
+            find_addr++;
+        }
+    }
+
+    /* need to shift back before returning them to pci bus emulator */
+    val >>= ((addr & 3) << 3);
+
+exit:
+    PT_LOG_CONFIG(d, addr, val, len);
+    return val;
+}
+
+static void pt_pci_write_config(PCIDevice *d, uint32_t addr,
+                                uint32_t val, int len)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
+    int index = 0;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    int rc = 0;
+    uint32_t read_val = 0;
+    int emul_len = 0;
+    XenPTReg *reg_entry = NULL;
+    uint32_t find_addr = addr;
+    XenPTRegInfo *reg = NULL;
+
+    if (pt_pci_config_access_check(d, addr, len)) {
+        return;
+    }
+
+    PT_LOG_CONFIG(d, addr, val, len);
+
+    /* check unused BAR register */
+    index = pt_bar_offset_to_index(addr);
+    if ((index >= 0) && (val > 0 && val < PT_BAR_ALLF) &&
+        (s->bases[index].bar_flag == PT_BAR_FLAG_UNUSED)) {
+        PT_WARN(d, "Guest attempt to set address to unused Base Address "
+                "Register. (addr: 0x%02x, len: %d)\n", addr, len);
+    }
+
+    /* find register group entry */
+    reg_grp_entry = pt_find_reg_grp(s, addr);
+    if (reg_grp_entry) {
+        /* check 0-Hardwired register group */
+        if (reg_grp_entry->reg_grp->grp_type == GRP_TYPE_HARDWIRED) {
+            /* ignore silently */
+            PT_WARN(d, "Access to 0-Hardwired register. "
+                    "(addr: 0x%02x, len: %d)\n", addr, len);
+            return;
+        }
+    }
+
+    rc = host_pci_get_block(s->real_device, addr, (uint8_t *)&read_val, len);
+    if (rc < 0) {
+        PT_ERR(d, "pci_read_block failed. return value: %d.\n", rc);
+        memset(&read_val, 0xff, len);
+    }
+
+    /* pass directly to the real device for passthrough type register group */
+    if (reg_grp_entry == NULL) {
+        goto out;
+    }
+
+    pci_default_write_config(d, addr, val, len);
+
+    /* adjust the read and write value to appropriate CFC-CFF window */
+    read_val <<= (addr & 3) << 3;
+    val <<= (addr & 3) << 3;
+    emul_len = len;
+
+    /* loop around the guest requested size */
+    while (emul_len > 0) {
+        /* find register entry to be emulated */
+        reg_entry = pt_find_reg(reg_grp_entry, find_addr);
+        if (reg_entry) {
+            reg = reg_entry->reg;
+            uint32_t real_offset = reg_grp_entry->base_offset + reg->offset;
+            uint32_t valid_mask = 0xFFFFFFFF >> ((4 - emul_len) << 3);
+            uint8_t *ptr_val = NULL;
+
+            valid_mask <<= (find_addr - real_offset) << 3;
+            ptr_val = (uint8_t *)&val + (real_offset & 3);
+
+            /* do emulation based on register size */
+            switch (reg->size) {
+            case 1:
+                if (reg->u.b.write) {
+                    rc = reg->u.b.write(s, reg_entry, ptr_val,
+                                        read_val >> ((real_offset & 3) << 3),
+                                        valid_mask);
+                }
+                break;
+            case 2:
+                if (reg->u.w.write) {
+                    rc = reg->u.w.write(s, reg_entry, (uint16_t *)ptr_val,
+                                        (read_val >> ((real_offset & 3) << 3)),
+                                        valid_mask);
+                }
+                break;
+            case 4:
+                if (reg->u.dw.write) {
+                    rc = reg->u.dw.write(s, reg_entry, (uint32_t *)ptr_val,
+                                         (read_val >> ((real_offset & 3) << 3)),
+                                         valid_mask);
+                }
+                break;
+            }
+
+            if (rc < 0) {
+                xen_shutdown_fatal_error("Internal error: Invalid write"
+                                         " emulation. (%s, rc: %d)\n",
+                                         __func__, rc);
+                return;
+            }
+
+            /* calculate next address to find */
+            emul_len -= reg->size;
+            if (emul_len > 0) {
+                find_addr = real_offset + reg->size;
+            }
+        } else {
+            /* nothing to do with passthrough type register,
+             * continue to find next byte */
+            emul_len--;
+            find_addr++;
+        }
+    }
+
+    /* need to shift back before passing them to host_pci_device */
+    val >>= (addr & 3) << 3;
+
+out:
+    if (!(reg && reg->no_wb)) {
+        /* unknown regs are passed through */
+        rc = host_pci_set_block(s->real_device, addr, (uint8_t *)&val, len);
+
+        if (rc < 0) {
+            PT_ERR(d, "pci_write_block failed. return value: %d.\n", rc);
+        }
+    }
+}
+
+/* ioport/iomem space*/
+static void pt_iomem_map(XenPCIPassthroughState *s, int i,
+                         pcibus_t e_phys, pcibus_t e_size)
+{
+    uint32_t old_ebase = s->bases[i].e_physbase;
+    bool first_map = s->bases[i].e_size == 0;
+    int rc = 0;
+
+    s->bases[i].e_physbase = e_phys;
+    s->bases[i].e_size = e_size;
+
+    PT_LOG(&s->dev, "BAR %i, e_phys=%#"PRIx64" maddr=%#"PRIx64
+           " len=%#"PRIx64" first_map=%d\n",
+           i, e_phys, s->bases[i].access.maddr, e_size, first_map);
+
+    if (e_size == 0) {
+        return;
+    }
+
+    if (!first_map && old_ebase != PT_PCI_BAR_UNMAPPED) {
+        /* Remove old mapping */
+        rc = xc_domain_memory_mapping(xen_xc, xen_domid,
+                               old_ebase >> XC_PAGE_SHIFT,
+                               s->bases[i].access.maddr >> XC_PAGE_SHIFT,
+                               (e_size + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
+                               DPCI_REMOVE_MAPPING);
+        if (rc) {
+            PT_ERR(&s->dev, "remove old mapping failed! (rc: %i)\n", rc);
+            return;
+        }
+    }
+
+    /* map only valid guest address */
+    if (e_phys != PCI_BAR_UNMAPPED) {
+        /* Create new mapping */
+        rc = xc_domain_memory_mapping(xen_xc, xen_domid,
+                                   s->bases[i].e_physbase >> XC_PAGE_SHIFT,
+                                   s->bases[i].access.maddr >> XC_PAGE_SHIFT,
+                                   (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
+                                   DPCI_ADD_MAPPING);
+
+        if (rc) {
+            PT_ERR(&s->dev, "create new mapping failed! (rc: %i)\n", rc);
+        }
+    }
+}
+
+static void pt_ioport_map(XenPCIPassthroughState *s, int i,
+                          pcibus_t e_phys, pcibus_t e_size)
+{
+    uint32_t old_ebase = s->bases[i].e_physbase;
+    bool first_map = s->bases[i].e_size == 0;
+    int rc = 0;
+
+    s->bases[i].e_physbase = e_phys;
+    s->bases[i].e_size = e_size;
+
+    PT_LOG(&s->dev, "BAR %i, e_phys=%#04"PRIx64" pio_base=%#04"PRIx64
+           " len=%"PRId64" first_map=%d\n",
+           i, e_phys, s->bases[i].access.pio_base, e_size, first_map);
+
+    if (e_size == 0) {
+        return;
+    }
+
+    if (!first_map && old_ebase != PT_PCI_BAR_UNMAPPED) {
+        /* Remove old mapping */
+        rc = xc_domain_ioport_mapping(xen_xc, xen_domid, old_ebase,
+                                      s->bases[i].access.pio_base, e_size,
+                                      DPCI_REMOVE_MAPPING);
+        if (rc) {
+            PT_ERR(&s->dev, "remove old mapping failed! (rc: %i)\n", rc);
+            return;
+        }
+    }
+
+    /* map only valid guest address (include 0) */
+    if (e_phys != PCI_BAR_UNMAPPED) {
+        /* Create new mapping */
+        rc = xc_domain_ioport_mapping(xen_xc, xen_domid, e_phys,
+                                      s->bases[i].access.pio_base, e_size,
+                                      DPCI_ADD_MAPPING);
+        if (rc) {
+            PT_ERR(&s->dev, "create new mapping failed! (rc: %i)\n", rc);
+        }
+    }
+
+}
+
+
+/* mapping BAR */
+
+void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
+                        int io_enable, int mem_enable)
+{
+    PCIDevice *d = &s->dev;
+    const PCIIORegion *r;
+    XenPTRegGroup *reg_grp_entry = NULL;
+    XenPTReg *reg_entry = NULL;
+    XenPTRegion *base = NULL;
+    pcibus_t r_size = 0, r_addr = PCI_BAR_UNMAPPED;
+    int rc = 0;
+
+    r = &d->io_regions[bar];
+
+    /* check valid region */
+    if (!r->size) {
+        return;
+    }
+
+    base = &s->bases[bar];
+    /* skip unused BAR or upper 64bit BAR */
+    if ((base->bar_flag == PT_BAR_FLAG_UNUSED)
+        || (base->bar_flag == PT_BAR_FLAG_UPPER)) {
+        return;
+    }
+
+    /* copy region address to temporary */
+    r_addr = pci_get_bar_addr(d, bar);
+
+    /* need unmapping in case I/O Space or Memory Space is disabled */
+    if (((base->bar_flag == PT_BAR_FLAG_IO) && !io_enable) ||
+        ((base->bar_flag == PT_BAR_FLAG_MEM) && !mem_enable)) {
+        r_addr = PCI_BAR_UNMAPPED;
+    }
+    /* or ROM address is disabled. */
+    if ((bar == PCI_ROM_SLOT) && (r_addr != PCI_BAR_UNMAPPED)) {
+        reg_grp_entry = pt_find_reg_grp(s, PCI_ROM_ADDRESS);
+        if (reg_grp_entry) {
+            reg_entry = pt_find_reg(reg_grp_entry, PCI_ROM_ADDRESS);
+            if (reg_entry && !(reg_entry->data & PCI_ROM_ADDRESS_ENABLE)) {
+                r_addr = PCI_BAR_UNMAPPED;
+            }
+        }
+    }
+
+    /* prevent guest software mapping memory resource to 00000000h */
+    if ((base->bar_flag == PT_BAR_FLAG_MEM) && (r_addr == 0)) {
+        r_addr = PCI_BAR_UNMAPPED;
+    }
+
+    r_size = pt_get_emul_size(base->bar_flag, r->size);
+
+    rc = pci_check_bar_overlap(d, r_addr, r_size, r->type);
+    if (rc) {
+        PT_WARN(d, "Region: %d (addr: %#"FMT_PCIBUS
+                ", len: %#"FMT_PCIBUS") is overlapped.\n",
+                bar, r_addr, r_size);
+    }
+
+    /* check whether we need to update the mapping or not */
+    if (r_addr != s->bases[bar].e_physbase) {
+        /* mapping BAR */
+        if (base->bar_flag == PT_BAR_FLAG_IO) {
+            pt_ioport_map(s, bar, r_addr, r_size);
+        } else {
+            pt_iomem_map(s, bar, r_addr, r_size);
+        }
+    }
+}
+
+void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable)
+{
+    int i;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        pt_bar_mapping_one(s, i, io_enable, mem_enable);
+    }
+}
+
+static uint64_t bar_read(void *o, target_phys_addr_t addr, unsigned size)
+{
+    PCIDevice *d = o;
+    /* if this function is called, that probably means that there is a
+     * misconfiguration of the IOMMU. */
+    PT_ERR(d, "Should not read BAR through QEMU. @0x"TARGET_FMT_plx"\n", addr);
+    return 0;
+}
+static void bar_write(void *o, target_phys_addr_t addr,
+                      uint64_t data, unsigned size)
+{
+    PCIDevice *d = o;
+    /* Same comment as bar_read function */
+    PT_ERR(d, "Should not write BAR through QEMU. @0x"TARGET_FMT_plx"\n", addr);
+}
+
+static const MemoryRegionOps ops = {
+    .endianness = DEVICE_NATIVE_ENDIAN,
+    .read = bar_read,
+    .write = bar_write,
+};
+
+/* register regions */
+static int pt_register_regions(XenPCIPassthroughState *s)
+{
+    int i = 0;
+    HostPCIDevice *d = s->real_device;
+
+    /* Register PIO/MMIO BARs */
+    for (i = 0; i < PCI_BAR_ENTRIES; i++) {
+        HostPCIIORegion *r = &d->io_regions[i];
+        uint8_t type;
+
+        if (r->base_addr == 0 || r->size == 0) {
+            continue;
+        }
+
+        s->bases[i].e_physbase = r->base_addr;
+        s->bases[i].access.u = r->base_addr;
+
+        if (r->flags & IORESOURCE_IO) {
+            type = PCI_BASE_ADDRESS_SPACE_IO;
+        } else {
+            type = PCI_BASE_ADDRESS_SPACE_MEMORY;
+            if (r->flags & IORESOURCE_PREFETCH) {
+                type |= PCI_BASE_ADDRESS_MEM_PREFETCH;
+            }
+        }
+
+        memory_region_init_io(&s->bar[i], &ops, &s->dev,
+                              "xen-pci-pt-bar", r->size);
+        pci_register_bar(&s->dev, i, type, &s->bar[i]);
+
+        PT_LOG(&s->dev, "IO region %i registered (size=0x%08"PRIx64
+               " base_addr=0x%08"PRIx64" type: %#x)\n",
+               i, r->size, r->base_addr, type);
+    }
+
+    /* Register expansion ROM address */
+    if (d->rom.base_addr && d->rom.size) {
+        uint32_t bar_data = 0;
+
+        /* Re-set BAR reported by OS, otherwise ROM can't be read. */
+        if (host_pci_get_long(d, PCI_ROM_ADDRESS, &bar_data)) {
+            return 0;
+        }
+        if ((bar_data & PCI_ROM_ADDRESS_MASK) == 0) {
+            bar_data |= d->rom.base_addr & PCI_ROM_ADDRESS_MASK;
+            host_pci_set_long(d, PCI_ROM_ADDRESS, bar_data);
+        }
+
+        s->bases[PCI_ROM_SLOT].e_physbase = d->rom.base_addr;
+        s->bases[PCI_ROM_SLOT].access.maddr = d->rom.base_addr;
+
+        memory_region_init_rom_device(&s->rom, NULL, NULL,
+                                      "xen-pci-pt-rom", d->rom.size);
+        pci_register_bar(&s->dev, PCI_ROM_SLOT, PCI_BASE_ADDRESS_MEM_PREFETCH,
+                         &s->rom);
+
+        PT_LOG(&s->dev, "Expansion ROM registered (size=0x%08"PRIx64
+               " base_addr=0x%08"PRIx64")\n",
+               d->rom.size, d->rom.base_addr);
+    }
+
+    return 0;
+}
+
+static void pt_unregister_regions(XenPCIPassthroughState *s)
+{
+    int i, type, rc;
+    uint32_t e_size;
+    PCIDevice *d = &s->dev;
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        e_size = s->bases[i].e_size;
+        if ((e_size == 0) || (s->bases[i].e_physbase == PT_PCI_BAR_UNMAPPED)) {
+            continue;
+        }
+
+        type = d->io_regions[i].type;
+
+        if (type & PCI_BASE_ADDRESS_SPACE_IO) {
+            rc = xc_domain_ioport_mapping(xen_xc, xen_domid,
+                                          s->bases[i].e_physbase,
+                                          s->bases[i].access.pio_base,
+                                          e_size,
+                                          DPCI_REMOVE_MAPPING);
+            if (rc != 0) {
+                PT_ERR(d, "remove old io mapping failed!\n");
+                continue;
+            }
+        } else {
+            rc = xc_domain_memory_mapping(xen_xc, xen_domid,
+                    s->bases[i].e_physbase >> XC_PAGE_SHIFT,
+                    s->bases[i].access.maddr >> XC_PAGE_SHIFT,
+                    (e_size + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
+                    DPCI_REMOVE_MAPPING);
+            if (rc != 0) {
+                PT_ERR(d, "remove old mem mapping failed!\n");
+                continue;
+            }
+        }
+    }
+}
+
+static int pt_initfn(PCIDevice *d)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
+    int dom, bus;
+    unsigned slot, func;
+    int rc = 0;
+    uint8_t machine_irq = 0;
+    int pirq = PT_UNASSIGNED_PIRQ;
+
+    if (pci_parse_devaddr(s->hostaddr, &dom, &bus, &slot, &func) < 0) {
+        PT_ERR(d, "Failed to parse BDF: %s\n", s->hostaddr);
+        return -1;
+    }
+
+    /* register real device */
+    PT_LOG(d, "Assigning real physical device %02x:%02x.%x"
+           " to devfn %#x\n", bus, slot, func, s->dev.devfn);
+
+    s->real_device = host_pci_device_get(bus, slot, func);
+    if (!s->real_device) {
+        return -1;
+    }
+
+    s->is_virtfn = s->real_device->is_virtfn;
+    if (s->is_virtfn) {
+        PT_LOG(d, "%04x:%02x:%02x.%x is a SR-IOV Virtual Function\n",
+               s->real_device->domain, bus, slot, func);
+    }
+
+    /* Initialize virtualized PCI configuration (Extended 256 Bytes) */
+    if (host_pci_get_block(s->real_device, 0, d->config,
+                           PCI_CONFIG_SPACE_SIZE) == -1) {
+        host_pci_device_put(s->real_device);
+        return -1;
+    }
+
+    /* Handle real device's MMIO/PIO BARs */
+    pt_register_regions(s);
+
+    /* Bind interrupt */
+    if (!s->dev.config[PCI_INTERRUPT_PIN]) {
+        PT_LOG(d, "no pin interrupt\n");
+        goto out;
+    }
+
+    host_pci_get_byte(s->real_device, PCI_INTERRUPT_LINE, &machine_irq);
+    rc = xc_physdev_map_pirq(xen_xc, xen_domid, machine_irq, &pirq);
+
+    if (rc < 0) {
+        PT_ERR(d, "Mapping machine irq %u to pirq %i failed, (rc: %d)\n",
+               machine_irq, pirq, rc);
+
+        /* Disable PCI intx assertion (turn on bit10 of devctl) */
+        host_pci_set_word(s->real_device,
+                          PCI_COMMAND,
+                          pci_get_word(s->dev.config + PCI_COMMAND)
+                          | PCI_COMMAND_INTX_DISABLE);
+        machine_irq = 0;
+        s->machine_irq = 0;
+    } else {
+        machine_irq = pirq;
+        s->machine_irq = pirq;
+        mapped_machine_irq[machine_irq]++;
+    }
+
+    /* bind machine_irq to device */
+    if (machine_irq != 0) {
+        uint8_t e_intx = pci_intx(s);
+
+        rc = xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, machine_irq,
+                                       pci_bus_num(d->bus),
+                                       PCI_SLOT(d->devfn),
+                                       e_intx);
+        if (rc < 0) {
+            PT_ERR(d, "Binding of interrupt %i failed! (rc: %d)\n",
+                   e_intx, rc);
+
+            /* Disable PCI intx assertion (turn on bit10 of devctl) */
+            host_pci_set_word(s->real_device, PCI_COMMAND,
+                              *(uint16_t *)(&s->dev.config[PCI_COMMAND])
+                              | PCI_COMMAND_INTX_DISABLE);
+            mapped_machine_irq[machine_irq]--;
+
+            if (mapped_machine_irq[machine_irq] == 0) {
+                if (xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq)) {
+                    PT_ERR(d, "Unmapping of machine interrupt %i failed!"
+                           " (rc: %d)\n", machine_irq, rc);
+                }
+            }
+            s->machine_irq = 0;
+        }
+    }
+
+out:
+    PT_LOG(d, "Real physical device %02x:%02x.%x registered successfuly!\n",
+           bus, slot, func);
+
+    return 0;
+}
+
+static int pt_unregister_device(PCIDevice *d)
+{
+    XenPCIPassthroughState *s = DO_UPCAST(XenPCIPassthroughState, dev, d);
+    uint8_t machine_irq = s->machine_irq;
+    uint8_t intx = pci_intx(s);
+    int rc;
+
+    if (machine_irq) {
+        rc = xc_domain_unbind_pt_irq(xen_xc, xen_domid, machine_irq,
+                                     PT_IRQ_TYPE_PCI,
+                                     pci_bus_num(d->bus),
+                                     PCI_SLOT(s->dev.devfn),
+                                     intx,
+                                     0 /* isa_irq */);
+        if (rc < 0) {
+            PT_ERR(d, "unbinding of interrupt INT%c failed."
+                   " (machine irq: %i, rc: %d)"
+                   " But bravely continuing on..\n",
+                   'a' + intx, machine_irq, rc);
+        }
+    }
+
+    if (machine_irq) {
+        mapped_machine_irq[machine_irq]--;
+
+        if (mapped_machine_irq[machine_irq] == 0) {
+            rc = xc_physdev_unmap_pirq(xen_xc, xen_domid, machine_irq);
+
+            if (rc < 0) {
+                PT_ERR(d, "unmapping of interrupt %i failed. (rc: %d)"
+                       " But bravely continuing on..\n",
+                       machine_irq, rc);
+            }
+        }
+    }
+
+    /* unregister real device's MMIO/PIO BARs */
+    pt_unregister_regions(s);
+
+    host_pci_device_put(s->real_device);
+
+    return 0;
+}
+
+static Property xen_pci_passthrough_properties[] = {
+    DEFINE_PROP_STRING("hostaddr", XenPCIPassthroughState, hostaddr),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void xen_pci_passthrough_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+    k->init = pt_initfn;
+    k->exit = pt_unregister_device;
+    k->config_read = pt_pci_read_config;
+    k->config_write = pt_pci_write_config;
+    dc->desc = "Assign an host PCI device with Xen";
+    dc->props = xen_pci_passthrough_properties;
+};
+
+static TypeInfo xen_pci_passthrough_info = {
+    .name = "xen-pci-passthrough",
+    .parent = TYPE_PCI_DEVICE,
+    .instance_size = sizeof(XenPCIPassthroughState),
+    .class_init = xen_pci_passthrough_class_init,
+};
+
+static void xen_pci_passthrough_register_types(void)
+{
+    type_register_static(&xen_pci_passthrough_info);
+}
+
+type_init(xen_pci_passthrough_register_types)
diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
new file mode 100644
index 0000000..ea6719c
--- /dev/null
+++ b/hw/xen_pci_passthrough.h
@@ -0,0 +1,251 @@
+#ifndef QEMU_HW_XEN_PCI_PASSTHROUGH_H
+#  define QEMU_HW_XEN_PCI_PASSTHROUGH_H
+
+#include "qemu-common.h"
+#include "xen_common.h"
+#include "pci.h"
+#include "host-pci-device.h"
+
+/* #define PT_LOGGING_ENABLED */
+/* #define PT_DEBUG_PCI_CONFIG_ACCESS */
+
+void pt_log(const PCIDevice *d, const char *f, ...) GCC_FMT_ATTR(2, 3);
+
+#define PT_ERR(d, _f, _a...)  pt_log(d, "%s: Error: " _f, __func__, ##_a)
+
+#ifdef PT_LOGGING_ENABLED
+#  define PT_LOG(d, _f, _a...)  pt_log(d, "%s: " _f, __func__, ##_a)
+#  define PT_WARN(d, _f, _a...) pt_log(d, "%s: Warning: " _f, __func__, ##_a)
+#else
+#  define PT_LOG(d, _f, _a...)
+#  define PT_WARN(d, _f, _a...)
+#endif
+
+#ifdef PT_DEBUG_PCI_CONFIG_ACCESS
+#  define PT_LOG_CONFIG(d, addr, val, len) \
+    pt_log(d, "%s: address=0x%04x val=0x%08x len=%d\n", \
+           __func__, addr, val, len)
+#else
+#  define PT_LOG_CONFIG(d, addr, val, len)
+#endif
+
+
+typedef struct XenPTRegInfo XenPTRegInfo;
+typedef struct XenPTReg XenPTReg;
+
+typedef struct XenPCIPassthroughState XenPCIPassthroughState;
+
+/* function type for config reg */
+typedef int (*conf_reg_init)
+    (XenPCIPassthroughState *, XenPTRegInfo *, uint32_t real_offset,
+     uint32_t *data);
+typedef int (*conf_dword_write)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint32_t *val, uint32_t dev_value, uint32_t valid_mask);
+typedef int (*conf_word_write)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint16_t *val, uint16_t dev_value, uint16_t valid_mask);
+typedef int (*conf_byte_write)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint8_t *val, uint8_t dev_value, uint8_t valid_mask);
+typedef int (*conf_dword_read)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint32_t *val, uint32_t valid_mask);
+typedef int (*conf_word_read)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint16_t *val, uint16_t valid_mask);
+typedef int (*conf_byte_read)
+    (XenPCIPassthroughState *, XenPTReg *cfg_entry,
+     uint8_t *val, uint8_t valid_mask);
+
+#define PT_BAR_ALLF         0xFFFFFFFF  /* BAR ALLF value */
+#define PT_PCI_BAR_UNMAPPED (-1)
+
+
+typedef enum {
+    GRP_TYPE_HARDWIRED = 0,                     /* 0 Hardwired reg group */
+    GRP_TYPE_EMU,                               /* emul reg group */
+} RegisterGroupType;
+
+typedef enum {
+    PT_BAR_FLAG_MEM = 0,                        /* Memory type BAR */
+    PT_BAR_FLAG_IO,                             /* I/O type BAR */
+    PT_BAR_FLAG_UPPER,                          /* upper 64bit BAR */
+    PT_BAR_FLAG_UNUSED,                         /* unused BAR */
+} PTBarFlag;
+
+
+typedef struct XenPTRegion {
+    /* Virtual phys base & size */
+    uint32_t e_physbase;
+    uint32_t e_size;
+    /* BAR flag */
+    PTBarFlag bar_flag;
+    /* Translation of the emulated address */
+    union {
+        uint64_t maddr;
+        uint64_t pio_base;
+        uint64_t u;
+    } access;
+} XenPTRegion;
+
+/* XenPTRegInfo declaration
+ * - only for emulated register (either a part or whole bit).
+ * - for passthrough register that need special behavior (like interacting with
+ *   other component), set emu_mask to all 0 and specify r/w func properly.
+ * - do NOT use ALL F for init_val, otherwise the tbl will not be registered.
+ */
+
+/* emulated register infomation */
+struct XenPTRegInfo {
+    uint32_t offset;
+    uint32_t size;
+    uint32_t init_val;
+    /* reg read only field mask (ON:RO/ROS, OFF:other) */
+    uint32_t ro_mask;
+    /* reg emulate field mask (ON:emu, OFF:passthrough) */
+    uint32_t emu_mask;
+    /* no write back allowed */
+    uint32_t no_wb;
+    conf_reg_init init;
+    /* read/write function pointer
+     * for double_word/word/byte size */
+    union {
+        struct {
+            conf_dword_write write;
+            conf_dword_read read;
+        } dw;
+        struct {
+            conf_word_write write;
+            conf_word_read read;
+        } w;
+        struct {
+            conf_byte_write write;
+            conf_byte_read read;
+        } b;
+    } u;
+};
+
+/* emulated register management */
+struct XenPTReg {
+    QLIST_ENTRY(XenPTReg) entries;
+    XenPTRegInfo *reg;
+    uint32_t data; /* emulated value */
+};
+
+typedef struct XenPTRegGroupInfo XenPTRegGroupInfo;
+
+/* emul reg group size initialize method */
+typedef int (*pt_reg_size_init_fn)
+    (XenPCIPassthroughState *, const XenPTRegGroupInfo *,
+     uint32_t base_offset, uint8_t *size);
+
+/* emulated register group infomation */
+struct XenPTRegGroupInfo {
+    uint8_t grp_id;
+    RegisterGroupType grp_type;
+    uint8_t grp_size;
+    pt_reg_size_init_fn size_init;
+    XenPTRegInfo *emu_reg_tbl;
+};
+
+/* emul register group management table */
+typedef struct XenPTRegGroup {
+    QLIST_ENTRY(XenPTRegGroup) entries;
+    const XenPTRegGroupInfo *reg_grp;
+    uint32_t base_offset;
+    uint8_t size;
+    QLIST_HEAD(, XenPTReg) reg_tbl_list;
+} XenPTRegGroup;
+
+
+#define PT_UNASSIGNED_PIRQ (-1)
+
+struct XenPCIPassthroughState {
+    PCIDevice dev;
+
+    char *hostaddr;
+    bool is_virtfn;
+    HostPCIDevice *real_device;
+    XenPTRegion bases[PCI_NUM_REGIONS]; /* Access regions */
+    QLIST_HEAD(, XenPTRegGroup) reg_grp_tbl;
+
+    uint32_t machine_irq;
+
+    MemoryRegion bar[PCI_NUM_REGIONS - 1];
+    MemoryRegion rom;
+};
+
+int pt_config_init(XenPCIPassthroughState *s);
+void pt_config_delete(XenPCIPassthroughState *s);
+void pt_bar_mapping(XenPCIPassthroughState *s, int io_enable, int mem_enable);
+void pt_bar_mapping_one(XenPCIPassthroughState *s, int bar,
+                        int io_enable, int mem_enable);
+XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address);
+XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address);
+int pt_bar_offset_to_index(uint32_t offset);
+
+static inline pcibus_t pt_get_emul_size(PTBarFlag flag, pcibus_t r_size)
+{
+    /* align resource size (memory type only) */
+    if (flag == PT_BAR_FLAG_MEM) {
+        return (r_size + XC_PAGE_SIZE - 1) & XC_PAGE_MASK;
+    } else {
+        return r_size;
+    }
+}
+
+/* INTx */
+/* The PCI Local Bus Specification, Rev. 3.0,
+ * Section 6.2.4 Miscellaneous Registers, pp 223
+ * outlines 5 valid values for the interrupt pin (intx).
+ *  0: For devices (or device functions) that don't use an interrupt in
+ *  1: INTA#
+ *  2: INTB#
+ *  3: INTC#
+ *  4: INTD#
+ *
+ * Xen uses the following 4 values for intx
+ *  0: INTA#
+ *  1: INTB#
+ *  2: INTC#
+ *  3: INTD#
+ *
+ * Observing that these list of values are not the same, pci_read_intx()
+ * uses the following mapping from hw to xen values.
+ * This seems to reflect the current usage within Xen.
+ *
+ * PCI hardware    | Xen | Notes
+ * ----------------+-----+----------------------------------------------------
+ * 0               | 0   | No interrupt
+ * 1               | 0   | INTA#
+ * 2               | 1   | INTB#
+ * 3               | 2   | INTC#
+ * 4               | 3   | INTD#
+ * any other value | 0   | This should never happen, log error message
+ */
+
+static inline uint8_t pci_read_intx(XenPCIPassthroughState *s)
+{
+    uint8_t v = 0;
+    host_pci_get_byte(s->real_device, PCI_INTERRUPT_PIN, &v);
+    return v;
+}
+
+static inline uint8_t pci_intx(XenPCIPassthroughState *s)
+{
+    uint8_t r_val = pci_read_intx(s);
+
+    PT_LOG(&s->dev, "intx=%i\n", r_val);
+    if (r_val < 1 || r_val > 4) {
+        PT_LOG(&s->dev, "Interrupt pin read from hardware is out of range:"
+               " value=%i, acceptable range is 1 - 4\n", r_val);
+        r_val = 0;
+    } else {
+        r_val -= 1;
+    }
+
+    return r_val;
+}
+
+#endif /* !QEMU_HW_XEN_PCI_PASSTHROUGH_H */
diff --git a/hw/xen_pci_passthrough_config_init.c b/hw/xen_pci_passthrough_config_init.c
new file mode 100644
index 0000000..1e9de64
--- /dev/null
+++ b/hw/xen_pci_passthrough_config_init.c
@@ -0,0 +1,11 @@
+#include "xen_pci_passthrough.h"
+
+XenPTRegGroup *pt_find_reg_grp(XenPCIPassthroughState *s, uint32_t address)
+{
+    return NULL;
+}
+
+XenPTReg *pt_find_reg(XenPTRegGroup *reg_grp, uint32_t address)
+{
+    return NULL;
+}
diff --git a/xen-all.c b/xen-all.c
index fd39168..e83ba2b 100644
--- a/xen-all.c
+++ b/xen-all.c
@@ -1013,3 +1013,15 @@ void xen_register_framebuffer(MemoryRegion *mr)
 {
     framebuffer = mr;
 }
+
+void xen_shutdown_fatal_error(const char *fmt, ...)
+{
+    va_list ap;
+
+    va_start(ap, fmt);
+    vfprintf(stderr, fmt, ap);
+    va_end(ap);
+    fprintf(stderr, "Will destroy the domain.\n");
+    /* destroy the domain */
+    qemu_system_shutdown_request();
+}
-- 
Anthony PERARD

^ permalink raw reply related

* [Qemu-devel] [PATCH V7 04/11] configure: Introduce --enable-xen-pci-passthrough.
From: Anthony PERARD @ 2012-02-17 17:08 UTC (permalink / raw)
  To: QEMU-devel; +Cc: Anthony PERARD, Xen Devel, Stefano Stabellini
In-Reply-To: <1329498525-8454-1-git-send-email-anthony.perard@citrix.com>

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 configure |   25 +++++++++++++++++++++++++
 1 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/configure b/configure
index 68f9ee6..e2c8d3f 100755
--- a/configure
+++ b/configure
@@ -132,6 +132,7 @@ vnc_png=""
 vnc_thread="no"
 xen=""
 xen_ctrl_version=""
+xen_pci_passthrough=""
 linux_aio=""
 cap_ng=""
 attr=""
@@ -657,6 +658,10 @@ for opt do
   ;;
   --enable-xen) xen="yes"
   ;;
+  --disable-xen-pci-passthrough) xen_pci_passthrough="no"
+  ;;
+  --enable-xen-pci-passthrough) xen_pci_passthrough="yes"
+  ;;
   --disable-brlapi) brlapi="no"
   ;;
   --enable-brlapi) brlapi="yes"
@@ -1005,6 +1010,8 @@ echo "                           (affects only QEMU, not qemu-img)"
 echo "  --enable-mixemu          enable mixer emulation"
 echo "  --disable-xen            disable xen backend driver support"
 echo "  --enable-xen             enable xen backend driver support"
+echo "  --disable-xen-pci-passthrough"
+echo "  --enable-xen-pci-passthrough"
 echo "  --disable-brlapi         disable BrlAPI"
 echo "  --enable-brlapi          enable BrlAPI"
 echo "  --disable-vnc-tls        disable TLS encryption for VNC server"
@@ -1458,6 +1465,21 @@ EOF
   fi
 fi
 
+if test "$xen_pci_passthrough" != "no"; then
+  if test "$xen" = "yes" && test "$linux" = "yes"; then
+    xen_pci_passthrough=yes
+  else
+    if test "$xen_pci_passthrough" = "yes"; then
+      echo "ERROR"
+      echo "ERROR: User requested feature Xen PCI Passthrough"
+      echo "ERROR: but this feature require /sys from Linux"
+      echo "ERROR"
+      exit 1;
+    fi
+    xen_pci_passthrough=no
+  fi
+fi
+
 ##########################################
 # pkg-config probe
 
@@ -3592,6 +3614,9 @@ case "$target_arch2" in
     if test "$xen" = "yes" -a "$target_softmmu" = "yes" ; then
       target_phys_bits=64
       echo "CONFIG_XEN=y" >> $config_target_mak
+      if test "$xen_pci_passthrough" = yes; then
+        echo "CONFIG_XEN_PCI_PASSTHROUGH=y" >> "$config_target_mak"
+      fi
     else
       echo "CONFIG_NO_XEN=y" >> $config_target_mak
     fi
-- 
Anthony PERARD

^ permalink raw reply related

* Re: Xen domU Timekeeping (a.k.a TSC/HPET issues)
From: Paul Durrant @ 2012-02-17 17:09 UTC (permalink / raw)
  To: andres@lagarcavilla.org, xen-devel@lists.xensource.com; +Cc: Ian Campbell, Qrux
In-Reply-To: <91f497056a6f40093b3dcab310125a83.squirrel@webmail.lagarcavilla.org>

IIRC Windows (7) calibrates tsc against RTC at start of day and you can get some pretty odd results if a vcpu is descheduled during that calculation. I was not aware of the exact scenario you described. You *can* get the equivalent of the /usepmtimer boot.ini option on more recent OS by doing:

bcdedit /set useplatformclock true

Enabling viridian timers in Xen may be the way to go though.

  Paul

> -----Original Message-----
> From: xen-devel-bounces@lists.xensource.com [mailto:xen-devel-
> bounces@lists.xensource.com] On Behalf Of Andres Lagar-Cavilla
> Sent: 17 February 2012 16:28
> To: xen-devel@lists.xensource.com
> Cc: Ian Campbell; Qrux
> Subject: Re: [Xen-devel] Xen domU Timekeeping (a.k.a TSC/HPET issues)
> 
> > Date: Fri, 17 Feb 2012 12:06:05 +0000
> > From: Ian Campbell <Ian.Campbell@citrix.com>
> > To: Qrux <qrux.qed@gmail.com>
> > Cc: "xen-devel@lists.xensource.com" <xen-devel@lists.xensource.com>
> > Subject: Re: [Xen-devel] Xen domU Timekeeping (a.k.a TSC/HPET issues)
> > Message-ID: <1329480365.3131.50.camel@zakaz.uk.xensource.com>
> > Content-Type: text/plain; charset="UTF-8"
> >
> > I'm afraid I don't know the answer to most of your questions (hence
> > I'm afraid I've trimmed the quotes rather aggressively) but here's
> > some of what I do know.
> 
> I'm gonna add another data point.
> 
> We're seeing the Windows 7 Query Performance Counter get mightily
> confused. People have reported this in Amazon EC2 as well
> https://forums.aws.amazon.com/thread.jspa?threadID=41426
> 
> We've tracked it down to the hpet. Xen schedules an interrupt delivery for
> an hpet tick, but the vcpu is asleep. Could be an admin pause, a sleep on a
> wait queue, paused while qemu does its thing, paused while a mem event is
> processed...
> 
> When the vcpu wakes up it receives the "late" hpet tick. We believe
> Windows 7 QPC also reads the TSC at that point. The TSC kept on ticking
> while the vcpu was paused. Windows does not know what to do about the
> discrepancy and reports a large time leap, usually consistent with a full
> round trip of the 32 bit hpet counter at the Xen-emulated 1/16GHz
> frequency.
> 
> MSDN forums blame "bad hardware" for this. Funny.
> 
> So, we could solve our particular problem if the tsc were not to tick during
> vcpu sleep. And I get an inkling that would help with this post as well. But I
> don't think any of the advertised timer or tsc modes do that.
> 
> Thanks,
> Andres
> 
> 
> 
> I'm not sure this will help with the original post, but there's gotta be
> somebody who
> >
> >> But, practically, is there a safe CPU configuration?
> >
> > I think that part of the problem here is that it is very hard to
> > determine this at the hardware level. There are at least 3 (if not
> > more) CPUID feature bits which say "no really, the TSC is good and
> > safe to use this time, you can rely on that" because they keep
> > inventing new ways to get it wrong.
> >
> > [...]
> >>
> >> Since September, I can't find any further information about this
> >> issue. What is the state of this issue?  The inconsistency I see
> >> right now is this: in the July 2010 TSC discussion, a "Stefano Stabellini"
> >> posted this:
> >>
> >> ====
> >> > /me wonders if timer_mode=1 is the default for xl?
> >> > Or only for xm?
> >>
> >> no, it is not.
> >> Xl defaults to 0 [zero], I am going to change it right now.
> >> ====
> >>
> >> So, it seems like (at least as of July 2010), xl is defaulting to
> >> "timer_mode=1".  That is, assuming that the then-current timer_mode
> >> is the same as present-day tsc_mode.
> >
> > No, I believe they are different things.
> >
> > tsc_mode is to do with the TSC, emulation vs direct exposure etc. Per
> > xen/include/asm-x86/time.h and (in recent xen-unstable) xl.cfg(5)
> >
> > timer_mode is to do with the the way that timer interrupts are
> > injected into the guest. This is described in
> xen/include/public/hvm/params.h.
> > This isn't documented in xl.cfg(5) because I couldn't make head nor
> > tail of the meaning of that header :-(
> >
> >>   In addition, I'm assuming he was changing it from 0 (zero) to 1
> >> (one)--and not some other mode.  But,
> >>
> >>         xen-4.1.2/docs/misc/tscmode.txt
> >
> > Remember that he was referring to timer_mode not tsc_mode...
> >
> >> says:
> >>
> >>         "The default mode (tsc_mode==0) checks TSC-safeness of the
> >> underlying
> >>         hardware on which the virtual machine is launched.  If it is
> >>         TSC-safe, rdtsc will execute at hardware speed; if it is not,
> >> rdtsc
> >>         will be emulated."
> >>
> >> Which implies the default is always 0 (zero).  Which is it?
> >
> > It seems that xl, in xen-unstable, defaults to:
> > 	timer_mode = 1
> > 	tsc_mode = 0
> > as does 4.1 as far as I can tell via code inspection.
> >
> >> More importantly, is the solution to force tsc_mode=2?
> >
> > IMHO this is safe in most situations unless you are running some sort
> > of workload (e.g. a well known database) which has stringent
> > requirements regarding the TSC for transactional consistency (hence
> > the conservative default).
> >
> >>   If so, under what BIOS/xen-boot-params/dom0-boot-params
> conditions?
> >> And--please excuse my exasperation--but WTH does this have to do with
> >> ext3 versus ext4?  Is ext4 exquisitely sensitive to TSC/HPET
> >> "jumpiness" (if that's even what's happening)?
> >
> > Sorry, I have no idea how/why the filesystem would be related to the
> > TSC.
> >
> > It is possible you are actually seeing two bugs I suppose -- there
> > have been issues relating to ext4 and barriers in some kernel versions
> > (I'm afraid I don't recall the details, the list archives ought to
> > contain something).
> >
> > Ian.
> >
> >
> 
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xensource.com
> http://lists.xensource.com/xen-devel

^ permalink raw reply

* [Qemu-devel] [PATCH V7 01/11] pci_ids: Add INTEL_82599_VF id.
From: Anthony PERARD @ 2012-02-17 17:08 UTC (permalink / raw)
  To: QEMU-devel; +Cc: Anthony PERARD, Xen Devel, Stefano Stabellini
In-Reply-To: <1329498525-8454-1-git-send-email-anthony.perard@citrix.com>

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 hw/pci_ids.h |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/hw/pci_ids.h b/hw/pci_ids.h
index e8235a7..943106a 100644
--- a/hw/pci_ids.h
+++ b/hw/pci_ids.h
@@ -118,6 +118,7 @@
 #define PCI_DEVICE_ID_INTEL_82801I_UHCI6 0x2939
 #define PCI_DEVICE_ID_INTEL_82801I_EHCI1 0x293a
 #define PCI_DEVICE_ID_INTEL_82801I_EHCI2 0x293c
+#define PCI_DEVICE_ID_INTEL_82599_VF     0x10ed
 
 #define PCI_VENDOR_ID_XEN               0x5853
 #define PCI_DEVICE_ID_XEN_PLATFORM      0x0001
-- 
Anthony PERARD

^ permalink raw reply related

* [Qemu-devel] [PATCH V7 05/11] Introduce HostPCIDevice to access a pci device on the host.
From: Anthony PERARD @ 2012-02-17 17:08 UTC (permalink / raw)
  To: QEMU-devel; +Cc: Anthony PERARD, Xen Devel, Stefano Stabellini
In-Reply-To: <1329498525-8454-1-git-send-email-anthony.perard@citrix.com>

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 Makefile.target      |    3 +
 hw/host-pci-device.c |  278 ++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/host-pci-device.h |   75 ++++++++++++++
 3 files changed, 356 insertions(+), 0 deletions(-)
 create mode 100644 hw/host-pci-device.c
 create mode 100644 hw/host-pci-device.h

diff --git a/Makefile.target b/Makefile.target
index 651500e..5098299 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -222,6 +222,9 @@ obj-$(CONFIG_NO_XEN) += xen-stub.o
 
 obj-i386-$(CONFIG_XEN) += xen_platform.o
 
+# Xen PCI Passthrough
+obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += host-pci-device.o
+
 # Inter-VM PCI shared memory
 CONFIG_IVSHMEM =
 ifeq ($(CONFIG_KVM), y)
diff --git a/hw/host-pci-device.c b/hw/host-pci-device.c
new file mode 100644
index 0000000..3dacb30
--- /dev/null
+++ b/hw/host-pci-device.c
@@ -0,0 +1,278 @@
+/*
+ * Copyright (C) 2011       Citrix Ltd.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "host-pci-device.h"
+
+#define PCI_MAX_EXT_CAP \
+    ((PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE) / (PCI_CAP_SIZEOF + 4))
+
+enum error_code {
+    ERROR_SYNTAX = 1,
+};
+
+static int path_to(const HostPCIDevice *d,
+                   const char *name, char *buf, ssize_t size)
+{
+    return snprintf(buf, size, "/sys/bus/pci/devices/%04x:%02x:%02x.%x/%s",
+                    d->domain, d->bus, d->dev, d->func, name);
+}
+
+static int get_resource(HostPCIDevice *d)
+{
+    int i, rc = 0;
+    FILE *f;
+    char path[PATH_MAX];
+    unsigned long long start, end, flags, size;
+
+    path_to(d, "resource", path, sizeof (path));
+    f = fopen(path, "r");
+    if (!f) {
+        fprintf(stderr, "Error: Can't open %s: %s\n", path, strerror(errno));
+        return -errno;
+    }
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        if (fscanf(f, "%llx %llx %llx", &start, &end, &flags) != 3) {
+            fprintf(stderr, "Error: Syntax error in %s\n", path);
+            rc = ERROR_SYNTAX;
+            break;
+        }
+        if (start) {
+            size = end - start + 1;
+        } else {
+            size = 0;
+        }
+
+        if (i < PCI_ROM_SLOT) {
+            d->io_regions[i].base_addr = start;
+            d->io_regions[i].size = size;
+            d->io_regions[i].flags = flags;
+        } else {
+            d->rom.base_addr = start;
+            d->rom.size = size;
+            d->rom.flags = flags;
+        }
+    }
+
+    fclose(f);
+    return rc;
+}
+
+static int get_hex_value(HostPCIDevice *d, const char *name,
+                         unsigned long *pvalue)
+{
+    char path[PATH_MAX];
+    FILE *f;
+    unsigned long value;
+
+    path_to(d, name, path, sizeof (path));
+    f = fopen(path, "r");
+    if (!f) {
+        fprintf(stderr, "Error: Can't open %s: %s\n", path, strerror(errno));
+        return -errno;
+    }
+    if (fscanf(f, "%lx\n", &value) != 1) {
+        fprintf(stderr, "Error: Syntax error in %s\n", path);
+        fclose(f);
+        return ERROR_SYNTAX;
+    }
+    fclose(f);
+    *pvalue = value;
+    return 0;
+}
+
+static bool pci_dev_is_virtfn(HostPCIDevice *d)
+{
+    char path[PATH_MAX];
+    struct stat buf;
+
+    path_to(d, "physfn", path, sizeof (path));
+    return !stat(path, &buf);
+}
+
+static int host_pci_config_fd(HostPCIDevice *d)
+{
+    char path[PATH_MAX];
+
+    if (d->config_fd < 0) {
+        path_to(d, "config", path, sizeof (path));
+        d->config_fd = open(path, O_RDWR);
+        if (d->config_fd < 0) {
+            fprintf(stderr, "HostPCIDevice: Can not open '%s': %s\n",
+                    path, strerror(errno));
+        }
+    }
+    return d->config_fd;
+}
+static int host_pci_config_read(HostPCIDevice *d, int pos, void *buf, int len)
+{
+    int fd = host_pci_config_fd(d);
+    int res = 0;
+
+again:
+    res = pread(fd, buf, len, pos);
+    if (res != len) {
+        if (res < 0 && (errno == EINTR || errno == EAGAIN)) {
+            goto again;
+        }
+        fprintf(stderr, "%s: read failed: %s (fd: %i)\n",
+                __func__, strerror(errno), fd);
+        return -errno;
+    }
+    return 0;
+}
+static int host_pci_config_write(HostPCIDevice *d,
+                                 int pos, const void *buf, int len)
+{
+    int fd = host_pci_config_fd(d);
+    int res = 0;
+
+again:
+    res = pwrite(fd, buf, len, pos);
+    if (res != len) {
+        if (res < 0 && (errno == EINTR || errno == EAGAIN)) {
+            goto again;
+        }
+        fprintf(stderr, "%s: write failed: %s\n",
+                __func__, strerror(errno));
+        return -errno;
+    }
+    return 0;
+}
+
+int host_pci_get_byte(HostPCIDevice *d, int pos, uint8_t *p)
+{
+    uint8_t buf;
+    int rc = host_pci_config_read(d, pos, &buf, 1);
+    if (rc == 0) {
+        *p = buf;
+    }
+    return rc;
+}
+int host_pci_get_word(HostPCIDevice *d, int pos, uint16_t *p)
+{
+    uint16_t buf;
+    int rc = host_pci_config_read(d, pos, &buf, 2);
+    if (rc == 0) {
+        *p = le16_to_cpu(buf);
+    }
+    return rc;
+}
+int host_pci_get_long(HostPCIDevice *d, int pos, uint32_t *p)
+{
+    uint32_t buf;
+    int rc = host_pci_config_read(d, pos, &buf, 4);
+    if (rc == 0) {
+        *p = le32_to_cpu(buf);
+    }
+    return rc;
+}
+int host_pci_get_block(HostPCIDevice *d, int pos, uint8_t *buf, int len)
+{
+    return host_pci_config_read(d, pos, buf, len);
+}
+
+int host_pci_set_byte(HostPCIDevice *d, int pos, uint8_t data)
+{
+    return host_pci_config_write(d, pos, &data, 1);
+}
+int host_pci_set_word(HostPCIDevice *d, int pos, uint16_t data)
+{
+    data = cpu_to_le16(data);
+    return host_pci_config_write(d, pos, &data, 2);
+}
+int host_pci_set_long(HostPCIDevice *d, int pos, uint32_t data)
+{
+    data = cpu_to_le32(data);
+    return host_pci_config_write(d, pos, &data, 4);
+}
+int host_pci_set_block(HostPCIDevice *d, int pos, uint8_t *buf, int len)
+{
+    return host_pci_config_write(d, pos, buf, len);
+}
+
+uint32_t host_pci_find_ext_cap_offset(HostPCIDevice *d, uint32_t cap)
+{
+    uint32_t header = 0;
+    int max_cap = PCI_MAX_EXT_CAP;
+    int pos = PCI_CONFIG_SPACE_SIZE;
+
+    do {
+        if (host_pci_get_long(d, pos, &header)) {
+            break;
+        }
+        /*
+         * If we have no capabilities, this is indicated by cap ID,
+         * cap version and next pointer all being 0.
+         */
+        if (header == 0) {
+            break;
+        }
+
+        if (PCI_EXT_CAP_ID(header) == cap) {
+            return pos;
+        }
+
+        pos = PCI_EXT_CAP_NEXT(header);
+        if (pos < PCI_CONFIG_SPACE_SIZE) {
+            break;
+        }
+
+        max_cap--;
+    } while (max_cap > 0);
+
+    return 0;
+}
+
+HostPCIDevice *host_pci_device_get(uint8_t bus, uint8_t dev, uint8_t func)
+{
+    HostPCIDevice *d = NULL;
+    unsigned long v = 0;
+
+    d = g_new0(HostPCIDevice, 1);
+
+    d->config_fd = -1;
+    d->domain = 0;
+    d->bus = bus;
+    d->dev = dev;
+    d->func = func;
+
+    if (host_pci_config_fd(d) == -1) {
+        goto error;
+    }
+    if (get_resource(d) != 0) {
+        goto error;
+    }
+
+    if (get_hex_value(d, "vendor", &v)) {
+        goto error;
+    }
+    d->vendor_id = v;
+    if (get_hex_value(d, "device", &v)) {
+        goto error;
+    }
+    d->device_id = v;
+    d->is_virtfn = pci_dev_is_virtfn(d);
+
+    return d;
+error:
+    if (d->config_fd >= 0) {
+        close(d->config_fd);
+    }
+    g_free(d);
+    return NULL;
+}
+
+void host_pci_device_put(HostPCIDevice *d)
+{
+    if (d->config_fd >= 0) {
+        close(d->config_fd);
+    }
+    g_free(d);
+}
diff --git a/hw/host-pci-device.h b/hw/host-pci-device.h
new file mode 100644
index 0000000..c8880eb
--- /dev/null
+++ b/hw/host-pci-device.h
@@ -0,0 +1,75 @@
+#ifndef HW_HOST_PCI_DEVICE
+#  define HW_HOST_PCI_DEVICE
+
+#include "pci.h"
+
+/*
+ * from linux/ioport.h
+ * IO resources have these defined flags.
+ */
+#define IORESOURCE_BITS         0x000000ff      /* Bus-specific bits */
+
+#define IORESOURCE_TYPE_BITS    0x00000f00      /* Resource type */
+#define IORESOURCE_IO           0x00000100
+#define IORESOURCE_MEM          0x00000200
+#define IORESOURCE_IRQ          0x00000400
+#define IORESOURCE_DMA          0x00000800
+
+#define IORESOURCE_PREFETCH     0x00001000      /* No side effects */
+#define IORESOURCE_READONLY     0x00002000
+#define IORESOURCE_CACHEABLE    0x00004000
+#define IORESOURCE_RANGELENGTH  0x00008000
+#define IORESOURCE_SHADOWABLE   0x00010000
+
+#define IORESOURCE_SIZEALIGN    0x00020000      /* size indicates alignment */
+#define IORESOURCE_STARTALIGN   0x00040000      /* start field is alignment */
+
+#define IORESOURCE_MEM_64       0x00100000
+
+    /* Userland may not map this resource */
+#define IORESOURCE_EXCLUSIVE    0x08000000
+#define IORESOURCE_DISABLED     0x10000000
+#define IORESOURCE_UNSET        0x20000000
+#define IORESOURCE_AUTO         0x40000000
+    /* Driver has marked this resource busy */
+#define IORESOURCE_BUSY         0x80000000
+
+
+typedef struct HostPCIIORegion {
+    unsigned long flags;
+    pcibus_t base_addr;
+    pcibus_t size;
+} HostPCIIORegion;
+
+typedef struct HostPCIDevice {
+    uint16_t domain;
+    uint8_t bus;
+    uint8_t dev;
+    uint8_t func;
+
+    uint16_t vendor_id;
+    uint16_t device_id;
+
+    HostPCIIORegion io_regions[PCI_NUM_REGIONS - 1];
+    HostPCIIORegion rom;
+
+    bool is_virtfn;
+
+    int config_fd;
+} HostPCIDevice;
+
+HostPCIDevice *host_pci_device_get(uint8_t bus, uint8_t dev, uint8_t func);
+void host_pci_device_put(HostPCIDevice *pci_dev);
+
+int host_pci_get_byte(HostPCIDevice *d, int pos, uint8_t *p);
+int host_pci_get_word(HostPCIDevice *d, int pos, uint16_t *p);
+int host_pci_get_long(HostPCIDevice *d, int pos, uint32_t *p);
+int host_pci_get_block(HostPCIDevice *d, int pos, uint8_t *buf, int len);
+int host_pci_set_byte(HostPCIDevice *d, int pos, uint8_t data);
+int host_pci_set_word(HostPCIDevice *d, int pos, uint16_t data);
+int host_pci_set_long(HostPCIDevice *d, int pos, uint32_t data);
+int host_pci_set_block(HostPCIDevice *d, int pos, uint8_t *buf, int len);
+
+uint32_t host_pci_find_ext_cap_offset(HostPCIDevice *s, uint32_t cap);
+
+#endif /* !HW_HOST_PCI_DEVICE */
-- 
Anthony PERARD

^ permalink raw reply related

* [Qemu-devel] [PATCH V7 09/11] Introduce apic-msidef.h
From: Anthony PERARD @ 2012-02-17 17:08 UTC (permalink / raw)
  To: QEMU-devel; +Cc: Anthony PERARD, Xen Devel, Stefano Stabellini
In-Reply-To: <1329498525-8454-1-git-send-email-anthony.perard@citrix.com>

This patch move the msi definition from apic.c to apic-msidef.h. So it can be
used also by other .c files.

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 hw/apic-msidef.h |   28 ++++++++++++++++++++++++++++
 hw/apic.c        |   11 +----------
 2 files changed, 29 insertions(+), 10 deletions(-)
 create mode 100644 hw/apic-msidef.h

diff --git a/hw/apic-msidef.h b/hw/apic-msidef.h
new file mode 100644
index 0000000..3182f0b
--- /dev/null
+++ b/hw/apic-msidef.h
@@ -0,0 +1,28 @@
+#ifndef HW_APIC_MSIDEF_H
+#define HW_APIC_MSIDEF_H
+
+/*
+ * Intel APIC constants: from include/asm/msidef.h
+ */
+
+/*
+ * Shifts for MSI data
+ */
+
+#define MSI_DATA_VECTOR_SHIFT           0
+#define  MSI_DATA_VECTOR_MASK           0x000000ff
+
+#define MSI_DATA_DELIVERY_MODE_SHIFT    8
+#define MSI_DATA_LEVEL_SHIFT            14
+#define MSI_DATA_TRIGGER_SHIFT          15
+
+/*
+ * Shift/mask fields for msi address
+ */
+
+#define MSI_ADDR_DEST_MODE_SHIFT        2
+
+#define MSI_ADDR_DEST_ID_SHIFT          12
+#define  MSI_ADDR_DEST_ID_MASK          0x00ffff0
+
+#endif /* HW_APIC_MSIDEF_H */
diff --git a/hw/apic.c b/hw/apic.c
index ff9d24e..f9a73a8 100644
--- a/hw/apic.c
+++ b/hw/apic.c
@@ -22,19 +22,10 @@
 #include "host-utils.h"
 #include "trace.h"
 #include "pc.h"
+#include "apic-msidef.h"
 
 #define MAX_APIC_WORDS 8
 
-/* Intel APIC constants: from include/asm/msidef.h */
-#define MSI_DATA_VECTOR_SHIFT		0
-#define MSI_DATA_VECTOR_MASK		0x000000ff
-#define MSI_DATA_DELIVERY_MODE_SHIFT	8
-#define MSI_DATA_TRIGGER_SHIFT		15
-#define MSI_DATA_LEVEL_SHIFT		14
-#define MSI_ADDR_DEST_MODE_SHIFT	2
-#define MSI_ADDR_DEST_ID_SHIFT		12
-#define	MSI_ADDR_DEST_ID_MASK		0x00ffff0
-
 static APICCommonState *local_apics[MAX_APICS + 1];
 
 static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode);
-- 
Anthony PERARD

^ permalink raw reply related

* [Qemu-devel] [PATCH V7 00/11] Xen PCI Passthrough
From: Anthony PERARD @ 2012-02-17 17:08 UTC (permalink / raw)
  To: QEMU-devel; +Cc: Anthony PERARD, Xen Devel, Stefano Stabellini

Hi all,

This patch series introduces the PCI passthrough for Xen.

First, we have HostPCIDevice that help to access one PCI device of the host.

Then, there is an additions in the QEMU code, pci_check_bar_overlap.

There are also several change in pci_ids and pci_regs.

Last part, but not least, the PCI passthrough device himself. Cut in 3 parts
(or file), there is one to take care of the initialisation of a passthrough
device. The second one handle everything about the config address space, there
are specifics functions for every config register. The third one is to handle
MSI.

There is a patch series on xen-devel (applied to xen-unstable) that add the
support of setting a PCI passthrough device through QMP from libxl (xen tool
stack). It is just a call to device_add, with the driver parametter
hostaddr="0000:07:00.1".


Change since the previous set:
  - few fix and rebased on master
  - remove of the power management capability, keep the minimum like if it is
    always desactivated.
  - new patch: port of patch from the qemu-xen fork.


Change v5-v6:
  - msitraslate code have been removed.
  - code for the power management capability is removed, but will be re-added
    for the next version of the patch series as a separate patch.

  - new patch to remove a check in pci_parse_devaddr.
  - use pci_default_config_write, so no more hack to handle the BAR mapping in
    QEMU.
  - improve the code in general (a bit more comprehensible).
  - update to QOM.


Change v4-v5:
  - return -errno if there is an error in host_pci_get_*
  - rename internal function get_value to get_hex_value (and return the same
    error value has get_resource)

Change v3-v4:
  - host_pci_get_* can now return an error, and take an extra parameter, a
    pointer to store the wanted value.
  - The memory_region for the PCI BAR are handled "manualy" because calling
    pci_default_write_config was not possible, because the XenPT handle the
    PCIIORegion it self. This make possible to do a device_remove.
  - Introduction of PT_ERR and PT_WARN macro to print debug and error messages.
    Also, these macro as well as PT_LOG will always print the short BDF of the
    device in the guest point of view.
  - PT_ERR is print by default (for all error messages).
  - Some debug/error message have been improve and should be a bit more useful.
  - hw_error have been removed from the code, and have been replaced by either
    a call to qemu_system_shudown_request() (that lead to a domain destroy) or
    a failed in the initialisation of the device.
  - Now, every patchs should compile with no error.

Change v2-v3;
  - in host-pci-device.c:
    - Return more usefull error code in get_ressource().
    - Use macro in host_pci_find_ext_cap_offset instead of raw number. But I
      still not sure if PCI_MAX_EXT_CAP is right, it's result is 480 like it
      was before, so it's maybe ok.
  - All use of MSI stuff in two first pci passthrough patch have been removed
    and move to the last patch.

Change v1-v2:
  - fix style issue (checkpatch.pl)
  - set the original authors, add some missing copyright headers
  - HostPCIDevice:
    - introduce HostPCIIORegions (with base_addr, size, flags)
    - save all flags from ./resource and store it in a separate field.
    - fix endianess on write
    - new host_pci_dev_put function
    - use pci.c like interface host_pci_get/set_byte/word/long (instead of
      host_pci_read/write_)
  - compile HostPCIDevice only on linux (as well as xen_pci_passthrough)
  - introduce apic-msidef.h file.
  - no more run_one_timer, if a pci device is in the middle of a power
    transition, just "return an error" in config read/write
  - use a global var mapped_machine_irq (local to xen_pci_passthrough.c)
  - add msitranslate and power-mgmt ad qdev property





Allen Kay (2):
  Introduce Xen PCI Passthrough, qdevice (1/3)
  Introduce Xen PCI Passthrough, PCI config space helpers (2/3)

Anthony PERARD (6):
  pci_ids: Add INTEL_82599_VF id.
  pci_regs: Fix value of PCI_EXP_TYPE_RC_EC.
  pci_regs: Add PCI_EXP_TYPE_PCIE_BRIDGE
  configure: Introduce --enable-xen-pci-passthrough.
  Introduce HostPCIDevice to access a pci device on the host.
  Introduce apic-msidef.h

Jiang Yunhong (1):
  Introduce Xen PCI Passthrough, MSI (3/3)

Shan Haitao (1):
  xen passthrough: clean up MSI-X table handling

Yuji Shimada (1):
  pci.c: Add pci_check_bar_overlap

 Makefile.target                      |    6 +
 configure                            |   25 +
 hw/apic-msidef.h                     |   30 +
 hw/apic.c                            |   11 +-
 hw/host-pci-device.c                 |  278 +++++
 hw/host-pci-device.h                 |   75 ++
 hw/pci.c                             |   47 +
 hw/pci.h                             |    3 +
 hw/pci_ids.h                         |    1 +
 hw/pci_regs.h                        |    3 +-
 hw/xen_common.h                      |    3 +
 hw/xen_pci_passthrough.c             |  898 ++++++++++++++++
 hw/xen_pci_passthrough.h             |  308 ++++++
 hw/xen_pci_passthrough_config_init.c | 1914 ++++++++++++++++++++++++++++++++++
 hw/xen_pci_passthrough_msi.c         |  618 +++++++++++
 xen-all.c                            |   12 +
 16 files changed, 4221 insertions(+), 11 deletions(-)
 create mode 100644 hw/apic-msidef.h
 create mode 100644 hw/host-pci-device.c
 create mode 100644 hw/host-pci-device.h
 create mode 100644 hw/xen_pci_passthrough.c
 create mode 100644 hw/xen_pci_passthrough.h
 create mode 100644 hw/xen_pci_passthrough_config_init.c
 create mode 100644 hw/xen_pci_passthrough_msi.c

-- 
Anthony PERARD

^ permalink raw reply

* [Qemu-devel] [PATCH V7 03/11] pci_regs: Add PCI_EXP_TYPE_PCIE_BRIDGE
From: Anthony PERARD @ 2012-02-17 17:08 UTC (permalink / raw)
  To: QEMU-devel; +Cc: Anthony PERARD, Xen Devel, Stefano Stabellini
In-Reply-To: <1329498525-8454-1-git-send-email-anthony.perard@citrix.com>

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 hw/pci_regs.h |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/hw/pci_regs.h b/hw/pci_regs.h
index 6b42515..56a404b 100644
--- a/hw/pci_regs.h
+++ b/hw/pci_regs.h
@@ -392,6 +392,7 @@
 #define  PCI_EXP_TYPE_UPSTREAM	0x5	/* Upstream Port */
 #define  PCI_EXP_TYPE_DOWNSTREAM 0x6	/* Downstream Port */
 #define  PCI_EXP_TYPE_PCI_BRIDGE 0x7	/* PCI/PCI-X Bridge */
+#define  PCI_EXP_TYPE_PCIE_BRIDGE 0x8   /* PCI/PCI-X to PCIE Bridge */
 #define  PCI_EXP_TYPE_RC_END	0x9	/* Root Complex Integrated Endpoint */
 #define  PCI_EXP_TYPE_RC_EC     0xa     /* Root Complex Event Collector */
 #define PCI_EXP_FLAGS_SLOT	0x0100	/* Slot implemented */
-- 
Anthony PERARD

^ permalink raw reply related

* [Qemu-devel] [PATCH V7 02/11] pci_regs: Fix value of PCI_EXP_TYPE_RC_EC.
From: Anthony PERARD @ 2012-02-17 17:08 UTC (permalink / raw)
  To: QEMU-devel; +Cc: Anthony PERARD, Xen Devel, Stefano Stabellini
In-Reply-To: <1329498525-8454-1-git-send-email-anthony.perard@citrix.com>

Value check in PCI Express Base Specification rev 1.1

Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 hw/pci_regs.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/hw/pci_regs.h b/hw/pci_regs.h
index e8357c3..6b42515 100644
--- a/hw/pci_regs.h
+++ b/hw/pci_regs.h
@@ -393,7 +393,7 @@
 #define  PCI_EXP_TYPE_DOWNSTREAM 0x6	/* Downstream Port */
 #define  PCI_EXP_TYPE_PCI_BRIDGE 0x7	/* PCI/PCI-X Bridge */
 #define  PCI_EXP_TYPE_RC_END	0x9	/* Root Complex Integrated Endpoint */
-#define  PCI_EXP_TYPE_RC_EC	0x10	/* Root Complex Event Collector */
+#define  PCI_EXP_TYPE_RC_EC     0xa     /* Root Complex Event Collector */
 #define PCI_EXP_FLAGS_SLOT	0x0100	/* Slot implemented */
 #define PCI_EXP_FLAGS_IRQ	0x3e00	/* Interrupt message number */
 #define PCI_EXP_DEVCAP		4	/* Device capabilities */
-- 
Anthony PERARD

^ permalink raw reply related

* Re: linux-next: build failure after merge of the staging tree
From: Greg KH @ 2012-02-17 17:04 UTC (permalink / raw)
  To: Stephen Rothwell; +Cc: linux-next, linux-kernel, Dan Magenheimer
In-Reply-To: <20120217151030.59ce998d4b844faa0023750b@canb.auug.org.au>

On Fri, Feb 17, 2012 at 03:10:30PM +1100, Stephen Rothwell wrote:
> Hi Greg,
> 
> After merging the staging tree, today's linux-next build (x86_64
> allmodconfig) failed like this:
> 
> drivers/built-in.o: In function `r2hb_hb_group_drop_item':
> heartbeat.c:(.text+0x12cd7a): undefined reference to `config_item_put'
> drivers/built-in.o: In function `r2hb_hb_group_make_item':
> heartbeat.c:(.text+0x12cdc7): undefined reference to `config_item_put'
> drivers/built-in.o: In function `r2hb_alloc_hb_set':
> (.text+0x12cf2f): undefined reference to `config_group_init_type_name'
> drivers/built-in.o: In function `r2nm_cluster_group_drop_item':
> nodemanager.c:(.text+0x12d34f): undefined reference to `config_item_put'
> nodemanager.c:(.text+0x12d378): undefined reference to `config_item_put'
> drivers/built-in.o: In function `r2nm_node_put':
> (.text+0x12d3a0): undefined reference to `config_item_put'
> drivers/built-in.o: In function `r2nm_node_group_drop_item':
> nodemanager.c:(.text+0x12dc4d): undefined reference to `config_item_put'
> drivers/built-in.o: In function `r2nm_node_get':
> (.text+0x12e407): undefined reference to `config_item_get'
> drivers/built-in.o: In function `r2nm_get_node_by_ip':
> (.text+0x12e482): undefined reference to `config_item_get'
> drivers/built-in.o: In function `r2nm_get_node_by_num':
> (.text+0x12e514): undefined reference to `config_item_get'
> drivers/built-in.o: In function `r2nm_node_group_make_item':
> nodemanager.c:(.text+0x12e664): undefined reference to `config_item_init_type_name'
> drivers/built-in.o: In function `r2nm_cluster_group_make_group':
> nodemanager.c:(.text+0x12e762): undefined reference to `config_group_init_type_name'
> nodemanager.c:(.text+0x12e77f): undefined reference to `config_group_init_type_name'
> drivers/built-in.o: In function `r2nm_depend_item':
> (.text+0x12e878): undefined reference to `configfs_depend_item'
> drivers/built-in.o: In function `r2nm_undepend_item':
> (.text+0x12e8a0): undefined reference to `configfs_undepend_item'
> drivers/built-in.o: In function `r2nm_depend_this_node':
> (.text+0x12e907): undefined reference to `config_item_put'
> drivers/built-in.o: In function `r2nm_undepend_this_node':
> (.text+0x12e973): undefined reference to `config_item_put'
> drivers/built-in.o: In function `init_r2nm':
> nodemanager.c:(.init.text+0xefdb): undefined reference to `config_group_init'
> nodemanager.c:(.init.text+0xf00f): undefined reference to `configfs_register_subsystem'
> drivers/built-in.o: In function `exit_r2nm':
> nodemanager.c:(.exit.text+0x99b): undefined reference to `configfs_unregister_subsystem'
> 
> This after I moved the tmem tree to before the staging tree to fix
> yesterday's build failure.
> 
> I have cherry-picked Greg's patch to disable building of the ramster code
> for today (since it wasn't in his tree when I fetched it this morning).

Thank you for doing that, sorry I didn't get any of these build errors
on my own.  That's what I get for applying patches while on the road and
away from my big build systems to verify this :(

greg k-h

^ permalink raw reply

* [PATCH V7 11/11] xen passthrough: clean up MSI-X table handling
From: Anthony PERARD @ 2012-02-17 17:08 UTC (permalink / raw)
  To: QEMU-devel
  Cc: Anthony PERARD, Shan Haitao, Xen Devel, Jan Beulich,
	Stefano Stabellini
In-Reply-To: <1329498525-8454-1-git-send-email-anthony.perard@citrix.com>

From: Shan Haitao <maillists.shan@gmail.com>

This patch does cleaning up of QEMU MSI handling. The fixes are:
1. Changes made to MSI-X table mapping handling to eliminate the small
windows in which guest could have access to physical MSI-X table.
2. MSI-X table is mapped as read-only to QEMU, as masking of MSI-X is
already in Xen now.
3. For registers that coexists inside the MSI-X table (this could be
only PBA I think), value read from physical page would be returned.

Signed-off-by:  Shan Haitao <maillists.shan@gmail.com>

Consolidated duplicate code into _pt_iomem_helper(). Fixed formatting.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Anthony PERARD <anthony.perard@citrix.com>
---
 hw/xen_pci_passthrough.c     |   80 ++++++++++++++++++++++++++++++++----------
 hw/xen_pci_passthrough.h     |   11 +++++-
 hw/xen_pci_passthrough_msi.c |   62 ++++----------------------------
 3 files changed, 78 insertions(+), 75 deletions(-)

diff --git a/hw/xen_pci_passthrough.c b/hw/xen_pci_passthrough.c
index 6ea2c45..fab9fbe 100644
--- a/hw/xen_pci_passthrough.c
+++ b/hw/xen_pci_passthrough.c
@@ -356,6 +356,53 @@ out:
     }
 }
 
+static int pt_iomem_helper(XenPCIPassthroughState *s, int i,
+                           pcibus_t e_base, pcibus_t e_size, int op)
+{
+    uint64_t msix_last_pfn = 0;
+    pcibus_t bar_last_pfn = 0;
+    pcibus_t msix_table_size = 0;
+    XenPTMSIX *msix = s->msix;
+    int rc = 0;
+
+    if (!pt_has_msix_mapping(s, i)) {
+        return xc_domain_memory_mapping(xen_xc, xen_domid,
+                                        PFN(e_base),
+                                        PFN(s->bases[i].access.maddr),
+                                        PFN(e_size + XC_PAGE_SIZE - 1),
+                                        op);
+
+    }
+
+    if (msix->table_off) {
+        rc = xc_domain_memory_mapping(xen_xc, xen_domid,
+                                      PFN(e_base),
+                                      PFN(s->bases[i].access.maddr),
+                                      PFN(msix->mmio_base_addr) - PFN(e_base),
+                                      op);
+    }
+
+    msix_table_size = msix->total_entries * PCI_MSIX_ENTRY_SIZE;
+    msix_last_pfn = PFN(msix->mmio_base_addr + msix_table_size - 1);
+    bar_last_pfn = PFN(e_base + e_size - 1);
+
+    if (rc == 0 && msix_last_pfn != bar_last_pfn) {
+        assert(msix_last_pfn < bar_last_pfn);
+
+        rc = xc_domain_memory_mapping(xen_xc, xen_domid,
+                                      msix_last_pfn + 1,
+                                      PFN(s->bases[i].access.maddr
+                                          + msix->table_off
+                                          + msix_table_size
+                                          + XC_PAGE_SIZE - 1),
+                                      bar_last_pfn - msix_last_pfn,
+                                      op);
+    }
+
+    return rc;
+}
+
+
 /* ioport/iomem space*/
 static void pt_iomem_map(XenPCIPassthroughState *s, int i,
                          pcibus_t e_phys, pcibus_t e_size)
@@ -376,13 +423,10 @@ static void pt_iomem_map(XenPCIPassthroughState *s, int i,
     }
 
     if (!first_map && old_ebase != PT_PCI_BAR_UNMAPPED) {
-        pt_add_msix_mapping(s, i);
-        /* Remove old mapping */
-        rc = xc_domain_memory_mapping(xen_xc, xen_domid,
-                               old_ebase >> XC_PAGE_SHIFT,
-                               s->bases[i].access.maddr >> XC_PAGE_SHIFT,
-                               (e_size + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
-                               DPCI_REMOVE_MAPPING);
+        if (pt_has_msix_mapping(s, i)) {
+            memory_region_set_enabled(&s->msix->mmio, false);
+        }
+        rc = pt_iomem_helper(s, i, old_ebase, e_size, DPCI_REMOVE_MAPPING);
         if (rc) {
             PT_ERR(&s->dev, "remove old mapping failed! (rc: %i)\n", rc);
             return;
@@ -391,21 +435,19 @@ static void pt_iomem_map(XenPCIPassthroughState *s, int i,
 
     /* map only valid guest address */
     if (e_phys != PCI_BAR_UNMAPPED) {
-        /* Create new mapping */
-        rc = xc_domain_memory_mapping(xen_xc, xen_domid,
-                                   s->bases[i].e_physbase >> XC_PAGE_SHIFT,
-                                   s->bases[i].access.maddr >> XC_PAGE_SHIFT,
-                                   (e_size+XC_PAGE_SIZE-1) >> XC_PAGE_SHIFT,
-                                   DPCI_ADD_MAPPING);
+        if (pt_has_msix_mapping(s, i)) {
+            XenPTMSIX *msix = s->msix;
 
-        if (rc) {
-            PT_ERR(&s->dev, "create new mapping failed! (rc: %i)\n", rc);
+            msix->mmio_base_addr = s->bases[i].e_physbase + msix->table_off;
+
+            memory_region_set_enabled(&msix->mmio, true);
         }
 
-        rc = pt_remove_msix_mapping(s, i);
-        if (rc != 0) {
-            PT_ERR(&s->dev, "Remove MSI-X MMIO mapping failed! (rc: %d)\n",
-                   rc);
+        rc = pt_iomem_helper(s, i, e_phys, e_size, DPCI_ADD_MAPPING);
+
+        if (rc) {
+            PT_ERR(&s->dev, "create new mapping failed! (rc: %i)\n", rc);
+            return;
         }
 
         if (old_ebase != e_phys && old_ebase != -1) {
diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
index 3d41e04..f36e0d2 100644
--- a/hw/xen_pci_passthrough.h
+++ b/hw/xen_pci_passthrough.h
@@ -30,6 +30,9 @@ void pt_log(const PCIDevice *d, const char *f, ...) GCC_FMT_ATTR(2, 3);
 #endif
 
 
+/* Helper */
+#define PFN(x) ((x) >> XC_PAGE_SHIFT)
+
 typedef struct XenPTRegInfo XenPTRegInfo;
 typedef struct XenPTReg XenPTReg;
 
@@ -295,7 +298,11 @@ void pt_msix_delete(XenPCIPassthroughState *s);
 int pt_msix_update(XenPCIPassthroughState *s);
 int pt_msix_update_remap(XenPCIPassthroughState *s, int bar_index);
 void pt_msix_disable(XenPCIPassthroughState *s);
-int pt_add_msix_mapping(XenPCIPassthroughState *s, int bar_index);
-int pt_remove_msix_mapping(XenPCIPassthroughState *s, int bar_index);
+
+static inline bool pt_has_msix_mapping(XenPCIPassthroughState *s, int bar)
+{
+    return s->msix && s->msix->bar_index == bar;
+}
+
 
 #endif /* !QEMU_HW_XEN_PCI_PASSTHROUGH_H */
diff --git a/hw/xen_pci_passthrough_msi.c b/hw/xen_pci_passthrough_msi.c
index e848c61..e775d21 100644
--- a/hw/xen_pci_passthrough_msi.c
+++ b/hw/xen_pci_passthrough_msi.c
@@ -300,16 +300,6 @@ static int msix_set_enable(XenPCIPassthroughState *s, bool enabled)
                            enabled);
 }
 
-static void mask_physical_msix_entry(XenPCIPassthroughState *s,
-                                     int entry_nr, int mask)
-{
-    void *phys_off;
-
-    phys_off = s->msix->phys_iomem_base + PCI_MSIX_ENTRY_SIZE * entry_nr
-        + PCI_MSIX_ENTRY_VECTOR_CTRL;
-    *(uint32_t *)phys_off = mask;
-}
-
 static int pt_msix_update_one(XenPCIPassthroughState *s, int entry_nr)
 {
     XenPTMSIXEntry *entry = NULL;
@@ -480,8 +470,6 @@ static void pci_msix_write(void *opaque, target_phys_addr_t addr,
         if (msix->enabled && !(val & PCI_MSIX_ENTRY_CTRL_MASKBIT)) {
             pt_msix_update_one(s, entry_nr);
         }
-        mask_physical_msix_entry(s, entry_nr,
-            entry->vector_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT);
     }
 }
 
@@ -493,14 +481,19 @@ static uint64_t pci_msix_read(void *opaque, target_phys_addr_t addr,
     int entry_nr, offset;
 
     entry_nr = addr / PCI_MSIX_ENTRY_SIZE;
-    if (entry_nr < 0 || entry_nr >= msix->total_entries) {
+    if (entry_nr < 0) {
         PT_ERR(&s->dev, "asked MSI-X entry '%i' invalid!\n", entry_nr);
         return 0;
     }
 
     offset = addr % PCI_MSIX_ENTRY_SIZE;
 
-    return get_entry_value(&msix->msix_entry[entry_nr], offset);
+    if (addr < msix->total_entries * PCI_MSIX_ENTRY_SIZE) {
+        return get_entry_value(&msix->msix_entry[entry_nr], offset);
+    } else {
+        /* Pending Bit Array (PBA) */
+        return *(uint32_t *)(msix->phys_iomem_base + addr);
+    }
 }
 
 static const MemoryRegionOps pci_msix_ops = {
@@ -514,45 +507,6 @@ static const MemoryRegionOps pci_msix_ops = {
     },
 };
 
-int pt_add_msix_mapping(XenPCIPassthroughState *s, int bar_index)
-{
-    XenPTMSIX *msix = s->msix;
-
-    if (!(s->msix && s->msix->bar_index == bar_index)) {
-        return 0;
-    }
-
-    memory_region_set_enabled(&msix->mmio, false);
-    return xc_domain_memory_mapping(xen_xc, xen_domid,
-         s->msix->mmio_base_addr >> XC_PAGE_SHIFT,
-         (s->bases[bar_index].access.maddr + s->msix->table_off)
-           >> XC_PAGE_SHIFT,
-         (s->msix->total_entries * PCI_MSIX_ENTRY_SIZE + XC_PAGE_SIZE - 1)
-           >> XC_PAGE_SHIFT,
-         DPCI_ADD_MAPPING);
-}
-
-int pt_remove_msix_mapping(XenPCIPassthroughState *s, int bar_index)
-{
-    XenPTMSIX *msix = s->msix;
-
-    if (!(msix && msix->bar_index == bar_index)) {
-        return 0;
-    }
-
-    memory_region_set_enabled(&msix->mmio, true);
-
-    msix->mmio_base_addr = s->bases[bar_index].e_physbase + msix->table_off;
-
-    return xc_domain_memory_mapping(xen_xc, xen_domid,
-         s->msix->mmio_base_addr >> XC_PAGE_SHIFT,
-         (s->bases[bar_index].access.maddr + s->msix->table_off)
-             >> XC_PAGE_SHIFT,
-         (s->msix->total_entries * PCI_MSIX_ENTRY_SIZE + XC_PAGE_SIZE - 1)
-             >> XC_PAGE_SHIFT,
-         DPCI_REMOVE_MAPPING);
-}
-
 int pt_msix_init(XenPCIPassthroughState *s, uint32_t base)
 {
     uint8_t id = 0;
@@ -609,7 +563,7 @@ int pt_msix_init(XenPCIPassthroughState *s, uint32_t base)
     msix->phys_iomem_base =
         mmap(NULL,
              total_entries * PCI_MSIX_ENTRY_SIZE + msix->table_offset_adjust,
-             PROT_WRITE | PROT_READ,
+             PROT_READ,
              MAP_SHARED | MAP_LOCKED,
              fd,
              msix->table_base + table_off - msix->table_offset_adjust);
-- 
Anthony PERARD

^ permalink raw reply related


This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.