LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 18/21] Replace pci_dn with eeh_dev for EEH aux components
From: Gavin Shan @ 2012-02-24  9:38 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel.crashing.org, shangw
In-Reply-To: <1330076298-7006-1-git-send-email-shangw@linux.vnet.ibm.com>

The original EEH implementation is heavily depending on struct pci_dn.
We have to put EEH related information to pci_dn. Actually, we could
split struct pci_dn so that the EEH sensitive information to form an
individual struct, then EEH looks more independent.

The patch replaces pci_dn with eeh_dev for EEH aux components like
event and driver. Also, the eeh_event struct has been adjusted for
a little bit since eeh_dev has linked the associated FDT (Flat Device
Tree) node and PCI device. It's not necessary for eeh_event struct to
trace FDT node and PCI device. We can just simply to trace eeh_dev in
eeh_event.

The patch also renames function pcid_name() to eeh_pcid_name(), which
should be missed in the previous patch where the EEH aux components
have been cleaned up.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh_event.h        |    7 +-
 arch/powerpc/platforms/pseries/eeh.c        |    2 +-
 arch/powerpc/platforms/pseries/eeh_driver.c |   81 +++++++++++++--------------
 arch/powerpc/platforms/pseries/eeh_event.c  |   36 ++++++------
 4 files changed, 63 insertions(+), 63 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh_event.h b/arch/powerpc/include/asm/eeh_event.h
index 25ebf6a..c68b012 100644
--- a/arch/powerpc/include/asm/eeh_event.h
+++ b/arch/powerpc/include/asm/eeh_event.h
@@ -28,12 +28,11 @@
  */
 struct eeh_event {
 	struct list_head	list;	/* to form event queue	*/
-	struct device_node	*dn;	/* struct device node	*/
-	struct pci_dev		*dev;	/* affected device	*/
+	struct eeh_dev		*edev;	/* EEH device		*/
 };
 
-int eeh_send_failure_event(struct device_node *dn, struct pci_dev *dev);
-struct pci_dn *handle_eeh_events(struct eeh_event *);
+int eeh_send_failure_event(struct eeh_dev *edev);
+struct eeh_dev *handle_eeh_events(struct eeh_event *);
 
 #endif /* __KERNEL__ */
 #endif /* ASM_POWERPC_EEH_EVENT_H */
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 84a8a0c..759d5af 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -475,7 +475,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	eeh_mark_slot(dn, EEH_MODE_ISOLATED);
 	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
 
-	eeh_send_failure_event(edev->dn, edev->pdev);
+	eeh_send_failure_event(edev);
 
 	/* Most EEH events are due to device driver bugs.  Having
 	 * a stack trace will help the device-driver authors figure
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
index 3f25fab..3bf1d10 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -40,7 +40,7 @@
  * This routine is used to retrieve the name of PCI device driver
  * if that's valid.
  */
-static inline const char *pcid_name(struct pci_dev *pdev)
+static inline const char *eeh_pcid_name(struct pci_dev *pdev)
 {
 	if (pdev && pdev->dev.driver)
 		return pdev->dev.driver->name;
@@ -81,7 +81,7 @@ static void print_device_node_tree(struct pci_dn *pdn, int dent)
  */
 static void eeh_disable_irq(struct pci_dev *dev)
 {
-	struct device_node *dn = pci_device_to_OF_node(dev);
+	struct eeh_dev *edev = PCI_DEV_TO_EEH_DEV(dev);
 
 	/* Don't disable MSI and MSI-X interrupts. They are
 	 * effectively disabled by the DMA Stopped state
@@ -93,7 +93,7 @@ static void eeh_disable_irq(struct pci_dev *dev)
 	if (!irq_has_action(dev->irq))
 		return;
 
-	PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED;
+	edev->mode |= EEH_MODE_IRQ_DISABLED;
 	disable_irq_nosync(dev->irq);
 }
 
@@ -106,10 +106,10 @@ static void eeh_disable_irq(struct pci_dev *dev)
  */
 static void eeh_enable_irq(struct pci_dev *dev)
 {
-	struct device_node *dn = pci_device_to_OF_node(dev);
+	struct eeh_dev *edev = PCI_DEV_TO_EEH_DEV(dev);
 
-	if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) {
-		PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED;
+	if ((edev->mode) & EEH_MODE_IRQ_DISABLED) {
+		edev->mode &= ~EEH_MODE_IRQ_DISABLED;
 		enable_irq(dev->irq);
 	}
 }
@@ -270,20 +270,20 @@ static int eeh_report_failure(struct pci_dev *dev, void *userdata)
 
 /**
  * eeh_reset_device - Perform actual reset of a pci slot
- * @pe_dn: PE associated device node
+ * @edev: PE associated EEH device
  * @bus: PCI bus corresponding to the isolcated slot
  *
  * This routine must be called to do reset on the indicated PE.
  * During the reset, udev might be invoked because those affected
  * PCI devices will be removed and then added.
  */
-static int eeh_reset_device(struct pci_dn *pe_dn, struct pci_bus *bus)
+static int eeh_reset_device(struct eeh_dev *edev, struct pci_bus *bus)
 {
 	struct device_node *dn;
 	int cnt, rc;
 
 	/* pcibios will clear the counter; save the value */
-	cnt = pe_dn->eeh_freeze_count;
+	cnt = edev->freeze_count;
 
 	if (bus)
 		pcibios_remove_pci_devices(bus);
@@ -292,21 +292,22 @@ static int eeh_reset_device(struct pci_dn *pe_dn, struct pci_bus *bus)
 	 * Reconfigure bridges and devices. Don't try to bring the system
 	 * up if the reset failed for some reason.
 	 */
-	rc = eeh_reset_pe(pe_dn);
+	rc = eeh_reset_pe(edev);
 	if (rc)
 		return rc;
 
 	/* Walk over all functions on this device. */
-	dn = pe_dn->node;
-	if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
+	dn = EEH_DEV_TO_OF_NODE(edev);
+	if (!pcibios_find_pci_bus(dn) && OF_NODE_TO_EEH_DEV(dn->parent))
 		dn = dn->parent->child;
 
 	while (dn) {
-		struct pci_dn *ppe = PCI_DN(dn);
+		struct eeh_dev *pedev = OF_NODE_TO_EEH_DEV(dn);
+
 		/* On Power4, always true because eeh_pe_config_addr=0 */
-		if (pe_dn->eeh_pe_config_addr == ppe->eeh_pe_config_addr) {
+		if (edev->pe_config_addr == pedev->pe_config_addr) {
 			eeh_ops->configure_bridge(dn);
-			eeh_restore_bars(ppe);
+			eeh_restore_bars(pedev);
  		}
 		dn = dn->sibling;
 	}
@@ -321,7 +322,7 @@ static int eeh_reset_device(struct pci_dn *pe_dn, struct pci_bus *bus)
 		ssleep(5);
 		pcibios_add_pci_devices(bus);
 	}
-	pe_dn->eeh_freeze_count = cnt;
+	edev->freeze_count = cnt;
 
 	return 0;
 }
@@ -348,23 +349,22 @@ static int eeh_reset_device(struct pci_dn *pe_dn, struct pci_bus *bus)
  * drivers (which cause a second set of hotplug events to go out to
  * userspace).
  */
-struct pci_dn *handle_eeh_events(struct eeh_event *event)
+struct eeh_dev *handle_eeh_events(struct eeh_event *event)
 {
 	struct device_node *frozen_dn;
-	struct pci_dn *frozen_pdn;
+	struct eeh_dev *frozen_edev;
 	struct pci_bus *frozen_bus;
 	int rc = 0;
 	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
 	const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str;
 
-	frozen_dn = eeh_find_device_pe(event->dn);
+	frozen_dn = eeh_find_device_pe(EEH_DEV_TO_OF_NODE(event->edev));
 	if (!frozen_dn) {
-
-		location = of_get_property(event->dn, "ibm,loc-code", NULL);
+		location = of_get_property(EEH_DEV_TO_OF_NODE(event->edev), "ibm,loc-code", NULL);
 		location = location ? location : "unknown";
 		printk(KERN_ERR "EEH: Error: Cannot find partition endpoint "
 		                "for location=%s pci addr=%s\n",
-		        location, eeh_pci_name(event->dev));
+			location, eeh_pci_name(EEH_DEV_TO_PCI_DEV(event->edev)));
 		return NULL;
 	}
 
@@ -389,22 +389,21 @@ struct pci_dn *handle_eeh_events(struct eeh_event *event)
 		return NULL;
 	}
 
-	frozen_pdn = PCI_DN(frozen_dn);
-	frozen_pdn->eeh_freeze_count++;
+	frozen_edev = OF_NODE_TO_EEH_DEV(frozen_dn);
+	frozen_edev->freeze_count++;
+	pci_str = eeh_pci_name(EEH_DEV_TO_PCI_DEV(event->edev));
+	drv_str = eeh_pcid_name(EEH_DEV_TO_PCI_DEV(event->edev));
 
-	pci_str = eeh_pci_name(event->dev);
-	drv_str = pcid_name(event->dev);
-	
-	if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES)
+	if (frozen_edev->freeze_count > EEH_MAX_ALLOWED_FREEZES)
 		goto excess_failures;
 
 	printk(KERN_WARNING
 	   "EEH: This PCI device has failed %d times in the last hour:\n",
-		frozen_pdn->eeh_freeze_count);
+		frozen_edev->freeze_count);
 
-	if (frozen_pdn->pcidev) {
-		bus_pci_str = pci_name(frozen_pdn->pcidev);
-		bus_drv_str = pcid_name(frozen_pdn->pcidev);
+	if (frozen_edev->pdev) {
+		bus_pci_str = pci_name(frozen_edev->pdev);
+		bus_drv_str = eeh_pcid_name(frozen_edev->pdev);
 		printk(KERN_WARNING
 			"EEH: Bus location=%s driver=%s pci addr=%s\n",
 			location, bus_drv_str, bus_pci_str);
@@ -425,7 +424,7 @@ struct pci_dn *handle_eeh_events(struct eeh_event *event)
 	/* Get the current PCI slot state. This can take a long time,
 	 * sometimes over 3 seconds for certain systems.
 	 */
-	rc = eeh_ops->wait_state(frozen_pdn->node, MAX_WAIT_FOR_RECOVERY*1000);
+	rc = eeh_ops->wait_state(EEH_DEV_TO_OF_NODE(frozen_edev), MAX_WAIT_FOR_RECOVERY*1000);
 	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
 		printk(KERN_WARNING "EEH: Permanent failure\n");
 		goto hard_fail;
@@ -435,14 +434,14 @@ struct pci_dn *handle_eeh_events(struct eeh_event *event)
 	 * don't post the error log until after all dev drivers
 	 * have been informed.
 	 */
-	eeh_slot_error_detail(frozen_pdn, EEH_LOG_TEMP);
+	eeh_slot_error_detail(frozen_edev, EEH_LOG_TEMP);
 
 	/* If all device drivers were EEH-unaware, then shut
 	 * down all of the device drivers, and hope they
 	 * go down willingly, without panicing the system.
 	 */
 	if (result == PCI_ERS_RESULT_NONE) {
-		rc = eeh_reset_device(frozen_pdn, frozen_bus);
+		rc = eeh_reset_device(frozen_edev, frozen_bus);
 		if (rc) {
 			printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
 			goto hard_fail;
@@ -451,7 +450,7 @@ struct pci_dn *handle_eeh_events(struct eeh_event *event)
 
 	/* If all devices reported they can proceed, then re-enable MMIO */
 	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
-		rc = eeh_pci_enable(frozen_pdn, EEH_OPT_THAW_MMIO);
+		rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_MMIO);
 
 		if (rc < 0)
 			goto hard_fail;
@@ -465,7 +464,7 @@ struct pci_dn *handle_eeh_events(struct eeh_event *event)
 
 	/* If all devices reported they can proceed, then re-enable DMA */
 	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
-		rc = eeh_pci_enable(frozen_pdn, EEH_OPT_THAW_DMA);
+		rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_DMA);
 
 		if (rc < 0)
 			goto hard_fail;
@@ -483,7 +482,7 @@ struct pci_dn *handle_eeh_events(struct eeh_event *event)
 
 	/* If any device called out for a reset, then reset the slot */
 	if (result == PCI_ERS_RESULT_NEED_RESET) {
-		rc = eeh_reset_device(frozen_pdn, NULL);
+		rc = eeh_reset_device(frozen_edev, NULL);
 		if (rc) {
 			printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
 			goto hard_fail;
@@ -502,7 +501,7 @@ struct pci_dn *handle_eeh_events(struct eeh_event *event)
 	/* Tell all device drivers that they can resume operations */
 	pci_walk_bus(frozen_bus, eeh_report_resume, NULL);
 
-	return frozen_pdn;
+	return frozen_edev;
 	
 excess_failures:
 	/*
@@ -515,7 +514,7 @@ excess_failures:
 		"has failed %d times in the last hour "
 		"and has been permanently disabled.\n"
 		"Please try reseating this device or replacing it.\n",
-		location, drv_str, pci_str, frozen_pdn->eeh_freeze_count);
+		location, drv_str, pci_str, frozen_edev->freeze_count);
 	goto perm_error;
 
 hard_fail:
@@ -526,7 +525,7 @@ hard_fail:
 		location, drv_str, pci_str);
 
 perm_error:
-	eeh_slot_error_detail(frozen_pdn, EEH_LOG_PERM);
+	eeh_slot_error_detail(frozen_edev, EEH_LOG_PERM);
 
 	/* Notify all devices that they're about to go down. */
 	pci_walk_bus(frozen_bus, eeh_report_failure, NULL);
diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c
index e98347c..ba860bd 100644
--- a/arch/powerpc/platforms/pseries/eeh_event.c
+++ b/arch/powerpc/platforms/pseries/eeh_event.c
@@ -56,8 +56,8 @@ DEFINE_MUTEX(eeh_event_mutex);
 static int eeh_event_handler(void * dummy)
 {
 	unsigned long flags;
-	struct eeh_event	*event;
-	struct pci_dn *pdn;
+	struct eeh_event *event;
+	struct eeh_dev *edev;
 
 	daemonize("eehd");
 	set_current_state(TASK_INTERRUPTIBLE);
@@ -77,23 +77,26 @@ static int eeh_event_handler(void * dummy)
 
 	/* Serialize processing of EEH events */
 	mutex_lock(&eeh_event_mutex);
-	eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);
+	edev = event->edev;
+	eeh_mark_slot(EEH_DEV_TO_OF_NODE(edev), EEH_MODE_RECOVERING);
 
 	printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
-	       eeh_pci_name(event->dev));
+	       eeh_pci_name(edev->pdev));
+
+	edev = handle_eeh_events(event);
 
-	pdn = handle_eeh_events(event);
+	eeh_clear_slot(EEH_DEV_TO_OF_NODE(edev), EEH_MODE_RECOVERING);
+	pci_dev_put(edev->pdev);
 
-	eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);
-	pci_dev_put(event->dev);
 	kfree(event);
 	mutex_unlock(&eeh_event_mutex);
 
 	/* If there are no new errors after an hour, clear the counter. */
-	if (pdn && pdn->eeh_freeze_count>0) {
+	if (edev && edev->freeze_count>0) {
 		msleep_interruptible(3600*1000);
-		if (pdn->eeh_freeze_count>0)
-			pdn->eeh_freeze_count--;
+		if (edev->freeze_count>0)
+			edev->freeze_count--;
+
 	}
 
 	return 0;
@@ -114,17 +117,17 @@ static void eeh_thread_launcher(struct work_struct *dummy)
 
 /**
  * eeh_send_failure_event - Generate a PCI error event
- * @dev: pci device
+ * @edev: EEH device
  *
  * This routine can be called within an interrupt context;
  * the actual event will be delivered in a normal context
  * (from a workqueue).
  */
-int eeh_send_failure_event(struct device_node *dn,
-                            struct pci_dev *dev)
+int eeh_send_failure_event(struct eeh_dev *edev)
 {
 	unsigned long flags;
 	struct eeh_event *event;
+	struct device_node *dn = EEH_DEV_TO_OF_NODE(edev);
 	const char *location;
 
 	if (!mem_init_done) {
@@ -140,11 +143,10 @@ int eeh_send_failure_event(struct device_node *dn,
 		return 1;
  	}
 
-	if (dev)
-		pci_dev_get(dev);
+	if (edev->pdev)
+		pci_dev_get(edev->pdev);
 
-	event->dn = dn;
-	event->dev = dev;
+	event->edev = edev;
 
 	/* We may or may not be called in an interrupt context */
 	spin_lock_irqsave(&eeh_eventlist_lock, flags);
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 11/21] pSeries platform EEH configure bridge
From: Gavin Shan @ 2012-02-24  9:38 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel.crashing.org, shangw
In-Reply-To: <1330076298-7006-1-git-send-email-shangw@linux.vnet.ibm.com>

In order to enable particular PCI device, which has been included
in the parent PE. The involved PCI bridges should be enabled explicitly
if there has. On pSeries platform, there're dedicated RTAS calls
to fulfil the purpose.

The patch implements the function of configuring PCI bridges through
the dedicated RTAS calls. Besides, the function has been abstracted
by struct eeh_ops::configure_bridge so that the EEH core components
could support multiple platforms in future.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/ppc-pci.h           |    1 -
 arch/powerpc/platforms/pseries/eeh.c         |   44 +-------------------------
 arch/powerpc/platforms/pseries/eeh_driver.c  |    2 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c |   29 ++++++++++++++++-
 4 files changed, 30 insertions(+), 46 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index bd1a84f..b4b18d8 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -57,7 +57,6 @@ void eeh_slot_error_detail (struct pci_dn *pdn, int severity);
 int eeh_pci_enable(struct pci_dn *pdn, int function);
 int eeh_reset_pe(struct pci_dn *);
 void eeh_restore_bars(struct pci_dn *);
-void eeh_configure_bridge(struct pci_dn *);
 int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
 void eeh_mark_slot(struct device_node *dn, int mode_flag);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 39fcecb..bd4ed83 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -86,10 +86,6 @@
 /* Time to wait for a PCI slot to report status, in milliseconds */
 #define PCI_BUS_RESET_WAIT_MSEC (60*1000)
 
-/* RTAS tokens */
-static int ibm_configure_bridge;
-static int ibm_configure_pe;
-
 /* Platform dependent EEH operations */
 struct eeh_ops *eeh_ops = NULL;
 
@@ -229,7 +225,7 @@ void eeh_slot_error_detail(struct pci_dn *pdn, int severity)
 	pci_regs_buf[0] = 0;
 
 	eeh_pci_enable(pdn, EEH_OPT_THAW_MMIO);
-	eeh_configure_bridge(pdn);
+	eeh_ops->configure_bridge(pdn->node);
 	eeh_restore_bars(pdn);
 	loglen = eeh_gather_pci_data(pdn, pci_regs_buf, EEH_PCI_REGS_LOG_LEN);
 
@@ -810,41 +806,6 @@ static void eeh_save_bars(struct pci_dn *pdn)
 }
 
 /**
- * eeh_configure_bridge - Configure PCI bridges for the indicated PE
- * @pdn: PCI device node
- *
- * PCI bridges might be included in PE. In order to make the PE work
- * again. The included PCI bridges should be recovered after the PE
- * encounters frozen state.
- */
-void eeh_configure_bridge(struct pci_dn *pdn)
-{
-	int config_addr;
-	int rc;
-	int token;
-
-	/* Use PE configuration address, if present */
-	config_addr = pdn->eeh_config_addr;
-	if (pdn->eeh_pe_config_addr)
-		config_addr = pdn->eeh_pe_config_addr;
-
-	/* Use new configure-pe function, if supported */
-	if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE)
-		token = ibm_configure_pe;
-	else
-		token = ibm_configure_bridge;
-
-	rc = rtas_call(token, 3, 1, NULL,
-	               config_addr,
-	               BUID_HI(pdn->phb->buid),
-	               BUID_LO(pdn->phb->buid));
-	if (rc) {
-		printk(KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n",
-		        rc, pdn->node->full_name);
-	}
-}
-
-/**
  * eeh_early_enable - Early enable EEH on the indicated device
  * @dn: device node
  * @data: BUID
@@ -1027,9 +988,6 @@ void __init eeh_init(void)
 	if (np == NULL)
 		return;
 
-	ibm_configure_bridge = rtas_token("ibm,configure-bridge");
-	ibm_configure_pe = rtas_token("ibm,configure-pe");
-
 	/* Enable EEH for all adapters.  Note that eeh requires buid's */
 	for (phb = of_find_node_by_name(NULL, "pci"); phb;
 	     phb = of_find_node_by_name(phb, "pci")) {
diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c
index 6840357..61450e1 100644
--- a/arch/powerpc/platforms/pseries/eeh_driver.c
+++ b/arch/powerpc/platforms/pseries/eeh_driver.c
@@ -295,7 +295,7 @@ static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus)
 		struct pci_dn *ppe = PCI_DN(dn);
 		/* On Power4, always true because eeh_pe_config_addr=0 */
 		if (pe_dn->eeh_pe_config_addr == ppe->eeh_pe_config_addr) {
-			eeh_configure_bridge(ppe);
+			eeh_ops->configure_bridge(dn);
 			eeh_restore_bars(ppe);
  		}
 		dn = dn->sibling;
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 7c8434f..4ed06b2 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -473,7 +473,34 @@ static int pseries_eeh_get_log(struct device_node *dn, int severity, char *drv_l
  */
 static int pseries_eeh_configure_bridge(struct device_node *dn)
 {
-	return 0;
+	struct pci_dn *pdn;
+	int config_addr;
+	int ret;
+
+	/* Figure out the PE address */
+	pdn = PCI_DN(dn);
+	config_addr = pdn->eeh_config_addr;
+	if (pdn->eeh_pe_config_addr)
+		config_addr = pdn->eeh_pe_config_addr;
+
+	/* Use new configure-pe function, if supported */
+	if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
+				config_addr, BUID_HI(pdn->phb->buid),
+				BUID_LO(pdn->phb->buid));
+	} else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_configure_bridge, 3, 1, NULL,
+				config_addr, BUID_HI(pdn->phb->buid),
+				BUID_LO(pdn->phb->buid));
+	} else {
+		return -EFAULT;
+	}
+
+	if (ret)
+		pr_warning("%s: Unable to configure bridge %d for %s\n",
+			__func__, ret, dn->full_name);
+
+	return ret;
 }
 
 static struct eeh_ops pseries_eeh_ops = {
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 14/21] Introduce EEH device
From: Gavin Shan @ 2012-02-24  9:38 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel.crashing.org, shangw
In-Reply-To: <1330076298-7006-1-git-send-email-shangw@linux.vnet.ibm.com>

Original EEH implementation depends on struct pci_dn heavily. However,
EEH shouldn't depend on that actually because EEH needn't share much
information with other PCI components. That's to say, EEH should have
worked independently.

The patch introduces struct eeh_dev so that EEH core components needn't
be working based on struct pci_dn in future. Also, struct pci_dn, struct
eeh_dev instances are created in dynamic fasion and the binding with EEH
device, OF node, PCI device is implemented as well.

The EEH devices are created after PHBs are detected and initialized, but
PCI emunation hasn't started yet. Apart from that, PHB might be created
dynamically through DLPAR component and the EEH devices should be creatd
as well. Another case might be OF node is created dynamically by DR
(Dynamic Reconfiguration), which has been defined by PAPR. For those OF
nodes created by DR, EEH devices should be also created accordingly. The
binding between EEH device and OF node is done while the EEH device is
initially created.

The binding between EEH device and PCI device should be done after PCI
emunation is done. Besides, PCI hotplug also needs the binding so that
the EEH devices could be traced from the newly coming PCI buses or PCI
devices.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/device.h          |    3 ++
 arch/powerpc/include/asm/eeh.h             |   51 ++++++++++++++++++++++++----
 arch/powerpc/kernel/of_platform.c          |    3 ++
 arch/powerpc/kernel/rtas_pci.c             |    3 ++
 arch/powerpc/platforms/pseries/Makefile    |    3 +-
 arch/powerpc/platforms/pseries/pci_dlpar.c |    3 ++
 arch/powerpc/platforms/pseries/setup.c     |    6 +++-
 include/linux/of.h                         |    3 ++
 8 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
index d57c08a..4668344 100644
--- a/arch/powerpc/include/asm/device.h
+++ b/arch/powerpc/include/asm/device.h
@@ -31,6 +31,9 @@ struct dev_archdata {
 #ifdef CONFIG_SWIOTLB
 	dma_addr_t		max_direct_dma_addr;
 #endif
+#ifdef CONFIG_EEH
+	void			*edev;
+#endif
 };
 
 struct pdev_archdata {
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index ad8f318..1310971 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -32,6 +32,37 @@ struct device_node;
 #ifdef CONFIG_EEH
 
 /*
+ * The struct is used to trace EEH state for the associated
+ * PCI device node or PCI device. In future, it might
+ * represent PE as well so that the EEH device to form
+ * another tree except the currently existing tree of PCI
+ * buses and PCI devices
+ */
+#define EEH_MODE_SUPPORTED	(1<<0)	/* EEH supported on the device	*/
+#define EEH_MODE_NOCHECK	(1<<1)	/* EEH check should be skipped	*/
+#define EEH_MODE_ISOLATED	(1<<2)	/* The device has been isolated	*/
+#define EEH_MODE_RECOVERING	(1<<3)	/* Recovering the device	*/
+#define EEH_MODE_IRQ_DISABLED	(1<<4)	/* Interrupt disabled		*/
+
+struct eeh_dev {
+	int mode;			/* EEH mode			*/
+	int class_code;			/* Class code of the device	*/
+	int config_addr;		/* Config address		*/
+	int pe_config_addr;		/* PE config address		*/
+	int check_count;		/* Times of ignored error	*/
+	int freeze_count;		/* Times of froze up		*/
+	int false_positives;		/* Times of reported #ff's	*/
+	u32 config_space[16];		/* Saved PCI config space	*/
+	struct pci_controller *phb;	/* Associated PHB		*/
+	struct device_node *dn;		/* Associated device node	*/
+	struct pci_dev *pdev;		/* Associated PCI device	*/
+};
+#define EEH_DEV_TO_OF_NODE(edev)	(edev->dn)
+#define EEH_DEV_TO_PCI_DEV(edev)	(edev->pdev)
+#define OF_NODE_TO_EEH_DEV(dn)		((struct eeh_dev *)(dn->edev))
+#define PCI_DEV_TO_EEH_DEV(pdev)	((struct eeh_dev *)(pdev->dev.archdata.edev))
+
+/*
  * The struct is used to trace the registered EEH operation
  * callback functions. Actually, those operation callback
  * functions are heavily platform dependent. That means the
@@ -70,19 +101,15 @@ struct eeh_ops {
 extern struct eeh_ops *eeh_ops;
 extern int eeh_subsystem_enabled;
 
-/* Values for eeh_mode bits in device_node */
-#define EEH_MODE_SUPPORTED     (1<<0)
-#define EEH_MODE_NOCHECK       (1<<1)
-#define EEH_MODE_ISOLATED      (1<<2)
-#define EEH_MODE_RECOVERING    (1<<3)
-#define EEH_MODE_IRQ_DISABLED  (1<<4)
-
 /*
  * Max number of EEH freezes allowed before we consider the device
  * to be permanently disabled.
  */
 #define EEH_MAX_ALLOWED_FREEZES 5
 
+void * __devinit eeh_dev_init(struct device_node *dn, void *data);
+void __devinit eeh_dev_phb_init_dynamic(struct pci_controller *phb);
+void __init eeh_dev_phb_init(void);
 void __init eeh_init(void);
 #ifdef CONFIG_PPC_PSERIES
 int __init eeh_pseries_init(void);
@@ -113,6 +140,16 @@ void eeh_remove_bus_device(struct pci_dev *);
 #define EEH_IO_ERROR_VALUE(size)	(~0U >> ((4 - (size)) * 8))
 
 #else /* !CONFIG_EEH */
+
+static inline void *eeh_dev_init(struct device_node *dn, void *data)
+{
+	return NULL;
+}
+
+static inline void eeh_dev_phb_init_dynamic(struct pci_controller *phb) { }
+
+static inline void eeh_dev_phb_init(void) { }
+
 static inline void eeh_init(void) { }
 
 #ifdef CONFIG_PPC_PSERIES
diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c
index e1612df..9239c3a 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -66,6 +66,9 @@ static int __devinit of_pci_phb_probe(struct platform_device *dev)
 	/* Init pci_dn data structures */
 	pci_devs_phb_init_dynamic(phb);
 
+	/* Create EEH devices for the PHB */
+	eeh_dev_phb_init_dynamic(phb);
+
 	/* Register devices with EEH */
 #ifdef CONFIG_EEH
 	if (dev->dev.of_node->child)
diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c
index 6cd8f01..517bd86 100644
--- a/arch/powerpc/kernel/rtas_pci.c
+++ b/arch/powerpc/kernel/rtas_pci.c
@@ -275,6 +275,9 @@ void __init find_and_init_phbs(void)
 	of_node_put(root);
 	pci_devs_phb_init();
 
+	/* Create EEH devices for all PHBs */
+	eeh_dev_phb_init();
+
 	/*
 	 * pci_probe_only and pci_assign_all_buses can be set via properties
 	 * in chosen.
diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile
index 9aa5581..12dae0b 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -6,7 +6,8 @@ obj-y			:= lpar.o hvCall.o nvram.o reconfig.o \
 			   firmware.o power.o dlpar.o mobility.o
 obj-$(CONFIG_SMP)	+= smp.o
 obj-$(CONFIG_SCANLOG)	+= scanlog.o
-obj-$(CONFIG_EEH)	+= eeh.o eeh_cache.o eeh_driver.o eeh_event.o eeh_sysfs.o eeh_pseries.o
+obj-$(CONFIG_EEH)	+= eeh.o eeh_dev.o eeh_cache.o eeh_driver.o \
+			   eeh_event.o eeh_sysfs.o eeh_pseries.o
 obj-$(CONFIG_KEXEC)	+= kexec.o
 obj-$(CONFIG_PCI)	+= pci.o pci_dlpar.o
 obj-$(CONFIG_PSERIES_MSI)	+= msi.o
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c
index 55d4ec1..fbb21fc 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -147,6 +147,9 @@ struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn)
 
 	pci_devs_phb_init_dynamic(phb);
 
+	/* Create EEH devices for the PHB */
+	eeh_dev_phb_init_dynamic(phb);
+
 	if (dn->child)
 		eeh_add_device_tree_early(dn);
 
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 809d9d9..60f9462 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -261,8 +261,12 @@ static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long act
 	switch (action) {
 	case PSERIES_RECONFIG_ADD:
 		pci = np->parent->data;
-		if (pci)
+		if (pci) {
 			update_dn_pci_info(np, pci->phb);
+
+			/* Create EEH device for the OF node */
+			eeh_dev_init(np, pci->phb);
+		}
 		break;
 	default:
 		err = NOTIFY_DONE;
diff --git a/include/linux/of.h b/include/linux/of.h
index a75a831..61863a9 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -58,6 +58,9 @@ struct device_node {
 	struct	kref kref;
 	unsigned long _flags;
 	void	*data;
+#if defined(CONFIG_EEH)
+	void	*edev;
+#endif
 #if defined(CONFIG_SPARC)
 	char	*path_component_name;
 	unsigned int unique_id;
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 04/21] pSeries platform EEH initialization
From: Gavin Shan @ 2012-02-24  9:38 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel.crashing.org, shangw
In-Reply-To: <1330076298-7006-1-git-send-email-shangw@linux.vnet.ibm.com>

The platform specific EEH operations have been abstracted by
struct eeh_ops. The individual platroms, including pSeries, needs
doing necessary initialization before the platform dependent EEH
operations work properly.

The patch is addressing that and do necessary platform initialization
for pSeries platform. More specificly, it will figure out the tokens
of EEH related RTAS calls.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/eeh.c         |   12 ++++++
 arch/powerpc/platforms/pseries/eeh_pseries.c |   55 ++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index b0e3fb0..bb6de6c 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -1279,6 +1279,18 @@ void __init eeh_init(void)
 {
 	struct device_node *phb, *np;
 	struct eeh_early_enable_info info;
+	int ret;
+
+	/* call platform initialization function */
+	if (!eeh_ops) {
+		pr_warning("%s: Platform EEH operation not found\n",
+			__func__);
+		return;
+	} else if ((ret = eeh_ops->init())) {
+		pr_warning("%s: Failed to call platform init function (%d)\n",
+			__func__, ret);
+		return;
+	}
 
 	raw_spin_lock_init(&confirm_error_lock);
 	spin_lock_init(&slot_errbuf_lock);
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 61a9050..1a9410a 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -45,6 +45,17 @@
 #include <asm/ppc-pci.h>
 #include <asm/rtas.h>
 
+/* RTAS tokens */
+static int ibm_set_eeh_option;
+static int ibm_set_slot_reset;
+static int ibm_read_slot_reset_state;
+static int ibm_read_slot_reset_state2;
+static int ibm_slot_error_detail;
+static int ibm_get_config_addr_info;
+static int ibm_get_config_addr_info2;
+static int ibm_configure_bridge;
+static int ibm_configure_pe;
+
 /**
  * pseries_eeh_init - EEH platform dependent initialization
  *
@@ -52,6 +63,50 @@
  */
 static int pseries_eeh_init(void)
 {
+	/* figure out EEH RTAS function call tokens */
+	ibm_set_eeh_option		= rtas_token("ibm,set-eeh-option");
+	ibm_set_slot_reset		= rtas_token("ibm,set-slot-reset");
+	ibm_read_slot_reset_state2	= rtas_token("ibm,read-slot-reset-state2");
+	ibm_read_slot_reset_state	= rtas_token("ibm,read-slot-reset-state");
+	ibm_slot_error_detail		= rtas_token("ibm,slot-error-detail");
+	ibm_get_config_addr_info2	= rtas_token("ibm,get-config-addr-info2");
+	ibm_get_config_addr_info	= rtas_token("ibm,get-config-addr-info");
+	ibm_configure_pe		= rtas_token("ibm,configure-pe");
+	ibm_configure_bridge		= rtas_token ("ibm,configure-bridge");
+
+	/* necessary sanity check */
+	if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,set-eeh-option> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm, set-slot-reset> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_read_slot_reset_state2 == RTAS_UNKNOWN_SERVICE &&
+		   ibm_read_slot_reset_state == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,read-slot-reset-state2> and "
+			"<ibm,read-slot-reset-state> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_slot_error_detail == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,slot-error-detail> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_get_config_addr_info2 == RTAS_UNKNOWN_SERVICE &&
+		   ibm_get_config_addr_info == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,get-config-addr-info2> and "
+			"<ibm,get-config-addr-info> invalid\n",
+			__func__);
+		return -EINVAL;
+	} else if (ibm_configure_pe == RTAS_UNKNOWN_SERVICE &&
+		   ibm_configure_bridge == RTAS_UNKNOWN_SERVICE) {
+		pr_warning("%s: RTAS service <ibm,configure-pe> and "
+			"<ibm,configure-bridge> invalid\n",
+			__func__);
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 06/21] pSeries platform EEH PE address retrieval
From: Gavin Shan @ 2012-02-24  9:38 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel.crashing.org, shangw
In-Reply-To: <1330076298-7006-1-git-send-email-shangw@linux.vnet.ibm.com>

There're 2 types of addresses used for EEH operations. The first
one would be BDF (Bus/Device/Function) address which is retrieved
from the reg property of the corresponding FDT node. Another one
is PE address that should be enquired from firmware through RTAS
call on pSeries platform. When issuing EEH operation, the PE address
has precedence over BDF address.

The patch implements retrieving PE address according to the given
BDF address on pSeries platform. Also, the struct eeh_early_enable_info
has been removed since the information can be figured out from
dn->pdn->phb->buid directly and that simplifies the code.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/eeh.c         |   67 +------------------------
 arch/powerpc/platforms/pseries/eeh_pseries.c |   46 +++++++++++++++++-
 2 files changed, 48 insertions(+), 65 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index 70a9617..00797e0 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -91,8 +91,6 @@ static int ibm_set_slot_reset;
 static int ibm_read_slot_reset_state;
 static int ibm_read_slot_reset_state2;
 static int ibm_slot_error_detail;
-static int ibm_get_config_addr_info;
-static int ibm_get_config_addr_info2;
 static int ibm_configure_bridge;
 static int ibm_configure_pe;
 
@@ -1048,56 +1046,6 @@ void eeh_configure_bridge(struct pci_dn *pdn)
 	}
 }
 
-#define EEH_ENABLE 1
-
-struct eeh_early_enable_info {
-	unsigned int buid_hi;
-	unsigned int buid_lo;
-};
-
-/**
- * eeh_get_pe_addr - Retrieve PE address with given BDF address
- * @config_addr: BDF address
- * @info: BUID of the associated PHB
- *
- * There're 2 kinds of addresses existing in EEH core components:
- * BDF address and PE address. Besides, there has dedicated platform
- * dependent function call to retrieve the PE address according to
- * the given BDF address. Further more, we prefer PE address on BDF
- * address in EEH core components.
- */
-static int eeh_get_pe_addr(int config_addr,
-                        struct eeh_early_enable_info *info)
-{
-	unsigned int rets[3];
-	int ret;
-
-	/* Use latest config-addr token on power6 */
-	if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
-		/* Make sure we have a PE in hand */
-		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
-			config_addr, info->buid_hi, info->buid_lo, 1);
-		if (ret || (rets[0]==0))
-			return 0;
-
-		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
-			config_addr, info->buid_hi, info->buid_lo, 0);
-		if (ret)
-			return 0;
-		return rets[0];
-	}
-
-	/* Use older config-addr token on power5 */
-	if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
-		ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
-			config_addr, info->buid_hi, info->buid_lo, 0);
-		if (ret)
-			return 0;
-		return rets[0];
-	}
-	return 0;
-}
-
 /**
  * eeh_early_enable - Early enable EEH on the indicated device
  * @dn: device node
@@ -1110,7 +1058,6 @@ static int eeh_get_pe_addr(int config_addr,
 static void *eeh_early_enable(struct device_node *dn, void *data)
 {
 	unsigned int rets[3];
-	struct eeh_early_enable_info *info = data;
 	int ret;
 	const u32 *class_code = of_get_property(dn, "class-code", NULL);
 	const u32 *vendor_id = of_get_property(dn, "vendor-id", NULL);
@@ -1155,7 +1102,7 @@ static void *eeh_early_enable(struct device_node *dn, void *data)
 			/* If the newer, better, ibm,get-config-addr-info is supported, 
 			 * then use that instead.
 			 */
-			pdn->eeh_pe_config_addr = eeh_get_pe_addr(pdn->eeh_config_addr, info);
+			pdn->eeh_pe_config_addr = eeh_ops->get_pe_addr(dn);
 
 			/* Some older systems (Power4) allow the
 			 * ibm,set-eeh-option call to succeed even on nodes
@@ -1264,7 +1211,6 @@ int __exit eeh_ops_unregister(const char *name)
 void __init eeh_init(void)
 {
 	struct device_node *phb, *np;
-	struct eeh_early_enable_info info;
 	int ret;
 
 	/* call platform initialization function */
@@ -1289,8 +1235,6 @@ void __init eeh_init(void)
 	ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2");
 	ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state");
 	ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");
-	ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info");
-	ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2");
 	ibm_configure_bridge = rtas_token("ibm,configure-bridge");
 	ibm_configure_pe = rtas_token("ibm,configure-pe");
 
@@ -1313,9 +1257,7 @@ void __init eeh_init(void)
 		if (buid == 0 || PCI_DN(phb) == NULL)
 			continue;
 
-		info.buid_lo = BUID_LO(buid);
-		info.buid_hi = BUID_HI(buid);
-		traverse_pci_devices(phb, eeh_early_enable, &info);
+		traverse_pci_devices(phb, eeh_early_enable, NULL);
 	}
 
 	if (eeh_subsystem_enabled)
@@ -1339,7 +1281,6 @@ void __init eeh_init(void)
 static void eeh_add_device_early(struct device_node *dn)
 {
 	struct pci_controller *phb;
-	struct eeh_early_enable_info info;
 
 	if (!dn || !PCI_DN(dn))
 		return;
@@ -1349,9 +1290,7 @@ static void eeh_add_device_early(struct device_node *dn)
 	if (NULL == phb || 0 == phb->buid)
 		return;
 
-	info.buid_hi = BUID_HI(phb->buid);
-	info.buid_lo = BUID_LO(phb->buid);
-	eeh_early_enable(dn, &info);
+	eeh_early_enable(dn, NULL);
 }
 
 /**
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index c48a9e6..2b9543a 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -176,7 +176,51 @@ static int pseries_eeh_set_option(struct device_node *dn, int option)
  */
 static int pseries_eeh_get_pe_addr(struct device_node *dn)
 {
-	return 0;
+	struct pci_dn *pdn;
+	int ret = 0;
+	int rets[3];
+
+	pdn = PCI_DN(dn);
+
+	if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
+		/*
+		 * First of all, we need to make sure there has one PE
+		 * associated with the device. Otherwise, PE address is
+		 * meaningless.
+		 */
+		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+				pdn->eeh_config_addr, BUID_HI(pdn->phb->buid),
+				BUID_LO(pdn->phb->buid), 1);
+		if (ret || (rets[0] == 0))
+			return 0;
+
+		/* Retrieve the associated PE config address */
+		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+				pdn->eeh_config_addr, BUID_HI(pdn->phb->buid),
+				BUID_LO(pdn->phb->buid), 0);
+		if (ret) {
+			pr_warning("%s: Failed to get PE address for %s\n",
+				__func__, dn->full_name);
+			return 0;
+		}
+
+		return rets[0];
+	}
+
+	if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
+		ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
+				pdn->eeh_config_addr, BUID_HI(pdn->phb->buid),
+				BUID_LO(pdn->phb->buid), 0);
+		if (ret) {
+			pr_warning("%s: Failed to get PE address for %s\n",
+				__func__, dn->full_name);
+			return 0;
+		}
+
+		return rets[0];
+	}
+
+	return ret;
 }
 
 /**
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 01/21] Cleanup on comments of EEH core
From: Gavin Shan @ 2012-02-24  9:37 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel.crashing.org, shangw
In-Reply-To: <1330076298-7006-1-git-send-email-shangw@linux.vnet.ibm.com>

The EEH has been implemented on pSeries platform. The original
code looks a little bit nasty. The patch does cleanup on the
current EEH implementation so that it looks more clean.

	* Duplicated comments have been removed from the corresponding
	  header files.
	* Comments have been reorganized so that it looks more clean.
	* The leading comments of functions are adjusted for a little
	  bit so that the result of "make pdfdocs" would be more
	  unified.
	* Function definitions and calls have unified format as "xxx()".
	  That means the format "xxx ()" has been replaced by "xxx()".
	* There're multiple functions implemented for resetting PE. The
	  position of those functions have been move around so that they
	  are adjacent to each other to reflect their relationship.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/eeh.h       |   26 +--
 arch/powerpc/include/asm/ppc-pci.h   |   71 +------
 arch/powerpc/platforms/pseries/eeh.c |  400 +++++++++++++++++++++++-----------
 3 files changed, 276 insertions(+), 221 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 66ea9b8..2328877 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -1,6 +1,6 @@
 /*
- * eeh.h
  * Copyright (C) 2001  Dave Engebretsen & Todd Inglett IBM Corporation.
+ * Copyright 2001-2012 IBM Corporation.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -40,8 +40,10 @@ extern int eeh_subsystem_enabled;
 #define EEH_MODE_RECOVERING    (1<<3)
 #define EEH_MODE_IRQ_DISABLED  (1<<4)
 
-/* Max number of EEH freezes allowed before we consider the device
- * to be permanently disabled. */
+/*
+ * Max number of EEH freezes allowed before we consider the device
+ * to be permanently disabled.
+ */
 #define EEH_MAX_ALLOWED_FREEZES 5
 
 void __init eeh_init(void);
@@ -49,26 +51,8 @@ unsigned long eeh_check_failure(const volatile void __iomem *token,
 				unsigned long val);
 int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev);
 void __init pci_addr_cache_build(void);
-
-/**
- * eeh_add_device_early
- * eeh_add_device_late
- *
- * Perform eeh initialization for devices added after boot.
- * Call eeh_add_device_early before doing any i/o to the
- * device (including config space i/o).  Call eeh_add_device_late
- * to finish the eeh setup for this device.
- */
 void eeh_add_device_tree_early(struct device_node *);
 void eeh_add_device_tree_late(struct pci_bus *);
-
-/**
- * eeh_remove_device_recursive - undo EEH for device & children.
- * @dev: pci device to be removed
- *
- * As above, this removes the device; it also removes child
- * pci devices as well.
- */
 void eeh_remove_bus_device(struct pci_dev *);
 
 /**
diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h
index 6d42297..221d82f 100644
--- a/arch/powerpc/include/asm/ppc-pci.h
+++ b/arch/powerpc/include/asm/ppc-pci.h
@@ -47,92 +47,27 @@ extern int rtas_setup_phb(struct pci_controller *phb);
 
 extern unsigned long pci_probe_only;
 
-/* ---- EEH internal-use-only related routines ---- */
 #ifdef CONFIG_EEH
 
 void pci_addr_cache_insert_device(struct pci_dev *dev);
 void pci_addr_cache_remove_device(struct pci_dev *dev);
 void pci_addr_cache_build(void);
 struct pci_dev *pci_get_device_by_addr(unsigned long addr);
-
-/**
- * eeh_slot_error_detail -- record and EEH error condition to the log
- * @pdn:      pci device node
- * @severity: EEH_LOG_TEMP_FAILURE or EEH_LOG_PERM_FAILURE
- *
- * Obtains the EEH error details from the RTAS subsystem,
- * and then logs these details with the RTAS error log system.
- */
 #define EEH_LOG_TEMP_FAILURE 1
 #define EEH_LOG_PERM_FAILURE 2
 void eeh_slot_error_detail (struct pci_dn *pdn, int severity);
-
-/**
- * rtas_pci_enable - enable IO transfers for this slot
- * @pdn:       pci device node
- * @function:  either EEH_THAW_MMIO or EEH_THAW_DMA 
- *
- * Enable I/O transfers to this slot 
- */
 #define EEH_THAW_MMIO 2
 #define EEH_THAW_DMA  3
 int rtas_pci_enable(struct pci_dn *pdn, int function);
-
-/**
- * rtas_set_slot_reset -- unfreeze a frozen slot
- * @pdn:       pci device node
- *
- * Clear the EEH-frozen condition on a slot.  This routine
- * does this by asserting the PCI #RST line for 1/8th of
- * a second; this routine will sleep while the adapter is
- * being reset.
- *
- * Returns a non-zero value if the reset failed.
- */
 int rtas_set_slot_reset (struct pci_dn *);
 int eeh_wait_for_slot_status(struct pci_dn *pdn, int max_wait_msecs);
-
-/** 
- * eeh_restore_bars - Restore device configuration info.
- * @pdn:       pci device node
- *
- * A reset of a PCI device will clear out its config space.
- * This routines will restore the config space for this
- * device, and is children, to values previously obtained
- * from the firmware.
- */
 void eeh_restore_bars(struct pci_dn *);
-
-/**
- * rtas_configure_bridge -- firmware initialization of pci bridge
- * @pdn:       pci device node
- *
- * Ask the firmware to configure all PCI bridges devices
- * located behind the indicated node. Required after a
- * pci device reset. Does essentially the same hing as
- * eeh_restore_bars, but for brdges, and lets firmware 
- * do the work.
- */
 void rtas_configure_bridge(struct pci_dn *);
-
 int rtas_write_config(struct pci_dn *, int where, int size, u32 val);
 int rtas_read_config(struct pci_dn *, int where, int size, u32 *val);
-
-/**
- * eeh_mark_slot -- set mode flags for pertition endpoint
- * @pdn:       pci device node
- *
- * mark and clear slots: find "partition endpoint" PE and set or 
- * clear the flags for each subnode of the PE.
- */
-void eeh_mark_slot (struct device_node *dn, int mode_flag);
-void eeh_clear_slot (struct device_node *dn, int mode_flag);
-
-/**
- * find_device_pe -- Find the associated "Partiationable Endpoint" PE
- * @pdn:       pci device node
- */
-struct device_node * find_device_pe(struct device_node *dn);
+void eeh_mark_slot(struct device_node *dn, int mode_flag);
+void eeh_clear_slot(struct device_node *dn, int mode_flag);
+struct device_node *find_device_pe(struct device_node *dn);
 
 void eeh_sysfs_add_device(struct pci_dev *pdev);
 void eeh_sysfs_remove_device(struct pci_dev *pdev);
diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c
index c0b40af..5f6d37b 100644
--- a/arch/powerpc/platforms/pseries/eeh.c
+++ b/arch/powerpc/platforms/pseries/eeh.c
@@ -1,8 +1,8 @@
 /*
- * eeh.c
  * Copyright IBM Corporation 2001, 2005, 2006
  * Copyright Dave Engebretsen & Todd Inglett 2001
  * Copyright Linas Vepstas 2005, 2006
+ * Copyright 2001-2012 IBM Corporation.
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -22,7 +22,7 @@
  */
 
 #include <linux/delay.h>
-#include <linux/sched.h>	/* for init_mm */
+#include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/pci.h>
@@ -129,9 +129,16 @@ static unsigned long slot_resets;
 
 #define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE)
 
-/* --------------------------------------------------------------- */
-/* Below lies the EEH event infrastructure */
-
+/**
+ * rtas_slot_error_detail - Retrieve error log through RTAS call
+ * @pdn: device node
+ * @severity: temporary or permanent error log
+ * @driver_log: driver log to be combined with the retrieved error log
+ * @loglen: length of driver log
+ *
+ * This routine should be called to retrieve error log through the dedicated
+ * RTAS call.
+ */
 static void rtas_slot_error_detail(struct pci_dn *pdn, int severity,
                                    char *driver_log, size_t loglen)
 {
@@ -163,7 +170,7 @@ static void rtas_slot_error_detail(struct pci_dn *pdn, int severity,
 }
 
 /**
- * gather_pci_data - copy assorted PCI config space registers to buff
+ * gather_pci_data - Copy assorted PCI config space registers to buff
  * @pdn: device to report data for
  * @buf: point to buffer in which to log
  * @len: amount of room in buffer
@@ -258,6 +265,16 @@ static size_t gather_pci_data(struct pci_dn *pdn, char * buf, size_t len)
 	return n;
 }
 
+/**
+ * eeh_slot_error_detail - Generate combined log including driver log and error log
+ * @pdn: device node
+ * @severity: temporary or permanent error log
+ *
+ * This routine should be called to generate the combined log, which
+ * is comprised of driver log and error log. The driver log is figured
+ * out from the config space of the corresponding PCI device, while
+ * the error log is fetched through platform dependent function call.
+ */
 void eeh_slot_error_detail(struct pci_dn *pdn, int severity)
 {
 	size_t loglen = 0;
@@ -275,6 +292,9 @@ void eeh_slot_error_detail(struct pci_dn *pdn, int severity)
  * read_slot_reset_state - Read the reset state of a device node's slot
  * @dn: device node to read
  * @rets: array to return results in
+ *
+ * Read the reset state of a device node's slot through platform dependent
+ * function call.
  */
 static int read_slot_reset_state(struct pci_dn *pdn, int rets[])
 {
@@ -300,9 +320,9 @@ static int read_slot_reset_state(struct pci_dn *pdn, int rets[])
 }
 
 /**
- * eeh_wait_for_slot_status - returns error status of slot
- * @pdn pci device node
- * @max_wait_msecs maximum number to millisecs to wait
+ * eeh_wait_for_slot_status - Returns error status of slot
+ * @pdn: pci device node
+ * @max_wait_msecs: maximum number to millisecs to wait
  *
  * Return negative value if a permanent error, else return
  * Partition Endpoint (PE) status value.
@@ -332,16 +352,16 @@ eeh_wait_for_slot_status(struct pci_dn *pdn, int max_wait_msecs)
 
 		mwait = rets[2];
 		if (mwait <= 0) {
-			printk (KERN_WARNING
-			        "EEH: Firmware returned bad wait value=%d\n", mwait);
+			printk(KERN_WARNING "EEH: Firmware returned bad wait value=%d\n",
+				mwait);
 			mwait = 1000;
 		} else if (mwait > 300*1000) {
-			printk (KERN_WARNING
-			        "EEH: Firmware is taking too long, time=%d\n", mwait);
+			printk(KERN_WARNING "EEH: Firmware is taking too long, time=%d\n",
+				mwait);
 			mwait = 300*1000;
 		}
 		max_wait_msecs -= mwait;
-		msleep (mwait);
+		msleep(mwait);
 	}
 
 	printk(KERN_WARNING "EEH: Timed out waiting for slot status\n");
@@ -349,8 +369,11 @@ eeh_wait_for_slot_status(struct pci_dn *pdn, int max_wait_msecs)
 }
 
 /**
- * eeh_token_to_phys - convert EEH address token to phys address
- * @token i/o token, should be address in the form 0xA....
+ * eeh_token_to_phys - Convert EEH address token to phys address
+ * @token: I/O token, should be address in the form 0xA....
+ *
+ * This routine should be called to convert virtual I/O address
+ * to physical one.
  */
 static inline unsigned long eeh_token_to_phys(unsigned long token)
 {
@@ -365,8 +388,11 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
 	return pa | (token & (PAGE_SIZE-1));
 }
 
-/** 
- * Return the "partitionable endpoint" (pe) under which this device lies
+/**
+ * find_device_pe - Retrieve the PE for the given device
+ * @dn: device node
+ *
+ * Return the PE under which this device lies
  */
 struct device_node * find_device_pe(struct device_node *dn)
 {
@@ -377,14 +403,18 @@ struct device_node * find_device_pe(struct device_node *dn)
 	return dn;
 }
 
-/** Mark all devices that are children of this device as failed.
- *  Mark the device driver too, so that it can see the failure
- *  immediately; this is critical, since some drivers poll
- *  status registers in interrupts ... If a driver is polling,
- *  and the slot is frozen, then the driver can deadlock in
- *  an interrupt context, which is bad.
+/**
+ * __eeh_mark_slot - Mark all child devices as failed
+ * @parent: parent device
+ * @mode_flag: failure flag
+ *
+ * Mark all devices that are children of this device as failed.
+ * Mark the device driver too, so that it can see the failure
+ * immediately; this is critical, since some drivers poll
+ * status registers in interrupts ... If a driver is polling,
+ * and the slot is frozen, then the driver can deadlock in
+ * an interrupt context, which is bad.
  */
-
 static void __eeh_mark_slot(struct device_node *parent, int mode_flag)
 {
 	struct device_node *dn;
@@ -404,10 +434,18 @@ static void __eeh_mark_slot(struct device_node *parent, int mode_flag)
 	}
 }
 
-void eeh_mark_slot (struct device_node *dn, int mode_flag)
+/**
+ * eeh_mark_slot - Mark the indicated device and its children as failed
+ * @dn: parent device
+ * @mode_flag: failure flag
+ *
+ * Mark the indicated device and its child devices as failed.
+ * The device drivers are marked as failed as well.
+ */
+void eeh_mark_slot(struct device_node *dn, int mode_flag)
 {
 	struct pci_dev *dev;
-	dn = find_device_pe (dn);
+	dn = find_device_pe(dn);
 
 	/* Back up one, since config addrs might be shared */
 	if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
@@ -423,6 +461,13 @@ void eeh_mark_slot (struct device_node *dn, int mode_flag)
 	__eeh_mark_slot(dn, mode_flag);
 }
 
+/**
+ * __eeh_clear_slot - Clear failure flag for the child devices
+ * @parent: parent device
+ * @mode_flag: flag to be cleared
+ *
+ * Clear failure flag for the child devices.
+ */
 static void __eeh_clear_slot(struct device_node *parent, int mode_flag)
 {
 	struct device_node *dn;
@@ -436,12 +481,19 @@ static void __eeh_clear_slot(struct device_node *parent, int mode_flag)
 	}
 }
 
-void eeh_clear_slot (struct device_node *dn, int mode_flag)
+/**
+ * eeh_clear_slot - Clear failure flag for the indicated device and its children
+ * @dn: parent device
+ * @mode_flag: flag to be cleared
+ *
+ * Clear failure flag for the indicated device and its children.
+ */
+void eeh_clear_slot(struct device_node *dn, int mode_flag)
 {
 	unsigned long flags;
 	raw_spin_lock_irqsave(&confirm_error_lock, flags);
 	
-	dn = find_device_pe (dn);
+	dn = find_device_pe(dn);
 	
 	/* Back up one, since config addrs might be shared */
 	if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
@@ -453,43 +505,10 @@ void eeh_clear_slot (struct device_node *dn, int mode_flag)
 	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
 }
 
-void __eeh_set_pe_freset(struct device_node *parent, unsigned int *freset)
-{
-	struct device_node *dn;
-
-	for_each_child_of_node(parent, dn) {
-		if (PCI_DN(dn)) {
-
-			struct pci_dev *dev = PCI_DN(dn)->pcidev;
-
-			if (dev && dev->driver)
-				*freset |= dev->needs_freset;
-
-			__eeh_set_pe_freset(dn, freset);
-		}
-	}
-}
-
-void eeh_set_pe_freset(struct device_node *dn, unsigned int *freset)
-{
-	struct pci_dev *dev;
-	dn = find_device_pe(dn);
-
-	/* Back up one, since config addrs might be shared */
-	if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
-		dn = dn->parent;
-
-	dev = PCI_DN(dn)->pcidev;
-	if (dev)
-		*freset |= dev->needs_freset;
-
-	__eeh_set_pe_freset(dn, freset);
-}
-
 /**
- * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
- * @dn device node
- * @dev pci device, if known
+ * eeh_dn_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @dn: device node
+ * @dev: pci device, if known
  *
  * Check for an EEH failure for the given device node.  Call this
  * routine if the result of a read was all 0xff's and you want to
@@ -548,11 +567,11 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 		pdn->eeh_check_count ++;
 		if (pdn->eeh_check_count % EEH_MAX_FAILS == 0) {
 			location = of_get_property(dn, "ibm,loc-code", NULL);
-			printk (KERN_ERR "EEH: %d reads ignored for recovering device at "
+			printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
 				"location=%s driver=%s pci addr=%s\n",
 				pdn->eeh_check_count, location,
 				eeh_driver_name(dev), eeh_pci_name(dev));
-			printk (KERN_ERR "EEH: Might be infinite loop in %s driver\n",
+			printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n",
 				eeh_driver_name(dev));
 			dump_stack();
 		}
@@ -579,7 +598,8 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
 	}
 
 	/* Note that config-io to empty slots may fail;
-	 * they are empty when they don't have children. */
+	 * they are empty when they don't have children.
+	 */
 	if ((rets[0] == 5) && (rets[2] == 0) && (dn->child == NULL)) {
 		false_positives++;
 		pdn->eeh_false_positives ++;
@@ -609,15 +629,17 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
  
 	/* Avoid repeated reports of this failure, including problems
 	 * with other functions on this device, and functions under
-	 * bridges. */
-	eeh_mark_slot (dn, EEH_MODE_ISOLATED);
+	 * bridges.
+	 */
+	eeh_mark_slot(dn, EEH_MODE_ISOLATED);
 	raw_spin_unlock_irqrestore(&confirm_error_lock, flags);
 
-	eeh_send_failure_event (dn, dev);
+	eeh_send_failure_event(dn, dev);
 
 	/* Most EEH events are due to device driver bugs.  Having
 	 * a stack trace will help the device-driver authors figure
-	 * out what happened.  So print that out. */
+	 * out what happened.  So print that out.
+	 */
 	dump_stack();
 	return 1;
 
@@ -629,9 +651,9 @@ dn_unlock:
 EXPORT_SYMBOL_GPL(eeh_dn_check_failure);
 
 /**
- * eeh_check_failure - check if all 1's data is due to EEH slot freeze
- * @token i/o token, should be address in the form 0xA....
- * @val value, should be all 1's (XXX why do we need this arg??)
+ * eeh_check_failure - Check if all 1's data is due to EEH slot freeze
+ * @token: I/O token, should be address in the form 0xA....
+ * @val: value, should be all 1's (XXX why do we need this arg??)
  *
  * Check for an EEH failure at the given token address.  Call this
  * routine if the result of a read was all 0xff's and you want to
@@ -655,7 +677,7 @@ unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned lon
 	}
 
 	dn = pci_device_to_OF_node(dev);
-	eeh_dn_check_failure (dn, dev);
+	eeh_dn_check_failure(dn, dev);
 
 	pci_dev_put(dev);
 	return val;
@@ -663,14 +685,15 @@ unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned lon
 
 EXPORT_SYMBOL(eeh_check_failure);
 
-/* ------------------------------------------------------------- */
-/* The code below deals with error recovery */
 
 /**
- * rtas_pci_enable - enable MMIO or DMA transfers for this slot
+ * rtas_pci_enable - Enable MMIO or DMA transfers for this slot
  * @pdn pci device node
+ *
+ * This routine should be called to reenable frozen MMIO or DMA
+ * so that it would work correctly again. It's useful while doing
+ * recovery or log collection on the indicated device.
  */
-
 int
 rtas_pci_enable(struct pci_dn *pdn, int function)
 {
@@ -692,7 +715,7 @@ rtas_pci_enable(struct pci_dn *pdn, int function)
 		printk(KERN_WARNING "EEH: Unexpected state change %d, err=%d dn=%s\n",
 		        function, rc, pdn->node->full_name);
 
-	rc = eeh_wait_for_slot_status (pdn, PCI_BUS_RESET_WAIT_MSEC);
+	rc = eeh_wait_for_slot_status(pdn, PCI_BUS_RESET_WAIT_MSEC);
 	if ((rc == 4) && (function == EEH_THAW_MMIO))
 		return 0;
 
@@ -700,27 +723,25 @@ rtas_pci_enable(struct pci_dn *pdn, int function)
 }
 
 /**
- * rtas_pci_slot_reset - raises/lowers the pci #RST line
- * @pdn pci device node
+ * rtas_pci_slot_reset - Raises/Lowers the pci #RST line
+ * @pdn: pci device node
  * @state: 1/0 to raise/lower the #RST
  *
  * Clear the EEH-frozen condition on a slot.  This routine
  * asserts the PCI #RST line if the 'state' argument is '1',
  * and drops the #RST line if 'state is '0'.  This routine is
  * safe to call in an interrupt context.
- *
  */
-
 static void
 rtas_pci_slot_reset(struct pci_dn *pdn, int state)
 {
 	int config_addr;
 	int rc;
 
-	BUG_ON (pdn==NULL); 
+	BUG_ON(pdn==NULL);
 
 	if (!pdn->phb) {
-		printk (KERN_WARNING "EEH: in slot reset, device node %s has no phb\n",
+		printk(KERN_WARNING "EEH: in slot reset, device node %s has no phb\n",
 		        pdn->node->full_name);
 		return;
 	}
@@ -752,12 +773,12 @@ rtas_pci_slot_reset(struct pci_dn *pdn, int state)
 
 /**
  * pcibios_set_pcie_slot_reset - Set PCI-E reset state
- * @dev:	pci device struct
- * @state:	reset state to enter
+ * @dev: pci device struct
+ * @state: reset state to enter
  *
  * Return value:
  * 	0 if success
- **/
+ */
 int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state)
 {
 	struct device_node *dn = pci_device_to_OF_node(dev);
@@ -781,10 +802,62 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
 }
 
 /**
- * rtas_set_slot_reset -- assert the pci #RST line for 1/4 second
- * @pdn: pci device node to be reset.
+ * __eeh_set_pe_freset - Check the required reset for child devices
+ * @parent: parent device
+ * @freset: return value
+ *
+ * Each device might have its preferred reset type: fundamental or
+ * hot reset. The routine is used to collect the information from
+ * the child devices so that they could be reset accordingly.
  */
+void __eeh_set_pe_freset(struct device_node *parent, unsigned int *freset)
+{
+	struct device_node *dn;
+
+	for_each_child_of_node(parent, dn) {
+		if (PCI_DN(dn)) {
+			struct pci_dev *dev = PCI_DN(dn)->pcidev;
+
+			if (dev && dev->driver)
+				*freset |= dev->needs_freset;
+
+			__eeh_set_pe_freset(dn, freset);
+		}
+	}
+}
+
+/**
+ * eeh_set_pe_freset - Check the required reset for the indicated device and its children
+ * @dn: parent device
+ * @freset: return value
+ *
+ * Each device might have its preferred reset type: fundamental or
+ * hot reset. The routine is used to collected the information for
+ * the indicated device and its children so that the bunch of the
+ * devices could be reset properly.
+ */
+void eeh_set_pe_freset(struct device_node *dn, unsigned int *freset)
+{
+	struct pci_dev *dev;
+	dn = find_device_pe(dn);
+
+	/* Back up one, since config addrs might be shared */
+	if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent))
+		dn = dn->parent;
 
+	dev = PCI_DN(dn)->pcidev;
+	if (dev)
+		*freset |= dev->needs_freset;
+
+	__eeh_set_pe_freset(dn, freset);
+}
+
+/**
+ * __rtas_set_slot_reset - Assert the pci #RST line for 1/4 second
+ * @pdn: pci device node to be reset.
+ *
+ * Assert the PCI #RST line for 1/4 second.
+ */
 static void __rtas_set_slot_reset(struct pci_dn *pdn)
 {
 	unsigned int freset = 0;
@@ -803,25 +876,35 @@ static void __rtas_set_slot_reset(struct pci_dn *pdn)
 		rtas_pci_slot_reset(pdn, 1);
 
 	/* The PCI bus requires that the reset be held high for at least
-	 * a 100 milliseconds. We wait a bit longer 'just in case'.  */
-
+	 * a 100 milliseconds. We wait a bit longer 'just in case'.
+	 */
 #define PCI_BUS_RST_HOLD_TIME_MSEC 250
-	msleep (PCI_BUS_RST_HOLD_TIME_MSEC);
+	msleep(PCI_BUS_RST_HOLD_TIME_MSEC);
 	
 	/* We might get hit with another EEH freeze as soon as the 
 	 * pci slot reset line is dropped. Make sure we don't miss
-	 * these, and clear the flag now. */
-	eeh_clear_slot (pdn->node, EEH_MODE_ISOLATED);
+	 * these, and clear the flag now.
+	 */
+	eeh_clear_slot(pdn->node, EEH_MODE_ISOLATED);
 
-	rtas_pci_slot_reset (pdn, 0);
+	rtas_pci_slot_reset(pdn, 0);
 
 	/* After a PCI slot has been reset, the PCI Express spec requires
 	 * a 1.5 second idle time for the bus to stabilize, before starting
-	 * up traffic. */
+	 * up traffic.
+	 */
 #define PCI_BUS_SETTLE_TIME_MSEC 1800
-	msleep (PCI_BUS_SETTLE_TIME_MSEC);
+	msleep(PCI_BUS_SETTLE_TIME_MSEC);
 }
 
+/**
+ * rtas_set_slot_reset - Reset the indicated PE
+ * @pdn: PCI device node
+ *
+ * This routine should be called to reset indicated device, including
+ * PE. A PE might include multiple PCI devices and sometimes PCI bridges
+ * might be involved as well.
+ */
 int rtas_set_slot_reset(struct pci_dn *pdn)
 {
 	int i, rc;
@@ -846,7 +929,6 @@ int rtas_set_slot_reset(struct pci_dn *pdn)
 	return -1;
 }
 
-/* ------------------------------------------------------- */
 /** Save and restore of PCI BARs
  *
  * Although firmware will set up BARs during boot, it doesn't
@@ -863,7 +945,7 @@ int rtas_set_slot_reset(struct pci_dn *pdn)
  * the expansion ROM base address, the latency timer, and etc.
  * from the saved values in the device node.
  */
-static inline void __restore_bars (struct pci_dn *pdn)
+static inline void __restore_bars(struct pci_dn *pdn)
 {
 	int i;
 	u32 cmd;
@@ -879,17 +961,18 @@ static inline void __restore_bars (struct pci_dn *pdn)
 #define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF))
 #define SAVED_BYTE(OFF) (((u8 *)(pdn->config_space))[BYTE_SWAP(OFF)])
 
-	rtas_write_config (pdn, PCI_CACHE_LINE_SIZE, 1,
+	rtas_write_config(pdn, PCI_CACHE_LINE_SIZE, 1,
 	            SAVED_BYTE(PCI_CACHE_LINE_SIZE));
 
-	rtas_write_config (pdn, PCI_LATENCY_TIMER, 1,
+	rtas_write_config(pdn, PCI_LATENCY_TIMER, 1,
 	            SAVED_BYTE(PCI_LATENCY_TIMER));
 
 	/* max latency, min grant, interrupt pin and line */
 	rtas_write_config(pdn, 15*4, 4, pdn->config_space[15]);
 
 	/* Restore PERR & SERR bits, some devices require it,
-	   don't touch the other command bits */
+	 * don't touch the other command bits
+	 */
 	rtas_read_config(pdn, PCI_COMMAND, 4, &cmd);
 	if (pdn->config_space[1] & PCI_COMMAND_PARITY)
 		cmd |= PCI_COMMAND_PARITY;
@@ -903,7 +986,8 @@ static inline void __restore_bars (struct pci_dn *pdn)
 }
 
 /**
- * eeh_restore_bars - restore the PCI config space info
+ * eeh_restore_bars - Restore the PCI config space info
+ * @pdn: PCI device node
  *
  * This routine performs a recursive walk to the children
  * of this device as well.
@@ -915,14 +999,15 @@ void eeh_restore_bars(struct pci_dn *pdn)
 		return;
 	
 	if ((pdn->eeh_mode & EEH_MODE_SUPPORTED) && !IS_BRIDGE(pdn->class_code))
-		__restore_bars (pdn);
+		__restore_bars(pdn);
 
 	for_each_child_of_node(pdn->node, dn)
-		eeh_restore_bars (PCI_DN(dn));
+		eeh_restore_bars(PCI_DN(dn));
 }
 
 /**
- * eeh_save_bars - save device bars
+ * eeh_save_bars - Save device bars
+ * @pdn: PCI device node
  *
  * Save the values of the device bars. Unlike the restore
  * routine, this routine is *not* recursive. This is because
@@ -940,6 +1025,14 @@ static void eeh_save_bars(struct pci_dn *pdn)
 		rtas_read_config(pdn, i * 4, 4, &pdn->config_space[i]);
 }
 
+/**
+ * rtas_configure_bridge - Configure PCI bridges for the indicated PE
+ * @pdn: PCI device node
+ *
+ * PCI bridges might be included in PE. In order to make the PE work
+ * again. The included PCI bridges should be recovered after the PE
+ * encounters frozen state.
+ */
 void
 rtas_configure_bridge(struct pci_dn *pdn)
 {
@@ -963,17 +1056,11 @@ rtas_configure_bridge(struct pci_dn *pdn)
 	               BUID_HI(pdn->phb->buid),
 	               BUID_LO(pdn->phb->buid));
 	if (rc) {
-		printk (KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n",
+		printk(KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n",
 		        rc, pdn->node->full_name);
 	}
 }
 
-/* ------------------------------------------------------------- */
-/* The code below deals with enabling EEH for devices during  the
- * early boot sequence.  EEH must be enabled before any PCI probing
- * can be done.
- */
-
 #define EEH_ENABLE 1
 
 struct eeh_early_enable_info {
@@ -981,7 +1068,18 @@ struct eeh_early_enable_info {
 	unsigned int buid_lo;
 };
 
-static int get_pe_addr (int config_addr,
+/**
+ * get_pe_addr - Retrieve PE address with given BDF address
+ * @config_addr: BDF address
+ * @info: BUID of the associated PHB
+ *
+ * There're 2 kinds of addresses existing in EEH core components:
+ * BDF address and PE address. Besides, there has dedicated platform
+ * dependent function call to retrieve the PE address according to
+ * the given BDF address. Further more, we prefer PE address on BDF
+ * address in EEH core components.
+ */
+static int get_pe_addr(int config_addr,
                         struct eeh_early_enable_info *info)
 {
 	unsigned int rets[3];
@@ -990,12 +1088,12 @@ static int get_pe_addr (int config_addr,
 	/* Use latest config-addr token on power6 */
 	if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
 		/* Make sure we have a PE in hand */
-		ret = rtas_call (ibm_get_config_addr_info2, 4, 2, rets,
+		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
 			config_addr, info->buid_hi, info->buid_lo, 1);
 		if (ret || (rets[0]==0))
 			return 0;
 
-		ret = rtas_call (ibm_get_config_addr_info2, 4, 2, rets,
+		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
 			config_addr, info->buid_hi, info->buid_lo, 0);
 		if (ret)
 			return 0;
@@ -1004,7 +1102,7 @@ static int get_pe_addr (int config_addr,
 
 	/* Use older config-addr token on power5 */
 	if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
-		ret = rtas_call (ibm_get_config_addr_info, 4, 2, rets,
+		ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
 			config_addr, info->buid_hi, info->buid_lo, 0);
 		if (ret)
 			return 0;
@@ -1013,7 +1111,15 @@ static int get_pe_addr (int config_addr,
 	return 0;
 }
 
-/* Enable eeh for the given device node. */
+/**
+ * early_enable_eeh - Early enable EEH on the indicated device
+ * @dn: device node
+ * @data: BUID
+ *
+ * Enable EEH functionality on the specified PCI device. The function
+ * is expected to be called before real PCI probing is done. However,
+ * the PHBs have been initialized at this point.
+ */
 static void *early_enable_eeh(struct device_node *dn, void *data)
 {
 	unsigned int rets[3];
@@ -1047,7 +1153,8 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
 	pdn->class_code = *class_code;
 
 	/* Ok... see if this device supports EEH.  Some do, some don't,
-	 * and the only way to find out is to check each and every one. */
+	 * and the only way to find out is to check each and every one.
+	 */
 	regs = of_get_property(dn, "reg", NULL);
 	if (regs) {
 		/* First register entry is addr (00BBSS00)  */
@@ -1061,13 +1168,15 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
 			pdn->eeh_config_addr = regs[0];
 
 			/* If the newer, better, ibm,get-config-addr-info is supported, 
-			 * then use that instead. */
+			 * then use that instead.
+			 */
 			pdn->eeh_pe_config_addr = get_pe_addr(pdn->eeh_config_addr, info);
 
 			/* Some older systems (Power4) allow the
 			 * ibm,set-eeh-option call to succeed even on nodes
 			 * where EEH is not supported. Verify support
-			 * explicitly. */
+			 * explicitly.
+			 */
 			ret = read_slot_reset_state(pdn, rets);
 			if ((ret == 0) && (rets[1] == 1))
 				enable = 1;
@@ -1083,7 +1192,8 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
 		} else {
 
 			/* This device doesn't support EEH, but it may have an
-			 * EEH parent, in which case we mark it as supported. */
+			 * EEH parent, in which case we mark it as supported.
+			 */
 			if (dn->parent && PCI_DN(dn->parent)
 			    && (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) {
 				/* Parent supports EEH. */
@@ -1101,7 +1211,9 @@ static void *early_enable_eeh(struct device_node *dn, void *data)
 	return NULL;
 }
 
-/*
+/**
+ * eeh_init - EEH initialization
+ *
  * Initialize EEH by trying to enable it for all of the adapters in the system.
  * As a side effect we can determine here if eeh is supported at all.
  * Note that we leave EEH on so failed config cycles won't cause a machine
@@ -1133,7 +1245,7 @@ void __init eeh_init(void)
 	ibm_slot_error_detail = rtas_token("ibm,slot-error-detail");
 	ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info");
 	ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2");
-	ibm_configure_bridge = rtas_token ("ibm,configure-bridge");
+	ibm_configure_bridge = rtas_token("ibm,configure-bridge");
 	ibm_configure_pe = rtas_token("ibm,configure-pe");
 
 	if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE)
@@ -1170,7 +1282,7 @@ void __init eeh_init(void)
 }
 
 /**
- * eeh_add_device_early - enable EEH for the indicated device_node
+ * eeh_add_device_early - Enable EEH for the indicated device_node
  * @dn: device node for which to set up EEH
  *
  * This routine must be used to perform EEH initialization for PCI
@@ -1199,6 +1311,14 @@ static void eeh_add_device_early(struct device_node *dn)
 	early_enable_eeh(dn, &info);
 }
 
+/**
+ * eeh_add_device_tree_early - Enable EEH for the indicated device
+ * @dn: device node
+ *
+ * This routine must be used to perform EEH initialization for the
+ * indicated PCI device that was added after system boot (e.g.
+ * hotplug, dlpar).
+ */
 void eeh_add_device_tree_early(struct device_node *dn)
 {
 	struct device_node *sib;
@@ -1210,7 +1330,7 @@ void eeh_add_device_tree_early(struct device_node *dn)
 EXPORT_SYMBOL_GPL(eeh_add_device_tree_early);
 
 /**
- * eeh_add_device_late - perform EEH initialization for the indicated pci device
+ * eeh_add_device_late - Perform EEH initialization for the indicated pci device
  * @dev: pci device for which to set up EEH
  *
  * This routine must be used to complete EEH initialization for PCI
@@ -1234,13 +1354,21 @@ static void eeh_add_device_late(struct pci_dev *dev)
 	}
 	WARN_ON(pdn->pcidev);
 
-	pci_dev_get (dev);
+	pci_dev_get(dev);
 	pdn->pcidev = dev;
 
 	pci_addr_cache_insert_device(dev);
 	eeh_sysfs_add_device(dev);
 }
 
+/**
+ * eeh_add_device_tree_late - Perform EEH initialization for the indicated PCI bus
+ * @bus: PCI bus
+ *
+ * This routine must be used to perform EEH initialization for PCI
+ * devices which are attached to the indicated PCI bus. The PCI bus
+ * is added after system boot through hotplug or dlpar.
+ */
 void eeh_add_device_tree_late(struct pci_bus *bus)
 {
 	struct pci_dev *dev;
@@ -1257,7 +1385,7 @@ void eeh_add_device_tree_late(struct pci_bus *bus)
 EXPORT_SYMBOL_GPL(eeh_add_device_tree_late);
 
 /**
- * eeh_remove_device - undo EEH setup for the indicated pci device
+ * eeh_remove_device - Undo EEH setup for the indicated pci device
  * @dev: pci device to be removed
  *
  * This routine should be called when a device is removed from
@@ -1281,12 +1409,20 @@ static void eeh_remove_device(struct pci_dev *dev)
 		return;
 	}
 	PCI_DN(dn)->pcidev = NULL;
-	pci_dev_put (dev);
+	pci_dev_put(dev);
 
 	pci_addr_cache_remove_device(dev);
 	eeh_sysfs_remove_device(dev);
 }
 
+/**
+ * eeh_remove_bus_device - Undo EEH setup for the indicated PCI device
+ * @dev: PCI device
+ *
+ * This routine must be called when a device is removed from the
+ * running system through hotplug or dlpar. The corresponding
+ * PCI address cache will be removed.
+ */
 void eeh_remove_bus_device(struct pci_dev *dev)
 {
 	struct pci_bus *bus = dev->subordinate;
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 19/21] Replace pci_dn with eeh_dev for EEH on pSeries
From: Gavin Shan @ 2012-02-24  9:38 UTC (permalink / raw)
  To: linuxppc-dev; +Cc: kernel.crashing.org, shangw
In-Reply-To: <1330076298-7006-1-git-send-email-shangw@linux.vnet.ibm.com>

The pci_dn has been replaced with eeh_dev. In order to comply with
the rule, the EEH platform implementation on pSeries should also
be adjusted for a little bit so that it will depend on eeh_dev instead
of pci_dn.

The patch replaces pci_dn with eeh_dev. The corresponding information
will be retrieved from eeh_dev instead of pci_dn.

Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
---
 arch/powerpc/platforms/pseries/eeh_pseries.c |   96 +++++++++++++-------------
 1 files changed, 48 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 4ed06b2..a3345c2 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -144,11 +144,11 @@ static int pseries_eeh_init(void)
 static int pseries_eeh_set_option(struct device_node *dn, int option)
 {
 	int ret = 0;
-	struct pci_dn *pdn;
+	struct eeh_dev *edev;
 	const u32 *reg;
 	int config_addr;
 
-	pdn = PCI_DN(dn);
+	edev = OF_NODE_TO_EEH_DEV(dn);
 
 	/*
 	 * When we're enabling or disabling EEH functioality on
@@ -165,9 +165,9 @@ static int pseries_eeh_set_option(struct device_node *dn, int option)
 
 	case EEH_OPT_THAW_MMIO:
 	case EEH_OPT_THAW_DMA:
-		config_addr = pdn->eeh_config_addr;
-		if (pdn->eeh_pe_config_addr)
-			config_addr = pdn->eeh_pe_config_addr;
+		config_addr = edev->config_addr;
+		if (edev->pe_config_addr)
+			config_addr = edev->pe_config_addr;
 		break;
 
 	default:
@@ -177,8 +177,8 @@ static int pseries_eeh_set_option(struct device_node *dn, int option)
 	}
 
 	ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL,
-			config_addr, BUID_HI(pdn->phb->buid),
-			BUID_LO(pdn->phb->buid), option);
+			config_addr, BUID_HI(edev->phb->buid),
+			BUID_LO(edev->phb->buid), option);
 
 	return ret;
 }
@@ -198,11 +198,11 @@ static int pseries_eeh_set_option(struct device_node *dn, int option)
  */
 static int pseries_eeh_get_pe_addr(struct device_node *dn)
 {
-	struct pci_dn *pdn;
+	struct eeh_dev *edev;
 	int ret = 0;
 	int rets[3];
 
-	pdn = PCI_DN(dn);
+	edev = OF_NODE_TO_EEH_DEV(dn);
 
 	if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
 		/*
@@ -211,15 +211,15 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn)
 		 * meaningless.
 		 */
 		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
-				pdn->eeh_config_addr, BUID_HI(pdn->phb->buid),
-				BUID_LO(pdn->phb->buid), 1);
+				edev->config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid), 1);
 		if (ret || (rets[0] == 0))
 			return 0;
 
 		/* Retrieve the associated PE config address */
 		ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
-				pdn->eeh_config_addr, BUID_HI(pdn->phb->buid),
-				BUID_LO(pdn->phb->buid), 0);
+				edev->config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid), 0);
 		if (ret) {
 			pr_warning("%s: Failed to get PE address for %s\n",
 				__func__, dn->full_name);
@@ -231,8 +231,8 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn)
 
 	if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
 		ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
-				pdn->eeh_config_addr, BUID_HI(pdn->phb->buid),
-				BUID_LO(pdn->phb->buid), 0);
+				edev->config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid), 0);
 		if (ret) {
 			pr_warning("%s: Failed to get PE address for %s\n",
 				__func__, dn->full_name);
@@ -260,28 +260,28 @@ static int pseries_eeh_get_pe_addr(struct device_node *dn)
  */
 static int pseries_eeh_get_state(struct device_node *dn, int *state)
 {
-	struct pci_dn *pdn;
+	struct eeh_dev *edev;
 	int config_addr;
 	int ret;
 	int rets[4];
 	int result;
 
 	/* Figure out PE config address if possible */
-	pdn = PCI_DN(dn);
-	config_addr = pdn->eeh_config_addr;
-	if (pdn->eeh_pe_config_addr)
-		config_addr = pdn->eeh_pe_config_addr;
+	edev = OF_NODE_TO_EEH_DEV(dn);
+	config_addr = edev->config_addr;
+	if (edev->pe_config_addr)
+		config_addr = edev->pe_config_addr;
 
 	if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) {
 		ret = rtas_call(ibm_read_slot_reset_state2, 3, 4, rets,
-				config_addr, BUID_HI(pdn->phb->buid),
-				BUID_LO(pdn->phb->buid));
+				config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid));
 	} else if (ibm_read_slot_reset_state != RTAS_UNKNOWN_SERVICE) {
 		/* Fake PE unavailable info */
 		rets[2] = 0;
 		ret = rtas_call(ibm_read_slot_reset_state, 3, 3, rets,
-				config_addr, BUID_HI(pdn->phb->buid),
-				BUID_LO(pdn->phb->buid));
+				config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid));
 	} else {
 		return EEH_STATE_NOT_SUPPORT;
 	}
@@ -340,27 +340,27 @@ static int pseries_eeh_get_state(struct device_node *dn, int *state)
  */
 static int pseries_eeh_reset(struct device_node *dn, int option)
 {
-	struct pci_dn *pdn;
+	struct eeh_dev *edev;
 	int config_addr;
 	int ret;
 
 	/* Figure out PE address */
-	pdn = PCI_DN(dn);
-	config_addr = pdn->eeh_config_addr;
-	if (pdn->eeh_pe_config_addr)
-		config_addr = pdn->eeh_pe_config_addr;
+	edev = OF_NODE_TO_EEH_DEV(dn);
+	config_addr = edev->config_addr;
+	if (edev->pe_config_addr)
+		config_addr = edev->pe_config_addr;
 
 	/* Reset PE through RTAS call */
 	ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
-			config_addr, BUID_HI(pdn->phb->buid),
-			BUID_LO(pdn->phb->buid), option);
+			config_addr, BUID_HI(edev->phb->buid),
+			BUID_LO(edev->phb->buid), option);
 
 	/* If fundamental-reset not supported, try hot-reset */
 	if (option == EEH_RESET_FUNDAMENTAL &&
 	    ret == -8) {
 		ret = rtas_call(ibm_set_slot_reset, 4, 1, NULL,
-				config_addr, BUID_HI(pdn->phb->buid),
-				BUID_LO(pdn->phb->buid), EEH_RESET_HOT);
+				config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid), EEH_RESET_HOT);
 	}
 
 	return ret;
@@ -437,22 +437,22 @@ static int pseries_eeh_wait_state(struct device_node *dn, int max_wait)
  */
 static int pseries_eeh_get_log(struct device_node *dn, int severity, char *drv_log, unsigned long len)
 {
-	struct pci_dn *pdn;
+	struct eeh_dev *edev;
 	int config_addr;
 	unsigned long flags;
 	int ret;
 
-	pdn = PCI_DN(dn);
+	edev = OF_NODE_TO_EEH_DEV(dn);
 	spin_lock_irqsave(&slot_errbuf_lock, flags);
 	memset(slot_errbuf, 0, eeh_error_buf_size);
 
 	/* Figure out the PE address */
-	config_addr = pdn->eeh_config_addr;
-	if (pdn->eeh_pe_config_addr)
-		config_addr = pdn->eeh_pe_config_addr;
+	config_addr = edev->config_addr;
+	if (edev->pe_config_addr)
+		config_addr = edev->pe_config_addr;
 
 	ret = rtas_call(ibm_slot_error_detail, 8, 1, NULL, config_addr,
-			BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid),
+			BUID_HI(edev->phb->buid), BUID_LO(edev->phb->buid),
 			virt_to_phys(drv_log), len,
 			virt_to_phys(slot_errbuf), eeh_error_buf_size,
 			severity);
@@ -473,25 +473,25 @@ static int pseries_eeh_get_log(struct device_node *dn, int severity, char *drv_l
  */
 static int pseries_eeh_configure_bridge(struct device_node *dn)
 {
-	struct pci_dn *pdn;
+	struct eeh_dev *edev;
 	int config_addr;
 	int ret;
 
 	/* Figure out the PE address */
-	pdn = PCI_DN(dn);
-	config_addr = pdn->eeh_config_addr;
-	if (pdn->eeh_pe_config_addr)
-		config_addr = pdn->eeh_pe_config_addr;
+	edev = OF_NODE_TO_EEH_DEV(dn);
+	config_addr = edev->config_addr;
+	if (edev->pe_config_addr)
+		config_addr = edev->pe_config_addr;
 
 	/* Use new configure-pe function, if supported */
 	if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) {
 		ret = rtas_call(ibm_configure_pe, 3, 1, NULL,
-				config_addr, BUID_HI(pdn->phb->buid),
-				BUID_LO(pdn->phb->buid));
+				config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid));
 	} else if (ibm_configure_bridge != RTAS_UNKNOWN_SERVICE) {
 		ret = rtas_call(ibm_configure_bridge, 3, 1, NULL,
-				config_addr, BUID_HI(pdn->phb->buid),
-				BUID_LO(pdn->phb->buid));
+				config_addr, BUID_HI(edev->phb->buid),
+				BUID_LO(edev->phb->buid));
 	} else {
 		return -EFAULT;
 	}
-- 
1.7.5.4

^ permalink raw reply related

* [PATCH 08/23] PCI, powerpc: Register busn_res for root buses
From: Yinghai Lu @ 2012-02-24 10:18 UTC (permalink / raw)
  To: Jesse Barnes, Benjamin Herrenschmidt, Tony Luck
  Cc: linux-arch, Greg Kroah-Hartman, linuxppc-dev, linux-kernel,
	Dominik Brodowski, Yinghai Lu, linux-pci, Bjorn Helgaas,
	Paul Mackerras, Andrew Morton, Linus Torvalds
In-Reply-To: <1330078751-7299-1-git-send-email-yinghai@kernel.org>

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/kernel/pci-common.c |    7 ++++++-
 1 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index cce98d7..501f29b 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1732,6 +1732,8 @@ void __devinit pcibios_scan_phb(struct pci_controller *hose)
 	bus->secondary = hose->first_busno;
 	hose->bus = bus;
 
+	pci_bus_insert_busn_res(bus, hose->first_busno, hose->last_busno);
+
 	/* Get probe mode and perform scan */
 	mode = PCI_PROBE_NORMAL;
 	if (node && ppc_md.pci_probe_mode)
@@ -1742,8 +1744,11 @@ void __devinit pcibios_scan_phb(struct pci_controller *hose)
 		of_scan_bus(node, bus);
 	}
 
-	if (mode == PCI_PROBE_NORMAL)
+	if (mode == PCI_PROBE_NORMAL) {
+		pci_bus_update_busn_res_end(bus, 255);
 		hose->last_busno = bus->subordinate = pci_scan_child_bus(bus);
+		pci_bus_update_busn_res_end(bus, bus->subordinate);
+	}
 
 	/* Platform gets a chance to do some global fixups before
 	 * we proceed to resource allocation
-- 
1.7.7

^ permalink raw reply related

* Re: [PATCH 14/21] Introduce EEH device
From: Stephen Rothwell @ 2012-02-24 12:46 UTC (permalink / raw)
  To: Gavin Shan; +Cc: linuxppc-dev
In-Reply-To: <1330076298-7006-15-git-send-email-shangw@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 619 bytes --]

Hi Gavin,

On Fri, 24 Feb 2012 17:38:11 +0800 Gavin Shan <shangw@linux.vnet.ibm.com> wrote:
>
> +#define EEH_DEV_TO_OF_NODE(edev)	(edev->dn)
> +#define EEH_DEV_TO_PCI_DEV(edev)	(edev->pdev)
> +#define OF_NODE_TO_EEH_DEV(dn)		((struct eeh_dev *)(dn->edev))
> +#define PCI_DEV_TO_EEH_DEV(pdev)	((struct eeh_dev *)(pdev->dev.archdata.edev))

You need to put parentheses around the use of macro arguments ... or,
even better, make these into "static inline" accessor functions (with
lower case names).

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH 14/21] Introduce EEH device
From: Stephen Rothwell @ 2012-02-24 12:50 UTC (permalink / raw)
  To: Gavin Shan; +Cc: linuxppc-dev
In-Reply-To: <1330076298-7006-15-git-send-email-shangw@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 1114 bytes --]

Hi Gavin,

On Fri, 24 Feb 2012 17:38:11 +0800 Gavin Shan <shangw@linux.vnet.ibm.com> wrote:
>
> diff --git a/arch/powerpc/include/asm/device.h b/arch/powerpc/include/asm/device.h
> index d57c08a..4668344 100644
> --- a/arch/powerpc/include/asm/device.h
> +++ b/arch/powerpc/include/asm/device.h
> @@ -31,6 +31,9 @@ struct dev_archdata {
>  #ifdef CONFIG_SWIOTLB
>  	dma_addr_t		max_direct_dma_addr;
>  #endif
> +#ifdef CONFIG_EEH
> +	void			*edev;
> +#endif
>  };
>  
>  struct pdev_archdata {
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index ad8f318..1310971 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> +#define OF_NODE_TO_EEH_DEV(dn)		((struct eeh_dev *)(dn->edev))
> +#define PCI_DEV_TO_EEH_DEV(pdev)	((struct eeh_dev *)(pdev->dev.archdata.edev))

If the edev fields of dev_archdata and device_node are always going to be
"struct eeh_dev *", why not declare then as such and avoid the casting?

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH 20/21] Introduce struct eeh_stats for EEH
From: Stephen Rothwell @ 2012-02-24 13:01 UTC (permalink / raw)
  To: Gavin Shan; +Cc: linuxppc-dev
In-Reply-To: <1330076298-7006-21-git-send-email-shangw@linux.vnet.ibm.com>

[-- Attachment #1: Type: text/plain, Size: 1221 bytes --]

Hi Gavin,

On Fri, 24 Feb 2012 17:38:17 +0800 Gavin Shan <shangw@linux.vnet.ibm.com> wrote:
>
> diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
> index 1310971..226c9a5 100644
> --- a/arch/powerpc/include/asm/eeh.h
> +++ b/arch/powerpc/include/asm/eeh.h
> @@ -98,6 +98,21 @@ struct eeh_ops {
>  	int (*configure_bridge)(struct device_node *dn);
>  };
>  
> +/*
> + * The struct is used to maintain the EEH global statistic
> + * information. Besides, the EEH global statistics will be
> + * exported to user space through procfs
> + */
> +struct eeh_stats {
> +	unsigned long no_device;	/* PCI device not found		*/
> +	unsigned long no_dn;		/* OF node not found		*/
> +	unsigned long no_cfg_addr;	/* Config address not found	*/
> +	unsigned long ignored_check;	/* EEH check skipped		*/
> +	unsigned long total_mmio_ffs;	/* Total EEH checks		*/
> +	unsigned long false_positives;	/* Unnecessary EEH checks	*/
> +	unsigned long slot_resets;	/* PE reset			*/
> +};

If this is used in only one place, there is not much point in putting it
in a header file.

-- 
Cheers,
Stephen Rothwell                    sfr@canb.auug.org.au
http://www.canb.auug.org.au/~sfr/

[-- Attachment #2: Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* RE: [PATCH 20/21] Introduce struct eeh_stats for EEH
From: David Laight @ 2012-02-24 13:51 UTC (permalink / raw)
  To: Gavin Shan, linuxppc-dev; +Cc: kernel.crashing.org
In-Reply-To: <1330076298-7006-21-git-send-email-shangw@linux.vnet.ibm.com>

=20
> +/*
> + * The struct is used to maintain the EEH global statistic
> + * information. Besides, the EEH global statistics will be
> + * exported to user space through procfs
> + */
> +struct eeh_stats {
> +	unsigned long no_device;	/* PCI device not found
*/
> +	unsigned long no_dn;		/* OF node not found
*/
> +	unsigned long no_cfg_addr;	/* Config address not  found
*/
> +	unsigned long ignored_check;	/* EEH check skipped
*/
> +	unsigned long total_mmio_ffs;	/* Total EEH checks
*/
> +	unsigned long false_positives;	/* Unnecessary EEH checks
*/
> +	unsigned long slot_resets;	/* PE reset
*/
> +};

Why 'unsigned long', surely either 'unsigned int'
or a fixed-width type.

	David

^ permalink raw reply

* [PATCH 01/37] powerpc/booke: Set CPU_FTR_DEBUG_LVL_EXC on 32-bit
From: Alexander Graf @ 2012-02-24 14:25 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm
In-Reply-To: <1330093591-19523-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

Currently 32-bit only cares about this for choice of exception
vector, which is done in core-specific code.  However, KVM will
want to distinguish as well.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/cputable.h |    5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index ad55a1c..6a034a2 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -376,7 +376,8 @@ extern const char *powerpc_base_platform;
 #define CPU_FTRS_47X	(CPU_FTRS_440x6)
 #define CPU_FTRS_E200	(CPU_FTR_USE_TB | CPU_FTR_SPE_COMP | \
 	    CPU_FTR_NODSISRALIGN | CPU_FTR_COHERENT_ICACHE | \
-	    CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE)
+	    CPU_FTR_UNIFIED_ID_CACHE | CPU_FTR_NOEXECUTE | \
+	    CPU_FTR_DEBUG_LVL_EXC)
 #define CPU_FTRS_E500	(CPU_FTR_MAYBE_CAN_DOZE | CPU_FTR_USE_TB | \
 	    CPU_FTR_SPE_COMP | CPU_FTR_MAYBE_CAN_NAP | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_NOEXECUTE)
@@ -385,7 +386,7 @@ extern const char *powerpc_base_platform;
 	    CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_E500MC	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
-	    CPU_FTR_DBELL)
+	    CPU_FTR_DBELL | CPU_FTR_DEBUG_LVL_EXC)
 #define CPU_FTRS_E5500	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
 	    CPU_FTR_DBELL | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 05/37] KVM: PPC: booke: Move vm core init/destroy out of booke.c
From: Alexander Graf @ 2012-02-24 14:25 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm
In-Reply-To: <1330093591-19523-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

e500mc will want to do lpid allocation/deallocation here.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/44x.c   |    9 +++++++++
 arch/powerpc/kvm/booke.c |    9 ---------
 arch/powerpc/kvm/e500.c  |    9 +++++++++
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 879a1a7..50e7dbc 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -163,6 +163,15 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
 }
 
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	return 0;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+}
+
 static int __init kvmppc_44x_init(void)
 {
 	int r;
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index a2456c7..2ee9bae 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -932,15 +932,6 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm,
 {
 }
 
-int kvmppc_core_init_vm(struct kvm *kvm)
-{
-	return 0;
-}
-
-void kvmppc_core_destroy_vm(struct kvm *kvm)
-{
-}
-
 void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
 {
 	vcpu->arch.tcr = new_tcr;
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 2d5fe04..ac6c9ae 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -226,6 +226,15 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
 }
 
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	return 0;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+}
+
 static int __init kvmppc_e500_init(void)
 {
 	int r, i;
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 04/37] KVM: PPC: booke: add booke-level vcpu load/put
From: Alexander Graf @ 2012-02-24 14:25 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm
In-Reply-To: <1330093591-19523-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

This gives us a place to put load/put actions that correspond to
code that is booke-specific but not specific to a particular core.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/44x.c   |    3 +++
 arch/powerpc/kvm/booke.c |    8 ++++++++
 arch/powerpc/kvm/booke.h |    3 +++
 arch/powerpc/kvm/e500.c  |    3 +++
 4 files changed, 17 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 7b612a7..879a1a7 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -29,15 +29,18 @@
 #include <asm/kvm_ppc.h>
 
 #include "44x_tlb.h"
+#include "booke.h"
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	kvmppc_booke_vcpu_load(vcpu, cpu);
 	kvmppc_44x_tlb_load(vcpu);
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvmppc_44x_tlb_put(vcpu);
+	kvmppc_booke_vcpu_put(vcpu);
 }
 
 int kvmppc_core_check_processor_compat(void)
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index ee9e1ee..a2456c7 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -968,6 +968,14 @@ void kvmppc_decrementer_func(unsigned long data)
 	kvmppc_set_tsr_bits(vcpu, TSR_DIS);
 }
 
+void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+}
+
+void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu)
+{
+}
+
 int __init kvmppc_booke_init(void)
 {
 	unsigned long ivor[16];
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 2fe2027..05d1d99 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -71,4 +71,7 @@ void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu);
 /* high-level function, manages flags, host state */
 void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu);
 
+void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu);
+
 #endif /* __KVM_BOOKE_H__ */
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index ddcd896..2d5fe04 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -36,6 +36,7 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	kvmppc_booke_vcpu_load(vcpu, cpu);
 	kvmppc_e500_tlb_load(vcpu, cpu);
 }
 
@@ -47,6 +48,8 @@ void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.shadow_msr & MSR_SPE)
 		kvmppc_vcpu_disable_spe(vcpu);
 #endif
+
+	kvmppc_booke_vcpu_put(vcpu);
 }
 
 int kvmppc_core_check_processor_compat(void)
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 00/37] KVM: PPC: e500mc support v2
From: Alexander Graf @ 2012-02-24 14:25 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm

This is Scott's e500mc RFC patch set rebased, berobbed of its pt_regs
parts and fixed for bisectability. On top of them, I addressed all the
comments that I had on the code and that came up in his code as FIXMEs.

I verified that this patch set works just fine on e500mc and doesn't
break e500v2, so I would say it's good to go as it is, unless someone
has strong objections to how things are done. Everything hereafter
I would prefer to do based on a working upstream version rather than
a downstream fork, as that way exposure is a lot higher.

v1 -> v2:

  - ESR -> GESR
  - introduce and use constants for doorbell
  - drop e500mc ifdefs for doorbell
  - fix whitespace
  - use explicit preempt counts in inst fixup
  - rework e500v2 kconfig patch
  - add patches 31-37

Alexander Graf (22):
  KVM: PPC: e500mc: Add doorbell emulation support
  KVM: PPC: e500mc: implicitly set MSR_GS
  KVM: PPC: e500mc: Move r1/r2 restoration very early
  KVM: PPC: e500mc: add load inst fixup
  KVM: PPC: rename CONFIG_KVM_E500 -> CONFIG_KVM_E500V2
  KVM: PPC: make e500v2 kvm and e500mc cpu mutually exclusive
  KVM: PPC: booke: remove leftover debugging
  KVM: PPC: booke: deliver program int on emulation failure
  KVM: PPC: booke: rework rescheduling checks
  KVM: PPC: booke: BOOKE_IRQPRIO_MAX is n+1
  KVM: PPC: bookehv: fix exit timing
  KVM: PPC: bookehv: remove negation for CONFIG_64BIT
  KVM: PPC: bookehv: remove SET_VCPU
  KVM: PPC: bookehv: disable MAS register updates early
  KVM: PPC: bookehv: add comment about shadow_msr
  KVM: PPC: booke: Readd debug abort code for machine check
  KVM: PPC: booke: add GS documentation for program interrupt
  KVM: PPC: bookehv: remove unused code
  KVM: PPC: e500: fix typo in tlb code
  KVM: PPC: booke: Support perfmon interrupts
  KVM: PPC: booke: expose guest registers on irq reinject
  KVM: PPC: booke: Reinject performance monitor interrupts

Scott Wood (15):
  powerpc/booke: Set CPU_FTR_DEBUG_LVL_EXC on 32-bit
  powerpc/e500: split CPU_FTRS_ALWAYS/CPU_FTRS_POSSIBLE
  KVM: PPC: factor out lpid allocator from book3s_64_mmu_hv
  KVM: PPC: booke: add booke-level vcpu load/put
  KVM: PPC: booke: Move vm core init/destroy out of booke.c
  KVM: PPC: e500: rename e500_tlb.h to e500.h
  KVM: PPC: e500: merge <asm/kvm_e500.h> into arch/powerpc/kvm/e500.h
  KVM: PPC: e500: clean up arch/powerpc/kvm/e500.h
  KVM: PPC: e500: refactor core-specific TLB code
  KVM: PPC: e500: Track TLB1 entries with a bitmap
  KVM: PPC: e500: emulate tlbilx
  powerpc/booke: Provide exception macros with interrupt name
  KVM: PPC: booke: category E.HV (GS-mode) support
  KVM: PPC: booke: standard PPC floating point support
  KVM: PPC: e500mc support

 arch/powerpc/include/asm/cputable.h         |   21 +-
 arch/powerpc/include/asm/dbell.h            |    3 +
 arch/powerpc/include/asm/hw_irq.h           |    1 +
 arch/powerpc/include/asm/kvm.h              |    1 +
 arch/powerpc/include/asm/kvm_asm.h          |    8 +
 arch/powerpc/include/asm/kvm_book3s.h       |    3 +
 arch/powerpc/include/asm/kvm_booke.h        |    3 +
 arch/powerpc/include/asm/kvm_booke_hv_asm.h |   49 +++
 arch/powerpc/include/asm/kvm_e500.h         |   96 -----
 arch/powerpc/include/asm/kvm_host.h         |   22 +-
 arch/powerpc/include/asm/kvm_ppc.h          |   10 +-
 arch/powerpc/include/asm/mmu-book3e.h       |    6 +
 arch/powerpc/include/asm/processor.h        |    3 +
 arch/powerpc/include/asm/reg.h              |    2 +
 arch/powerpc/include/asm/reg_booke.h        |   34 ++
 arch/powerpc/include/asm/system.h           |    1 +
 arch/powerpc/kernel/asm-offsets.c           |   15 +-
 arch/powerpc/kernel/cpu_setup_fsl_booke.S   |    1 +
 arch/powerpc/kernel/head_44x.S              |   23 +-
 arch/powerpc/kernel/head_booke.h            |   69 ++-
 arch/powerpc/kernel/head_fsl_booke.S        |   98 ++++-
 arch/powerpc/kvm/44x.c                      |   12 +
 arch/powerpc/kvm/Kconfig                    |   28 +-
 arch/powerpc/kvm/Makefile                   |   15 +-
 arch/powerpc/kvm/book3s.c                   |    4 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c         |   26 +-
 arch/powerpc/kvm/booke.c                    |  470 +++++++++++++++++----
 arch/powerpc/kvm/booke.h                    |   57 +++-
 arch/powerpc/kvm/booke_emulate.c            |   23 +-
 arch/powerpc/kvm/bookehv_interrupts.S       |  606 +++++++++++++++++++++++++++
 arch/powerpc/kvm/e500.c                     |  372 ++++++++++++++---
 arch/powerpc/kvm/e500.h                     |  302 +++++++++++++
 arch/powerpc/kvm/e500_emulate.c             |  110 +++++-
 arch/powerpc/kvm/e500_tlb.c                 |  588 +++++++++++---------------
 arch/powerpc/kvm/e500_tlb.h                 |  174 --------
 arch/powerpc/kvm/e500mc.c                   |  342 +++++++++++++++
 arch/powerpc/kvm/powerpc.c                  |   47 ++-
 arch/powerpc/kvm/timing.h                   |    6 +
 38 files changed, 2797 insertions(+), 854 deletions(-)
 create mode 100644 arch/powerpc/include/asm/kvm_booke_hv_asm.h
 delete mode 100644 arch/powerpc/include/asm/kvm_e500.h
 create mode 100644 arch/powerpc/kvm/bookehv_interrupts.S
 create mode 100644 arch/powerpc/kvm/e500.h
 delete mode 100644 arch/powerpc/kvm/e500_tlb.h
 create mode 100644 arch/powerpc/kvm/e500mc.c

^ permalink raw reply

* [PATCH 03/37] KVM: PPC: factor out lpid allocator from book3s_64_mmu_hv
From: Alexander Graf @ 2012-02-24 14:25 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm
In-Reply-To: <1330093591-19523-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

We'll use it on e500mc as well.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_book3s.h |    3 ++
 arch/powerpc/include/asm/kvm_booke.h  |    3 ++
 arch/powerpc/include/asm/kvm_ppc.h    |    5 ++++
 arch/powerpc/kvm/book3s_64_mmu_hv.c   |   26 +++++++++---------------
 arch/powerpc/kvm/powerpc.c            |   34 +++++++++++++++++++++++++++++++++
 5 files changed, 55 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index aa795cc..046041f 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -452,4 +452,7 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
 
 #define INS_DCBZ			0x7c0007ec
 
+/* LPIDs we support with this build -- runtime limit may be lower */
+#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
+
 #endif /* __ASM_KVM_BOOK3S_H__ */
diff --git a/arch/powerpc/include/asm/kvm_booke.h b/arch/powerpc/include/asm/kvm_booke.h
index a90e091..b7cd335 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -23,6 +23,9 @@
 #include <linux/types.h>
 #include <linux/kvm_host.h>
 
+/* LPIDs we support with this build -- runtime limit may be lower */
+#define KVMPPC_NR_LPIDS                        64
+
 static inline void kvmppc_set_gpr(struct kvm_vcpu *vcpu, int num, ulong val)
 {
 	vcpu->arch.gpr[num] = val;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 9d6dee0..731e920 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -204,4 +204,9 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
 			     struct kvm_dirty_tlb *cfg);
 
+long kvmppc_alloc_lpid(void);
+void kvmppc_claim_lpid(long lpid);
+void kvmppc_free_lpid(long lpid);
+void kvmppc_init_lpid(unsigned long nr_lpids);
+
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index ddc485a..d031ce1 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -36,13 +36,11 @@
 
 /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */
 #define MAX_LPID_970	63
-#define NR_LPIDS	(LPID_RSVD + 1)
-unsigned long lpid_inuse[BITS_TO_LONGS(NR_LPIDS)];
 
 long kvmppc_alloc_hpt(struct kvm *kvm)
 {
 	unsigned long hpt;
-	unsigned long lpid;
+	long lpid;
 	struct revmap_entry *rev;
 	struct kvmppc_linear_info *li;
 
@@ -72,14 +70,9 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 	}
 	kvm->arch.revmap = rev;
 
-	/* Allocate the guest's logical partition ID */
-	do {
-		lpid = find_first_zero_bit(lpid_inuse, NR_LPIDS);
-		if (lpid >= NR_LPIDS) {
-			pr_err("kvm_alloc_hpt: No LPIDs free\n");
-			goto out_freeboth;
-		}
-	} while (test_and_set_bit(lpid, lpid_inuse));
+	lpid = kvmppc_alloc_lpid();
+	if (lpid < 0)
+		goto out_freeboth;
 
 	kvm->arch.sdr1 = __pa(hpt) | (HPT_ORDER - 18);
 	kvm->arch.lpid = lpid;
@@ -96,7 +89,7 @@ long kvmppc_alloc_hpt(struct kvm *kvm)
 
 void kvmppc_free_hpt(struct kvm *kvm)
 {
-	clear_bit(kvm->arch.lpid, lpid_inuse);
+	kvmppc_free_lpid(kvm->arch.lpid);
 	vfree(kvm->arch.revmap);
 	if (kvm->arch.hpt_li)
 		kvm_release_hpt(kvm->arch.hpt_li);
@@ -171,8 +164,7 @@ int kvmppc_mmu_hv_init(void)
 	if (!cpu_has_feature(CPU_FTR_HVMODE))
 		return -EINVAL;
 
-	memset(lpid_inuse, 0, sizeof(lpid_inuse));
-
+	/* POWER7 has 10-bit LPIDs, PPC970 and e500mc have 6-bit LPIDs */
 	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
 		host_lpid = mfspr(SPRN_LPID);	/* POWER7 */
 		rsvd_lpid = LPID_RSVD;
@@ -181,9 +173,11 @@ int kvmppc_mmu_hv_init(void)
 		rsvd_lpid = MAX_LPID_970;
 	}
 
-	set_bit(host_lpid, lpid_inuse);
+	kvmppc_init_lpid(rsvd_lpid + 1);
+
+	kvmppc_claim_lpid(host_lpid);
 	/* rsvd_lpid is reserved for use in partition switching */
-	set_bit(rsvd_lpid, lpid_inuse);
+	kvmppc_claim_lpid(rsvd_lpid);
 
 	return 0;
 }
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 00d7e34..9806ea5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -808,6 +808,40 @@ out:
 	return r;
 }
 
+static unsigned long lpid_inuse[BITS_TO_LONGS(KVMPPC_NR_LPIDS)];
+static unsigned long nr_lpids;
+
+long kvmppc_alloc_lpid(void)
+{
+	long lpid;
+
+	do {
+		lpid = find_first_zero_bit(lpid_inuse, KVMPPC_NR_LPIDS);
+		if (lpid >= nr_lpids) {
+			pr_err("%s: No LPIDs free\n", __func__);
+			return -ENOMEM;
+		}
+	} while (test_and_set_bit(lpid, lpid_inuse));
+
+	return lpid;
+}
+
+void kvmppc_claim_lpid(long lpid)
+{
+	set_bit(lpid, lpid_inuse);
+}
+
+void kvmppc_free_lpid(long lpid)
+{
+	clear_bit(lpid, lpid_inuse);
+}
+
+void kvmppc_init_lpid(unsigned long nr_lpids_param)
+{
+	nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param);
+	memset(lpid_inuse, 0, sizeof(lpid_inuse));
+}
+
 int kvm_arch_init(void *opaque)
 {
 	return 0;
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 06/37] KVM: PPC: e500: rename e500_tlb.h to e500.h
From: Alexander Graf @ 2012-02-24 14:26 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm
In-Reply-To: <1330093591-19523-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

This is in preparation for merging in the contents of
arch/powerpc/include/asm/kvm_e500.h.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/e500.c                 |    2 +-
 arch/powerpc/kvm/{e500_tlb.h => e500.h} |    6 +++---
 arch/powerpc/kvm/e500_emulate.c         |    2 +-
 arch/powerpc/kvm/e500_tlb.c             |    2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)
 rename arch/powerpc/kvm/{e500_tlb.h => e500.h} (98%)

diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index ac6c9ae..5c450ba 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -24,7 +24,7 @@
 #include <asm/kvm_ppc.h>
 
 #include "booke.h"
-#include "e500_tlb.h"
+#include "e500.h"
 
 void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/powerpc/kvm/e500_tlb.h b/arch/powerpc/kvm/e500.h
similarity index 98%
rename from arch/powerpc/kvm/e500_tlb.h
rename to arch/powerpc/kvm/e500.h
index 5c6d2d7..02ecde2 100644
--- a/arch/powerpc/kvm/e500_tlb.h
+++ b/arch/powerpc/kvm/e500.h
@@ -12,8 +12,8 @@
  * published by the Free Software Foundation.
  */
 
-#ifndef __KVM_E500_TLB_H__
-#define __KVM_E500_TLB_H__
+#ifndef KVM_E500_H
+#define KVM_E500_H
 
 #include <linux/kvm_host.h>
 #include <asm/mmu-book3e.h>
@@ -171,4 +171,4 @@ static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 	return 1;
 }
 
-#endif /* __KVM_E500_TLB_H__ */
+#endif /* KVM_E500_H */
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 6d0b2bd..2a1a228 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -17,7 +17,7 @@
 #include <asm/kvm_e500.h>
 
 #include "booke.h"
-#include "e500_tlb.h"
+#include "e500.h"
 
 #define XOP_TLBIVAX 786
 #define XOP_TLBSX   914
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 6e53e41..1d623a0 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -29,7 +29,7 @@
 #include <asm/kvm_e500.h>
 
 #include "../mm/mmu_decl.h"
-#include "e500_tlb.h"
+#include "e500.h"
 #include "trace.h"
 #include "timing.h"
 
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 02/37] powerpc/e500: split CPU_FTRS_ALWAYS/CPU_FTRS_POSSIBLE
From: Alexander Graf @ 2012-02-24 14:25 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm
In-Reply-To: <1330093591-19523-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

Split e500 (v1/v2) and e500mc/e5500 to allow optimization of feature
checks that differ between the two.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/cputable.h |   12 ++++++++----
 1 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 6a034a2..2022f2d 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -483,8 +483,10 @@ enum {
 	    CPU_FTRS_E200 |
 #endif
 #ifdef CONFIG_E500
-	    CPU_FTRS_E500 | CPU_FTRS_E500_2 | CPU_FTRS_E500MC |
-	    CPU_FTRS_E5500 |
+	    CPU_FTRS_E500 | CPU_FTRS_E500_2 |
+#endif
+#ifdef CONFIG_PPC_E500MC
+	    CPU_FTRS_E500MC | CPU_FTRS_E5500 |
 #endif
 	    0,
 };
@@ -528,8 +530,10 @@ enum {
 	    CPU_FTRS_E200 &
 #endif
 #ifdef CONFIG_E500
-	    CPU_FTRS_E500 & CPU_FTRS_E500_2 & CPU_FTRS_E500MC &
-	    CPU_FTRS_E5500 &
+	    CPU_FTRS_E500 & CPU_FTRS_E500_2 &
+#endif
+#ifdef CONFIG_PPC_E500MC
+	    CPU_FTRS_E500MC & CPU_FTRS_E5500 &
 #endif
 	    CPU_FTRS_POSSIBLE,
 };
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 11/37] KVM: PPC: e500: emulate tlbilx
From: Alexander Graf @ 2012-02-24 14:26 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm
In-Reply-To: <1330093591-19523-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

tlbilx is the new, preferred invalidation instruction.  It is not
found on e500 prior to e500mc, but there should be no harm in
supporting it on all e500.

Based on code from Ashish Kalra <Ashish.Kalra@freescale.com>.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/e500.h         |    1 +
 arch/powerpc/kvm/e500_emulate.c |    9 ++++++
 arch/powerpc/kvm/e500_tlb.c     |   52 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index f4dee55..ce3f163 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -124,6 +124,7 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500,
 int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu);
 int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu);
 int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb);
+int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb);
 int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb);
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500);
 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index c80794d..af02c18 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -22,6 +22,7 @@
 #define XOP_TLBSX   914
 #define XOP_TLBRE   946
 #define XOP_TLBWE   978
+#define XOP_TLBILX  18
 
 int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
                            unsigned int inst, int *advance)
@@ -29,6 +30,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	int emulated = EMULATE_DONE;
 	int ra;
 	int rb;
+	int rt;
 
 	switch (get_op(inst)) {
 	case 31:
@@ -47,6 +49,13 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			emulated = kvmppc_e500_emul_tlbsx(vcpu,rb);
 			break;
 
+		case XOP_TLBILX:
+			ra = get_ra(inst);
+			rb = get_rb(inst);
+			rt = get_rt(inst);
+			emulated = kvmppc_e500_emul_tlbilx(vcpu, rt, ra, rb);
+			break;
+
 		case XOP_TLBIVAX:
 			ra = get_ra(inst);
 			rb = get_rb(inst);
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index c8ce51d..6eb5d65 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -631,6 +631,58 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
 	return EMULATE_DONE;
 }
 
+static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
+		       int pid, int rt)
+{
+	struct kvm_book3e_206_tlb_entry *tlbe;
+	int tid, esel;
+
+	/* invalidate all entries */
+	for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) {
+		tlbe = get_entry(vcpu_e500, tlbsel, esel);
+		tid = get_tlb_tid(tlbe);
+		if (rt == 0 || tid == pid) {
+			inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+		}
+	}
+}
+
+static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid,
+		       int ra, int rb)
+{
+	int tlbsel, esel;
+	gva_t ea;
+
+	ea = kvmppc_get_gpr(&vcpu_e500->vcpu, rb);
+	if (ra)
+		ea += kvmppc_get_gpr(&vcpu_e500->vcpu, ra);
+
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1);
+		if (esel >= 0) {
+			inval_gtlbe_on_host(vcpu_e500, tlbsel, esel);
+			kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel);
+			break;
+		}
+	}
+}
+
+int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int pid = get_cur_spid(vcpu);
+
+	if (rt == 0 || rt == 1) {
+		tlbilx_all(vcpu_e500, 0, pid, rt);
+		tlbilx_all(vcpu_e500, 1, pid, rt);
+	} else if (rt == 3) {
+		tlbilx_one(vcpu_e500, pid, ra, rb);
+	}
+
+	return EMULATE_DONE;
+}
+
 int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 10/37] KVM: PPC: e500: Track TLB1 entries with a bitmap
From: Alexander Graf @ 2012-02-24 14:26 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm
In-Reply-To: <1330093591-19523-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

Rather than invalidate everything when a TLB1 entry needs to be
taken down, keep track of which host TLB1 entries are used for
a given guest TLB1 entry, and invalidate just those entries.

Based on code from Ashish Kalra <Ashish.Kalra@freescale.com>
and Liu Yu <yu.liu@freescale.com>.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kvm/e500.h     |    5 +++
 arch/powerpc/kvm/e500_tlb.c |   72 ++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 34cef08..f4dee55 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -2,6 +2,7 @@
  * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu <yu.liu@freescale.com>
+ *         Ashish Kalra <ashish.kalra@freescale.com>
  *
  * Description:
  * This file is based on arch/powerpc/kvm/44x_tlb.h and
@@ -25,6 +26,7 @@
 
 #define E500_TLB_VALID 1
 #define E500_TLB_DIRTY 2
+#define E500_TLB_BITMAP 4
 
 struct tlbe_ref {
 	pfn_t pfn;
@@ -82,6 +84,9 @@ struct kvmppc_vcpu_e500 {
 	struct page **shared_tlb_pages;
 	int num_shared_tlb_pages;
 
+	u64 *g2h_tlb1_map;
+	unsigned int *h2g_tlb1_rmap;
+
 #ifdef CONFIG_KVM_E500
 	u32 pid[E500_PID_NUM];
 
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 9925fc6..c8ce51d 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -2,6 +2,7 @@
  * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu, yu.liu@freescale.com
+ *         Ashish Kalra, ashish.kalra@freescale.com
  *
  * Description:
  * This file is based on arch/powerpc/kvm/44x_tlb.c,
@@ -175,8 +176,28 @@ static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500,
 	struct kvm_book3e_206_tlb_entry *gtlbe =
 		get_entry(vcpu_e500, tlbsel, esel);
 
-	if (tlbsel == 1) {
-		kvmppc_e500_tlbil_all(vcpu_e500);
+	if (tlbsel == 1 &&
+	    vcpu_e500->gtlb_priv[1][esel].ref.flags & E500_TLB_BITMAP) {
+		u64 tmp = vcpu_e500->g2h_tlb1_map[esel];
+		int hw_tlb_indx;
+		unsigned long flags;
+
+		local_irq_save(flags);
+		while (tmp) {
+			hw_tlb_indx = __ilog2_u64(tmp & -tmp);
+			mtspr(SPRN_MAS0,
+			      MAS0_TLBSEL(1) |
+			      MAS0_ESEL(to_htlb1_esel(hw_tlb_indx)));
+			mtspr(SPRN_MAS1, 0);
+			asm volatile("tlbwe");
+			vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0;
+			tmp &= tmp - 1;
+		}
+		mb();
+		vcpu_e500->g2h_tlb1_map[esel] = 0;
+		vcpu_e500->gtlb_priv[1][esel].ref.flags &= ~E500_TLB_BITMAP;
+		local_irq_restore(flags);
+
 		return;
 	}
 
@@ -282,6 +303,16 @@ static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref)
 	}
 }
 
+static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	if (vcpu_e500->g2h_tlb1_map)
+		memset(vcpu_e500->g2h_tlb1_map,
+		       sizeof(u64) * vcpu_e500->gtlb_params[1].entries, 0);
+	if (vcpu_e500->h2g_tlb1_rmap)
+		memset(vcpu_e500->h2g_tlb1_rmap,
+		       sizeof(unsigned int) * host_tlb_params[1].entries, 0);
+}
+
 static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	int tlbsel = 0;
@@ -511,7 +542,7 @@ static void kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 /* XXX for both one-one and one-to-many , for now use TLB1 */
 static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 		u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe,
-		struct kvm_book3e_206_tlb_entry *stlbe)
+		struct kvm_book3e_206_tlb_entry *stlbe, int esel)
 {
 	struct tlbe_ref *ref;
 	unsigned int victim;
@@ -524,6 +555,14 @@ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	ref = &vcpu_e500->tlb_refs[1][victim];
 	kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, ref);
 
+	vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << victim;
+	vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP;
+	if (vcpu_e500->h2g_tlb1_rmap[victim]) {
+		unsigned int idx = vcpu_e500->h2g_tlb1_rmap[victim];
+		vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << victim);
+	}
+	vcpu_e500->h2g_tlb1_rmap[victim] = esel;
+
 	return victim;
 }
 
@@ -728,7 +767,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 			 * are mapped on the fly. */
 			stlbsel = 1;
 			sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr,
-				    raddr >> PAGE_SHIFT, gtlbe, &stlbe);
+				    raddr >> PAGE_SHIFT, gtlbe, &stlbe, esel);
 			break;
 
 		default:
@@ -856,7 +895,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 
 		stlbsel = 1;
 		sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn,
-					     gtlbe, &stlbe);
+					     gtlbe, &stlbe, esel);
 		break;
 	}
 
@@ -872,6 +911,9 @@ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	int i;
 
+	clear_tlb1_bitmap(vcpu_e500);
+	kfree(vcpu_e500->g2h_tlb1_map);
+
 	clear_tlb_refs(vcpu_e500);
 	kfree(vcpu_e500->gtlb_priv[0]);
 	kfree(vcpu_e500->gtlb_priv[1]);
@@ -932,6 +974,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 	char *virt;
 	struct page **pages;
 	struct tlbe_priv *privs[2] = {};
+	u64 *g2h_bitmap = NULL;
 	size_t array_len;
 	u32 sets;
 	int num_pages, ret, i;
@@ -993,10 +1036,16 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 	if (!privs[0] || !privs[1])
 		goto err_put_page;
 
+	g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1],
+	                     GFP_KERNEL);
+	if (!g2h_bitmap)
+		goto err_put_page;
+
 	free_gtlb(vcpu_e500);
 
 	vcpu_e500->gtlb_priv[0] = privs[0];
 	vcpu_e500->gtlb_priv[1] = privs[1];
+	vcpu_e500->g2h_tlb1_map = g2h_bitmap;
 
 	vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *)
 		(virt + (cfg->array & (PAGE_SIZE - 1)));
@@ -1129,6 +1178,18 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	if (!vcpu_e500->gtlb_priv[1])
 		goto err;
 
+	vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(unsigned int) *
+					  vcpu_e500->gtlb_params[1].entries,
+					  GFP_KERNEL);
+	if (!vcpu_e500->g2h_tlb1_map)
+		goto err;
+
+	vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) *
+					   host_tlb_params[1].entries,
+					   GFP_KERNEL);
+	if (!vcpu_e500->h2g_tlb1_rmap)
+		goto err;
+
 	/* Init TLB configuration register */
 	vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
 			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
@@ -1154,6 +1215,7 @@ err:
 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	free_gtlb(vcpu_e500);
+	kfree(vcpu_e500->h2g_tlb1_rmap);
 	kfree(vcpu_e500->tlb_refs[0]);
 	kfree(vcpu_e500->tlb_refs[1]);
 }
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 12/37] powerpc/booke: Provide exception macros with interrupt name
From: Alexander Graf @ 2012-02-24 14:26 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm
In-Reply-To: <1330093591-19523-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

DO_KVM will need to identify the particular exception type.

There is an existing set of arbitrary numbers that Linux passes,
but it's an undocumented mess that sort of corresponds to server/classic
exception vectors but not really.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/kernel/head_44x.S       |   23 +++++++++------
 arch/powerpc/kernel/head_booke.h     |   41 ++++++++++++++------------
 arch/powerpc/kernel/head_fsl_booke.S |   52 +++++++++++++++++++++-------------
 3 files changed, 68 insertions(+), 48 deletions(-)

diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index 7dd2981..d1192c5 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -248,10 +248,11 @@ _ENTRY(_start);
 
 interrupt_base:
 	/* Critical Input Interrupt */
-	CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception)
+	CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception)
 
 	/* Machine Check Interrupt */
-	CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
+	CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \
+			   machine_check_exception)
 	MCHECK_EXCEPTION(0x0210, MachineCheckA, machine_check_exception)
 
 	/* Data Storage Interrupt */
@@ -261,7 +262,8 @@ interrupt_base:
 	INSTRUCTION_STORAGE_EXCEPTION
 
 	/* External Input Interrupt */
-	EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE)
+	EXCEPTION(0x0500, BOOKE_INTERRUPT_EXTERNAL, ExternalInput, \
+		  do_IRQ, EXC_XFER_LITE)
 
 	/* Alignment Interrupt */
 	ALIGNMENT_EXCEPTION
@@ -273,29 +275,32 @@ interrupt_base:
 #ifdef CONFIG_PPC_FPU
 	FP_UNAVAILABLE_EXCEPTION
 #else
-	EXCEPTION(0x2010, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2010, BOOKE_INTERRUPT_FP_UNAVAIL, \
+		  FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
 #endif
 	/* System Call Interrupt */
 	START_EXCEPTION(SystemCall)
-	NORMAL_EXCEPTION_PROLOG
+	NORMAL_EXCEPTION_PROLOG(BOOKE_INTERRUPT_SYSCALL)
 	EXC_XFER_EE_LITE(0x0c00, DoSyscall)
 
 	/* Auxiliary Processor Unavailable Interrupt */
-	EXCEPTION(0x2020, AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2020, BOOKE_INTERRUPT_AP_UNAVAIL, \
+		  AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
 
 	/* Decrementer Interrupt */
 	DECREMENTER_EXCEPTION
 
 	/* Fixed Internal Timer Interrupt */
 	/* TODO: Add FIT support */
-	EXCEPTION(0x1010, FixedIntervalTimer, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x1010, BOOKE_INTERRUPT_FIT, FixedIntervalTimer, \
+		  unknown_exception, EXC_XFER_EE)
 
 	/* Watchdog Timer Interrupt */
 	/* TODO: Add watchdog support */
 #ifdef CONFIG_BOOKE_WDT
-	CRITICAL_EXCEPTION(0x1020, WatchdogTimer, WatchdogException)
+	CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, WatchdogException)
 #else
-	CRITICAL_EXCEPTION(0x1020, WatchdogTimer, unknown_exception)
+	CRITICAL_EXCEPTION(0x1020, WATCHDOG, WatchdogTimer, unknown_exception)
 #endif
 
 	/* Data TLB Error Interrupt */
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index fc921bf..06ab353 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -2,6 +2,8 @@
 #define __HEAD_BOOKE_H__
 
 #include <asm/ptrace.h>	/* for STACK_FRAME_REGS_MARKER */
+#include <asm/kvm_asm.h>
+
 /*
  * Macros used for common Book-e exception handling
  */
@@ -28,7 +30,7 @@
  */
 #define THREAD_NORMSAVE(offset)	(THREAD_NORMSAVES + (offset * 4))
 
-#define NORMAL_EXCEPTION_PROLOG						     \
+#define NORMAL_EXCEPTION_PROLOG(intno)						     \
 	mtspr	SPRN_SPRG_WSCRATCH0, r10;	/* save one register */	     \
 	mfspr	r10, SPRN_SPRG_THREAD;					     \
 	stw	r11, THREAD_NORMSAVE(0)(r10);				     \
@@ -113,7 +115,7 @@
  * registers as the normal prolog above. Instead we use a portion of the
  * critical/machine check exception stack at low physical addresses.
  */
-#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, exc_level_srr0, exc_level_srr1) \
+#define EXC_LEVEL_EXCEPTION_PROLOG(exc_level, intno, exc_level_srr0, exc_level_srr1) \
 	mtspr	SPRN_SPRG_WSCRATCH_##exc_level,r8;			     \
 	BOOKE_LOAD_EXC_LEVEL_STACK(exc_level);/* r8 points to the exc_level stack*/ \
 	stw	r9,GPR9(r8);		/* save various registers	   */\
@@ -162,12 +164,13 @@
 	SAVE_4GPRS(3, r11);						     \
 	SAVE_2GPRS(7, r11)
 
-#define CRITICAL_EXCEPTION_PROLOG \
-		EXC_LEVEL_EXCEPTION_PROLOG(CRIT, SPRN_CSRR0, SPRN_CSRR1)
+#define CRITICAL_EXCEPTION_PROLOG(intno) \
+		EXC_LEVEL_EXCEPTION_PROLOG(CRIT, intno, SPRN_CSRR0, SPRN_CSRR1)
 #define DEBUG_EXCEPTION_PROLOG \
-		EXC_LEVEL_EXCEPTION_PROLOG(DBG, SPRN_DSRR0, SPRN_DSRR1)
+		EXC_LEVEL_EXCEPTION_PROLOG(DBG, DEBUG, SPRN_DSRR0, SPRN_DSRR1)
 #define MCHECK_EXCEPTION_PROLOG \
-		EXC_LEVEL_EXCEPTION_PROLOG(MC, SPRN_MCSRR0, SPRN_MCSRR1)
+		EXC_LEVEL_EXCEPTION_PROLOG(MC, MACHINE_CHECK, \
+			SPRN_MCSRR0, SPRN_MCSRR1)
 
 /*
  * Exception vectors.
@@ -181,16 +184,16 @@ label:
 	.long	func;						\
 	.long	ret_from_except_full
 
-#define EXCEPTION(n, label, hdlr, xfer)				\
+#define EXCEPTION(n, intno, label, hdlr, xfer)			\
 	START_EXCEPTION(label);					\
-	NORMAL_EXCEPTION_PROLOG;				\
+	NORMAL_EXCEPTION_PROLOG(intno);				\
 	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
 	xfer(n, hdlr)
 
-#define CRITICAL_EXCEPTION(n, label, hdlr)			\
-	START_EXCEPTION(label);					\
-	CRITICAL_EXCEPTION_PROLOG;				\
-	addi	r3,r1,STACK_FRAME_OVERHEAD;			\
+#define CRITICAL_EXCEPTION(n, intno, label, hdlr)			\
+	START_EXCEPTION(label);						\
+	CRITICAL_EXCEPTION_PROLOG(intno);				\
+	addi	r3,r1,STACK_FRAME_OVERHEAD;				\
 	EXC_XFER_TEMPLATE(hdlr, n+2, (MSR_KERNEL & ~(MSR_ME|MSR_DE|MSR_CE)), \
 			  NOCOPY, crit_transfer_to_handler, \
 			  ret_from_crit_exc)
@@ -302,7 +305,7 @@ label:
 
 #define DEBUG_CRIT_EXCEPTION						      \
 	START_EXCEPTION(DebugCrit);					      \
-	CRITICAL_EXCEPTION_PROLOG;					      \
+	CRITICAL_EXCEPTION_PROLOG(DEBUG);				      \
 									      \
 	/*								      \
 	 * If there is a single step or branch-taken exception in an	      \
@@ -355,7 +358,7 @@ label:
 
 #define DATA_STORAGE_EXCEPTION						      \
 	START_EXCEPTION(DataStorage)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(DATA_STORAGE);		      \
 	mfspr	r5,SPRN_ESR;		/* Grab the ESR and save it */	      \
 	stw	r5,_ESR(r11);						      \
 	mfspr	r4,SPRN_DEAR;		/* Grab the DEAR */		      \
@@ -363,7 +366,7 @@ label:
 
 #define INSTRUCTION_STORAGE_EXCEPTION					      \
 	START_EXCEPTION(InstructionStorage)				      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(INST_STORAGE);		      \
 	mfspr	r5,SPRN_ESR;		/* Grab the ESR and save it */	      \
 	stw	r5,_ESR(r11);						      \
 	mr      r4,r12;                 /* Pass SRR0 as arg2 */		      \
@@ -372,7 +375,7 @@ label:
 
 #define ALIGNMENT_EXCEPTION						      \
 	START_EXCEPTION(Alignment)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(ALIGNMENT);		      \
 	mfspr   r4,SPRN_DEAR;           /* Grab the DEAR and save it */	      \
 	stw     r4,_DEAR(r11);						      \
 	addi    r3,r1,STACK_FRAME_OVERHEAD;				      \
@@ -380,7 +383,7 @@ label:
 
 #define PROGRAM_EXCEPTION						      \
 	START_EXCEPTION(Program)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(PROGRAM);		      \
 	mfspr	r4,SPRN_ESR;		/* Grab the ESR and save it */	      \
 	stw	r4,_ESR(r11);						      \
 	addi	r3,r1,STACK_FRAME_OVERHEAD;				      \
@@ -388,7 +391,7 @@ label:
 
 #define DECREMENTER_EXCEPTION						      \
 	START_EXCEPTION(Decrementer)					      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(DECREMENTER);		      \
 	lis     r0,TSR_DIS@h;           /* Setup the DEC interrupt mask */    \
 	mtspr   SPRN_TSR,r0;		/* Clear the DEC interrupt */	      \
 	addi    r3,r1,STACK_FRAME_OVERHEAD;				      \
@@ -396,7 +399,7 @@ label:
 
 #define FP_UNAVAILABLE_EXCEPTION					      \
 	START_EXCEPTION(FloatingPointUnavailable)			      \
-	NORMAL_EXCEPTION_PROLOG;					      \
+	NORMAL_EXCEPTION_PROLOG(FP_UNAVAIL);		      \
 	beq	1f;							      \
 	bl	load_up_fpu;		/* if from user, just load it up */   \
 	b	fast_exception_return;					      \
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index d5d78c4..418931f 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -301,19 +301,20 @@ _ENTRY(__early_start)
 
 interrupt_base:
 	/* Critical Input Interrupt */
-	CRITICAL_EXCEPTION(0x0100, CriticalInput, unknown_exception)
+	CRITICAL_EXCEPTION(0x0100, CRITICAL, CriticalInput, unknown_exception)
 
 	/* Machine Check Interrupt */
 #ifdef CONFIG_E200
 	/* no RFMCI, MCSRRs on E200 */
-	CRITICAL_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
+	CRITICAL_EXCEPTION(0x0200, MACHINE_CHECK, MachineCheck, \
+			   machine_check_exception)
 #else
 	MCHECK_EXCEPTION(0x0200, MachineCheck, machine_check_exception)
 #endif
 
 	/* Data Storage Interrupt */
 	START_EXCEPTION(DataStorage)
-	NORMAL_EXCEPTION_PROLOG
+	NORMAL_EXCEPTION_PROLOG(DATA_STORAGE)
 	mfspr	r5,SPRN_ESR		/* Grab the ESR, save it, pass arg3 */
 	stw	r5,_ESR(r11)
 	mfspr	r4,SPRN_DEAR		/* Grab the DEAR, save it, pass arg2 */
@@ -328,7 +329,7 @@ interrupt_base:
 	INSTRUCTION_STORAGE_EXCEPTION
 
 	/* External Input Interrupt */
-	EXCEPTION(0x0500, ExternalInput, do_IRQ, EXC_XFER_LITE)
+	EXCEPTION(0x0500, EXTERNAL, ExternalInput, do_IRQ, EXC_XFER_LITE)
 
 	/* Alignment Interrupt */
 	ALIGNMENT_EXCEPTION
@@ -342,32 +343,36 @@ interrupt_base:
 #else
 #ifdef CONFIG_E200
 	/* E200 treats 'normal' floating point instructions as FP Unavail exception */
-	EXCEPTION(0x0800, FloatingPointUnavailable, program_check_exception, EXC_XFER_EE)
+	EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \
+		  program_check_exception, EXC_XFER_EE)
 #else
-	EXCEPTION(0x0800, FloatingPointUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x0800, FP_UNAVAIL, FloatingPointUnavailable, \
+		  unknown_exception, EXC_XFER_EE)
 #endif
 #endif
 
 	/* System Call Interrupt */
 	START_EXCEPTION(SystemCall)
-	NORMAL_EXCEPTION_PROLOG
+	NORMAL_EXCEPTION_PROLOG(SYSCALL)
 	EXC_XFER_EE_LITE(0x0c00, DoSyscall)
 
 	/* Auxiliary Processor Unavailable Interrupt */
-	EXCEPTION(0x2900, AuxillaryProcessorUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2900, AP_UNAVAIL, AuxillaryProcessorUnavailable, \
+		  unknown_exception, EXC_XFER_EE)
 
 	/* Decrementer Interrupt */
 	DECREMENTER_EXCEPTION
 
 	/* Fixed Internal Timer Interrupt */
 	/* TODO: Add FIT support */
-	EXCEPTION(0x3100, FixedIntervalTimer, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x3100, FIT, FixedIntervalTimer, \
+		  unknown_exception, EXC_XFER_EE)
 
 	/* Watchdog Timer Interrupt */
 #ifdef CONFIG_BOOKE_WDT
-	CRITICAL_EXCEPTION(0x3200, WatchdogTimer, WatchdogException)
+	CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, WatchdogException)
 #else
-	CRITICAL_EXCEPTION(0x3200, WatchdogTimer, unknown_exception)
+	CRITICAL_EXCEPTION(0x3200, WATCHDOG, WatchdogTimer, unknown_exception)
 #endif
 
 	/* Data TLB Error Interrupt */
@@ -538,31 +543,38 @@ interrupt_base:
 #ifdef CONFIG_SPE
 	/* SPE Unavailable */
 	START_EXCEPTION(SPEUnavailable)
-	NORMAL_EXCEPTION_PROLOG
+	NORMAL_EXCEPTION_PROLOG(SPE_UNAVAIL)
 	bne	load_up_spe
 	addi	r3,r1,STACK_FRAME_OVERHEAD
 	EXC_XFER_EE_LITE(0x2010, KernelSPE)
 #else
-	EXCEPTION(0x2020, SPEUnavailable, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2020, SPE_UNAVAIL, SPEUnavailable, \
+		  unknown_exception, EXC_XFER_EE)
 #endif /* CONFIG_SPE */
 
 	/* SPE Floating Point Data */
 #ifdef CONFIG_SPE
-	EXCEPTION(0x2030, SPEFloatingPointData, SPEFloatingPointException, EXC_XFER_EE);
+	EXCEPTION(0x2030, SPE_FP_DATA, SPEFloatingPointData, \
+		  SPEFloatingPointException, EXC_XFER_EE);
 
 	/* SPE Floating Point Round */
-	EXCEPTION(0x2050, SPEFloatingPointRound, SPEFloatingPointRoundException, EXC_XFER_EE)
+	EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \
+		  SPEFloatingPointRoundException, EXC_XFER_EE)
 #else
-	EXCEPTION(0x2040, SPEFloatingPointData, unknown_exception, EXC_XFER_EE)
-	EXCEPTION(0x2050, SPEFloatingPointRound, unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2040, SPE_FP_DATA, SPEFloatingPointData, \
+		  unknown_exception, EXC_XFER_EE)
+	EXCEPTION(0x2050, SPE_FP_ROUND, SPEFloatingPointRound, \
+		  unknown_exception, EXC_XFER_EE)
 #endif /* CONFIG_SPE */
 
 	/* Performance Monitor */
-	EXCEPTION(0x2060, PerformanceMonitor, performance_monitor_exception, EXC_XFER_STD)
+	EXCEPTION(0x2060, PERFORMANCE_MONITOR, PerformanceMonitor, \
+		  performance_monitor_exception, EXC_XFER_STD)
 
-	EXCEPTION(0x2070, Doorbell, doorbell_exception, EXC_XFER_STD)
+	EXCEPTION(0x2070, DOORBELL, Doorbell, doorbell_exception, EXC_XFER_STD)
 
-	CRITICAL_EXCEPTION(0x2080, CriticalDoorbell, unknown_exception)
+	CRITICAL_EXCEPTION(0x2080, DOORBELL_CRITICAL, \
+			   CriticalDoorbell, unknown_exception)
 
 	/* Debug Interrupt */
 	DEBUG_DEBUG_EXCEPTION
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 09/37] KVM: PPC: e500: refactor core-specific TLB code
From: Alexander Graf @ 2012-02-24 14:26 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm
In-Reply-To: <1330093591-19523-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

The PID handling is e500v1/v2-specific, and is moved to e500.c.

The MMU sregs code and kvmppc_core_vcpu_translate will be shared with
e500mc, and is moved from e500.c to e500_tlb.c.

Partially based on patches from Liu Yu <yu.liu@freescale.com>.

Signed-off-by: Scott Wood <scottwood@freescale.com>
[agraf: fix bisectability]
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_host.h |    2 +
 arch/powerpc/kvm/e500.c             |  357 +++++++++++++++++++++++----
 arch/powerpc/kvm/e500.h             |   62 ++++-
 arch/powerpc/kvm/e500_emulate.c     |    6 +-
 arch/powerpc/kvm/e500_tlb.c         |  460 +++++++++--------------------------
 5 files changed, 473 insertions(+), 414 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 52eb9c1..47612cc 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -426,6 +426,8 @@ struct kvm_vcpu_arch {
 	ulong fault_esr;
 	ulong queued_dear;
 	ulong queued_esr;
+	u32 tlbcfg[4];
+	u32 mmucfg;
 #endif
 	gpa_t paddr_accessed;
 
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index 76b35d8..b479ed7 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -22,9 +22,281 @@
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
 
+#include "../mm/mmu_decl.h"
 #include "booke.h"
 #include "e500.h"
 
+struct id {
+	unsigned long val;
+	struct id **pentry;
+};
+
+#define NUM_TIDS 256
+
+/*
+ * This table provide mappings from:
+ * (guestAS,guestTID,guestPR) --> ID of physical cpu
+ * guestAS	[0..1]
+ * guestTID	[0..255]
+ * guestPR	[0..1]
+ * ID		[1..255]
+ * Each vcpu keeps one vcpu_id_table.
+ */
+struct vcpu_id_table {
+	struct id id[2][NUM_TIDS][2];
+};
+
+/*
+ * This table provide reversed mappings of vcpu_id_table:
+ * ID --> address of vcpu_id_table item.
+ * Each physical core has one pcpu_id_table.
+ */
+struct pcpu_id_table {
+	struct id *entry[NUM_TIDS];
+};
+
+static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids);
+
+/* This variable keeps last used shadow ID on local core.
+ * The valid range of shadow ID is [1..255] */
+static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid);
+
+/*
+ * Allocate a free shadow id and setup a valid sid mapping in given entry.
+ * A mapping is only valid when vcpu_id_table and pcpu_id_table are match.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static inline int local_sid_setup_one(struct id *entry)
+{
+	unsigned long sid;
+	int ret = -1;
+
+	sid = ++(__get_cpu_var(pcpu_last_used_sid));
+	if (sid < NUM_TIDS) {
+		__get_cpu_var(pcpu_sids).entry[sid] = entry;
+		entry->val = sid;
+		entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid];
+		ret = sid;
+	}
+
+	/*
+	 * If sid == NUM_TIDS, we've run out of sids.  We return -1, and
+	 * the caller will invalidate everything and start over.
+	 *
+	 * sid > NUM_TIDS indicates a race, which we disable preemption to
+	 * avoid.
+	 */
+	WARN_ON(sid > NUM_TIDS);
+
+	return ret;
+}
+
+/*
+ * Check if given entry contain a valid shadow id mapping.
+ * An ID mapping is considered valid only if
+ * both vcpu and pcpu know this mapping.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+static inline int local_sid_lookup(struct id *entry)
+{
+	if (entry && entry->val != 0 &&
+	    __get_cpu_var(pcpu_sids).entry[entry->val] == entry &&
+	    entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val])
+		return entry->val;
+	return -1;
+}
+
+/* Invalidate all id mappings on local core -- call with preempt disabled */
+static inline void local_sid_destroy_all(void)
+{
+	__get_cpu_var(pcpu_last_used_sid) = 0;
+	memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids)));
+}
+
+static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL);
+	return vcpu_e500->idt;
+}
+
+static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	kfree(vcpu_e500->idt);
+	vcpu_e500->idt = NULL;
+}
+
+/* Map guest pid to shadow.
+ * We use PID to keep shadow of current guest non-zero PID,
+ * and use PID1 to keep shadow of guest zero PID.
+ * So that guest tlbe with TID=0 can be accessed at any time */
+static void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	preempt_disable();
+	vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500,
+			get_cur_as(&vcpu_e500->vcpu),
+			get_cur_pid(&vcpu_e500->vcpu),
+			get_cur_pr(&vcpu_e500->vcpu), 1);
+	vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500,
+			get_cur_as(&vcpu_e500->vcpu), 0,
+			get_cur_pr(&vcpu_e500->vcpu), 1);
+	preempt_enable();
+}
+
+/* Invalidate all mappings on vcpu */
+static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table));
+
+	/* Update shadow pid when mappings are changed */
+	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+}
+
+/* Invalidate one ID mapping on vcpu */
+static inline void kvmppc_e500_id_table_reset_one(
+			       struct kvmppc_vcpu_e500 *vcpu_e500,
+			       int as, int pid, int pr)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+
+	BUG_ON(as >= 2);
+	BUG_ON(pid >= NUM_TIDS);
+	BUG_ON(pr >= 2);
+
+	idt->id[as][pid][pr].val = 0;
+	idt->id[as][pid][pr].pentry = NULL;
+
+	/* Update shadow pid when mappings are changed */
+	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+}
+
+/*
+ * Map guest (vcpu,AS,ID,PR) to physical core shadow id.
+ * This function first lookup if a valid mapping exists,
+ * if not, then creates a new one.
+ *
+ * The caller must have preemption disabled, and keep it that way until
+ * it has finished with the returned shadow id (either written into the
+ * TLB or arch.shadow_pid, or discarded).
+ */
+unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
+				 unsigned int as, unsigned int gid,
+				 unsigned int pr, int avoid_recursion)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+	int sid;
+
+	BUG_ON(as >= 2);
+	BUG_ON(gid >= NUM_TIDS);
+	BUG_ON(pr >= 2);
+
+	sid = local_sid_lookup(&idt->id[as][gid][pr]);
+
+	while (sid <= 0) {
+		/* No mapping yet */
+		sid = local_sid_setup_one(&idt->id[as][gid][pr]);
+		if (sid <= 0) {
+			_tlbil_all();
+			local_sid_destroy_all();
+		}
+
+		/* Update shadow pid when mappings are changed */
+		if (!avoid_recursion)
+			kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+	}
+
+	return sid;
+}
+
+unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu,
+				      struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	return kvmppc_e500_get_sid(to_e500(vcpu), get_tlb_ts(gtlbe),
+				   get_tlb_tid(gtlbe), get_cur_pr(vcpu), 0);
+}
+
+void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	if (vcpu->arch.pid != pid) {
+		vcpu_e500->pid[0] = vcpu->arch.pid = pid;
+		kvmppc_e500_recalc_shadow_pid(vcpu_e500);
+	}
+}
+
+/* gtlbe must not be mapped by more than one host tlbe */
+void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
+                           struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	struct vcpu_id_table *idt = vcpu_e500->idt;
+	unsigned int pr, tid, ts, pid;
+	u32 val, eaddr;
+	unsigned long flags;
+
+	ts = get_tlb_ts(gtlbe);
+	tid = get_tlb_tid(gtlbe);
+
+	preempt_disable();
+
+	/* One guest ID may be mapped to two shadow IDs */
+	for (pr = 0; pr < 2; pr++) {
+		/*
+		 * The shadow PID can have a valid mapping on at most one
+		 * host CPU.  In the common case, it will be valid on this
+		 * CPU, in which case we do a local invalidation of the
+		 * specific address.
+		 *
+		 * If the shadow PID is not valid on the current host CPU,
+		 * we invalidate the entire shadow PID.
+		 */
+		pid = local_sid_lookup(&idt->id[ts][tid][pr]);
+		if (pid <= 0) {
+			kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr);
+			continue;
+		}
+
+		/*
+		 * The guest is invalidating a 4K entry which is in a PID
+		 * that has a valid shadow mapping on this host CPU.  We
+		 * search host TLB to invalidate it's shadow TLB entry,
+		 * similar to __tlbil_va except that we need to look in AS1.
+		 */
+		val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS;
+		eaddr = get_tlb_eaddr(gtlbe);
+
+		local_irq_save(flags);
+
+		mtspr(SPRN_MAS6, val);
+		asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr));
+		val = mfspr(SPRN_MAS1);
+		if (val & MAS1_VALID) {
+			mtspr(SPRN_MAS1, val & ~MAS1_VALID);
+			asm volatile("tlbwe");
+		}
+
+		local_irq_restore(flags);
+	}
+
+	preempt_enable();
+}
+
+void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	kvmppc_e500_id_table_reset_all(vcpu_e500);
+}
+
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
+{
+	/* Recalc shadow pid since MSR changes */
+	kvmppc_e500_recalc_shadow_pid(to_e500(vcpu));
+}
+
 void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu)
 {
 }
@@ -36,13 +308,13 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	kvmppc_booke_vcpu_load(vcpu, cpu);
-	kvmppc_e500_tlb_load(vcpu, cpu);
+
+	/* Shadow PID may be expired on local core */
+	kvmppc_e500_recalc_shadow_pid(to_e500(vcpu));
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
-	kvmppc_e500_tlb_put(vcpu);
-
 #ifdef CONFIG_SPE
 	if (vcpu->arch.shadow_msr & MSR_SPE)
 		kvmppc_vcpu_disable_spe(vcpu);
@@ -63,6 +335,23 @@ int kvmppc_core_check_processor_compat(void)
 	return r;
 }
 
+static void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	struct kvm_book3e_206_tlb_entry *tlbe;
+
+	/* Insert large initial mapping for guest. */
+	tlbe = get_entry(vcpu_e500, 1, 0);
+	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
+	tlbe->mas2 = 0;
+	tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK;
+
+	/* 4K map for serial output. Used by kernel wrapper. */
+	tlbe = get_entry(vcpu_e500, 1, 1);
+	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
+	tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
+	tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
+}
+
 int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
@@ -78,32 +367,6 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
-int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
-                               struct kvm_translation *tr)
-{
-	int index;
-	gva_t eaddr;
-	u8 pid;
-	u8 as;
-
-	eaddr = tr->linear_address;
-	pid = (tr->linear_address >> 32) & 0xff;
-	as = (tr->linear_address >> 40) & 0x1;
-
-	index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as);
-	if (index < 0) {
-		tr->valid = 0;
-		return 0;
-	}
-
-	tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr);
-	/* XXX what does "writeable" and "usermode" even mean? */
-	tr->valid = 1;
-
-	return 0;
-}
-
 void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
@@ -117,19 +380,6 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0;
 	sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar;
 
-	sregs->u.e.mas0 = vcpu->arch.shared->mas0;
-	sregs->u.e.mas1 = vcpu->arch.shared->mas1;
-	sregs->u.e.mas2 = vcpu->arch.shared->mas2;
-	sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3;
-	sregs->u.e.mas4 = vcpu->arch.shared->mas4;
-	sregs->u.e.mas6 = vcpu->arch.shared->mas6;
-
-	sregs->u.e.mmucfg = mfspr(SPRN_MMUCFG);
-	sregs->u.e.tlbcfg[0] = vcpu_e500->tlb0cfg;
-	sregs->u.e.tlbcfg[1] = vcpu_e500->tlb1cfg;
-	sregs->u.e.tlbcfg[2] = 0;
-	sregs->u.e.tlbcfg[3] = 0;
-
 	sregs->u.e.ivor_high[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL];
 	sregs->u.e.ivor_high[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA];
 	sregs->u.e.ivor_high[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND];
@@ -137,11 +387,13 @@ void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
 
 	kvmppc_get_sregs_ivor(vcpu, sregs);
+	kvmppc_get_sregs_e500_tlb(vcpu, sregs);
 }
 
 int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int ret;
 
 	if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
 		vcpu_e500->svr = sregs->u.e.impl.fsl.svr;
@@ -149,14 +401,9 @@ int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 		vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar;
 	}
 
-	if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) {
-		vcpu->arch.shared->mas0 = sregs->u.e.mas0;
-		vcpu->arch.shared->mas1 = sregs->u.e.mas1;
-		vcpu->arch.shared->mas2 = sregs->u.e.mas2;
-		vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3;
-		vcpu->arch.shared->mas4 = sregs->u.e.mas4;
-		vcpu->arch.shared->mas6 = sregs->u.e.mas6;
-	}
+	ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs);
+	if (ret < 0)
+		return ret;
 
 	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
 		return 0;
@@ -195,9 +442,12 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 	if (err)
 		goto free_vcpu;
 
+	if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL)
+		goto uninit_vcpu;
+
 	err = kvmppc_e500_tlb_init(vcpu_e500);
 	if (err)
-		goto uninit_vcpu;
+		goto uninit_id;
 
 	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
 	if (!vcpu->arch.shared)
@@ -207,6 +457,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
 
 uninit_tlb:
 	kvmppc_e500_tlb_uninit(vcpu_e500);
+uninit_id:
+	kvmppc_e500_id_table_free(vcpu_e500);
 uninit_vcpu:
 	kvm_vcpu_uninit(vcpu);
 free_vcpu:
@@ -220,8 +472,9 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 
 	free_page((unsigned long)vcpu->arch.shared);
-	kvm_vcpu_uninit(vcpu);
 	kvmppc_e500_tlb_uninit(vcpu_e500);
+	kvmppc_e500_id_table_free(vcpu_e500);
+	kvm_vcpu_uninit(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
 }
 
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index a48af00..34cef08 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -35,7 +35,9 @@ struct tlbe_priv {
 	struct tlbe_ref ref; /* TLB0 only -- TLB1 uses tlb_refs */
 };
 
+#ifdef CONFIG_KVM_E500
 struct vcpu_id_table;
+#endif
 
 struct kvmppc_e500_tlb_params {
 	int entries, ways, sets;
@@ -70,23 +72,22 @@ struct kvmppc_vcpu_e500 {
 	struct tlbe_ref *tlb_refs[E500_TLB_NUM];
 	unsigned int host_tlb1_nv;
 
-	u32 host_pid[E500_PID_NUM];
-	u32 pid[E500_PID_NUM];
 	u32 svr;
-
-	/* vcpu id table */
-	struct vcpu_id_table *idt;
-
 	u32 l1csr0;
 	u32 l1csr1;
 	u32 hid0;
 	u32 hid1;
-	u32 tlb0cfg;
-	u32 tlb1cfg;
 	u64 mcar;
 
 	struct page **shared_tlb_pages;
 	int num_shared_tlb_pages;
+
+#ifdef CONFIG_KVM_E500
+	u32 pid[E500_PID_NUM];
+
+	/* vcpu id table */
+	struct vcpu_id_table *idt;
+#endif
 };
 
 static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
@@ -113,23 +114,25 @@ static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
 	  (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \
 	   | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK)
 
-extern void kvmppc_e500_tlb_put(struct kvm_vcpu *);
-extern void kvmppc_e500_tlb_load(struct kvm_vcpu *, int);
-extern void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *);
-extern void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *);
 int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500,
 				ulong value);
 int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu);
 int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu);
 int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb);
 int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb);
-int kvmppc_e500_tlb_search(struct kvm_vcpu *, gva_t, unsigned int, int);
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500);
 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500);
 
 void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
+
+#ifdef CONFIG_KVM_E500
+unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
+				 unsigned int as, unsigned int gid,
+				 unsigned int pr, int avoid_recursion);
+#endif
+
 /* TLB helper functions */
 static inline unsigned int
 get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe)
@@ -183,6 +186,12 @@ get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe)
 	return (tlbe->mas1 >> 30) & 0x1;
 }
 
+static inline unsigned int
+get_tlb_tsize(const struct kvm_book3e_206_tlb_entry *tlbe)
+{
+	return (tlbe->mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT;
+}
+
 static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.pid & 0xff;
@@ -248,4 +257,31 @@ static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 	return 1;
 }
 
+static inline struct kvm_book3e_206_tlb_entry *get_entry(
+	struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry)
+{
+	int offset = vcpu_e500->gtlb_offset[tlbsel];
+	return &vcpu_e500->gtlb_arch[offset + entry];
+}
+
+void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
+			   struct kvm_book3e_206_tlb_entry *gtlbe);
+void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500);
+
+#ifdef CONFIG_KVM_E500
+unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu,
+				      struct kvm_book3e_206_tlb_entry *gtlbe);
+
+static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	unsigned int tidseld = (vcpu->arch.shared->mas4 >> 16) & 0xf;
+
+	return vcpu_e500->pid[tidseld];
+}
+
+/* Force TS=1 for all guest mappings. */
+#define get_tlb_sts(gtlbe)              (MAS1_TS)
+#endif /* CONFIG_KVM_E500 */
+
 #endif /* KVM_E500_H */
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 7e2d592..c80794d 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -174,9 +174,9 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		kvmppc_set_gpr(vcpu, rt, val);
 		break;
 	case SPRN_TLB0CFG:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb0cfg); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.tlbcfg[0]); break;
 	case SPRN_TLB1CFG:
-		kvmppc_set_gpr(vcpu, rt, vcpu_e500->tlb1cfg); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.tlbcfg[1]); break;
 	case SPRN_L1CSR0:
 		kvmppc_set_gpr(vcpu, rt, vcpu_e500->l1csr0); break;
 	case SPRN_L1CSR1:
@@ -192,7 +192,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		kvmppc_set_gpr(vcpu, rt, 0); break;
 
 	case SPRN_MMUCFG:
-		kvmppc_set_gpr(vcpu, rt, mfspr(SPRN_MMUCFG)); break;
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.mmucfg); break;
 
 	/* extra exceptions */
 	case SPRN_IVOR32:
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 7d4a918..9925fc6 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -27,208 +27,14 @@
 #include <linux/hugetlb.h>
 #include <asm/kvm_ppc.h>
 
-#include "../mm/mmu_decl.h"
 #include "e500.h"
 #include "trace.h"
 #include "timing.h"
 
 #define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1)
 
-struct id {
-	unsigned long val;
-	struct id **pentry;
-};
-
-#define NUM_TIDS 256
-
-/*
- * This table provide mappings from:
- * (guestAS,guestTID,guestPR) --> ID of physical cpu
- * guestAS	[0..1]
- * guestTID	[0..255]
- * guestPR	[0..1]
- * ID		[1..255]
- * Each vcpu keeps one vcpu_id_table.
- */
-struct vcpu_id_table {
-	struct id id[2][NUM_TIDS][2];
-};
-
-/*
- * This table provide reversed mappings of vcpu_id_table:
- * ID --> address of vcpu_id_table item.
- * Each physical core has one pcpu_id_table.
- */
-struct pcpu_id_table {
-	struct id *entry[NUM_TIDS];
-};
-
-static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids);
-
-/* This variable keeps last used shadow ID on local core.
- * The valid range of shadow ID is [1..255] */
-static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid);
-
 static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM];
 
-static struct kvm_book3e_206_tlb_entry *get_entry(
-	struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry)
-{
-	int offset = vcpu_e500->gtlb_offset[tlbsel];
-	return &vcpu_e500->gtlb_arch[offset + entry];
-}
-
-/*
- * Allocate a free shadow id and setup a valid sid mapping in given entry.
- * A mapping is only valid when vcpu_id_table and pcpu_id_table are match.
- *
- * The caller must have preemption disabled, and keep it that way until
- * it has finished with the returned shadow id (either written into the
- * TLB or arch.shadow_pid, or discarded).
- */
-static inline int local_sid_setup_one(struct id *entry)
-{
-	unsigned long sid;
-	int ret = -1;
-
-	sid = ++(__get_cpu_var(pcpu_last_used_sid));
-	if (sid < NUM_TIDS) {
-		__get_cpu_var(pcpu_sids).entry[sid] = entry;
-		entry->val = sid;
-		entry->pentry = &__get_cpu_var(pcpu_sids).entry[sid];
-		ret = sid;
-	}
-
-	/*
-	 * If sid == NUM_TIDS, we've run out of sids.  We return -1, and
-	 * the caller will invalidate everything and start over.
-	 *
-	 * sid > NUM_TIDS indicates a race, which we disable preemption to
-	 * avoid.
-	 */
-	WARN_ON(sid > NUM_TIDS);
-
-	return ret;
-}
-
-/*
- * Check if given entry contain a valid shadow id mapping.
- * An ID mapping is considered valid only if
- * both vcpu and pcpu know this mapping.
- *
- * The caller must have preemption disabled, and keep it that way until
- * it has finished with the returned shadow id (either written into the
- * TLB or arch.shadow_pid, or discarded).
- */
-static inline int local_sid_lookup(struct id *entry)
-{
-	if (entry && entry->val != 0 &&
-	    __get_cpu_var(pcpu_sids).entry[entry->val] == entry &&
-	    entry->pentry == &__get_cpu_var(pcpu_sids).entry[entry->val])
-		return entry->val;
-	return -1;
-}
-
-/* Invalidate all id mappings on local core -- call with preempt disabled */
-static inline void local_sid_destroy_all(void)
-{
-	__get_cpu_var(pcpu_last_used_sid) = 0;
-	memset(&__get_cpu_var(pcpu_sids), 0, sizeof(__get_cpu_var(pcpu_sids)));
-}
-
-static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
-	vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL);
-	return vcpu_e500->idt;
-}
-
-static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
-	kfree(vcpu_e500->idt);
-}
-
-/* Invalidate all mappings on vcpu */
-static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
-	memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table));
-
-	/* Update shadow pid when mappings are changed */
-	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
-}
-
-/* Invalidate one ID mapping on vcpu */
-static inline void kvmppc_e500_id_table_reset_one(
-			       struct kvmppc_vcpu_e500 *vcpu_e500,
-			       int as, int pid, int pr)
-{
-	struct vcpu_id_table *idt = vcpu_e500->idt;
-
-	BUG_ON(as >= 2);
-	BUG_ON(pid >= NUM_TIDS);
-	BUG_ON(pr >= 2);
-
-	idt->id[as][pid][pr].val = 0;
-	idt->id[as][pid][pr].pentry = NULL;
-
-	/* Update shadow pid when mappings are changed */
-	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
-}
-
-/*
- * Map guest (vcpu,AS,ID,PR) to physical core shadow id.
- * This function first lookup if a valid mapping exists,
- * if not, then creates a new one.
- *
- * The caller must have preemption disabled, and keep it that way until
- * it has finished with the returned shadow id (either written into the
- * TLB or arch.shadow_pid, or discarded).
- */
-static unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500,
-					unsigned int as, unsigned int gid,
-					unsigned int pr, int avoid_recursion)
-{
-	struct vcpu_id_table *idt = vcpu_e500->idt;
-	int sid;
-
-	BUG_ON(as >= 2);
-	BUG_ON(gid >= NUM_TIDS);
-	BUG_ON(pr >= 2);
-
-	sid = local_sid_lookup(&idt->id[as][gid][pr]);
-
-	while (sid <= 0) {
-		/* No mapping yet */
-		sid = local_sid_setup_one(&idt->id[as][gid][pr]);
-		if (sid <= 0) {
-			_tlbil_all();
-			local_sid_destroy_all();
-		}
-
-		/* Update shadow pid when mappings are changed */
-		if (!avoid_recursion)
-			kvmppc_e500_recalc_shadow_pid(vcpu_e500);
-	}
-
-	return sid;
-}
-
-/* Map guest pid to shadow.
- * We use PID to keep shadow of current guest non-zero PID,
- * and use PID1 to keep shadow of guest zero PID.
- * So that guest tlbe with TID=0 can be accessed at any time */
-void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
-	preempt_disable();
-	vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500,
-			get_cur_as(&vcpu_e500->vcpu),
-			get_cur_pid(&vcpu_e500->vcpu),
-			get_cur_pr(&vcpu_e500->vcpu), 1);
-	vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500,
-			get_cur_as(&vcpu_e500->vcpu), 0,
-			get_cur_pr(&vcpu_e500->vcpu), 1);
-	preempt_enable();
-}
-
 static inline unsigned int gtlb0_get_next_victim(
 		struct kvmppc_vcpu_e500 *vcpu_e500)
 {
@@ -336,6 +142,7 @@ static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 	}
 }
 
+#ifdef CONFIG_KVM_E500
 void kvmppc_map_magic(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
@@ -360,75 +167,21 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu)
 	__write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index));
 	preempt_enable();
 }
-
-void kvmppc_e500_tlb_load(struct kvm_vcpu *vcpu, int cpu)
-{
-	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-
-	/* Shadow PID may be expired on local core */
-	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
-}
-
-void kvmppc_e500_tlb_put(struct kvm_vcpu *vcpu)
-{
-}
+#endif
 
 static void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500,
 				int tlbsel, int esel)
 {
 	struct kvm_book3e_206_tlb_entry *gtlbe =
 		get_entry(vcpu_e500, tlbsel, esel);
-	struct vcpu_id_table *idt = vcpu_e500->idt;
-	unsigned int pr, tid, ts, pid;
-	u32 val, eaddr;
-	unsigned long flags;
-
-	ts = get_tlb_ts(gtlbe);
-	tid = get_tlb_tid(gtlbe);
-
-	preempt_disable();
-
-	/* One guest ID may be mapped to two shadow IDs */
-	for (pr = 0; pr < 2; pr++) {
-		/*
-		 * The shadow PID can have a valid mapping on at most one
-		 * host CPU.  In the common case, it will be valid on this
-		 * CPU, in which case (for TLB0) we do a local invalidation
-		 * of the specific address.
-		 *
-		 * If the shadow PID is not valid on the current host CPU, or
-		 * if we're invalidating a TLB1 entry, we invalidate the
-		 * entire shadow PID.
-		 */
-		if (tlbsel == 1 ||
-		    (pid = local_sid_lookup(&idt->id[ts][tid][pr])) <= 0) {
-			kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr);
-			continue;
-		}
-
-		/*
-		 * The guest is invalidating a TLB0 entry which is in a PID
-		 * that has a valid shadow mapping on this host CPU.  We
-		 * search host TLB0 to invalidate it's shadow TLB entry,
-		 * similar to __tlbil_va except that we need to look in AS1.
-		 */
-		val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS;
-		eaddr = get_tlb_eaddr(gtlbe);
-
-		local_irq_save(flags);
-
-		mtspr(SPRN_MAS6, val);
-		asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr));
-		val = mfspr(SPRN_MAS1);
-		if (val & MAS1_VALID) {
-			mtspr(SPRN_MAS1, val & ~MAS1_VALID);
-			asm volatile("tlbwe");
-		}
 
-		local_irq_restore(flags);
+	if (tlbsel == 1) {
+		kvmppc_e500_tlbil_all(vcpu_e500);
+		return;
 	}
 
-	preempt_enable();
+	/* Guest tlbe is backed by at most one host tlbe per shadow pid. */
+	kvmppc_e500_tlbil_one(vcpu_e500, gtlbe);
 }
 
 static int tlb0_set_base(gva_t addr, int sets, int ways)
@@ -546,7 +299,7 @@ static void clear_tlb_refs(struct kvmppc_vcpu_e500 *vcpu_e500)
 	int stlbsel = 1;
 	int i;
 
-	kvmppc_e500_id_table_reset_all(vcpu_e500);
+	kvmppc_e500_tlbil_all(vcpu_e500);
 
 	for (i = 0; i < host_tlb_params[stlbsel].entries; i++) {
 		struct tlbe_ref *ref =
@@ -561,19 +314,18 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 		unsigned int eaddr, int as)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	unsigned int victim, pidsel, tsized;
+	unsigned int victim, tsized;
 	int tlbsel;
 
 	/* since we only have two TLBs, only lower bit is used. */
 	tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1;
 	victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0;
-	pidsel = (vcpu->arch.shared->mas4 >> 16) & 0xf;
 	tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f;
 
 	vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim)
 		| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
 	vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0)
-		| MAS1_TID(vcpu_e500->pid[pidsel])
+		| MAS1_TID(get_tlbmiss_tid(vcpu))
 		| MAS1_TSIZE(tsized);
 	vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN)
 		| (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK);
@@ -585,23 +337,22 @@ static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu,
 
 /* TID must be supplied by the caller */
 static inline void kvmppc_e500_setup_stlbe(
-	struct kvmppc_vcpu_e500 *vcpu_e500,
+	struct kvm_vcpu *vcpu,
 	struct kvm_book3e_206_tlb_entry *gtlbe,
 	int tsize, struct tlbe_ref *ref, u64 gvaddr,
 	struct kvm_book3e_206_tlb_entry *stlbe)
 {
 	pfn_t pfn = ref->pfn;
+	u32 pr = vcpu->arch.shared->msr & MSR_PR;
 
 	BUG_ON(!(ref->flags & E500_TLB_VALID));
 
-	/* Force TS=1 IPROT=0 for all guest mappings. */
-	stlbe->mas1 = MAS1_TSIZE(tsize) | MAS1_TS | MAS1_VALID;
-	stlbe->mas2 = (gvaddr & MAS2_EPN)
-		| e500_shadow_mas2_attrib(gtlbe->mas2,
-				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
-	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT)
-		| e500_shadow_mas3_attrib(gtlbe->mas7_3,
-				vcpu_e500->vcpu.arch.shared->msr & MSR_PR);
+	/* Force IPROT=0 for all guest mappings. */
+	stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID;
+	stlbe->mas2 = (gvaddr & MAS2_EPN) |
+		      e500_shadow_mas2_attrib(gtlbe->mas2, pr);
+	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
+			e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
 }
 
 static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
@@ -735,7 +486,8 @@ static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	kvmppc_e500_ref_release(ref);
 	kvmppc_e500_ref_setup(ref, gtlbe, pfn);
 
-	kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, tsize, ref, gvaddr, stlbe);
+	kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize,
+				ref, gvaddr, stlbe);
 }
 
 /* XXX only map the one-one case, for now use TLB0 */
@@ -775,14 +527,6 @@ static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	return victim;
 }
 
-void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
-{
-	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-
-	/* Recalc shadow pid since MSR changes */
-	kvmppc_e500_recalc_shadow_pid(vcpu_e500);
-}
-
 static inline int kvmppc_e500_gtlbe_invalidate(
 				struct kvmppc_vcpu_e500 *vcpu_e500,
 				int tlbsel, int esel)
@@ -810,7 +554,7 @@ int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value)
 			kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel);
 
 	/* Invalidate all vcpu id mappings */
-	kvmppc_e500_id_table_reset_all(vcpu_e500);
+	kvmppc_e500_tlbil_all(vcpu_e500);
 
 	return EMULATE_DONE;
 }
@@ -843,7 +587,7 @@ int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb)
 	}
 
 	/* Invalidate all vcpu id mappings */
-	kvmppc_e500_id_table_reset_all(vcpu_e500);
+	kvmppc_e500_tlbil_all(vcpu_e500);
 
 	return EMULATE_DONE;
 }
@@ -928,9 +672,7 @@ static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 	int stid;
 
 	preempt_disable();
-	stid = kvmppc_e500_get_sid(vcpu_e500, get_tlb_ts(gtlbe),
-				   get_tlb_tid(gtlbe),
-				   get_cur_pr(&vcpu_e500->vcpu), 0);
+	stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe);
 
 	stlbe->mas1 |= MAS1_TID(stid);
 	write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe);
@@ -940,8 +682,8 @@ static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500,
 int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	struct kvm_book3e_206_tlb_entry *gtlbe;
-	int tlbsel, esel;
+	struct kvm_book3e_206_tlb_entry *gtlbe, stlbe;
+	int tlbsel, esel, stlbsel, sesel;
 
 	tlbsel = get_tlb_tlbsel(vcpu);
 	esel = get_tlb_esel(vcpu, tlbsel);
@@ -960,8 +702,6 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 
 	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
 	if (tlbe_is_host_safe(vcpu, gtlbe)) {
-		struct kvm_book3e_206_tlb_entry stlbe;
-		int stlbsel, sesel;
 		u64 eaddr;
 		u64 raddr;
 
@@ -988,7 +728,7 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 			 * are mapped on the fly. */
 			stlbsel = 1;
 			sesel = kvmppc_e500_tlb1_map(vcpu_e500, eaddr,
-					raddr >> PAGE_SHIFT, gtlbe, &stlbe);
+				    raddr >> PAGE_SHIFT, gtlbe, &stlbe);
 			break;
 
 		default:
@@ -1002,6 +742,48 @@ int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
+static int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
+				  gva_t eaddr, unsigned int pid, int as)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int esel, tlbsel;
+
+	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
+		esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as);
+		if (esel >= 0)
+			return index_of(tlbsel, esel);
+	}
+
+	return -1;
+}
+
+/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
+int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
+                               struct kvm_translation *tr)
+{
+	int index;
+	gva_t eaddr;
+	u8 pid;
+	u8 as;
+
+	eaddr = tr->linear_address;
+	pid = (tr->linear_address >> 32) & 0xff;
+	as = (tr->linear_address >> 40) & 0x1;
+
+	index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as);
+	if (index < 0) {
+		tr->valid = 0;
+		return 0;
+	}
+
+	tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr);
+	/* XXX what does "writeable" and "usermode" even mean? */
+	tr->valid = 1;
+
+	return 0;
+}
+
+
 int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
 	unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS);
@@ -1065,7 +847,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 		sesel = 0; /* unused */
 		priv = &vcpu_e500->gtlb_priv[tlbsel][esel];
 
-		kvmppc_e500_setup_stlbe(vcpu_e500, gtlbe, BOOK3E_PAGESZ_4K,
+		kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K,
 					&priv->ref, eaddr, &stlbe);
 		break;
 
@@ -1086,48 +868,6 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr,
 	write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel);
 }
 
-int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu,
-				gva_t eaddr, unsigned int pid, int as)
-{
-	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-	int esel, tlbsel;
-
-	for (tlbsel = 0; tlbsel < 2; tlbsel++) {
-		esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as);
-		if (esel >= 0)
-			return index_of(tlbsel, esel);
-	}
-
-	return -1;
-}
-
-void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
-{
-	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
-
-	if (vcpu->arch.pid != pid) {
-		vcpu_e500->pid[0] = vcpu->arch.pid = pid;
-		kvmppc_e500_recalc_shadow_pid(vcpu_e500);
-	}
-}
-
-void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500)
-{
-	struct kvm_book3e_206_tlb_entry *tlbe;
-
-	/* Insert large initial mapping for guest. */
-	tlbe = get_entry(vcpu_e500, 1, 0);
-	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M);
-	tlbe->mas2 = 0;
-	tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK;
-
-	/* 4K map for serial output. Used by kernel wrapper. */
-	tlbe = get_entry(vcpu_e500, 1, 1);
-	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
-	tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
-	tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
-}
-
 static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	int i;
@@ -1154,6 +894,36 @@ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500)
 	vcpu_e500->gtlb_arch = NULL;
 }
 
+void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	sregs->u.e.mas0 = vcpu->arch.shared->mas0;
+	sregs->u.e.mas1 = vcpu->arch.shared->mas1;
+	sregs->u.e.mas2 = vcpu->arch.shared->mas2;
+	sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3;
+	sregs->u.e.mas4 = vcpu->arch.shared->mas4;
+	sregs->u.e.mas6 = vcpu->arch.shared->mas6;
+
+	sregs->u.e.mmucfg = vcpu->arch.mmucfg;
+	sregs->u.e.tlbcfg[0] = vcpu->arch.tlbcfg[0];
+	sregs->u.e.tlbcfg[1] = vcpu->arch.tlbcfg[1];
+	sregs->u.e.tlbcfg[2] = 0;
+	sregs->u.e.tlbcfg[3] = 0;
+}
+
+int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) {
+		vcpu->arch.shared->mas0 = sregs->u.e.mas0;
+		vcpu->arch.shared->mas1 = sregs->u.e.mas1;
+		vcpu->arch.shared->mas2 = sregs->u.e.mas2;
+		vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3;
+		vcpu->arch.shared->mas4 = sregs->u.e.mas4;
+		vcpu->arch.shared->mas6 = sregs->u.e.mas6;
+	}
+
+	return 0;
+}
+
 int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 			      struct kvm_config_tlb *cfg)
 {
@@ -1237,14 +1007,16 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu,
 	vcpu_e500->gtlb_offset[0] = 0;
 	vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0];
 
-	vcpu_e500->tlb0cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE;
+
+	vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
 	if (params.tlb_sizes[0] <= 2048)
-		vcpu_e500->tlb0cfg |= params.tlb_sizes[0];
-	vcpu_e500->tlb0cfg |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
+		vcpu->arch.tlbcfg[0] |= params.tlb_sizes[0];
+	vcpu->arch.tlbcfg[0] |= params.tlb_ways[0] << TLBnCFG_ASSOC_SHIFT;
 
-	vcpu_e500->tlb1cfg &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-	vcpu_e500->tlb1cfg |= params.tlb_sizes[1];
-	vcpu_e500->tlb1cfg |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
+	vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
+	vcpu->arch.tlbcfg[1] |= params.tlb_sizes[1];
+	vcpu->arch.tlbcfg[1] |= params.tlb_ways[1] << TLBnCFG_ASSOC_SHIFT;
 
 	vcpu_e500->shared_tlb_pages = pages;
 	vcpu_e500->num_shared_tlb_pages = num_pages;
@@ -1280,6 +1052,7 @@ int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu,
 
 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
+	struct kvm_vcpu *vcpu = &vcpu_e500->vcpu;
 	int entry_size = sizeof(struct kvm_book3e_206_tlb_entry);
 	int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE;
 
@@ -1356,20 +1129,17 @@ int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500)
 	if (!vcpu_e500->gtlb_priv[1])
 		goto err;
 
-	if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL)
-		goto err;
-
 	/* Init TLB configuration register */
-	vcpu_e500->tlb0cfg = mfspr(SPRN_TLB0CFG) &
+	vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) &
 			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-	vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[0].entries;
-	vcpu_e500->tlb0cfg |=
+	vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[0].entries;
+	vcpu->arch.tlbcfg[0] |=
 		vcpu_e500->gtlb_params[0].ways << TLBnCFG_ASSOC_SHIFT;
 
-	vcpu_e500->tlb1cfg = mfspr(SPRN_TLB1CFG) &
+	vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) &
 			     ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC);
-	vcpu_e500->tlb0cfg |= vcpu_e500->gtlb_params[1].entries;
-	vcpu_e500->tlb0cfg |=
+	vcpu->arch.tlbcfg[0] |= vcpu_e500->gtlb_params[1].entries;
+	vcpu->arch.tlbcfg[0] |=
 		vcpu_e500->gtlb_params[1].ways << TLBnCFG_ASSOC_SHIFT;
 
 	return 0;
@@ -1384,8 +1154,6 @@ err:
 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	free_gtlb(vcpu_e500);
-	kvmppc_e500_id_table_free(vcpu_e500);
-
 	kfree(vcpu_e500->tlb_refs[0]);
 	kfree(vcpu_e500->tlb_refs[1]);
 }
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 15/37] KVM: PPC: e500mc support
From: Alexander Graf @ 2012-02-24 14:26 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm
In-Reply-To: <1330093591-19523-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

Add processor support for e500mc, using hardware virtualization support
(GS-mode).

Current issues include:
 - No support for external proxy (coreint) interrupt mode in the guest.

Includes work by Ashish Kalra <Ashish.Kalra@freescale.com>,
Varun Sethi <Varun.Sethi@freescale.com>, and
Liu Yu <yu.liu@freescale.com>.

Signed-off-by: Scott Wood <scottwood@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/cputable.h       |    6 +-
 arch/powerpc/include/asm/kvm.h            |    1 +
 arch/powerpc/kernel/cpu_setup_fsl_booke.S |    1 +
 arch/powerpc/kernel/head_fsl_booke.S      |   46 ++++
 arch/powerpc/kvm/Kconfig                  |   17 ++-
 arch/powerpc/kvm/Makefile                 |   11 +
 arch/powerpc/kvm/e500.h                   |   13 +-
 arch/powerpc/kvm/e500_emulate.c           |   24 ++-
 arch/powerpc/kvm/e500_tlb.c               |   21 ++-
 arch/powerpc/kvm/e500mc.c                 |  342 +++++++++++++++++++++++++++++
 arch/powerpc/kvm/powerpc.c                |    6 +-
 11 files changed, 476 insertions(+), 12 deletions(-)
 create mode 100644 arch/powerpc/kvm/e500mc.c

diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 2022f2d..598cd24 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -168,6 +168,7 @@ extern const char *powerpc_base_platform;
 #define CPU_FTR_LWSYNC			ASM_CONST(0x0000000008000000)
 #define CPU_FTR_NOEXECUTE		ASM_CONST(0x0000000010000000)
 #define CPU_FTR_INDEXED_DCR		ASM_CONST(0x0000000020000000)
+#define CPU_FTR_EMB_HV			ASM_CONST(0x0000000040000000)
 
 /*
  * Add the 64-bit processor unique features in the top half of the word;
@@ -386,11 +387,11 @@ extern const char *powerpc_base_platform;
 	    CPU_FTR_NODSISRALIGN | CPU_FTR_NOEXECUTE)
 #define CPU_FTRS_E500MC	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
-	    CPU_FTR_DBELL | CPU_FTR_DEBUG_LVL_EXC)
+	    CPU_FTR_DBELL | CPU_FTR_DEBUG_LVL_EXC | CPU_FTR_EMB_HV)
 #define CPU_FTRS_E5500	(CPU_FTR_USE_TB | CPU_FTR_NODSISRALIGN | \
 	    CPU_FTR_L2CSR | CPU_FTR_LWSYNC | CPU_FTR_NOEXECUTE | \
 	    CPU_FTR_DBELL | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
-	    CPU_FTR_DEBUG_LVL_EXC)
+	    CPU_FTR_DEBUG_LVL_EXC | CPU_FTR_EMB_HV)
 #define CPU_FTRS_GENERIC_32	(CPU_FTR_COMMON | CPU_FTR_NODSISRALIGN)
 
 /* 64-bit CPUs */
@@ -535,6 +536,7 @@ enum {
 #ifdef CONFIG_PPC_E500MC
 	    CPU_FTRS_E500MC & CPU_FTRS_E5500 &
 #endif
+	    ~CPU_FTR_EMB_HV &	/* can be removed at runtime */
 	    CPU_FTRS_POSSIBLE,
 };
 #endif /* __powerpc64__ */
diff --git a/arch/powerpc/include/asm/kvm.h b/arch/powerpc/include/asm/kvm.h
index b921c3f..1bea4d8 100644
--- a/arch/powerpc/include/asm/kvm.h
+++ b/arch/powerpc/include/asm/kvm.h
@@ -277,6 +277,7 @@ struct kvm_sync_regs {
 #define KVM_CPU_E500V2		2
 #define KVM_CPU_3S_32		3
 #define KVM_CPU_3S_64		4
+#define KVM_CPU_E500MC		5
 
 /* for KVM_CAP_SPAPR_TCE */
 struct kvm_create_spapr_tce {
diff --git a/arch/powerpc/kernel/cpu_setup_fsl_booke.S b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
index 8053db0..69fdd23 100644
--- a/arch/powerpc/kernel/cpu_setup_fsl_booke.S
+++ b/arch/powerpc/kernel/cpu_setup_fsl_booke.S
@@ -73,6 +73,7 @@ _GLOBAL(__setup_cpu_e500v2)
 	mtlr	r4
 	blr
 _GLOBAL(__setup_cpu_e500mc)
+	mr	r5, r4
 	mflr	r4
 	bl	__e500_icache_setup
 	bl	__e500_dcache_setup
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 418931f..88c0a35 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -380,10 +380,16 @@ interrupt_base:
 	mtspr	SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */
 	mfspr	r10, SPRN_SPRG_THREAD
 	stw	r11, THREAD_NORMSAVE(0)(r10)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+	mfspr	r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
 	stw	r12, THREAD_NORMSAVE(1)(r10)
 	stw	r13, THREAD_NORMSAVE(2)(r10)
 	mfcr	r13
 	stw	r13, THREAD_NORMSAVE(3)(r10)
+	DO_KVM	BOOKE_INTERRUPT_DTLB_MISS SPRN_SRR1
 	mfspr	r10, SPRN_DEAR		/* Get faulting address */
 
 	/* If we are faulting a kernel address, we have to use the
@@ -468,10 +474,16 @@ interrupt_base:
 	mtspr	SPRN_SPRG_WSCRATCH0, r10 /* Save some working registers */
 	mfspr	r10, SPRN_SPRG_THREAD
 	stw	r11, THREAD_NORMSAVE(0)(r10)
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+	mfspr	r11, SPRN_SRR1
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
 	stw	r12, THREAD_NORMSAVE(1)(r10)
 	stw	r13, THREAD_NORMSAVE(2)(r10)
 	mfcr	r13
 	stw	r13, THREAD_NORMSAVE(3)(r10)
+	DO_KVM	BOOKE_INTERRUPT_ITLB_MISS SPRN_SRR1
 	mfspr	r10, SPRN_SRR0		/* Get faulting address */
 
 	/* If we are faulting a kernel address, we have to use the
@@ -580,6 +592,17 @@ interrupt_base:
 	DEBUG_DEBUG_EXCEPTION
 	DEBUG_CRIT_EXCEPTION
 
+	GUEST_DOORBELL_EXCEPTION
+
+	CRITICAL_EXCEPTION(0, GUEST_DBELL_CRIT, CriticalGuestDoorbell, \
+			   unknown_exception)
+
+	/* Hypercall */
+	EXCEPTION(0, HV_SYSCALL, Hypercall, unknown_exception, EXC_XFER_EE)
+
+	/* Embedded Hypervisor Privilege */
+	EXCEPTION(0, HV_PRIV, Ehvpriv, unknown_exception, EXC_XFER_EE)
+
 /*
  * Local functions
  */
@@ -883,8 +906,31 @@ _GLOBAL(__setup_e500mc_ivors)
 	mtspr	SPRN_IVOR36,r3
 	li	r3,CriticalDoorbell@l
 	mtspr	SPRN_IVOR37,r3
+
+	/*
+	 * We only want to touch IVOR38-41 if we're running on hardware
+	 * that supports category E.HV.  The architectural way to determine
+	 * this is MMUCFG[LPIDSIZE].
+	 */
+	mfspr	r3, SPRN_MMUCFG
+	andis.	r3, r3, MMUCFG_LPIDSIZE@h
+	beq	no_hv
+	li	r3,GuestDoorbell@l
+	mtspr	SPRN_IVOR38,r3
+	li	r3,CriticalGuestDoorbell@l
+	mtspr	SPRN_IVOR39,r3
+	li	r3,Hypercall@l
+	mtspr	SPRN_IVOR40,r3
+	li	r3,Ehvpriv@l
+	mtspr	SPRN_IVOR41,r3
+skip_hv_ivors:
 	sync
 	blr
+no_hv:
+	lwz	r3, CPU_SPEC_FEATURES(r5)
+	rlwinm	r3, r3, 0, ~CPU_FTR_EMB_HV
+	stw	r3, CPU_SPEC_FEATURES(r5)
+	b	skip_hv_ivors
 
 /*
  * extern void giveup_altivec(struct task_struct *prev)
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 2c33cd3..58f6e68 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -109,7 +109,7 @@ config KVM_440
 
 config KVM_EXIT_TIMING
 	bool "Detailed exit timing"
-	depends on KVM_440 || KVM_E500
+	depends on KVM_440 || KVM_E500 || KVM_E500MC
 	---help---
 	  Calculate elapsed time for every exit/enter cycle. A per-vcpu
 	  report is available in debugfs kvm/vm#_vcpu#_timing.
@@ -132,6 +132,21 @@ config KVM_E500
 
 	  If unsure, say N.
 
+config KVM_E500MC
+	bool "KVM support for PowerPC E500MC/E5500 processors"
+	depends on EXPERIMENTAL && PPC_E500MC
+	select KVM
+	select KVM_MMIO
+	select KVM_BOOKE_HV
+	---help---
+	  Support running unmodified E500MC/E5500 (32-bit) guest kernels in
+	  virtual machines on E500MC/E5500 host processors.
+
+	  This module provides access to the hardware capabilities through
+	  a character device node named /dev/kvm.
+
+	  If unsure, say N.
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 3688aee..62febd7 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -38,6 +38,16 @@ kvm-e500-objs := \
 	e500_emulate.o
 kvm-objs-$(CONFIG_KVM_E500) := $(kvm-e500-objs)
 
+kvm-e500mc-objs := \
+	$(common-objs-y) \
+	booke.o \
+	booke_emulate.o \
+	bookehv_interrupts.o \
+	e500mc.o \
+	e500_tlb.o \
+	e500_emulate.o
+kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
+
 kvm-book3s_64-objs-$(CONFIG_KVM_BOOK3S_64_PR) := \
 	../../../virt/kvm/coalesced_mmio.o \
 	fpu.o \
@@ -89,6 +99,7 @@ kvm-objs := $(kvm-objs-m) $(kvm-objs-y)
 
 obj-$(CONFIG_KVM_440) += kvm.o
 obj-$(CONFIG_KVM_E500) += kvm.o
+obj-$(CONFIG_KVM_E500MC) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o
 obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o
 
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index ce3f163..3143085 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -2,7 +2,9 @@
  * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu <yu.liu@freescale.com>
+ *         Scott Wood <scottwood@freescale.com>
  *         Ashish Kalra <ashish.kalra@freescale.com>
+ *         Varun Sethi <varun.sethi@freescale.com>
  *
  * Description:
  * This file is based on arch/powerpc/kvm/44x_tlb.h and
@@ -100,6 +102,7 @@ static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu);
 }
 
+
 /* This geometry is the legacy default -- can be overridden by userspace */
 #define KVM_E500_TLB0_WAY_SIZE		128
 #define KVM_E500_TLB0_WAY_NUM		2
@@ -250,10 +253,12 @@ static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 	if (!get_tlb_v(tlbe))
 		return 0;
 
+#ifndef CONFIG_KVM_BOOKE_HV
 	/* Does it match current guest AS? */
 	/* XXX what about IS != DS? */
 	if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS))
 		return 0;
+#endif
 
 	gpa = get_tlb_raddr(tlbe);
 	if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
@@ -274,7 +279,11 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
 			   struct kvm_book3e_206_tlb_entry *gtlbe);
 void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500);
 
-#ifdef CONFIG_KVM_E500
+#ifdef CONFIG_KVM_BOOKE_HV
+#define kvmppc_e500_get_tlb_stid(vcpu, gtlbe)       get_tlb_tid(gtlbe)
+#define get_tlbmiss_tid(vcpu)           get_cur_pid(vcpu)
+#define get_tlb_sts(gtlbe)              (gtlbe->mas1 & MAS1_TS)
+#else
 unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu,
 				      struct kvm_book3e_206_tlb_entry *gtlbe);
 
@@ -288,6 +297,6 @@ static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu)
 
 /* Force TS=1 for all guest mappings. */
 #define get_tlb_sts(gtlbe)              (MAS1_TS)
-#endif /* CONFIG_KVM_E500 */
+#endif /* !BOOKE_HV */
 
 #endif /* KVM_E500_H */
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index af02c18..98b6c1c 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -85,6 +85,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	ulong spr_val = kvmppc_get_gpr(vcpu, rs);
 
 	switch (sprn) {
+#ifndef CONFIG_KVM_BOOKE_HV
 	case SPRN_PID:
 		kvmppc_set_pid(vcpu, spr_val);
 		break;
@@ -114,6 +115,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		vcpu->arch.shared->mas7_3 &= (u64)0xffffffff;
 		vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32;
 		break;
+#endif
 	case SPRN_L1CSR0:
 		vcpu_e500->l1csr0 = spr_val;
 		vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC);
@@ -143,7 +145,14 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 	case SPRN_IVOR35:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val;
 		break;
-
+#ifdef CONFIG_KVM_BOOKE_HV
+	case SPRN_IVOR36:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] = spr_val;
+		break;
+	case SPRN_IVOR37:
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] = spr_val;
+		break;
+#endif
 	default:
 		emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, rs);
 	}
@@ -155,9 +164,11 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 {
 	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
 	int emulated = EMULATE_DONE;
-	unsigned long val;
 
 	switch (sprn) {
+#ifndef CONFIG_KVM_BOOKE_HV
+		unsigned long val;
+
 	case SPRN_PID:
 		kvmppc_set_gpr(vcpu, rt, vcpu_e500->pid[0]); break;
 	case SPRN_PID1:
@@ -182,6 +193,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		val = vcpu->arch.shared->mas7_3 >> 32;
 		kvmppc_set_gpr(vcpu, rt, val);
 		break;
+#endif
 	case SPRN_TLB0CFG:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.tlbcfg[0]); break;
 	case SPRN_TLB1CFG:
@@ -216,6 +228,14 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 	case SPRN_IVOR35:
 		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]);
 		break;
+#ifdef CONFIG_KVM_BOOKE_HV
+	case SPRN_IVOR36:
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]);
+		break;
+	case SPRN_IVOR37:
+		kvmppc_set_gpr(vcpu, rt, vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]);
+		break;
+#endif
 	default:
 		emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, rt);
 	}
diff --git a/arch/powerpc/kvm/e500_tlb.c b/arch/powerpc/kvm/e500_tlb.c
index 6eb5d65..e232bb4 100644
--- a/arch/powerpc/kvm/e500_tlb.c
+++ b/arch/powerpc/kvm/e500_tlb.c
@@ -2,7 +2,9 @@
  * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
  *
  * Author: Yu Liu, yu.liu@freescale.com
+ *         Scott Wood, scottwood@freescale.com
  *         Ashish Kalra, ashish.kalra@freescale.com
+ *         Varun Sethi, varun.sethi@freescale.com
  *
  * Description:
  * This file is based on arch/powerpc/kvm/44x_tlb.c,
@@ -64,6 +66,7 @@ static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
 	/* Mask off reserved bits. */
 	mas3 &= MAS3_ATTRIB_MASK;
 
+#ifndef CONFIG_KVM_BOOKE_HV
 	if (!usermode) {
 		/* Guest is in supervisor mode,
 		 * so we need to translate guest
@@ -71,8 +74,9 @@ static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode)
 		mas3 &= ~E500_TLB_USER_PERM_MASK;
 		mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1;
 	}
-
-	return mas3 | E500_TLB_SUPER_PERM_MASK;
+	mas3 |= E500_TLB_SUPER_PERM_MASK;
+#endif
+	return mas3;
 }
 
 static inline u32 e500_shadow_mas2_attrib(u32 mas2, int usermode)
@@ -98,7 +102,16 @@ static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe,
 	mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2);
 	mtspr(SPRN_MAS3, (u32)stlbe->mas7_3);
 	mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32));
+#ifdef CONFIG_KVM_BOOKE_HV
+	mtspr(SPRN_MAS8, stlbe->mas8);
+#endif
 	asm volatile("isync; tlbwe" : : : "memory");
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	/* Must clear mas8 for other host tlbwe's */
+	mtspr(SPRN_MAS8, 0);
+	isync();
+#endif
 	local_irq_restore(flags);
 
 	trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1,
@@ -384,6 +397,10 @@ static inline void kvmppc_e500_setup_stlbe(
 		      e500_shadow_mas2_attrib(gtlbe->mas2, pr);
 	stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) |
 			e500_shadow_mas3_attrib(gtlbe->mas7_3, pr);
+
+#ifdef CONFIG_KVM_BOOKE_HV
+	stlbe->mas8 = MAS8_TGS | vcpu->kvm->arch.lpid;
+#endif
 }
 
 static inline void kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
new file mode 100644
index 0000000..fe6c1de
--- /dev/null
+++ b/arch/powerpc/kvm/e500mc.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (C) 2010 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Varun Sethi, <varun.sethi@freescale.com>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/e500.c,
+ * by Yu Liu <yu.liu@freescale.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/export.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/dbell.h>
+
+#include "booke.h"
+#include "e500.h"
+
+void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type)
+{
+	enum ppc_dbell dbell_type;
+	unsigned long tag;
+
+	switch (type) {
+	case INT_CLASS_NONCRIT:
+		dbell_type = PPC_G_DBELL;
+		break;
+	case INT_CLASS_CRIT:
+		dbell_type = PPC_G_DBELL_CRIT;
+		break;
+	case INT_CLASS_MC:
+		dbell_type = PPC_G_DBELL_MC;
+		break;
+	default:
+		WARN_ONCE(1, "%s: unknown int type %d\n", __func__, type);
+		return;
+	}
+
+
+	tag = PPC_DBELL_LPID(vcpu->kvm->arch.lpid) | vcpu->vcpu_id;
+	mb();
+	ppc_msgsnd(dbell_type, 0, tag);
+}
+
+/* gtlbe must not be mapped by more than one host tlb entry */
+void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
+			   struct kvm_book3e_206_tlb_entry *gtlbe)
+{
+	unsigned int tid, ts;
+	u32 val, eaddr, lpid;
+	unsigned long flags;
+
+	ts = get_tlb_ts(gtlbe);
+	tid = get_tlb_tid(gtlbe);
+	lpid = vcpu_e500->vcpu.kvm->arch.lpid;
+
+	/* We search the host TLB to invalidate its shadow TLB entry */
+	val = (tid << 16) | ts;
+	eaddr = get_tlb_eaddr(gtlbe);
+
+	local_irq_save(flags);
+
+	mtspr(SPRN_MAS6, val);
+	mtspr(SPRN_MAS5, MAS5_SGS | lpid);
+
+	asm volatile("tlbsx 0, %[eaddr]\n" : : [eaddr] "r" (eaddr));
+	val = mfspr(SPRN_MAS1);
+	if (val & MAS1_VALID) {
+		mtspr(SPRN_MAS1, val & ~MAS1_VALID);
+		asm volatile("tlbwe");
+	}
+	mtspr(SPRN_MAS5, 0);
+	/* NOTE: tlbsx also updates mas8, so clear it for host tlbwe */
+	mtspr(SPRN_MAS8, 0);
+	isync();
+
+	local_irq_restore(flags);
+}
+
+void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	mtspr(SPRN_MAS5, MAS5_SGS | vcpu_e500->vcpu.kvm->arch.lpid);
+	asm volatile("tlbilxlpid");
+	mtspr(SPRN_MAS5, 0);
+	local_irq_restore(flags);
+}
+
+void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid)
+{
+	vcpu->arch.pid = pid;
+}
+
+void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr)
+{
+}
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	kvmppc_booke_vcpu_load(vcpu, cpu);
+
+	mtspr(SPRN_LPID, vcpu->kvm->arch.lpid);
+	mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr);
+	mtspr(SPRN_GPIR, vcpu->vcpu_id);
+	mtspr(SPRN_MSRP, vcpu->arch.shadow_msrp);
+	mtspr(SPRN_EPLC, vcpu->arch.eplc);
+	mtspr(SPRN_EPSC, vcpu->arch.epsc);
+
+	mtspr(SPRN_GIVPR, vcpu->arch.ivpr);
+	mtspr(SPRN_GIVOR2, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]);
+	mtspr(SPRN_GIVOR8, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]);
+	mtspr(SPRN_GSPRG0, (unsigned long)vcpu->arch.shared->sprg0);
+	mtspr(SPRN_GSPRG1, (unsigned long)vcpu->arch.shared->sprg1);
+	mtspr(SPRN_GSPRG2, (unsigned long)vcpu->arch.shared->sprg2);
+	mtspr(SPRN_GSPRG3, (unsigned long)vcpu->arch.shared->sprg3);
+
+	mtspr(SPRN_GSRR0, vcpu->arch.shared->srr0);
+	mtspr(SPRN_GSRR1, vcpu->arch.shared->srr1);
+
+	mtspr(SPRN_GEPR, vcpu->arch.epr);
+	mtspr(SPRN_GDEAR, vcpu->arch.shared->dar);
+	mtspr(SPRN_GESR, vcpu->arch.shared->esr);
+
+	if (vcpu->arch.oldpir != mfspr(SPRN_PIR))
+		kvmppc_e500_tlbil_all(vcpu_e500);
+
+	kvmppc_load_guest_fp(vcpu);
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.eplc = mfspr(SPRN_EPLC);
+	vcpu->arch.epsc = mfspr(SPRN_EPSC);
+
+	vcpu->arch.shared->sprg0 = mfspr(SPRN_GSPRG0);
+	vcpu->arch.shared->sprg1 = mfspr(SPRN_GSPRG1);
+	vcpu->arch.shared->sprg2 = mfspr(SPRN_GSPRG2);
+	vcpu->arch.shared->sprg3 = mfspr(SPRN_GSPRG3);
+
+	vcpu->arch.shared->srr0 = mfspr(SPRN_GSRR0);
+	vcpu->arch.shared->srr1 = mfspr(SPRN_GSRR1);
+
+	vcpu->arch.epr = mfspr(SPRN_GEPR);
+	vcpu->arch.shared->dar = mfspr(SPRN_GDEAR);
+	vcpu->arch.shared->esr = mfspr(SPRN_GESR);
+
+	vcpu->arch.oldpir = mfspr(SPRN_PIR);
+
+	kvmppc_booke_vcpu_put(vcpu);
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+	int r;
+
+	if (strcmp(cur_cpu_spec->cpu_name, "e500mc") == 0)
+		r = 0;
+	else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0)
+		r = 0;
+	else
+		r = -ENOTSUPP;
+
+	return r;
+}
+
+int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \
+				 SPRN_EPCR_DUVD;
+	vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_DEP | MSRP_PMMP;
+	vcpu->arch.eplc = EPC_EGS | (vcpu->kvm->arch.lpid << EPC_ELPID_SHIFT);
+	vcpu->arch.epsc = vcpu->arch.eplc;
+
+	vcpu->arch.pvr = mfspr(SPRN_PVR);
+	vcpu_e500->svr = mfspr(SPRN_SVR);
+
+	vcpu->arch.cpu_type = KVM_CPU_E500MC;
+
+	return 0;
+}
+
+void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_PM |
+			       KVM_SREGS_E_PC;
+	sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL;
+
+	sregs->u.e.impl.fsl.features = 0;
+	sregs->u.e.impl.fsl.svr = vcpu_e500->svr;
+	sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0;
+	sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar;
+
+	kvmppc_get_sregs_e500_tlb(vcpu, sregs);
+
+	sregs->u.e.ivor_high[3] =
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR];
+	sregs->u.e.ivor_high[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL];
+	sregs->u.e.ivor_high[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT];
+
+	kvmppc_get_sregs_ivor(vcpu, sregs);
+}
+
+int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int ret;
+
+	if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
+		vcpu_e500->svr = sregs->u.e.impl.fsl.svr;
+		vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0;
+		vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar;
+	}
+
+	ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs);
+	if (ret < 0)
+		return ret;
+
+	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
+		return 0;
+
+	if (sregs->u.e.features & KVM_SREGS_E_PM) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] =
+			sregs->u.e.ivor_high[3];
+	}
+
+	if (sregs->u.e.features & KVM_SREGS_E_PC) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] =
+			sregs->u.e.ivor_high[4];
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] =
+			sregs->u.e.ivor_high[5];
+	}
+
+	return kvmppc_set_sregs_ivor(vcpu, sregs);
+}
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500;
+	struct kvm_vcpu *vcpu;
+	int err;
+
+	vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	if (!vcpu_e500) {
+		err = -ENOMEM;
+		goto out;
+	}
+	vcpu = &vcpu_e500->vcpu;
+
+	/* Invalid PIR value -- this LPID dosn't have valid state on any cpu */
+	vcpu->arch.oldpir = 0xffffffff;
+
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	err = kvmppc_e500_tlb_init(vcpu_e500);
+	if (err)
+		goto uninit_vcpu;
+
+	vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+	if (!vcpu->arch.shared)
+		goto uninit_tlb;
+
+	return vcpu;
+
+uninit_tlb:
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+uninit_vcpu:
+	kvm_vcpu_uninit(vcpu);
+
+free_vcpu:
+	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
+out:
+	return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	free_page((unsigned long)vcpu->arch.shared);
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+	kvm_vcpu_uninit(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
+}
+
+int kvmppc_core_init_vm(struct kvm *kvm)
+{
+	int lpid;
+
+	lpid = kvmppc_alloc_lpid();
+	if (lpid < 0)
+		return lpid;
+
+	kvm->arch.lpid = lpid;
+	return 0;
+}
+
+void kvmppc_core_destroy_vm(struct kvm *kvm)
+{
+	kvmppc_free_lpid(kvm->arch.lpid);
+}
+
+static int __init kvmppc_e500mc_init(void)
+{
+	int r;
+
+	r = kvmppc_booke_init();
+	if (r)
+		return r;
+
+	kvmppc_init_lpid(64);
+	kvmppc_claim_lpid(0); /* host */
+
+	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE);
+}
+
+static void __exit kvmppc_e500mc_exit(void)
+{
+	kvmppc_booke_exit();
+}
+
+module_init(kvmppc_e500mc_init);
+module_exit(kvmppc_e500mc_exit);
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index a5c5ceb..58a084f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -230,7 +230,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PPC_PAIRED_SINGLES:
 	case KVM_CAP_PPC_OSI:
 	case KVM_CAP_PPC_GET_PVINFO:
-#ifdef CONFIG_KVM_E500
+#if defined(CONFIG_KVM_E500) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB:
 #endif
 		r = 1;
@@ -638,7 +638,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
 		r = 0;
 		vcpu->arch.papr_enabled = true;
 		break;
-#ifdef CONFIG_KVM_E500
+#if defined(CONFIG_KVM_E500) || defined(CONFIG_KVM_E500MC)
 	case KVM_CAP_SW_TLB: {
 		struct kvm_config_tlb cfg;
 		void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0];
@@ -715,7 +715,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 
-#ifdef CONFIG_KVM_E500
+#if defined(CONFIG_KVM_E500) || defined(CONFIG_KVM_E500MC)
 	case KVM_DIRTY_TLB: {
 		struct kvm_dirty_tlb dirty;
 		r = -EFAULT;
-- 
1.6.0.2

^ permalink raw reply related

* [PATCH 13/37] KVM: PPC: booke: category E.HV (GS-mode) support
From: Alexander Graf @ 2012-02-24 14:26 UTC (permalink / raw)
  To: kvm-ppc; +Cc: Scott Wood, linuxppc-dev, kvm
In-Reply-To: <1330093591-19523-1-git-send-email-agraf@suse.de>

From: Scott Wood <scottwood@freescale.com>

Chips such as e500mc that implement category E.HV in Power ISA 2.06
provide hardware virtualization features, including a new MSR mode for
guest state.  The guest OS can perform many operations without trapping
into the hypervisor, including transitions to and from guest userspace.

Since we can use SRR1[GS] to reliably tell whether an exception came from
guest state, instead of messing around with IVPR, we use DO_KVM similarly
to book3s.

Current issues include:
 - Machine checks from guest state are not routed to the host handler.
 - The guest can cause a host oops by executing an emulated instruction
   in a page that lacks read permission.  Existing e500/4xx support has
   the same problem.

Includes work by Ashish Kalra <Ashish.Kalra@freescale.com>,
Varun Sethi <Varun.Sethi@freescale.com>, and
Liu Yu <yu.liu@freescale.com>.

Signed-off-by: Scott Wood <scottwood@freescale.com>
[agraf: remove pt_regs usage]
Signed-off-by: Alexander Graf <agraf@suse.de>

---

v1 -> v2:

  - ESR -> GESR
---
 arch/powerpc/include/asm/dbell.h            |    1 +
 arch/powerpc/include/asm/kvm_asm.h          |    8 +
 arch/powerpc/include/asm/kvm_booke_hv_asm.h |   49 +++
 arch/powerpc/include/asm/kvm_host.h         |   19 +-
 arch/powerpc/include/asm/kvm_ppc.h          |    3 +
 arch/powerpc/include/asm/mmu-book3e.h       |    6 +
 arch/powerpc/include/asm/processor.h        |    3 +
 arch/powerpc/include/asm/reg.h              |    2 +
 arch/powerpc/include/asm/reg_booke.h        |   34 ++
 arch/powerpc/kernel/asm-offsets.c           |   15 +-
 arch/powerpc/kernel/head_booke.h            |   28 ++-
 arch/powerpc/kvm/Kconfig                    |    3 +
 arch/powerpc/kvm/booke.c                    |  309 ++++++++++++---
 arch/powerpc/kvm/booke.h                    |   24 +-
 arch/powerpc/kvm/booke_emulate.c            |   23 +-
 arch/powerpc/kvm/bookehv_interrupts.S       |  587 +++++++++++++++++++++++++++
 arch/powerpc/kvm/powerpc.c                  |    5 +
 arch/powerpc/kvm/timing.h                   |    6 +
 18 files changed, 1058 insertions(+), 67 deletions(-)
 create mode 100644 arch/powerpc/include/asm/kvm_booke_hv_asm.h
 create mode 100644 arch/powerpc/kvm/bookehv_interrupts.S

diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index efa74ac..d7365b0 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -19,6 +19,7 @@
 
 #define PPC_DBELL_MSG_BRDCAST	(0x04000000)
 #define PPC_DBELL_TYPE(x)	(((x) & 0xf) << (63-36))
+#define PPC_DBELL_LPID(x)	((x) << (63 - 49))
 enum ppc_dbell {
 	PPC_DBELL = 0,		/* doorbell */
 	PPC_DBELL_CRIT = 1,	/* critical doorbell */
diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
index 7b1f0e0..0978152 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -48,6 +48,14 @@
 #define BOOKE_INTERRUPT_SPE_FP_DATA 33
 #define BOOKE_INTERRUPT_SPE_FP_ROUND 34
 #define BOOKE_INTERRUPT_PERFORMANCE_MONITOR 35
+#define BOOKE_INTERRUPT_DOORBELL 36
+#define BOOKE_INTERRUPT_DOORBELL_CRITICAL 37
+
+/* booke_hv */
+#define BOOKE_INTERRUPT_GUEST_DBELL 38
+#define BOOKE_INTERRUPT_GUEST_DBELL_CRIT 39
+#define BOOKE_INTERRUPT_HV_SYSCALL 40
+#define BOOKE_INTERRUPT_HV_PRIV 41
 
 /* book3s */
 
diff --git a/arch/powerpc/include/asm/kvm_booke_hv_asm.h b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
new file mode 100644
index 0000000..30a600f
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_booke_hv_asm.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright 2010-2011 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef ASM_KVM_BOOKE_HV_ASM_H
+#define ASM_KVM_BOOKE_HV_ASM_H
+
+#ifdef __ASSEMBLY__
+
+/*
+ * All exceptions from guest state must go through KVM
+ * (except for those which are delivered directly to the guest) --
+ * there are no exceptions for which we fall through directly to
+ * the normal host handler.
+ *
+ * Expected inputs (normal exceptions):
+ *   SCRATCH0 = saved r10
+ *   r10 = thread struct
+ *   r11 = appropriate SRR1 variant (currently used as scratch)
+ *   r13 = saved CR
+ *   *(r10 + THREAD_NORMSAVE(0)) = saved r11
+ *   *(r10 + THREAD_NORMSAVE(2)) = saved r13
+ *
+ * Expected inputs (crit/mcheck/debug exceptions):
+ *   appropriate SCRATCH = saved r8
+ *   r8 = exception level stack frame
+ *   r9 = *(r8 + _CCR) = saved CR
+ *   r11 = appropriate SRR1 variant (currently used as scratch)
+ *   *(r8 + GPR9) = saved r9
+ *   *(r8 + GPR10) = saved r10 (r10 not yet clobbered)
+ *   *(r8 + GPR11) = saved r11
+ */
+.macro DO_KVM intno srr1
+#ifdef CONFIG_KVM_BOOKE_HV
+BEGIN_FTR_SECTION
+	mtocrf	0x80, r11	/* check MSR[GS] without clobbering reg */
+	bf	3, kvmppc_resume_\intno\()_\srr1
+	b	kvmppc_handler_\intno\()_\srr1
+kvmppc_resume_\intno\()_\srr1:
+END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV)
+#endif
+.endm
+
+#endif /*__ASSEMBLY__ */
+#endif /* ASM_KVM_BOOKE_HV_ASM_H */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 47612cc..ed95f53 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -106,6 +106,8 @@ struct kvm_vcpu_stat {
 	u32 dec_exits;
 	u32 ext_intr_exits;
 	u32 halt_wakeup;
+	u32 dbell_exits;
+	u32 gdbell_exits;
 #ifdef CONFIG_PPC_BOOK3S
 	u32 pf_storage;
 	u32 pf_instruc;
@@ -140,6 +142,7 @@ enum kvm_exit_types {
 	EMULATED_TLBSX_EXITS,
 	EMULATED_TLBWE_EXITS,
 	EMULATED_RFI_EXITS,
+	EMULATED_RFCI_EXITS,
 	DEC_EXITS,
 	EXT_INTR_EXITS,
 	HALT_WAKEUP,
@@ -147,6 +150,8 @@ enum kvm_exit_types {
 	FP_UNAVAIL,
 	DEBUG_EXITS,
 	TIMEINGUEST,
+	DBELL_EXITS,
+	GDBELL_EXITS,
 	__NUMBER_OF_KVM_EXIT_TYPES
 };
 
@@ -217,10 +222,10 @@ struct kvm_arch_memory_slot {
 };
 
 struct kvm_arch {
+	unsigned int lpid;
 #ifdef CONFIG_KVM_BOOK3S_64_HV
 	unsigned long hpt_virt;
 	struct revmap_entry *revmap;
-	unsigned int lpid;
 	unsigned int host_lpid;
 	unsigned long host_lpcr;
 	unsigned long sdr1;
@@ -345,6 +350,17 @@ struct kvm_vcpu_arch {
 	u64 vsr[64];
 #endif
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	u32 host_mas4;
+	u32 host_mas6;
+	u32 shadow_epcr;
+	u32 epcr;
+	u32 shadow_msrp;
+	u32 eplc;
+	u32 epsc;
+	u32 oldpir;
+#endif
+
 #ifdef CONFIG_PPC_BOOK3S
 	/* For Gekko paired singles */
 	u32 qpr[32];
@@ -428,6 +444,7 @@ struct kvm_vcpu_arch {
 	ulong queued_esr;
 	u32 tlbcfg[4];
 	u32 mmucfg;
+	u32 epr;
 #endif
 	gpa_t paddr_accessed;
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 731e920..e709975 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -139,6 +139,9 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
 extern void kvmppc_core_commit_memory_region(struct kvm *kvm,
 				struct kvm_userspace_memory_region *mem);
 
+extern int kvmppc_bookehv_init(void);
+extern void kvmppc_bookehv_exit(void);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index cdb5421..eeabcdb 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -104,6 +104,8 @@
 #define MAS4_TSIZED_MASK	0x00000f80	/* Default TSIZE */
 #define MAS4_TSIZED_SHIFT	7
 
+#define MAS5_SGS		0x80000000
+
 #define MAS6_SPID0		0x3FFF0000
 #define MAS6_SPID1		0x00007FFE
 #define MAS6_ISIZE(x)		MAS1_TSIZE(x)
@@ -118,6 +120,10 @@
 
 #define MAS7_RPN		0xFFFFFFFF
 
+#define MAS8_TGS		0x80000000 /* Guest space */
+#define MAS8_VF			0x40000000 /* Virtualization Fault */
+#define MAS8_TLPID		0x000000ff
+
 /* Bit definitions for MMUCFG */
 #define MMUCFG_MAVN	0x00000003	/* MMU Architecture Version Number */
 #define MMUCFG_MAVN_V1	0x00000000	/* v1.0 */
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index b585bff..f64262b 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -243,6 +243,9 @@ struct thread_struct {
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
 	void*		kvm_shadow_vcpu; /* KVM internal data */
 #endif /* CONFIG_KVM_BOOK3S_32_HANDLER */
+#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE)
+	struct kvm_vcpu	*kvm_vcpu;
+#endif
 #ifdef CONFIG_PPC64
 	unsigned long	dscr;
 	int		dscr_inherit;
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 35c9309..67512f3 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -257,7 +257,9 @@
 #define   LPCR_LPES_SH	2
 #define   LPCR_RMI     0x00000002      /* real mode is cache inhibit */
 #define   LPCR_HDICE   0x00000001      /* Hyp Decr enable (HV,PR,EE) */
+#ifndef SPRN_LPID
 #define SPRN_LPID	0x13F	/* Logical Partition Identifier */
+#endif
 #define   LPID_RSVD	0x3ff		/* Reserved LPID for partn switching */
 #define	SPRN_HMER	0x150	/* Hardware m? error recovery */
 #define	SPRN_HMEER	0x151	/* Hardware m? enable error recovery */
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 500fe1d..8960afc 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -56,17 +56,29 @@
 #define SPRN_SPRG7W	0x117	/* Special Purpose Register General 7 Write */
 #define SPRN_EPCR	0x133	/* Embedded Processor Control Register */
 #define SPRN_DBCR2	0x136	/* Debug Control Register 2 */
+#define SPRN_MSRP	0x137	/* MSR Protect Register */
 #define SPRN_IAC3	0x13A	/* Instruction Address Compare 3 */
 #define SPRN_IAC4	0x13B	/* Instruction Address Compare 4 */
 #define SPRN_DVC1	0x13E	/* Data Value Compare Register 1 */
 #define SPRN_DVC2	0x13F	/* Data Value Compare Register 2 */
+#define SPRN_LPID	0x152	/* Logical Partition ID */
 #define SPRN_MAS8	0x155	/* MMU Assist Register 8 */
 #define SPRN_TLB0PS	0x158	/* TLB 0 Page Size Register */
 #define SPRN_MAS5_MAS6	0x15c	/* MMU Assist Register 5 || 6 */
 #define SPRN_MAS8_MAS1	0x15d	/* MMU Assist Register 8 || 1 */
 #define SPRN_EPTCFG	0x15e	/* Embedded Page Table Config */
+#define SPRN_GSPRG0	0x170	/* Guest SPRG0 */
+#define SPRN_GSPRG1	0x171	/* Guest SPRG1 */
+#define SPRN_GSPRG2	0x172	/* Guest SPRG2 */
+#define SPRN_GSPRG3	0x173	/* Guest SPRG3 */
 #define SPRN_MAS7_MAS3	0x174	/* MMU Assist Register 7 || 3 */
 #define SPRN_MAS0_MAS1	0x175	/* MMU Assist Register 0 || 1 */
+#define SPRN_GSRR0	0x17A	/* Guest SRR0 */
+#define SPRN_GSRR1	0x17B	/* Guest SRR1 */
+#define SPRN_GEPR	0x17C	/* Guest EPR */
+#define SPRN_GDEAR	0x17D	/* Guest DEAR */
+#define SPRN_GPIR	0x17E	/* Guest PIR */
+#define SPRN_GESR	0x17F	/* Guest Exception Syndrome Register */
 #define SPRN_IVOR0	0x190	/* Interrupt Vector Offset Register 0 */
 #define SPRN_IVOR1	0x191	/* Interrupt Vector Offset Register 1 */
 #define SPRN_IVOR2	0x192	/* Interrupt Vector Offset Register 2 */
@@ -87,6 +99,13 @@
 #define SPRN_IVOR39	0x1B1	/* Interrupt Vector Offset Register 39 */
 #define SPRN_IVOR40	0x1B2	/* Interrupt Vector Offset Register 40 */
 #define SPRN_IVOR41	0x1B3	/* Interrupt Vector Offset Register 41 */
+#define SPRN_GIVOR2	0x1B8	/* Guest IVOR2 */
+#define SPRN_GIVOR3	0x1B9	/* Guest IVOR3 */
+#define SPRN_GIVOR4	0x1BA	/* Guest IVOR4 */
+#define SPRN_GIVOR8	0x1BB	/* Guest IVOR8 */
+#define SPRN_GIVOR13	0x1BC	/* Guest IVOR13 */
+#define SPRN_GIVOR14	0x1BD	/* Guest IVOR14 */
+#define SPRN_GIVPR	0x1BF	/* Guest IVPR */
 #define SPRN_SPEFSCR	0x200	/* SPE & Embedded FP Status & Control */
 #define SPRN_BBEAR	0x201	/* Branch Buffer Entry Address Register */
 #define SPRN_BBTAR	0x202	/* Branch Buffer Target Address Register */
@@ -239,6 +258,10 @@
 #define MCSR_LDG	0x00002000UL /* Guarded Load */
 #define MCSR_TLBSYNC	0x00000002UL /* Multiple tlbsyncs detected */
 #define MCSR_BSL2_ERR	0x00000001UL /* Backside L2 cache error */
+
+#define MSRP_UCLEP	0x04000000 /* Protect MSR[UCLE] */
+#define MSRP_DEP	0x00000200 /* Protect MSR[DE] */
+#define MSRP_PMMP	0x00000004 /* Protect MSR[PMM] */
 #endif
 
 #ifdef CONFIG_E200
@@ -593,6 +616,17 @@
 #define SPRN_EPCR_DMIUH		0x00400000	/* Disable MAS Interrupt updates
 						 * for hypervisor */
 
+/* Bit definitions for EPLC/EPSC */
+#define EPC_EPR		0x80000000 /* 1 = user, 0 = kernel */
+#define EPC_EPR_SHIFT	31
+#define EPC_EAS		0x40000000 /* Address Space */
+#define EPC_EAS_SHIFT	30
+#define EPC_EGS		0x20000000 /* 1 = guest, 0 = hypervisor */
+#define EPC_EGS_SHIFT	29
+#define EPC_ELPID	0x00ff0000
+#define EPC_ELPID_SHIFT	16
+#define EPC_EPID	0x00003fff
+#define EPC_EPID_SHIFT	0
 
 /*
  * The IBM-403 is an even more odd special case, as it is much
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 8e0db0b..a24f2c0 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -119,6 +119,9 @@ int main(void)
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
 	DEFINE(THREAD_KVM_SVCPU, offsetof(struct thread_struct, kvm_shadow_vcpu));
 #endif
+#ifdef CONFIG_KVM_BOOKE_HV
+	DEFINE(THREAD_KVM_VCPU, offsetof(struct thread_struct, kvm_vcpu));
+#endif
 
 	DEFINE(TI_FLAGS, offsetof(struct thread_info, flags));
 	DEFINE(TI_LOCAL_FLAGS, offsetof(struct thread_info, local_flags));
@@ -401,6 +404,7 @@ int main(void)
 #ifdef CONFIG_KVM
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
+	DEFINE(VCPU_GUEST_PID, offsetof(struct kvm_vcpu, arch.pid));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_VRSAVE, offsetof(struct kvm_vcpu, arch.vrsave));
 	DEFINE(VCPU_FPRS, offsetof(struct kvm_vcpu, arch.fpr));
@@ -443,9 +447,11 @@ int main(void)
 	DEFINE(VCPU_SHARED_MAS4, offsetof(struct kvm_vcpu_arch_shared, mas4));
 	DEFINE(VCPU_SHARED_MAS6, offsetof(struct kvm_vcpu_arch_shared, mas6));
 
+	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
+	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
+
 	/* book3s */
 #ifdef CONFIG_KVM_BOOK3S_64_HV
-	DEFINE(KVM_LPID, offsetof(struct kvm, arch.lpid));
 	DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
 	DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
 	DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
@@ -460,7 +466,6 @@ int main(void)
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
-	DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm));
 	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
 	DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
 	DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
@@ -611,6 +616,12 @@ int main(void)
 	DEFINE(VCPU_HOST_SPEFSCR, offsetof(struct kvm_vcpu, arch.host_spefscr));
 #endif
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	DEFINE(VCPU_HOST_MAS4, offsetof(struct kvm_vcpu, arch.host_mas4));
+	DEFINE(VCPU_HOST_MAS6, offsetof(struct kvm_vcpu, arch.host_mas6));
+	DEFINE(VCPU_EPLC, offsetof(struct kvm_vcpu, arch.eplc));
+#endif
+
 #ifdef CONFIG_KVM_EXIT_TIMING
 	DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu,
 						arch.timing_exit.tv32.tbu));
diff --git a/arch/powerpc/kernel/head_booke.h b/arch/powerpc/kernel/head_booke.h
index 06ab353..b87c335 100644
--- a/arch/powerpc/kernel/head_booke.h
+++ b/arch/powerpc/kernel/head_booke.h
@@ -3,6 +3,7 @@
 
 #include <asm/ptrace.h>	/* for STACK_FRAME_REGS_MARKER */
 #include <asm/kvm_asm.h>
+#include <asm/kvm_booke_hv_asm.h>
 
 /*
  * Macros used for common Book-e exception handling
@@ -36,8 +37,9 @@
 	stw	r11, THREAD_NORMSAVE(0)(r10);				     \
 	stw	r13, THREAD_NORMSAVE(2)(r10);				     \
 	mfcr	r13;			/* save CR in r13 for now	   */\
-	mfspr	r11,SPRN_SRR1;		/* check whether user or kernel    */\
-	andi.	r11,r11,MSR_PR;						     \
+	mfspr	r11, SPRN_SRR1;		                                     \
+	DO_KVM	BOOKE_INTERRUPT_##intno SPRN_SRR1;			     \
+	andi.	r11, r11, MSR_PR;	/* check whether user or kernel    */\
 	mr	r11, r1;						     \
 	beq	1f;							     \
 	/* if from user, start at top of this thread's kernel stack */       \
@@ -123,8 +125,9 @@
 	stw	r10,GPR10(r8);						     \
 	stw	r11,GPR11(r8);						     \
 	stw	r9,_CCR(r8);		/* save CR on stack		   */\
-	mfspr	r10,exc_level_srr1;	/* check whether user or kernel    */\
-	andi.	r10,r10,MSR_PR;						     \
+	mfspr	r11,exc_level_srr1;	/* check whether user or kernel    */\
+	DO_KVM	BOOKE_INTERRUPT_##intno exc_level_srr1;		             \
+	andi.	r11,r11,MSR_PR;						     \
 	mfspr	r11,SPRN_SPRG_THREAD;	/* if from user, start at top of   */\
 	lwz	r11,THREAD_INFO-THREAD(r11); /* this thread's kernel stack */\
 	addi	r11,r11,EXC_LVL_FRAME_OVERHEAD;	/* allocate stack frame    */\
@@ -173,6 +176,23 @@
 			SPRN_MCSRR0, SPRN_MCSRR1)
 
 /*
+ * Guest Doorbell -- this is a bit odd in that uses GSRR0/1 despite
+ * being delivered to the host.  This exception can only happen
+ * inside a KVM guest -- so we just handle up to the DO_KVM rather
+ * than try to fit this into one of the existing prolog macros.
+ */
+#define GUEST_DOORBELL_EXCEPTION \
+	START_EXCEPTION(GuestDoorbell);					     \
+	mtspr	SPRN_SPRG_WSCRATCH0, r10;	/* save one register */	     \
+	mfspr	r10, SPRN_SPRG_THREAD;					     \
+	stw	r11, THREAD_NORMSAVE(0)(r10);				     \
+	mfspr	r11, SPRN_SRR1;		                                     \
+	stw	r13, THREAD_NORMSAVE(2)(r10);				     \
+	mfcr	r13;			/* save CR in r13 for now	   */\
+	DO_KVM	BOOKE_INTERRUPT_GUEST_DBELL SPRN_GSRR1;			     \
+	trap
+
+/*
  * Exception vectors.
  */
 #define	START_EXCEPTION(label)						     \
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 8f64709..2c33cd3 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -90,6 +90,9 @@ config KVM_BOOK3S_64_PR
 	depends on KVM_BOOK3S_64 && !KVM_BOOK3S_64_HV
 	select KVM_BOOK3S_PR
 
+config KVM_BOOKE_HV
+	bool
+
 config KVM_440
 	bool "KVM support for PowerPC 440 processors"
 	depends on EXPERIMENTAL && 44x
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 2ee9bae..75dbaeb 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -17,6 +17,8 @@
  *
  * Authors: Hollis Blanchard <hollisb@us.ibm.com>
  *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ *          Scott Wood <scottwood@freescale.com>
+ *          Varun Sethi <varun.sethi@freescale.com>
  */
 
 #include <linux/errno.h>
@@ -30,9 +32,12 @@
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
-#include "timing.h"
 #include <asm/cacheflush.h>
+#include <asm/dbell.h>
+#include <asm/hw_irq.h>
+#include <asm/irq.h>
 
+#include "timing.h"
 #include "booke.h"
 
 unsigned long kvmppc_booke_handlers;
@@ -55,6 +60,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "dec",        VCPU_STAT(dec_exits) },
 	{ "ext_intr",   VCPU_STAT(ext_intr_exits) },
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
+	{ "doorbell", VCPU_STAT(dbell_exits) },
+	{ "guest doorbell", VCPU_STAT(gdbell_exits) },
 	{ NULL }
 };
 
@@ -121,6 +128,10 @@ void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 {
 	u32 old_msr = vcpu->arch.shared->msr;
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	new_msr |= MSR_GS;
+#endif
+
 	vcpu->arch.shared->msr = new_msr;
 
 	kvmppc_mmu_msr_notify(vcpu, old_msr);
@@ -195,6 +206,75 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
 	clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
 }
 
+static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+	mtspr(SPRN_GSRR0, srr0);
+	mtspr(SPRN_GSRR1, srr1);
+#else
+	vcpu->arch.shared->srr0 = srr0;
+	vcpu->arch.shared->srr1 = srr1;
+#endif
+}
+
+static void set_guest_csrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	vcpu->arch.csrr0 = srr0;
+	vcpu->arch.csrr1 = srr1;
+}
+
+static void set_guest_dsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) {
+		vcpu->arch.dsrr0 = srr0;
+		vcpu->arch.dsrr1 = srr1;
+	} else {
+		set_guest_csrr(vcpu, srr0, srr1);
+	}
+}
+
+static void set_guest_mcsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1)
+{
+	vcpu->arch.mcsrr0 = srr0;
+	vcpu->arch.mcsrr1 = srr1;
+}
+
+static unsigned long get_guest_dear(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+	return mfspr(SPRN_GDEAR);
+#else
+	return vcpu->arch.shared->dar;
+#endif
+}
+
+static void set_guest_dear(struct kvm_vcpu *vcpu, unsigned long dear)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+	mtspr(SPRN_GDEAR, dear);
+#else
+	vcpu->arch.shared->dar = dear;
+#endif
+}
+
+static unsigned long get_guest_esr(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+	return mfspr(SPRN_GESR);
+#else
+	return vcpu->arch.shared->esr;
+#endif
+}
+
+static void set_guest_esr(struct kvm_vcpu *vcpu, u32 esr)
+{
+#ifdef CONFIG_KVM_BOOKE_HV
+	mtspr(SPRN_GESR, esr);
+#else
+	vcpu->arch.shared->esr = esr;
+#endif
+}
+
 /* Deliver the interrupt of the corresponding priority, if possible. */
 static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
                                         unsigned int priority)
@@ -206,6 +286,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
 	bool crit;
 	bool keep_irq = false;
+	enum int_class int_class;
 
 	/* Truncate crit indicators in 32 bit mode */
 	if (!(vcpu->arch.shared->msr & MSR_SF)) {
@@ -241,16 +322,20 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	case BOOKE_IRQPRIO_AP_UNAVAIL:
 	case BOOKE_IRQPRIO_ALIGNMENT:
 		allowed = 1;
-		msr_mask = MSR_CE|MSR_ME|MSR_DE;
+		msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE;
+		int_class = INT_CLASS_NONCRIT;
 		break;
 	case BOOKE_IRQPRIO_CRITICAL:
-	case BOOKE_IRQPRIO_WATCHDOG:
 		allowed = vcpu->arch.shared->msr & MSR_CE;
-		msr_mask = MSR_ME;
+		allowed = allowed && !crit;
+		msr_mask = MSR_GS | MSR_ME;
+		int_class = INT_CLASS_CRIT;
 		break;
 	case BOOKE_IRQPRIO_MACHINE_CHECK:
 		allowed = vcpu->arch.shared->msr & MSR_ME;
-		msr_mask = 0;
+		allowed = allowed && !crit;
+		msr_mask = MSR_GS;
+		int_class = INT_CLASS_MC;
 		break;
 	case BOOKE_IRQPRIO_DECREMENTER:
 	case BOOKE_IRQPRIO_FIT:
@@ -259,28 +344,62 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
 	case BOOKE_IRQPRIO_EXTERNAL:
 		allowed = vcpu->arch.shared->msr & MSR_EE;
 		allowed = allowed && !crit;
-		msr_mask = MSR_CE|MSR_ME|MSR_DE;
+		msr_mask = MSR_GS | MSR_CE | MSR_ME | MSR_DE;
+		int_class = INT_CLASS_NONCRIT;
 		break;
 	case BOOKE_IRQPRIO_DEBUG:
 		allowed = vcpu->arch.shared->msr & MSR_DE;
-		msr_mask = MSR_ME;
+		allowed = allowed && !crit;
+		msr_mask = MSR_GS | MSR_ME;
+		int_class = INT_CLASS_CRIT;
 		break;
 	}
 
 	if (allowed) {
-		vcpu->arch.shared->srr0 = vcpu->arch.pc;
-		vcpu->arch.shared->srr1 = vcpu->arch.shared->msr;
+		switch (int_class) {
+		case INT_CLASS_NONCRIT:
+			set_guest_srr(vcpu, vcpu->arch.pc,
+				      vcpu->arch.shared->msr);
+			break;
+		case INT_CLASS_CRIT:
+			set_guest_csrr(vcpu, vcpu->arch.pc,
+				       vcpu->arch.shared->msr);
+			break;
+		case INT_CLASS_DBG:
+			set_guest_dsrr(vcpu, vcpu->arch.pc,
+				       vcpu->arch.shared->msr);
+			break;
+		case INT_CLASS_MC:
+			set_guest_mcsrr(vcpu, vcpu->arch.pc,
+					vcpu->arch.shared->msr);
+			break;
+		}
+
 		vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
 		if (update_esr == true)
-			vcpu->arch.shared->esr = vcpu->arch.queued_esr;
+			set_guest_esr(vcpu, vcpu->arch.queued_esr);
 		if (update_dear == true)
-			vcpu->arch.shared->dar = vcpu->arch.queued_dear;
+			set_guest_dear(vcpu, vcpu->arch.queued_dear);
 		kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask);
 
 		if (!keep_irq)
 			clear_bit(priority, &vcpu->arch.pending_exceptions);
 	}
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	/*
+	 * If an interrupt is pending but masked, raise a guest doorbell
+	 * so that we are notified when the guest enables the relevant
+	 * MSR bit.
+	 */
+	if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_EE)
+		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_NONCRIT);
+	if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_CE)
+		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_CRIT);
+	if (vcpu->arch.pending_exceptions & BOOKE_IRQPRIO_MACHINE_CHECK)
+		kvmppc_set_pending_interrupt(vcpu, INT_CLASS_MC);
+#endif
+
 	return allowed;
 }
 
@@ -344,6 +463,11 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		return -EINVAL;
 	}
 
+	if (!current->thread.kvm_vcpu) {
+		WARN(1, "no vcpu\n");
+		return -EPERM;
+	}
+
 	local_irq_disable();
 
 	kvmppc_core_prepare_to_enter(vcpu);
@@ -363,6 +487,38 @@ out:
 	return ret;
 }
 
+static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+	enum emulation_result er;
+
+	er = kvmppc_emulate_instruction(run, vcpu);
+	switch (er) {
+	case EMULATE_DONE:
+		/* don't overwrite subtypes, just account kvm_stats */
+		kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS);
+		/* Future optimization: only reload non-volatiles if
+		 * they were actually modified by emulation. */
+		return RESUME_GUEST_NV;
+
+	case EMULATE_DO_DCR:
+		run->exit_reason = KVM_EXIT_DCR;
+		return RESUME_HOST;
+
+	case EMULATE_FAIL:
+		/* XXX Deliver Program interrupt to guest. */
+		printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
+		       __func__, vcpu->arch.pc, vcpu->arch.last_inst);
+		/* For debugging, encode the failing instruction and
+		 * report it to userspace. */
+		run->hw.hardware_exit_reason = ~0ULL << 32;
+		run->hw.hardware_exit_reason |= vcpu->arch.last_inst;
+		return RESUME_HOST;
+
+	default:
+		BUG();
+	}
+}
+
 /**
  * kvmppc_handle_exit
  *
@@ -371,12 +527,30 @@ out:
 int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
                        unsigned int exit_nr)
 {
-	enum emulation_result er;
 	int r = RESUME_HOST;
 
 	/* update before a new last_exit_type is rewritten */
 	kvmppc_update_timing_stats(vcpu);
 
+	switch (exit_nr) {
+	case BOOKE_INTERRUPT_EXTERNAL:
+		do_IRQ(current->thread.regs);
+		break;
+
+	case BOOKE_INTERRUPT_DECREMENTER:
+		timer_interrupt(current->thread.regs);
+		break;
+
+#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_BOOK3E_64)
+	case BOOKE_INTERRUPT_DOORBELL:
+		doorbell_exception(current->thread.regs);
+		break;
+#endif
+	case BOOKE_INTERRUPT_MACHINE_CHECK:
+		/* FIXME */
+		break;
+	}
+
 	local_irq_enable();
 
 	run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -384,30 +558,56 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	switch (exit_nr) {
 	case BOOKE_INTERRUPT_MACHINE_CHECK:
-		printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR));
-		kvmppc_dump_vcpu(vcpu);
-		r = RESUME_HOST;
+		kvm_resched(vcpu);
+		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_EXTERNAL:
 		kvmppc_account_exit(vcpu, EXT_INTR_EXITS);
-		if (need_resched())
-			cond_resched();
+		kvm_resched(vcpu);
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_DECREMENTER:
-		/* Since we switched IVPR back to the host's value, the host
-		 * handled this interrupt the moment we enabled interrupts.
-		 * Now we just offer it a chance to reschedule the guest. */
 		kvmppc_account_exit(vcpu, DEC_EXITS);
-		if (need_resched())
-			cond_resched();
+		kvm_resched(vcpu);
 		r = RESUME_GUEST;
 		break;
 
+	case BOOKE_INTERRUPT_DOORBELL:
+		kvmppc_account_exit(vcpu, DBELL_EXITS);
+		kvm_resched(vcpu);
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_GUEST_DBELL_CRIT:
+		kvmppc_account_exit(vcpu, GDBELL_EXITS);
+
+		/*
+		 * We are here because there is a pending guest interrupt
+		 * which could not be delivered as MSR_CE or MSR_ME was not
+		 * set.  Once we break from here we will retry delivery.
+		 */
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_GUEST_DBELL:
+		kvmppc_account_exit(vcpu, GDBELL_EXITS);
+
+		/*
+		 * We are here because there is a pending guest interrupt
+		 * which could not be delivered as MSR_EE was not set.  Once
+		 * we break from here we will retry delivery.
+		 */
+		r = RESUME_GUEST;
+		break;
+
+	case BOOKE_INTERRUPT_HV_PRIV:
+		r = emulation_exit(run, vcpu);
+		break;
+
 	case BOOKE_INTERRUPT_PROGRAM:
-		if (vcpu->arch.shared->msr & MSR_PR) {
+		if (vcpu->arch.shared->msr & (MSR_PR | MSR_GS)) {
 			/* Program traps generated by user-level software must be handled
 			 * by the guest kernel. */
 			kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr);
@@ -416,32 +616,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			break;
 		}
 
-		er = kvmppc_emulate_instruction(run, vcpu);
-		switch (er) {
-		case EMULATE_DONE:
-			/* don't overwrite subtypes, just account kvm_stats */
-			kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS);
-			/* Future optimization: only reload non-volatiles if
-			 * they were actually modified by emulation. */
-			r = RESUME_GUEST_NV;
-			break;
-		case EMULATE_DO_DCR:
-			run->exit_reason = KVM_EXIT_DCR;
-			r = RESUME_HOST;
-			break;
-		case EMULATE_FAIL:
-			/* XXX Deliver Program interrupt to guest. */
-			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
-			       __func__, vcpu->arch.pc, vcpu->arch.last_inst);
-			/* For debugging, encode the failing instruction and
-			 * report it to userspace. */
-			run->hw.hardware_exit_reason = ~0ULL << 32;
-			run->hw.hardware_exit_reason |= vcpu->arch.last_inst;
-			r = RESUME_HOST;
-			break;
-		default:
-			BUG();
-		}
+		r = emulation_exit(run, vcpu);
 		break;
 
 	case BOOKE_INTERRUPT_FP_UNAVAIL:
@@ -506,6 +681,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		r = RESUME_GUEST;
 		break;
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	case BOOKE_INTERRUPT_HV_SYSCALL:
+		if (!(vcpu->arch.shared->msr & MSR_PR)) {
+			kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu));
+		} else {
+			/*
+			 * hcall from guest userspace -- send privileged
+			 * instruction program check.
+			 */
+			kvmppc_core_queue_program(vcpu, ESR_PPR);
+		}
+
+		r = RESUME_GUEST;
+		break;
+#else
 	case BOOKE_INTERRUPT_SYSCALL:
 		if (!(vcpu->arch.shared->msr & MSR_PR) &&
 		    (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) {
@@ -519,6 +709,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		kvmppc_account_exit(vcpu, SYSCALL_EXITS);
 		r = RESUME_GUEST;
 		break;
+#endif
 
 	case BOOKE_INTERRUPT_DTLB_MISS: {
 		unsigned long eaddr = vcpu->arch.fault_dear;
@@ -659,12 +850,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	int r;
 
 	vcpu->arch.pc = 0;
-	vcpu->arch.shared->msr = 0;
-	vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS;
 	vcpu->arch.shared->pir = vcpu->vcpu_id;
 	kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */
+	kvmppc_set_msr(vcpu, 0);
 
+#ifndef CONFIG_KVM_BOOKE_HV
+	vcpu->arch.shadow_msr = MSR_USER | MSR_DE | MSR_IS | MSR_DS;
 	vcpu->arch.shadow_pid = 1;
+	vcpu->arch.shared->msr = 0;
+#endif
 
 	/* Eye-catching numbers so we know if the guest takes an interrupt
 	 * before it's programmed its own IVPR/IVORs. */
@@ -745,8 +939,8 @@ static void get_sregs_base(struct kvm_vcpu *vcpu,
 	sregs->u.e.csrr0 = vcpu->arch.csrr0;
 	sregs->u.e.csrr1 = vcpu->arch.csrr1;
 	sregs->u.e.mcsr = vcpu->arch.mcsr;
-	sregs->u.e.esr = vcpu->arch.shared->esr;
-	sregs->u.e.dear = vcpu->arch.shared->dar;
+	sregs->u.e.esr = get_guest_esr(vcpu);
+	sregs->u.e.dear = get_guest_dear(vcpu);
 	sregs->u.e.tsr = vcpu->arch.tsr;
 	sregs->u.e.tcr = vcpu->arch.tcr;
 	sregs->u.e.dec = kvmppc_get_dec(vcpu, tb);
@@ -763,8 +957,8 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
 	vcpu->arch.csrr0 = sregs->u.e.csrr0;
 	vcpu->arch.csrr1 = sregs->u.e.csrr1;
 	vcpu->arch.mcsr = sregs->u.e.mcsr;
-	vcpu->arch.shared->esr = sregs->u.e.esr;
-	vcpu->arch.shared->dar = sregs->u.e.dear;
+	set_guest_esr(vcpu, sregs->u.e.esr);
+	set_guest_dear(vcpu, sregs->u.e.dear);
 	vcpu->arch.vrsave = sregs->u.e.vrsave;
 	kvmppc_set_tcr(vcpu, sregs->u.e.tcr);
 
@@ -961,14 +1155,17 @@ void kvmppc_decrementer_func(unsigned long data)
 
 void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	current->thread.kvm_vcpu = vcpu;
 }
 
 void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu)
 {
+	current->thread.kvm_vcpu = NULL;
 }
 
 int __init kvmppc_booke_init(void)
 {
+#ifndef CONFIG_KVM_BOOKE_HV
 	unsigned long ivor[16];
 	unsigned long max_ivor = 0;
 	int i;
@@ -1011,7 +1208,7 @@ int __init kvmppc_booke_init(void)
 	}
 	flush_icache_range(kvmppc_booke_handlers,
 	                   kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
-
+#endif /* !BOOKE_HV */
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 05d1d99..d53bcf2 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -48,7 +48,20 @@
 #define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19
 /* Internal pseudo-irqprio for level triggered externals */
 #define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20
-#define BOOKE_IRQPRIO_MAX 20
+#define BOOKE_IRQPRIO_DBELL 21
+#define BOOKE_IRQPRIO_DBELL_CRIT 22
+#define BOOKE_IRQPRIO_MAX 23
+
+#define BOOKE_IRQMASK_EE ((1 << BOOKE_IRQPRIO_EXTERNAL_LEVEL) | \
+			  (1 << BOOKE_IRQPRIO_PERFORMANCE_MONITOR) | \
+			  (1 << BOOKE_IRQPRIO_DBELL) | \
+			  (1 << BOOKE_IRQPRIO_DECREMENTER) | \
+			  (1 << BOOKE_IRQPRIO_FIT) | \
+			  (1 << BOOKE_IRQPRIO_EXTERNAL))
+
+#define BOOKE_IRQMASK_CE ((1 << BOOKE_IRQPRIO_DBELL_CRIT) | \
+			  (1 << BOOKE_IRQPRIO_WATCHDOG) | \
+			  (1 << BOOKE_IRQPRIO_CRITICAL))
 
 extern unsigned long kvmppc_booke_handlers;
 
@@ -74,4 +87,13 @@ void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu);
 void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu);
 
+enum int_class {
+	INT_CLASS_NONCRIT,
+	INT_CLASS_CRIT,
+	INT_CLASS_MC,
+	INT_CLASS_DBG,
+};
+
+void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type);
+
 #endif /* __KVM_BOOKE_H__ */
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index 3e652da..904412b 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -99,6 +99,12 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return emulated;
 }
 
+/*
+ * NOTE: some of these registers are not emulated on BOOKE_HV (GS-mode).
+ * Their backing store is in real registers, and these functions
+ * will return the wrong result if called for them in another context
+ * (such as debugging).
+ */
 int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 {
 	int emulated = EMULATE_DONE;
@@ -122,9 +128,11 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		kvmppc_set_tcr(vcpu, spr_val);
 		break;
 
-	/* Note: SPRG4-7 are user-readable. These values are
-	 * loaded into the real SPRGs when resuming the
-	 * guest. */
+	/*
+	 * Note: SPRG4-7 are user-readable.
+	 * These values are loaded into the real SPRGs when resuming the
+	 * guest (PR-mode only).
+	 */
 	case SPRN_SPRG4:
 		vcpu->arch.shared->sprg4 = spr_val; break;
 	case SPRN_SPRG5:
@@ -136,6 +144,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 
 	case SPRN_IVPR:
 		vcpu->arch.ivpr = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVPR, spr_val);
+#endif
 		break;
 	case SPRN_IVOR0:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val;
@@ -145,6 +156,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		break;
 	case SPRN_IVOR2:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVOR2, spr_val);
+#endif
 		break;
 	case SPRN_IVOR3:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val;
@@ -163,6 +177,9 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		break;
 	case SPRN_IVOR8:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val;
+#ifdef CONFIG_KVM_BOOKE_HV
+		mtspr(SPRN_GIVOR8, spr_val);
+#endif
 		break;
 	case SPRN_IVOR9:
 		vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val;
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
new file mode 100644
index 0000000..9eaeebd
--- /dev/null
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -0,0 +1,587 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2010-2011 Freescale Semiconductor, Inc.
+ *
+ * Author: Varun Sethi <varun.sethi@freescale.com>
+ * Author: Scott Wood <scotwood@freescale.com>
+ *
+ * This file is derived from arch/powerpc/kvm/booke_interrupts.S
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/kvm_asm.h>
+#include <asm/reg.h>
+#include <asm/mmu-44x.h>
+#include <asm/page.h>
+#include <asm/asm-compat.h>
+#include <asm/asm-offsets.h>
+#include <asm/bitsperlong.h>
+
+#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */
+
+#define GET_VCPU(vcpu, thread)	\
+	PPC_LL	vcpu, THREAD_KVM_VCPU(thread)
+
+#define SET_VCPU(vcpu)		\
+        PPC_STL	vcpu, (THREAD + THREAD_KVM_VCPU)(r2)
+
+#define LONGBYTES		(BITS_PER_LONG / 8)
+
+#define VCPU_GPR(n)     	(VCPU_GPRS + (n * LONGBYTES))
+#define VCPU_GUEST_SPRG(n)	(VCPU_GUEST_SPRGS + (n * LONGBYTES))
+
+/* The host stack layout: */
+#define HOST_R1         (0 * LONGBYTES) /* Implied by stwu. */
+#define HOST_CALLEE_LR  (1 * LONGBYTES)
+#define HOST_RUN        (2 * LONGBYTES) /* struct kvm_run */
+/*
+ * r2 is special: it holds 'current', and it made nonvolatile in the
+ * kernel with the -ffixed-r2 gcc option.
+ */
+#define HOST_R2         (3 * LONGBYTES)
+#define HOST_NV_GPRS    (4 * LONGBYTES)
+#define HOST_NV_GPR(n)  (HOST_NV_GPRS + ((n - 14) * LONGBYTES))
+#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(31) + LONGBYTES)
+#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */
+#define HOST_STACK_LR   (HOST_STACK_SIZE + LONGBYTES) /* In caller stack frame. */
+
+#define NEED_EMU		0x00000001 /* emulation -- save nv regs */
+#define NEED_DEAR		0x00000002 /* save faulting DEAR */
+#define NEED_ESR		0x00000004 /* save faulting ESR */
+
+/*
+ * On entry:
+ * r4 = vcpu, r5 = srr0, r6 = srr1
+ * saved in vcpu: cr, ctr, r3-r13
+ */
+.macro kvm_handler_common intno, srr0, flags
+	mfspr	r10, SPRN_PID
+	lwz	r8, VCPU_HOST_PID(r4)
+	PPC_LL	r11, VCPU_SHARED(r4)
+	PPC_STL	r14, VCPU_GPR(r14)(r4) /* We need a non-volatile GPR. */
+	li	r14, \intno
+
+	stw	r10, VCPU_GUEST_PID(r4)
+	mtspr	SPRN_PID, r8
+
+	.if	\flags & NEED_EMU
+	lwz	r9, VCPU_KVM(r4)
+	.endif
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save exit time */
+1:	mfspr	r7, SPRN_TBRU
+	mfspr	r8, SPRN_TBRL
+	mfspr	r9, SPRN_TBRU
+	cmpw	r9, r7
+	PPC_STL	r8, VCPU_TIMING_EXIT_TBL(r4)
+	bne-	1b
+	PPC_STL	r9, VCPU_TIMING_EXIT_TBU(r4)
+#endif
+
+	oris	r8, r6, MSR_CE@h
+#ifndef CONFIG_64BIT
+	stw	r6, (VCPU_SHARED_MSR + 4)(r11)
+#else
+	std	r6, (VCPU_SHARED_MSR)(r11)
+#endif
+	ori	r8, r8, MSR_ME | MSR_RI
+	PPC_STL	r5, VCPU_PC(r4)
+
+	/*
+	 * Make sure CE/ME/RI are set (if appropriate for exception type)
+	 * whether or not the guest had it set.  Since mfmsr/mtmsr are
+	 * somewhat expensive, skip in the common case where the guest
+	 * had all these bits set (and thus they're still set if
+	 * appropriate for the exception type).
+	 */
+	cmpw	r6, r8
+	.if	\flags & NEED_EMU
+	lwz	r9, KVM_LPID(r9)
+	.endif
+	beq	1f
+	mfmsr	r7
+	.if	\srr0 != SPRN_MCSRR0 && \srr0 != SPRN_CSRR0
+	oris	r7, r7, MSR_CE@h
+	.endif
+	.if	\srr0 != SPRN_MCSRR0
+	ori	r7, r7, MSR_ME | MSR_RI
+	.endif
+	mtmsr	r7
+1:
+
+	.if	\flags & NEED_EMU
+	/*
+	 * This assumes you have external PID support.
+	 * To support a bookehv CPU without external PID, you'll
+	 * need to look up the TLB entry and create a temporary mapping.
+	 *
+	 * FIXME: we don't currently handle if the lwepx faults.  PR-mode
+	 * booke doesn't handle it either.  Since Linux doesn't use
+	 * broadcast tlbivax anymore, the only way this should happen is
+	 * if the guest maps its memory execute-but-not-read, or if we
+	 * somehow take a TLB miss in the middle of this entry code and
+	 * evict the relevant entry.  On e500mc, all kernel lowmem is
+	 * bolted into TLB1 large page mappings, and we don't use
+	 * broadcast invalidates, so we should not take a TLB miss here.
+	 *
+	 * Later we'll need to deal with faults here.  Disallowing guest
+	 * mappings that are execute-but-not-read could be an option on
+	 * e500mc, but not on chips with an LRAT if it is used.
+	 */
+
+	mfspr	r3, SPRN_EPLC	/* will already have correct ELPID and EGS */
+	PPC_STL	r15, VCPU_GPR(r15)(r4)
+	PPC_STL	r16, VCPU_GPR(r16)(r4)
+	PPC_STL	r17, VCPU_GPR(r17)(r4)
+	PPC_STL	r18, VCPU_GPR(r18)(r4)
+	PPC_STL	r19, VCPU_GPR(r19)(r4)
+	mr	r8, r3
+	PPC_STL	r20, VCPU_GPR(r20)(r4)
+	rlwimi	r8, r6, EPC_EAS_SHIFT - MSR_IR_LG, EPC_EAS
+	PPC_STL	r21, VCPU_GPR(r21)(r4)
+	rlwimi	r8, r6, EPC_EPR_SHIFT - MSR_PR_LG, EPC_EPR
+	PPC_STL	r22, VCPU_GPR(r22)(r4)
+	rlwimi	r8, r10, EPC_EPID_SHIFT, EPC_EPID
+	PPC_STL	r23, VCPU_GPR(r23)(r4)
+	PPC_STL	r24, VCPU_GPR(r24)(r4)
+	PPC_STL	r25, VCPU_GPR(r25)(r4)
+	PPC_STL	r26, VCPU_GPR(r26)(r4)
+	PPC_STL	r27, VCPU_GPR(r27)(r4)
+	PPC_STL	r28, VCPU_GPR(r28)(r4)
+	PPC_STL	r29, VCPU_GPR(r29)(r4)
+	PPC_STL	r30, VCPU_GPR(r30)(r4)
+	PPC_STL	r31, VCPU_GPR(r31)(r4)
+	mtspr	SPRN_EPLC, r8
+	isync
+	lwepx	r9, 0, r5
+	mtspr	SPRN_EPLC, r3
+	stw	r9, VCPU_LAST_INST(r4)
+	.endif
+
+	.if	\flags & NEED_ESR
+	mfspr	r8, SPRN_ESR
+	PPC_STL	r8, VCPU_FAULT_ESR(r4)
+	.endif
+
+	.if	\flags & NEED_DEAR
+	mfspr	r9, SPRN_DEAR
+	PPC_STL	r9, VCPU_FAULT_DEAR(r4)
+	.endif
+
+	b	kvmppc_resume_host
+.endm
+
+/*
+ * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h
+ */
+.macro kvm_handler intno srr0, srr1, flags
+_GLOBAL(kvmppc_handler_\intno\()_\srr1)
+	GET_VCPU(r11, r10)
+	PPC_STL r3, VCPU_GPR(r3)(r11)
+	mfspr	r3, SPRN_SPRG_RSCRATCH0
+	PPC_STL	r4, VCPU_GPR(r4)(r11)
+	PPC_LL	r4, THREAD_NORMSAVE(0)(r10)
+	PPC_STL	r5, VCPU_GPR(r5)(r11)
+	PPC_STL	r13, VCPU_CR(r11)
+	mfspr	r5, \srr0
+	PPC_STL	r3, VCPU_GPR(r10)(r11)
+	PPC_LL	r3, THREAD_NORMSAVE(2)(r10)
+	PPC_STL	r6, VCPU_GPR(r6)(r11)
+	PPC_STL	r4, VCPU_GPR(r11)(r11)
+	mfspr	r6, \srr1
+	PPC_STL	r7, VCPU_GPR(r7)(r11)
+	PPC_STL	r8, VCPU_GPR(r8)(r11)
+	PPC_STL	r9, VCPU_GPR(r9)(r11)
+	PPC_STL r3, VCPU_GPR(r13)(r11)
+	mfctr	r7
+	PPC_STL	r12, VCPU_GPR(r12)(r11)
+	PPC_STL	r7, VCPU_CTR(r11)
+	mr	r4, r11
+	kvm_handler_common \intno, \srr0, \flags
+.endm
+
+.macro kvm_lvl_handler intno scratch srr0, srr1, flags
+_GLOBAL(kvmppc_handler_\intno\()_\srr1)
+	mfspr	r10, SPRN_SPRG_THREAD
+	GET_VCPU(r11, r10)
+	PPC_STL r3, VCPU_GPR(r3)(r11)
+	mfspr	r3, \scratch
+	PPC_STL	r4, VCPU_GPR(r4)(r11)
+	PPC_LL	r4, GPR9(r8)
+	PPC_STL	r5, VCPU_GPR(r5)(r11)
+	PPC_STL	r9, VCPU_CR(r11)
+	mfspr	r5, \srr0
+	PPC_STL	r3, VCPU_GPR(r8)(r11)
+	PPC_LL	r3, GPR10(r8)
+	PPC_STL	r6, VCPU_GPR(r6)(r11)
+	PPC_STL	r4, VCPU_GPR(r9)(r11)
+	mfspr	r6, \srr1
+	PPC_LL	r4, GPR11(r8)
+	PPC_STL	r7, VCPU_GPR(r7)(r11)
+	PPC_STL	r8, VCPU_GPR(r8)(r11)
+	PPC_STL r3, VCPU_GPR(r10)(r11)
+	mfctr	r7
+	PPC_STL	r12, VCPU_GPR(r12)(r11)
+	PPC_STL	r4, VCPU_GPR(r11)(r11)
+	PPC_STL	r7, VCPU_CTR(r11)
+	mr	r4, r11
+	kvm_handler_common \intno, \srr0, \flags
+.endm
+
+kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \
+	SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR)
+kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR
+kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_PROGRAM, SPRN_SRR0, SPRN_SRR1, NEED_ESR
+kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DECREMENTER, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_FIT, SPRN_SRR0, SPRN_SRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_WATCHDOG, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DTLB_MISS, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_ITLB_MISS, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_HV_PRIV, SPRN_SRR0, SPRN_SRR1, NEED_EMU
+kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, SPRN_GSRR0, SPRN_GSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
+	SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0
+
+
+/* Registers:
+ *  SPRG_SCRATCH0: guest r10
+ *  r4: vcpu pointer
+ *  r11: vcpu->arch.shared
+ *  r14: KVM exit number
+ */
+_GLOBAL(kvmppc_resume_host)
+	/* Save remaining volatile guest register state to vcpu. */
+	mfspr	r3, SPRN_VRSAVE
+	PPC_STL	r0, VCPU_GPR(r0)(r4)
+	PPC_STL	r1, VCPU_GPR(r1)(r4)
+	mflr	r5
+	mfspr	r6, SPRN_SPRG4
+	PPC_STL	r2, VCPU_GPR(r2)(r4)
+	PPC_STL	r5, VCPU_LR(r4)
+	mfspr	r7, SPRN_SPRG5
+	PPC_STL	r3, VCPU_VRSAVE(r4)
+	PPC_STL	r6, VCPU_SHARED_SPRG4(r11)
+	mfspr	r8, SPRN_SPRG6
+	PPC_STL	r7, VCPU_SHARED_SPRG5(r11)
+	mfspr	r9, SPRN_SPRG7
+	PPC_STL	r8, VCPU_SHARED_SPRG6(r11)
+	mfxer	r3
+	PPC_STL	r9, VCPU_SHARED_SPRG7(r11)
+
+	/* save guest MAS registers and restore host mas4 & mas6 */
+	mfspr	r5, SPRN_MAS0
+	PPC_STL	r3, VCPU_XER(r4)
+	mfspr	r6, SPRN_MAS1
+	stw	r5, VCPU_SHARED_MAS0(r11)
+	mfspr	r7, SPRN_MAS2
+	stw	r6, VCPU_SHARED_MAS1(r11)
+#ifndef CONFIG_64BIT
+	stw	r7, (VCPU_SHARED_MAS2 + 4)(r11)
+#else
+	std	r7, (VCPU_SHARED_MAS2)(r11)
+#endif
+	mfspr	r5, SPRN_MAS3
+	mfspr	r6, SPRN_MAS4
+	stw	r5, VCPU_SHARED_MAS7_3+4(r11)
+	mfspr	r7, SPRN_MAS6
+	stw	r6, VCPU_SHARED_MAS4(r11)
+	mfspr	r5, SPRN_MAS7
+	lwz	r6, VCPU_HOST_MAS4(r4)
+	stw	r7, VCPU_SHARED_MAS6(r11)
+	lwz	r8, VCPU_HOST_MAS6(r4)
+	mtspr	SPRN_MAS4, r6
+	stw	r5, VCPU_SHARED_MAS7_3+0(r11)
+	mtspr	SPRN_MAS6, r8
+	mfspr	r3, SPRN_EPCR
+	rlwinm	r3, r3, 0, ~SPRN_EPCR_DMIUH
+	mtspr	SPRN_EPCR, r3
+	isync
+
+	/* Restore host stack pointer */
+	PPC_LL	r1, VCPU_HOST_STACK(r4)
+	PPC_LL	r2, HOST_R2(r1)
+
+	/* Switch to kernel stack and jump to handler. */
+	PPC_LL	r3, HOST_RUN(r1)
+	mr	r5, r14 /* intno */
+	mr	r14, r4 /* Save vcpu pointer. */
+	bl	kvmppc_handle_exit
+
+	/* Restore vcpu pointer and the nonvolatiles we used. */
+	mr	r4, r14
+	PPC_LL	r14, VCPU_GPR(r14)(r4)
+
+	andi.	r5, r3, RESUME_FLAG_NV
+	beq	skip_nv_load
+	PPC_LL	r15, VCPU_GPR(r15)(r4)
+	PPC_LL	r16, VCPU_GPR(r16)(r4)
+	PPC_LL	r17, VCPU_GPR(r17)(r4)
+	PPC_LL	r18, VCPU_GPR(r18)(r4)
+	PPC_LL	r19, VCPU_GPR(r19)(r4)
+	PPC_LL	r20, VCPU_GPR(r20)(r4)
+	PPC_LL	r21, VCPU_GPR(r21)(r4)
+	PPC_LL	r22, VCPU_GPR(r22)(r4)
+	PPC_LL	r23, VCPU_GPR(r23)(r4)
+	PPC_LL	r24, VCPU_GPR(r24)(r4)
+	PPC_LL	r25, VCPU_GPR(r25)(r4)
+	PPC_LL	r26, VCPU_GPR(r26)(r4)
+	PPC_LL	r27, VCPU_GPR(r27)(r4)
+	PPC_LL	r28, VCPU_GPR(r28)(r4)
+	PPC_LL	r29, VCPU_GPR(r29)(r4)
+	PPC_LL	r30, VCPU_GPR(r30)(r4)
+	PPC_LL	r31, VCPU_GPR(r31)(r4)
+skip_nv_load:
+	/* Should we return to the guest? */
+	andi.	r5, r3, RESUME_FLAG_HOST
+	beq	lightweight_exit
+
+	srawi	r3, r3, 2 /* Shift -ERR back down. */
+
+heavyweight_exit:
+	/* Not returning to guest. */
+	PPC_LL	r5, HOST_STACK_LR(r1)
+
+	/*
+	 * We already saved guest volatile register state; now save the
+	 * non-volatiles.
+	 */
+
+	PPC_STL	r15, VCPU_GPR(r15)(r4)
+	PPC_STL	r16, VCPU_GPR(r16)(r4)
+	PPC_STL	r17, VCPU_GPR(r17)(r4)
+	PPC_STL	r18, VCPU_GPR(r18)(r4)
+	PPC_STL	r19, VCPU_GPR(r19)(r4)
+	PPC_STL	r20, VCPU_GPR(r20)(r4)
+	PPC_STL	r21, VCPU_GPR(r21)(r4)
+	PPC_STL	r22, VCPU_GPR(r22)(r4)
+	PPC_STL	r23, VCPU_GPR(r23)(r4)
+	PPC_STL	r24, VCPU_GPR(r24)(r4)
+	PPC_STL	r25, VCPU_GPR(r25)(r4)
+	PPC_STL	r26, VCPU_GPR(r26)(r4)
+	PPC_STL	r27, VCPU_GPR(r27)(r4)
+	PPC_STL	r28, VCPU_GPR(r28)(r4)
+	PPC_STL	r29, VCPU_GPR(r29)(r4)
+	PPC_STL	r30, VCPU_GPR(r30)(r4)
+	PPC_STL	r31, VCPU_GPR(r31)(r4)
+
+	/* Load host non-volatile register state from host stack. */
+	PPC_LL	r14, HOST_NV_GPR(r14)(r1)
+	PPC_LL	r15, HOST_NV_GPR(r15)(r1)
+	PPC_LL	r16, HOST_NV_GPR(r16)(r1)
+	PPC_LL	r17, HOST_NV_GPR(r17)(r1)
+	PPC_LL	r18, HOST_NV_GPR(r18)(r1)
+	PPC_LL	r19, HOST_NV_GPR(r19)(r1)
+	PPC_LL	r20, HOST_NV_GPR(r20)(r1)
+	PPC_LL	r21, HOST_NV_GPR(r21)(r1)
+	PPC_LL	r22, HOST_NV_GPR(r22)(r1)
+	PPC_LL	r23, HOST_NV_GPR(r23)(r1)
+	PPC_LL	r24, HOST_NV_GPR(r24)(r1)
+	PPC_LL	r25, HOST_NV_GPR(r25)(r1)
+	PPC_LL	r26, HOST_NV_GPR(r26)(r1)
+	PPC_LL	r27, HOST_NV_GPR(r27)(r1)
+	PPC_LL	r28, HOST_NV_GPR(r28)(r1)
+	PPC_LL	r29, HOST_NV_GPR(r29)(r1)
+	PPC_LL	r30, HOST_NV_GPR(r30)(r1)
+	PPC_LL	r31, HOST_NV_GPR(r31)(r1)
+
+	/* Return to kvm_vcpu_run(). */
+	mtlr	r5
+	addi	r1, r1, HOST_STACK_SIZE
+	/* r3 still contains the return code from kvmppc_handle_exit(). */
+	blr
+
+/* Registers:
+ *  r3: kvm_run pointer
+ *  r4: vcpu pointer
+ */
+_GLOBAL(__kvmppc_vcpu_run)
+	stwu	r1, -HOST_STACK_SIZE(r1)
+	PPC_STL	r1, VCPU_HOST_STACK(r4)	/* Save stack pointer to vcpu. */
+
+	/* Save host state to stack. */
+	PPC_STL	r3, HOST_RUN(r1)
+	mflr	r3
+	PPC_STL	r3, HOST_STACK_LR(r1)
+
+	/* Save host non-volatile register state to stack. */
+	PPC_STL	r14, HOST_NV_GPR(r14)(r1)
+	PPC_STL	r15, HOST_NV_GPR(r15)(r1)
+	PPC_STL	r16, HOST_NV_GPR(r16)(r1)
+	PPC_STL	r17, HOST_NV_GPR(r17)(r1)
+	PPC_STL	r18, HOST_NV_GPR(r18)(r1)
+	PPC_STL	r19, HOST_NV_GPR(r19)(r1)
+	PPC_STL	r20, HOST_NV_GPR(r20)(r1)
+	PPC_STL	r21, HOST_NV_GPR(r21)(r1)
+	PPC_STL	r22, HOST_NV_GPR(r22)(r1)
+	PPC_STL	r23, HOST_NV_GPR(r23)(r1)
+	PPC_STL	r24, HOST_NV_GPR(r24)(r1)
+	PPC_STL	r25, HOST_NV_GPR(r25)(r1)
+	PPC_STL	r26, HOST_NV_GPR(r26)(r1)
+	PPC_STL	r27, HOST_NV_GPR(r27)(r1)
+	PPC_STL	r28, HOST_NV_GPR(r28)(r1)
+	PPC_STL	r29, HOST_NV_GPR(r29)(r1)
+	PPC_STL	r30, HOST_NV_GPR(r30)(r1)
+	PPC_STL	r31, HOST_NV_GPR(r31)(r1)
+
+	/* Load guest non-volatiles. */
+	PPC_LL	r14, VCPU_GPR(r14)(r4)
+	PPC_LL	r15, VCPU_GPR(r15)(r4)
+	PPC_LL	r16, VCPU_GPR(r16)(r4)
+	PPC_LL	r17, VCPU_GPR(r17)(r4)
+	PPC_LL	r18, VCPU_GPR(r18)(r4)
+	PPC_LL	r19, VCPU_GPR(r19)(r4)
+	PPC_LL	r20, VCPU_GPR(r20)(r4)
+	PPC_LL	r21, VCPU_GPR(r21)(r4)
+	PPC_LL	r22, VCPU_GPR(r22)(r4)
+	PPC_LL	r23, VCPU_GPR(r23)(r4)
+	PPC_LL	r24, VCPU_GPR(r24)(r4)
+	PPC_LL	r25, VCPU_GPR(r25)(r4)
+	PPC_LL	r26, VCPU_GPR(r26)(r4)
+	PPC_LL	r27, VCPU_GPR(r27)(r4)
+	PPC_LL	r28, VCPU_GPR(r28)(r4)
+	PPC_LL	r29, VCPU_GPR(r29)(r4)
+	PPC_LL	r30, VCPU_GPR(r30)(r4)
+	PPC_LL	r31, VCPU_GPR(r31)(r4)
+
+
+lightweight_exit:
+	PPC_STL	r2, HOST_R2(r1)
+
+	mfspr	r3, SPRN_PID
+	stw	r3, VCPU_HOST_PID(r4)
+	lwz	r3, VCPU_GUEST_PID(r4)
+	mtspr	SPRN_PID, r3
+
+	/* Save vcpu pointer for the exception handlers
+	 * must be done before loading guest r2.
+	 */
+//	SET_VCPU(r4)
+
+	PPC_LL	r11, VCPU_SHARED(r4)
+	/* Save host mas4 and mas6 and load guest MAS registers */
+	mfspr	r3, SPRN_MAS4
+	stw	r3, VCPU_HOST_MAS4(r4)
+	mfspr	r3, SPRN_MAS6
+	stw	r3, VCPU_HOST_MAS6(r4)
+	lwz	r3, VCPU_SHARED_MAS0(r11)
+	lwz	r5, VCPU_SHARED_MAS1(r11)
+#ifndef CONFIG_64BIT
+	lwz	r6, (VCPU_SHARED_MAS2 + 4)(r11)
+#else
+	ld	r6, (VCPU_SHARED_MAS2)(r11)
+#endif
+	lwz	r7, VCPU_SHARED_MAS7_3+4(r11)
+	lwz	r8, VCPU_SHARED_MAS4(r11)
+	mtspr	SPRN_MAS0, r3
+	mtspr	SPRN_MAS1, r5
+	mtspr	SPRN_MAS2, r6
+	mtspr	SPRN_MAS3, r7
+	mtspr	SPRN_MAS4, r8
+	lwz	r3, VCPU_SHARED_MAS6(r11)
+	lwz	r5, VCPU_SHARED_MAS7_3+0(r11)
+	mtspr	SPRN_MAS6, r3
+	mtspr	SPRN_MAS7, r5
+	/* Disable MAS register updates via exception */
+	mfspr	r3, SPRN_EPCR
+	oris	r3, r3, SPRN_EPCR_DMIUH@h
+	mtspr	SPRN_EPCR, r3
+
+	/*
+	 * Host interrupt handlers may have clobbered these guest-readable
+	 * SPRGs, so we need to reload them here with the guest's values.
+	 */
+	lwz	r3, VCPU_VRSAVE(r4)
+	lwz	r5, VCPU_SHARED_SPRG4(r11)
+	mtspr	SPRN_VRSAVE, r3
+	lwz	r6, VCPU_SHARED_SPRG5(r11)
+	mtspr	SPRN_SPRG4W, r5
+	lwz	r7, VCPU_SHARED_SPRG6(r11)
+	mtspr	SPRN_SPRG5W, r6
+	lwz	r8, VCPU_SHARED_SPRG7(r11)
+	mtspr	SPRN_SPRG6W, r7
+	mtspr	SPRN_SPRG7W, r8
+
+	/* Load some guest volatiles. */
+	PPC_LL	r3, VCPU_LR(r4)
+	PPC_LL	r5, VCPU_XER(r4)
+	PPC_LL	r6, VCPU_CTR(r4)
+	PPC_LL	r7, VCPU_CR(r4)
+	PPC_LL	r8, VCPU_PC(r4)
+#ifndef CONFIG_64BIT
+	lwz	r9, (VCPU_SHARED_MSR + 4)(r11)
+#else
+	ld	r9, (VCPU_SHARED_MSR)(r11)
+#endif
+	PPC_LL	r0, VCPU_GPR(r0)(r4)
+	PPC_LL	r1, VCPU_GPR(r1)(r4)
+	PPC_LL	r2, VCPU_GPR(r2)(r4)
+	PPC_LL	r10, VCPU_GPR(r10)(r4)
+	PPC_LL	r11, VCPU_GPR(r11)(r4)
+	PPC_LL	r12, VCPU_GPR(r12)(r4)
+	PPC_LL	r13, VCPU_GPR(r13)(r4)
+	mtlr	r3
+	mtxer	r5
+	mtctr	r6
+	mtcr	r7
+	mtsrr0	r8
+	mtsrr1	r9
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save enter time */
+1:
+	mfspr	r6, SPRN_TBRU
+	mfspr	r7, SPRN_TBRL
+	mfspr	r8, SPRN_TBRU
+	cmpw	r8, r6
+	PPC_STL	r7, VCPU_TIMING_LAST_ENTER_TBL(r4)
+	bne	1b
+	PPC_STL	r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
+#endif
+
+	/* Finish loading guest volatiles and jump to guest. */
+	PPC_LL	r5, VCPU_GPR(r5)(r4)
+	PPC_LL	r6, VCPU_GPR(r6)(r4)
+	PPC_LL	r7, VCPU_GPR(r7)(r4)
+	PPC_LL	r8, VCPU_GPR(r8)(r4)
+	PPC_LL	r9, VCPU_GPR(r9)(r4)
+
+	PPC_LL	r3, VCPU_GPR(r3)(r4)
+	PPC_LL	r4, VCPU_GPR(r4)(r4)
+	rfi
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 9806ea5..a5c5ceb 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -109,6 +109,11 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu)
 		goto out;
 #endif
 
+#ifdef CONFIG_KVM_BOOKE_HV
+	if (!cpu_has_feature(CPU_FTR_EMB_HV))
+		goto out;
+#endif
+
 	r = true;
 
 out:
diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h
index 8167d42..bf191e7 100644
--- a/arch/powerpc/kvm/timing.h
+++ b/arch/powerpc/kvm/timing.h
@@ -93,6 +93,12 @@ static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type)
 	case SIGNAL_EXITS:
 		vcpu->stat.signal_exits++;
 		break;
+	case DBELL_EXITS:
+		vcpu->stat.dbell_exits++;
+		break;
+	case GDBELL_EXITS:
+		vcpu->stat.gdbell_exits++;
+		break;
 	}
 }
 
-- 
1.6.0.2

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox