public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [patch 0/3] kvm tool: Serial emulation overhaul
@ 2011-12-10 13:27 Thomas Gleixner
  2011-12-10 13:27 ` [patch 1/3] kvm tool: serial: Cleanup coding style Thomas Gleixner
                   ` (3 more replies)
  0 siblings, 4 replies; 46+ messages in thread
From: Thomas Gleixner @ 2011-12-10 13:27 UTC (permalink / raw)
  To: LKML; +Cc: Pekka Enberg, Sasha Levin

We observed massive hangs of the serial console in kvm tool which
turned out to be caused by the bogus emulation of the interrupt line
behaviour.

This series cleans up and simplifies the serial code and in the last
step fixes the interrupt handling proper.

Thanks,

	tglx

 hw/serial.c               |  224 ++++++++++++++++++++--------------------------
 include/kvm/8250-serial.h |    2 
 x86/kvm.c                 |    2 
 3 files changed, 103 insertions(+), 125 deletions(-)


^ permalink raw reply	[flat|nested] 46+ messages in thread

* [patch 1/3] kvm tool: serial: Cleanup coding style
  2011-12-10 13:27 [patch 0/3] kvm tool: Serial emulation overhaul Thomas Gleixner
@ 2011-12-10 13:27 ` Thomas Gleixner
  2011-12-10 13:27 ` [patch 3/3] kvm tool: serial: Fix interrupt handling Thomas Gleixner
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 46+ messages in thread
From: Thomas Gleixner @ 2011-12-10 13:27 UTC (permalink / raw)
  To: LKML; +Cc: Pekka Enberg, Sasha Levin

[-- Attachment #1: kvm-tool-cleanup-serial-c.patch --]
[-- Type: text/plain, Size: 5252 bytes --]

It's nice to align struct initializers, but random tab insertion into
the code flow is a horrible idea.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 tools/kvm/hw/serial.c |   69 ++++++++++++++++++++++++--------------------------
 1 file changed, 34 insertions(+), 35 deletions(-)

Index: linux-kvm/tools/kvm/hw/serial.c
===================================================================
--- linux-kvm.orig/tools/kvm/hw/serial.c
+++ linux-kvm/tools/kvm/hw/serial.c
@@ -91,13 +91,13 @@ static void serial8250__sysrq(struct kvm
 {
 	switch (sysrq_pending) {
 	case SYSRQ_PENDING_BREAK:
-		dev->lsr	|= UART_LSR_DR | UART_LSR_BI;
+		dev->lsr |= UART_LSR_DR | UART_LSR_BI;
 
-		sysrq_pending	= SYSRQ_PENDING_CMD;
+		sysrq_pending = SYSRQ_PENDING_CMD;
 		break;
 	case SYSRQ_PENDING_CMD:
-		dev->rbr	= 'p';
-		dev->lsr	|= UART_LSR_DR;
+		dev->rbr = 'p';
+		dev->lsr |= UART_LSR_DR;
 
 		sysrq_pending	= SYSRQ_PENDING_NONE;
 		break;
@@ -124,8 +124,8 @@ static void serial8250__receive(struct k
 	if (c < 0)
 		return;
 
-	dev->rbr	= c;
-	dev->lsr	|= UART_LSR_DR;
+	dev->rbr = c;
+	dev->lsr |= UART_LSR_DR;
 }
 
 void serial8250__inject_interrupt(struct kvm *kvm)
@@ -140,11 +140,11 @@ void serial8250__inject_interrupt(struct
 		serial8250__receive(kvm, dev);
 
 		if (dev->ier & UART_IER_RDI && dev->lsr & UART_LSR_DR)
-			dev->iir		= UART_IIR_RDI;
+			dev->iir = UART_IIR_RDI;
 		else if (dev->ier & UART_IER_THRI)
-			dev->iir		= UART_IIR_THRI;
+			dev->iir = UART_IIR_THRI;
 		else
-			dev->iir		= UART_IIR_NO_INT;
+			dev->iir = UART_IIR_NO_INT;
 
 		if (dev->iir != UART_IIR_NO_INT) {
 			kvm__irq_line(kvm, dev->irq, 0);
@@ -179,30 +179,30 @@ static bool serial8250_out(struct ioport
 	u16 offset;
 	bool ret = true;
 
-	dev		= find_device(port);
+	dev = find_device(port);
 	if (!dev)
 		return false;
 
 	mutex_lock(&dev->mutex);
 
-	offset		= port - dev->iobase;
+	offset = port - dev->iobase;
 
 	if (dev->lcr & UART_LCR_DLAB) {
 		switch (offset) {
 		case UART_DLL:
-			dev->dll	= ioport__read8(data);
+			dev->dll = ioport__read8(data);
 			break;
 		case UART_DLM:
-			dev->dlm	= ioport__read8(data);
+			dev->dlm = ioport__read8(data);
 			break;
 		case UART_FCR:
-			dev->fcr	= ioport__read8(data);
+			dev->fcr = ioport__read8(data);
 			break;
 		case UART_LCR:
-			dev->lcr	= ioport__read8(data);
+			dev->lcr = ioport__read8(data);
 			break;
 		case UART_MCR:
-			dev->mcr	= ioport__read8(data);
+			dev->mcr = ioport__read8(data);
 			break;
 		case UART_LSR:
 			/* Factory test */
@@ -211,11 +211,11 @@ static bool serial8250_out(struct ioport
 			/* Not used */
 			break;
 		case UART_SCR:
-			dev->scr	= ioport__read8(data);
+			dev->scr = ioport__read8(data);
 			break;
 		default:
-			ret		= false;
-			goto out_unlock;
+			ret = false;
+			break;
 		}
 	} else {
 		switch (offset) {
@@ -225,21 +225,21 @@ static bool serial8250_out(struct ioport
 			if (!(dev->mcr & UART_MCR_LOOP))
 				term_putc(CONSOLE_8250, addr, size, dev->id);
 
-			dev->iir		= UART_IIR_NO_INT;
+			dev->iir = UART_IIR_NO_INT;
 			break;
 		}
 		case UART_FCR:
-			dev->fcr	= ioport__read8(data);
+			dev->fcr = ioport__read8(data);
 			break;
 		case UART_IER:
-			dev->ier	= ioport__read8(data) & 0x3f;
+			dev->ier = ioport__read8(data) & 0x3f;
 			kvm__irq_line(kvm, dev->irq, dev->ier ? 1 : 0);
 			break;
 		case UART_LCR:
-			dev->lcr	= ioport__read8(data);
+			dev->lcr = ioport__read8(data);
 			break;
 		case UART_MCR:
-			dev->mcr	= ioport__read8(data);
+			dev->mcr = ioport__read8(data);
 			break;
 		case UART_LSR:
 			/* Factory test */
@@ -248,15 +248,14 @@ static bool serial8250_out(struct ioport
 			/* Not used */
 			break;
 		case UART_SCR:
-			dev->scr	= ioport__read8(data);
+			dev->scr = ioport__read8(data);
 			break;
 		default:
-			ret		= false;
-			goto out_unlock;
+			ret = false;
+			break;
 		}
 	}
 
-out_unlock:
 	mutex_unlock(&dev->mutex);
 
 	return ret;
@@ -268,13 +267,13 @@ static bool serial8250_in(struct ioport
 	u16 offset;
 	bool ret = true;
 
-	dev		= find_device(port);
+	dev = find_device(port);
 	if (!dev)
 		return false;
 
 	mutex_lock(&dev->mutex);
 
-	offset		= port - dev->iobase;
+	offset = port - dev->iobase;
 
 	if (dev->lcr & UART_LCR_DLAB) {
 		switch (offset) {
@@ -293,8 +292,8 @@ static bool serial8250_in(struct ioport
 		switch (offset) {
 		case UART_RX:
 			ioport__write8(data, dev->rbr);
-			dev->lsr		&= ~UART_LSR_DR;
-			dev->iir		= UART_IIR_NO_INT;
+			dev->lsr &= ~UART_LSR_DR;
+			dev->iir = UART_IIR_NO_INT;
 			goto out_unlock;
 
 		case UART_IER:
@@ -311,7 +310,7 @@ static bool serial8250_in(struct ioport
 		u8 iir = dev->iir;
 
 		if (dev->fcr & UART_FCR_ENABLE_FIFO)
-			iir		|= 0xc0;
+			iir |= 0xc0;
 
 		ioport__write8(data, iir);
 		break;
@@ -324,7 +323,7 @@ static bool serial8250_in(struct ioport
 		break;
 	case UART_LSR:
 		ioport__write8(data, dev->lsr);
-		dev->lsr		&= ~(UART_LSR_OE|UART_LSR_PE|UART_LSR_FE|UART_LSR_BI);
+		dev->lsr &= ~(UART_LSR_OE|UART_LSR_PE|UART_LSR_FE|UART_LSR_BI);
 		break;
 	case UART_MSR:
 		ioport__write8(data, dev->msr);
@@ -333,7 +332,7 @@ static bool serial8250_in(struct ioport
 		ioport__write8(data, dev->scr);
 		break;
 	default:
-		ret		= false;
+		ret = false;
 		goto out_unlock;
 	}
 out_unlock:



^ permalink raw reply	[flat|nested] 46+ messages in thread

* [patch 2/3] kvm tool: serial: Simplify switch cases
  2011-12-10 13:27 [patch 0/3] kvm tool: Serial emulation overhaul Thomas Gleixner
  2011-12-10 13:27 ` [patch 1/3] kvm tool: serial: Cleanup coding style Thomas Gleixner
  2011-12-10 13:27 ` [patch 3/3] kvm tool: serial: Fix interrupt handling Thomas Gleixner
@ 2011-12-10 13:27 ` Thomas Gleixner
  2011-12-10 15:17 ` [patch 0/3] kvm tool: Serial emulation overhaul Pekka Enberg
  3 siblings, 0 replies; 46+ messages in thread
From: Thomas Gleixner @ 2011-12-10 13:27 UTC (permalink / raw)
  To: LKML; +Cc: Pekka Enberg, Sasha Levin

[-- Attachment #1: kvm-tool-serial-simplify-switch-cases.patch --]
[-- Type: text/plain, Size: 3696 bytes --]

There is no point to have the same switch case construct for all the
registers, just to take care of the oddball case of DLL/DLM.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 tools/kvm/hw/serial.c |  129 +++++++++++++++++---------------------------------
 1 file changed, 44 insertions(+), 85 deletions(-)

Index: linux-kvm/tools/kvm/hw/serial.c
===================================================================
--- linux-kvm.orig/tools/kvm/hw/serial.c
+++ linux-kvm/tools/kvm/hw/serial.c
@@ -187,73 +187,48 @@ static bool serial8250_out(struct ioport
 
 	offset = port - dev->iobase;
 
-	if (dev->lcr & UART_LCR_DLAB) {
-		switch (offset) {
-		case UART_DLL:
-			dev->dll = ioport__read8(data);
-			break;
-		case UART_DLM:
-			dev->dlm = ioport__read8(data);
-			break;
-		case UART_FCR:
-			dev->fcr = ioport__read8(data);
-			break;
-		case UART_LCR:
-			dev->lcr = ioport__read8(data);
-			break;
-		case UART_MCR:
-			dev->mcr = ioport__read8(data);
-			break;
-		case UART_LSR:
-			/* Factory test */
-			break;
-		case UART_MSR:
-			/* Not used */
-			break;
-		case UART_SCR:
-			dev->scr = ioport__read8(data);
-			break;
-		default:
-			ret = false;
-			break;
-		}
-	} else {
-		switch (offset) {
-		case UART_TX: {
+	switch (offset) {
+	case UART_TX:
+		if (!(dev->lcr & UART_LCR_DLAB)) {
 			char *addr = data;
 
 			if (!(dev->mcr & UART_MCR_LOOP))
 				term_putc(CONSOLE_8250, addr, size, dev->id);
 
 			dev->iir = UART_IIR_NO_INT;
-			break;
+		} else {
+			dev->dll = ioport__read8(data);
 		}
-		case UART_FCR:
-			dev->fcr = ioport__read8(data);
-			break;
-		case UART_IER:
+		break;
+	case UART_IER:
+		if (!(dev->lcr & UART_LCR_DLAB)) {
 			dev->ier = ioport__read8(data) & 0x3f;
 			kvm__irq_line(kvm, dev->irq, dev->ier ? 1 : 0);
-			break;
-		case UART_LCR:
-			dev->lcr = ioport__read8(data);
-			break;
-		case UART_MCR:
-			dev->mcr = ioport__read8(data);
-			break;
-		case UART_LSR:
-			/* Factory test */
-			break;
-		case UART_MSR:
-			/* Not used */
-			break;
-		case UART_SCR:
-			dev->scr = ioport__read8(data);
-			break;
-		default:
-			ret = false;
-			break;
+		} else {
+			dev->dlm = ioport__read8(data);
 		}
+		break;
+	case UART_FCR:
+		dev->fcr = ioport__read8(data);
+		break;
+	case UART_LCR:
+		dev->lcr = ioport__read8(data);
+		break;
+	case UART_MCR:
+		dev->mcr = ioport__read8(data);
+		break;
+	case UART_LSR:
+		/* Factory test */
+		break;
+	case UART_MSR:
+		/* Not used */
+		break;
+	case UART_SCR:
+		dev->scr = ioport__read8(data);
+		break;
+	default:
+		ret = false;
+		break;
 	}
 
 	mutex_unlock(&dev->mutex);
@@ -275,37 +250,22 @@ static bool serial8250_in(struct ioport
 
 	offset = port - dev->iobase;
 
-	if (dev->lcr & UART_LCR_DLAB) {
-		switch (offset) {
-		case UART_DLL:
+	switch (offset) {
+	case UART_RX:
+		if (dev->lcr & UART_LCR_DLAB) {
 			ioport__write8(data, dev->dll);
-			goto out_unlock;
-
-		case UART_DLM:
-			ioport__write8(data, dev->dlm);
-			goto out_unlock;
-
-		default:
-			break;
-		}
-	} else {
-		switch (offset) {
-		case UART_RX:
+		} else {
 			ioport__write8(data, dev->rbr);
 			dev->lsr &= ~UART_LSR_DR;
 			dev->iir = UART_IIR_NO_INT;
-			goto out_unlock;
-
-		case UART_IER:
-			ioport__write8(data, dev->ier);
-			goto out_unlock;
-
-		default:
-			break;
 		}
-	}
-
-	switch (offset) {
+		break;
+	case UART_IER:
+		if (dev->lcr & UART_LCR_DLAB)
+			ioport__write8(data, dev->dlm);
+		else
+			ioport__write8(data, dev->ier);
+		break;
 	case UART_IIR: {
 		u8 iir = dev->iir;
 
@@ -333,9 +293,8 @@ static bool serial8250_in(struct ioport
 		break;
 	default:
 		ret = false;
-		goto out_unlock;
+		break;
 	}
-out_unlock:
 	mutex_unlock(&dev->mutex);
 
 	return ret;



^ permalink raw reply	[flat|nested] 46+ messages in thread

* [patch 3/3] kvm tool: serial: Fix interrupt handling
  2011-12-10 13:27 [patch 0/3] kvm tool: Serial emulation overhaul Thomas Gleixner
  2011-12-10 13:27 ` [patch 1/3] kvm tool: serial: Cleanup coding style Thomas Gleixner
@ 2011-12-10 13:27 ` Thomas Gleixner
  2011-12-10 13:27 ` [patch 2/3] kvm tool: serial: Simplify switch cases Thomas Gleixner
  2011-12-10 15:17 ` [patch 0/3] kvm tool: Serial emulation overhaul Pekka Enberg
  3 siblings, 0 replies; 46+ messages in thread
From: Thomas Gleixner @ 2011-12-10 13:27 UTC (permalink / raw)
  To: LKML; +Cc: Pekka Enberg, Sasha Levin

[-- Attachment #1: kvm-tools-serial-fix-interrupt-handling.patch --]
[-- Type: text/plain, Size: 5960 bytes --]

The interrupt injection of the serial emulation is completely
broken. It's just doing random toggling of the interrupt line, which
can lead to complete console hangs.

The real hardware asserts the interrupt line when a condition
(RX/TX/Status) is met and the corresponding interrupt is enabled in
the IER. It's deasserted when the condition is cleared or the
corresponding interrupt is disabled in the IER.

So the correct emulation just needs to check after each state change
in the LSR or the IER which bits in the IIR need to be set and update
the interrupt line accordingly. To avoid setting the same state over
and over keep an internal state of the last set interrupt line state
and only update via the kvm ioctl when the new state differs.

Rename serial8250__inject_interrupts() to serial8250__update_consoles()
which reflects what the function really is about.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 tools/kvm/hw/serial.c               |   74 ++++++++++++++++++++++--------------
 tools/kvm/include/kvm/8250-serial.h |    2 
 tools/kvm/x86/kvm.c                 |    2 
 3 files changed, 49 insertions(+), 29 deletions(-)

Index: linux-kvm/tools/kvm/hw/serial.c
===================================================================
--- linux-kvm.orig/tools/kvm/hw/serial.c
+++ linux-kvm/tools/kvm/hw/serial.c
@@ -18,6 +18,7 @@ struct serial8250_device {
 
 	u16			iobase;
 	u8			irq;
+	u8			irq_state;
 
 	u8			rbr;		/* receive buffer */
 	u8			dll;
@@ -81,6 +82,31 @@ static struct serial8250_device devices[
 	},
 };
 
+static void serial8250_update_irq(struct kvm *kvm, struct serial8250_device *dev)
+{
+	u8 iir = 0;
+
+	/* Data ready and rcv interrupt enabled ? */
+	if ((dev->ier & UART_IER_RDI) && (dev->lsr & UART_LSR_DR))
+		iir |= UART_IIR_RDI;
+
+	/* Transmitter empty and interrupt enabled ? */
+	if ((dev->ier & UART_IER_THRI) && (dev->lsr & UART_LSR_TEMT))
+		iir |= UART_IIR_THRI;
+
+	/* Now update the irq line, if necessary */
+	if (!iir) {
+		dev->iir = UART_IIR_NO_INT;
+		if (dev->irq_state)
+			kvm__irq_line(kvm, dev->irq, 0);
+	} else {
+		dev->iir = iir;
+		if (!dev->irq_state)
+			kvm__irq_line(kvm, dev->irq, 1);
+	}
+	dev->irq_state = iir;
+}
+
 #define SYSRQ_PENDING_NONE		0
 #define SYSRQ_PENDING_BREAK		1
 #define SYSRQ_PENDING_CMD		2
@@ -128,7 +154,7 @@ static void serial8250__receive(struct k
 	dev->lsr |= UART_LSR_DR;
 }
 
-void serial8250__inject_interrupt(struct kvm *kvm)
+void serial8250__update_consoles(struct kvm *kvm)
 {
 	unsigned int i;
 
@@ -139,17 +165,7 @@ void serial8250__inject_interrupt(struct
 
 		serial8250__receive(kvm, dev);
 
-		if (dev->ier & UART_IER_RDI && dev->lsr & UART_LSR_DR)
-			dev->iir = UART_IIR_RDI;
-		else if (dev->ier & UART_IER_THRI)
-			dev->iir = UART_IIR_THRI;
-		else
-			dev->iir = UART_IIR_NO_INT;
-
-		if (dev->iir != UART_IIR_NO_INT) {
-			kvm__irq_line(kvm, dev->irq, 0);
-			kvm__irq_line(kvm, dev->irq, 1);
-		}
+		serial8250_update_irq(kvm, dev);
 
 		mutex_unlock(&dev->mutex);
 	}
@@ -194,19 +210,26 @@ static bool serial8250_out(struct ioport
 
 			if (!(dev->mcr & UART_MCR_LOOP))
 				term_putc(CONSOLE_8250, addr, size, dev->id);
+			/* else FIXME: Inject data into rcv path for LOOP */
 
-			dev->iir = UART_IIR_NO_INT;
+			/*
+			 * Set transmitter and transmit hold register
+			 * empty.  We have no FIFO at the moment and
+			 * on the TX side it's only interesting, when
+			 * we could coalesce port io on the kernel
+			 * kernel.
+			 */
+			dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+			break;
 		} else {
 			dev->dll = ioport__read8(data);
 		}
 		break;
 	case UART_IER:
-		if (!(dev->lcr & UART_LCR_DLAB)) {
+		if (!(dev->lcr & UART_LCR_DLAB))
 			dev->ier = ioport__read8(data) & 0x3f;
-			kvm__irq_line(kvm, dev->irq, dev->ier ? 1 : 0);
-		} else {
+		else
 			dev->dlm = ioport__read8(data);
-		}
 		break;
 	case UART_FCR:
 		dev->fcr = ioport__read8(data);
@@ -231,6 +254,8 @@ static bool serial8250_out(struct ioport
 		break;
 	}
 
+	serial8250_update_irq(kvm, dev);
+
 	mutex_unlock(&dev->mutex);
 
 	return ret;
@@ -257,7 +282,6 @@ static bool serial8250_in(struct ioport 
 		} else {
 			ioport__write8(data, dev->rbr);
 			dev->lsr &= ~UART_LSR_DR;
-			dev->iir = UART_IIR_NO_INT;
 		}
 		break;
 	case UART_IER:
@@ -266,15 +290,9 @@ static bool serial8250_in(struct ioport 
 		else
 			ioport__write8(data, dev->ier);
 		break;
-	case UART_IIR: {
-		u8 iir = dev->iir;
-
-		if (dev->fcr & UART_FCR_ENABLE_FIFO)
-			iir |= 0xc0;
-
-		ioport__write8(data, iir);
+	case UART_IIR:
+		ioport__write8(data, dev->iir);
 		break;
-	}
 	case UART_LCR:
 		ioport__write8(data, dev->lcr);
 		break;
@@ -283,7 +301,6 @@ static bool serial8250_in(struct ioport 
 		break;
 	case UART_LSR:
 		ioport__write8(data, dev->lsr);
-		dev->lsr &= ~(UART_LSR_OE|UART_LSR_PE|UART_LSR_FE|UART_LSR_BI);
 		break;
 	case UART_MSR:
 		ioport__write8(data, dev->msr);
@@ -295,6 +312,9 @@ static bool serial8250_in(struct ioport 
 		ret = false;
 		break;
 	}
+
+	serial8250_update_irq(kvm, dev);
+
 	mutex_unlock(&dev->mutex);
 
 	return ret;
Index: linux-kvm/tools/kvm/include/kvm/8250-serial.h
===================================================================
--- linux-kvm.orig/tools/kvm/include/kvm/8250-serial.h
+++ linux-kvm/tools/kvm/include/kvm/8250-serial.h
@@ -4,7 +4,7 @@
 struct kvm;
 
 void serial8250__init(struct kvm *kvm);
-void serial8250__inject_interrupt(struct kvm *kvm);
+void serial8250__update_consoles(struct kvm *kvm);
 void serial8250__inject_sysrq(struct kvm *kvm);
 
 #endif /* KVM__8250_SERIAL_H */
Index: linux-kvm/tools/kvm/x86/kvm.c
===================================================================
--- linux-kvm.orig/tools/kvm/x86/kvm.c
+++ linux-kvm/tools/kvm/x86/kvm.c
@@ -351,6 +351,6 @@ void kvm__arch_setup_firmware(struct kvm
 
 void kvm__arch_periodic_poll(struct kvm *kvm)
 {
-	serial8250__inject_interrupt(kvm);
+	serial8250__update_consoles(kvm);
 	virtio_console__inject_interrupt(kvm);
 }



^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-10 13:27 [patch 0/3] kvm tool: Serial emulation overhaul Thomas Gleixner
                   ` (2 preceding siblings ...)
  2011-12-10 13:27 ` [patch 2/3] kvm tool: serial: Simplify switch cases Thomas Gleixner
@ 2011-12-10 15:17 ` Pekka Enberg
  2011-12-10 20:51   ` Thomas Gleixner
  3 siblings, 1 reply; 46+ messages in thread
From: Pekka Enberg @ 2011-12-10 15:17 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Sasha Levin, mingo

On Sat, 10 Dec 2011, Thomas Gleixner wrote:
> We observed massive hangs of the serial console in kvm tool which
> turned out to be caused by the bogus emulation of the interrupt line
> behaviour.
>
> This series cleans up and simplifies the serial code and in the last
> step fixes the interrupt handling proper.

Applied, thanks Thomas!

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-10 15:17 ` [patch 0/3] kvm tool: Serial emulation overhaul Pekka Enberg
@ 2011-12-10 20:51   ` Thomas Gleixner
  2011-12-11  8:15     ` Pekka Enberg
  0 siblings, 1 reply; 46+ messages in thread
From: Thomas Gleixner @ 2011-12-10 20:51 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: LKML, Sasha Levin, mingo

On Sat, 10 Dec 2011, Pekka Enberg wrote:

> On Sat, 10 Dec 2011, Thomas Gleixner wrote:
> > We observed massive hangs of the serial console in kvm tool which
> > turned out to be caused by the bogus emulation of the interrupt line
> > behaviour.
> > 
> > This series cleans up and simplifies the serial code and in the last
> > step fixes the interrupt handling proper.
> 
> Applied, thanks Thomas!

There is a slight problem with them. I only tested them with the
rt-guest kernels, but non rt kernels are slightly unhappy. Sorry for
not thinking about that.

Fix below.

Thanks,

	tglx
------------------>
Subject: kvm tools: serial: Make it work with non rt guests as well
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 10 Dec 2011 21:27:26 +0100

Sasha reported, that a non RT guest reports "too much work for irq 4"
with the previous serial overhaul.

The reason is, that the new code allows unlimited tx transfers, which
triggers the sanity check in the 8250.c interrupt handler.

Limit the consecutive TX chars to 16 and let the guest kernel escape
from the 8250 interrupt handler. Set the TEMT/THRE bits in the
periodic serial console update.

Reported-by: Sasha Levin  <levinsasha928@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 tools/kvm/hw/serial.c |   30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

Index: linux-kvm/tools/kvm/hw/serial.c
===================================================================
--- linux-kvm.orig/tools/kvm/hw/serial.c
+++ linux-kvm/tools/kvm/hw/serial.c
@@ -19,6 +19,7 @@ struct serial8250_device {
 	u16			iobase;
 	u8			irq;
 	u8			irq_state;
+	int			txcnt;
 
 	u8			rbr;		/* receive buffer */
 	u8			dll;
@@ -105,6 +106,16 @@ static void serial8250_update_irq(struct
 			kvm__irq_line(kvm, dev->irq, 1);
 	}
 	dev->irq_state = iir;
+
+	/*
+	 * If the kernel disabled the tx interrupt, we know that there
+	 * is nothing more to transmit, so we can reset our tx logic
+	 * here.
+	 */
+	if (!(dev->ier & UART_IER_THRI)) {
+		dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+		dev->txcnt = 0;
+	}
 }
 
 #define SYSRQ_PENDING_NONE		0
@@ -134,6 +145,15 @@ static void serial8250__receive(struct k
 {
 	int c;
 
+	/*
+	 * If the guest transmitted 16 chars in a row, we clear the
+	 * TEMT/THRE bits to let the kernel escape from the 8250
+	 * interrupt handler. We come here only once a ms, so that
+	 * should give the kernel the desired pause.
+	 */
+	dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+	dev->txcnt = 0;
+
 	if (dev->lsr & UART_LSR_DR)
 		return;
 
@@ -212,14 +232,8 @@ static bool serial8250_out(struct ioport
 				term_putc(CONSOLE_8250, addr, size, dev->id);
 			/* else FIXME: Inject data into rcv path for LOOP */
 
-			/*
-			 * Set transmitter and transmit hold register
-			 * empty.  We have no FIFO at the moment and
-			 * on the TX side it's only interesting, when
-			 * we could coalesce port io on the kernel
-			 * kernel.
-			 */
-			dev->lsr |= UART_LSR_TEMT | UART_LSR_THRE;
+			if (++dev->txcnt == 16)
+				dev->lsr &= ~(UART_LSR_TEMT | UART_LSR_THRE);
 			break;
 		} else {
 			dev->dll = ioport__read8(data);



^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-10 20:51   ` Thomas Gleixner
@ 2011-12-11  8:15     ` Pekka Enberg
  2011-12-11 10:30       ` Ingo Molnar
  0 siblings, 1 reply; 46+ messages in thread
From: Pekka Enberg @ 2011-12-11  8:15 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: LKML, Sasha Levin, mingo

On Sat, 10 Dec 2011, Thomas Gleixner wrote:
> There is a slight problem with them. I only tested them with the
> rt-guest kernels, but non rt kernels are slightly unhappy. Sorry for
> not thinking about that.
>
> Fix below.

Applied, thanks!

Ingo, how does serial console work for you with latest master? It's likely 
that Thomas' patches fix top slowness for you.

 			Pekka

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-11  8:15     ` Pekka Enberg
@ 2011-12-11 10:30       ` Ingo Molnar
  2011-12-11 10:46         ` Ingo Molnar
  2011-12-11 14:04         ` Thomas Gleixner
  0 siblings, 2 replies; 46+ messages in thread
From: Ingo Molnar @ 2011-12-11 10:30 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: Thomas Gleixner, LKML, Sasha Levin


* Pekka Enberg <penberg@kernel.org> wrote:

> On Sat, 10 Dec 2011, Thomas Gleixner wrote:
>
> > There is a slight problem with them. I only tested them with 
> > the rt-guest kernels, but non rt kernels are slightly 
> > unhappy. Sorry for not thinking about that.
> >
> > Fix below.
> 
> Applied, thanks!
> 
> Ingo, how does serial console work for you with latest master? 
> It's likely that Thomas' patches fix top slowness for you.

It didnt get worse - but i can still easily tell apart a 'top' 
running under kvm than native. Try something like this:

 run top and type 's0.1<enter>' to change the refresh interval 
 to 100 msecs.

Run the same thing under kvm and on the native host. The two are 
visibly different: the kvm one refreshes with the cursor showing 
up mid-refresh a number of times.

In theory a kvm driven serial console should be extremely fast, 
much faster than a real serial console, basically as fast as a 
local console. Yet this still does not seem to be the case.

In fact even ssh-ing in to a box over Wifi and running the above 
top session is undistinguishable from top running in a local 
console. So IMO it cannot be virtualization overhead - there 
must still be some delay or serious lack of buffering somewhere.

Here's a way to measure the slowdown directly:

  time top -d 0.01 -n 10 -b

         native: 0.9 secs
  ssh over wifi: 1.4 secs
     kvm serial: 5.0 secs

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-11 10:30       ` Ingo Molnar
@ 2011-12-11 10:46         ` Ingo Molnar
  2011-12-11 14:04         ` Thomas Gleixner
  1 sibling, 0 replies; 46+ messages in thread
From: Ingo Molnar @ 2011-12-11 10:46 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: Thomas Gleixner, LKML, Sasha Levin


btw., i just noticed another performance weirdness, running the 
latest version on -tip with such a 64-bit x86 kernel:

  make defconfig
  make kvmconfig   # sidenote: sigh - this still needs a 'make oldconfig'


and do a 'kvm run' into a shell, then I get a permanently busy 
kvm executable sucking up CPU time:

   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
 28206 mingo     20   0 3586m  80m 1080 S 56.1  0.7   0:13.83 kvm
 29462 mingo     20   0 15380 1392  900 R  0.3  0.0   0:00.02 top

perf kvm top suggests simple idling around of 16 CPUs:

--------------------------------------------------------------------------------------------------
   PerfTop:    1635 irqs/sec  kernel:91.1% us: 0.3% guest kernel: 8.5% guest us: 0.0% exact:  0.0% [1000Hz cycles],  (all, 16 CPUs)
--------------------------------------------------------------------------------------------------

             samples  pcnt function                  DSO
             _______ _____ _________________________ _______________________________

              155.00 17.9% apic_timer_interrupt      /home/mingo/linux/linux/vmlinux
               89.00 10.3% find_busiest_group        /home/mingo/linux/linux/vmlinux
               66.00  7.6% native_apic_mem_write     /home/mingo/linux/linux/vmlinux
               51.00  5.9% idle_cpu                  /home/mingo/linux/linux/vmlinux
               29.00  3.3% __rcu_pending             /home/mingo/linux/linux/vmlinux
               25.00  2.9% run_posix_cpu_timers      /home/mingo/linux/linux/vmlinux
               24.00  2.8% _raw_spin_lock            /home/mingo/linux/linux/vmlinux
               19.00  2.2% cpumask_next_and          /home/mingo/linux/linux/vmlinux
               19.00  2.2% ktime_get                 /home/mingo/linux/linux/vmlinux
               18.00  2.1% run_timer_softirq         /home/mingo/linux/linux/vmlinux
               16.00  1.8% run_rebalance_domains     /home/mingo/linux/linux/vmlinux
               16.00  1.8% native_sched_clock        /home/mingo/linux/linux/vmlinux
               15.00  1.7% rcu_exit_nohz             /home/mingo/linux/linux/vmlinux
               14.00  1.6% tick_nohz_stop_sched_tick /home/mingo/linux/linux/vmlinux
               14.00  1.6% hrtimer_run_queues        /home/mingo/linux/linux/vmlinux

NO_HZ is set in the .config.

Btw., 'perf kvm' is in need of some love: for example it is 
unable to pick up a vmlinux from the current directory and 
./vmlinux does not work either, it needs 
/home/mingo/linux/linux/vmlinux specified explicitly - that 
sucks.

Ideally perf could be taught to talk to the guest kvm process 
and get its vmlinux home position (even better: its guest 
kallsyms) from there - so if i typed 'perf kvm top' it would do 
the right thing all automagically. [ What a fantastc level of 
automation! ;-) ]

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-11 10:30       ` Ingo Molnar
  2011-12-11 10:46         ` Ingo Molnar
@ 2011-12-11 14:04         ` Thomas Gleixner
  2011-12-11 15:53           ` Ingo Molnar
  2011-12-12 10:27           ` Alan Cox
  1 sibling, 2 replies; 46+ messages in thread
From: Thomas Gleixner @ 2011-12-11 14:04 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Pekka Enberg, LKML, Sasha Levin

On Sun, 11 Dec 2011, Ingo Molnar wrote:
> In theory a kvm driven serial console should be extremely fast, 
> much faster than a real serial console, basically as fast as a 
> local console. Yet this still does not seem to be the case.
> 
> In fact even ssh-ing in to a box over Wifi and running the above 
> top session is undistinguishable from top running in a local 
> console. So IMO it cannot be virtualization overhead - there 
> must still be some delay or serious lack of buffering somewhere.

Well, the difference between ssh and serial is, that ssh can pack 1.5k
worth of data into one frame, while serial has to send it
piecewise. And the emulation has to trap into kvm tool for each tx
byte, which doesnt help either. We cannot do much with buffering on
the kvm tool side as we have no clue how much consecutive data will
come in. That's why there is a virtual console, which has the
disadvantage that you cant see the early boot messages.

Thanks,

	tglx


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-11 14:04         ` Thomas Gleixner
@ 2011-12-11 15:53           ` Ingo Molnar
  2011-12-12  5:30             ` Sasha Levin
                               ` (2 more replies)
  2011-12-12 10:27           ` Alan Cox
  1 sibling, 3 replies; 46+ messages in thread
From: Ingo Molnar @ 2011-12-11 15:53 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Pekka Enberg, LKML, Sasha Levin


* Thomas Gleixner <tglx@linutronix.de> wrote:

> On Sun, 11 Dec 2011, Ingo Molnar wrote:

> > In theory a kvm driven serial console should be extremely 
> > fast, much faster than a real serial console, basically as 
> > fast as a local console. Yet this still does not seem to be 
> > the case.
> > 
> > In fact even ssh-ing in to a box over Wifi and running the 
> > above top session is undistinguishable from top running in a 
> > local console. So IMO it cannot be virtualization overhead - 
> > there must still be some delay or serious lack of buffering 
> > somewhere.
> 
> Well, the difference between ssh and serial is, that ssh can 
> pack 1.5k worth of data into one frame, while serial has to 
> send it piecewise. And the emulation has to trap into kvm tool 
> for each tx byte, which doesnt help either. We cannot do much 
> with buffering on the kvm tool side as we have no clue how 
> much consecutive data will come in. That's why there is a 
> virtual console, which has the disadvantage that you cant see 
> the early boot messages.

Okay, but look at it from another angle: the top output i 
generate is about 300k characters. 5000 msecs to execute it 
means 16 usecs overhead per character - or about 50k cycles - on 
a top of the class x86 CPU.

50k cycles for every single byte. And as a user i notice that 
first hand.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-11 15:53           ` Ingo Molnar
@ 2011-12-12  5:30             ` Sasha Levin
  2011-12-12 11:12               ` Ingo Molnar
  2011-12-12  9:42             ` Thomas Gleixner
  2011-12-12 11:19             ` Pekka Enberg
  2 siblings, 1 reply; 46+ messages in thread
From: Sasha Levin @ 2011-12-12  5:30 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Thomas Gleixner, Pekka Enberg, LKML

On Sun, 2011-12-11 at 16:53 +0100, Ingo Molnar wrote:
> * Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> > On Sun, 11 Dec 2011, Ingo Molnar wrote:
> 
> > > In theory a kvm driven serial console should be extremely 
> > > fast, much faster than a real serial console, basically as 
> > > fast as a local console. Yet this still does not seem to be 
> > > the case.
> > > 
> > > In fact even ssh-ing in to a box over Wifi and running the 
> > > above top session is undistinguishable from top running in a 
> > > local console. So IMO it cannot be virtualization overhead - 
> > > there must still be some delay or serious lack of buffering 
> > > somewhere.
> > 
> > Well, the difference between ssh and serial is, that ssh can 
> > pack 1.5k worth of data into one frame, while serial has to 
> > send it piecewise. And the emulation has to trap into kvm tool 
> > for each tx byte, which doesnt help either. We cannot do much 
> > with buffering on the kvm tool side as we have no clue how 
> > much consecutive data will come in. That's why there is a 
> > virtual console, which has the disadvantage that you cant see 
> > the early boot messages.
> 
> Okay, but look at it from another angle: the top output i 
> generate is about 300k characters. 5000 msecs to execute it 
> means 16 usecs overhead per character - or about 50k cycles - on 
> a top of the class x86 CPU.
> 
> 50k cycles for every single byte. And as a user i notice that 
> first hand.

50k cycles for every single byte is pretty much as good as it will get
with serial console. See slide #5 in Marcelo's KVM Forum 2010
presentation[1] where he timed a heavyweight exit to about 40k cycles.

And I'm not sure we're taking just one of those for each byte...

I'm currently working on a patch which will let you see early boot
prints from the serial console, but switch to the virtio one when it
reaches usermode. This way you can have the best of both worlds: see
early boot and get a fast console when you've reached userspace.

[1]
http://www.linux-kvm.org/wiki/images/e/ea/2010-forum-mtosatti_walkthrough_entry_exit.pdf

-- 

Sasha.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-11 15:53           ` Ingo Molnar
  2011-12-12  5:30             ` Sasha Levin
@ 2011-12-12  9:42             ` Thomas Gleixner
  2011-12-12 11:19             ` Pekka Enberg
  2 siblings, 0 replies; 46+ messages in thread
From: Thomas Gleixner @ 2011-12-12  9:42 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Pekka Enberg, LKML, Sasha Levin

On Sun, 11 Dec 2011, Ingo Molnar wrote:
> * Thomas Gleixner <tglx@linutronix.de> wrote:
> 
> > On Sun, 11 Dec 2011, Ingo Molnar wrote:
> 
> > > In theory a kvm driven serial console should be extremely 
> > > fast, much faster than a real serial console, basically as 
> > > fast as a local console. Yet this still does not seem to be 
> > > the case.
> > > 
> > > In fact even ssh-ing in to a box over Wifi and running the 
> > > above top session is undistinguishable from top running in a 
> > > local console. So IMO it cannot be virtualization overhead - 
> > > there must still be some delay or serious lack of buffering 
> > > somewhere.
> > 
> > Well, the difference between ssh and serial is, that ssh can 
> > pack 1.5k worth of data into one frame, while serial has to 
> > send it piecewise. And the emulation has to trap into kvm tool 
> > for each tx byte, which doesnt help either. We cannot do much 
> > with buffering on the kvm tool side as we have no clue how 
> > much consecutive data will come in. That's why there is a 
> > virtual console, which has the disadvantage that you cant see 
> > the early boot messages.
> 
> Okay, but look at it from another angle: the top output i 
> generate is about 300k characters. 5000 msecs to execute it 
> means 16 usecs overhead per character - or about 50k cycles - on 
> a top of the class x86 CPU.
> 
> 50k cycles for every single byte. And as a user i notice that 
> first hand.

Well, you can't do anything about it simply because it's doing full
hardware emulation which goes all the way to user space and back for
each inb/outb.

virtio_console was written to avoid that overhead.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-11 14:04         ` Thomas Gleixner
  2011-12-11 15:53           ` Ingo Molnar
@ 2011-12-12 10:27           ` Alan Cox
  2011-12-12 10:59             ` Sasha Levin
  1 sibling, 1 reply; 46+ messages in thread
From: Alan Cox @ 2011-12-12 10:27 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Ingo Molnar, Pekka Enberg, LKML, Sasha Levin

> Well, the difference between ssh and serial is, that ssh can pack 1.5k
> worth of data into one frame, while serial has to send it
> piecewise. And the emulation has to trap into kvm tool for each tx
> byte, which doesnt help either. We cannot do much with buffering on
> the kvm tool side as we have no clue how much consecutive data will
> come in. That's why there is a virtual console, which has the
> disadvantage that you cant see the early boot messages.

You can emulate a chip with a 64byte or so FIFO. You can do I/O cycle
prediction in the kernel part and you can use the empty bit as a clue
(which is what most serial<->ethernet widgetry does).

Alan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 10:27           ` Alan Cox
@ 2011-12-12 10:59             ` Sasha Levin
  2011-12-12 11:02               ` Alan Cox
  2011-12-12 17:21               ` Ingo Molnar
  0 siblings, 2 replies; 46+ messages in thread
From: Sasha Levin @ 2011-12-12 10:59 UTC (permalink / raw)
  To: Alan Cox; +Cc: Thomas Gleixner, Ingo Molnar, Pekka Enberg, LKML

On Mon, 2011-12-12 at 10:27 +0000, Alan Cox wrote:
> > Well, the difference between ssh and serial is, that ssh can pack 1.5k
> > worth of data into one frame, while serial has to send it
> > piecewise. And the emulation has to trap into kvm tool for each tx
> > byte, which doesnt help either. We cannot do much with buffering on
> > the kvm tool side as we have no clue how much consecutive data will
> > come in. That's why there is a virtual console, which has the
> > disadvantage that you cant see the early boot messages.
> 
> You can emulate a chip with a 64byte or so FIFO. You can do I/O cycle
> prediction in the kernel part and you can use the empty bit as a clue
> (which is what most serial<->ethernet widgetry does).

The performance problems here aren't the same performance problems you
have on real hardware. The problem here is that it costs 40k cycles for
the guest to access the emulated chip.

-- 

Sasha.



^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 10:59             ` Sasha Levin
@ 2011-12-12 11:02               ` Alan Cox
  2011-12-12 18:21                 ` Avi Kivity
  2011-12-12 17:21               ` Ingo Molnar
  1 sibling, 1 reply; 46+ messages in thread
From: Alan Cox @ 2011-12-12 11:02 UTC (permalink / raw)
  To: Sasha Levin; +Cc: Thomas Gleixner, Ingo Molnar, Pekka Enberg, LKML

> > You can emulate a chip with a 64byte or so FIFO. You can do I/O cycle
> > prediction in the kernel part and you can use the empty bit as a clue
> > (which is what most serial<->ethernet widgetry does).
> 
> The performance problems here aren't the same performance problems you
> have on real hardware. The problem here is that it costs 40k cycles for
> the guest to access the emulated chip.

How much of that 40K cycles is bouncing back and forth into emulation
layers and how much of it is the kernel trap ?
 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12  5:30             ` Sasha Levin
@ 2011-12-12 11:12               ` Ingo Molnar
  2011-12-12 11:20                 ` Alan Cox
                                   ` (2 more replies)
  0 siblings, 3 replies; 46+ messages in thread
From: Ingo Molnar @ 2011-12-12 11:12 UTC (permalink / raw)
  To: Sasha Levin; +Cc: Thomas Gleixner, Pekka Enberg, LKML


* Sasha Levin <levinsasha928@gmail.com> wrote:

> 50k cycles for every single byte is pretty much as good as it 
> will get with serial console. See slide #5 in Marcelo's KVM 
> Forum 2010 presentation[1] where he timed a heavyweight exit 
> to about 40k cycles.

> [1]
> http://www.linux-kvm.org/wiki/images/e/ea/2010-forum-mtosatti_walkthrough_entry_exit.pdf

But what we do here is a PIO exit. That, according to Marcelo's 
measurements, is about 10K cycles, back to back. [*]

So where does the extra overhead come from?

We shouldn't care that there's virtio-console - the goal of 
tools/kvm it speed everything up as much as possible, so we 
should not jump to the next IO abstraction unless we know where 
every cycle was spent with simpler IO models ...

Thanks,

	Ingo

[*] Also, those 10K cycles include some significant Qemu 
    overhead - a couple of thousand cycles - that should be much 
    lower in the tools/kvm case.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-11 15:53           ` Ingo Molnar
  2011-12-12  5:30             ` Sasha Levin
  2011-12-12  9:42             ` Thomas Gleixner
@ 2011-12-12 11:19             ` Pekka Enberg
  2011-12-12 17:20               ` Ingo Molnar
  2011-12-12 18:19               ` Avi Kivity
  2 siblings, 2 replies; 46+ messages in thread
From: Pekka Enberg @ 2011-12-12 11:19 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Thomas Gleixner, LKML, Sasha Levin

On Sun, Dec 11, 2011 at 5:53 PM, Ingo Molnar <mingo@elte.hu> wrote:
> Okay, but look at it from another angle: the top output i
> generate is about 300k characters. 5000 msecs to execute it
> means 16 usecs overhead per character - or about 50k cycles - on
> a top of the class x86 CPU.

I'm seeing 1.5 usecs per character for this little benchmark:

$ cat tick.S
#define DBG_PORT	0xe0
	.code16gcc
	.text
	.globl	_start
	.type	_start, @function
_start:
	mov	$0x64,%cx
	mov	$0xffff,%bx

outer_loop:
inner_loop:
	mov	$0x3f8,%dx
	mov	$0x2e, %al	# .
	out	%al,%dx

	sub	$0x1,%bx
	jne	inner_loop
	
	sub	$0x1,%cx
	jne	outer_loop

	/* not a valid port to force exit */
	outb	%al, $DBG_PORT

$ gcc -nostdinc -c tick.S -o tick.o
$ ld -Ttext=0x00 -nostdlib -static tick.o -o tick.elf
$ objcopy -O binary tick.elf tick.bin
$ perf stat ./vm run ./tests/serial/tick.bin > /dev/null
  Warning: ./tests/serial/tick.bin is not a bzImage. Trying to load it
as a flat binary...

 Performance counter stats for './vm run ./tests/serial/tick.bin':

       9984.468850 task-clock                #    0.995 CPUs utilized
            16,597 context-switches          #    0.002 M/sec
                46 CPU-migrations            #    0.000 M/sec
             1,200 page-faults               #    0.000 M/sec
    32,500,683,485 cycles                    #    3.255 GHz
         [82.17%]
    14,986,643,686 stalled-cycles-frontend   #   46.11% frontend
cycles idle    [83.57%]
     9,400,515,530 stalled-cycles-backend    #   28.92% backend
cycles idle    [67.11%]
    25,213,133,751 instructions              #    0.78  insns per cycle
                                             #    0.59  stalled cycles
per insn [83.57%]
     5,111,389,696 branches                  #  511.934 M/sec
         [83.57%]
        14,565,759 branch-misses             #    0.28% of all
branches         [83.58%]

      10.030175553 seconds time elapsed

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 11:12               ` Ingo Molnar
@ 2011-12-12 11:20                 ` Alan Cox
  2011-12-12 17:20                   ` Ingo Molnar
                                     ` (2 more replies)
  2011-12-12 11:23                 ` Cyrill Gorcunov
  2011-12-12 17:40                 ` Sasha Levin
  2 siblings, 3 replies; 46+ messages in thread
From: Alan Cox @ 2011-12-12 11:20 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Sasha Levin, Thomas Gleixner, Pekka Enberg, LKML

> [*] Also, those 10K cycles include some significant Qemu 
>     overhead - a couple of thousand cycles - that should be much 
>     lower in the tools/kvm case.

Are all the traps going into qemu - is KVM still that braindead ?

Alan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 11:12               ` Ingo Molnar
  2011-12-12 11:20                 ` Alan Cox
@ 2011-12-12 11:23                 ` Cyrill Gorcunov
  2011-12-12 17:40                 ` Sasha Levin
  2 siblings, 0 replies; 46+ messages in thread
From: Cyrill Gorcunov @ 2011-12-12 11:23 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Sasha Levin, Thomas Gleixner, Pekka Enberg, LKML

On Mon, Dec 12, 2011 at 12:12:22PM +0100, Ingo Molnar wrote:
> 
> * Sasha Levin <levinsasha928@gmail.com> wrote:
> 
> > 50k cycles for every single byte is pretty much as good as it 
> > will get with serial console. See slide #5 in Marcelo's KVM 
> > Forum 2010 presentation[1] where he timed a heavyweight exit 
> > to about 40k cycles.
> 
> > [1]
> > http://www.linux-kvm.org/wiki/images/e/ea/2010-forum-mtosatti_walkthrough_entry_exit.pdf
> 
> But what we do here is a PIO exit. That, according to Marcelo's 
> measurements, is about 10K cycles, back to back. [*]
> 
> So where does the extra overhead come from?
> 
> We shouldn't care that there's virtio-console - the goal of 
> tools/kvm it speed everything up as much as possible, so we 
> should not jump to the next IO abstraction unless we know where 
> every cycle was spent with simpler IO models ...
> 
> Thanks,
> 
> 	Ingo
> 
> [*] Also, those 10K cycles include some significant Qemu 
>     overhead - a couple of thousand cycles - that should be much 
>     lower in the tools/kvm case.
> 

FWIW, last time I was mcount'ing calltrace -- we spend a lot of time
due to timer signals, ie because of

#define TIMER_INTERVAL_NS 1000000	/* 1 msec */

so, do we really need it being that hight? This poll includes
enquiry if there some symbol a user typed in console. Maybe
we should reduce this rate?

	Cyrill

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 11:20                 ` Alan Cox
@ 2011-12-12 17:20                   ` Ingo Molnar
  2011-12-12 18:16                   ` Avi Kivity
  2011-12-12 18:16                   ` Avi Kivity
  2 siblings, 0 replies; 46+ messages in thread
From: Ingo Molnar @ 2011-12-12 17:20 UTC (permalink / raw)
  To: Alan Cox; +Cc: Sasha Levin, Thomas Gleixner, Pekka Enberg, LKML


* Alan Cox <alan@lxorguk.ukuu.org.uk> wrote:

> > [*] Also, those 10K cycles include some significant Qemu 
> >     overhead - a couple of thousand cycles - that should be much 
> >     lower in the tools/kvm case.
> 
> Are all the traps going into qemu - is KVM still that braindead ?

I don't know what exactly Marcelo has measured - it seems a PIO 
trap handled by the VM monitor process - i.e. Qemu in that case.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 11:19             ` Pekka Enberg
@ 2011-12-12 17:20               ` Ingo Molnar
  2011-12-12 18:40                 ` Pekka Enberg
  2011-12-12 18:19               ` Avi Kivity
  1 sibling, 1 reply; 46+ messages in thread
From: Ingo Molnar @ 2011-12-12 17:20 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: Thomas Gleixner, LKML, Sasha Levin


* Pekka Enberg <penberg@kernel.org> wrote:

> On Sun, Dec 11, 2011 at 5:53 PM, Ingo Molnar <mingo@elte.hu> wrote:
> > Okay, but look at it from another angle: the top output i
> > generate is about 300k characters. 5000 msecs to execute it
> > means 16 usecs overhead per character - or about 50k cycles - on
> > a top of the class x86 CPU.
> 
> I'm seeing 1.5 usecs per character for this little benchmark:

Interesting - what do you see with the top -b measurement i 
posted - can you see similar slowdowns?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 10:59             ` Sasha Levin
  2011-12-12 11:02               ` Alan Cox
@ 2011-12-12 17:21               ` Ingo Molnar
  1 sibling, 0 replies; 46+ messages in thread
From: Ingo Molnar @ 2011-12-12 17:21 UTC (permalink / raw)
  To: Sasha Levin; +Cc: Alan Cox, Thomas Gleixner, Pekka Enberg, LKML


* Sasha Levin <levinsasha928@gmail.com> wrote:

> > You can emulate a chip with a 64byte or so FIFO. You can do 
> > I/O cycle prediction in the kernel part and you can use the 
> > empty bit as a clue (which is what most serial<->ethernet 
> > widgetry does).
> 
> The performance problems here aren't the same performance 
> problems you have on real hardware. The problem here is that 
> it costs 40k cycles for the guest to access the emulated chip.

I question that 40k cycles figure - where does Marcelo's paper 
mention it?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 11:12               ` Ingo Molnar
  2011-12-12 11:20                 ` Alan Cox
  2011-12-12 11:23                 ` Cyrill Gorcunov
@ 2011-12-12 17:40                 ` Sasha Levin
  2011-12-12 17:45                   ` Ingo Molnar
  2 siblings, 1 reply; 46+ messages in thread
From: Sasha Levin @ 2011-12-12 17:40 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Thomas Gleixner, Pekka Enberg, LKML, Alan Cox

On Mon, 2011-12-12 at 12:12 +0100, Ingo Molnar wrote:
> * Sasha Levin <levinsasha928@gmail.com> wrote:
> 
> > 50k cycles for every single byte is pretty much as good as it 
> > will get with serial console. See slide #5 in Marcelo's KVM 
> > Forum 2010 presentation[1] where he timed a heavyweight exit 
> > to about 40k cycles.
> 
> > [1]
> > http://www.linux-kvm.org/wiki/images/e/ea/2010-forum-mtosatti_walkthrough_entry_exit.pdf
> 
> But what we do here is a PIO exit. That, according to Marcelo's 
> measurements, is about 10K cycles, back to back. [*]
> 
> So where does the extra overhead come from?
> 
> We shouldn't care that there's virtio-console - the goal of 
> tools/kvm it speed everything up as much as possible, so we 
> should not jump to the next IO abstraction unless we know where 
> every cycle was spent with simpler IO models ...
> 
> Thanks,
> 
> 	Ingo
> 
> [*] Also, those 10K cycles include some significant Qemu 
>     overhead - a couple of thousand cycles - that should be much 
>     lower in the tools/kvm case.

No, everything on slide 5 is part of a PIO exit. Starting from 'out' and
up to until it's back to the guest. A total of over 40k cycles.

KVM tools does have less overhead than qemu regarding PIO exits, but
even if you take those completely out of the equation we're still pretty
close to 40k cycles.

-- 

Sasha.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 17:40                 ` Sasha Levin
@ 2011-12-12 17:45                   ` Ingo Molnar
  0 siblings, 0 replies; 46+ messages in thread
From: Ingo Molnar @ 2011-12-12 17:45 UTC (permalink / raw)
  To: Sasha Levin; +Cc: Thomas Gleixner, Pekka Enberg, LKML, Alan Cox


* Sasha Levin <levinsasha928@gmail.com> wrote:

> On Mon, 2011-12-12 at 12:12 +0100, Ingo Molnar wrote:
> > * Sasha Levin <levinsasha928@gmail.com> wrote:
> > 
> > > 50k cycles for every single byte is pretty much as good as it 
> > > will get with serial console. See slide #5 in Marcelo's KVM 
> > > Forum 2010 presentation[1] where he timed a heavyweight exit 
> > > to about 40k cycles.
> > 
> > > [1]
> > > http://www.linux-kvm.org/wiki/images/e/ea/2010-forum-mtosatti_walkthrough_entry_exit.pdf
> > 
> > But what we do here is a PIO exit. That, according to 
> > Marcelo's measurements, is about 10K cycles, back to back. 
> > [*]

> No, everything on slide 5 is part of a PIO exit. Starting from 
> 'out' and up to until it's back to the guest. A total of over 
> 40k cycles.

No, slide 5 is a cumulative total, with the '+' showing the 
individual costs, adding up to 10400 cycles total.

Please read that slide again ;-)

10K is bad enough IMO, 40K cycles would be 'out of this world 
crazy'.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 11:20                 ` Alan Cox
  2011-12-12 17:20                   ` Ingo Molnar
@ 2011-12-12 18:16                   ` Avi Kivity
  2011-12-12 18:16                   ` Avi Kivity
  2 siblings, 0 replies; 46+ messages in thread
From: Avi Kivity @ 2011-12-12 18:16 UTC (permalink / raw)
  To: Alan Cox; +Cc: Ingo Molnar, Sasha Levin, Thomas Gleixner, Pekka Enberg, LKML

On 12/12/2011 01:20 PM, Alan Cox wrote:
> > [*] Also, those 10K cycles include some significant Qemu 
> >     overhead - a couple of thousand cycles - that should be much 
> >     lower in the tools/kvm case.
>
> Are all the traps going into qemu

Only the ones that go to devices modelled in userspace.

>  - is KVM still that braindead ?

It has only been this way.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 11:20                 ` Alan Cox
  2011-12-12 17:20                   ` Ingo Molnar
  2011-12-12 18:16                   ` Avi Kivity
@ 2011-12-12 18:16                   ` Avi Kivity
  2011-12-12 21:36                     ` Alan Cox
  2 siblings, 1 reply; 46+ messages in thread
From: Avi Kivity @ 2011-12-12 18:16 UTC (permalink / raw)
  To: Alan Cox; +Cc: Ingo Molnar, Sasha Levin, Thomas Gleixner, Pekka Enberg, LKML

On 12/12/2011 01:20 PM, Alan Cox wrote:
> > [*] Also, those 10K cycles include some significant Qemu 
> >     overhead - a couple of thousand cycles - that should be much 
> >     lower in the tools/kvm case.
>
> Are all the traps going into qemu

Only the ones that go to devices modelled in userspace.

>  - is KVM still that braindead ?

How would you do it?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 11:19             ` Pekka Enberg
  2011-12-12 17:20               ` Ingo Molnar
@ 2011-12-12 18:19               ` Avi Kivity
  2011-12-12 18:31                 ` Pekka Enberg
  1 sibling, 1 reply; 46+ messages in thread
From: Avi Kivity @ 2011-12-12 18:19 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: Ingo Molnar, Thomas Gleixner, LKML, Sasha Levin

On 12/12/2011 01:19 PM, Pekka Enberg wrote:
> On Sun, Dec 11, 2011 at 5:53 PM, Ingo Molnar <mingo@elte.hu> wrote:
> > Okay, but look at it from another angle: the top output i
> > generate is about 300k characters. 5000 msecs to execute it
> > means 16 usecs overhead per character - or about 50k cycles - on
> > a top of the class x86 CPU.
>
> I'm seeing 1.5 usecs per character for this little benchmark:
>
>


No interrupts, right?  Things look differently with one interrupt per
character and none.  Try running kvm_stat with your benchmark and with a
guest kernel flooding the serial port.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 11:02               ` Alan Cox
@ 2011-12-12 18:21                 ` Avi Kivity
  0 siblings, 0 replies; 46+ messages in thread
From: Avi Kivity @ 2011-12-12 18:21 UTC (permalink / raw)
  To: Alan Cox; +Cc: Sasha Levin, Thomas Gleixner, Ingo Molnar, Pekka Enberg, LKML

On 12/12/2011 01:02 PM, Alan Cox wrote:
> > > You can emulate a chip with a 64byte or so FIFO. You can do I/O cycle
> > > prediction in the kernel part and you can use the empty bit as a clue
> > > (which is what most serial<->ethernet widgetry does).
> > 
> > The performance problems here aren't the same performance problems you
> > have on real hardware. The problem here is that it costs 40k cycles for
> > the guest to access the emulated chip.
>
> How much of that 40K cycles is bouncing back and forth into emulation
> layers and how much of it is the kernel trap ?
>  

30K of those cycles are only in Sasha's mind, the real figure is 10K
(and likely lower with newer hardware).

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 18:19               ` Avi Kivity
@ 2011-12-12 18:31                 ` Pekka Enberg
  2011-12-13 10:33                   ` Avi Kivity
  0 siblings, 1 reply; 46+ messages in thread
From: Pekka Enberg @ 2011-12-12 18:31 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Ingo Molnar, Thomas Gleixner, LKML, Sasha Levin

On Mon, 2011-12-12 at 20:19 +0200, Avi Kivity wrote:
> On 12/12/2011 01:19 PM, Pekka Enberg wrote:
> > On Sun, Dec 11, 2011 at 5:53 PM, Ingo Molnar <mingo@elte.hu> wrote:
> > > Okay, but look at it from another angle: the top output i
> > > generate is about 300k characters. 5000 msecs to execute it
> > > means 16 usecs overhead per character - or about 50k cycles - on
> > > a top of the class x86 CPU.
> >
> > I'm seeing 1.5 usecs per character for this little benchmark:
> 
> No interrupts, right?  Things look differently with one interrupt per
> character and none. 

No interrupts. I was simply trying to measure PIO overhead to see how
bad the 15 usec per character Ingo measured really is.

			Pekka


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 17:20               ` Ingo Molnar
@ 2011-12-12 18:40                 ` Pekka Enberg
  2011-12-12 19:14                   ` Pekka Enberg
                                     ` (2 more replies)
  0 siblings, 3 replies; 46+ messages in thread
From: Pekka Enberg @ 2011-12-12 18:40 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Thomas Gleixner, LKML, Sasha Levin

On Mon, Dec 12, 2011 at 7:20 PM, Ingo Molnar <mingo@elte.hu> wrote:
>> I'm seeing 1.5 usecs per character for this little benchmark:
>
> Interesting - what do you see with the top -b measurement i
> posted - can you see similar slowdowns?

ssh-4.2# time top -d 0.01 -n 10 -b

[snip]

real	0m2.935
user	0m0.006s
sys	0m0.049s

That's 9.7 usecs per character which is roughly 6x slowdown.

It seems to be related to interrupt handling because if I bump up
TIMER_INTEVAL_NS to 10 msec:

--- a/tools/kvm/kvm.c
+++ b/tools/kvm/kvm.c
@@ -412,7 +412,7 @@ found_kernel:
        return ret;
 }

-#define TIMER_INTERVAL_NS 1000000      /* 1 msec */
+#define TIMER_INTERVAL_NS 10000000     /* 10 msec */

serial console output slows down by the same 10x factor:

real	0m24.631s
user	0m0.007s
sys	0m0.025s

Lowering the interval too 100 usec speeds things up but unfortunately
chokes the serial layer rather quickly:

    8 root      RT   [    6.759222] serial8250: too much work for irq4
0     0    0    0 S  0.0  0.0   0:00.00 migration/1
    9 root      20   0     0    0    0 S  0.0  0.0   0:00.00 kworker/1:0
   10 root      20   0     0    0    0 S  0.0  0.0   0:00.00 ksoftirqd/1
   11 root      20   0     0    0    0 S  0.0  0.0   0:00.00 kworker/0:1
   12 root      RT   0     0    0    0 S  0.0  0.0   0:00.00 watchdog/1
   13 root      RT   0     0    0    0 S  0.0  0.0   0:00.00 migration/2
   14 root      20   0     0    0    0 S  0[    6.759222] serial8250:
too much work for irq4
.0  0.0   0:00.00 kworker/2:0
   15 root      20   0     0    0    0 S  0.0  0.0   0:00.00 ksoftirqd/2

So i'm pretty sure it's some bug in hw/serial.c that's limiting
character output by interrupts.

                                Pekka

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 18:40                 ` Pekka Enberg
@ 2011-12-12 19:14                   ` Pekka Enberg
  2011-12-12 19:21                   ` Ingo Molnar
  2011-12-12 21:03                   ` James Courtier-Dutton
  2 siblings, 0 replies; 46+ messages in thread
From: Pekka Enberg @ 2011-12-12 19:14 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Thomas Gleixner, LKML, Sasha Levin

On Mon, Dec 12, 2011 at 7:20 PM, Ingo Molnar <mingo@elte.hu> wrote:
>>> I'm seeing 1.5 usecs per character for this little benchmark:
>>
>> Interesting - what do you see with the top -b measurement i
>> posted - can you see similar slowdowns?

On Mon, Dec 12, 2011 at 8:40 PM, Pekka Enberg <penberg@kernel.org> wrote:
> ssh-4.2# time top -d 0.01 -n 10 -b
>
> [snip]
>
> real    0m2.935
> user    0m0.006s
> sys     0m0.049s
>
> That's 9.7 usecs per character which is roughly 6x slowdown.
>
> It seems to be related to interrupt handling because if I bump up
> TIMER_INTEVAL_NS to 10 msec:

[snip]

Looking at drivers/tty/serial/8250.c, we have:

static void serial8250_console_putchar(struct uart_port *port, int ch)
{
        struct uart_8250_port *up =
                container_of(port, struct uart_8250_port, port);

        wait_for_xmitr(up, UART_LSR_THRE);
        serial_out(up, UART_TX, ch);
}

in tools/kvm/hw/serial.c::serial8250_out() we do:

        case UART_TX:
                if (!(dev->lcr & UART_LCR_DLAB)) {
                        char *addr = data;

                        if (!(dev->mcr & UART_MCR_LOOP))
                                term_putc(CONSOLE_8250, addr, size, dev->id);
                        /* else FIXME: Inject data into rcv path for LOOP */

                        if (++dev->txcnt == 16)
                                dev->lsr &= ~(UART_LSR_TEMT | UART_LSR_THRE);
                        break;

which clears UART_LSR_THRE after 16 characters. UART_LSR_THRE is only
enabled in serial8250__update_consoles() and serial8250_update_irq()
which are completely bound by TIMER_INTERVAL_NS.

                        Pekka

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 18:40                 ` Pekka Enberg
  2011-12-12 19:14                   ` Pekka Enberg
@ 2011-12-12 19:21                   ` Ingo Molnar
  2011-12-13  0:59                     ` Thomas Gleixner
  2011-12-12 21:03                   ` James Courtier-Dutton
  2 siblings, 1 reply; 46+ messages in thread
From: Ingo Molnar @ 2011-12-12 19:21 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: Thomas Gleixner, LKML, Sasha Levin


* Pekka Enberg <penberg@kernel.org> wrote:

> So i'm pretty sure it's some bug in hw/serial.c that's 
> limiting character output by interrupts.

Serial port control flow somehow being bound by timer frequency 
[which is not really a necessity: both the host and the guest 
could stream on full speed] was too my observation early on.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 18:40                 ` Pekka Enberg
  2011-12-12 19:14                   ` Pekka Enberg
  2011-12-12 19:21                   ` Ingo Molnar
@ 2011-12-12 21:03                   ` James Courtier-Dutton
  2 siblings, 0 replies; 46+ messages in thread
From: James Courtier-Dutton @ 2011-12-12 21:03 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: Ingo Molnar, Thomas Gleixner, LKML, Sasha Levin

On 12 December 2011 18:40, Pekka Enberg <penberg@kernel.org> wrote:
> -#define TIMER_INTERVAL_NS 1000000      /* 1 msec */
> +#define TIMER_INTERVAL_NS 10000000     /* 10 msec */
>
> serial console output slows down by the same 10x factor:
>
> real    0m24.631s
> user    0m0.007s
> sys     0m0.025s
>
> Lowering the interval too 100 usec speeds things up but unfortunately
> chokes the serial layer rather quickly:
>
>    8 root      RT   [    6.759222] serial8250: too much work for irq4
> 0     0    0    0 S  0.0  0.0   0:00.00 migration/1
>    9 root      20   0     0    0    0 S  0.0  0.0   0:00.00 kworker/1:0
>   10 root      20   0     0    0    0 S  0.0  0.0   0:00.00 ksoftirqd/1
>   11 root      20   0     0    0    0 S  0.0  0.0   0:00.00 kworker/0:1
>   12 root      RT   0     0    0    0 S  0.0  0.0   0:00.00 watchdog/1
>   13 root      RT   0     0    0    0 S  0.0  0.0   0:00.00 migration/2
>   14 root      20   0     0    0    0 S  0[    6.759222] serial8250:
> too much work for irq4
> .0  0.0   0:00.00 kworker/2:0
>   15 root      20   0     0    0    0 S  0.0  0.0   0:00.00 ksoftirqd/2
>
> So i'm pretty sure it's some bug in hw/serial.c that's limiting
> character output by interrupts.
>
Surely the system should be limiting the character output.
If the baud rate is set to 9600, the simulation of the serial port
should be slow.
If the baud rate is set to 115200, the simulation of the serial port
should be faster.
On real hardware, IO ports (outb/inb) are much slower than PCI memory mapped IO.
I am not sure if perceived slowness of kvm is by design or by accident though.

Now, if we want the guest serial console port to actually be fast, we
should let it be set to a baud of 921000 bps.
You could then use the nagle algorithm to improve its performance when
working in a kvm guest.

I have an application where when I set the guest serial port to 9600,
I want it to be slow, and be as close to real 9600 baud as possible.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 18:16                   ` Avi Kivity
@ 2011-12-12 21:36                     ` Alan Cox
  2011-12-13 10:32                       ` Avi Kivity
  0 siblings, 1 reply; 46+ messages in thread
From: Alan Cox @ 2011-12-12 21:36 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Ingo Molnar, Sasha Levin, Thomas Gleixner, Pekka Enberg, LKML

On Mon, 12 Dec 2011 20:16:50 +0200
Avi Kivity <avi@redhat.com> wrote:

> On 12/12/2011 01:20 PM, Alan Cox wrote:
> > > [*] Also, those 10K cycles include some significant Qemu 
> > >     overhead - a couple of thousand cycles - that should be much 
> > >     lower in the tools/kvm case.
> >
> > Are all the traps going into qemu
> 
> Only the ones that go to devices modelled in userspace.
> 
> >  - is KVM still that braindead ?
> 
> How would you do it?

See my posting from years back about this...

Your trap handler returns the kernel a predictor tree (say a 1 page tree)
of the next expected ins out and actions - where the action is either
store, or trap out.

Each kernel trap then boils down to

Does this trap match an entry in the top of our predictor tree. Ie is it
an in/out (or equivalent mmio) that the driver has given us expected
behaviour for.

We could of course have several for different ranges of devices

	no - hit emulator and return position in predictor tree we failed
		at plus tree buffer if needed

	yes - follow the tree node

	For IN
		copy predicted byte/word/dword to tree buffer
		move along tree
		return

	For Out
		store store byte/word/dword in tree buffer
		(one of which can be used for discard)
		move along the tree
		return

Thats enough to do things like write predictors for the kernel console
emulating the serial fifo, to script PIO IDE transfers etc

The kernel side is miniscule and generic, the user space can migrate to
doing this where it pays off and not where it doesn't.


Alan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 19:21                   ` Ingo Molnar
@ 2011-12-13  0:59                     ` Thomas Gleixner
  2011-12-13  7:03                       ` Pekka Enberg
  2011-12-13 10:58                       ` Avi Kivity
  0 siblings, 2 replies; 46+ messages in thread
From: Thomas Gleixner @ 2011-12-13  0:59 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Pekka Enberg, LKML, Sasha Levin

On Mon, 12 Dec 2011, Ingo Molnar wrote:
> * Pekka Enberg <penberg@kernel.org> wrote:
> 
> > So i'm pretty sure it's some bug in hw/serial.c that's 
> > limiting character output by interrupts.

No, that's not a bug. The current emulation has no fifo and it writes
every single character to the terminal. Go figure. The timer
limitation is due to the missing fifo and other details, which would
drive the serial driver into the "too much work for irq 4" case.

The approach I took was to keep the emulated device as close to the
real HW for obvious reasons. You simply cannot ignore the way how a HW
device works and how the corresponding kernel driver expects it to
work.

> Serial port control flow somehow being bound by timer frequency 
> [which is not really a necessity: both the host and the guest 
> could stream on full speed] was too my observation early on.

That simply does not work with the way the serial driver for the 8250
is written.

If you emulate hardware then don't expect that the shortcomings of the
real hardware and the clusterf*ck in the corresponding device drivers
go magically away.

If you want high performance virtualization then use virtual drivers
and stop whining about a 30 years old legacy device and its warts.

There is way bigger fish to fry in that virt stuff, which is more
important and way simpler to fix.

I just stumbled over this:

<idle>-0     [017] 316004.317563: hrtimer_cancel: hrtimer=ffff880130ad8610
<idle>-0     [017] 316004.317563: hrtimer_expire_entry: hrtimer=ffff880130ad8610 function=kvm_timer_fn now=316147662509905
<idle>-0     [017] 316004.317565: sched_wakeup: comm=qemu-system-x86 pid=77375 prio=19 success=1 target_cpu=017
<idle>-0     [017] 316004.317566: hrtimer_expire_exit: hrtimer=ffff880130ad8610
<idle>-0     [017] 316004.317567: power_end: cpu_id=17
<idle>-0     [017] 316004.317567: cpu_idle: state=4294967295 cpu_id=17
<idle>-0     [017] 316004.317568: sched_switch: prev_comm=swapper/17 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=qemu-system-x86 next_pid=77375 next_prio=19
 <...>-77375 [017] 316004.317570: hrtimer_cancel: hrtimer=ffff880125e67660
 <...>-77375 [017] 316004.317571: kvm_apic_accept_irq: apicid 1 vec 239 (Fixed|edge)
 <...>-77375 [017] 316004.317572: hrtimer_start: hrtimer=ffff880125e67660 function=kvm_idle_watchdog expires=316147712519668 softexpires=316147712519668
 <...>-77375 [017] 316004.317574: hrtimer_cancel: hrtimer=ffff880125e67660
 <...>-77375 [017] 316004.317575: kvm_inj_virq: irq 239
 <...>-77375 [017] 316004.317577: kvm_entry: vcpu 1
 <...>-77375 [017] 316004.317579: kvm_exit: reason APIC_ACCESS rip 0xffffffff8104d1cc info 10b0 0
 <...>-77375 [017] 316004.317581: kvm_mmu_pagetable_walk: addr ffffffff8104d1cc pferr 10 F
 <...>-77375 [017] 316004.317582: kvm_mmu_paging_element: pte 1c09067 level 4
 <...>-77375 [017] 316004.317582: kvm_mmu_paging_element: pte 1c0d063 level 3
 <...>-77375 [017] 316004.317583: kvm_mmu_paging_element: pte 10001e1 level 2
 <...>-77375 [017] 316004.317584: kvm_emulate_insn: 0:ffffffff8104d1cc: 89 b7 00 b0 5f ff (prot64)
 <...>-77375 [017] 316004.317585: kvm_mmu_pagetable_walk: addr ffffffffff5fb0b0 pferr 2 W
 <...>-77375 [017] 316004.317585: kvm_mmu_paging_element: pte 1c09067 level 4
 <...>-77375 [017] 316004.317586: kvm_mmu_paging_element: pte 1c0a067 level 3
 <...>-77375 [017] 316004.317586: kvm_mmu_paging_element: pte 1eae067 level 2
 <...>-77375 [017] 316004.317587: kvm_mmu_paging_element: pte 80000000fee0017b level 1
 <...>-77375 [017] 316004.317588: kvm_mmio: mmio write len 4 gpa 0xfee000b0 val 0x0
 <...>-77375 [017] 316004.317588: kvm_apic: apic_write APIC_EOI = 0x0
 <...>-77375 [017] 316004.317590: kvm_entry: vcpu 1
 <...>-77375 [017] 316004.317599: kvm_exit: reason APIC_ACCESS rip 0xffffffff8104d1cc info 1380 0
 <...>-77375 [017] 316004.317600: kvm_mmu_pagetable_walk: addr ffffffff8104d1cc pferr 10 F
 <...>-77375 [017] 316004.317601: kvm_mmu_paging_element: pte 1c09067 level 4
 <...>-77375 [017] 316004.317602: kvm_mmu_paging_element: pte 1c0d063 level 3
 <...>-77375 [017] 316004.317602: kvm_mmu_paging_element: pte 10001e1 level 2
 <...>-77375 [017] 316004.317603: kvm_emulate_insn: 0:ffffffff8104d1cc: 89 b7 00 b0 5f ff (prot64)
 <...>-77375 [017] 316004.317604: kvm_mmu_pagetable_walk: addr ffffffffff5fb380 pferr 2 W
 <...>-77375 [017] 316004.317604: kvm_mmu_paging_element: pte 1c09067 level 4
 <...>-77375 [017] 316004.317605: kvm_mmu_paging_element: pte 1c0a067 level 3
 <...>-77375 [017] 316004.317605: kvm_mmu_paging_element: pte 1eae067 level 2
 <...>-77375 [017] 316004.317605: kvm_mmu_paging_element: pte 80000000fee0017b level 1
 <...>-77375 [017] 316004.317606: kvm_mmio: mmio write len 4 gpa 0xfee00380 val 0x1798
 <...>-77375 [017] 316004.317606: kvm_apic: apic_write APIC_TMICT = 0x1798
 <...>-77375 [017] 316004.317607: hrtimer_start: hrtimer=ffff880130ad8610 function=kvm_timer_fn expires=316147662651030 softexpires=316147662651030
 <...>-77375 [017] 316004.317610: kvm_entry: vcpu 1
 <...>-77375 [017] 316004.317621: kvm_exit: reason HLT rip 0xffffffff81052214 info 0 0
 <...>-77375 [017] 316004.317626: sched_switch: prev_comm=qemu-system-x86 prev_pid=77375 prev_prio=19 prev_state=S ==> next_comm=swapper/17 next_pid=0 next_prio=120
<idle>-0     [017] 316004.317627: power_start: type=1 state=0 cpu_id=17

Why the heck is a paravirtualized guest using an local APIC timer
emulation, instead of a paravirtualized clock event device?

Just look at the trace. That's insane. We enter the guest for 2us to
come back and handle the APIC_EOI for 11us. Then we go back to the
guest for 9us and spend again 11us for handling a write to APIC_TMICT.

That's 11us guest vs. 22us host time.

Aside of that, when looking at the bootup, the guest "calibrates" the
local APIC timer emulation against an emulated legacy device to figure
out the APIC timer clock rate, which is totally irrelevant for a
paravirtualized guest, if done right.

Look how a guest timer is programmed:

     hrtimer_start();
        ...
	clock_events_programm_event(dev, expires, now);
	  ns_delta = expires - now;
	  delta = convert_ns_to_dev(ns_delta, dev);
	  dev->set_next_event(delta, dev);
	    lapic_next_event(delta, dev);
	      apic_write(APIC_TMICT, delta);
	        |
		---> traps into host
                  kvm_mmu_pagetable_walk();
                  kvm_mmio_emulation();
                    kvm_apic_emulation();
		      start_apic_timer();
		        now = get_host_time();
		        delta = convert_apic_to_ns(APIC_TMICT);
                        hrtimer_start(apic_timer, now + delta, HRTIMER_MODE_ABS);

Oh well, we 

   - convert from nsec to a "calibrated" APIC delta
   - "program" the APIC timer
   - trap into the host
   - convert the "calibrated" delta back to nsec
   - add it to the current host time
   - arm the timer

Why the heck don't we use a paravirt device, which just provides a
nsec based interface. The host knows the time delta between the guests
notion of CLOCK_MONOTONIC and its own. That would reduce the whole
procedure to:

     hrtimer_start();
        ...
	clock_events_programm_event(dev, expires, now);
	  dev->set_next_ktime(expires, dev);
	    kvm_clock_event_set_next(expires, dev);
	        |
		---> traps into host with a paravirt call
		kvm_handle_guest_clkev_dev();
                  hrtimer_start(apic_timer, expires + host_guest_delta, HRTIMER_MODE_ABS);

That would save tons of time on an hot path. Even if the
host_guest_delta approach does not work, a 1:1 nsec mapping as a
relative timer on the host would be way faster than the current
solution.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-13  0:59                     ` Thomas Gleixner
@ 2011-12-13  7:03                       ` Pekka Enberg
  2011-12-13 11:05                         ` Alan Cox
  2011-12-13 10:58                       ` Avi Kivity
  1 sibling, 1 reply; 46+ messages in thread
From: Pekka Enberg @ 2011-12-13  7:03 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Ingo Molnar, LKML, Sasha Levin

* Pekka Enberg <penberg@kernel.org> wrote:
>>> So i'm pretty sure it's some bug in hw/serial.c that's
>>> limiting character output by interrupts.

On Tue, 13 Dec 2011, Thomas Gleixner wrote:
> No, that's not a bug. The current emulation has no fifo and it writes
> every single character to the terminal. Go figure. The timer
> limitation is due to the missing fifo and other details, which would
> drive the serial driver into the "too much work for irq 4" case.
>
> The approach I took was to keep the emulated device as close to the
> real HW for obvious reasons. You simply cannot ignore the way how a HW
> device works and how the corresponding kernel driver expects it to
> work.

No disagreement here.

On Mon, 12 Dec 2011, Ingo Molnar wrote:
>> Serial port control flow somehow being bound by timer frequency
>> [which is not really a necessity: both the host and the guest
>> could stream on full speed] was too my observation early on.
>
> That simply does not work with the way the serial driver for the 8250
> is written.
>
> If you emulate hardware then don't expect that the shortcomings of the
> real hardware and the clusterf*ck in the corresponding device drivers
> go magically away.
>
> If you want high performance virtualization then use virtual drivers
> and stop whining about a 30 years old legacy device and its warts.

If you compare Ingo's numbers to mine, there's still some slowdown which 
cannot be explained by legacy device warts.

Btw, I was able to double the speed of serial console with this simple 
patch:

diff --git a/tools/kvm/hw/serial.c b/tools/kvm/hw/serial.c
index c0f6970..1a8eaaa 100644
--- a/tools/kvm/hw/serial.c
+++ b/tools/kvm/hw/serial.c
@@ -232,7 +232,7 @@ static bool serial8250_out(struct ioport *ioport, struct kvm *kvm, u16 port, voi
  				term_putc(CONSOLE_8250, addr, size, dev->id);
  			/* else FIXME: Inject data into rcv path for LOOP */

-			if (++dev->txcnt == 16)
+			if (++dev->txcnt == 32)
  				dev->lsr &= ~(UART_LSR_TEMT | UART_LSR_THRE);
  			break;
  		} else {

I assume '16' here is the 16550A FIFO size? Is there any reason we can't 
cheat and use a deeper FIFO internally?

 			Pekka

^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 21:36                     ` Alan Cox
@ 2011-12-13 10:32                       ` Avi Kivity
  0 siblings, 0 replies; 46+ messages in thread
From: Avi Kivity @ 2011-12-13 10:32 UTC (permalink / raw)
  To: Alan Cox; +Cc: Ingo Molnar, Sasha Levin, Thomas Gleixner, Pekka Enberg, LKML

On 12/12/2011 11:36 PM, Alan Cox wrote:
> On Mon, 12 Dec 2011 20:16:50 +0200
> Avi Kivity <avi@redhat.com> wrote:
>
> > On 12/12/2011 01:20 PM, Alan Cox wrote:
> > > > [*] Also, those 10K cycles include some significant Qemu 
> > > >     overhead - a couple of thousand cycles - that should be much 
> > > >     lower in the tools/kvm case.
> > >
> > > Are all the traps going into qemu
> > 
> > Only the ones that go to devices modelled in userspace.
> > 
> > >  - is KVM still that braindead ?
> > 
> > How would you do it?
>
> See my posting from years back about this...

Must have missed it.

>
> Your trap handler returns the kernel a predictor tree (say a 1 page tree)
> of the next expected ins out and actions - where the action is either
> store, or trap out.
>
> Each kernel trap then boils down to
>
> Does this trap match an entry in the top of our predictor tree. Ie is it
> an in/out (or equivalent mmio) that the driver has given us expected
> behaviour for.
>
> We could of course have several for different ranges of devices
>
> 	no - hit emulator and return position in predictor tree we failed
> 		at plus tree buffer if needed
>
> 	yes - follow the tree node
>
> 	For IN
> 		copy predicted byte/word/dword to tree buffer
> 		move along tree
> 		return
>
> 	For Out
> 		store store byte/word/dword in tree buffer
> 		(one of which can be used for discard)
> 		move along the tree
> 		return
>
> Thats enough to do things like write predictors for the kernel console
> emulating the serial fifo, to script PIO IDE transfers etc
>
> The kernel side is miniscule and generic, the user space can migrate to
> doing this where it pays off and not where it doesn't.

Looks incredibly fragile.  As soon as the guest changes its behaviour a
tiny bit, this breaks.  It doesn't work for any device that has even a
bit of indirection (like IDE DMA), or needs timers, or is misdesigned in
any of the hundreds of ways hardware designers have discovered over the
years.  And in the end, performance is still low compared to a
paravirtual device that was designed for minimizing exits - an exit that
is terminated in the kernel is cheaper than userspace, but still very
expensive.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-12 18:31                 ` Pekka Enberg
@ 2011-12-13 10:33                   ` Avi Kivity
  0 siblings, 0 replies; 46+ messages in thread
From: Avi Kivity @ 2011-12-13 10:33 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: Ingo Molnar, Thomas Gleixner, LKML, Sasha Levin

On 12/12/2011 08:31 PM, Pekka Enberg wrote:
> On Mon, 2011-12-12 at 20:19 +0200, Avi Kivity wrote:
> > On 12/12/2011 01:19 PM, Pekka Enberg wrote:
> > > On Sun, Dec 11, 2011 at 5:53 PM, Ingo Molnar <mingo@elte.hu> wrote:
> > > > Okay, but look at it from another angle: the top output i
> > > > generate is about 300k characters. 5000 msecs to execute it
> > > > means 16 usecs overhead per character - or about 50k cycles - on
> > > > a top of the class x86 CPU.
> > >
> > > I'm seeing 1.5 usecs per character for this little benchmark:
> > 
> > No interrupts, right?  Things look differently with one interrupt per
> > character and none. 
>
> No interrupts. I was simply trying to measure PIO overhead to see how
> bad the 15 usec per character Ingo measured really is.

Yeah.

Anyway, uart serial is never going to be fast.  That's what
virtio-serial is for.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-13  0:59                     ` Thomas Gleixner
  2011-12-13  7:03                       ` Pekka Enberg
@ 2011-12-13 10:58                       ` Avi Kivity
  2011-12-13 13:52                         ` Thomas Gleixner
  1 sibling, 1 reply; 46+ messages in thread
From: Avi Kivity @ 2011-12-13 10:58 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Ingo Molnar, Pekka Enberg, LKML, Sasha Levin

On 12/13/2011 02:59 AM, Thomas Gleixner wrote:

<snip trace>
> Why the heck is a paravirtualized guest using an local APIC timer
> emulation, instead of a paravirtualized clock event device?
>
> Just look at the trace. That's insane. We enter the guest for 2us to
> come back and handle the APIC_EOI for 11us. Then we go back to the
> guest for 9us and spend again 11us for handling a write to APIC_TMICT.
>
> That's 11us guest vs. 22us host time.

Run your guest with x2apic enabled, the timing will be very different. 
You'll still have an exit for APIC_TMICT and APIC_EOI, but they'll be
much faster.  It's possible to avoid the EOI exit with some paravirt
magic, but that has its own issues.

> Aside of that, when looking at the bootup, the guest "calibrates" the
> local APIC timer emulation against an emulated legacy device to figure
> out the APIC timer clock rate, which is totally irrelevant for a
> paravirtualized guest, if done right.
>
> Look how a guest timer is programmed:
>
>      hrtimer_start();
>         ...
> 	clock_events_programm_event(dev, expires, now);
> 	  ns_delta = expires - now;
> 	  delta = convert_ns_to_dev(ns_delta, dev);
> 	  dev->set_next_event(delta, dev);
> 	    lapic_next_event(delta, dev);
> 	      apic_write(APIC_TMICT, delta);
> 	        |
> 		---> traps into host
>                   kvm_mmu_pagetable_walk();
>                   kvm_mmio_emulation();
>                     kvm_apic_emulation();
> 		      start_apic_timer();
> 		        now = get_host_time();
> 		        delta = convert_apic_to_ns(APIC_TMICT);
>                         hrtimer_start(apic_timer, now + delta, HRTIMER_MODE_ABS);
>
> Oh well, we 
>
>    - convert from nsec to a "calibrated" APIC delta
>    - "program" the APIC timer
>    - trap into the host
>    - convert the "calibrated" delta back to nsec
>    - add it to the current host time
>    - arm the timer
>
> Why the heck don't we use a paravirt device, which just provides a
> nsec based interface. The host knows the time delta between the guests
> notion of CLOCK_MONOTONIC and its own.

We do have a paravirt clocksource, just not clockevents.

>  That would reduce the whole
> procedure to:
>
>      hrtimer_start();
>         ...
> 	clock_events_programm_event(dev, expires, now);
> 	  dev->set_next_ktime(expires, dev);
> 	    kvm_clock_event_set_next(expires, dev);
> 	        |
> 		---> traps into host with a paravirt call
> 		kvm_handle_guest_clkev_dev();
>                   hrtimer_start(apic_timer, expires + host_guest_delta, HRTIMER_MODE_ABS);
>
> That would save tons of time on an hot path. Even if the
> host_guest_delta approach does not work, a 1:1 nsec mapping as a
> relative timer on the host would be way faster than the current
> solution.
>

The problem with paravirt clockevents is that if/when the APIC becomes
virtualized, then guests which were started with the paravirt
clockevents don't get accelerated when they are migrated onto newer
hardware.  This problem has bitten us several times in the past; if you
want to see how it looks when applied on a large scale look at Xen -
they have a paravirt-the-fsck-out-of-everything mode and a full virt
mode (which should be way faster these days); the two aren't
compatible.  Of course back when they started, they didn't have a
choice, but we do.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-13  7:03                       ` Pekka Enberg
@ 2011-12-13 11:05                         ` Alan Cox
  0 siblings, 0 replies; 46+ messages in thread
From: Alan Cox @ 2011-12-13 11:05 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: Thomas Gleixner, Ingo Molnar, LKML, Sasha Levin

> I assume '16' here is the 16550A FIFO size? Is there any reason we can't 
> cheat and use a deeper FIFO internally?

Providing you emulate the fifo itself the guest has no idea how many bits
are leaving the fifo and at what rate in normal use. However you need to
be a bit cleverer to handle fifo sizing.

More generically there are a wide range of 8250 derived UART devices with
larger FIFOs if probed. The extra logic to emulate those isn't that much
and will take you to 64byte FIFO trivally enough and in a way that should
work with random guests.

Alan

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-13 10:58                       ` Avi Kivity
@ 2011-12-13 13:52                         ` Thomas Gleixner
  2011-12-13 14:23                           ` Avi Kivity
  0 siblings, 1 reply; 46+ messages in thread
From: Thomas Gleixner @ 2011-12-13 13:52 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Ingo Molnar, Pekka Enberg, LKML, Sasha Levin

On Tue, 13 Dec 2011, Avi Kivity wrote:

> On 12/13/2011 02:59 AM, Thomas Gleixner wrote:
> 
> <snip trace>
> > Why the heck is a paravirtualized guest using an local APIC timer
> > emulation, instead of a paravirtualized clock event device?
> >
> > Just look at the trace. That's insane. We enter the guest for 2us to
> > come back and handle the APIC_EOI for 11us. Then we go back to the
> > guest for 9us and spend again 11us for handling a write to APIC_TMICT.
> >
> > That's 11us guest vs. 22us host time.
> 
> Run your guest with x2apic enabled, the timing will be very different. 

And what magic do I have to use to make that happen other than having
x2apic support enabled in the kernel? Or do I need a certain kernel
version for host and guest to make that work?

> The problem with paravirt clockevents is that if/when the APIC becomes
> virtualized, then guests which were started with the paravirt
> clockevents don't get accelerated when they are migrated onto newer
> hardware.  This problem has bitten us several times in the past; if you
> want to see how it looks when applied on a large scale look at Xen -
> they have a paravirt-the-fsck-out-of-everything mode and a full virt
> mode (which should be way faster these days); the two aren't
> compatible.  Of course back when they started, they didn't have a
> choice, but we do.

Well, I can see the migration pain for life migration and I agree that
paravirt the world and some more is a horror as well. Though there is
a midground between everything and being smart about certain
aspects. The whole APIC timer calibration and the back and forth
conversion is definitely nothing which falls into the category of
smart.

Thanks,

	tglx



^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-13 13:52                         ` Thomas Gleixner
@ 2011-12-13 14:23                           ` Avi Kivity
  2011-12-13 14:30                             ` Sasha Levin
  2011-12-13 14:51                             ` Thomas Gleixner
  0 siblings, 2 replies; 46+ messages in thread
From: Avi Kivity @ 2011-12-13 14:23 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Ingo Molnar, Pekka Enberg, LKML, Sasha Levin

On 12/13/2011 03:52 PM, Thomas Gleixner wrote:
> On Tue, 13 Dec 2011, Avi Kivity wrote:
>
> > On 12/13/2011 02:59 AM, Thomas Gleixner wrote:
> > 
> > <snip trace>
> > > Why the heck is a paravirtualized guest using an local APIC timer
> > > emulation, instead of a paravirtualized clock event device?
> > >
> > > Just look at the trace. That's insane. We enter the guest for 2us to
> > > come back and handle the APIC_EOI for 11us. Then we go back to the
> > > guest for 9us and spend again 11us for handling a write to APIC_TMICT.
> > >
> > > That's 11us guest vs. 22us host time.
> > 
> > Run your guest with x2apic enabled, the timing will be very different. 
>
> And what magic do I have to use to make that happen other than having
> x2apic support enabled in the kernel? Or do I need a certain kernel
> version for host and guest to make that work?

With qemu, -cpu host or -cpu blah,+x2apic.  kvm-tool does the equivalent
of -cpu host, so I'm surprised it doesn't show up.  x2apic has been
recognized by the guest for a long time (ce69a784; 2.6.32, I'll be
surprised if you have anything older than that on your machine).

Does x2apic show up in your guest's /proc/cpuinfo?

> > The problem with paravirt clockevents is that if/when the APIC becomes
> > virtualized, then guests which were started with the paravirt
> > clockevents don't get accelerated when they are migrated onto newer
> > hardware.  This problem has bitten us several times in the past; if you
> > want to see how it looks when applied on a large scale look at Xen -
> > they have a paravirt-the-fsck-out-of-everything mode and a full virt
> > mode (which should be way faster these days); the two aren't
> > compatible.  Of course back when they started, they didn't have a
> > choice, but we do.
>
> Well, I can see the migration pain for life migration and I agree that
> paravirt the world and some more is a horror as well. Though there is
> a midground between everything and being smart about certain
> aspects. 

Agree, clocksource is one example where the gain exceeds the pain.

> The whole APIC timer calibration and the back and forth
> conversion is definitely nothing which falls into the category of
> smart.

APIC timer calibration is silly but it hasn't proven to be a real-world
problem.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-13 14:23                           ` Avi Kivity
@ 2011-12-13 14:30                             ` Sasha Levin
  2011-12-13 14:51                             ` Thomas Gleixner
  1 sibling, 0 replies; 46+ messages in thread
From: Sasha Levin @ 2011-12-13 14:30 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Thomas Gleixner, Ingo Molnar, Pekka Enberg, LKML

On Tue, 2011-12-13 at 16:23 +0200, Avi Kivity wrote:
> With qemu, -cpu host or -cpu blah,+x2apic.  kvm-tool does the equivalent
> of -cpu host, so I'm surprised it doesn't show up.  x2apic has been
> recognized by the guest for a long time (ce69a784; 2.6.32, I'll be
> surprised if you have anything older than that on your machine).
> 
> Does x2apic show up in your guest's /proc/cpuinfo? 

It should be detected on the guest. kvm tool does currently have an
issue where if x2apic is enabled, MSI-X interrupts will only go to
VCPU#0 (ideas for solving that are welcome).

-- 

Sasha.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-13 14:23                           ` Avi Kivity
  2011-12-13 14:30                             ` Sasha Levin
@ 2011-12-13 14:51                             ` Thomas Gleixner
  2011-12-13 14:58                               ` Avi Kivity
  1 sibling, 1 reply; 46+ messages in thread
From: Thomas Gleixner @ 2011-12-13 14:51 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Ingo Molnar, Pekka Enberg, LKML, Sasha Levin

On Tue, 13 Dec 2011, Avi Kivity wrote:
> On 12/13/2011 03:52 PM, Thomas Gleixner wrote:
> > On Tue, 13 Dec 2011, Avi Kivity wrote:
> > > Run your guest with x2apic enabled, the timing will be very different. 
> >
> > And what magic do I have to use to make that happen other than having
> > x2apic support enabled in the kernel? Or do I need a certain kernel
> > version for host and guest to make that work?
> 
> With qemu, -cpu host or -cpu blah,+x2apic.  kvm-tool does the equivalent

Well, that was a trace from qemu-kvm (your latest tree). Now with -cpu
host it works and I get the same results as with kvm tool.

> of -cpu host, so I'm surprised it doesn't show up.  x2apic has been
> recognized by the guest for a long time (ce69a784; 2.6.32, I'll be
> surprised if you have anything older than that on your machine).
> 
> Does x2apic show up in your guest's /proc/cpuinfo?

Now it does :)

> > The whole APIC timer calibration and the back and forth
> > conversion is definitely nothing which falls into the category of
> > smart.
> 
> APIC timer calibration is silly but it hasn't proven to be a real-world
> problem.

Well, that depends on the world. The issue with that calibration is
that it can be off and for a certain category of application it
matters. Definitely not for the usual virt workload stuff. But saving
cycles for highres/nohz enabled guests would be nice. Maybe I have a
go when I find a some time.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [patch 0/3] kvm tool: Serial emulation overhaul
  2011-12-13 14:51                             ` Thomas Gleixner
@ 2011-12-13 14:58                               ` Avi Kivity
  0 siblings, 0 replies; 46+ messages in thread
From: Avi Kivity @ 2011-12-13 14:58 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: Ingo Molnar, Pekka Enberg, LKML, Sasha Levin

On 12/13/2011 04:51 PM, Thomas Gleixner wrote:
> > > The whole APIC timer calibration and the back and forth
> > > conversion is definitely nothing which falls into the category of
> > > smart.
> > 
> > APIC timer calibration is silly but it hasn't proven to be a real-world
> > problem.
>
> Well, that depends on the world. 

What I mean by that, is that I haven't seen any bug reports in this area
apart from some issue with wiring the PIT/PIC/APIC correctly which were
resolved long ago.

> The issue with that calibration is
> that it can be off and for a certain category of application it
> matters. Definitely not for the usual virt workload stuff. But saving
> cycles for highres/nohz enabled guests would be nice. Maybe I have a
> go when I find a some time.

Just dropping the calibration an replacing it with a pre-calibrated
value should be simple, if there is a workload that could benefit.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2011-12-13 14:58 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-12-10 13:27 [patch 0/3] kvm tool: Serial emulation overhaul Thomas Gleixner
2011-12-10 13:27 ` [patch 1/3] kvm tool: serial: Cleanup coding style Thomas Gleixner
2011-12-10 13:27 ` [patch 3/3] kvm tool: serial: Fix interrupt handling Thomas Gleixner
2011-12-10 13:27 ` [patch 2/3] kvm tool: serial: Simplify switch cases Thomas Gleixner
2011-12-10 15:17 ` [patch 0/3] kvm tool: Serial emulation overhaul Pekka Enberg
2011-12-10 20:51   ` Thomas Gleixner
2011-12-11  8:15     ` Pekka Enberg
2011-12-11 10:30       ` Ingo Molnar
2011-12-11 10:46         ` Ingo Molnar
2011-12-11 14:04         ` Thomas Gleixner
2011-12-11 15:53           ` Ingo Molnar
2011-12-12  5:30             ` Sasha Levin
2011-12-12 11:12               ` Ingo Molnar
2011-12-12 11:20                 ` Alan Cox
2011-12-12 17:20                   ` Ingo Molnar
2011-12-12 18:16                   ` Avi Kivity
2011-12-12 18:16                   ` Avi Kivity
2011-12-12 21:36                     ` Alan Cox
2011-12-13 10:32                       ` Avi Kivity
2011-12-12 11:23                 ` Cyrill Gorcunov
2011-12-12 17:40                 ` Sasha Levin
2011-12-12 17:45                   ` Ingo Molnar
2011-12-12  9:42             ` Thomas Gleixner
2011-12-12 11:19             ` Pekka Enberg
2011-12-12 17:20               ` Ingo Molnar
2011-12-12 18:40                 ` Pekka Enberg
2011-12-12 19:14                   ` Pekka Enberg
2011-12-12 19:21                   ` Ingo Molnar
2011-12-13  0:59                     ` Thomas Gleixner
2011-12-13  7:03                       ` Pekka Enberg
2011-12-13 11:05                         ` Alan Cox
2011-12-13 10:58                       ` Avi Kivity
2011-12-13 13:52                         ` Thomas Gleixner
2011-12-13 14:23                           ` Avi Kivity
2011-12-13 14:30                             ` Sasha Levin
2011-12-13 14:51                             ` Thomas Gleixner
2011-12-13 14:58                               ` Avi Kivity
2011-12-12 21:03                   ` James Courtier-Dutton
2011-12-12 18:19               ` Avi Kivity
2011-12-12 18:31                 ` Pekka Enberg
2011-12-13 10:33                   ` Avi Kivity
2011-12-12 10:27           ` Alan Cox
2011-12-12 10:59             ` Sasha Levin
2011-12-12 11:02               ` Alan Cox
2011-12-12 18:21                 ` Avi Kivity
2011-12-12 17:21               ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox