Linux virtualization list

Linux virtualization list
 help / color / mirror / Atom feed

* [PATCH v2 2/6] PCI: Scan all functions when running over Jailhouse
From: Jan Kiszka @ 2018-02-28  6:34 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Bjorn Helgaas
  Cc: jailhouse-dev, Benedikt Spranger, linux-pci, x86,
	Linux Kernel Mailing List, virtualization
In-Reply-To: <cover.1519799691.git.jan.kiszka@siemens.com>

From: Jan Kiszka <jan.kiszka@siemens.com>

Per PCIe r4.0, sec 7.5.1.1.9, multi-function devices are required to
have a function 0.  Therefore, Linux scans for devices at function 0
(devfn 0/8/16/...) and only scans for other functions if function 0
has its Multi-Function Device bit set or ARI or SR-IOV indicate
there are more functions.

The Jailhouse hypervisor may pass individual functions of a
multi-function device to a guest without passing function 0, which
means a Linux guest won't find them.

Change Linux PCI probing so it scans all function numbers when
running as a guest over Jailhouse.

This is technically prohibited by the spec, so it is possible that
PCI devices without the Multi-Function Device bit set may have
unexpected behavior in response to this probe.

Based on patch by Benedikt Spranger, adding Jailhouse probing to avoid
changing the behavior in the absence of the hypervisor.

CC: Benedikt Spranger <b.spranger@linutronix.de>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/pci/legacy.c | 4 +++-
 drivers/pci/probe.c   | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 1cb01abcb1be..dfbe6ac38830 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -4,6 +4,7 @@
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/pci.h>
+#include <asm/jailhouse_para.h>
 #include <asm/pci_x86.h>
 
 /*
@@ -34,13 +35,14 @@ int __init pci_legacy_init(void)
 
 void pcibios_scan_specific_bus(int busn)
 {
+	int stride = jailhouse_paravirt() ? 1 : 8;
 	int devfn;
 	u32 l;
 
 	if (pci_find_bus(0, busn))
 		return;
 
-	for (devfn = 0; devfn < 256; devfn += 8) {
+	for (devfn = 0; devfn < 256; devfn += stride) {
 		if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) &&
 		    l != 0x0000 && l != 0xffff) {
 			DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ef5377438a1e..ce728251ae36 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -16,6 +16,7 @@
 #include <linux/pci-aspm.h>
 #include <linux/aer.h>
 #include <linux/acpi.h>
+#include <linux/hypervisor.h>
 #include <linux/irqdomain.h>
 #include <linux/pm_runtime.h>
 #include "pci.h"
@@ -2517,6 +2518,7 @@ static unsigned int pci_scan_child_bus_extend(struct pci_bus *bus,
 					      unsigned int available_buses)
 {
 	unsigned int used_buses, normal_bridges = 0, hotplug_bridges = 0;
+	unsigned int stride = jailhouse_paravirt() ? 1 : 8;
 	unsigned int start = bus->busn_res.start;
 	unsigned int devfn, cmax, max = start;
 	struct pci_dev *dev;
@@ -2524,7 +2526,7 @@ static unsigned int pci_scan_child_bus_extend(struct pci_bus *bus,
 	dev_dbg(&bus->dev, "scanning bus\n");
 
 	/* Go find them, Rover! */
-	for (devfn = 0; devfn < 0x100; devfn += 8)
+	for (devfn = 0; devfn < 0x100; devfn += stride)
 		pci_scan_slot(bus, devfn);
 
 	/* Reserve buses for SR-IOV capability */
-- 
2.13.6

^ permalink raw reply related

* [PATCH v2 3/6] x86/jailhouse: Enable PCI mmconfig access in inmates
From: Jan Kiszka @ 2018-02-28  6:34 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Bjorn Helgaas
  Cc: jailhouse-dev, linux-pci, x86, Linux Kernel Mailing List,
	virtualization
In-Reply-To: <cover.1519799691.git.jan.kiszka@siemens.com>

From: Otavio Pontes <otavio.pontes@intel.com>

Use the PCI mmconfig base address exported by jailhouse in boot
parameters in order to access the memory mapped PCI configuration space.

Signed-off-by: Otavio Pontes <otavio.pontes@intel.com>
[Jan: rebased, fixed !CONFIG_PCI_MMCONFIG]
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 arch/x86/include/asm/pci_x86.h | 2 ++
 arch/x86/kernel/jailhouse.c    | 7 +++++++
 arch/x86/pci/mmconfig-shared.c | 4 ++--
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index eb66fa9cd0fc..959d618dbb17 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -151,6 +151,8 @@ extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
 			       phys_addr_t addr);
 extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
 extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
+extern struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
+							int end, u64 addr);
 
 extern struct list_head pci_mmcfg_list;
 
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index b68fd895235a..7fe2a73da0b3 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -124,6 +124,13 @@ static int __init jailhouse_pci_arch_init(void)
 	if (pcibios_last_bus < 0)
 		pcibios_last_bus = 0xff;
 
+#ifdef CONFIG_PCI_MMCONFIG
+	if (setup_data.pci_mmconfig_base) {
+		pci_mmconfig_add(0, 0, 0xff, setup_data.pci_mmconfig_base);
+		pci_mmcfg_arch_init();
+	}
+#endif
+
 	return 0;
 }
 
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 96684d0adcf9..0e590272366b 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -94,8 +94,8 @@ static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start,
 	return new;
 }
 
-static struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
-							int end, u64 addr)
+struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
+						 int end, u64 addr)
 {
 	struct pci_mmcfg_region *new;
 
-- 
2.13.6

^ permalink raw reply related

* [PATCH v2 4/6] x86: Consolidate PCI_MMCONFIG configs
From: Jan Kiszka @ 2018-02-28  6:34 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Bjorn Helgaas
  Cc: jailhouse-dev, linux-pci, x86, Linux Kernel Mailing List,
	virtualization
In-Reply-To: <cover.1519799691.git.jan.kiszka@siemens.com>

From: Jan Kiszka <jan.kiszka@siemens.com>

Not sure if those two worked by design or just by chance so far. In any
case, it's at least cleaner and clearer to express this in a single
config statement.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 arch/x86/Kconfig | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index eb7f43f23521..63e85e7da12e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2641,8 +2641,9 @@ config PCI_DIRECT
 	depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC || PCI_GOMMCONFIG))
 
 config PCI_MMCONFIG
-	def_bool y
-	depends on X86_32 && PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY)
+	bool "Support mmconfig PCI config space access" if X86_64
+	default y
+	depends on PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY || X86_64)
 
 config PCI_OLPC
 	def_bool y
@@ -2657,10 +2658,6 @@ config PCI_DOMAINS
 	def_bool y
 	depends on PCI
 
-config PCI_MMCONFIG
-	bool "Support mmconfig PCI config space access"
-	depends on X86_64 && PCI && ACPI
-
 config PCI_CNB20LE_QUIRK
 	bool "Read CNB20LE Host Bridge Windows" if EXPERT
 	depends on PCI
-- 
2.13.6

^ permalink raw reply related

* [PATCH v2 5/6] x86/jailhouse: Allow to use PCI_MMCONFIG without ACPI
From: Jan Kiszka @ 2018-02-28  6:34 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Bjorn Helgaas
  Cc: jailhouse-dev, linux-pci, x86, Linux Kernel Mailing List,
	virtualization
In-Reply-To: <cover.1519799691.git.jan.kiszka@siemens.com>

From: Jan Kiszka <jan.kiszka@siemens.com>

Jailhouse does not use ACPI, but it does support MMCONFIG. Make sure the
latter can be built without having to enable ACPI as well. Primarily, we
need to make the AMD mmconf-fam10h_64 depend upon MMCONFIG and ACPI,
instead of just the former.

Saves some bytes in the Jailhouse non-root kernel.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 arch/x86/Kconfig          | 6 +++++-
 arch/x86/kernel/Makefile  | 2 +-
 arch/x86/kernel/cpu/amd.c | 2 +-
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 63e85e7da12e..5b0ac52e357a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2643,7 +2643,7 @@ config PCI_DIRECT
 config PCI_MMCONFIG
 	bool "Support mmconfig PCI config space access" if X86_64
 	default y
-	depends on PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY || X86_64)
+	depends on PCI && (ACPI || SFI || JAILHOUSE_GUEST) && (PCI_GOMMCONFIG || PCI_GOANY || X86_64)
 
 config PCI_OLPC
 	def_bool y
@@ -2658,6 +2658,10 @@ config PCI_DOMAINS
 	def_bool y
 	depends on PCI
 
+config MMCONF_FAM10H
+	def_bool y
+	depends on PCI_MMCONFIG && ACPI
+
 config PCI_CNB20LE_QUIRK
 	bool "Read CNB20LE Host Bridge Windows" if EXPERT
 	depends on PCI
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 29786c87e864..73ccf80c09a2 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -146,6 +146,6 @@ ifeq ($(CONFIG_X86_64),y)
 	obj-$(CONFIG_GART_IOMMU)	+= amd_gart_64.o aperture_64.o
 	obj-$(CONFIG_CALGARY_IOMMU)	+= pci-calgary_64.o tce_64.o
 
-	obj-$(CONFIG_PCI_MMCONFIG)	+= mmconf-fam10h_64.o
+	obj-$(CONFIG_MMCONF_FAM10H)	+= mmconf-fam10h_64.o
 	obj-y				+= vsmp_64.o
 endif
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index f0e6456ca7d3..12bc0a1139da 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -716,7 +716,7 @@ static void init_amd_k8(struct cpuinfo_x86 *c)
 
 static void init_amd_gh(struct cpuinfo_x86 *c)
 {
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_MMCONF_FAM10H
 	/* do this for boot cpu */
 	if (c == &boot_cpu_data)
 		check_enable_amd_mmconf_dmi();
-- 
2.13.6

^ permalink raw reply related

* [PATCH v2 6/6] MAINTAINERS: Add entry for Jailhouse
From: Jan Kiszka @ 2018-02-28  6:34 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Bjorn Helgaas
  Cc: jailhouse-dev, linux-pci, x86, Linux Kernel Mailing List,
	virtualization
In-Reply-To: <cover.1519799691.git.jan.kiszka@siemens.com>

From: Jan Kiszka <jan.kiszka@siemens.com>

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 MAINTAINERS | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 93a12af4f180..4b889f282c77 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7521,6 +7521,13 @@ Q:	http://patchwork.linuxtv.org/project/linux-media/list/
 S:	Maintained
 F:	drivers/media/dvb-frontends/ix2505v*
 
+JAILHOUSE HYPERVISOR INTERFACE
+M:	Jan Kiszka <jan.kiszka@siemens.com>
+L:	jailhouse-dev@googlegroups.com
+S:	Maintained
+F:	arch/x86/kernel/jailhouse.c
+F:	arch/x86/include/asm/jailhouse_para.h
+
 JC42.4 TEMPERATURE SENSOR DRIVER
 M:	Guenter Roeck <linux@roeck-us.net>
 L:	linux-hwmon@vger.kernel.org
-- 
2.13.6

^ permalink raw reply related

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jiri Pirko @ 2018-02-28  7:08 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Duyck, Alexander H, virtio-dev, Michael S. Tsirkin,
	Sridhar Samudrala, Alexander Duyck, virtualization, Siwei Liu,
	Netdev, David Miller
In-Reply-To: <20180227134149.2bff667e@cakuba.netronome.com>

Tue, Feb 27, 2018 at 10:41:49PM CET, kubakici@wp.pl wrote:
>On Tue, 27 Feb 2018 13:16:21 -0800, Alexander Duyck wrote:
>> Basically we need some sort of PCI or PCIe topology mapping for the
>> devices that can be translated into something we can communicate over
>> the communication channel. 
>
>Hm.  This is probably a completely stupid idea, but if we need to
>start marshalling configuration requests/hints maybe the entire problem
>could be solved by opening a netlink socket from hypervisor?  Even make
>teamd run on the hypervisor side...

Interesting. That would be trickier than just forwarding one genetlink
socket to the hypervisor.

Also, I think that the solution should handle multiple guest OSes. What
I'm thinking about is some generic bonding description passed over some
communication channel into the VM. The VM either uses it for configuration,
or ignores it if it is not smart enough/updated enough.

^ permalink raw reply

* Re: [PATCH v2 2/6] PCI: Scan all functions when running over Jailhouse
From: Thomas Gleixner @ 2018-02-28  8:44 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: jailhouse-dev, Benedikt Spranger, linux-pci, x86,
	Linux Kernel Mailing List, virtualization, Ingo Molnar,
	H . Peter Anvin, Bjorn Helgaas
In-Reply-To: <021d3dde4276c9bf4325f7bdc37e3c47069e48fc.1519799691.git.jan.kiszka@siemens.com>

On Wed, 28 Feb 2018, Jan Kiszka wrote:

> From: Jan Kiszka <jan.kiszka@siemens.com>
> 
> Per PCIe r4.0, sec 7.5.1.1.9, multi-function devices are required to
> have a function 0.  Therefore, Linux scans for devices at function 0
> (devfn 0/8/16/...) and only scans for other functions if function 0
> has its Multi-Function Device bit set or ARI or SR-IOV indicate
> there are more functions.
> 
> The Jailhouse hypervisor may pass individual functions of a
> multi-function device to a guest without passing function 0, which
> means a Linux guest won't find them.
> 
> Change Linux PCI probing so it scans all function numbers when
> running as a guest over Jailhouse.

>  void pcibios_scan_specific_bus(int busn)
>  {
> +	int stride = jailhouse_paravirt() ? 1 : 8;
>  	int devfn;
>  	u32 l;
>  
>  	if (pci_find_bus(0, busn))
>  		return;
>  
> -	for (devfn = 0; devfn < 256; devfn += 8) {
> +	for (devfn = 0; devfn < 256; devfn += stride) {
>  		if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) &&
>  		    l != 0x0000 && l != 0xffff) {
>  			DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l);

Shouldn't that take the situation into account where the MFD bit is set on
a regular devfn, i.e. (devfn % 8) == 0? In that case you'd scan the
subfunctions twice.

Thanks,

	tglx

^ permalink raw reply

* Re: [PATCH v2 2/6] PCI: Scan all functions when running over Jailhouse
From: Jan Kiszka @ 2018-02-28 10:01 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: jailhouse-dev, Benedikt Spranger, linux-pci, x86,
	Linux Kernel Mailing List, virtualization, Ingo Molnar,
	H . Peter Anvin, Bjorn Helgaas
In-Reply-To: <alpine.DEB.2.21.1802280938380.1886@nanos.tec.linutronix.de>

On 2018-02-28 09:44, Thomas Gleixner wrote:
> On Wed, 28 Feb 2018, Jan Kiszka wrote:
> 
>> From: Jan Kiszka <jan.kiszka@siemens.com>
>>
>> Per PCIe r4.0, sec 7.5.1.1.9, multi-function devices are required to
>> have a function 0.  Therefore, Linux scans for devices at function 0
>> (devfn 0/8/16/...) and only scans for other functions if function 0
>> has its Multi-Function Device bit set or ARI or SR-IOV indicate
>> there are more functions.
>>
>> The Jailhouse hypervisor may pass individual functions of a
>> multi-function device to a guest without passing function 0, which
>> means a Linux guest won't find them.
>>
>> Change Linux PCI probing so it scans all function numbers when
>> running as a guest over Jailhouse.
> 
>>  void pcibios_scan_specific_bus(int busn)
>>  {
>> +	int stride = jailhouse_paravirt() ? 1 : 8;
>>  	int devfn;
>>  	u32 l;
>>  
>>  	if (pci_find_bus(0, busn))
>>  		return;
>>  
>> -	for (devfn = 0; devfn < 256; devfn += 8) {
>> +	for (devfn = 0; devfn < 256; devfn += stride) {
>>  		if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) &&
>>  		    l != 0x0000 && l != 0xffff) {
>>  			DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l);
> 
> Shouldn't that take the situation into account where the MFD bit is set on
> a regular devfn, i.e. (devfn % 8) == 0? In that case you'd scan the
> subfunctions twice.

Good point, and it also applies to pci_scan_child_bus_extend. Will add
some filters.

Jan

-- 
Siemens AG, Corporate Technology, CT RDA IOT SES-DE
Corporate Competence Center Embedded Linux

^ permalink raw reply

* [PATCH net] virtio-net: disable NAPI only when enabled during XDP set
From: Jason Wang @ 2018-02-28 10:20 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel

We try to disable NAPI to prevent a single XDP TX queue being used by
multiple CPUs. But we don't check whether the device is up (NAPI is
enabled); this could result in a stall because of an infinite wait in
napi_disable(). Fix this by checking the device state through
netif_running() first.

Fixes: 4941d472bf95b ("virtio-net: do not reset during XDP set")
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 9bb9e56..2d54123 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2185,8 +2185,9 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 	}
 
 	/* Make sure NAPI is not using any XDP TX queues for RX. */
-	for (i = 0; i < vi->max_queue_pairs; i++)
-		napi_disable(&vi->rq[i].napi);
+	if (netif_running(dev))
+		for (i = 0; i < vi->max_queue_pairs; i++)
+			napi_disable(&vi->rq[i].napi);
 
 	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
 	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
@@ -2205,7 +2206,8 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
 		}
 		if (old_prog)
 			bpf_prog_put(old_prog);
-		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
+		if (netif_running(dev))
+			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
 	}
 
 	return 0;
-- 
2.7.4

^ permalink raw reply related

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Michael S. Tsirkin @ 2018-02-28 14:32 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Duyck, Alexander H, virtio-dev, Jakub Kicinski, Sridhar Samudrala,
	Alexander Duyck, virtualization, Siwei Liu, Netdev, David Miller
In-Reply-To: <20180228070839.GA19654@nanopsycho>

On Wed, Feb 28, 2018 at 08:08:39AM +0100, Jiri Pirko wrote:
> Tue, Feb 27, 2018 at 10:41:49PM CET, kubakici@wp.pl wrote:
> >On Tue, 27 Feb 2018 13:16:21 -0800, Alexander Duyck wrote:
> >> Basically we need some sort of PCI or PCIe topology mapping for the
> >> devices that can be translated into something we can communicate over
> >> the communication channel. 
> >
> >Hm.  This is probably a completely stupid idea, but if we need to
> >start marshalling configuration requests/hints maybe the entire problem
> >could be solved by opening a netlink socket from hypervisor?  Even make
> >teamd run on the hypervisor side...
> 
> Interesting. That would be more trickier then just to fwd 1 genetlink
> socket to the hypervisor.
> 
> Also, I think that the solution should handle multiple guest oses. What
> I'm thinking about is some generic bonding description passed over some
> communication channel into vm. The vm either use it for configuration,
> or ignores it if it is not smart enough/updated enough.

For sure, we could build virtio-bond to pass that info to guests.

Such an advisory mechanism would not be a replacement for the mandatory
passthrough fallback flag proposed, but OTOH it's much more flexible.

-- 
MST

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jiri Pirko @ 2018-02-28 15:11 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Duyck, Alexander H, virtio-dev, Jakub Kicinski, Sridhar Samudrala,
	Alexander Duyck, virtualization, Siwei Liu, Netdev, David Miller
In-Reply-To: <20180228160647-mutt-send-email-mst@kernel.org>

Wed, Feb 28, 2018 at 03:32:44PM CET, mst@redhat.com wrote:
>On Wed, Feb 28, 2018 at 08:08:39AM +0100, Jiri Pirko wrote:
>> Tue, Feb 27, 2018 at 10:41:49PM CET, kubakici@wp.pl wrote:
>> >On Tue, 27 Feb 2018 13:16:21 -0800, Alexander Duyck wrote:
>> >> Basically we need some sort of PCI or PCIe topology mapping for the
>> >> devices that can be translated into something we can communicate over
>> >> the communication channel. 
>> >
>> >Hm.  This is probably a completely stupid idea, but if we need to
>> >start marshalling configuration requests/hints maybe the entire problem
>> >could be solved by opening a netlink socket from hypervisor?  Even make
>> >teamd run on the hypervisor side...
>> 
>> Interesting. That would be more trickier then just to fwd 1 genetlink
>> socket to the hypervisor.
>> 
>> Also, I think that the solution should handle multiple guest oses. What
>> I'm thinking about is some generic bonding description passed over some
>> communication channel into vm. The vm either use it for configuration,
>> or ignores it if it is not smart enough/updated enough.
>
>For sure, we could build virtio-bond to pass that info to guests.

What do you mean by "virtio-bond"? A virtio_net extension?

>
>Such an advisory mechanism would not be a replacement for the mandatory
>passthrough fallback flag proposed, but OTOH it's much more flexible.
>
>-- 
>MST

^ permalink raw reply

* Re: [PATCH net] virtio-net: disable NAPI only when enabled during XDP set
From: Michael S. Tsirkin @ 2018-02-28 15:27 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, virtualization
In-Reply-To: <1519813204-10002-1-git-send-email-jasowang@redhat.com>

On Wed, Feb 28, 2018 at 06:20:04PM +0800, Jason Wang wrote:
> We try to disable NAPI to prevent a single XDP TX queue being used by
> multiple cpus. But we don't check if device is up (NAPI is enabled),
> this could result stall because of infinite wait in
> napi_disable(). Fixing this by checking device state through
> netif_running() before.
> 
> Fixes: 4941d472bf95b ("virtio-net: do not reset during XDP set")
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>

> ---
>  drivers/net/virtio_net.c | 8 +++++---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 9bb9e56..2d54123 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2185,8 +2185,9 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
>  	}
>  
>  	/* Make sure NAPI is not using any XDP TX queues for RX. */
> -	for (i = 0; i < vi->max_queue_pairs; i++)
> -		napi_disable(&vi->rq[i].napi);
> +	if (netif_running(dev))
> +		for (i = 0; i < vi->max_queue_pairs; i++)
> +			napi_disable(&vi->rq[i].napi);
>  
>  	netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
>  	err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
> @@ -2205,7 +2206,8 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
>  		}
>  		if (old_prog)
>  			bpf_prog_put(old_prog);
> -		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
> +		if (netif_running(dev))
> +			virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
>  	}
>  
>  	return 0;
> -- 
> 2.7.4

^ permalink raw reply

* Re: [PATCH v2 4/6] x86: Consolidate PCI_MMCONFIG configs
From: Andy Shevchenko @ 2018-02-28 15:45 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: jailhouse-dev, linux-pci, x86, Linux Kernel Mailing List,
	virtualization, Ingo Molnar, H . Peter Anvin, Bjorn Helgaas,
	Thomas Gleixner
In-Reply-To: <76a05abd818c89032161585ba130511a5bd673f0.1519799691.git.jan.kiszka@siemens.com>

On Wed, Feb 28, 2018 at 8:34 AM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> From: Jan Kiszka <jan.kiszka@siemens.com>
>
> Not sure if those two worked by design or just by chance so far. In any
> case, it's at least cleaner and clearer to express this in a single
> config statement.

I would add a reference to the commit which brought that in the first place.

>
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> ---
>  arch/x86/Kconfig | 9 +++------
>  1 file changed, 3 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index eb7f43f23521..63e85e7da12e 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -2641,8 +2641,9 @@ config PCI_DIRECT
>         depends on PCI && (X86_64 || (PCI_GODIRECT || PCI_GOANY || PCI_GOOLPC || PCI_GOMMCONFIG))
>
>  config PCI_MMCONFIG
> -       def_bool y
> -       depends on X86_32 && PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY)
> +       bool "Support mmconfig PCI config space access" if X86_64
> +       default y
> +       depends on PCI && (ACPI || SFI) && (PCI_GOMMCONFIG || PCI_GOANY || X86_64)

Looking at the above context, I would rather put it like this:

depends on PCI && (ACPI || SFI) && (X86_64 || (PCI_GOANY || PCI_GOMMCONFIG))

-- 
With Best Regards,
Andy Shevchenko

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Michael S. Tsirkin @ 2018-02-28 15:45 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Duyck, Alexander H, virtio-dev, Jakub Kicinski, Sridhar Samudrala,
	Alexander Duyck, virtualization, Siwei Liu, Netdev, David Miller
In-Reply-To: <20180228151131.GF19654@nanopsycho>

On Wed, Feb 28, 2018 at 04:11:31PM +0100, Jiri Pirko wrote:
> Wed, Feb 28, 2018 at 03:32:44PM CET, mst@redhat.com wrote:
> >On Wed, Feb 28, 2018 at 08:08:39AM +0100, Jiri Pirko wrote:
> >> Tue, Feb 27, 2018 at 10:41:49PM CET, kubakici@wp.pl wrote:
> >> >On Tue, 27 Feb 2018 13:16:21 -0800, Alexander Duyck wrote:
> >> >> Basically we need some sort of PCI or PCIe topology mapping for the
> >> >> devices that can be translated into something we can communicate over
> >> >> the communication channel. 
> >> >
> >> >Hm.  This is probably a completely stupid idea, but if we need to
> >> >start marshalling configuration requests/hints maybe the entire problem
> >> >could be solved by opening a netlink socket from hypervisor?  Even make
> >> >teamd run on the hypervisor side...
> >> 
> >> Interesting. That would be more trickier then just to fwd 1 genetlink
> >> socket to the hypervisor.
> >> 
> >> Also, I think that the solution should handle multiple guest oses. What
> >> I'm thinking about is some generic bonding description passed over some
> >> communication channel into vm. The vm either use it for configuration,
> >> or ignores it if it is not smart enough/updated enough.
> >
> >For sure, we could build virtio-bond to pass that info to guests.
> 
> What do you mean by "virtio-bond". virtio_net extension?

I mean a new device supplying topology information to guests,
with updates whenever VMs are started, stopped or migrated.

> >
> >Such an advisory mechanism would not be a replacement for the mandatory
> >passthrough fallback flag proposed, but OTOH it's much more flexible.
> >
> >-- 
> >MST

^ permalink raw reply

* Re: [PATCH net] virtio-net: disable NAPI only when enabled during XDP set
From: David Miller @ 2018-02-28 17:22 UTC (permalink / raw)
  To: jasowang; +Cc: netdev, virtualization, linux-kernel, mst
In-Reply-To: <1519813204-10002-1-git-send-email-jasowang@redhat.com>

From: Jason Wang <jasowang@redhat.com>
Date: Wed, 28 Feb 2018 18:20:04 +0800

> We try to disable NAPI to prevent a single XDP TX queue being used by
> multiple cpus. But we don't check if device is up (NAPI is enabled),
> this could result stall because of infinite wait in
> napi_disable(). Fixing this by checking device state through
> netif_running() before.
> 
> Fixes: 4941d472bf95b ("virtio-net: do not reset during XDP set")
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Yes, mis-paired NAPI enable/disable are really a pain.

Probably, we can do something in the interfaces or mechanisms to make
this less error prone and less fragile.

Anyways, applied and queued up for -stable, thanks!

^ permalink raw reply

* Re: [PATCH net] virtio-net: disable NAPI only when enabled during XDP set
From: Ben Greear @ 2018-02-28 19:02 UTC (permalink / raw)
  To: David Miller, jasowang; +Cc: netdev, virtualization, linux-kernel, mst
In-Reply-To: <20180228.122231.1811613646832502397.davem@davemloft.net>

On 02/28/2018 09:22 AM, David Miller wrote:
> From: Jason Wang <jasowang@redhat.com>
> Date: Wed, 28 Feb 2018 18:20:04 +0800
>
>> We try to disable NAPI to prevent a single XDP TX queue being used by
>> multiple cpus. But we don't check if device is up (NAPI is enabled),
>> this could result stall because of infinite wait in
>> napi_disable(). Fixing this by checking device state through
>> netif_running() before.
>>
>> Fixes: 4941d472bf95b ("virtio-net: do not reset during XDP set")
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>
> Yes, mis-paired NAPI enable/disable are really a pain.
>
> Probably, we can do something in the interfaces or mechanisms to make
> this less error prone and less fragile.
>
> Anyways, applied and queued up for -stable, thanks!


I just hit a similar bug in ath10k.  It seems like napi has plenty
of free bit flags so it could keep track of 'is-enabled' state and
allow someone to call napi_disable multiple times w/out deadlocking.

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Jiri Pirko @ 2018-02-28 19:25 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Duyck, Alexander H, virtio-dev, Jakub Kicinski, Sridhar Samudrala,
	Alexander Duyck, virtualization, Siwei Liu, Netdev, David Miller
In-Reply-To: <20180228174449-mutt-send-email-mst@kernel.org>

Wed, Feb 28, 2018 at 04:45:39PM CET, mst@redhat.com wrote:
>On Wed, Feb 28, 2018 at 04:11:31PM +0100, Jiri Pirko wrote:
>> Wed, Feb 28, 2018 at 03:32:44PM CET, mst@redhat.com wrote:
>> >On Wed, Feb 28, 2018 at 08:08:39AM +0100, Jiri Pirko wrote:
>> >> Tue, Feb 27, 2018 at 10:41:49PM CET, kubakici@wp.pl wrote:
>> >> >On Tue, 27 Feb 2018 13:16:21 -0800, Alexander Duyck wrote:
>> >> >> Basically we need some sort of PCI or PCIe topology mapping for the
>> >> >> devices that can be translated into something we can communicate over
>> >> >> the communication channel. 
>> >> >
>> >> >Hm.  This is probably a completely stupid idea, but if we need to
>> >> >start marshalling configuration requests/hints maybe the entire problem
>> >> >could be solved by opening a netlink socket from hypervisor?  Even make
>> >> >teamd run on the hypervisor side...
>> >> 
>> >> Interesting. That would be more trickier then just to fwd 1 genetlink
>> >> socket to the hypervisor.
>> >> 
>> >> Also, I think that the solution should handle multiple guest oses. What
>> >> I'm thinking about is some generic bonding description passed over some
>> >> communication channel into vm. The vm either use it for configuration,
>> >> or ignores it if it is not smart enough/updated enough.
>> >
>> >For sure, we could build virtio-bond to pass that info to guests.
>> 
>> What do you mean by "virtio-bond". virtio_net extension?
>
>I mean a new device supplying topology information to guests,
>with updates whenever VMs are started, stopped or migrated.

Good. Any idea what that device would look like? Also, any idea how to
handle it in the kernel and how to pass this info along to userspace?
Is there anything similar out there?

Thanks!

^ permalink raw reply

* Re: [RFC PATCH v3 0/3] Enable virtio_net to act as a backup for a passthru device
From: Michael S. Tsirkin @ 2018-02-28 20:48 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: Duyck, Alexander H, virtio-dev, Jakub Kicinski, Sridhar Samudrala,
	Alexander Duyck, virtualization, Siwei Liu, Netdev, David Miller
In-Reply-To: <20180228192501.GB2389@nanopsycho>

On Wed, Feb 28, 2018 at 08:25:01PM +0100, Jiri Pirko wrote:
> Wed, Feb 28, 2018 at 04:45:39PM CET, mst@redhat.com wrote:
> >On Wed, Feb 28, 2018 at 04:11:31PM +0100, Jiri Pirko wrote:
> >> Wed, Feb 28, 2018 at 03:32:44PM CET, mst@redhat.com wrote:
> >> >On Wed, Feb 28, 2018 at 08:08:39AM +0100, Jiri Pirko wrote:
> >> >> Tue, Feb 27, 2018 at 10:41:49PM CET, kubakici@wp.pl wrote:
> >> >> >On Tue, 27 Feb 2018 13:16:21 -0800, Alexander Duyck wrote:
> >> >> >> Basically we need some sort of PCI or PCIe topology mapping for the
> >> >> >> devices that can be translated into something we can communicate over
> >> >> >> the communication channel. 
> >> >> >
> >> >> >Hm.  This is probably a completely stupid idea, but if we need to
> >> >> >start marshalling configuration requests/hints maybe the entire problem
> >> >> >could be solved by opening a netlink socket from hypervisor?  Even make
> >> >> >teamd run on the hypervisor side...
> >> >> 
> >> >> Interesting. That would be trickier than just forwarding 1 genetlink
> >> >> socket to the hypervisor.
> >> >> 
> >> >> Also, I think that the solution should handle multiple guest oses. What
> >> >> I'm thinking about is some generic bonding description passed over some
> >> >> communication channel into vm. The vm either use it for configuration,
> >> >> or ignores it if it is not smart enough/updated enough.
> >> >
> >> >For sure, we could build virtio-bond to pass that info to guests.
> >> 
> >> What do you mean by "virtio-bond". virtio_net extension?
> >
> >I mean a new device supplying topology information to guests,
> >with updates whenever VMs are started, stopped or migrated.
> 
> Good. Any idea what that device would look like? Also, any idea how to
> handle it in the kernel and how to pass along this info to userspace?
> Is there anything similar out there?
> 
> Thanks!

E.g. balloon is used to pass hints about amount of memory
guest should use. We could do something similar.

I imagine device can send a configuration interrupt
on each topology change. Kernel wakes up userspace pollers.
Userspace starts doing reads from a char device and
figures out what changed.

Which info is needed there? I am not sure.
How about list of MAC/VLAN addresses coupled to list of
devices to queue on (specified by mac? by PCI address)?

Or do we ever need to go higher level and make decisions
based on IP addresses as well?

-- 
MST

^ permalink raw reply

* [PATCH net-next 0/2] virtio-net: re enable XDP_REDIRECT for mergeable buffer
From: Jason Wang @ 2018-03-01  3:19 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.fastabend, brouer

Hi:

This series tries to re-enable XDP_REDIRECT for mergeable buffers, which
was removed by commit 7324f5399b06 ("virtio_net: disable
XDP_REDIRECT in receive_mergeable() case"). The main concerns were:

- not enough tailroom was reserved which breaks cpumap
- complex logic like EWMA and linearizing during XDP processing

Fix those by:

- reserve enough tailroom during refill
- disable EWMA and use fixed size of rx buffer
- drop linearizing logic and offload it to the generic XDP routine; this
  can happen only when the buffers were refilled before XDP was set, so
  we can simply ignore the negative performance impact.

Please review.

Thanks

Jason Wang (2):
  virtio-net: re enable XDP_REDIRECT for mergeable buffer
  virtio-net: simplify XDP handling in small buffer

 drivers/net/virtio_net.c | 186 ++++++++++++++++++-----------------------------
 1 file changed, 70 insertions(+), 116 deletions(-)

-- 
2.7.4

^ permalink raw reply

* [PATCH net-next 1/2] virtio-net: re enable XDP_REDIRECT for mergeable buffer
From: Jason Wang @ 2018-03-01  3:19 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.fastabend, brouer
In-Reply-To: <1519874345-10235-1-git-send-email-jasowang@redhat.com>

XDP_REDIRECT support for mergeable buffer was removed since commit
7324f5399b06 ("virtio_net: disable XDP_REDIRECT in receive_mergeable()
case"). This is because we don't reserve enough tailroom for struct
skb_shared_info which breaks XDP assumption. Other complaints are, the
complex linearize logic and EWMA estimation may increase the
possibility of linearizing.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 107 +++++++++++++++++++++++++++++------------------
 1 file changed, 67 insertions(+), 40 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 9bb9e56..81190ba 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -537,6 +537,26 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 	return NULL;
 }
 
+static struct sk_buff *virtnet_skb_xdp(struct receive_queue *rq,
+				       struct sk_buff *skb)
+{
+	struct bpf_prog *xdp_prog;
+	int ret;
+
+	rcu_read_lock();
+	xdp_prog = rcu_dereference(rq->xdp_prog);
+	if (xdp_prog) {
+		ret = do_xdp_generic(xdp_prog, skb);
+		if (ret != XDP_PASS) {
+			rcu_read_unlock();
+			return NULL;
+		}
+	}
+	rcu_read_unlock();
+
+	return skb;
+}
+
 static struct sk_buff *receive_small(struct net_device *dev,
 				     struct virtnet_info *vi,
 				     struct receive_queue *rq,
@@ -689,31 +709,30 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	struct bpf_prog *xdp_prog;
 	unsigned int truesize;
 	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
-	bool sent;
+	bool sent, skb_xdp = false;
+	int err;
 
 	head_skb = NULL;
 
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(rq->xdp_prog);
 	if (xdp_prog) {
-		struct page *xdp_page;
 		struct xdp_buff xdp;
 		void *data;
 		u32 act;
 
-		/* This happens when rx buffer size is underestimated */
+		/* This happens when rx buffer size is underestimated
+		 * or headroom is not enough because of the buffer
+		 * was refilled before XDP is set. In both cases,
+		 * for simplicity, we will offload them to generic
+		 * XDP routine. This should only happen for the first
+		 * several packets, so we don't care much about its
+		 * performance.
+		 */
 		if (unlikely(num_buf > 1 ||
 			     headroom < virtnet_get_headroom(vi))) {
-			/* linearize data for XDP */
-			xdp_page = xdp_linearize_page(rq, &num_buf,
-						      page, offset,
-						      VIRTIO_XDP_HEADROOM,
-						      &len);
-			if (!xdp_page)
-				goto err_xdp;
-			offset = VIRTIO_XDP_HEADROOM;
-		} else {
-			xdp_page = page;
+			skb_xdp = true;
+			goto skb_xdp;
 		}
 
 		/* Transient failure which in theory could occur if
@@ -727,7 +746,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		/* Allow consuming headroom but reserve enough space to push
 		 * the descriptor on if we get an XDP_TX return code.
 		 */
-		data = page_address(xdp_page) + offset;
+		data = page_address(page) + offset;
 		xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len;
 		xdp.data = data + vi->hdr_len;
 		xdp_set_data_meta_invalid(&xdp);
@@ -736,9 +755,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
-		if (act != XDP_PASS)
-			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
-
 		switch (act) {
 		case XDP_PASS:
 			/* recalculate offset to account for any header
@@ -746,28 +762,22 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			 * skb and avoid using offset
 			 */
 			offset = xdp.data -
-					page_address(xdp_page) - vi->hdr_len;
-
-			/* We can only create skb based on xdp_page. */
-			if (unlikely(xdp_page != page)) {
-				rcu_read_unlock();
-				put_page(page);
-				head_skb = page_to_skb(vi, rq, xdp_page,
-						       offset, len, PAGE_SIZE);
-				return head_skb;
-			}
+					page_address(page) - vi->hdr_len;
 			break;
 		case XDP_TX:
 			sent = __virtnet_xdp_xmit(vi, &xdp);
 			if (unlikely(!sent)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
-				if (unlikely(xdp_page != page))
-					put_page(xdp_page);
 				goto err_xdp;
 			}
 			*xdp_xmit = true;
-			if (unlikely(xdp_page != page))
+			rcu_read_unlock();
+			goto xdp_xmit;
+		case XDP_REDIRECT:
+			err = xdp_do_redirect(dev, &xdp, xdp_prog);
+			if (err)
 				goto err_xdp;
+			*xdp_xmit = true;
 			rcu_read_unlock();
 			goto xdp_xmit;
 		default:
@@ -775,13 +785,12 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		case XDP_ABORTED:
 			trace_xdp_exception(vi->dev, xdp_prog, act);
 		case XDP_DROP:
-			if (unlikely(xdp_page != page))
-				__free_pages(xdp_page, 0);
 			goto err_xdp;
 		}
 	}
 	rcu_read_unlock();
 
+skb_xdp:
 	truesize = mergeable_ctx_to_truesize(ctx);
 	if (unlikely(len > truesize)) {
 		pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
@@ -848,7 +857,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		}
 	}
 
-	ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
+	if (skb_xdp)
+		head_skb = virtnet_skb_xdp(rq, head_skb);
+	else
+		ewma_pkt_len_add(&rq->mrg_avg_pkt_len, head_skb->len);
+
 	return head_skb;
 
 err_xdp:
@@ -1013,13 +1026,18 @@ static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
 }
 
 static unsigned int get_mergeable_buf_len(struct receive_queue *rq,
-					  struct ewma_pkt_len *avg_pkt_len)
+					  struct ewma_pkt_len *avg_pkt_len,
+					  unsigned int room)
 {
 	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	unsigned int len;
 
-	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
+	if (room)
+		return PAGE_SIZE - room;
+
+	len = hdr_len +	clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
 				rq->min_buf_len, PAGE_SIZE - hdr_len);
+
 	return ALIGN(len, L1_CACHE_BYTES);
 }
 
@@ -1028,21 +1046,27 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
 {
 	struct page_frag *alloc_frag = &rq->alloc_frag;
 	unsigned int headroom = virtnet_get_headroom(vi);
+	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
+	unsigned int room = SKB_DATA_ALIGN(headroom + tailroom);
 	char *buf;
 	void *ctx;
 	int err;
 	unsigned int len, hole;
 
-	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len);
-	if (unlikely(!skb_page_frag_refill(len + headroom, alloc_frag, gfp)))
+	/* Extra tailroom is needed to satisfy XDP's assumption. This
+	 * means rx frags coalescing won't work, but consider we've
+	 * disabled GSO for XDP, it won't be a big issue.
+	 */
+	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
+	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
 		return -ENOMEM;
 
 	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
 	buf += headroom; /* advance address leaving hole at front of pkt */
 	get_page(alloc_frag->page);
-	alloc_frag->offset += len + headroom;
+	alloc_frag->offset += len + room;
 	hole = alloc_frag->size - alloc_frag->offset;
-	if (hole < len + headroom) {
+	if (hole < len + room) {
 		/* To avoid internal fragmentation, if there is very likely not
 		 * enough space for another buffer, add the remaining space to
 		 * the current buffer.
@@ -2576,12 +2600,15 @@ static ssize_t mergeable_rx_buffer_size_show(struct netdev_rx_queue *queue,
 {
 	struct virtnet_info *vi = netdev_priv(queue->dev);
 	unsigned int queue_index = get_netdev_rx_queue_index(queue);
+	unsigned int headroom = virtnet_get_headroom(vi);
+	unsigned int tailroom = headroom ? sizeof(struct skb_shared_info) : 0;
 	struct ewma_pkt_len *avg;
 
 	BUG_ON(queue_index >= vi->max_queue_pairs);
 	avg = &vi->rq[queue_index].mrg_avg_pkt_len;
 	return sprintf(buf, "%u\n",
-		       get_mergeable_buf_len(&vi->rq[queue_index], avg));
+		       get_mergeable_buf_len(&vi->rq[queue_index], avg,
+				       SKB_DATA_ALIGN(headroom + tailroom)));
 }
 
 static struct rx_queue_attribute mergeable_rx_buffer_size_attribute =
-- 
2.7.4

^ permalink raw reply related

* [PATCH net-next 2/2] virtio-net: simplify XDP handling in small buffer
From: Jason Wang @ 2018-03-01  3:19 UTC (permalink / raw)
  To: mst, virtualization, netdev, linux-kernel; +Cc: john.fastabend, brouer
In-Reply-To: <1519874345-10235-1-git-send-email-jasowang@redhat.com>

We used to copy data through xdp_linearize_page() for buffers
without sufficient headroom, which brings extra complexity without
helping performance. So this patch removes it and switches to using
the generic XDP routine to handle this case.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 93 ++++++------------------------------------------
 1 file changed, 10 insertions(+), 83 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 81190ba..3f14948 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -474,69 +474,6 @@ static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
 	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
 }
 
-/* We copy the packet for XDP in the following cases:
- *
- * 1) Packet is scattered across multiple rx buffers.
- * 2) Headroom space is insufficient.
- *
- * This is inefficient but it's a temporary condition that
- * we hit right after XDP is enabled and until queue is refilled
- * with large buffers with sufficient headroom - so it should affect
- * at most queue size packets.
- * Afterwards, the conditions to enable
- * XDP should preclude the underlying device from sending packets
- * across multiple buffers (num_buf > 1), and we make sure buffers
- * have enough headroom.
- */
-static struct page *xdp_linearize_page(struct receive_queue *rq,
-				       u16 *num_buf,
-				       struct page *p,
-				       int offset,
-				       int page_off,
-				       unsigned int *len)
-{
-	struct page *page = alloc_page(GFP_ATOMIC);
-
-	if (!page)
-		return NULL;
-
-	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
-	page_off += *len;
-
-	while (--*num_buf) {
-		unsigned int buflen;
-		void *buf;
-		int off;
-
-		buf = virtqueue_get_buf(rq->vq, &buflen);
-		if (unlikely(!buf))
-			goto err_buf;
-
-		p = virt_to_head_page(buf);
-		off = buf - page_address(p);
-
-		/* guard against a misconfigured or uncooperative backend that
-		 * is sending packet larger than the MTU.
-		 */
-		if ((page_off + buflen) > PAGE_SIZE) {
-			put_page(p);
-			goto err_buf;
-		}
-
-		memcpy(page_address(page) + page_off,
-		       page_address(p) + off, buflen);
-		page_off += buflen;
-		put_page(p);
-	}
-
-	/* Headroom does not contribute to packet length */
-	*len = page_off - VIRTIO_XDP_HEADROOM;
-	return page;
-err_buf:
-	__free_pages(page, 0);
-	return NULL;
-}
-
 static struct sk_buff *virtnet_skb_xdp(struct receive_queue *rq,
 				       struct sk_buff *skb)
 {
@@ -573,8 +510,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 	struct page *page = virt_to_head_page(buf);
 	unsigned int delta = 0;
-	struct page *xdp_page;
-	bool sent;
+	bool sent, skb_xdp = false;
 	int err;
 
 	len -= vi->hdr_len;
@@ -590,25 +526,14 @@ static struct sk_buff *receive_small(struct net_device *dev,
 		if (unlikely(hdr->hdr.gso_type))
 			goto err_xdp;
 
+		/* This happnes when headroom is not enough because
+		 * the buffer was refilled before XDP is set. This
+		 * only happen for several packets, for simplicity,
+		 * offload them to generic XDP routine.
+		 */
 		if (unlikely(xdp_headroom < virtnet_get_headroom(vi))) {
-			int offset = buf - page_address(page) + header_offset;
-			unsigned int tlen = len + vi->hdr_len;
-			u16 num_buf = 1;
-
-			xdp_headroom = virtnet_get_headroom(vi);
-			header_offset = VIRTNET_RX_PAD + xdp_headroom;
-			headroom = vi->hdr_len + header_offset;
-			buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
-				 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-			xdp_page = xdp_linearize_page(rq, &num_buf, page,
-						      offset, header_offset,
-						      &tlen);
-			if (!xdp_page)
-				goto err_xdp;
-
-			buf = page_address(xdp_page);
-			put_page(page);
-			page = xdp_page;
+			skb_xdp = true;
+			goto skb_xdp;
 		}
 
 		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
@@ -650,6 +575,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 	}
 	rcu_read_unlock();
 
+skb_xdp:
 	skb = build_skb(buf, buflen);
 	if (!skb) {
 		put_page(page);
@@ -662,6 +588,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
 	} /* keep zeroed vnet hdr since packet was changed by bpf */
 
+	skb = virtnet_skb_xdp(rq, skb);
 err:
 	return skb;
 
-- 
2.7.4

^ permalink raw reply related

* [PATCH v3 0/6] jailhouse: Enhance secondary Jailhouse guest support /wrt PCI
From: Jan Kiszka @ 2018-03-01  5:40 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Bjorn Helgaas
  Cc: jailhouse-dev, Mark Rutland, Benedikt Spranger, linux-pci, x86,
	Linux Kernel Mailing List, virtualization, Andy Shevchenko,
	Rob Herring, Otavio Pontes

Basic x86 support [1] for running Linux as secondary Jailhouse [2] guest
is currently pending in the tip tree. This builds on top and enhances
the PCI support for x86 and also ARM guests (ARM[64] does not require
platform patches and works already).

Key elements of this series are:
 - detection of Jailhouse via device tree hypervisor node
 - function-level PCI scan if Jailhouse is detected
 - MMCONFIG support for x86 guests

As most changes affect x86, I would suggest to route the series also via
tip after the necessary acks are collected.

Changes in v3:
 - avoided duplicate scans of PCI functions under Jailhouse
 - reformatted PCI_MMCONFIG condition and rephrased related commit log

Changes in v2:
 - adjusted commit log and include ordering in patch 2
 - rebased over Linus master

Jan

[1] https://lkml.org/lkml/2017/11/27/125
[2] http://jailhouse-project.org

CC: Benedikt Spranger <b.spranger@linutronix.de>
CC: Mark Rutland <mark.rutland@arm.com>
CC: Otavio Pontes <otavio.pontes@intel.com>
CC: Rob Herring <robh+dt@kernel.org>

Jan Kiszka (5):
  jailhouse: Provide detection for non-x86 systems
  PCI: Scan all functions when running over Jailhouse
  x86: Consolidate PCI_MMCONFIG configs
  x86/jailhouse: Allow to use PCI_MMCONFIG without ACPI
  MAINTAINERS: Add entry for Jailhouse

Otavio Pontes (1):
  x86/jailhouse: Enable PCI mmconfig access in inmates

 Documentation/devicetree/bindings/jailhouse.txt |  8 ++++++++
 MAINTAINERS                                     |  7 +++++++
 arch/x86/Kconfig                                | 11 ++++++-----
 arch/x86/include/asm/jailhouse_para.h           |  2 +-
 arch/x86/include/asm/pci_x86.h                  |  2 ++
 arch/x86/kernel/Makefile                        |  2 +-
 arch/x86/kernel/cpu/amd.c                       |  2 +-
 arch/x86/kernel/jailhouse.c                     |  7 +++++++
 arch/x86/pci/legacy.c                           |  4 +++-
 arch/x86/pci/mmconfig-shared.c                  |  4 ++--
 drivers/pci/probe.c                             | 22 +++++++++++++++++++---
 include/linux/hypervisor.h                      | 17 +++++++++++++++--
 12 files changed, 72 insertions(+), 16 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/jailhouse.txt

-- 
2.13.6

^ permalink raw reply

* [PATCH v3 1/6] jailhouse: Provide detection for non-x86 systems
From: Jan Kiszka @ 2018-03-01  5:40 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Bjorn Helgaas
  Cc: jailhouse-dev, Mark Rutland, linux-pci, x86,
	Linux Kernel Mailing List, virtualization, Andy Shevchenko,
	Rob Herring
In-Reply-To: <cover.1519882849.git.jan.kiszka@siemens.com>

From: Jan Kiszka <jan.kiszka@siemens.com>

Implement jailhouse_paravirt() via device tree probing on architectures
!= x86. Will be used by the PCI core.

CC: Rob Herring <robh+dt@kernel.org>
CC: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 Documentation/devicetree/bindings/jailhouse.txt |  8 ++++++++
 arch/x86/include/asm/jailhouse_para.h           |  2 +-
 include/linux/hypervisor.h                      | 17 +++++++++++++++--
 3 files changed, 24 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/jailhouse.txt

diff --git a/Documentation/devicetree/bindings/jailhouse.txt b/Documentation/devicetree/bindings/jailhouse.txt
new file mode 100644
index 000000000000..2901c25ff340
--- /dev/null
+++ b/Documentation/devicetree/bindings/jailhouse.txt
@@ -0,0 +1,8 @@
+Jailhouse non-root cell device tree bindings
+--------------------------------------------
+
+When running in a non-root Jailhouse cell (partition), the device tree of this
+platform shall have a top-level "hypervisor" node with the following
+properties:
+
+- compatible = "jailhouse,cell"
diff --git a/arch/x86/include/asm/jailhouse_para.h b/arch/x86/include/asm/jailhouse_para.h
index 875b54376689..b885a961a150 100644
--- a/arch/x86/include/asm/jailhouse_para.h
+++ b/arch/x86/include/asm/jailhouse_para.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL2.0 */
 
 /*
- * Jailhouse paravirt_ops implementation
+ * Jailhouse paravirt detection
  *
  * Copyright (c) Siemens AG, 2015-2017
  *
diff --git a/include/linux/hypervisor.h b/include/linux/hypervisor.h
index b19563f9a8eb..fc08b433c856 100644
--- a/include/linux/hypervisor.h
+++ b/include/linux/hypervisor.h
@@ -8,15 +8,28 @@
  */
 
 #ifdef CONFIG_X86
+
+#include <asm/jailhouse_para.h>
 #include <asm/x86_init.h>
+
 static inline void hypervisor_pin_vcpu(int cpu)
 {
 	x86_platform.hyper.pin_vcpu(cpu);
 }
-#else
+
+#else /* !CONFIG_X86 */
+
+#include <linux/of.h>
+
 static inline void hypervisor_pin_vcpu(int cpu)
 {
 }
-#endif
+
+static inline bool jailhouse_paravirt(void)
+{
+	return of_find_compatible_node(NULL, NULL, "jailhouse,cell");
+}
+
+#endif /* !CONFIG_X86 */
 
 #endif /* __LINUX_HYPEVISOR_H */
-- 
2.13.6

^ permalink raw reply related

* [PATCH v3 2/6] PCI: Scan all functions when running over Jailhouse
From: Jan Kiszka @ 2018-03-01  5:40 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Bjorn Helgaas
  Cc: jailhouse-dev, Benedikt Spranger, linux-pci, x86,
	Linux Kernel Mailing List, virtualization, Andy Shevchenko
In-Reply-To: <cover.1519882849.git.jan.kiszka@siemens.com>

From: Jan Kiszka <jan.kiszka@siemens.com>

Per PCIe r4.0, sec 7.5.1.1.9, multi-function devices are required to
have a function 0.  Therefore, Linux scans for devices at function 0
(devfn 0/8/16/...) and only scans for other functions if function 0
has its Multi-Function Device bit set or ARI or SR-IOV indicate
there are more functions.

The Jailhouse hypervisor may pass individual functions of a
multi-function device to a guest without passing function 0, which
means a Linux guest won't find them.

Change Linux PCI probing so it scans all function numbers when
running as a guest over Jailhouse.

This is technically prohibited by the spec, so it is possible that
PCI devices without the Multi-Function Device bit set may have
unexpected behavior in response to this probe.

Derived from original patch by Benedikt Spranger.

CC: Benedikt Spranger <b.spranger@linutronix.de>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
---
 arch/x86/pci/legacy.c |  4 +++-
 drivers/pci/probe.c   | 22 +++++++++++++++++++---
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 1cb01abcb1be..dfbe6ac38830 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -4,6 +4,7 @@
 #include <linux/init.h>
 #include <linux/export.h>
 #include <linux/pci.h>
+#include <asm/jailhouse_para.h>
 #include <asm/pci_x86.h>
 
 /*
@@ -34,13 +35,14 @@ int __init pci_legacy_init(void)
 
 void pcibios_scan_specific_bus(int busn)
 {
+	int stride = jailhouse_paravirt() ? 1 : 8;
 	int devfn;
 	u32 l;
 
 	if (pci_find_bus(0, busn))
 		return;
 
-	for (devfn = 0; devfn < 256; devfn += 8) {
+	for (devfn = 0; devfn < 256; devfn += stride) {
 		if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) &&
 		    l != 0x0000 && l != 0xffff) {
 			DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l);
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ef5377438a1e..da22d6d216f8 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -16,6 +16,7 @@
 #include <linux/pci-aspm.h>
 #include <linux/aer.h>
 #include <linux/acpi.h>
+#include <linux/hypervisor.h>
 #include <linux/irqdomain.h>
 #include <linux/pm_runtime.h>
 #include "pci.h"
@@ -2518,14 +2519,29 @@ static unsigned int pci_scan_child_bus_extend(struct pci_bus *bus,
 {
 	unsigned int used_buses, normal_bridges = 0, hotplug_bridges = 0;
 	unsigned int start = bus->busn_res.start;
-	unsigned int devfn, cmax, max = start;
+	unsigned int devfn, fn, cmax, max = start;
 	struct pci_dev *dev;
+	int nr_devs;
 
 	dev_dbg(&bus->dev, "scanning bus\n");
 
 	/* Go find them, Rover! */
-	for (devfn = 0; devfn < 0x100; devfn += 8)
-		pci_scan_slot(bus, devfn);
+	for (devfn = 0; devfn < 0x100; devfn += 8) {
+		nr_devs = pci_scan_slot(bus, devfn);
+
+		/*
+		 * The Jailhouse hypervisor may pass individual functions of a
+		 * multi-function device to a guest without passing function 0.
+		 * Look for them as well.
+		 */
+		if (jailhouse_paravirt() && nr_devs == 0) {
+			for (fn = 1; fn < 8; fn++) {
+				dev = pci_scan_single_device(bus, devfn + fn);
+				if (dev)
+					dev->multifunction = 1;
+			}
+		}
+	}
 
 	/* Reserve buses for SR-IOV capability */
 	used_buses = pci_iov_bus_range(bus);
-- 
2.13.6

^ permalink raw reply related

* [PATCH v3 3/6] x86/jailhouse: Enable PCI mmconfig access in inmates
From: Jan Kiszka @ 2018-03-01  5:40 UTC (permalink / raw)
  To: Thomas Gleixner, Ingo Molnar, H . Peter Anvin, Bjorn Helgaas
  Cc: jailhouse-dev, linux-pci, x86, Linux Kernel Mailing List,
	virtualization, Andy Shevchenko
In-Reply-To: <cover.1519882849.git.jan.kiszka@siemens.com>

From: Otavio Pontes <otavio.pontes@intel.com>

Use the PCI mmconfig base address exported by jailhouse in boot
parameters in order to access the memory mapped PCI configuration space.

Signed-off-by: Otavio Pontes <otavio.pontes@intel.com>
[Jan: rebased, fixed !CONFIG_PCI_MMCONFIG]
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 arch/x86/include/asm/pci_x86.h | 2 ++
 arch/x86/kernel/jailhouse.c    | 7 +++++++
 arch/x86/pci/mmconfig-shared.c | 4 ++--
 3 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index eb66fa9cd0fc..959d618dbb17 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -151,6 +151,8 @@ extern int pci_mmconfig_insert(struct device *dev, u16 seg, u8 start, u8 end,
 			       phys_addr_t addr);
 extern int pci_mmconfig_delete(u16 seg, u8 start, u8 end);
 extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
+extern struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
+							int end, u64 addr);
 
 extern struct list_head pci_mmcfg_list;
 
diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c
index b68fd895235a..7fe2a73da0b3 100644
--- a/arch/x86/kernel/jailhouse.c
+++ b/arch/x86/kernel/jailhouse.c
@@ -124,6 +124,13 @@ static int __init jailhouse_pci_arch_init(void)
 	if (pcibios_last_bus < 0)
 		pcibios_last_bus = 0xff;
 
+#ifdef CONFIG_PCI_MMCONFIG
+	if (setup_data.pci_mmconfig_base) {
+		pci_mmconfig_add(0, 0, 0xff, setup_data.pci_mmconfig_base);
+		pci_mmcfg_arch_init();
+	}
+#endif
+
 	return 0;
 }
 
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 96684d0adcf9..0e590272366b 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -94,8 +94,8 @@ static struct pci_mmcfg_region *pci_mmconfig_alloc(int segment, int start,
 	return new;
 }
 
-static struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
-							int end, u64 addr)
+struct pci_mmcfg_region *__init pci_mmconfig_add(int segment, int start,
+						 int end, u64 addr)
 {
 	struct pci_mmcfg_region *new;
 
-- 
2.13.6

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox