* [PATCH v2 10/11] x86/paravirt: move the Xen-only pv_mmu_ops under the PARAVIRT_XXL umbrella
From: Juergen Gross @ 2018-08-13 7:37 UTC (permalink / raw)
To: linux-kernel, xen-devel, x86, virtualization
Cc: Juergen Gross, boris.ostrovsky, rusty, peterz, mingo, hpa,
akataria, tglx
In-Reply-To: <20180813073739.26108-1-jgross@suse.com>
Most of the paravirt ops defined in pv_mmu_ops are for Xen PV guests
only. Define them only if CONFIG_PARAVIRT_XXL is set.
Signed-off-by: Juergen Gross <jgross@suse.com>
---
arch/x86/include/asm/fixmap.h | 2 +-
arch/x86/include/asm/mmu_context.h | 4 +-
arch/x86/include/asm/paravirt.h | 115 +++++++++++++++++-----------------
arch/x86/include/asm/paravirt_types.h | 29 ++++-----
arch/x86/include/asm/pgalloc.h | 2 +-
arch/x86/include/asm/pgtable.h | 7 +--
arch/x86/include/asm/special_insns.h | 11 +---
arch/x86/kernel/asm-offsets.c | 2 +-
arch/x86/kernel/head_64.S | 4 +-
arch/x86/kernel/paravirt.c | 15 +++--
arch/x86/kernel/paravirt_patch_32.c | 4 +-
arch/x86/kernel/paravirt_patch_64.c | 4 +-
12 files changed, 97 insertions(+), 102 deletions(-)
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index e203169931c7..ac80e7eadc3a 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -152,7 +152,7 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
void native_set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags);
-#ifndef CONFIG_PARAVIRT
+#ifndef CONFIG_PARAVIRT_XXL
static inline void __set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags)
{
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index bbc796eb0a3b..ffae17a8db36 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -16,12 +16,12 @@
extern atomic64_t last_mm_ctx_id;
-#ifndef CONFIG_PARAVIRT
+#ifndef CONFIG_PARAVIRT_XXL
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
}
-#endif /* !CONFIG_PARAVIRT */
+#endif /* !CONFIG_PARAVIRT_XXL */
#ifdef CONFIG_PERF_EVENTS
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 334bc2e7cd78..4ceb3708fe06 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -17,6 +17,57 @@
#include <linux/cpumask.h>
#include <asm/frame.h>
+static inline unsigned long long paravirt_sched_clock(void)
+{
+ return PVOP_CALL0(unsigned long long, time.sched_clock);
+}
+
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+ return PVOP_CALL1(u64, time.steal_clock, cpu);
+}
+
+/* The paravirtualized I/O functions */
+static inline void slow_down_io(void)
+{
+ pv_ops.cpu.io_delay();
+#ifdef REALLY_SLOW_IO
+ pv_ops.cpu.io_delay();
+ pv_ops.cpu.io_delay();
+ pv_ops.cpu.io_delay();
+#endif
+}
+
+static inline void __flush_tlb(void)
+{
+ PVOP_VCALL0(mmu.flush_tlb_user);
+}
+
+static inline void __flush_tlb_global(void)
+{
+ PVOP_VCALL0(mmu.flush_tlb_kernel);
+}
+
+static inline void __flush_tlb_one_user(unsigned long addr)
+{
+ PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
+}
+
+static inline void flush_tlb_others(const struct cpumask *cpumask,
+ const struct flush_tlb_info *info)
+{
+ PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
+}
+
+static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
+{
+ PVOP_VCALL1(mmu.exit_mmap, mm);
+}
+
#ifdef CONFIG_PARAVIRT_XXL
static inline void load_sp0(unsigned long sp0)
{
@@ -52,7 +103,6 @@ static inline void write_cr0(unsigned long x)
{
PVOP_VCALL1(cpu.write_cr0, x);
}
-#endif
static inline unsigned long read_cr2(void)
{
@@ -74,7 +124,6 @@ static inline void write_cr3(unsigned long x)
PVOP_VCALL1(mmu.write_cr3, x);
}
-#ifdef CONFIG_PARAVIRT_XXL
static inline void __write_cr4(unsigned long x)
{
PVOP_VCALL1(cpu.write_cr4, x);
@@ -172,23 +221,7 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
*p = paravirt_read_msr_safe(msr, &err);
return err;
}
-#endif
-static inline unsigned long long paravirt_sched_clock(void)
-{
- return PVOP_CALL0(unsigned long long, time.sched_clock);
-}
-
-struct static_key;
-extern struct static_key paravirt_steal_enabled;
-extern struct static_key paravirt_steal_rq_enabled;
-
-static inline u64 paravirt_steal_clock(int cpu)
-{
- return PVOP_CALL1(u64, time.steal_clock, cpu);
-}
-
-#ifdef CONFIG_PARAVIRT_XXL
static inline unsigned long long paravirt_read_pmc(int counter)
{
return PVOP_CALL1(u64, cpu.read_pmc, counter);
@@ -267,18 +300,6 @@ static inline void set_iopl_mask(unsigned mask)
{
PVOP_VCALL1(cpu.set_iopl_mask, mask);
}
-#endif
-
-/* The paravirtualized I/O functions */
-static inline void slow_down_io(void)
-{
- pv_ops.cpu.io_delay();
-#ifdef REALLY_SLOW_IO
- pv_ops.cpu.io_delay();
- pv_ops.cpu.io_delay();
- pv_ops.cpu.io_delay();
-#endif
-}
static inline void paravirt_activate_mm(struct mm_struct *prev,
struct mm_struct *next)
@@ -292,30 +313,6 @@ static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
PVOP_VCALL2(mmu.dup_mmap, oldmm, mm);
}
-static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
-{
- PVOP_VCALL1(mmu.exit_mmap, mm);
-}
-
-static inline void __flush_tlb(void)
-{
- PVOP_VCALL0(mmu.flush_tlb_user);
-}
-static inline void __flush_tlb_global(void)
-{
- PVOP_VCALL0(mmu.flush_tlb_kernel);
-}
-static inline void __flush_tlb_one_user(unsigned long addr)
-{
- PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
-}
-
-static inline void flush_tlb_others(const struct cpumask *cpumask,
- const struct flush_tlb_info *info)
-{
- PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
-}
-
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
{
return PVOP_CALL1(int, mmu.pgd_alloc, mm);
@@ -618,7 +615,6 @@ static inline void pmd_clear(pmd_t *pmdp)
}
#endif /* CONFIG_X86_PAE */
-#ifdef CONFIG_PARAVIRT_XXL
#define __HAVE_ARCH_START_CONTEXT_SWITCH
static inline void arch_start_context_switch(struct task_struct *prev)
{
@@ -629,7 +625,6 @@ static inline void arch_end_context_switch(struct task_struct *next)
{
PVOP_VCALL1(cpu.end_context_switch, next);
}
-#endif
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
@@ -652,6 +647,7 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
{
pv_ops.mmu.set_fixmap(idx, phys, flags);
}
+#endif
#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
@@ -937,15 +933,20 @@ extern void default_banner(void);
#endif /* __ASSEMBLY__ */
#else /* CONFIG_PARAVIRT */
# define default_banner x86_init_noop
+#endif /* !CONFIG_PARAVIRT */
+
#ifndef __ASSEMBLY__
+#ifndef CONFIG_PARAVIRT_XXL
static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm,
struct mm_struct *mm)
{
}
+#endif
+#ifndef CONFIG_PARAVIRT
static inline void paravirt_arch_exit_mmap(struct mm_struct *mm)
{
}
+#endif
#endif /* __ASSEMBLY__ */
-#endif /* !CONFIG_PARAVIRT */
#endif /* _ASM_X86_PARAVIRT_H */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index ae53ee36d8fb..0aec1d7f1f6d 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -90,13 +90,14 @@ struct pv_init_ops {
unsigned long addr, unsigned len);
} __no_randomize_layout;
-
+#ifdef CONFIG_PARAVIRT_XXL
struct pv_lazy_ops {
/* Set deferred update mode, used for batching operations. */
void (*enter)(void);
void (*leave)(void);
void (*flush)(void);
} __no_randomize_layout;
+#endif
struct pv_time_ops {
unsigned long long (*sched_clock)(void);
@@ -205,29 +206,28 @@ struct pv_irq_ops {
} __no_randomize_layout;
struct pv_mmu_ops {
+ /* TLB operations */
+ void (*flush_tlb_user)(void);
+ void (*flush_tlb_kernel)(void);
+ void (*flush_tlb_one_user)(unsigned long addr);
+ void (*flush_tlb_others)(const struct cpumask *cpus,
+ const struct flush_tlb_info *info);
+
+ /* Hook for intercepting the destruction of an mm_struct. */
+ void (*exit_mmap)(struct mm_struct *mm);
+
+#ifdef CONFIG_PARAVIRT_XXL
unsigned long (*read_cr2)(void);
void (*write_cr2)(unsigned long);
unsigned long (*read_cr3)(void);
void (*write_cr3)(unsigned long);
- /*
- * Hooks for intercepting the creation/use/destruction of an
- * mm_struct.
- */
+ /* Hooks for intercepting the creation/use of an mm_struct. */
void (*activate_mm)(struct mm_struct *prev,
struct mm_struct *next);
void (*dup_mmap)(struct mm_struct *oldmm,
struct mm_struct *mm);
- void (*exit_mmap)(struct mm_struct *mm);
-
-
- /* TLB operations */
- void (*flush_tlb_user)(void);
- void (*flush_tlb_kernel)(void);
- void (*flush_tlb_one_user)(unsigned long addr);
- void (*flush_tlb_others)(const struct cpumask *cpus,
- const struct flush_tlb_info *info);
/* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm);
@@ -302,6 +302,7 @@ struct pv_mmu_ops {
an mfn. We can tell which is which from the index. */
void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
phys_addr_t phys, pgprot_t flags);
+#endif
} __no_randomize_layout;
struct arch_spinlock;
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index fbd578daa66e..ec7f43327033 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -8,7 +8,7 @@
static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 9ea291fe7107..b9abc525ece3 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -52,9 +52,9 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
extern pmdval_t early_pmd_flags;
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
-#else /* !CONFIG_PARAVIRT */
+#else /* !CONFIG_PARAVIRT_XXL */
#define set_pte(ptep, pte) native_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
@@ -108,9 +108,6 @@ extern pmdval_t early_pmd_flags;
#define pte_val(x) native_pte_val(x)
#define __pte(x) native_make_pte(x)
-#endif /* CONFIG_PARAVIRT */
-
-#ifndef CONFIG_PARAVIRT_XXL
#define arch_end_context_switch(prev) do {} while(0)
#endif /* CONFIG_PARAVIRT_XXL */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 2aa6ce4bf159..43c029cdc3fe 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -141,11 +141,10 @@ static inline unsigned long __read_cr4(void)
return native_read_cr4();
}
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
-#endif
+#else
-#ifndef CONFIG_PARAVIRT_XXL
static inline unsigned long read_cr0(void)
{
return native_read_cr0();
@@ -155,9 +154,7 @@ static inline void write_cr0(unsigned long x)
{
native_write_cr0(x);
}
-#endif
-#ifndef CONFIG_PARAVIRT
static inline unsigned long read_cr2(void)
{
return native_read_cr2();
@@ -181,9 +178,7 @@ static inline void write_cr3(unsigned long x)
{
native_write_cr3(x);
}
-#endif
-#ifndef CONFIG_PARAVIRT_XXL
static inline void __write_cr4(unsigned long x)
{
native_write_cr4(x);
@@ -213,7 +208,7 @@ static inline void load_gs_index(unsigned selector)
#endif
-#endif/* CONFIG_PARAVIRT_XXL */
+#endif /* CONFIG_PARAVIRT_XXL */
static inline void clflush(volatile void *__p)
{
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 5ea1be9d1819..e2e9ad1a2faf 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -70,9 +70,9 @@ void common(void) {
OFFSET(PV_IRQ_irq_enable, paravirt_patch_template, irq.irq_enable);
#ifdef CONFIG_PARAVIRT_XXL
OFFSET(PV_CPU_iret, paravirt_patch_template, cpu.iret);
-#endif
OFFSET(PV_MMU_read_cr2, paravirt_patch_template, mmu.read_cr2);
#endif
+#endif
#ifdef CONFIG_XEN
BLANK();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index e11b96b2dc6b..981fd802830f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -25,14 +25,12 @@
#include <asm/export.h>
#include <asm/nospec-branch.h>
-#ifdef CONFIG_PARAVIRT
+#ifdef CONFIG_PARAVIRT_XXL
#include <asm/asm-offsets.h>
#include <asm/paravirt.h>
#define GET_CR2_INTO(reg) GET_CR2_INTO_RAX ; movq %rax, reg
#else
#define GET_CR2_INTO(reg) movq %cr2, reg
-#endif
-#ifndef CONFIG_PARAVIRT_XXL
#define INTERRUPT_RETURN iretq
#endif
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 4dd12cc15daa..4fbc7899be27 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -385,16 +385,19 @@ struct paravirt_patch_template pv_ops = {
#endif
/* Mmu ops. */
- .mmu.read_cr2 = native_read_cr2,
- .mmu.write_cr2 = native_write_cr2,
- .mmu.read_cr3 = __native_read_cr3,
- .mmu.write_cr3 = native_write_cr3,
-
.mmu.flush_tlb_user = native_flush_tlb,
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_others = native_flush_tlb_others,
+ .mmu.exit_mmap = paravirt_nop,
+
+#ifdef CONFIG_PARAVIRT_XXL
+ .mmu.read_cr2 = native_read_cr2,
+ .mmu.write_cr2 = native_write_cr2,
+ .mmu.read_cr3 = __native_read_cr3,
+ .mmu.write_cr3 = native_write_cr3,
+
.mmu.pgd_alloc = __paravirt_pgd_alloc,
.mmu.pgd_free = paravirt_nop,
@@ -447,7 +450,6 @@ struct paravirt_patch_template pv_ops = {
.mmu.make_pgd = PTE_IDENT,
.mmu.dup_mmap = paravirt_nop,
- .mmu.exit_mmap = paravirt_nop,
.mmu.activate_mm = paravirt_nop,
.mmu.lazy_mode = {
@@ -457,6 +459,7 @@ struct paravirt_patch_template pv_ops = {
},
.mmu.set_fixmap = native_set_fixmap,
+#endif
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
/* Lock ops. */
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 5a20aa56efc0..846b8744d804 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -7,10 +7,10 @@ DEF_NATIVE(irq, restore_fl, "push %eax; popf");
DEF_NATIVE(irq, save_fl, "pushf; pop %eax");
#ifdef CONFIG_PARAVIRT_XXL
DEF_NATIVE(cpu, iret, "iret");
-#endif
DEF_NATIVE(mmu, read_cr2, "mov %cr2, %eax");
DEF_NATIVE(mmu, write_cr3, "mov %eax, %cr3");
DEF_NATIVE(mmu, read_cr3, "mov %cr3, %eax");
+#endif
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
DEF_NATIVE(lock, queued_spin_unlock, "movb $0, (%eax)");
@@ -49,10 +49,10 @@ unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len)
PATCH_SITE(irq, save_fl);
#ifdef CONFIG_PARAVIRT_XXL
PATCH_SITE(cpu, iret);
-#endif
PATCH_SITE(mmu, read_cr2);
PATCH_SITE(mmu, read_cr3);
PATCH_SITE(mmu, write_cr3);
+#endif
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(lock.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 00030a15de35..b61936b98200 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -7,10 +7,10 @@ DEF_NATIVE(irq, irq_disable, "cli");
DEF_NATIVE(irq, irq_enable, "sti");
DEF_NATIVE(irq, restore_fl, "pushq %rdi; popfq");
DEF_NATIVE(irq, save_fl, "pushfq; popq %rax");
+#ifdef CONFIG_PARAVIRT_XXL
DEF_NATIVE(mmu, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(mmu, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(mmu, write_cr3, "movq %rdi, %cr3");
-#ifdef CONFIG_PARAVIRT_XXL
DEF_NATIVE(cpu, wbinvd, "wbinvd");
DEF_NATIVE(cpu, usergs_sysret64, "swapgs; sysretq");
@@ -59,10 +59,10 @@ unsigned native_patch(u8 type, void *ibuf, unsigned long addr, unsigned len)
PATCH_SITE(cpu, usergs_sysret64);
PATCH_SITE(cpu, swapgs);
PATCH_SITE(cpu, wbinvd);
-#endif
PATCH_SITE(mmu, read_cr2);
PATCH_SITE(mmu, read_cr3);
PATCH_SITE(mmu, write_cr3);
+#endif
#if defined(CONFIG_PARAVIRT_SPINLOCKS)
case PARAVIRT_PATCH(lock.queued_spin_unlock):
if (pv_is_native_spin_unlock()) {
--
2.13.7
^ permalink raw reply related
* [PATCH v2 11/11] x86/paravirt: remove unneeded mmu related paravirt ops bits
From: Juergen Gross @ 2018-08-13 7:37 UTC (permalink / raw)
To: linux-kernel, xen-devel, x86, virtualization
Cc: Juergen Gross, boris.ostrovsky, rusty, peterz, mingo, hpa,
akataria, tglx
In-Reply-To: <20180813073739.26108-1-jgross@suse.com>
There is no need to have 32-bit code for CONFIG_PGTABLE_LEVELS >= 4.
Remove it.
Signed-off-by: Juergen Gross <jgross@suse.com>
---
arch/x86/include/asm/paravirt.h | 20 +++-----------------
1 file changed, 3 insertions(+), 17 deletions(-)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 4ceb3708fe06..0499f17ca480 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -501,25 +501,14 @@ static inline pud_t __pud(pudval_t val)
{
pudval_t ret;
- if (sizeof(pudval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pudval_t, mmu.make_pud, val, (u64)val >> 32);
- else
- ret = PVOP_CALLEE1(pudval_t, mmu.make_pud, val);
+ ret = PVOP_CALLEE1(pudval_t, mmu.make_pud, val);
return (pud_t) { ret };
}
static inline pudval_t pud_val(pud_t pud)
{
- pudval_t ret;
-
- if (sizeof(pudval_t) > sizeof(long))
- ret = PVOP_CALLEE2(pudval_t, mmu.pud_val,
- pud.pud, (u64)pud.pud >> 32);
- else
- ret = PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud);
-
- return ret;
+ return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud);
}
static inline void pud_clear(pud_t *pudp)
@@ -531,10 +520,7 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
{
p4dval_t val = native_p4d_val(p4d);
- if (sizeof(p4dval_t) > sizeof(long))
- PVOP_VCALL3(mmu.set_p4d, p4dp, val, (u64)val >> 32);
- else
- PVOP_VCALL2(mmu.set_p4d, p4dp, val);
+ PVOP_VCALL2(mmu.set_p4d, p4dp, val);
}
#if CONFIG_PGTABLE_LEVELS >= 5
--
2.13.7
^ permalink raw reply related
* Re: [PATCH net-next] virtio_net: remove duplicated include from virtio_net.c
From: Michael S. Tsirkin @ 2018-08-13 9:42 UTC (permalink / raw)
To: YueHaibing; +Cc: netdev, virtualization, davem, linux-kernel
In-Reply-To: <20180813061315.9084-1-yuehaibing@huawei.com>
On Mon, Aug 13, 2018 at 02:13:15PM +0800, YueHaibing wrote:
> Remove duplicated include linux/netdevice.h
>
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
> drivers/net/virtio_net.c | 1 -
> 1 file changed, 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index eb00ae6..7659209 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -30,7 +30,6 @@
> #include <linux/cpu.h>
> #include <linux/average.h>
> #include <linux/filter.h>
> -#include <linux/netdevice.h>
> #include <linux/kernel.h>
> #include <linux/pci.h>
> #include <net/route.h>
> --
> 2.7.0
>
^ permalink raw reply
* Re: [PATCH 0/2] Provide init/release functions for struct ttm_bo_global
From: Christian König @ 2018-08-13 10:33 UTC (permalink / raw)
To: Thomas Zimmermann, ray.huang, Jerry.Zhang, dri-devel
Cc: David1.Zhou, thellstrom, nouveau, syeh, airlied, puck.chen,
amd-gfx, virtualization, z.liuxinliang, zourongrong,
kong.kongxinwei, linux-graphics-maintainer, gregkh,
alexander.deucher, bskeggs
In-Reply-To: <20180813102443.12662-1-tzimmermann@suse.de>
Yes, please! I have had cleaning that up on my TODO list for an eternity.
Actually, I never understood why setting up TTM should be the driver's job.
I mean, can't we just have a module_init/module_exit for TTM?
Thanks,
Christian.
Am 13.08.2018 um 12:24 schrieb Thomas Zimmermann:
> TTM uses global memory and BO for backing graphics buffers. These are
> represented by struct ttm_mem_global and struct ttm_bo_global.
>
> Currently, struct ttm_bo_global can only be initialized and released through
> struct ttm_bo_global_ref. This is a workaround for passing an instance of
> ttm_mem_global to the BO global initialization code.
>
> The use of struct ttm_bo_global_ref makes driver code unnecessarily hard to
> understand. At the same time drivers can use any combination of memory and
> BO for initializing the global instances. This can result in subtle bugs
> when the order of initializing and releasing drivers changes.
>
> As a first step for resolving these problems, the provided patch set
> separates initialization and release of struct ttm_bo_global from
> struct ttm_bo_global_ref.
>
> The first patch only renames ttm_bo_global_{init/release}. Hopefully this
> change can be applied at once for all drivers.
>
> Future directions: All TTM-based drivers follow the same pattern for setting
> up the TTM. In a follow-up patch, this code can be moved into a single place
> and shared among drivers.
>
> Thomas Zimmermann (2):
> drm/ttm: Rename ttm_bo_global_{init,release}() to
> ttm_bo_global_ref_*()
> drm/ttm: Provide ttm_bo_global_{init/release}() for struct
> ttm_bo_global
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 +-
> drivers/gpu/drm/ast/ast_ttm.c | 4 +-
> drivers/gpu/drm/bochs/bochs_mm.c | 4 +-
> drivers/gpu/drm/cirrus/cirrus_ttm.c | 4 +-
> drivers/gpu/drm/hisilicon/hibmc/hibmc_ttm.c | 4 +-
> drivers/gpu/drm/mgag200/mgag200_ttm.c | 4 +-
> drivers/gpu/drm/nouveau/nouveau_ttm.c | 4 +-
> drivers/gpu/drm/qxl/qxl_ttm.c | 4 +-
> drivers/gpu/drm/radeon/radeon_ttm.c | 4 +-
> drivers/gpu/drm/ttm/ttm_bo.c | 12 ++---
> drivers/gpu/drm/virtio/virtgpu_ttm.c | 4 +-
> drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c | 4 +-
> drivers/staging/vboxvideo/vbox_ttm.c | 4 +-
> include/drm/ttm/ttm_bo_driver.h | 53 ++++++++++++++++-----
> 14 files changed, 70 insertions(+), 43 deletions(-)
>
> --
> 2.18.0
>
^ permalink raw reply
* Re: [PATCH 0/2] Provide init/release functions for struct ttm_bo_global
From: Thomas Hellstrom @ 2018-08-13 12:54 UTC (permalink / raw)
To: Thomas Zimmermann, Christian König, ray.huang, Jerry.Zhang,
dri-devel
Cc: David1.Zhou, nouveau, syeh, airlied, puck.chen, amd-gfx,
virtualization, z.liuxinliang, zourongrong, kong.kongxinwei,
linux-graphics-maintainer, gregkh, alexander.deucher, bskeggs
In-Reply-To: <87d57d1e-4ce3-cde7-5e05-798b6738ae6c@suse.de>
On 08/13/2018 02:28 PM, Thomas Zimmermann wrote:
> Hi
>
> Am 13.08.2018 um 12:33 schrieb Christian König:
>> Yes, please! I had it on my TODO list to clean that up for an eternity.
> On top of these patches, I have a patch set that provides a single
> init/release interface for TTM global data. I'll post it when the
> current patches have received some feedback.
>
> I'd really like to move the code from drm_global.c back into TTM. It's
> TTM-specific and not useful elsewhere. However, the first commit message
> from 2010 [1] says that some unnamed, external driver uses this code for
> something. Do you know if this still applies?
I'm not sure, actually, and I'm not even sure that the external driver was
ever intended to be compatible with upcoming upstream versions of drm. In
that case I guess whoever maintains it should speak up now...
The drm global stuff was added to make things easier for subsystems that
wanted to register state that was really system-global and not per-device,
and also not used by all drivers. As an example, even if the memory
accounting is currently restricted to TTM, IMO we probably want
something drm-global to prevent a malicious user-space app from consuming
all kernel memory by, for example, repeatedly opening GEM handles until
kernel memory is exhausted. TTM drivers should stop that, but it's pretty
meaningless if they share the system with a non-TTM driver that
considers this "stupid and difficult".
So having said that, I'm not really against ditching the drm_global
stuff since it hasn't found any out-of-ttm usage.
/Thomas
>
> Best regards
> Thomas
>
> [1]
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ba4420c224c2808f2661cf8428f43ceef7a73a4a
>
>> Actually I never understood why that should be driver work to setup TTM?
>>
>> I mean can't we just have a module_init/module_exit for TTM?
>>
>> Thanks,
>> Christian.
>>
>> Am 13.08.2018 um 12:24 schrieb Thomas Zimmermann:
>>> TTM uses global memory and BO for backing graphics buffers. These are
>>> represented by struct ttm_mem_global and struct ttm_bo_global.
>>>
>>> Currently, struct ttm_bo_global can only be initialized and released
>>> through
>>> struct ttm_bo_global_ref. This is a workaround for passing an instance of
>>> ttm_mem_global to the BO global initialization code.
>>>
>>> The use of struct ttm_bo_global_ref makes driver code unnecessarily hard to
>>> understand. At the same time drivers can use any combination of memory
>>> and
>>> BO for initializing the global instances. This can result in subtle bugs
>>> when the order of initializing and releasing drivers changes.
>>>
>>> As a first step for resolving these problems, the provided patch set
>>> separates initialization and release of struct ttm_bo_global from
>>> struct ttm_bo_global_ref.
>>>
>>> The first patch only renames ttm_bo_global_{init/release}. Hopefully this
>>> change can be applied at once for all drivers.
>>>
>>> Future directions: All TTM-based drivers follow the same pattern for
>>> setting
>>> up the TTM. In a follow-up patch, this code can be moved into a single
>>> place
>>> and shared among drivers.
>>>
>>> Thomas Zimmermann (2):
>>> drm/ttm: Rename ttm_bo_global_{init,release}() to
>>> ttm_bo_global_ref_*()
>>> drm/ttm: Provide ttm_bo_global_{init/release}() for struct
>>> ttm_bo_global
>>>
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 4 +-
>>> drivers/gpu/drm/ast/ast_ttm.c | 4 +-
>>> drivers/gpu/drm/bochs/bochs_mm.c | 4 +-
>>> drivers/gpu/drm/cirrus/cirrus_ttm.c | 4 +-
>>> drivers/gpu/drm/hisilicon/hibmc/hibmc_ttm.c | 4 +-
>>> drivers/gpu/drm/mgag200/mgag200_ttm.c | 4 +-
>>> drivers/gpu/drm/nouveau/nouveau_ttm.c | 4 +-
>>> drivers/gpu/drm/qxl/qxl_ttm.c | 4 +-
>>> drivers/gpu/drm/radeon/radeon_ttm.c | 4 +-
>>> drivers/gpu/drm/ttm/ttm_bo.c | 12 ++---
>>> drivers/gpu/drm/virtio/virtgpu_ttm.c | 4 +-
>>> drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c | 4 +-
>>> drivers/staging/vboxvideo/vbox_ttm.c | 4 +-
>>> include/drm/ttm/ttm_bo_driver.h | 53 ++++++++++++++++-----
>>> 14 files changed, 70 insertions(+), 43 deletions(-)
>>>
>>> --
>>> 2.18.0
>>>
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH] drm/virtio: track virtual output state
From: Gerd Hoffmann @ 2018-08-13 15:28 UTC (permalink / raw)
To: dri-devel; +Cc: David Airlie, open list, open list:VIRTIO GPU DRIVER
Track whether a virtual output (crtc) is enabled or disabled.
On atomic updates, check for both the framebuffer being present and the
crtc being enabled to figure out whether the output is active or not.
Signed-off-by: Gerd Hoffmann <kraxel@redhat.com>
---
drivers/gpu/drm/virtio/virtgpu_drv.h | 1 +
drivers/gpu/drm/virtio/virtgpu_display.c | 4 ++++
drivers/gpu/drm/virtio/virtgpu_plane.c | 2 +-
3 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h b/drivers/gpu/drm/virtio/virtgpu_drv.h
index 65605e207b..d46f10e656 100644
--- a/drivers/gpu/drm/virtio/virtgpu_drv.h
+++ b/drivers/gpu/drm/virtio/virtgpu_drv.h
@@ -114,6 +114,7 @@ struct virtio_gpu_output {
struct virtio_gpu_update_cursor cursor;
int cur_x;
int cur_y;
+ bool enabled;
};
#define drm_crtc_to_virtio_gpu_output(x) \
container_of(x, struct virtio_gpu_output, crtc)
diff --git a/drivers/gpu/drm/virtio/virtgpu_display.c b/drivers/gpu/drm/virtio/virtgpu_display.c
index 25503b9335..9f1e0a669d 100644
--- a/drivers/gpu/drm/virtio/virtgpu_display.c
+++ b/drivers/gpu/drm/virtio/virtgpu_display.c
@@ -109,6 +109,9 @@ static void virtio_gpu_crtc_mode_set_nofb(struct drm_crtc *crtc)
static void virtio_gpu_crtc_atomic_enable(struct drm_crtc *crtc,
struct drm_crtc_state *old_state)
{
+ struct virtio_gpu_output *output = drm_crtc_to_virtio_gpu_output(crtc);
+
+ output->enabled = true;
}
static void virtio_gpu_crtc_atomic_disable(struct drm_crtc *crtc,
@@ -119,6 +122,7 @@ static void virtio_gpu_crtc_atomic_disable(struct drm_crtc *crtc,
struct virtio_gpu_output *output = drm_crtc_to_virtio_gpu_output(crtc);
virtio_gpu_cmd_set_scanout(vgdev, output->index, 0, 0, 0, 0, 0);
+ output->enabled = false;
}
static int virtio_gpu_crtc_atomic_check(struct drm_crtc *crtc,
diff --git a/drivers/gpu/drm/virtio/virtgpu_plane.c b/drivers/gpu/drm/virtio/virtgpu_plane.c
index dc5b5b2b7a..88f2fb8c61 100644
--- a/drivers/gpu/drm/virtio/virtgpu_plane.c
+++ b/drivers/gpu/drm/virtio/virtgpu_plane.c
@@ -152,7 +152,7 @@ static void virtio_gpu_primary_plane_update(struct drm_plane *plane,
if (WARN_ON(!output))
return;
- if (plane->state->fb) {
+ if (plane->state->fb && output->enabled) {
vgfb = to_virtio_gpu_framebuffer(plane->state->fb);
bo = gem_to_virtio_gpu_obj(vgfb->base.obj[0]);
handle = bo->hw_res_handle;
--
2.9.3
^ permalink raw reply related
* Re: [PATCH net-next] virtio_net: remove duplicated include from virtio_net.c
From: David Miller @ 2018-08-13 16:46 UTC (permalink / raw)
To: yuehaibing; +Cc: netdev, virtualization, linux-kernel, mst
In-Reply-To: <20180813061315.9084-1-yuehaibing@huawei.com>
From: YueHaibing <yuehaibing@huawei.com>
Date: Mon, 13 Aug 2018 14:13:15 +0800
> Remove duplicated include linux/netdevice.h
>
> Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Applied.
^ permalink raw reply
* Re: [PATCH 1/1] vhost: change the signature of __vhost_get_user_slow()
From: Jason Wang @ 2018-08-15 3:22 UTC (permalink / raw)
To: Dongli Zhang, kvm, netdev; +Cc: virtualization, linux-kernel, mst
In-Reply-To: <1534297600-2577-1-git-send-email-dongli.zhang@oracle.com>
On 2018年08月15日 09:46, Dongli Zhang wrote:
> Remove 'type' from the signature of __vhost_get_user_slow() as it is not
> used.
>
> Signed-off-by: Dongli Zhang <dongli.zhang@oracle.com>
> ---
> drivers/vhost/vhost.c | 5 ++---
> 1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index ed31145..f78d3bc 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -807,8 +807,7 @@ static int vhost_copy_from_user(struct vhost_virtqueue *vq, void *to,
> }
>
> static void __user *__vhost_get_user_slow(struct vhost_virtqueue *vq,
> - void __user *addr, unsigned int size,
> - int type)
> + void __user *addr, unsigned int size)
> {
> int ret;
>
> @@ -846,7 +845,7 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
> if (uaddr)
> return uaddr;
>
> - return __vhost_get_user_slow(vq, addr, size, type);
> + return __vhost_get_user_slow(vq, addr, size);
> }
>
> #define vhost_put_user(vq, x, ptr) \
Please keep this as is.
It will be used by incoming packed virtqueue.
Thanks
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH RESEND] x86-64: use RIP-relative calls for paravirt indirect ones
From: Jan Beulich @ 2018-08-15 8:09 UTC (permalink / raw)
To: the arch/x86 maintainers, Juergen Gross, Alok Kataria
Cc: hpa, linux-kernel, Linux Virtualization
In-Reply-To: <5B30C3F102000078001CD6F1@prv1-mh.provo.novell.com>
This saves one instruction byte per instance, adding up to a saving of
over 4k in my (stripped down) configuration. None of the replacement code
variants that get patched in relies on the one-byte-larger size.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Juergen Gross <jgross@suse.com>
---
Resend to include x86 maintainers, who aren't listed explicitly for the
file changed.
---
arch/x86/include/asm/paravirt_types.h | 6 ++++++
1 file changed, 6 insertions(+)
--- 4.18/arch/x86/include/asm/paravirt_types.h
+++ 4.18-x86_64-pvops-call-RIPrel/arch/x86/include/asm/paravirt_types.h
@@ -393,9 +393,15 @@ int paravirt_disable_iospace(void);
* offset into the paravirt_patch_template structure, and can therefore be
* freely converted back into a structure offset.
*/
+#ifdef CONFIG_X86_32
#define PARAVIRT_CALL \
ANNOTATE_RETPOLINE_SAFE \
"call *%c[paravirt_opptr];"
+#else
+#define PARAVIRT_CALL \
+ ANNOTATE_RETPOLINE_SAFE \
+ "call *%c[paravirt_opptr](%%rip);"
+#endif
/*
* These macros are intended to wrap calls through one of the paravirt
^ permalink raw reply
* [PATCH v4 0/3] virtio-balloon: some improvements
From: Wei Wang @ 2018-08-16 7:50 UTC (permalink / raw)
To: virtio-dev, linux-kernel, virtualization, linux-mm, mst, mhocko,
akpm, penguin-kernel
This series is split from the "Virtio-balloon: support free page
reporting" series to make some improvements.
ChangeLog:
v3->v4:
- use kzalloc to allocate the vb struct so that we don't need to zero
initialize each field one by one later;
- also remove vb->shrinker.batch = 0, which is not needed now.
v2->v3:
- shrink the balloon pages according to the amount requested by the
claimer, instead of using a user specified number;
v1->v2:
- register the shrinker when VIRTIO_BALLOON_F_DEFLATE_ON_OOM is
negotiated.
Wei Wang (3):
virtio-balloon: remove BUG() in init_vqs
virtio-balloon: kzalloc the vb struct
virtio_balloon: replace oom notifier with shrinker
drivers/virtio/virtio_balloon.c | 125 +++++++++++++++++++++-------------------
1 file changed, 67 insertions(+), 58 deletions(-)
--
2.7.4
^ permalink raw reply
* [PATCH v4 1/3] virtio-balloon: remove BUG() in init_vqs
From: Wei Wang @ 2018-08-16 7:50 UTC (permalink / raw)
To: virtio-dev, linux-kernel, virtualization, linux-mm, mst, mhocko,
akpm, penguin-kernel
In-Reply-To: <1534405858-27085-1-git-send-email-wei.w.wang@intel.com>
It's a bit overkill to use BUG when failing to add an entry to the
stats_vq in init_vqs. So remove it and just return the error to the
caller to bail out nicely.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
---
drivers/virtio/virtio_balloon.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 3988c09..8100e77 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -455,9 +455,13 @@ static int init_vqs(struct virtio_balloon *vb)
num_stats = update_balloon_stats(vb);
sg_init_one(&sg, vb->stats, sizeof(vb->stats[0]) * num_stats);
- if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
- < 0)
- BUG();
+ err = virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb,
+ GFP_KERNEL);
+ if (err) {
+ dev_warn(&vb->vdev->dev, "%s: add stat_vq failed\n",
+ __func__);
+ return err;
+ }
virtqueue_kick(vb->stats_vq);
}
return 0;
--
2.7.4
^ permalink raw reply related
* [PATCH v4 2/3] virtio-balloon: kzalloc the vb struct
From: Wei Wang @ 2018-08-16 7:50 UTC (permalink / raw)
To: virtio-dev, linux-kernel, virtualization, linux-mm, mst, mhocko,
akpm, penguin-kernel
In-Reply-To: <1534405858-27085-1-git-send-email-wei.w.wang@intel.com>
Zero all the vb fields at allocation, so that we don't need to
zero-initialize each field one by one later.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
drivers/virtio/virtio_balloon.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8100e77..d97d73c 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -561,7 +561,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
return -EINVAL;
}
- vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL);
+ vdev->priv = vb = kzalloc(sizeof(*vb), GFP_KERNEL);
if (!vb) {
err = -ENOMEM;
goto out;
@@ -570,8 +570,6 @@ static int virtballoon_probe(struct virtio_device *vdev)
INIT_WORK(&vb->update_balloon_stats_work, update_balloon_stats_func);
INIT_WORK(&vb->update_balloon_size_work, update_balloon_size_func);
spin_lock_init(&vb->stop_update_lock);
- vb->stop_update = false;
- vb->num_pages = 0;
mutex_init(&vb->balloon_lock);
init_waitqueue_head(&vb->acked);
vb->vdev = vdev;
@@ -602,7 +600,6 @@ static int virtballoon_probe(struct virtio_device *vdev)
err = PTR_ERR(vb->vb_dev_info.inode);
kern_unmount(balloon_mnt);
unregister_oom_notifier(&vb->nb);
- vb->vb_dev_info.inode = NULL;
goto out_del_vqs;
}
vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
--
2.7.4
^ permalink raw reply related
* [PATCH v4 3/3] virtio_balloon: replace oom notifier with shrinker
From: Wei Wang @ 2018-08-16 7:50 UTC (permalink / raw)
To: virtio-dev, linux-kernel, virtualization, linux-mm, mst, mhocko,
akpm, penguin-kernel
In-Reply-To: <1534405858-27085-1-git-send-email-wei.w.wang@intel.com>
The OOM notifier is being deprecated for the following reasons:
- As a callout from the oom context, it is too subtle and easy to
generate bugs and corner cases which are hard to track;
- It is called too late (after the reclaiming has been performed).
Drivers with a large amount of reclaimable memory are expected to
release it at an early stage of memory pressure;
- The notifier callback isn't aware of OOM constraints;
Link: https://lkml.org/lkml/2018/7/12/314
This patch replaces the virtio-balloon oom notifier with a shrinker
to release balloon pages on memory pressure. The balloon pages are
given back to mm adaptively by returning the number of pages that the
reclaimer is asking for (i.e. sc->nr_to_scan).
Currently the max possible value of sc->nr_to_scan passed to the balloon
shrinker is SHRINK_BATCH, which is 128. This is smaller than the
limitation that only VIRTIO_BALLOON_ARRAY_PFNS_MAX (256) pages can be
returned via one invocation of leak_balloon. But this patch still
considers the case that SHRINK_BATCH or shrinker->batch could be changed
to a value larger than VIRTIO_BALLOON_ARRAY_PFNS_MAX, which will need to
do multiple invocations of leak_balloon.
Historically, the feature VIRTIO_BALLOON_F_DEFLATE_ON_OOM has been used
to release balloon pages on OOM. We continue to use this feature bit for
the shrinker, so the shrinker is only registered when this feature bit
has been negotiated with host.
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
---
drivers/virtio/virtio_balloon.c | 110 +++++++++++++++++++++-------------------
1 file changed, 59 insertions(+), 51 deletions(-)
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index d97d73c..d1c1f62 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -27,7 +27,6 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/balloon_compaction.h>
-#include <linux/oom.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/mount.h>
@@ -40,13 +39,8 @@
*/
#define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
#define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256
-#define OOM_VBALLOON_DEFAULT_PAGES 256
#define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
-static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
-module_param(oom_pages, int, S_IRUSR | S_IWUSR);
-MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
-
#ifdef CONFIG_BALLOON_COMPACTION
static struct vfsmount *balloon_mnt;
#endif
@@ -86,8 +80,8 @@ struct virtio_balloon {
/* Memory statistics */
struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
- /* To register callback in oom notifier call chain */
- struct notifier_block nb;
+ /* To register a shrinker to shrink memory upon memory pressure */
+ struct shrinker shrinker;
};
static struct virtio_device_id id_table[] = {
@@ -365,38 +359,6 @@ static void update_balloon_size(struct virtio_balloon *vb)
&actual);
}
-/*
- * virtballoon_oom_notify - release pages when system is under severe
- * memory pressure (called from out_of_memory())
- * @self : notifier block struct
- * @dummy: not used
- * @parm : returned - number of freed pages
- *
- * The balancing of memory by use of the virtio balloon should not cause
- * the termination of processes while there are pages in the balloon.
- * If virtio balloon manages to release some memory, it will make the
- * system return and retry the allocation that forced the OOM killer
- * to run.
- */
-static int virtballoon_oom_notify(struct notifier_block *self,
- unsigned long dummy, void *parm)
-{
- struct virtio_balloon *vb;
- unsigned long *freed;
- unsigned num_freed_pages;
-
- vb = container_of(self, struct virtio_balloon, nb);
- if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
- return NOTIFY_OK;
-
- freed = parm;
- num_freed_pages = leak_balloon(vb, oom_pages);
- update_balloon_size(vb);
- *freed += num_freed_pages;
-
- return NOTIFY_OK;
-}
-
static void update_balloon_stats_func(struct work_struct *work)
{
struct virtio_balloon *vb;
@@ -550,6 +512,52 @@ static struct file_system_type balloon_fs = {
#endif /* CONFIG_BALLOON_COMPACTION */
+static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long pages_to_free, pages_freed = 0;
+ struct virtio_balloon *vb = container_of(shrinker,
+ struct virtio_balloon, shrinker);
+
+ pages_to_free = sc->nr_to_scan * VIRTIO_BALLOON_PAGES_PER_PAGE;
+
+ /*
+ * One invocation of leak_balloon can deflate at most
+ * VIRTIO_BALLOON_ARRAY_PFNS_MAX balloon pages, so we call it
+ * multiple times to deflate pages till reaching pages_to_free.
+ */
+ while (vb->num_pages && pages_to_free) {
+ pages_to_free -= pages_freed;
+ pages_freed += leak_balloon(vb, pages_to_free);
+ }
+ update_balloon_size(vb);
+
+ return pages_freed / VIRTIO_BALLOON_PAGES_PER_PAGE;
+}
+
+static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ struct virtio_balloon *vb = container_of(shrinker,
+ struct virtio_balloon, shrinker);
+
+ return vb->num_pages / VIRTIO_BALLOON_PAGES_PER_PAGE;
+}
+
+static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb)
+{
+ unregister_shrinker(&vb->shrinker);
+}
+
+static int virtio_balloon_register_shrinker(struct virtio_balloon *vb)
+{
+ vb->shrinker.scan_objects = virtio_balloon_shrinker_scan;
+ vb->shrinker.count_objects = virtio_balloon_shrinker_count;
+ vb->shrinker.seeks = DEFAULT_SEEKS;
+
+ return register_shrinker(&vb->shrinker);
+}
+
static int virtballoon_probe(struct virtio_device *vdev)
{
struct virtio_balloon *vb;
@@ -580,17 +588,10 @@ static int virtballoon_probe(struct virtio_device *vdev)
if (err)
goto out_free_vb;
- vb->nb.notifier_call = virtballoon_oom_notify;
- vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
- err = register_oom_notifier(&vb->nb);
- if (err < 0)
- goto out_del_vqs;
-
#ifdef CONFIG_BALLOON_COMPACTION
balloon_mnt = kern_mount(&balloon_fs);
if (IS_ERR(balloon_mnt)) {
err = PTR_ERR(balloon_mnt);
- unregister_oom_notifier(&vb->nb);
goto out_del_vqs;
}
@@ -599,12 +600,19 @@ static int virtballoon_probe(struct virtio_device *vdev)
if (IS_ERR(vb->vb_dev_info.inode)) {
err = PTR_ERR(vb->vb_dev_info.inode);
kern_unmount(balloon_mnt);
- unregister_oom_notifier(&vb->nb);
goto out_del_vqs;
}
vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
#endif
-
+ /*
+ * We continue to use VIRTIO_BALLOON_F_DEFLATE_ON_OOM to decide if a
+ * shrinker needs to be registered to relieve memory pressure.
+ */
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM)) {
+ err = virtio_balloon_register_shrinker(vb);
+ if (err)
+ goto out_del_vqs;
+ }
virtio_device_ready(vdev);
if (towards_target(vb))
@@ -636,8 +644,8 @@ static void virtballoon_remove(struct virtio_device *vdev)
{
struct virtio_balloon *vb = vdev->priv;
- unregister_oom_notifier(&vb->nb);
-
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
+ virtio_balloon_unregister_shrinker(vb);
spin_lock_irq(&vb->stop_update_lock);
vb->stop_update = true;
spin_unlock_irq(&vb->stop_update_lock);
--
2.7.4
^ permalink raw reply related
* Call for Papers - ICITS'19 - Quito, Ecuador
From: Maria Lemos @ 2018-08-18 11:02 UTC (permalink / raw)
To: virtualization
[-- Attachment #1.1: Type: text/plain, Size: 6043 bytes --]
***** Proceedings by Springer. Indexed by Scopus, ISI, etc.
------------
ICITS'19 - The 2019 International Conference on Information Technology & Systems
6 - 8 February 2019, Quito, Ecuador
http://www.icits.me/ <http://www.icits.me/>
------------------------------ ------------------------------ ------------------------------ ------------------------
ICITS'19 - The 2019 International Conference on Information Technology & Systems, to be held at Quito, Ecuador, 6 - 8 February 2019, is an international forum for researchers and practitioners to present and discuss the most recent innovations, trends, results, experiences and concerns in the several perspectives of Information Technology & Systems.
We are pleased to invite you to submit your papers to ICITS'19. They can be written in English, Spanish or Portuguese. All submissions will be reviewed on the basis of relevance, originality, importance and clarity.
Topics
Submitted papers should be related with one or more of the main themes proposed for the Conference:
A) Information and Knowledge Management (IKM);
B) Organizational Models and Information Systems (OMIS);
C) Software and Systems Modeling (SSM);
D) Software Systems, Architectures, Applications and Tools (SSAAT);
E) Multimedia Systems and Applications (MSA);
F) Computer Networks, Mobility and Pervasive Systems (CNMPS);
G) Intelligent and Decision Support Systems (IDSS);
H) Big Data Analytics and Applications (BDAA);
I) Human-Computer Interaction (HCI);
J) Ethics, Computers and Security (ECS)
K) Health Informatics (HIS);
L) Information Technologies in Education (ITE);
M) Cybersecurity and Cyber-defense;
N) Electromagnetics, Sensors and Antennas for Security.
Submission and Decision
Submitted papers written in English (until 10-page limit) must comply with the format of Advances in Intelligent Systems and Computing series (see Instructions for Authors at Springer Website <http://www.springer.com/series/11156> or download a DOC example <http://www.icits.me/springerformat.doc>), must not have been published before, not be under review for any other conference or publication and not include any information leading to the authors’ identification. Therefore, the authors’ names, affiliations and bibliographic references should not be included in the version for evaluation by the Scientific Committee. This information should only be included in the camera-ready version, saved in Word or Latex format and also in PDF format. These files must be accompanied by the Consent to Publish form <http://www.icits.me/copyright.pdf> filled out, in a ZIP file, and uploaded at the conference management system.
Submitted papers written in Spanish or Portuguese (until 15-page limit) must comply with the format of RISTI <http://www.risti.xyz/> - Revista Ibérica de Sistemas e Tecnologias de Informação (download instructions/template for authors in Spanish <http://www.risti.xyz/formato-es.doc> or Portuguese <http://www.risti.xyz/formato-pt.doc>), must not have been published before, not be under review for any other conference or publication and not include any information leading to the authors’ identification. Therefore, the authors’ names, affiliations and bibliographic references should not be included in the version for evaluation by the Scientific Committee. This information should only be included in the camera-ready version, saved in Word. These files must be uploaded at the conference management system in a ZIP file.
All papers will be subjected to a “double-blind review” by at least two members of the Scientific Committee.
Based on Scientific Committee evaluation, a paper can be rejected or accepted by the Conference Chairs. In the latter case, it can be accepted as a paper or a poster.
The authors of papers accepted as posters must build and print a poster to be exhibited during the Conference. This poster must follow an A1 or A2 vertical format. The Conference may include Work Sessions where these posters are presented and orally discussed, with a 7-minute limit per poster.
The authors of accepted papers will have 15 minutes to present their work in a Conference Work Session; approximately 5 minutes of discussion will follow each presentation.
Publication and Indexing
To ensure that an accepted paper is published, at least one of the authors must be fully registered by the 9th of October 2018, and the paper must comply with the suggested layout and page-limit. Additionally, all recommended changes must be addressed by the authors before they submit the camera-ready version.
No more than one paper per registration will be published. An extra fee must be paid for publication of additional papers, with a maximum of one additional paper per registration. One registration permits only the participation of one author in the conference.
Papers written in English and accepted and registered will be published in Proceedings by Springer, in a book of the Advances in Intelligent Systems and Computing <http://www.springer.com/series/11156>series, will be submitted for indexation by ISI, EI-Compendex, SCOPUS and DBLP, among others, and will be available in the SpringerLink Digital Library <http://link.springer.com/>.
Papers written in Spanish or Portuguese and accepted and registered will be published in a Special Issue of RISTI <http://www.risti.xyz/index.php?option=com_content&view=article&id=3&Itemid=104&lang=es> and will be submitted for indexation by SCOPUS, among others.
Important Dates
Paper Submission: September 16, 2018
Notification of Acceptance: October 28, 2018
Payment of Registration, to ensure the inclusion of an accepted paper in the conference proceedings: November 9, 2018.
Camera-ready Submission: November 9, 2018
Website of ICITS'19: http://www.icits.me/ <http://www.icits.me/>
---
This email has been checked for viruses by AVG.
https://www.avg.com
[-- Attachment #1.2: Type: text/html, Size: 10805 bytes --]
[-- Attachment #2: Type: text/plain, Size: 183 bytes --]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* [PATCH net-next v8 0/7] net: vhost: improve performance when enable busyloop
From: xiangxia.m.yue @ 2018-08-19 12:11 UTC (permalink / raw)
To: jasowang, mst, makita.toshiaki; +Cc: netdev, virtualization
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
This patch series improves the guest receive performance.
On the handle_tx side, we poll the sock receive queue
at the same time. handle_rx does that in the same way.
For more performance reports, see patches 4, 6 and 7.
Tonghao Zhang (7):
net: vhost: lock the vqs one by one
net: vhost: replace magic number of lock annotation
net: vhost: factor out busy polling logic to vhost_net_busy_poll()
net: vhost: add rx busy polling in tx path
net: vhost: introduce bitmap for vhost_poll
net: vhost: disable rx wakeup during tx busypoll
net: vhost: make busyloop_intr more accurate
drivers/vhost/net.c | 169 +++++++++++++++++++++++++++++++-------------------
drivers/vhost/vhost.c | 41 ++++++------
drivers/vhost/vhost.h | 7 ++-
3 files changed, 133 insertions(+), 84 deletions(-)
--
1.8.3.1
^ permalink raw reply
* [PATCH net-next v8 1/7] net: vhost: lock the vqs one by one
From: xiangxia.m.yue @ 2018-08-19 12:11 UTC (permalink / raw)
To: jasowang, mst, makita.toshiaki; +Cc: netdev, virtualization
In-Reply-To: <1534680686-3108-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
This patch changes the way the vqs are locked: instead of
locking all of them at the same time, lock them one by one.
This will be used by the next patch to avoid a deadlock.
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/vhost.c | 24 +++++++-----------------
1 file changed, 7 insertions(+), 17 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a502f1a..a1c06e7 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
{
int i;
- for (i = 0; i < d->nvqs; ++i)
+ for (i = 0; i < d->nvqs; ++i) {
+ mutex_lock(&d->vqs[i]->mutex);
__vhost_vq_meta_reset(d->vqs[i]);
+ mutex_unlock(&d->vqs[i]->mutex);
+ }
}
static void vhost_vq_reset(struct vhost_dev *dev,
@@ -890,20 +893,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
#define vhost_get_used(vq, x, ptr) \
vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
-static void vhost_dev_lock_vqs(struct vhost_dev *d)
-{
- int i = 0;
- for (i = 0; i < d->nvqs; ++i)
- mutex_lock_nested(&d->vqs[i]->mutex, i);
-}
-
-static void vhost_dev_unlock_vqs(struct vhost_dev *d)
-{
- int i = 0;
- for (i = 0; i < d->nvqs; ++i)
- mutex_unlock(&d->vqs[i]->mutex);
-}
-
static int vhost_new_umem_range(struct vhost_umem *umem,
u64 start, u64 size, u64 end,
u64 userspace_addr, int perm)
@@ -953,7 +942,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
if (msg->iova <= vq_msg->iova &&
msg->iova + msg->size - 1 > vq_msg->iova &&
vq_msg->type == VHOST_IOTLB_MISS) {
+ mutex_lock(&node->vq->mutex);
vhost_poll_queue(&node->vq->poll);
+ mutex_unlock(&node->vq->mutex);
+
list_del(&node->node);
kfree(node);
}
@@ -985,7 +977,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
int ret = 0;
mutex_lock(&dev->mutex);
- vhost_dev_lock_vqs(dev);
switch (msg->type) {
case VHOST_IOTLB_UPDATE:
if (!dev->iotlb) {
@@ -1019,7 +1010,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
break;
}
- vhost_dev_unlock_vqs(dev);
mutex_unlock(&dev->mutex);
return ret;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v8 2/7] net: vhost: replace magic number of lock annotation
From: xiangxia.m.yue @ 2018-08-19 12:11 UTC (permalink / raw)
To: jasowang, mst, makita.toshiaki; +Cc: netdev, virtualization
In-Reply-To: <1534680686-3108-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Use the VHOST_NET_VQ_XXX as a subclass for mutex_lock_nested.
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 367d802..32c1b52 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -712,7 +712,7 @@ static void handle_tx(struct vhost_net *net)
struct vhost_virtqueue *vq = &nvq->vq;
struct socket *sock;
- mutex_lock(&vq->mutex);
+ mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
sock = vq->private_data;
if (!sock)
goto out;
@@ -777,7 +777,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
/* Flush batched heads first */
vhost_net_signal_used(rnvq);
/* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&tvq->mutex, 1);
+ mutex_lock_nested(&tvq->mutex, VHOST_NET_VQ_TX);
vhost_disable_notify(&net->dev, tvq);
preempt_disable();
@@ -919,7 +919,7 @@ static void handle_rx(struct vhost_net *net)
__virtio16 num_buffers;
int recv_pkts = 0;
- mutex_lock_nested(&vq->mutex, 0);
+ mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
sock = vq->private_data;
if (!sock)
goto out;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v8 3/7] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: xiangxia.m.yue @ 2018-08-19 12:11 UTC (permalink / raw)
To: jasowang, mst, makita.toshiaki; +Cc: netdev, virtualization
In-Reply-To: <1534680686-3108-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Factor out the generic busy polling logic, which will be
used in the tx path in the next patch. With this patch,
qemu can set a different busyloop_timeout for the rx queue.
To avoid duplicated code, introduce the helper functions:
* sock_has_rx_data(changed from sk_has_rx_data)
* vhost_net_busy_poll_try_queue
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
drivers/vhost/net.c | 111 +++++++++++++++++++++++++++++++++-------------------
1 file changed, 71 insertions(+), 40 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 32c1b52..453c061 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -440,6 +440,75 @@ static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq)
nvq->done_idx = 0;
}
+static int sock_has_rx_data(struct socket *sock)
+{
+ if (unlikely(!sock))
+ return 0;
+
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
+ return skb_queue_empty(&sock->sk->sk_receive_queue);
+}
+
+static void vhost_net_busy_poll_try_queue(struct vhost_net *net,
+ struct vhost_virtqueue *vq)
+{
+ if (!vhost_vq_avail_empty(&net->dev, vq)) {
+ vhost_poll_queue(&vq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
+ vhost_disable_notify(&net->dev, vq);
+ vhost_poll_queue(&vq->poll);
+ }
+}
+
+static void vhost_net_busy_poll(struct vhost_net *net,
+ struct vhost_virtqueue *rvq,
+ struct vhost_virtqueue *tvq,
+ bool *busyloop_intr,
+ bool poll_rx)
+{
+ unsigned long busyloop_timeout;
+ unsigned long endtime;
+ struct socket *sock;
+ struct vhost_virtqueue *vq = poll_rx ? tvq : rvq;
+
+ mutex_lock_nested(&vq->mutex, poll_rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
+ vhost_disable_notify(&net->dev, vq);
+ sock = rvq->private_data;
+
+ busyloop_timeout = poll_rx ? rvq->busyloop_timeout:
+ tvq->busyloop_timeout;
+
+ preempt_disable();
+ endtime = busy_clock() + busyloop_timeout;
+
+ while (vhost_can_busy_poll(endtime)) {
+ if (vhost_has_work(&net->dev)) {
+ *busyloop_intr = true;
+ break;
+ }
+
+ if ((sock_has_rx_data(sock) &&
+ !vhost_vq_avail_empty(&net->dev, rvq)) ||
+ !vhost_vq_avail_empty(&net->dev, tvq))
+ break;
+
+ cpu_relax();
+ }
+
+ preempt_enable();
+
+ if (poll_rx)
+ vhost_net_busy_poll_try_queue(net, tvq);
+ else if (sock_has_rx_data(sock))
+ vhost_net_busy_poll_try_queue(net, rvq);
+ else /* On tx here, sock has no rx data. */
+ vhost_enable_notify(&net->dev, rvq);
+
+ mutex_unlock(&vq->mutex);
+}
+
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_net_virtqueue *nvq,
unsigned int *out_num, unsigned int *in_num,
@@ -753,16 +822,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
return len;
}
-static int sk_has_rx_data(struct sock *sk)
-{
- struct socket *sock = sk->sk_socket;
-
- if (sock->ops->peek_len)
- return sock->ops->peek_len(sock);
-
- return skb_queue_empty(&sk->sk_receive_queue);
-}
-
static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
bool *busyloop_intr)
{
@@ -770,41 +829,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *rvq = &rnvq->vq;
struct vhost_virtqueue *tvq = &tnvq->vq;
- unsigned long uninitialized_var(endtime);
int len = peek_head_len(rnvq, sk);
- if (!len && tvq->busyloop_timeout) {
+ if (!len && rvq->busyloop_timeout) {
/* Flush batched heads first */
vhost_net_signal_used(rnvq);
/* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&tvq->mutex, VHOST_NET_VQ_TX);
- vhost_disable_notify(&net->dev, tvq);
-
- preempt_disable();
- endtime = busy_clock() + tvq->busyloop_timeout;
-
- while (vhost_can_busy_poll(endtime)) {
- if (vhost_has_work(&net->dev)) {
- *busyloop_intr = true;
- break;
- }
- if ((sk_has_rx_data(sk) &&
- !vhost_vq_avail_empty(&net->dev, rvq)) ||
- !vhost_vq_avail_empty(&net->dev, tvq))
- break;
- cpu_relax();
- }
-
- preempt_enable();
-
- if (!vhost_vq_avail_empty(&net->dev, tvq)) {
- vhost_poll_queue(&tvq->poll);
- } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
- vhost_disable_notify(&net->dev, tvq);
- vhost_poll_queue(&tvq->poll);
- }
-
- mutex_unlock(&tvq->mutex);
+ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
len = peek_head_len(rnvq, sk);
}
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v8 4/7] net: vhost: add rx busy polling in tx path
From: xiangxia.m.yue @ 2018-08-19 12:11 UTC (permalink / raw)
To: jasowang, mst, makita.toshiaki; +Cc: netdev, virtualization
In-Reply-To: <1534680686-3108-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
This patch improves the guest receive performance.
On the handle_tx side, we poll the sock receive queue at the
same time. handle_rx does that in the same way.
We set poll-us=100us and use netperf to test throughput
and mean latency. When running the tests, the vhost-net kthread
of that VM is always at 100% CPU. The commands are shown below.
Rx performance is greatly improved by this patch. There is no
notable performance change on tx with this series, though. This
patch is useful for bi-directional traffic.
netperf -H IP -t TCP_STREAM -l 20 -- -O "THROUGHPUT, THROUGHPUT_UNITS, MEAN_LATENCY"
Topology:
[Host] ->linux bridge -> tap vhost-net ->[Guest]
TCP_STREAM:
* Without the patch: 19842.95 Mbps, 6.50 us mean latency
* With the patch: 37598.20 Mbps, 3.43 us mean latency
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
drivers/vhost/net.c | 33 +++++++++++++--------------------
1 file changed, 13 insertions(+), 20 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 453c061..1eff72d 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -510,31 +510,24 @@ static void vhost_net_busy_poll(struct vhost_net *net,
}
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
- struct vhost_net_virtqueue *nvq,
+ struct vhost_net_virtqueue *tnvq,
unsigned int *out_num, unsigned int *in_num,
bool *busyloop_intr)
{
- struct vhost_virtqueue *vq = &nvq->vq;
- unsigned long uninitialized_var(endtime);
- int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+ struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
+ struct vhost_virtqueue *rvq = &rnvq->vq;
+ struct vhost_virtqueue *tvq = &tnvq->vq;
+
+ int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
out_num, in_num, NULL, NULL);
- if (r == vq->num && vq->busyloop_timeout) {
- if (!vhost_sock_zcopy(vq->private_data))
- vhost_net_signal_used(nvq);
- preempt_disable();
- endtime = busy_clock() + vq->busyloop_timeout;
- while (vhost_can_busy_poll(endtime)) {
- if (vhost_has_work(vq->dev)) {
- *busyloop_intr = true;
- break;
- }
- if (!vhost_vq_avail_empty(vq->dev, vq))
- break;
- cpu_relax();
- }
- preempt_enable();
- r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+ if (r == tvq->num && tvq->busyloop_timeout) {
+ if (!vhost_sock_zcopy(tvq->private_data))
+ vhost_net_signal_used(tnvq);
+
+ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
+
+ r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
out_num, in_num, NULL, NULL);
}
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v8 5/7] net: vhost: introduce bitmap for vhost_poll
From: xiangxia.m.yue @ 2018-08-19 12:11 UTC (permalink / raw)
To: jasowang, mst, makita.toshiaki; +Cc: netdev, virtualization
In-Reply-To: <1534680686-3108-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
The bitmap of vhost_dev can help us to check if the
specified poll is scheduled. This patch will be used
by the next two patches.
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
drivers/vhost/net.c | 11 +++++++++--
drivers/vhost/vhost.c | 17 +++++++++++++++--
drivers/vhost/vhost.h | 7 ++++++-
3 files changed, 30 insertions(+), 5 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 1eff72d..23d7ffc 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1135,8 +1135,15 @@ static int vhost_net_open(struct inode *inode, struct file *f)
}
vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
- vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev);
- vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
+ vhost_poll_init(n->poll + VHOST_NET_VQ_TX,
+ handle_tx_net,
+ VHOST_NET_VQ_TX,
+ EPOLLOUT, dev);
+
+ vhost_poll_init(n->poll + VHOST_NET_VQ_RX,
+ handle_rx_net,
+ VHOST_NET_VQ_RX,
+ EPOLLIN, dev);
f->private_data = n;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a1c06e7..dc88a60 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -186,7 +186,7 @@ void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn)
/* Init poll structure */
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
- __poll_t mask, struct vhost_dev *dev)
+ __u8 poll_id, __poll_t mask, struct vhost_dev *dev)
{
init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
init_poll_funcptr(&poll->table, vhost_poll_func);
@@ -194,6 +194,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
poll->dev = dev;
poll->wqh = NULL;
+ poll->poll_id = poll_id;
vhost_work_init(&poll->work, fn);
}
EXPORT_SYMBOL_GPL(vhost_poll_init);
@@ -276,8 +277,16 @@ bool vhost_has_work(struct vhost_dev *dev)
}
EXPORT_SYMBOL_GPL(vhost_has_work);
+bool vhost_has_work_pending(struct vhost_dev *dev, int poll_id)
+{
+ return !llist_empty(&dev->work_list) &&
+ test_bit(poll_id, dev->work_pending);
+}
+EXPORT_SYMBOL_GPL(vhost_has_work_pending);
+
void vhost_poll_queue(struct vhost_poll *poll)
{
+ set_bit(poll->poll_id, poll->dev->work_pending);
vhost_work_queue(poll->dev, &poll->work);
}
EXPORT_SYMBOL_GPL(vhost_poll_queue);
@@ -354,6 +363,7 @@ static int vhost_worker(void *data)
if (!node)
schedule();
+ bitmap_zero(dev->work_pending, VHOST_DEV_MAX_VQ);
node = llist_reverse_order(node);
/* make sure flag is seen after deletion */
smp_wmb();
@@ -420,6 +430,8 @@ void vhost_dev_init(struct vhost_dev *dev,
struct vhost_virtqueue *vq;
int i;
+ BUG_ON(nvqs > VHOST_DEV_MAX_VQ);
+
dev->vqs = vqs;
dev->nvqs = nvqs;
mutex_init(&dev->mutex);
@@ -428,6 +440,7 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->iotlb = NULL;
dev->mm = NULL;
dev->worker = NULL;
+ bitmap_zero(dev->work_pending, VHOST_DEV_MAX_VQ);
init_llist_head(&dev->work_list);
init_waitqueue_head(&dev->wait);
INIT_LIST_HEAD(&dev->read_list);
@@ -445,7 +458,7 @@ void vhost_dev_init(struct vhost_dev *dev,
vhost_vq_reset(dev, vq);
if (vq->handle_kick)
vhost_poll_init(&vq->poll, vq->handle_kick,
- EPOLLIN, dev);
+ i, EPOLLIN, dev);
}
}
EXPORT_SYMBOL_GPL(vhost_dev_init);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 6c844b9..60b6f6d 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -30,6 +30,7 @@ struct vhost_poll {
wait_queue_head_t *wqh;
wait_queue_entry_t wait;
struct vhost_work work;
+ __u8 poll_id;
__poll_t mask;
struct vhost_dev *dev;
};
@@ -37,9 +38,10 @@ struct vhost_poll {
void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
bool vhost_has_work(struct vhost_dev *dev);
+bool vhost_has_work_pending(struct vhost_dev *dev, int poll_id);
void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
- __poll_t mask, struct vhost_dev *dev);
+ __u8 id, __poll_t mask, struct vhost_dev *dev);
int vhost_poll_start(struct vhost_poll *poll, struct file *file);
void vhost_poll_stop(struct vhost_poll *poll);
void vhost_poll_flush(struct vhost_poll *poll);
@@ -152,6 +154,8 @@ struct vhost_msg_node {
struct list_head node;
};
+#define VHOST_DEV_MAX_VQ 128
+
struct vhost_dev {
struct mm_struct *mm;
struct mutex mutex;
@@ -159,6 +163,7 @@ struct vhost_dev {
int nvqs;
struct eventfd_ctx *log_ctx;
struct llist_head work_list;
+ DECLARE_BITMAP(work_pending, VHOST_DEV_MAX_VQ);
struct task_struct *worker;
struct vhost_umem *umem;
struct vhost_umem *iotlb;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v8 6/7] net: vhost: disable rx wakeup during tx busypoll
From: xiangxia.m.yue @ 2018-08-19 12:11 UTC (permalink / raw)
To: jasowang, mst, makita.toshiaki; +Cc: netdev, virtualization
In-Reply-To: <1534680686-3108-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
In handle_tx, the busy poll now calls vhost_net_disable_vq() and
vhost_net_enable_vq() on the rx vq, because we are already polling
the sock ourselves. This can improve performance.
This was suggested by Toshiaki Makita and Jason Wang.
If the rx handler is already scheduled, we do not enable the vq, because
it is not necessary. We do not do this in the last 'else' because if we
receive data but cannot queue the rx handler (the rx vring is full), we
still have to enable the vq to avoid the following case: the guest
receives the data, the vring is no longer full and the guest can take
more data, but the vq is disabled, so the rx vq cannot be woken up to
receive more data.
Topology:
[Host] ->linux bridge -> tap vhost-net ->[Guest]
TCP_STREAM (netperf):
* Without the patch: 37598.20 Mbps, 3.43 us mean latency
* With the patch: 38035.39 Mbps, 3.37 us mean latency
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
drivers/vhost/net.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 23d7ffc..db63ae2 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -480,6 +480,9 @@ static void vhost_net_busy_poll(struct vhost_net *net,
busyloop_timeout = poll_rx ? rvq->busyloop_timeout:
tvq->busyloop_timeout;
+ if (!poll_rx)
+ vhost_net_disable_vq(net, rvq);
+
preempt_disable();
endtime = busy_clock() + busyloop_timeout;
@@ -506,6 +509,10 @@ static void vhost_net_busy_poll(struct vhost_net *net,
else /* On tx here, sock has no rx data. */
vhost_enable_notify(&net->dev, rvq);
+ if (!poll_rx &&
+ !vhost_has_work_pending(&net->dev, VHOST_NET_VQ_RX))
+ vhost_net_enable_vq(net, rvq);
+
mutex_unlock(&vq->mutex);
}
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v8 7/7] net: vhost: make busyloop_intr more accurate
From: xiangxia.m.yue @ 2018-08-19 12:11 UTC (permalink / raw)
To: jasowang, mst, makita.toshiaki; +Cc: netdev, virtualization
In-Reply-To: <1534680686-3108-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
The patch uses vhost_has_work_pending() to check whether
the specified handler is scheduled, because in most cases
vhost_has_work() returns true when the other side's handler has been
added to the worker list. Use vhost_has_work_pending() instead of
vhost_has_work().
Topology:
[Host] ->linux bridge -> tap vhost-net ->[Guest]
TCP_STREAM (netperf):
* Without the patch: 38035.39 Mbps, 3.37 us mean latency
* With the patch: 38409.44 Mbps, 3.34 us mean latency
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
drivers/vhost/net.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index db63ae2..b6939ef 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -487,10 +487,8 @@ static void vhost_net_busy_poll(struct vhost_net *net,
endtime = busy_clock() + busyloop_timeout;
while (vhost_can_busy_poll(endtime)) {
- if (vhost_has_work(&net->dev)) {
- *busyloop_intr = true;
+ if (vhost_has_work(&net->dev))
break;
- }
if ((sock_has_rx_data(sock) &&
!vhost_vq_avail_empty(&net->dev, rvq)) ||
@@ -513,6 +511,11 @@ static void vhost_net_busy_poll(struct vhost_net *net,
!vhost_has_work_pending(&net->dev, VHOST_NET_VQ_RX))
vhost_net_enable_vq(net, rvq);
+ if (vhost_has_work_pending(&net->dev,
+ poll_rx ?
+ VHOST_NET_VQ_RX: VHOST_NET_VQ_TX))
+ *busyloop_intr = true;
+
mutex_unlock(&vq->mutex);
}
--
1.8.3.1
^ permalink raw reply related
* Call for Workshops Proposals - WorldCIST'19, La Toja Island, Spain
From: Maria Lemos @ 2018-08-19 22:03 UTC (permalink / raw)
To: virtualization
[-- Attachment #1.1: Type: text/plain, Size: 4421 bytes --]
----------------- CALL FOR WORKSHOPS PROPOSALS --------------------
WorldCIST'19 - 7th World Conference on Information Systems and Technologies
16th-19th of April 2019, La Toja Island, Galicia, Spain
http://www.worldcist.org/ <http://www.worldcist.org/>
-----------------------------------------------------------------------------------
The Information Systems and Technologies research and industrial community is invited to submit proposals for the organization of Workshops at WorldCist'19 - 7th World Conference on Information Systems and Technologies, to be held at La Toja Island, Galicia, Spain, 16 - 19 April 2019. WorldCist is a global forum for researchers and practitioners to present and discuss the most recent innovations, trends, results, experiences and concerns in the several perspectives of Information Systems and Technologies.
###############
WORKSHOP FORMAT
###############
Workshops should focus on a specific scientific subject on the scope of WorldCist'19 but not directly included on the main conference areas. Each workshop will be coordinated by an Organizing Committee composed of, at least, two researchers in the field, preferably from different institutions and different countries. The organizers should create an international Program Committee for the Workshop, with recognized researchers within the specific Workshop scientific area. Each workshop should have at least ten submissions and five accepted papers in order to be conducted at WorldCist'19.
The selection of Workshops will be performed by WorldCist'19 Conference/Workshop Chairs. Workshops full and short papers will be published in the conference main proceedings in specific Workshop chapters published by Springer in a book of the AISC series. Proceedings will be submitted for indexation by ISI Thomson, SCOPUS, DBLP, EI-Compendex among several other scientific databases. Extended versions of best selected papers will be published in journals indexed by ISI/SCI, SCOPUS and DBLP. Detailed and up-to-date information may be found at WorldCist'19 website: http://www.worldcist.org/ <http://www.worldcist.org/>
#####################
WORKSHOP ORGANIZATION
#####################
The Organizing Committee of each Workshop will be responsible for:
- Producing and distributing the Workshop Call for Papers (CFP);
- Coordinating the review and selection process for the papers submitted to the Workshop, as Workshop chairs (on the paper submission system to be installed);
- Delivering the final versions of the papers accepted for the Workshop in accordance with the guidelines and deadlines defined by WorldCist'19 organizers;
- Coordinating and chairing the Workshop sessions at the conference.
WorldCist'19 organizers reserve the right to cancel any Workshop if deadlines are missed or if the number of registered attendees is too low to support the costs associated with the Workshop.
################
PROPOSAL CONTENT
################
Workshop proposals should contain the following information:
- Workshop title;
- Brief description of the specific scientific scope of the Workshop;
- List of topics of interest (max 15 topics);
- Reasons the Workshop should be held within WorldCist’19;
- Name, postal address, phone and email of all the members of the Workshop Organizing Committee;
- Preliminary proposal for the Workshop Program Committee (Names and affiliations).
Proposals should be submitted at https://easychair.org/conferences/?conf=worldcist-workshops2019 <https://easychair.org/conferences/?conf=worldcist-workshops2019> in PDF (in English), by September 10, 2018.
###############
IMPORTANT DATES
###############
- Deadline for Workshop proposals: September 10, 2018
- Notification of Workshop acceptance: September 20, 2018
- Workshop Final Information and Program Committee: October 10, 2018
- Deadline for paper submission: November 30, 2018
- Notification of paper acceptance: January 6, 2019
- Deadline for final versions and conference registration: January 20, 2019
- Conference dates: April 16-19, 2019
#####
CHAIR
#####
Luis Paulo Reis, AISTI, IEEE & University of Porto, Portugal
WorldCIST'19 Website: http://www.worldcist.org/ <http://www.worldcist.org/>
---
This email has been checked for viruses by AVG.
https://www.avg.com
[-- Attachment #1.2: Type: text/html, Size: 6068 bytes --]
[-- Attachment #2: Type: text/plain, Size: 183 bytes --]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH RESEND] x86-64: use RIP-relative calls for paravirt indirect ones
From: Thomas Gleixner @ 2018-08-20 14:54 UTC (permalink / raw)
To: Jan Beulich
Cc: Juergen Gross, the arch/x86 maintainers, linux-kernel,
Linux Virtualization, hpa, Alok Kataria
In-Reply-To: <5B73DFB602000078001DE35E@prv1-mh.provo.novell.com>
On Wed, 15 Aug 2018, Jan Beulich wrote:
> This saves one insn byte per instance, summing up to a savings of over
> 4k in my (stripped down) configuration. No variant of to be patched in
> replacement code relies on the one byte larger size.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Juergen Gross <jgross@suse.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
^ permalink raw reply
* RE: [PATCH V2 net-next 2/6] sctp: Handle sctp packets with CHECKSUM_PARTIAL
From: David Laight @ 2018-08-20 15:39 UTC (permalink / raw)
To: 'Marcelo Ricardo Leitner', Vladislav Yasevich
Cc: virtio-dev@lists.oasis-open.org, nhorman@tuxdriver.com,
mst@redhat.com, netdev@vger.kernel.org,
virtualization@lists.linux-foundation.org,
linux-sctp@vger.kernel.org
In-Reply-To: <20180820145415.GA5310@localhost.localdomain>
From: Marcelo Ricardo Leitner
> Sent: 20 August 2018 15:54
> On Wed, May 02, 2018 at 11:38:24AM -0300, Marcelo Ricardo Leitner wrote:
> > On Tue, May 01, 2018 at 10:07:35PM -0400, Vladislav Yasevich wrote:
> > > With SCTP checksum offload available in virtio, it is now
> > > possible for virtio to receive a sctp packet with CHECKSUM_PARTIAL
> > > set (guest-to-guest traffic). SCTP doesn't really have a
> > > partial checksum like TCP does, because CRC32c can't do partial
> > > additive checksumming.
...
Actually that isn't entirely true.
For all crc, crc(a) ^ crc(b) == crc(a^b).
Since crc(0) == 0 you can xor together two separately calculated crc
provided they both end at the same point.
The slight problem is that you are more likely to be appending
one buffer to another - which requires appending lots of zero
bytes to one of the crcs.
This could be sped up by using lookup tables that add moderately
sized blocks of zero bytes to a crc instead of adding the zero
bytes one at a time.
Doing it without large const data and/or data cache thrashing
is left as an exercise for the implementer.
David
-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox