* [PATCH RFC V4 2/3] KVM: X86: Adding arbitrary data pointer in kvm memslot iterator functions
From: Ahmed Abd El Mawgood @ 2018-07-20 23:31 UTC (permalink / raw)
To: kvm, Kernel Hardening, virtualization, linux-doc, x86, xen-devel
Cc: Ard Biesheuvel, Kees Cook, nathan Corbet, David Vrabel, rkrcmar,
Boris Lukashev, Ingo Molnar, nigel.edwards, hpa, Paolo Bonzini,
Thomas Gleixner, Rik van Riel
In-Reply-To: <20180720233130.14129-1-ahmedsoliman0x666@gmail.com>
This will help sharing data into the slot_level_handler callback. In my
case I need to a share a counter for the pages traversed to use it in some
bitmap. Being able to send arbitrary memory pointer into the
slot_level_handler callback made it easy.
Signed-off-by: Ahmed Abd El Mawgood <ahmedsoliman0x666@gmail.com>
---
arch/x86/kvm/mmu.c | 65 +++++++++++++++++++++++++++++++-----------------------
1 file changed, 37 insertions(+), 28 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d594690d8b95..77661530b2c4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1418,7 +1418,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
static bool __rmap_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
- bool pt_protect)
+ bool pt_protect, void *data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1457,7 +1457,8 @@ static bool wrprot_ad_disabled_spte(u64 *sptep)
* - W bit on ad-disabled SPTEs.
* Returns true iff any D or W bits were cleared.
*/
-static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ void *data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1483,7 +1484,8 @@ static bool spte_set_dirty(u64 *sptep)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ void *data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1515,7 +1517,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PT_PAGE_TABLE_LEVEL, slot);
- __rmap_write_protect(kvm, rmap_head, false);
+ __rmap_write_protect(kvm, rmap_head, false, NULL);
/* clear the first set bit */
mask &= mask - 1;
@@ -1541,7 +1543,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PT_PAGE_TABLE_LEVEL, slot);
- __rmap_clear_dirty(kvm, rmap_head);
+ __rmap_clear_dirty(kvm, rmap_head, NULL);
/* clear the first set bit */
mask &= mask - 1;
@@ -1594,7 +1596,8 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
rmap_head = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmap_head, true);
+ write_protected |= __rmap_write_protect(kvm, rmap_head, true,
+ NULL);
}
return write_protected;
@@ -1608,7 +1611,8 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
}
-static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
+static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
+ void *data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1628,7 +1632,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct kvm_memory_slot *slot, gfn_t gfn, int level,
unsigned long data)
{
- return kvm_zap_rmapp(kvm, rmap_head);
+ return kvm_zap_rmapp(kvm, rmap_head, NULL);
}
static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
@@ -5086,13 +5090,15 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
}
/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+typedef bool (*slot_level_handler) (struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head, void *data);
/* The caller should hold mmu-lock before calling this function. */
static __always_inline bool
slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
- gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+ gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb,
+ void *data)
{
struct slot_rmap_walk_iterator iterator;
bool flush = false;
@@ -5100,7 +5106,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
end_gfn, &iterator) {
if (iterator.rmap)
- flush |= fn(kvm, iterator.rmap);
+ flush |= fn(kvm, iterator.rmap, data);
if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
if (flush && lock_flush_tlb) {
@@ -5122,36 +5128,36 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
static __always_inline bool
slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
slot_level_handler fn, int start_level, int end_level,
- bool lock_flush_tlb)
+ bool lock_flush_tlb, void *data)
{
return slot_handle_level_range(kvm, memslot, fn, start_level,
end_level, memslot->base_gfn,
memslot->base_gfn + memslot->npages - 1,
- lock_flush_tlb);
+ lock_flush_tlb, data);
}
static __always_inline bool
slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+ slot_level_handler fn, bool lock_flush_tlb, void *data)
{
return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
- PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb, data);
}
static __always_inline bool
slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+ slot_level_handler fn, bool lock_flush_tlb, void *data)
{
return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
- PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+ PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb, data);
}
static __always_inline bool
slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
- slot_level_handler fn, bool lock_flush_tlb)
+ slot_level_handler fn, bool lock_flush_tlb, void *data)
{
return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
- PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+ PT_PAGE_TABLE_LEVEL, lock_flush_tlb, data);
}
void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
@@ -5173,7 +5179,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
- start, end - 1, true);
+ start, end - 1, true, NULL);
}
}
@@ -5181,9 +5187,10 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
}
static bool slot_rmap_write_protect(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head)
+ struct kvm_rmap_head *rmap_head,
+ void *data)
{
- return __rmap_write_protect(kvm, rmap_head, false);
+ return __rmap_write_protect(kvm, rmap_head, false, data);
}
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -5193,7 +5200,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
- false);
+ false, NULL);
spin_unlock(&kvm->mmu_lock);
/*
@@ -5219,7 +5226,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
}
static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head)
+ struct kvm_rmap_head *rmap_head,
+ void *data)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -5257,7 +5265,7 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
/* FIXME: const-ify all uses of struct kvm_memory_slot. */
spin_lock(&kvm->mmu_lock);
slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
- kvm_mmu_zap_collapsible_spte, true);
+ kvm_mmu_zap_collapsible_spte, true, NULL);
spin_unlock(&kvm->mmu_lock);
}
@@ -5267,7 +5275,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
bool flush;
spin_lock(&kvm->mmu_lock);
- flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
+ flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false, NULL);
spin_unlock(&kvm->mmu_lock);
lockdep_assert_held(&kvm->slots_lock);
@@ -5290,7 +5298,7 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
spin_lock(&kvm->mmu_lock);
flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
- false);
+ false, NULL);
spin_unlock(&kvm->mmu_lock);
/* see kvm_mmu_slot_remove_write_access */
@@ -5307,7 +5315,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
bool flush;
spin_lock(&kvm->mmu_lock);
- flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
+ flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false,
+ NULL);
spin_unlock(&kvm->mmu_lock);
lockdep_assert_held(&kvm->slots_lock);
--
2.16.4
^ permalink raw reply related
* [PATCH RFC V4 3/3] KVM: X86: Adding skeleton for Memory ROE
From: Ahmed Abd El Mawgood @ 2018-07-20 23:31 UTC (permalink / raw)
To: kvm, Kernel Hardening, virtualization, linux-doc, x86, xen-devel
Cc: Ard Biesheuvel, Kees Cook, nathan Corbet, David Vrabel, rkrcmar,
Boris Lukashev, Ingo Molnar, nigel.edwards, hpa, Paolo Bonzini,
Thomas Gleixner, Rik van Riel
In-Reply-To: <20180720233130.14129-1-ahmedsoliman0x666@gmail.com>
This patch introduces a hypercall implemented for X86 that can assist
against subset of kernel rootkits, it works by place readonly protection in
shadow PTE. The end result protection is also kept in a bitmap for each
kvm_memory_slot and is used as reference when updating SPTEs. The whole
goal is to protect the guest kernel static data from modification if
attacker is running from guest ring 0, for this reason there is no
hypercall to revert effect of Memory ROE hypercall. This patch doesn't
implement integrity check on guest TLB so obvious attack on the current
implementation will involve guest virtual address -> guest physical
address remapping, but there are plans to fix that.
Signed-off-by: Ahmed Abd El Mawgood <ahmedsoliman0x666@gmail.com>
---
arch/x86/include/asm/kvm_host.h | 11 ++++-
arch/x86/kvm/Kconfig | 7 +++
arch/x86/kvm/mmu.c | 72 +++++++++++++++++++++++-----
arch/x86/kvm/x86.c | 104 ++++++++++++++++++++++++++++++++++++++--
include/linux/kvm_host.h | 3 ++
include/uapi/linux/kvm_para.h | 1 +
virt/kvm/kvm_main.c | 29 +++++++++--
7 files changed, 208 insertions(+), 19 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..128bcfa246a3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -236,6 +236,15 @@ struct kvm_mmu_memory_cache {
void *objects[KVM_NR_MEM_OBJS];
};
+/*
+ * This is internal structure used to be be able to access kvm memory slot and
+ * have track of the number of current PTE when doing shadow PTE walk
+ */
+struct kvm_write_access_data {
+ int i;
+ struct kvm_memory_slot *memslot;
+};
+
/*
* the pages used as guest page table on soft mmu are tracked by
* kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used
@@ -1130,7 +1139,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 acc_track_mask, u64 me_mask);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+void kvm_mmu_slot_apply_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot);
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *memslot);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1bbec387d289..487e02de4e76 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -96,6 +96,13 @@ config KVM_MMU_AUDIT
This option adds a R/W kVM module parameter 'mmu_audit', which allows
auditing of KVM MMU events at runtime.
+config KVM_MROE
+ bool "Hypercall Memory Read-Only Enforcement"
+ depends on KVM && X86
+ help
+ This option adds KVM_HC_HMROE hypercall to kvm as a hardening
+ mechanism to protect memory pages from being edited.
+
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 77661530b2c4..4ce6a9a19a23 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1416,9 +1416,8 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
return mmu_spte_update(sptep, spte);
}
-static bool __rmap_write_protect(struct kvm *kvm,
- struct kvm_rmap_head *rmap_head,
- bool pt_protect, void *data)
+static bool __rmap_write_protection(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head, bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1430,6 +1429,38 @@ static bool __rmap_write_protect(struct kvm *kvm,
return flush;
}
+#ifdef CONFIG_KVM_MROE
+static bool __rmap_write_protect_mroe(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ bool pt_protect,
+ struct kvm_write_access_data *d)
+{
+ u64 *sptep;
+ struct rmap_iterator iter;
+ bool prot;
+ bool flush = false;
+
+ for_each_rmap_spte(rmap_head, &iter, sptep) {
+ prot = !test_bit(d->i, d->memslot->mroe_bitmap) && pt_protect;
+ flush |= spte_write_protect(sptep, prot);
+ d->i++;
+ }
+ return flush;
+}
+#endif
+
+static bool __rmap_write_protect(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ bool pt_protect,
+ struct kvm_write_access_data *d)
+{
+#ifdef CONFIG_KVM_MROE
+ if (d != NULL)
+ return __rmap_write_protect_mroe(kvm, rmap_head, pt_protect, d);
+#endif
+ return __rmap_write_protection(kvm, rmap_head, pt_protect);
+}
+
static bool spte_clear_dirty(u64 *sptep)
{
u64 spte = *sptep;
@@ -1517,7 +1548,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
while (mask) {
rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
PT_PAGE_TABLE_LEVEL, slot);
- __rmap_write_protect(kvm, rmap_head, false, NULL);
+ __rmap_write_protection(kvm, rmap_head, false);
/* clear the first set bit */
mask &= mask - 1;
@@ -1593,11 +1624,15 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head;
int i;
bool write_protected = false;
+ struct kvm_write_access_data data = {
+ .i = 0,
+ .memslot = slot,
+ };
for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
rmap_head = __gfn_to_rmap(gfn, i, slot);
write_protected |= __rmap_write_protect(kvm, rmap_head, true,
- NULL);
+ &data);
}
return write_protected;
@@ -5190,21 +5225,36 @@ static bool slot_rmap_write_protect(struct kvm *kvm,
struct kvm_rmap_head *rmap_head,
void *data)
{
- return __rmap_write_protect(kvm, rmap_head, false, data);
+ return __rmap_write_protect(kvm, rmap_head, false,
+ (struct kvm_write_access_data *)data);
}
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+static bool slot_rmap_apply_protection(struct kvm *kvm,
+ struct kvm_rmap_head *rmap_head,
+ void *data)
+{
+ struct kvm_write_access_data *d = (struct kvm_write_access_data *) data;
+ bool prot_mask = !(d->memslot->flags & KVM_MEM_READONLY);
+
+ return __rmap_write_protect(kvm, rmap_head, prot_mask, d);
+}
+
+void kvm_mmu_slot_apply_write_access(struct kvm *kvm,
struct kvm_memory_slot *memslot)
{
bool flush;
+ struct kvm_write_access_data data = {
+ .i = 0,
+ .memslot = memslot,
+ };
spin_lock(&kvm->mmu_lock);
- flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
- false, NULL);
+ flush = slot_handle_all_level(kvm, memslot, slot_rmap_apply_protection,
+ false, &data);
spin_unlock(&kvm->mmu_lock);
/*
- * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
+ * kvm_mmu_slot_apply_write_access() and kvm_vm_ioctl_get_dirty_log()
* which do tlb flush out of mmu-lock should be serialized by
* kvm->slots_lock otherwise tlb flush would be missed.
*/
@@ -5301,7 +5351,7 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
false, NULL);
spin_unlock(&kvm->mmu_lock);
- /* see kvm_mmu_slot_remove_write_access */
+ /* see kvm_mmu_slot_apply_write_access*/
lockdep_assert_held(&kvm->slots_lock);
if (flush)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b812b3c5088..de58ddaf5260 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4179,7 +4179,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
/*
* All the TLBs can be flushed out of mmu lock, see the comments in
- * kvm_mmu_slot_remove_write_access().
+ * kvm_mmu_slot_apply_write_access().
*/
lockdep_assert_held(&kvm->slots_lock);
if (is_dirty)
@@ -6672,7 +6672,98 @@ static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
}
#endif
-/*
+#ifdef CONFIG_KVM_MROE
+static void kvm_mroe_protect_slot(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, u64 npages)
+{
+ int i;
+
+ for (i = gfn - slot->base_gfn; i < gfn + npages - slot->base_gfn; i++)
+ set_bit(i, slot->mroe_bitmap);
+ kvm_mmu_slot_apply_write_access(kvm, slot);
+ kvm_arch_flush_shadow_memslot(kvm, slot);
+}
+
+static int __kvm_mroe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages)
+{
+ struct kvm_memory_slot *slot;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+
+ while (npages != 0) {
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!slot)
+ return -EINVAL;
+ if (gfn + npages > slot->base_gfn + slot->npages) {
+ u64 _npages = slot->base_gfn + slot->npages - gfn;
+
+ kvm_mroe_protect_slot(kvm, slot, gfn, _npages);
+ gfn += _npages;
+ npages -= _npages;
+ } else {
+ kvm_mroe_protect_slot(kvm, slot, gfn, npages);
+ npages = 0;
+ }
+ }
+ return 0;
+}
+
+static int kvm_mroe_protect_range(struct kvm *kvm, gpa_t gpa, u64 npages)
+{
+ int r;
+
+ mutex_lock(&kvm->slots_lock);
+ r = __kvm_mroe_protect_range(kvm, gpa, npages);
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+static bool kvm_mroe_userspace(struct kvm_vcpu *vcpu)
+{
+ u64 rflags;
+ u64 cr0 = kvm_read_cr0(vcpu);
+ u64 iopl;
+
+ // first checking we are not in protected mode
+ if ((cr0 & 1) == 0)
+ return false;
+ /*
+ * we don't need to worry about comments in __get_regs
+ * because we are sure that this function will only be
+ * triggered at the end of a hypercall
+ */
+ rflags = kvm_get_rflags(vcpu);
+ iopl = (rflags >> 12) & 3;
+ if (iopl != 3)
+ return false;
+ return true;
+}
+
+static int kvm_mroe(struct kvm_vcpu *vcpu, u64 gva, u64 npages)
+{
+ struct kvm *kvm = vcpu->kvm;
+ gpa_t gpa;
+ u64 hva;
+
+ /*
+ * First we need to maek sure that we are running from something that
+ * isn't usermode
+ */
+ if (kvm_mroe_userspace(vcpu))
+ return -1;//I don't really know what to return
+ if (gva & ~PAGE_MASK)
+ return -EINVAL;
+ // We need to make sure that there will be no overflow
+ if ((npages << PAGE_SHIFT) >> PAGE_SHIFT != npages || npages == 0)
+ return -EINVAL;
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+ hva = gfn_to_hva(kvm, gpa >> PAGE_SHIFT);
+ if (!access_ok(VERIFY_WRITE, hva, npages << PAGE_SHIFT))
+ return -EINVAL;
+ return kvm_mroe_protect_range(vcpu->kvm, gpa, npages);
+}
+#endif
+
+ /*
* kvm_pv_kick_cpu_op: Kick a vcpu.
*
* @apicid - apicid of vcpu to be kicked.
@@ -6739,6 +6830,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_CLOCK_PAIRING:
ret = kvm_pv_clock_pairing(vcpu, a0, a1);
break;
+#endif
+#ifdef CONFIG_KVM_MROE
+ case KVM_HC_HMROE:
+ ret = kvm_mroe(vcpu, a0, a1);
+ break;
#endif
default:
ret = -KVM_ENOSYS;
@@ -8973,8 +9069,8 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
struct kvm_memory_slot *new)
{
/* Still write protect RO slot */
+ kvm_mmu_slot_apply_write_access(kvm, new);
if (new->flags & KVM_MEM_READONLY) {
- kvm_mmu_slot_remove_write_access(kvm, new);
return;
}
@@ -9012,7 +9108,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
if (kvm_x86_ops->slot_enable_log_dirty)
kvm_x86_ops->slot_enable_log_dirty(kvm, new);
else
- kvm_mmu_slot_remove_write_access(kvm, new);
+ kvm_mmu_slot_apply_write_access(kvm, new);
} else {
if (kvm_x86_ops->slot_disable_log_dirty)
kvm_x86_ops->slot_disable_log_dirty(kvm, new);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4ee7bc548a83..82c5780e11d9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -297,6 +297,9 @@ static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
struct kvm_memory_slot {
gfn_t base_gfn;
unsigned long npages;
+#ifdef CONFIG_KVM_MROE
+ unsigned long *mroe_bitmap;
+#endif
unsigned long *dirty_bitmap;
struct kvm_arch_memory_slot arch;
unsigned long userspace_addr;
diff --git a/include/uapi/linux/kvm_para.h b/include/uapi/linux/kvm_para.h
index dcf629dd2889..4e2badc09b5b 100644
--- a/include/uapi/linux/kvm_para.h
+++ b/include/uapi/linux/kvm_para.h
@@ -26,6 +26,7 @@
#define KVM_HC_MIPS_EXIT_VM 7
#define KVM_HC_MIPS_CONSOLE_OUTPUT 8
#define KVM_HC_CLOCK_PAIRING 9
+#define KVM_HC_HMROE 10
/*
* hypercalls use architecture specific
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b47507faab5..0f7141e4d550 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -794,6 +794,17 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
return 0;
}
+static int kvm_init_mroe_bitmap(struct kvm_memory_slot *slot)
+{
+#ifdef CONFIG_KVM_MROE
+ slot->mroe_bitmap = kvzalloc(BITS_TO_LONGS(slot->npages) *
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!slot->mroe_bitmap)
+ return -ENOMEM;
+#endif
+ return 0;
+}
+
/*
* Insert memslot and re-sort memslots based on their GFN,
* so binary search could be used to lookup GFN.
@@ -1011,6 +1022,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (kvm_create_dirty_bitmap(&new) < 0)
goto out_free;
}
+ if (kvm_init_mroe_bitmap(&new) < 0)
+ goto out_free;
slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
if (!slots)
@@ -1264,13 +1277,23 @@ static bool memslot_is_readonly(struct kvm_memory_slot *slot)
return slot->flags & KVM_MEM_READONLY;
}
+static bool gfn_is_readonly(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+#ifdef CONFIG_KVM_MROE
+ return test_bit(gfn - slot->base_gfn, slot->mroe_bitmap) ||
+ memslot_is_readonly(slot);
+#else
+ return memslot_is_readonly(slot);
+#endif
+}
+
static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
gfn_t *nr_pages, bool write)
{
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return KVM_HVA_ERR_BAD;
- if (memslot_is_readonly(slot) && write)
+ if (gfn_is_readonly(slot, gfn) && write)
return KVM_HVA_ERR_RO_BAD;
if (nr_pages)
@@ -1314,7 +1337,7 @@ unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);
if (!kvm_is_error_hva(hva) && writable)
- *writable = !memslot_is_readonly(slot);
+ *writable = !gfn_is_readonly(slot, gfn);
return hva;
}
@@ -1554,7 +1577,7 @@ kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
}
/* Do not map writable pfn in the readonly memslot. */
- if (writable && memslot_is_readonly(slot)) {
+ if (writable && gfn_is_readonly(slot, gfn)) {
*writable = false;
writable = NULL;
}
--
2.16.4
^ permalink raw reply related
* [PATCH net-next v6 0/4] net: vhost: improve performance when enable busyloop
From: xiangxia.m.yue @ 2018-07-21 18:03 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, mst
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
This patches improve the guest receive performance.
On the handle_tx side, we poll the sock receive queue
at the same time. handle_rx do that in the same way.
For more performance report, see patch 4.
v5->v6:
rebase the codes.
Tonghao Zhang (4):
net: vhost: lock the vqs one by one
net: vhost: replace magic number of lock annotation
net: vhost: factor out busy polling logic to vhost_net_busy_poll()
net: vhost: add rx busy polling in tx path
drivers/vhost/net.c | 148 +++++++++++++++++++++++++++++---------------------
drivers/vhost/vhost.c | 24 +++-----
2 files changed, 94 insertions(+), 78 deletions(-)
--
1.8.3.1
^ permalink raw reply
* [PATCH net-next v6 1/4] net: vhost: lock the vqs one by one
From: xiangxia.m.yue @ 2018-07-21 18:03 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, mst
In-Reply-To: <1532196242-2998-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
This patch changes the way that lock all vqs
at the same, to lock them one by one. It will
be used for next patch to avoid the deadlock.
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/vhost.c | 24 +++++++-----------------
1 file changed, 7 insertions(+), 17 deletions(-)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a502f1a..a1c06e7 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
{
int i;
- for (i = 0; i < d->nvqs; ++i)
+ for (i = 0; i < d->nvqs; ++i) {
+ mutex_lock(&d->vqs[i]->mutex);
__vhost_vq_meta_reset(d->vqs[i]);
+ mutex_unlock(&d->vqs[i]->mutex);
+ }
}
static void vhost_vq_reset(struct vhost_dev *dev,
@@ -890,20 +893,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
#define vhost_get_used(vq, x, ptr) \
vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
-static void vhost_dev_lock_vqs(struct vhost_dev *d)
-{
- int i = 0;
- for (i = 0; i < d->nvqs; ++i)
- mutex_lock_nested(&d->vqs[i]->mutex, i);
-}
-
-static void vhost_dev_unlock_vqs(struct vhost_dev *d)
-{
- int i = 0;
- for (i = 0; i < d->nvqs; ++i)
- mutex_unlock(&d->vqs[i]->mutex);
-}
-
static int vhost_new_umem_range(struct vhost_umem *umem,
u64 start, u64 size, u64 end,
u64 userspace_addr, int perm)
@@ -953,7 +942,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
if (msg->iova <= vq_msg->iova &&
msg->iova + msg->size - 1 > vq_msg->iova &&
vq_msg->type == VHOST_IOTLB_MISS) {
+ mutex_lock(&node->vq->mutex);
vhost_poll_queue(&node->vq->poll);
+ mutex_unlock(&node->vq->mutex);
+
list_del(&node->node);
kfree(node);
}
@@ -985,7 +977,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
int ret = 0;
mutex_lock(&dev->mutex);
- vhost_dev_lock_vqs(dev);
switch (msg->type) {
case VHOST_IOTLB_UPDATE:
if (!dev->iotlb) {
@@ -1019,7 +1010,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
break;
}
- vhost_dev_unlock_vqs(dev);
mutex_unlock(&dev->mutex);
return ret;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v6 2/4] net: vhost: replace magic number of lock annotation
From: xiangxia.m.yue @ 2018-07-21 18:04 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, mst
In-Reply-To: <1532196242-2998-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Use the VHOST_NET_VQ_XXX as a subclass for mutex_lock_nested.
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Acked-by: Jason Wang <jasowang@redhat.com>
---
drivers/vhost/net.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index b224036..321264c 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -490,7 +490,7 @@ static void handle_tx(struct vhost_net *net)
bool zcopy, zcopy_used;
int sent_pkts = 0;
- mutex_lock(&vq->mutex);
+ mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_TX);
sock = vq->private_data;
if (!sock)
goto out;
@@ -667,7 +667,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
/* Flush batched heads first */
vhost_rx_signal_used(rnvq);
/* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&tvq->mutex, 1);
+ mutex_lock_nested(&tvq->mutex, VHOST_NET_VQ_TX);
vhost_disable_notify(&net->dev, tvq);
preempt_disable();
@@ -809,7 +809,7 @@ static void handle_rx(struct vhost_net *net)
__virtio16 num_buffers;
int recv_pkts = 0;
- mutex_lock_nested(&vq->mutex, 0);
+ mutex_lock_nested(&vq->mutex, VHOST_NET_VQ_RX);
sock = vq->private_data;
if (!sock)
goto out;
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v6 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: xiangxia.m.yue @ 2018-07-21 18:04 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, mst
In-Reply-To: <1532196242-2998-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Factor out generic busy polling logic and will be
used for in tx path in the next patch. And with the patch,
qemu can set differently the busyloop_timeout for rx queue.
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
drivers/vhost/net.c | 114 ++++++++++++++++++++++++++++++++++------------------
1 file changed, 74 insertions(+), 40 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 321264c..2dc937e 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -428,6 +428,78 @@ static int vhost_net_enable_vq(struct vhost_net *n,
return vhost_poll_start(poll, sock->file);
}
+static int sk_has_rx_data(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (sock->ops->peek_len)
+ return sock->ops->peek_len(sock);
+
+ return skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static void vhost_net_busy_poll_vq_check(struct vhost_net *net,
+ struct vhost_virtqueue *rvq,
+ struct vhost_virtqueue *tvq,
+ bool rx)
+{
+ struct socket *sock = rvq->private_data;
+
+ if (rx) {
+ if (!vhost_vq_avail_empty(&net->dev, tvq)) {
+ vhost_poll_queue(&tvq->poll);
+ } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
+ vhost_disable_notify(&net->dev, tvq);
+ vhost_poll_queue(&tvq->poll);
+ }
+ } else if ((sock && sk_has_rx_data(sock->sk)) &&
+ !vhost_vq_avail_empty(&net->dev, rvq)) {
+ vhost_poll_queue(&rvq->poll);
+ }
+}
+
+static void vhost_net_busy_poll(struct vhost_net *net,
+ struct vhost_virtqueue *rvq,
+ struct vhost_virtqueue *tvq,
+ bool *busyloop_intr,
+ bool rx)
+{
+ unsigned long busyloop_timeout;
+ unsigned long endtime;
+ struct socket *sock;
+ struct vhost_virtqueue *vq = rx ? tvq : rvq;
+
+ mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
+ vhost_disable_notify(&net->dev, vq);
+ sock = rvq->private_data;
+
+ busyloop_timeout = rx ? rvq->busyloop_timeout:
+ tvq->busyloop_timeout;
+
+ preempt_disable();
+ endtime = busy_clock() + busyloop_timeout;
+
+ while (vhost_can_busy_poll(endtime)) {
+ if (vhost_has_work(&net->dev)) {
+ *busyloop_intr = true;
+ break;
+ }
+
+ if ((sock && sk_has_rx_data(sock->sk) &&
+ !vhost_vq_avail_empty(&net->dev, rvq)) ||
+ !vhost_vq_avail_empty(&net->dev, tvq))
+ break;
+
+ cpu_relax();
+ }
+
+ preempt_enable();
+
+ vhost_net_busy_poll_vq_check(net, rvq, tvq, rx);
+
+ mutex_unlock(&vq->mutex);
+}
+
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
struct vhost_virtqueue *vq,
struct iovec iov[], unsigned int iov_size,
@@ -631,16 +703,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
return len;
}
-static int sk_has_rx_data(struct sock *sk)
-{
- struct socket *sock = sk->sk_socket;
-
- if (sock->ops->peek_len)
- return sock->ops->peek_len(sock);
-
- return skb_queue_empty(&sk->sk_receive_queue);
-}
-
static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
{
struct vhost_virtqueue *vq = &nvq->vq;
@@ -660,41 +722,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk,
struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX];
struct vhost_virtqueue *rvq = &rnvq->vq;
struct vhost_virtqueue *tvq = &tnvq->vq;
- unsigned long uninitialized_var(endtime);
int len = peek_head_len(rnvq, sk);
- if (!len && tvq->busyloop_timeout) {
+ if (!len && rvq->busyloop_timeout) {
/* Flush batched heads first */
vhost_rx_signal_used(rnvq);
/* Both tx vq and rx socket were polled here */
- mutex_lock_nested(&tvq->mutex, VHOST_NET_VQ_TX);
- vhost_disable_notify(&net->dev, tvq);
-
- preempt_disable();
- endtime = busy_clock() + tvq->busyloop_timeout;
-
- while (vhost_can_busy_poll(endtime)) {
- if (vhost_has_work(&net->dev)) {
- *busyloop_intr = true;
- break;
- }
- if ((sk_has_rx_data(sk) &&
- !vhost_vq_avail_empty(&net->dev, rvq)) ||
- !vhost_vq_avail_empty(&net->dev, tvq))
- break;
- cpu_relax();
- }
-
- preempt_enable();
-
- if (!vhost_vq_avail_empty(&net->dev, tvq)) {
- vhost_poll_queue(&tvq->poll);
- } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
- vhost_disable_notify(&net->dev, tvq);
- vhost_poll_queue(&tvq->poll);
- }
-
- mutex_unlock(&tvq->mutex);
+ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true);
len = peek_head_len(rnvq, sk);
}
--
1.8.3.1
^ permalink raw reply related
* [PATCH net-next v6 4/4] net: vhost: add rx busy polling in tx path
From: xiangxia.m.yue @ 2018-07-21 18:04 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, mst
In-Reply-To: <1532196242-2998-1-git-send-email-xiangxia.m.yue@gmail.com>
From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
This patch improves the guest receive performance.
On the handle_tx side, we poll the sock receive queue at the
same time. handle_rx do that in the same way.
We set the poll-us=100us and use the netperf to test throughput
and mean latency. When running the tests, the vhost-net kthread
of that VM, is alway 100% CPU. The commands are shown as below.
Rx performance is greatly improved by this patch. There is not
notable performance change on tx with this series though. This
patch is useful for bi-directional traffic.
netperf -H IP -t TCP_STREAM -l 20 -- -O "THROUGHPUT, THROUGHPUT_UNITS, MEAN_LATENCY"
Topology:
[Host] ->linux bridge -> tap vhost-net ->[Guest]
TCP_STREAM:
* Without the patch: 19842.95 Mbps, 6.50 us mean latency
* With the patch: 37373.10 Mbps, 3.43 us mean latency
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
---
drivers/vhost/net.c | 30 +++++++++++-------------------
1 file changed, 11 insertions(+), 19 deletions(-)
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 2dc937e..a562828 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -501,29 +501,21 @@ static void vhost_net_busy_poll(struct vhost_net *net,
}
static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
- struct vhost_virtqueue *vq,
+ struct vhost_virtqueue *tvq,
struct iovec iov[], unsigned int iov_size,
unsigned int *out_num, unsigned int *in_num,
bool *busyloop_intr)
{
- unsigned long uninitialized_var(endtime);
- int r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
- out_num, in_num, NULL, NULL);
-
- if (r == vq->num && vq->busyloop_timeout) {
- preempt_disable();
- endtime = busy_clock() + vq->busyloop_timeout;
- while (vhost_can_busy_poll(endtime)) {
- if (vhost_has_work(vq->dev)) {
- *busyloop_intr = true;
- break;
- }
- if (!vhost_vq_avail_empty(vq->dev, vq))
- break;
- cpu_relax();
- }
- preempt_enable();
- r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+ struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX];
+ struct vhost_virtqueue *rvq = &rnvq->vq;
+
+ int r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
+ out_num, in_num, NULL, NULL);
+
+ if (r == tvq->num && tvq->busyloop_timeout) {
+ vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, false);
+
+ r = vhost_get_vq_desc(tvq, tvq->iov, ARRAY_SIZE(tvq->iov),
out_num, in_num, NULL, NULL);
}
--
1.8.3.1
^ permalink raw reply related
* Re: [PATCH net-next 0/9] TX used ring batched updating for vhost
From: David Miller @ 2018-07-22 4:44 UTC (permalink / raw)
To: jasowang; +Cc: netdev, virtualization, linux-kernel, kvm, mst
In-Reply-To: <1532045721-4958-1-git-send-email-jasowang@redhat.com>
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 20 Jul 2018 08:15:12 +0800
> This series implement batch updating of used ring for TX. This help to
> reduce the cache contention on used ring. The idea is first split
> datacopy path from zerocopy, and do only batching for datacopy. This
> is because zercopy had already supported its own batching.
>
> TX PPS was increased 25.8% and Netperf TCP does not show obvious
> differences.
>
> The split of datapath will also be helpful for future implementation
> like in order completion.
>
> Please review.
Jason, I really like the way you composed this patch series, it was
very easy to read and understand. :-)
Michael, please review.
^ permalink raw reply
* RE: [PATCH v36 0/5] Virtio-balloon: support free page reporting
From: Wang, Wei W @ 2018-07-22 11:11 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: yang.zhang.wz@gmail.com, virtio-dev@lists.oasis-open.org,
riel@redhat.com, quan.xu0@gmail.com, kvm@vger.kernel.org,
nilal@redhat.com, liliang.opensource@gmail.com,
linux-kernel@vger.kernel.org, mhocko@kernel.org,
linux-mm@kvack.org, pbonzini@redhat.com,
akpm@linux-foundation.org,
virtualization@lists.linux-foundation.org,
torvalds@linux-foundation.org
In-Reply-To: <20180720154922-mutt-send-email-mst@kernel.org>
On Friday, July 20, 2018 8:52 PM, Michael S. Tsirkin wrote:
> On Fri, Jul 20, 2018 at 04:33:00PM +0800, Wei Wang wrote:
> > This patch series is separated from the previous "Virtio-balloon
> > Enhancement" series. The new feature,
> VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> > implemented by this series enables the virtio-balloon driver to report
> > hints of guest free pages to the host. It can be used to accelerate
> > live migration of VMs. Here is an introduction of this usage:
> >
> > Live migration needs to transfer the VM's memory from the source
> > machine to the destination round by round. For the 1st round, all the
> > VM's memory is transferred. From the 2nd round, only the pieces of
> > memory that were written by the guest (after the 1st round) are
> > transferred. One method that is popularly used by the hypervisor to
> > track which part of memory is written is to write-protect all the guest
> memory.
> >
> > This feature enables the optimization by skipping the transfer of
> > guest free pages during VM live migration. It is not concerned that
> > the memory pages are used after they are given to the hypervisor as a
> > hint of the free pages, because they will be tracked by the hypervisor
> > and transferred in the subsequent round if they are used and written.
> >
> > * Tests
> > - Test Environment
> > Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
> > Guest: 8G RAM, 4 vCPU
> > Migration setup: migrate_set_speed 100G, migrate_set_downtime 2
> > second
>
> Can we split out patches 1 and 2? They seem appropriate for this release ...
Sounds good to me. I'm not sure if there would be comments on the first 2 patches. If no, can you just take them here? Or you need me to repost them separately?
Best,
Wei
^ permalink raw reply
* Re: [PATCH net-next 0/9] TX used ring batched updating for vhost
From: Michael S. Tsirkin @ 2018-07-22 14:37 UTC (permalink / raw)
To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <1532045721-4958-1-git-send-email-jasowang@redhat.com>
On Fri, Jul 20, 2018 at 08:15:12AM +0800, Jason Wang wrote:
> Hi:
>
> This series implement batch updating of used ring for TX. This help to
> reduce the cache contention on used ring. The idea is first split
> datacopy path from zerocopy, and do only batching for datacopy. This
> is because zercopy had already supported its own batching.
>
> TX PPS was increased 25.8% and Netperf TCP does not show obvious
> differences.
>
> The split of datapath will also be helpful for future implementation
> like in order completion.
>
> Please review.
>
> Thanks
Acked-by: Michael S. Tsirkin <mst@redhat.com>
I'm very happy with the split, the mixed data path became hard to
maintain.
> Jason Wang (9):
> vhost_net: drop unnecessary parameter
> vhost_net: introduce helper to initialize tx iov iter
> vhost_net: introduce vhost_exceeds_weight()
> vhost_net: introduce get_tx_bufs()
> vhost_net: introduce tx_can_batch()
> vhost_net: split out datacopy logic
> vhost_net: rename vhost_rx_signal_used() to vhost_net_signal_used()
> vhost_net: rename VHOST_RX_BATCH to VHOST_NET_BATCH
> vhost_net: batch update used ring for datacopy TX
>
> drivers/vhost/net.c | 249 +++++++++++++++++++++++++++++++++++++---------------
> 1 file changed, 179 insertions(+), 70 deletions(-)
>
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH v36 2/5] virtio_balloon: replace oom notifier with shrinker
From: Michael S. Tsirkin @ 2018-07-22 14:48 UTC (permalink / raw)
To: Wei Wang
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
liliang.opensource, linux-kernel, mhocko, linux-mm, pbonzini,
akpm, virtualization, torvalds
In-Reply-To: <1532075585-39067-3-git-send-email-wei.w.wang@intel.com>
On Fri, Jul 20, 2018 at 04:33:02PM +0800, Wei Wang wrote:
> The OOM notifier is getting deprecated to use for the reasons mentioned
> here by Michal Hocko: https://lkml.org/lkml/2018/7/12/314
>
> This patch replaces the virtio-balloon oom notifier with a shrinker
> to release balloon pages on memory pressure.
>
> In addition, the bug in the replaced virtballoon_oom_notify that only
> VIRTIO_BALLOON_ARRAY_PFNS_MAX (i.e 256) balloon pages can be freed
> though the user has specified more than that number is fixed in the
> shrinker_scan function.
>
> Signed-off-by: Wei Wang <wei.w.wang@intel.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Michal Hocko <mhocko@kernel.org>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Linus Torvalds <torvalds@linux-foundation.org>
> ---
> drivers/virtio/virtio_balloon.c | 113 +++++++++++++++++++++++-----------------
> 1 file changed, 65 insertions(+), 48 deletions(-)
>
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 9356a1a..c6fd406 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -27,7 +27,6 @@
> #include <linux/slab.h>
> #include <linux/module.h>
> #include <linux/balloon_compaction.h>
> -#include <linux/oom.h>
> #include <linux/wait.h>
> #include <linux/mm.h>
> #include <linux/mount.h>
> @@ -40,12 +39,12 @@
> */
> #define VIRTIO_BALLOON_PAGES_PER_PAGE (unsigned)(PAGE_SIZE >> VIRTIO_BALLOON_PFN_SHIFT)
> #define VIRTIO_BALLOON_ARRAY_PFNS_MAX 256
> -#define OOM_VBALLOON_DEFAULT_PAGES 256
> +#define DEFAULT_BALLOON_PAGES_TO_SHRINK 256
> #define VIRTBALLOON_OOM_NOTIFY_PRIORITY 80
>
> -static int oom_pages = OOM_VBALLOON_DEFAULT_PAGES;
> -module_param(oom_pages, int, S_IRUSR | S_IWUSR);
> -MODULE_PARM_DESC(oom_pages, "pages to free on OOM");
> +static unsigned long balloon_pages_to_shrink = DEFAULT_BALLOON_PAGES_TO_SHRINK;
> +module_param(balloon_pages_to_shrink, ulong, 0600);
> +MODULE_PARM_DESC(balloon_pages_to_shrink, "pages to free on memory presure");
>
> #ifdef CONFIG_BALLOON_COMPACTION
> static struct vfsmount *balloon_mnt;
> @@ -86,8 +85,8 @@ struct virtio_balloon {
> /* Memory statistics */
> struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR];
>
> - /* To register callback in oom notifier call chain */
> - struct notifier_block nb;
> + /* To register a shrinker to shrink memory upon memory pressure */
> + struct shrinker shrinker;
> };
>
> static struct virtio_device_id id_table[] = {
> @@ -365,38 +364,6 @@ static void update_balloon_size(struct virtio_balloon *vb)
> &actual);
> }
>
> -/*
> - * virtballoon_oom_notify - release pages when system is under severe
> - * memory pressure (called from out_of_memory())
> - * @self : notifier block struct
> - * @dummy: not used
> - * @parm : returned - number of freed pages
> - *
> - * The balancing of memory by use of the virtio balloon should not cause
> - * the termination of processes while there are pages in the balloon.
> - * If virtio balloon manages to release some memory, it will make the
> - * system return and retry the allocation that forced the OOM killer
> - * to run.
> - */
> -static int virtballoon_oom_notify(struct notifier_block *self,
> - unsigned long dummy, void *parm)
> -{
> - struct virtio_balloon *vb;
> - unsigned long *freed;
> - unsigned num_freed_pages;
> -
> - vb = container_of(self, struct virtio_balloon, nb);
> - if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> - return NOTIFY_OK;
> -
> - freed = parm;
> - num_freed_pages = leak_balloon(vb, oom_pages);
> - update_balloon_size(vb);
> - *freed += num_freed_pages;
> -
> - return NOTIFY_OK;
> -}
> -
> static void update_balloon_stats_func(struct work_struct *work)
> {
> struct virtio_balloon *vb;
> @@ -548,6 +515,61 @@ static struct file_system_type balloon_fs = {
>
> #endif /* CONFIG_BALLOON_COMPACTION */
>
> +static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker,
> + struct shrink_control *sc)
> +{
> + unsigned long pages_to_free = balloon_pages_to_shrink,
> + pages_freed = 0;
> + struct virtio_balloon *vb = container_of(shrinker,
> + struct virtio_balloon, shrinker);
> +
> + /*
> + * One invocation of leak_balloon can deflate at most
> + * VIRTIO_BALLOON_ARRAY_PFNS_MAX balloon pages, so we call it
> + * multiple times to deflate pages till reaching
> + * balloon_pages_to_shrink pages.
> + */
> + while (vb->num_pages && pages_to_free) {
> + pages_to_free = balloon_pages_to_shrink - pages_freed;
> + pages_freed += leak_balloon(vb, pages_to_free);
> + }
> + update_balloon_size(vb);
Are you sure that this is never called if count returned 0?
> +
> + return pages_freed / VIRTIO_BALLOON_PAGES_PER_PAGE;
> +}
> +
> +static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker,
> + struct shrink_control *sc)
> +{
> + struct virtio_balloon *vb = container_of(shrinker,
> + struct virtio_balloon, shrinker);
> +
> + /*
> + * We continue to use VIRTIO_BALLOON_F_DEFLATE_ON_OOM to handle the
> + * case when shrinker needs to be invoked to relieve memory pressure.
> + */
> + if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> + return 0;
So why not skip notifier registration when deflate on oom
is clear?
> +
> + return min_t(unsigned long, vb->num_pages, balloon_pages_to_shrink) /
> + VIRTIO_BALLOON_PAGES_PER_PAGE;
> +}
> +
> +static void virtio_balloon_unregister_shrinker(struct virtio_balloon *vb)
> +{
> + unregister_shrinker(&vb->shrinker);
> +}
> +
> +static int virtio_balloon_register_shrinker(struct virtio_balloon *vb)
> +{
> + vb->shrinker.scan_objects = virtio_balloon_shrinker_scan;
> + vb->shrinker.count_objects = virtio_balloon_shrinker_count;
> + vb->shrinker.batch = 0;
> + vb->shrinker.seeks = DEFAULT_SEEKS;
> +
> + return register_shrinker(&vb->shrinker);
> +}
> +
> static int virtballoon_probe(struct virtio_device *vdev)
> {
> struct virtio_balloon *vb;
> @@ -580,17 +602,10 @@ static int virtballoon_probe(struct virtio_device *vdev)
> if (err)
> goto out_free_vb;
>
> - vb->nb.notifier_call = virtballoon_oom_notify;
> - vb->nb.priority = VIRTBALLOON_OOM_NOTIFY_PRIORITY;
> - err = register_oom_notifier(&vb->nb);
> - if (err < 0)
> - goto out_del_vqs;
> -
> #ifdef CONFIG_BALLOON_COMPACTION
> balloon_mnt = kern_mount(&balloon_fs);
> if (IS_ERR(balloon_mnt)) {
> err = PTR_ERR(balloon_mnt);
> - unregister_oom_notifier(&vb->nb);
> goto out_del_vqs;
> }
>
> @@ -599,12 +614,14 @@ static int virtballoon_probe(struct virtio_device *vdev)
> if (IS_ERR(vb->vb_dev_info.inode)) {
> err = PTR_ERR(vb->vb_dev_info.inode);
> kern_unmount(balloon_mnt);
> - unregister_oom_notifier(&vb->nb);
> vb->vb_dev_info.inode = NULL;
> goto out_del_vqs;
> }
> vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
> #endif
> + err = virtio_balloon_register_shrinker(vb);
> + if (err)
> + goto out_del_vqs;
>
So we can get scans before device is ready. Leak will fail
then. Why not register later after device is ready?
> virtio_device_ready(vdev);
>
> @@ -637,7 +654,7 @@ static void virtballoon_remove(struct virtio_device *vdev)
> {
> struct virtio_balloon *vb = vdev->priv;
>
> - unregister_oom_notifier(&vb->nb);
> + virtio_balloon_unregister_shrinker(vb);
>
> spin_lock_irq(&vb->stop_update_lock);
> vb->stop_update = true;
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH net-next v6 1/4] net: vhost: lock the vqs one by one
From: Michael S. Tsirkin @ 2018-07-22 15:26 UTC (permalink / raw)
To: xiangxia.m.yue; +Cc: netdev, virtualization
In-Reply-To: <1532196242-2998-2-git-send-email-xiangxia.m.yue@gmail.com>
On Sat, Jul 21, 2018 at 11:03:59AM -0700, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> This patch changes the way that lock all vqs
> at the same, to lock them one by one. It will
> be used for next patch to avoid the deadlock.
>
> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> Acked-by: Jason Wang <jasowang@redhat.com>
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
> drivers/vhost/vhost.c | 24 +++++++-----------------
> 1 file changed, 7 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index a502f1a..a1c06e7 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
> {
> int i;
>
> - for (i = 0; i < d->nvqs; ++i)
> + for (i = 0; i < d->nvqs; ++i) {
> + mutex_lock(&d->vqs[i]->mutex);
> __vhost_vq_meta_reset(d->vqs[i]);
> + mutex_unlock(&d->vqs[i]->mutex);
> + }
> }
>
> static void vhost_vq_reset(struct vhost_dev *dev,
> @@ -890,20 +893,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
> #define vhost_get_used(vq, x, ptr) \
> vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
>
> -static void vhost_dev_lock_vqs(struct vhost_dev *d)
> -{
> - int i = 0;
> - for (i = 0; i < d->nvqs; ++i)
> - mutex_lock_nested(&d->vqs[i]->mutex, i);
> -}
> -
> -static void vhost_dev_unlock_vqs(struct vhost_dev *d)
> -{
> - int i = 0;
> - for (i = 0; i < d->nvqs; ++i)
> - mutex_unlock(&d->vqs[i]->mutex);
> -}
> -
> static int vhost_new_umem_range(struct vhost_umem *umem,
> u64 start, u64 size, u64 end,
> u64 userspace_addr, int perm)
> @@ -953,7 +942,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
> if (msg->iova <= vq_msg->iova &&
> msg->iova + msg->size - 1 > vq_msg->iova &&
> vq_msg->type == VHOST_IOTLB_MISS) {
> + mutex_lock(&node->vq->mutex);
> vhost_poll_queue(&node->vq->poll);
> + mutex_unlock(&node->vq->mutex);
> +
> list_del(&node->node);
> kfree(node);
> }
> @@ -985,7 +977,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
> int ret = 0;
>
> mutex_lock(&dev->mutex);
> - vhost_dev_lock_vqs(dev);
> switch (msg->type) {
> case VHOST_IOTLB_UPDATE:
> if (!dev->iotlb) {
> @@ -1019,7 +1010,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
> break;
> }
>
> - vhost_dev_unlock_vqs(dev);
> mutex_unlock(&dev->mutex);
>
> return ret;
I do prefer the finer-grained locking but I remember we
discussed something like this in the past and Jason saw issues
with such a locking.
Jason?
> --
> 1.8.3.1
^ permalink raw reply
* Re: [PATCH net-next 0/9] TX used ring batched updating for vhost
From: David Miller @ 2018-07-22 16:44 UTC (permalink / raw)
To: mst; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180722173622-mutt-send-email-mst@kernel.org>
From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Sun, 22 Jul 2018 17:37:05 +0300
> On Fri, Jul 20, 2018 at 08:15:12AM +0800, Jason Wang wrote:
>> Hi:
>>
>> This series implement batch updating of used ring for TX. This help to
>> reduce the cache contention on used ring. The idea is first split
>> datacopy path from zerocopy, and do only batching for datacopy. This
>> is because zercopy had already supported its own batching.
>>
>> TX PPS was increased 25.8% and Netperf TCP does not show obvious
>> differences.
>>
>> The split of datapath will also be helpful for future implementation
>> like in order completion.
>>
>> Please review.
>>
>> Thanks
>
> Acked-by: Michael S. Tsirkin <mst@redhat.com>
>
> I'm very happy with the split, the mixed data path became hard to
> maintain.
Thanks for reviewing.
Series applied, thanks everyone.
^ permalink raw reply
* Re: [PATCH net-next V2 0/8] Packed virtqueue support for vhost
From: Michael S. Tsirkin @ 2018-07-22 16:56 UTC (permalink / raw)
To: Jason Wang
Cc: kvm, netdev, linux-kernel, virtualization, maxime.coquelin, wexu
In-Reply-To: <5ba5c927-a0b4-f399-7a88-b90763765142@redhat.com>
On Tue, Jul 17, 2018 at 08:45:16AM +0800, Jason Wang wrote:
> > I'm not sure I understand this approach. Packed ring is just an optimization.
> > What value is there in merging it if it does not help speed?
>
> If you want to support migration from dpdk or vDPA backend.
Migration from dpdk is a mess: if you add new features you
fix migration from new one but break migration from old one.
So I'm not too worried until dpdk guys implement one of
the migration versioning proposals that have been floating
around for years.
I think vDPAs are using split ring right now. They will likely
switch to packed ring in the future, but it will probably
take some time before they do.
> And we still
> have the chance to see the performance with virito-net pmd in the future. If
> this does not make sense for you, I will leave this series until we can get
> results from virtio-net pmd (or find a way that packed virtqueue
> outperform).
This makes sense to me. If there's a gain that is only
observed with the pmd driver, I think that's still fine,
but we do need to see it.
> And I will start to post other optimizations on vhost.
>
> Thanks
Thanks a lot!
--
MST
^ permalink raw reply
* Re: [RFC 4/4] virtio: Add platform specific DMA API translation for virito devices
From: Anshuman Khandual @ 2018-07-23 2:16 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: robh, srikar, benh, linuxram, linux-kernel, virtualization, hch,
paulus, mpe, joe, linuxppc-dev, elfring, haren, david
In-Reply-To: <20180720160550-mutt-send-email-mst@kernel.org>
On 07/20/2018 06:45 PM, Michael S. Tsirkin wrote:
> On Fri, Jul 20, 2018 at 09:29:41AM +0530, Anshuman Khandual wrote:
>> Subject: Re: [RFC 4/4] virtio: Add platform specific DMA API translation for
>> virito devices
>
> s/virito/virtio/
Oops, will fix it. Thanks for pointing out.
>
>> This adds a hook which a platform can define in order to allow it to
>> override virtio device's DMA OPS irrespective of whether it has the
>> flag VIRTIO_F_IOMMU_PLATFORM set or not. We want to use this to do
>> bounce-buffering of data on the new secure pSeries platform, currently
>> under development, where a KVM host cannot access all of the memory
>> space of a secure KVM guest. The host can only access the pages which
>> the guest has explicitly requested to be shared with the host, thus
>> the virtio implementation in the guest has to copy data to and from
>> shared pages.
>>
>> With this hook, the platform code in the secure guest can force the
>> use of swiotlb for virtio buffers, with a back-end for swiotlb which
>> will use a pool of pre-allocated shared pages. Thus all data being
>> sent or received by virtio devices will be copied through pages which
>> the host has access to.
>>
>> Signed-off-by: Anshuman Khandual <khandual@linux.vnet.ibm.com>
>> ---
>> arch/powerpc/include/asm/dma-mapping.h | 6 ++++++
>> arch/powerpc/platforms/pseries/iommu.c | 6 ++++++
>> drivers/virtio/virtio.c | 7 +++++++
>> 3 files changed, 19 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/dma-mapping.h b/arch/powerpc/include/asm/dma-mapping.h
>> index 8fa3945..bc5a9d3 100644
>> --- a/arch/powerpc/include/asm/dma-mapping.h
>> +++ b/arch/powerpc/include/asm/dma-mapping.h
>> @@ -116,3 +116,9 @@ extern u64 __dma_get_required_mask(struct device *dev);
>>
>> #endif /* __KERNEL__ */
>> #endif /* _ASM_DMA_MAPPING_H */
>> +
>> +#define platform_override_dma_ops platform_override_dma_ops
>> +
>> +struct virtio_device;
>> +
>> +extern void platform_override_dma_ops(struct virtio_device *vdev);
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>> index 06f0296..5773bc7 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -38,6 +38,7 @@
>> #include <linux/of.h>
>> #include <linux/iommu.h>
>> #include <linux/rculist.h>
>> +#include <linux/virtio.h>
>> #include <asm/io.h>
>> #include <asm/prom.h>
>> #include <asm/rtas.h>
>> @@ -1396,3 +1397,8 @@ static int __init disable_multitce(char *str)
>> __setup("multitce=", disable_multitce);
>>
>> machine_subsys_initcall_sync(pseries, tce_iommu_bus_notifier_init);
>> +
>> +void platform_override_dma_ops(struct virtio_device *vdev)
>> +{
>> + /* Override vdev->parent.dma_ops if required */
>> +}
>> diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
>> index 6b13987..432c332 100644
>> --- a/drivers/virtio/virtio.c
>> +++ b/drivers/virtio/virtio.c
>> @@ -168,6 +168,12 @@ EXPORT_SYMBOL_GPL(virtio_add_status);
>>
>> const struct dma_map_ops virtio_direct_dma_ops;
>>
>> +#ifndef platform_override_dma_ops
>> +static inline void platform_override_dma_ops(struct virtio_device *vdev)
>> +{
>> +}
>> +#endif
>> +
>> int virtio_finalize_features(struct virtio_device *dev)
>> {
>> int ret = dev->config->finalize_features(dev);
>> @@ -179,6 +185,7 @@ int virtio_finalize_features(struct virtio_device *dev)
>> if (virtio_has_iommu_quirk(dev))
>> set_dma_ops(dev->dev.parent, &virtio_direct_dma_ops);
>>
>> + platform_override_dma_ops(dev);
>
> Is there a single place where virtio_has_iommu_quirk is called now?
Not other than this one. But in the proposed implementation of
platform_override_dma_ops on powerpc, we will again check on
virtio_has_iommu_quirk before overriding it with SWIOTLB.
void platform_override_dma_ops(struct virtio_device *vdev)
{
if (is_ultravisor_platform() && virtio_has_iommu_quirk(vdev))
set_dma_ops(vdev->dev.parent, &swiotlb_dma_ops);
}
> If so, we could put this into virtio_has_iommu_quirk then.
Did you mean platform_override_dma_ops instead ? If so, yes that
is possible. Default implementation of platform_override_dma_ops
should just check on VIRTIO_F_IOMMU_PLATFORM feature and override
with virtio_direct_dma_ops but arch implementation can check on
what ever else they would like and override appropriately.
Default platform_override_dma_ops will be like this
#ifndef platform_override_dma_ops
static inline void platform_override_dma_ops(struct virtio_device *vdev)
{
if(!virtio_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM))
set_dma_ops(dev->dev.parent, &virtio_direct_dma_ops);
}
#endif
Proposed powerpc implementation will be like this instead
void platform_override_dma_ops(struct virtio_device *vdev)
{
if (virtio_has_feature(vdev, VIRTIO_F_IOMMU_PLATFORM))
return;
if (is_ultravisor_platform())
set_dma_ops(vdev->dev.parent, &swiotlb_dma_ops);
else
set_dma_ops(dev->dev.parent, &virtio_direct_dma_ops);
}
There is a redundant definition of virtio_has_iommu_quirk in the tools
directory (tools/virtio/linux/virtio_config.h) which does not seem to
be used any where. I guess that can be removed without problem.
^ permalink raw reply
* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Anshuman Khandual @ 2018-07-23 6:28 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: robh, srikar, linuxram, linux-kernel, virtualization, hch, paulus,
joe, linuxppc-dev, elfring, haren, david
In-Reply-To: <20180720161541-mutt-send-email-mst@kernel.org>
On 07/20/2018 06:46 PM, Michael S. Tsirkin wrote:
> On Fri, Jul 20, 2018 at 09:29:37AM +0530, Anshuman Khandual wrote:
>> This patch series is the follow up on the discussions we had before about
>> the RFC titled [RFC,V2] virtio: Add platform specific DMA API translation
>> for virito devices (https://patchwork.kernel.org/patch/10417371/). There
>> were suggestions about doing away with two different paths of transactions
>> with the host/QEMU, first being the direct GPA and the other being the DMA
>> API based translations.
>>
>> First patch attempts to create a direct GPA mapping based DMA operations
>> structure called 'virtio_direct_dma_ops' with exact same implementation
>> of the direct GPA path which virtio core currently has but just wrapped in
>> a DMA API format. Virtio core must use 'virtio_direct_dma_ops' instead of
>> the arch default in absence of VIRTIO_F_IOMMU_PLATFORM flag to preserve the
>> existing semantics. The second patch does exactly that inside the function
>> virtio_finalize_features(). The third patch removes the default direct GPA
>> path from virtio core forcing it to use DMA API callbacks for all devices.
>> Now with that change, every device must have a DMA operations structure
>> associated with it. The fourth patch adds an additional hook which gives
>> the platform an opportunity to do yet another override if required. This
>> platform hook can be used on POWER Ultravisor based protected guests to
>> load up SWIOTLB DMA callbacks to do the required (as discussed previously
>> in the above mentioned thread how host is allowed to access only parts of
>> the guest GPA range) bounce buffering into the shared memory for all I/O
>> scatter gather buffers to be consumed on the host side.
>>
>> Please go through these patches and review whether this approach broadly
>> makes sense. I will appreciate suggestions, inputs, comments regarding
>> the patches or the approach in general. Thank you.
> I like how patches 1-3 look. Could you test performance
> with/without to see whether the extra indirection through
> use of DMA ops causes a measurable slow-down?
I ran this simple DD command 10 times where /dev/vda is a virtio block
device of 10GB size.
dd if=/dev/zero of=/dev/vda bs=8M count=1024 oflag=direct
With and without patches bandwidth which has a bit wide range does not
look that different from each other.
Without patches
===============
---------- 1 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.95557 s, 4.4 GB/s
---------- 2 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 2.05176 s, 4.2 GB/s
---------- 3 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.88314 s, 4.6 GB/s
---------- 4 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.84899 s, 4.6 GB/s
---------- 5 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 5.37184 s, 1.6 GB/s
---------- 6 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.9205 s, 4.5 GB/s
---------- 7 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 6.85166 s, 1.3 GB/s
---------- 8 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.74049 s, 4.9 GB/s
---------- 9 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 6.31699 s, 1.4 GB/s
---------- 10 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 2.47057 s, 3.5 GB/s
With patches
============
---------- 1 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 2.25993 s, 3.8 GB/s
---------- 2 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.82438 s, 4.7 GB/s
---------- 3 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.93856 s, 4.4 GB/s
---------- 4 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.83405 s, 4.7 GB/s
---------- 5 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 7.50199 s, 1.1 GB/s
---------- 6 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 2.28742 s, 3.8 GB/s
---------- 7 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 5.74958 s, 1.5 GB/s
---------- 8 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.99149 s, 4.3 GB/s
---------- 9 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 5.67647 s, 1.5 GB/s
---------- 10 ---------
1024+0 records in
1024+0 records out
8589934592 bytes (8.6 GB, 8.0 GiB) copied, 2.93957 s, 2.9 GB/s
Does this look okay ?
^ permalink raw reply
* Patch "x86/paravirt: Make native_save_fl() extern inline" has been added to the 4.4-stable tree
From: gregkh @ 2018-07-23 7:58 UTC (permalink / raw)
To: 20180621162324.36656-4-ndesaulniers, acme, akataria, akpm,
andrea.parri, ard.biesheuvel, arnd, aryabinin, astrachan,
boris.ostrovsky, brijesh.singh, caoj.fnst, geert, ghackmann,
gregkh, hpa, jan.kiszka, jarkko.sakkinen, jgross, joe, jpoimboe,
keescook, kirill.shutemov, kstewart, manojgupta, mawilcox,
michal.lkml, mingo, mjg59, mka
Cc: stable-commits
This is a note to let you know that I've just added the patch titled
x86/paravirt: Make native_save_fl() extern inline
to the 4.4-stable tree which can be found at:
http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
The filename of the patch is:
x86-paravirt-make-native_save_fl-extern-inline.patch
and it can be found in the queue-4.4 subdirectory.
If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@vger.kernel.org> know about it.
From d0a8d9378d16eb3c69bd8e6d23779fbdbee3a8c7 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <ndesaulniers@google.com>
Date: Thu, 21 Jun 2018 09:23:24 -0700
Subject: x86/paravirt: Make native_save_fl() extern inline
From: Nick Desaulniers <ndesaulniers@google.com>
commit d0a8d9378d16eb3c69bd8e6d23779fbdbee3a8c7 upstream.
native_save_fl() is marked static inline, but by using it as
a function pointer in arch/x86/kernel/paravirt.c, it MUST be outlined.
paravirt's use of native_save_fl() also requires that no GPRs other than
%rax are clobbered.
Compilers have different heuristics which they use to emit stack guard
code, the emittance of which can break paravirt's callee saved assumption
by clobbering %rcx.
Marking a function definition extern inline means that if this version
cannot be inlined, then the out-of-line version will be preferred. By
having the out-of-line version be implemented in assembly, it cannot be
instrumented with a stack protector, which might violate custom calling
conventions that code like paravirt rely on.
The semantics of extern inline has changed since gnu89. This means that
folks using GCC versions >= 5.1 may see symbol redefinition errors at
link time for subdirs that override KBUILD_CFLAGS (making the C standard
used implicit) regardless of this patch. This has been cleaned up
earlier in the patch set, but is left as a note in the commit message
for future travelers.
Reports:
https://lkml.org/lkml/2018/5/7/534
https://github.com/ClangBuiltLinux/linux/issues/16
Discussion:
https://bugs.llvm.org/show_bug.cgi?id=37512
https://lkml.org/lkml/2018/5/24/1371
Thanks to the many folks that participated in the discussion.
[Backport for 4.4. 4.4 is missing commit 784d5699eddc "x86: move exports to
actual definitions" which doesn't apply cleanly, and not really worth
backporting IMO. It's simpler to change this patch from upstream:
+ #include <asm-generic/export.h>
rather than
+ #include <asm/export.h>]
Debugged-by: Alistair Strachan <astrachan@google.com>
Debugged-by: Matthias Kaehlcke <mka@chromium.org>
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Suggested-by: H. Peter Anvin <hpa@zytor.com>
Suggested-by: Tom Stellar <tstellar@redhat.com>
Reported-by: Sedat Dilek <sedat.dilek@gmail.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@redhat.com
Cc: akataria@vmware.com
Cc: akpm@linux-foundation.org
Cc: andrea.parri@amarulasolutions.com
Cc: ard.biesheuvel@linaro.org
Cc: aryabinin@virtuozzo.com
Cc: astrachan@google.com
Cc: boris.ostrovsky@oracle.com
Cc: brijesh.singh@amd.com
Cc: caoj.fnst@cn.fujitsu.com
Cc: geert@linux-m68k.org
Cc: ghackmann@google.com
Cc: gregkh@linuxfoundation.org
Cc: jan.kiszka@siemens.com
Cc: jarkko.sakkinen@linux.intel.com
Cc: joe@perches.com
Cc: jpoimboe@redhat.com
Cc: keescook@google.com
Cc: kirill.shutemov@linux.intel.com
Cc: kstewart@linuxfoundation.org
Cc: linux-efi@vger.kernel.org
Cc: linux-kbuild@vger.kernel.org
Cc: manojgupta@google.com
Cc: mawilcox@microsoft.com
Cc: michal.lkml@markovi.net
Cc: mjg59@google.com
Cc: mka@chromium.org
Cc: pombredanne@nexb.com
Cc: rientjes@google.com
Cc: rostedt@goodmis.org
Cc: thomas.lendacky@amd.com
Cc: tweek@google.com
Cc: virtualization@lists.linux-foundation.org
Cc: will.deacon@arm.com
Cc: yamada.masahiro@socionext.com
Link: http://lkml.kernel.org/r/20180621162324.36656-4-ndesaulniers@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/include/asm/irqflags.h | 2 +-
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/irqflags.S | 26 ++++++++++++++++++++++++++
3 files changed, 28 insertions(+), 1 deletion(-)
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -8,7 +8,7 @@
* Interrupt control:
*/
-static inline unsigned long native_save_fl(void)
+extern inline unsigned long native_save_fl(void)
{
unsigned long flags;
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -41,6 +41,7 @@ obj-y += alternative.o i8253.o pci-nom
obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o
+obj-y += irqflags.o
obj-y += process.o
obj-y += fpu/
--- /dev/null
+++ b/arch/x86/kernel/irqflags.S
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+#include <linux/linkage.h>
+
+/*
+ * unsigned long native_save_fl(void)
+ */
+ENTRY(native_save_fl)
+ pushf
+ pop %_ASM_AX
+ ret
+ENDPROC(native_save_fl)
+EXPORT_SYMBOL(native_save_fl)
+
+/*
+ * void native_restore_fl(unsigned long flags)
+ * %eax/%rdi: flags
+ */
+ENTRY(native_restore_fl)
+ push %_ASM_ARG1
+ popf
+ ret
+ENDPROC(native_restore_fl)
+EXPORT_SYMBOL(native_restore_fl)
Patches currently in stable-queue which might be from ndesaulniers@google.com are
queue-4.4/x86-paravirt-make-native_save_fl-extern-inline.patch
^ permalink raw reply
* Re: [RFC 0/4] Virtio uses DMA API for all devices
From: Michael S. Tsirkin @ 2018-07-23 9:08 UTC (permalink / raw)
To: Anshuman Khandual
Cc: robh, srikar, linuxram, linux-kernel, virtualization, hch, paulus,
joe, linuxppc-dev, elfring, haren, david
In-Reply-To: <8f51d2c6-cc0c-9e42-f0fd-a8a33acc8b83@linux.vnet.ibm.com>
On Mon, Jul 23, 2018 at 11:58:23AM +0530, Anshuman Khandual wrote:
> On 07/20/2018 06:46 PM, Michael S. Tsirkin wrote:
> > On Fri, Jul 20, 2018 at 09:29:37AM +0530, Anshuman Khandual wrote:
> >> This patch series is the follow up on the discussions we had before about
> >> the RFC titled [RFC,V2] virtio: Add platform specific DMA API translation
> >> for virito devices (https://patchwork.kernel.org/patch/10417371/). There
> >> were suggestions about doing away with two different paths of transactions
> >> with the host/QEMU, first being the direct GPA and the other being the DMA
> >> API based translations.
> >>
> >> First patch attempts to create a direct GPA mapping based DMA operations
> >> structure called 'virtio_direct_dma_ops' with exact same implementation
> >> of the direct GPA path which virtio core currently has but just wrapped in
> >> a DMA API format. Virtio core must use 'virtio_direct_dma_ops' instead of
> >> the arch default in absence of VIRTIO_F_IOMMU_PLATFORM flag to preserve the
> >> existing semantics. The second patch does exactly that inside the function
> >> virtio_finalize_features(). The third patch removes the default direct GPA
> >> path from virtio core forcing it to use DMA API callbacks for all devices.
> >> Now with that change, every device must have a DMA operations structure
> >> associated with it. The fourth patch adds an additional hook which gives
> >> the platform an opportunity to do yet another override if required. This
> >> platform hook can be used on POWER Ultravisor based protected guests to
> >> load up SWIOTLB DMA callbacks to do the required (as discussed previously
> >> in the above mentioned thread how host is allowed to access only parts of
> >> the guest GPA range) bounce buffering into the shared memory for all I/O
> >> scatter gather buffers to be consumed on the host side.
> >>
> >> Please go through these patches and review whether this approach broadly
> >> makes sense. I will appreciate suggestions, inputs, comments regarding
> >> the patches or the approach in general. Thank you.
> > I like how patches 1-3 look. Could you test performance
> > with/without to see whether the extra indirection through
> > use of DMA ops causes a measurable slow-down?
>
> I ran this simple DD command 10 times where /dev/vda is a virtio block
> device of 10GB size.
>
> dd if=/dev/zero of=/dev/vda bs=8M count=1024 oflag=direct
>
> With and without patches bandwidth which has a bit wide range does not
> look that different from each other.
>
> Without patches
> ===============
>
> ---------- 1 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.95557 s, 4.4 GB/s
> ---------- 2 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 2.05176 s, 4.2 GB/s
> ---------- 3 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.88314 s, 4.6 GB/s
> ---------- 4 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.84899 s, 4.6 GB/s
> ---------- 5 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 5.37184 s, 1.6 GB/s
> ---------- 6 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.9205 s, 4.5 GB/s
> ---------- 7 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 6.85166 s, 1.3 GB/s
> ---------- 8 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.74049 s, 4.9 GB/s
> ---------- 9 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 6.31699 s, 1.4 GB/s
> ---------- 10 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 2.47057 s, 3.5 GB/s
>
>
> With patches
> ============
>
> ---------- 1 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 2.25993 s, 3.8 GB/s
> ---------- 2 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.82438 s, 4.7 GB/s
> ---------- 3 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.93856 s, 4.4 GB/s
> ---------- 4 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.83405 s, 4.7 GB/s
> ---------- 5 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 7.50199 s, 1.1 GB/s
> ---------- 6 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 2.28742 s, 3.8 GB/s
> ---------- 7 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 5.74958 s, 1.5 GB/s
> ---------- 8 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 1.99149 s, 4.3 GB/s
> ---------- 9 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 5.67647 s, 1.5 GB/s
> ---------- 10 ---------
> 1024+0 records in
> 1024+0 records out
> 8589934592 bytes (8.6 GB, 8.0 GiB) copied, 2.93957 s, 2.9 GB/s
>
> Does this look okay ?
You want to test IOPS with lots of small writes and using
raw ramdisk on host.
--
MST
^ permalink raw reply
* Re: [PATCH net-next v6 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: Toshiaki Makita @ 2018-07-23 9:57 UTC (permalink / raw)
To: xiangxia.m.yue, jasowang; +Cc: netdev, virtualization, mst
In-Reply-To: <1532196242-2998-4-git-send-email-xiangxia.m.yue@gmail.com>
On 2018/07/22 3:04, xiangxia.m.yue@gmail.com wrote:
> From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
>
> Factor out generic busy polling logic and will be
> used for in tx path in the next patch. And with the patch,
> qemu can set differently the busyloop_timeout for rx queue.
>
> Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> ---
...
> +static void vhost_net_busy_poll_vq_check(struct vhost_net *net,
> + struct vhost_virtqueue *rvq,
> + struct vhost_virtqueue *tvq,
> + bool rx)
> +{
> + struct socket *sock = rvq->private_data;
> +
> + if (rx) {
> + if (!vhost_vq_avail_empty(&net->dev, tvq)) {
> + vhost_poll_queue(&tvq->poll);
> + } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
> + vhost_disable_notify(&net->dev, tvq);
> + vhost_poll_queue(&tvq->poll);
> + }
> + } else if ((sock && sk_has_rx_data(sock->sk)) &&
> + !vhost_vq_avail_empty(&net->dev, rvq)) {
> + vhost_poll_queue(&rvq->poll);
Now we wait for vq_avail for rx as well, I think you cannot skip
vhost_enable_notify() on tx. Probably you might want to do:
} else if (sock && sk_has_rx_data(sock->sk)) {
if (!vhost_vq_avail_empty(&net->dev, rvq)) {
vhost_poll_queue(&rvq->poll);
} else if (unlikely(vhost_enable_notify(&net->dev, rvq))) {
vhost_disable_notify(&net->dev, rvq);
vhost_poll_queue(&rvq->poll);
}
}
Also it's better to care vhost_net_disable_vq()/vhost_net_enable_vq() on tx?
--
Toshiaki Makita
^ permalink raw reply
* Re: [PATCH v36 2/5] virtio_balloon: replace oom notifier with shrinker
From: Wei Wang @ 2018-07-23 10:30 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
liliang.opensource, linux-kernel, mhocko, linux-mm, pbonzini,
akpm, virtualization, torvalds
In-Reply-To: <20180722174125-mutt-send-email-mst@kernel.org>
On 07/22/2018 10:48 PM, Michael S. Tsirkin wrote:
> On Fri, Jul 20, 2018 at 04:33:02PM +0800, Wei Wang wrote:
>>
>> +static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker,
>> + struct shrink_control *sc)
>> +{
>> + unsigned long pages_to_free = balloon_pages_to_shrink,
>> + pages_freed = 0;
>> + struct virtio_balloon *vb = container_of(shrinker,
>> + struct virtio_balloon, shrinker);
>> +
>> + /*
>> + * One invocation of leak_balloon can deflate at most
>> + * VIRTIO_BALLOON_ARRAY_PFNS_MAX balloon pages, so we call it
>> + * multiple times to deflate pages till reaching
>> + * balloon_pages_to_shrink pages.
>> + */
>> + while (vb->num_pages && pages_to_free) {
>> + pages_to_free = balloon_pages_to_shrink - pages_freed;
>> + pages_freed += leak_balloon(vb, pages_to_free);
>> + }
>> + update_balloon_size(vb);
> Are you sure that this is never called if count returned 0?
Yes. Please see do_shrink_slab, it just returns if count is 0.
>
>> +
>> + return pages_freed / VIRTIO_BALLOON_PAGES_PER_PAGE;
>> +}
>> +
>> +static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker,
>> + struct shrink_control *sc)
>> +{
>> + struct virtio_balloon *vb = container_of(shrinker,
>> + struct virtio_balloon, shrinker);
>> +
>> + /*
>> + * We continue to use VIRTIO_BALLOON_F_DEFLATE_ON_OOM to handle the
>> + * case when shrinker needs to be invoked to relieve memory pressure.
>> + */
>> + if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
>> + return 0;
> So why not skip notifier registration when deflate on oom
> is clear?
Sounds good, thanks.
> vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
> #endif
> + err = virtio_balloon_register_shrinker(vb);
> + if (err)
> + goto out_del_vqs;
>
> So we can get scans before device is ready. Leak will fail
> then. Why not register later after device is ready?
Probably no.
- it would be better not to set device ready when register_shrinker failed.
- When the device isn't ready, ballooning won't happen, that is,
vb->num_pages will be 0, which results in shrinker_count=0 and
shrinker_scan won't be called.
So I think it would be better to have shrinker registered before
device_ready.
Best,
Wei
^ permalink raw reply
* [PATCH 4.4 018/107] x86/paravirt: Make native_save_fl() extern inline
From: Greg Kroah-Hartman @ 2018-07-23 12:41 UTC (permalink / raw)
To: linux-kernel
Cc: andrea.parri, kstewart, linux-efi, brijesh.singh, Peter Zijlstra,
jan.kiszka, will.deacon, jarkko.sakkinen, virtualization,
yamada.masahiro, manojgupta, H. Peter Anvin, Thomas Gleixner,
tweek, mawilcox, akataria, ghackmann, Ingo Molnar, mjg59, mka,
geert, rientjes, aryabinin, thomas.lendacky, Arnd Bergmann,
linux-kbuild, pombredanne, rostedt, acme, caoj.fnst, jpoimboe,
Sedat Dilek, boris.ostrovsky
In-Reply-To: <20180723122413.003644357@linuxfoundation.org>
4.4-stable review patch. If anyone has any objections, please let me know.
------------------
From: Nick Desaulniers <ndesaulniers@google.com>
commit d0a8d9378d16eb3c69bd8e6d23779fbdbee3a8c7 upstream.
native_save_fl() is marked static inline, but by using it as
a function pointer in arch/x86/kernel/paravirt.c, it MUST be outlined.
paravirt's use of native_save_fl() also requires that no GPRs other than
%rax are clobbered.
Compilers have different heuristics which they use to emit stack guard
code, the emittance of which can break paravirt's callee saved assumption
by clobbering %rcx.
Marking a function definition extern inline means that if this version
cannot be inlined, then the out-of-line version will be preferred. By
having the out-of-line version be implemented in assembly, it cannot be
instrumented with a stack protector, which might violate custom calling
conventions that code like paravirt rely on.
The semantics of extern inline has changed since gnu89. This means that
folks using GCC versions >= 5.1 may see symbol redefinition errors at
link time for subdirs that override KBUILD_CFLAGS (making the C standard
used implicit) regardless of this patch. This has been cleaned up
earlier in the patch set, but is left as a note in the commit message
for future travelers.
Reports:
https://lkml.org/lkml/2018/5/7/534
https://github.com/ClangBuiltLinux/linux/issues/16
Discussion:
https://bugs.llvm.org/show_bug.cgi?id=37512
https://lkml.org/lkml/2018/5/24/1371
Thanks to the many folks that participated in the discussion.
[Backport for 4.4. 4.4 is missing commit 784d5699eddc "x86: move exports to
actual definitions" which doesn't apply cleanly, and not really worth
backporting IMO. It's simpler to change this patch from upstream:
+ #include <asm-generic/export.h>
rather than
+ #include <asm/export.h>]
Debugged-by: Alistair Strachan <astrachan@google.com>
Debugged-by: Matthias Kaehlcke <mka@chromium.org>
Suggested-by: Arnd Bergmann <arnd@arndb.de>
Suggested-by: H. Peter Anvin <hpa@zytor.com>
Suggested-by: Tom Stellar <tstellar@redhat.com>
Reported-by: Sedat Dilek <sedat.dilek@gmail.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Nick Desaulniers <ndesaulniers@google.com>
Acked-by: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@redhat.com
Cc: akataria@vmware.com
Cc: akpm@linux-foundation.org
Cc: andrea.parri@amarulasolutions.com
Cc: ard.biesheuvel@linaro.org
Cc: aryabinin@virtuozzo.com
Cc: astrachan@google.com
Cc: boris.ostrovsky@oracle.com
Cc: brijesh.singh@amd.com
Cc: caoj.fnst@cn.fujitsu.com
Cc: geert@linux-m68k.org
Cc: ghackmann@google.com
Cc: gregkh@linuxfoundation.org
Cc: jan.kiszka@siemens.com
Cc: jarkko.sakkinen@linux.intel.com
Cc: joe@perches.com
Cc: jpoimboe@redhat.com
Cc: keescook@google.com
Cc: kirill.shutemov@linux.intel.com
Cc: kstewart@linuxfoundation.org
Cc: linux-efi@vger.kernel.org
Cc: linux-kbuild@vger.kernel.org
Cc: manojgupta@google.com
Cc: mawilcox@microsoft.com
Cc: michal.lkml@markovi.net
Cc: mjg59@google.com
Cc: mka@chromium.org
Cc: pombredanne@nexb.com
Cc: rientjes@google.com
Cc: rostedt@goodmis.org
Cc: thomas.lendacky@amd.com
Cc: tweek@google.com
Cc: virtualization@lists.linux-foundation.org
Cc: will.deacon@arm.com
Cc: yamada.masahiro@socionext.com
Link: http://lkml.kernel.org/r/20180621162324.36656-4-ndesaulniers@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
arch/x86/include/asm/irqflags.h | 2 +-
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/irqflags.S | 26 ++++++++++++++++++++++++++
3 files changed, 28 insertions(+), 1 deletion(-)
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -8,7 +8,7 @@
* Interrupt control:
*/
-static inline unsigned long native_save_fl(void)
+extern inline unsigned long native_save_fl(void)
{
unsigned long flags;
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -41,6 +41,7 @@ obj-y += alternative.o i8253.o pci-nom
obj-y += tsc.o tsc_msr.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o
+obj-y += irqflags.o
obj-y += process.o
obj-y += fpu/
--- /dev/null
+++ b/arch/x86/kernel/irqflags.S
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <asm/asm.h>
+#include <asm-generic/export.h>
+#include <linux/linkage.h>
+
+/*
+ * unsigned long native_save_fl(void)
+ */
+ENTRY(native_save_fl)
+ pushf
+ pop %_ASM_AX
+ ret
+ENDPROC(native_save_fl)
+EXPORT_SYMBOL(native_save_fl)
+
+/*
+ * void native_restore_fl(unsigned long flags)
+ * %eax/%rdi: flags
+ */
+ENTRY(native_restore_fl)
+ push %_ASM_ARG1
+ popf
+ ret
+ENDPROC(native_restore_fl)
+EXPORT_SYMBOL(native_restore_fl)
^ permalink raw reply
* Re: [PATCH net-next v6 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()
From: Tonghao Zhang @ 2018-07-23 12:43 UTC (permalink / raw)
To: makita.toshiaki; +Cc: Linux Kernel Network Developers, virtualization, mst
In-Reply-To: <c520c142-ef36-7918-7135-51258c17bd83@lab.ntt.co.jp>
On Mon, Jul 23, 2018 at 5:58 PM Toshiaki Makita
<makita.toshiaki@lab.ntt.co.jp> wrote:
>
> On 2018/07/22 3:04, xiangxia.m.yue@gmail.com wrote:
> > From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> >
> > Factor out generic busy polling logic and will be
> > used for in tx path in the next patch. And with the patch,
> > qemu can set differently the busyloop_timeout for rx queue.
> >
> > Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
> > ---
> ...
> > +static void vhost_net_busy_poll_vq_check(struct vhost_net *net,
> > + struct vhost_virtqueue *rvq,
> > + struct vhost_virtqueue *tvq,
> > + bool rx)
> > +{
> > + struct socket *sock = rvq->private_data;
> > +
> > + if (rx) {
> > + if (!vhost_vq_avail_empty(&net->dev, tvq)) {
> > + vhost_poll_queue(&tvq->poll);
> > + } else if (unlikely(vhost_enable_notify(&net->dev, tvq))) {
> > + vhost_disable_notify(&net->dev, tvq);
> > + vhost_poll_queue(&tvq->poll);
> > + }
> > + } else if ((sock && sk_has_rx_data(sock->sk)) &&
> > + !vhost_vq_avail_empty(&net->dev, rvq)) {
> > + vhost_poll_queue(&rvq->poll);
>
> Now we wait for vq_avail for rx as well, I think you cannot skip
> vhost_enable_notify() on tx. Probably you might want to do:
I think vhost_enable_notify is needed.
> } else if (sock && sk_has_rx_data(sock->sk)) {
> if (!vhost_vq_avail_empty(&net->dev, rvq)) {
> vhost_poll_queue(&rvq->poll);
> } else if (unlikely(vhost_enable_notify(&net->dev, rvq))) {
> vhost_disable_notify(&net->dev, rvq);
> vhost_poll_queue(&rvq->poll);
> }
> }
As Jason review as before, we only want rx kick when packet is pending at
socket but we're out of available buffers. So we just enable notify,
but not poll it ?
} else if ((sock && sk_has_rx_data(sock->sk)) &&
!vhost_vq_avail_empty(&net->dev, rvq)) {
vhost_poll_queue(&rvq->poll);
else {
vhost_enable_notify(&net->dev, rvq);
}
> Also it's better to care vhost_net_disable_vq()/vhost_net_enable_vq() on tx?
I cant find why it is better, if necessary, we can do it.
> --
> Toshiaki Makita
>
^ permalink raw reply
* Re: [PATCH v36 0/5] Virtio-balloon: support free page reporting
From: Michael S. Tsirkin @ 2018-07-23 14:07 UTC (permalink / raw)
To: Wei Wang
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
liliang.opensource, linux-kernel, mhocko, linux-mm, pbonzini,
akpm, virtualization, torvalds, dgilbert
In-Reply-To: <1532075585-39067-1-git-send-email-wei.w.wang@intel.com>
On Fri, Jul 20, 2018 at 04:33:00PM +0800, Wei Wang wrote:
> This patch series is separated from the previous "Virtio-balloon
> Enhancement" series. The new feature, VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> implemented by this series enables the virtio-balloon driver to report
> hints of guest free pages to the host. It can be used to accelerate live
> migration of VMs. Here is an introduction of this usage:
>
> Live migration needs to transfer the VM's memory from the source machine
> to the destination round by round. For the 1st round, all the VM's memory
> is transferred. From the 2nd round, only the pieces of memory that were
> written by the guest (after the 1st round) are transferred. One method
> that is popularly used by the hypervisor to track which part of memory is
> written is to write-protect all the guest memory.
>
> This feature enables the optimization by skipping the transfer of guest
> free pages during VM live migration. It is not concerned that the memory
> pages are used after they are given to the hypervisor as a hint of the
> free pages, because they will be tracked by the hypervisor and transferred
> in the subsequent round if they are used and written.
>
> * Tests
> - Test Environment
> Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
> Guest: 8G RAM, 4 vCPU
> Migration setup: migrate_set_speed 100G, migrate_set_downtime 2 second
>
> - Test Results
> - Idle Guest Live Migration Time (results are averaged over 10 runs):
> - Optimization v.s. Legacy = 409ms vs 1757ms --> ~77% reduction
> (setting page poisoning zero and enabling ksm don't affect the
> comparison result)
> - Guest with Linux Compilation Workload (make bzImage -j4):
> - Live Migration Time (average)
> Optimization v.s. Legacy = 1407ms v.s. 2528ms --> ~44% reduction
> - Linux Compilation Time
> Optimization v.s. Legacy = 5min4s v.s. 5min12s
> --> no obvious difference
I'd like to see dgilbert's take on whether this kind of gain
justifies adding a PV interfaces, and what kind of guest workload
is appropriate.
Cc'd.
> ChangeLog:
> v35->v36:
> - remove the mm patch, as Linus has a suggestion to get free page
> addresses via allocation, instead of reading from the free page
> list.
> - virtio-balloon:
> - replace oom notifier with shrinker;
> - the guest to host communication interface remains the same as
> v32.
> - allocate free page blocks and send to host one by one, and free
> them after sending all the pages.
>
> For ChangeLogs from v22 to v35, please reference
> https://lwn.net/Articles/759413/
>
> For ChangeLogs before v21, please reference
> https://lwn.net/Articles/743660/
>
> Wei Wang (5):
> virtio-balloon: remove BUG() in init_vqs
> virtio_balloon: replace oom notifier with shrinker
> virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
> mm/page_poison: expose page_poisoning_enabled to kernel modules
> virtio-balloon: VIRTIO_BALLOON_F_PAGE_POISON
>
> drivers/virtio/virtio_balloon.c | 456 ++++++++++++++++++++++++++++++------
> include/uapi/linux/virtio_balloon.h | 7 +
> mm/page_poison.c | 6 +
> 3 files changed, 394 insertions(+), 75 deletions(-)
>
> --
> 2.7.4
^ permalink raw reply
* Re: [PATCH v36 2/5] virtio_balloon: replace oom notifier with shrinker
From: Michael S. Tsirkin @ 2018-07-23 14:13 UTC (permalink / raw)
To: Wei Wang
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
liliang.opensource, linux-kernel, mhocko, linux-mm, pbonzini,
akpm, virtualization, torvalds
In-Reply-To: <5B55AE56.5030404@intel.com>
On Mon, Jul 23, 2018 at 06:30:46PM +0800, Wei Wang wrote:
> On 07/22/2018 10:48 PM, Michael S. Tsirkin wrote:
> > On Fri, Jul 20, 2018 at 04:33:02PM +0800, Wei Wang wrote:
> > > +static unsigned long virtio_balloon_shrinker_scan(struct shrinker *shrinker,
> > > + struct shrink_control *sc)
> > > +{
> > > + unsigned long pages_to_free = balloon_pages_to_shrink,
> > > + pages_freed = 0;
> > > + struct virtio_balloon *vb = container_of(shrinker,
> > > + struct virtio_balloon, shrinker);
> > > +
> > > + /*
> > > + * One invocation of leak_balloon can deflate at most
> > > + * VIRTIO_BALLOON_ARRAY_PFNS_MAX balloon pages, so we call it
> > > + * multiple times to deflate pages till reaching
> > > + * balloon_pages_to_shrink pages.
> > > + */
> > > + while (vb->num_pages && pages_to_free) {
> > > + pages_to_free = balloon_pages_to_shrink - pages_freed;
> > > + pages_freed += leak_balloon(vb, pages_to_free);
> > > + }
> > > + update_balloon_size(vb);
> > Are you sure that this is never called if count returned 0?
>
> Yes. Please see do_shrink_slab, it just returns if count is 0.
>
> >
> > > +
> > > + return pages_freed / VIRTIO_BALLOON_PAGES_PER_PAGE;
> > > +}
> > > +
> > > +static unsigned long virtio_balloon_shrinker_count(struct shrinker *shrinker,
> > > + struct shrink_control *sc)
> > > +{
> > > + struct virtio_balloon *vb = container_of(shrinker,
> > > + struct virtio_balloon, shrinker);
> > > +
> > > + /*
> > > + * We continue to use VIRTIO_BALLOON_F_DEFLATE_ON_OOM to handle the
> > > + * case when shrinker needs to be invoked to relieve memory pressure.
> > > + */
> > > + if (!virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
> > > + return 0;
> > So why not skip notifier registration when deflate on oom
> > is clear?
>
> Sounds good, thanks.
>
>
> > vb->vb_dev_info.inode->i_mapping->a_ops = &balloon_aops;
> > #endif
> > + err = virtio_balloon_register_shrinker(vb);
> > + if (err)
> > + goto out_del_vqs;
> > So we can get scans before device is ready. Leak will fail
> > then. Why not register later after device is ready?
>
> Probably no.
>
> - it would be better not to set device ready when register_shrinker failed.
That's very rare so I won't be too worried.
> - When the device isn't ready, ballooning won't happen, that is,
> vb->num_pages will be 0, which results in shrinker_count=0 and shrinker_scan
> won't be called.
>
> So I think it would be better to have shrinker registered before
> device_ready.
>
> Best,
> Wei
^ permalink raw reply
* Re: [PATCH v36 0/5] Virtio-balloon: support free page reporting
From: Dr. David Alan Gilbert @ 2018-07-23 14:36 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: yang.zhang.wz, virtio-dev, riel, quan.xu0, kvm, nilal,
liliang.opensource, linux-kernel, virtualization, linux-mm,
pbonzini, akpm, mhocko, torvalds
In-Reply-To: <20180723122342-mutt-send-email-mst@kernel.org>
* Michael S. Tsirkin (mst@redhat.com) wrote:
> On Fri, Jul 20, 2018 at 04:33:00PM +0800, Wei Wang wrote:
> > This patch series is separated from the previous "Virtio-balloon
> > Enhancement" series. The new feature, VIRTIO_BALLOON_F_FREE_PAGE_HINT,
> > implemented by this series enables the virtio-balloon driver to report
> > hints of guest free pages to the host. It can be used to accelerate live
> > migration of VMs. Here is an introduction of this usage:
> >
> > Live migration needs to transfer the VM's memory from the source machine
> > to the destination round by round. For the 1st round, all the VM's memory
> > is transferred. From the 2nd round, only the pieces of memory that were
> > written by the guest (after the 1st round) are transferred. One method
> > that is popularly used by the hypervisor to track which part of memory is
> > written is to write-protect all the guest memory.
> >
> > This feature enables the optimization by skipping the transfer of guest
> > free pages during VM live migration. It is not concerned that the memory
> > pages are used after they are given to the hypervisor as a hint of the
> > free pages, because they will be tracked by the hypervisor and transferred
> > in the subsequent round if they are used and written.
> >
> > * Tests
> > - Test Environment
> > Host: Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz
> > Guest: 8G RAM, 4 vCPU
> > Migration setup: migrate_set_speed 100G, migrate_set_downtime 2 second
> >
> > - Test Results
> > - Idle Guest Live Migration Time (results are averaged over 10 runs):
> > - Optimization v.s. Legacy = 409ms vs 1757ms --> ~77% reduction
> > (setting page poisoning zero and enabling ksm don't affect the
> > comparison result)
> > - Guest with Linux Compilation Workload (make bzImage -j4):
> > - Live Migration Time (average)
> > Optimization v.s. Legacy = 1407ms v.s. 2528ms --> ~44% reduction
> > - Linux Compilation Time
> > Optimization v.s. Legacy = 5min4s v.s. 5min12s
> > --> no obvious difference
>
> I'd like to see dgilbert's take on whether this kind of gain
> justifies adding a PV interfaces, and what kind of guest workload
> is appropriate.
>
> Cc'd.
Well, 44% is great ... although the measurement is a bit weird.
a) A 2 second downtime is very large; 300-500ms is more normal
b) I'm not sure what the 'average' is - is that just between a bunch of
repeated migrations?
c) What load was running in the guest during the live migration?
An interesting measurement to add would be to do the same test but
with a VM with a lot more RAM but the same load; you'd hope the gain
would be even better.
It would be interesting, especially because the users who are interested
are people creating VMs allocated with lots of extra memory (for the
worst case) but most of the time migrating when it's fairly idle.
Dave
>
>
> > ChangeLog:
> > v35->v36:
> > - remove the mm patch, as Linus has a suggestion to get free page
> > addresses via allocation, instead of reading from the free page
> > list.
> > - virtio-balloon:
> > - replace oom notifier with shrinker;
> > - the guest to host communication interface remains the same as
> > v32.
> > - allocate free page blocks and send to host one by one, and free
> > them after sending all the pages.
> >
> > For ChangeLogs from v22 to v35, please reference
> > https://lwn.net/Articles/759413/
> >
> > For ChangeLogs before v21, please reference
> > https://lwn.net/Articles/743660/
> >
> > Wei Wang (5):
> > virtio-balloon: remove BUG() in init_vqs
> > virtio_balloon: replace oom notifier with shrinker
> > virtio-balloon: VIRTIO_BALLOON_F_FREE_PAGE_HINT
> > mm/page_poison: expose page_poisoning_enabled to kernel modules
> > virtio-balloon: VIRTIO_BALLOON_F_PAGE_POISON
> >
> > drivers/virtio/virtio_balloon.c | 456 ++++++++++++++++++++++++++++++------
> > include/uapi/linux/virtio_balloon.h | 7 +
> > mm/page_poison.c | 6 +
> > 3 files changed, 394 insertions(+), 75 deletions(-)
> >
> > --
> > 2.7.4
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox