From: Li Zhijian
Date: Tue, 1 Mar 2016 12:01:49 +0800
Message-ID: <56D5142D.6060604@cn.fujitsu.com>
In-Reply-To: <56D43C6A.4020207@huawei.com>
References: <1456741170-21818-1-git-send-email-dgilbert@redhat.com> <56D43C6A.4020207@huawei.com>
Subject: Re: [Qemu-devel] [WIP:COLO: 1] Flush colo ram in parallel
To: Hailiang Zhang, "Dr. David Alan Gilbert (git)", qemu-devel@nongnu.org
Cc: peter.huangpeng@huawei.com

Great work!! I like this patch very much.

On 02/29/2016 08:41 PM, Hailiang Zhang wrote:
> On 2016/2/29 18:19, Dr. David Alan Gilbert (git) wrote:
>> From: "Dr. David Alan Gilbert"
>>
>> Flush the colo ram cache in parallel; use the same number
>> of threads as CPU cores.
>>
>> On a VM with 4 cores, and 4GB RAM, I've seen a reduction from
>> ~20ms to ~16ms using this, which is helpful but not as much
>> as I hoped; I guess one problem might be that all the changes
>> could be concentrated in one area of RAM? Perhaps another approach
>
> Agreed. Dirty pages at contiguous addresses are common in real
> workloads. The idea of dividing the ramblock by address and flushing
> the pieces in parallel could also be used in colo_init_ram_cache(),
> where we need to back up the SVM's RAM into the cache.

Agreed, the backup job is currently very expensive, so that would be a
good optimization.

Best regards,
Li Zhijian

>
>> would be to have one thread searching the bitmap and other threads
>> doing the copy with some type of work queue.
>>
>
> I guess that, in most cases, finding a dirty page in the bitmap is
> much faster than copying one page, so I think this method could be
> more effective and more reasonable, but let's implement and test it
> first ;)
>
> Thanks,
> Hailiang
>
>> [Note: Not for merging at the moment, just for discussion with the COLO guys]
>>
>> Signed-off-by: Dr. David Alan Gilbert
>> ---
>> migration/ram.c | 215 +++++++++++++++++++++++++++++++++++++++++++++-----------
>> 1 file changed, 174 insertions(+), 41 deletions(-)
>>
>> diff --git a/migration/ram.c b/migration/ram.c
>> index 188c3a1..6458863 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -42,6 +42,7 @@
>> #include "qemu/rcu_queue.h"
>> #include "migration/colo.h"
>> #include "crypto/hash.h"
>> +#include "sysemu/sysemu.h"
>>
>> #ifdef DEBUG_MIGRATION_RAM
>> #define DPRINTF(fmt, ...)
\ >> @@ -616,18 +617,18 @@ static inline bool migration_bitmap_clear_dirty(ram_addr_t addr) >> ret = test_and_clear_bit(nr, bitmap); >> >> if (ret) { >> - migration_dirty_pages--; >> + atomic_dec(&migration_dirty_pages); >> } >> return ret; >> } >> >> static inline >> ram_addr_t ramlist_bitmap_find_and_reset_dirty(RAMBlock *rb, >> - ram_addr_t start) >> + ram_addr_t start, >> + uint64_t rb_size) >> { >> unsigned long base = rb->offset >> TARGET_PAGE_BITS; >> unsigned long nr = base + (start >> TARGET_PAGE_BITS); >> - uint64_t rb_size = rb->used_length; >> unsigned long size = base + (rb_size >> TARGET_PAGE_BITS); >> unsigned long next; >> >> @@ -2721,6 +2722,141 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) >> return ret; >> } >> >> +/* For parallel flushing of colo ram cache; it has 'n' threads, >> + * trying 'n'==number of cpus, but might be smarter to do something NUMA >> + * Each thread waits for 'triggers' and signals 'completes'. >> + */ >> +typedef struct ram_cache_threads { >> + QemuThread *threads; >> + QemuEvent *triggers; >> + QemuEvent *completes; >> + bool quit; >> +} ColoRamCacheThreads; >> + >> +static ColoRamCacheThreads colo_flush_threads; >> + >> +/* >> + * Helper for colo_flush_thread. >> + * Given the current block and the bounds of ram (in pages) >> + * that we're dealing with, then find our next useable block, >> + * and set offset_out/used_out to the limits we're interested in. >> + * Result is NULL when we run out of blocks. >> + */ >> +static RAMBlock *flush_find_next_block(RAMBlock *block, >> + ram_addr_t lower_bound, >> + ram_addr_t upper_bound, >> + ram_addr_t *offset_out, >> + ram_addr_t *used_out) >> +{ >> + do { >> + if (!block) { >> + block = QLIST_FIRST_RCU(&ram_list.blocks); >> + } else { >> + block = QLIST_NEXT_RCU(block, next); >> + } >> + if (block && >> + (block->offset + block->used_length) >= lower_bound && >> + (block->offset < upper_bound)) { >> + /* OK, good, the block is at least partially within our bounds */ >> + if (block->offset <= lower_bound) { >> + *offset_out = lower_bound - block->offset; >> + } else { >> + *offset_out = 0; >> + } >> + if ((block->offset + block->used_length) >= upper_bound) { >> + *used_out = upper_bound - block->offset; >> + } else { >> + *used_out = block->used_length; >> + } >> + break; >> + } >> + } while (block); >> + >> + return block; >> +} >> + >> +/* Flush thread for COLO ram cache, synchronises a proportion of the >> + * ram cache. 
>> + */ >> +static void *colo_flush_thread(void *opaque) >> +{ >> + int i = (int)(intptr_t)opaque; >> + int64_t ram_cache_pages = last_ram_offset() >> TARGET_PAGE_BITS; >> + ram_addr_t lower, upper; >> + /* work out our range, lower..upper-1 >> + * want to avoid SMP issues on the dirty bitmap, so make sure >> + * our limits never land on the same 64bit word >> + */ >> + ram_addr_t chunk_size = (ram_cache_pages / smp_cpus) & ~63ul; >> + >> + lower = i * chunk_size; >> + >> + if (i != (smp_cpus - 1)) { >> + upper = (i + 1) * chunk_size; >> + } else { >> + /* Last thread gets deals with extra few pages due to rounding */ >> + upper = ram_cache_pages; >> + } >> + lower <<= TARGET_PAGE_BITS; >> + upper <<= TARGET_PAGE_BITS; >> + >> + while (true) { >> + RAMBlock *block = NULL; >> + void *dst_host; >> + void *src_host; >> + ram_addr_t offset = ~0, host_off = 0, cache_off = 0, used_length = 0; >> + uint64_t host_dirty = 0, both_dirty = 0; >> + >> + /* Wait for work */ >> + qemu_event_wait(&colo_flush_threads.triggers[i]); >> + qemu_event_reset(&colo_flush_threads.triggers[i]); >> + if (colo_flush_threads.quit) { >> + break; >> + } >> + >> + rcu_read_lock(); >> + /* note offset is initialised to ~0 so 1st time through we drop >> + * through to finding a block >> + */ >> + do { >> + ram_addr_t ram_addr_abs; >> + if (cache_off == offset) { /* walk ramblock->colo_cache */ >> + cache_off = migration_bitmap_find_dirty(block, >> + offset, &ram_addr_abs); >> + if (cache_off < used_length) { >> + migration_bitmap_clear_dirty(ram_addr_abs); >> + } >> + } >> + if (host_off == offset) { /* walk ramblock->host */ >> + host_off = ramlist_bitmap_find_and_reset_dirty(block, offset, >> + used_length); >> + } >> + if (!block || (host_off >= used_length && >> + cache_off >= used_length)) { >> + block = flush_find_next_block(block, lower, upper, >> + &offset, &used_length); >> + cache_off = host_off = offset; >> + } else { >> + if (host_off <= cache_off) { >> + offset = host_off; >> + host_dirty++; >> + both_dirty += (host_off == cache_off); >> + } else { >> + offset = cache_off; >> + } >> + dst_host = block->host + offset; >> + src_host = block->colo_cache + offset; >> + memcpy(dst_host, src_host, TARGET_PAGE_SIZE); >> + } >> + } while (block); >> + rcu_read_unlock(); >> + trace_colo_flush_ram_cache_end(host_dirty, both_dirty); >> + >> + qemu_event_set(&colo_flush_threads.completes[i]); >> + } >> + >> + return NULL; >> +} >> /* >> * colo cache: this is for secondary VM, we cache the whole >> * memory of the secondary VM, it will be called after first migration. 
>> @@ -2729,6 +2865,7 @@ int colo_init_ram_cache(void) >> { >> RAMBlock *block; >> int64_t ram_cache_pages = last_ram_offset() >> TARGET_PAGE_BITS; >> + int i; >> >> rcu_read_lock(); >> QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { >> @@ -2753,6 +2890,20 @@ int colo_init_ram_cache(void) >> migration_dirty_pages = 0; >> memory_global_dirty_log_start(); >> >> + colo_flush_threads.threads = g_new0(QemuThread, smp_cpus); >> + colo_flush_threads.triggers = g_new0(QemuEvent, smp_cpus); >> + colo_flush_threads.completes = g_new0(QemuEvent, smp_cpus); >> + colo_flush_threads.quit = false; >> + for (i = 0; i < smp_cpus; i++) { >> + char name[32]; >> + qemu_event_init(&colo_flush_threads.triggers[i], false); >> + qemu_event_init(&colo_flush_threads.completes[i], false); >> + sprintf(name, "colofl: %d", i); >> + qemu_thread_create(&colo_flush_threads.threads[i], name, >> + colo_flush_thread, >> + (void *)(intptr_t)i, >> + QEMU_THREAD_JOINABLE); >> + } >> return 0; >> >> out_locked: >> @@ -2771,9 +2922,19 @@ void colo_release_ram_cache(void) >> { >> RAMBlock *block; >> struct BitmapRcu *bitmap = migration_bitmap_rcu; >> + int i; >> >> ram_cache_enable = false; >> >> + colo_flush_threads.quit = true; >> + for (i = 0; i < smp_cpus; i++) { >> + qemu_event_set(&colo_flush_threads.triggers[i]); >> + qemu_thread_join(&colo_flush_threads.threads[i]); >> + } >> + g_free(colo_flush_threads.threads); >> + g_free(colo_flush_threads.triggers); >> + g_free(colo_flush_threads.completes); >> + >> atomic_rcu_set(&migration_bitmap_rcu, NULL); >> if (bitmap) { >> call_rcu(bitmap, migration_bitmap_free, rcu); >> @@ -2795,47 +2956,19 @@ void colo_release_ram_cache(void) >> */ >> void colo_flush_ram_cache(void) >> { >> - RAMBlock *block = NULL; >> - void *dst_host; >> - void *src_host; >> - ram_addr_t offset = 0, host_off = 0, cache_off = 0; >> - uint64_t host_dirty = 0, both_dirty = 0; >> - >> + int i; >> trace_colo_flush_ram_cache_begin(migration_dirty_pages); >> address_space_sync_dirty_bitmap(&address_space_memory); >> - rcu_read_lock(); >> - block = QLIST_FIRST_RCU(&ram_list.blocks); >> - while (block) { >> - ram_addr_t ram_addr_abs; >> - if (cache_off == offset) { /* walk ramblock->colo_cache */ >> - cache_off = migration_bitmap_find_dirty(block, >> - offset, &ram_addr_abs); >> - if (cache_off < block->used_length) { >> - migration_bitmap_clear_dirty(ram_addr_abs); >> - } >> - } >> - if (host_off == offset) { /* walk ramblock->host */ >> - host_off = ramlist_bitmap_find_and_reset_dirty(block, offset); >> - } >> - if (host_off >= block->used_length && >> - cache_off >= block->used_length) { >> - cache_off = host_off = offset = 0; >> - block = QLIST_NEXT_RCU(block, next); >> - } else { >> - if (host_off <= cache_off) { >> - offset = host_off; >> - host_dirty++; >> - both_dirty += (host_off == cache_off); >> - } else { >> - offset = cache_off; >> - } >> - dst_host = block->host + offset; >> - src_host = block->colo_cache + offset; >> - memcpy(dst_host, src_host, TARGET_PAGE_SIZE); >> - } >> + >> + /* Kick all the flush threads to start work */ >> + for (i = 0; i < smp_cpus; i++) { >> + qemu_event_reset(&colo_flush_threads.completes[i]); >> + qemu_event_set(&colo_flush_threads.triggers[i]); >> + } >> + /* ...and wait for them to finish */ >> + for (i = 0; i < smp_cpus; i++) { >> + qemu_event_wait(&colo_flush_threads.completes[i]); >> } >> - rcu_read_unlock(); >> - trace_colo_flush_ram_cache_end(host_dirty, both_dirty); >> assert(migration_dirty_pages == 0); >> } >> >> > > > > . >
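
For anyone who wants to prototype the work-queue variant discussed above
(one thread scanning the dirty bitmap while several copier threads do the
memcpy), here is a rough, stand-alone sketch. It is illustrative only: the
page size, the toy RAM buffers, the dirty bitmap layout and the helper
names (queue_push(), queue_pop(), copier_thread(),
flush_cache_with_workqueue()) are all invented for the example, and it
uses plain pthreads rather than QEMU's QemuThread/QemuEvent primitives.

/* Stand-alone sketch: one scanner thread walks a dirty bitmap and hands
 * page indexes to copier threads over a bounded queue.  Plain pthreads,
 * toy buffers and invented names; not the QEMU implementation.
 */
#include <pthread.h>
#include <stdint.h>
#include <string.h>

#define PAGE_SIZE     4096
#define NR_PAGES      1024                      /* pages in the toy "RAM" */
#define NR_COPIERS    4
#define QUEUE_LEN     256
#define QUEUE_END     (-1L)                     /* sentinel: no more work */
#define BITS_PER_WORD (8 * sizeof(unsigned long))

static uint8_t host_ram[NR_PAGES * PAGE_SIZE];  /* stands in for block->host */
static uint8_t cache_ram[NR_PAGES * PAGE_SIZE]; /* stands in for colo_cache */
static unsigned long dirty_bitmap[NR_PAGES / BITS_PER_WORD];

/* Bounded queue of dirty page indexes, shared by scanner and copiers. */
static struct {
    long items[QUEUE_LEN];
    int head, tail, count;
    pthread_mutex_t lock;
    pthread_cond_t not_empty, not_full;
} queue = {
    .lock = PTHREAD_MUTEX_INITIALIZER,
    .not_empty = PTHREAD_COND_INITIALIZER,
    .not_full = PTHREAD_COND_INITIALIZER,
};

static void queue_push(long page)
{
    pthread_mutex_lock(&queue.lock);
    while (queue.count == QUEUE_LEN) {
        pthread_cond_wait(&queue.not_full, &queue.lock);
    }
    queue.items[queue.tail] = page;
    queue.tail = (queue.tail + 1) % QUEUE_LEN;
    queue.count++;
    pthread_cond_signal(&queue.not_empty);
    pthread_mutex_unlock(&queue.lock);
}

static long queue_pop(void)
{
    pthread_mutex_lock(&queue.lock);
    while (queue.count == 0) {
        pthread_cond_wait(&queue.not_empty, &queue.lock);
    }
    long page = queue.items[queue.head];
    queue.head = (queue.head + 1) % QUEUE_LEN;
    queue.count--;
    pthread_cond_signal(&queue.not_full);
    pthread_mutex_unlock(&queue.lock);
    return page;
}

/* Copier: keep copying pages until the scanner sends the sentinel. */
static void *copier_thread(void *opaque)
{
    for (;;) {
        long page = queue_pop();
        if (page == QUEUE_END) {
            break;
        }
        memcpy(host_ram + page * PAGE_SIZE,
               cache_ram + page * PAGE_SIZE, PAGE_SIZE);
    }
    return NULL;
}

/* Scanner: clear dirty bits and feed the page indexes to the copiers. */
static void flush_cache_with_workqueue(void)
{
    pthread_t copiers[NR_COPIERS];
    long page;
    int i;

    /* A real version would keep the threads around and wake them per flush. */
    for (i = 0; i < NR_COPIERS; i++) {
        pthread_create(&copiers[i], NULL, copier_thread, NULL);
    }
    for (page = 0; page < NR_PAGES; page++) {
        unsigned long mask = 1UL << (page % BITS_PER_WORD);
        if (dirty_bitmap[page / BITS_PER_WORD] & mask) {
            dirty_bitmap[page / BITS_PER_WORD] &= ~mask;
            queue_push(page);
        }
    }
    for (i = 0; i < NR_COPIERS; i++) {
        queue_push(QUEUE_END);                  /* one sentinel per copier */
    }
    for (i = 0; i < NR_COPIERS; i++) {
        pthread_join(copiers[i], NULL);
    }
}

The bounded queue gives natural back-pressure if the scanner outruns the
copiers, and because only the scanner touches the bitmap, no atomics are
needed on it in this sketch. A real implementation inside migration/ram.c
would instead build on migration_bitmap_find_dirty() and reuse the
trigger/complete events already introduced by the patch.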