From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:37151) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1Ujsse-00077j-04 for qemu-devel@nongnu.org; Tue, 04 Jun 2013 11:10:50 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1UjssV-0004Pd-VV for qemu-devel@nongnu.org; Tue, 04 Jun 2013 11:10:35 -0400 Received: from mx.ipv6.kamp.de ([2a02:248:0:51::16]:50300 helo=mx01.kamp.de) by eggs.gnu.org with smtp (Exim 4.71) (envelope-from ) id 1UjssV-0004PL-HQ for qemu-devel@nongnu.org; Tue, 04 Jun 2013 11:10:27 -0400 Message-ID: <51AE035A.5070301@kamp.de> Date: Tue, 04 Jun 2013 17:10:18 +0200 From: Peter Lieven MIME-Version: 1.0 References: <51A7036A.3050407@ozlabs.ru> <51A7049F.6040207@redhat.com> <51A70B3D.90609@ozlabs.ru> <51A71705.6060009@kamp.de> <51A74D79.7040204@redhat.com> <2765FDFA-8050-4AA3-8621-7E9EA2C89F9C@kamp.de> <51A764FC.7080705@redhat.com> <51ADF122.70307@kamp.de> <51ADF637.7060804@redhat.com> <51ADFBCE.3080200@kamp.de> <51ADFC7A.7030009@redhat.com> In-Reply-To: <51ADFC7A.7030009@redhat.com> Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] broken incoming migration List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Paolo Bonzini Cc: Alexey Kardashevskiy , "qemu-ppc@nongnu.org" , "qemu-devel@nongnu.org" , David Gibson On 04.06.2013 16:40, Paolo Bonzini wrote: > Il 04/06/2013 16:38, Peter Lieven ha scritto: >> On 04.06.2013 16:14, Paolo Bonzini wrote: >>> Il 04/06/2013 15:52, Peter Lieven ha scritto: >>>> On 30.05.2013 16:41, Paolo Bonzini wrote: >>>>> Il 30/05/2013 16:38, Peter Lieven ha scritto: >>>>>>>> You could also scan the page for nonzero values before writing it. >>>>>> i had this in mind, but then choosed the other approach.... turned >>>>>> out to be a bad idea. >>>>>> >>>>>> alexey: i will prepare a patch later today, could you then please >>>>>> verify it fixes your problem. >>>>>> >>>>>> paolo: would we still need the madvise or is it enough to not write >>>>>> the zeroes? >>>>> It should be enough to not write them. >>>> Problem: checking the pages for zero allocates them. even at the source. >>> It doesn't look like. I tried this program and top doesn't show an >>> increasing amount of reserved memory: >>> >>> #include >>> #include >>> int main() >>> { >>> char *x = malloc(500 << 20); >>> int i, j; >>> for (i = 0; i < 500; i += 10) { >>> for (j = 0; j < 10 << 20; j += 4096) { >>> *(volatile char*) (x + (i << 20) + j); >>> } >>> getchar(); >>> } >>> } >> strange. we are talking about RSS size, right? > None of the three top values change, and only VIRT is >500 MB. > >> is the malloc above using mmapped memory? > Yes. > >> which kernel version do you use? > 3.9. > >> what avoids allocating the memory for me is the following (with >> whatever side effects it has ;-)) > This would also fail to migrate any page that is swapped out, breaking > overcommit in a more subtle way. :) > > Paolo the following does also not allocate memory, but qemu does... #include #include #include #include #include #include #include #include #include #if defined __SSE2__ #include #define VECTYPE __m128i #define SPLAT(p) _mm_set1_epi8(*(p)) #define ALL_EQ(v1, v2) (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 0xFFFF) #else #define VECTYPE unsigned long #define SPLAT(p) (*(p) * (~0UL / 255)) #define ALL_EQ(v1, v2) ((v1) == (v2)) #endif #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 /* Round number down to multiple */ #define QEMU_ALIGN_DOWN(n, m) ((n) / (m) * (m)) /* Round number up to multiple */ #define QEMU_ALIGN_UP(n, m) QEMU_ALIGN_DOWN((n) + (m) - 1, (m)) #define QEMU_VMALLOC_ALIGN (256 * 4096) /* alloc shared memory pages */ void *qemu_anon_ram_alloc(size_t size) { size_t align = QEMU_VMALLOC_ALIGN; size_t total = size + align - getpagesize(); void *ptr = mmap(0, total, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); size_t offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr; if (ptr == MAP_FAILED) { fprintf(stderr, "Failed to allocate %zu B: %s\n", size, strerror(errno)); abort(); } ptr += offset; total -= offset; if (offset > 0) { munmap(ptr - offset, offset); } if (total > size) { munmap(ptr + size, total - size); } return ptr; } static inline int can_use_buffer_find_nonzero_offset(const void *buf, size_t len) { return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR * sizeof(VECTYPE)) == 0 && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); } size_t buffer_find_nonzero_offset(const void *buf, size_t len) { const VECTYPE *p = buf; const VECTYPE zero = (VECTYPE){0}; size_t i; if (!len) { return 0; } assert(can_use_buffer_find_nonzero_offset(buf, len)); for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) { if (!ALL_EQ(p[i], zero)) { return i * sizeof(VECTYPE); } } for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i < len / sizeof(VECTYPE); i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) { VECTYPE tmp0 = p[i + 0] | p[i + 1]; VECTYPE tmp1 = p[i + 2] | p[i + 3]; VECTYPE tmp2 = p[i + 4] | p[i + 5]; VECTYPE tmp3 = p[i + 6] | p[i + 7]; VECTYPE tmp01 = tmp0 | tmp1; VECTYPE tmp23 = tmp2 | tmp3; if (!ALL_EQ(tmp01 | tmp23, zero)) { break; } } return i * sizeof(VECTYPE); } int main() { //char *x = malloc(1024 << 20); char *x = qemu_anon_ram_alloc(1024 << 20); int i, j; int ret = 0; struct rusage rusage; for (i = 0; i < 500; i ++) { for (j = 0; j < 10 << 20; j += 4096) { ret += buffer_find_nonzero_offset((char*) (x + (i << 20) + j), 4096); } getrusage( RUSAGE_SELF, &rusage ); printf("read offset: %d kB, RSS size: %ld kB", ((i+1) << 10), rusage.ru_maxrss); getchar(); } printf("%d zero pages\n", ret); }