From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from [140.186.70.92] (port=32780 helo=eggs.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1PKfQA-0001zh-Pz for qemu-devel@nongnu.org; Mon, 22 Nov 2010 18:03:44 -0500 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1PKfQ3-0001fm-MP for qemu-devel@nongnu.org; Mon, 22 Nov 2010 18:03:38 -0500 Received: from mail-qw0-f45.google.com ([209.85.216.45]:33158) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1PKfQ3-0001fb-JC for qemu-devel@nongnu.org; Mon, 22 Nov 2010 18:03:31 -0500 Received: by qwb8 with SMTP id 8so2146556qwb.4 for ; Mon, 22 Nov 2010 15:03:31 -0800 (PST) Message-ID: <4CEAF6BB.3080808@codemonkey.ws> Date: Mon, 22 Nov 2010 17:03:23 -0600 From: Anthony Liguori MIME-Version: 1.0 References: <1290466818-5230-1-git-send-email-aliguori@us.ibm.com> In-Reply-To: <1290466818-5230-1-git-send-email-aliguori@us.ibm.com> Content-Type: multipart/mixed; boundary="------------050701030309050806020401" Subject: [Qemu-devel] Re: [PATCH] qemu-kvm: introduce cpu_start/cpu_stop commands List-Id: qemu-devel.nongnu.org List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Anthony Liguori Cc: Chris Wright , qemu-devel@nongnu.org, kvm@vger.kernel.org This is a multi-part message in MIME format. --------------050701030309050806020401 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit On 11/22/2010 05:00 PM, Anthony Liguori wrote: > qemu-kvm vcpu threads don't respond to SIGSTOP/SIGCONT. Instead of teaching > them to respond to these signals, introduce monitor commands that stop and start > individual vcpus. > > The purpose of these commands is to implement CPU hard limits using an external > tool that watches the CPU consumption and stops the CPU as appropriate. > > The monitor commands provide a more elegant solution than signals because it > ensures that a stopped vcpu isn't holding the qemu_mutex. 
> > I'll reply to this note with an example tool. > This is super rough but demonstrates the concept. If you run it with '0 50 100' it will cap VCPU 0 at 50%. It's not the prettiest thing in the world but it's minimally invasive and seems to work well. Regards, Anthony Liguori > Signed-off-by: Anthony Liguori > > diff --git a/hmp-commands.hx b/hmp-commands.hx > index ba6de28..827bd67 100644 > --- a/hmp-commands.hx > +++ b/hmp-commands.hx > @@ -279,6 +279,24 @@ Resume emulation. > ETEXI > > { > + .name = "cpu_start", > + .args_type = "cpu:i", > + .params = "[cpu]", > + .help = "start cpu emulation", > + .user_print = monitor_user_noop, > + .mhandler.cmd_new = do_vcpu_start, > + }, > + > + { > + .name = "cpu_stop", > + .args_type = "cpu:i", > + .params = "[cpu]", > + .help = "stop cpu emulation", > + .user_print = monitor_user_noop, > + .mhandler.cmd_new = do_vcpu_stop, > + }, > + > + { > .name = "gdbserver", > .args_type = "device:s?", > .params = "[device]", > diff --git a/qemu-kvm.c b/qemu-kvm.c > index 471306b..35121ed 100644 > --- a/qemu-kvm.c > +++ b/qemu-kvm.c > @@ -1351,6 +1351,65 @@ static void pause_all_threads(void) > } > } > > +static void vcpu_stop(int cpu) > +{ > + CPUState *env = first_cpu; > + > + for (env = first_cpu; env; env = env->next_cpu) { > + if (env->cpu_index == cpu) { > + break; > + } > + } > + > + if (env) { > + if (env != cpu_single_env) { > + env->stop = 1; > + pthread_kill(env->kvm_cpu_state.thread, SIG_IPI); > + } else { > + env->stop = 0; > + env->stopped = 1; > + cpu_exit(env); > + } > + > + while (!env->stopped) { > + qemu_cond_wait(&qemu_pause_cond); > + } > + } > +} > + > +static void vcpu_start(int cpu) > +{ > + CPUState *env = first_cpu; > + > + assert(!cpu_single_env); > + > + for (env = first_cpu; env; env = env->next_cpu) { > + if (env->cpu_index == cpu) { > + break; > + } > + } > + > + if (env) { > + env->stop = 0; > + env->stopped = 0; > + pthread_kill(env->kvm_cpu_state.thread, SIG_IPI); > + } > +} > + > +int 
do_vcpu_stop(Monitor *mon, const QDict *qdict, QObject **ret_data) > +{ > + int vcpu = qdict_get_int(qdict, "cpu"); > + vcpu_stop(vcpu); > + return 0; > +} > + > +int do_vcpu_start(Monitor *mon, const QDict *qdict, QObject **ret_data) > +{ > + int vcpu = qdict_get_int(qdict, "cpu"); > + vcpu_start(vcpu); > + return 0; > +} > + > static void resume_all_threads(void) > { > CPUState *penv = first_cpu; > diff --git a/sysemu.h b/sysemu.h > index 849dc8c..3ef68dd 100644 > --- a/sysemu.h > +++ b/sysemu.h > @@ -61,6 +61,9 @@ void qemu_system_reset(void); > void qemu_add_exit_notifier(Notifier *notify); > void qemu_remove_exit_notifier(Notifier *notify); > > +int do_vcpu_stop(Monitor *mon, const QDict *qdict, QObject **ret_data); > +int do_vcpu_start(Monitor *mon, const QDict *qdict, QObject **ret_data); > + > void do_savevm(Monitor *mon, const QDict *qdict); > int load_vmstate(const char *name); > void do_delvm(Monitor *mon, const QDict *qdict); > --------------050701030309050806020401 Content-Type: text/x-csrc; name="main.c" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="main.c" #define _XOPEN_SOURCE 500 #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #define USEC_PER_SEC 1000000ULL static long get_cguest_time(const char *buffer) { const char *ptr; int space_count; for (ptr = buffer; *ptr && space_count != 42; ptr++) { if (*ptr == ' ') { space_count++; } } return strtol(ptr, NULL, 10); } static void tv_add(struct timeval *tv, suseconds_t usec) { tv->tv_usec += usec; while (tv->tv_usec > USEC_PER_SEC) { tv->tv_sec += 1; tv->tv_usec -= USEC_PER_SEC; } } static int tv_cmp(struct timeval *lhs, struct timeval *rhs) { if (lhs->tv_sec == rhs->tv_sec) { if (lhs->tv_usec < rhs->tv_usec) { return -1; } else if (lhs->tv_usec > rhs->tv_usec) { return 1; } return 0; } else if (lhs->tv_sec < rhs->tv_sec) { return -1; } else if (lhs->tv_sec > rhs->tv_sec) { return 1; } return 0; } 
static void write_all(int fd, const void *buffer, size_t buffer_len) { size_t offset = 0; while (offset < buffer_len) { ssize_t len; len = write(fd, buffer + offset, buffer_len - offset); if (len > 0) { offset += len; } } } static void read_reply(int fd, char *buffer, size_t buffer_len) { size_t offset = 0; while (offset < buffer_len) { ssize_t len; len = read(fd, buffer + offset, buffer_len - offset); if (len > 0) { offset += len; } if (offset > 8 && memcmp("\n(qemu) ", buffer + (offset - 8), 8) == 0) { char *ptr; buffer[offset - 8] = 0; ptr = strchr(buffer, '\n'); if (ptr == NULL) { buffer[0] = 0; } else { memmove(buffer, ptr + 1, offset - (ptr - buffer) - 1); } return; } } } static int monitor_fd; static void monitor_command(const char *fmt, ...) { char buffer[256]; va_list ap; size_t len; va_start(ap, fmt); len = vsnprintf(buffer, sizeof(buffer), fmt, ap); va_end(ap); write_all(monitor_fd, buffer, len); write_all(monitor_fd, "\n", 1); read_reply(monitor_fd, buffer, sizeof(buffer)); } static void monitor_command_response(char *rsp, size_t rsp_len, const char *fmt, ...) 
{ char buffer[256]; va_list ap; size_t len; va_start(ap, fmt); len = vsnprintf(buffer, sizeof(buffer), fmt, ap); va_end(ap); write_all(monitor_fd, buffer, len); write_all(monitor_fd, "\n", 1); read_reply(monitor_fd, rsp, rsp_len); } static int vm_running = 1; static void guest_start(int vcpu) { if (!vm_running) { monitor_command("cpu_start %d", vcpu); } vm_running = 1; } static void guest_stop(int vcpu) { if (vm_running) { monitor_command("cpu_stop %d", vcpu); } vm_running = 0; } static int find_pid(char *buffer, int vcpu) { char *ptr = buffer; int i; for (i = 0; ptr && i < vcpu; i++) { ptr = strchr(ptr, '\n'); if (ptr) { ptr++; } } if (ptr) { ptr = strstr(ptr, "thread_id="); if (ptr) { ptr += 10; return atoi(ptr); } } return 0; } int main(int argc, char **argv) { int fd, pid, vcpu; char buffer[1024]; long ticks_per_sec; long cguest_time_last = 0; struct timeval period_end; long cguest_ticks; long entitlement; long period; struct sockaddr_un addr; if (argc != 4) { fprintf(stderr, "Missing arguments\n"); return 1; } vcpu = atoi(argv[1]); /* FIXME hack, does guest time get scaled with vcpu count? 
*/ entitlement = atoi(argv[2]) * 2; period = atoi(argv[3]); monitor_fd = socket(PF_UNIX, SOCK_STREAM, 0); addr.sun_family = AF_UNIX; snprintf(addr.sun_path, 108, "/tmp/monitor.sock"); if (connect(monitor_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { return 1; } read_reply(monitor_fd, buffer, sizeof(buffer)); monitor_command_response(buffer, sizeof(buffer), "info cpus"); pid = find_pid(buffer, vcpu); ticks_per_sec = sysconf(_SC_CLK_TCK); entitlement = (entitlement * ticks_per_sec) / 1000; period *= 1000; snprintf(buffer, sizeof(buffer), "/proc/%d/stat", pid); fd = open(buffer, O_RDONLY); gettimeofday(&period_end, NULL); tv_add(&period_end, period); cguest_ticks = 0; while (1) { long cguest_time_now; struct timeval tv_now; ssize_t len; gettimeofday(&tv_now, NULL); len = pread(fd, buffer, sizeof(buffer) - 1, 0); buffer[len] = 0; cguest_time_now = get_cguest_time(buffer); if (cguest_time_last) { cguest_ticks += cguest_time_now - cguest_time_last; if (tv_cmp(&tv_now, &period_end) < 0) { if (cguest_ticks >= entitlement) { guest_stop(vcpu); cguest_ticks = 0; } } else { guest_start(vcpu); cguest_ticks = 0; tv_add(&tv_now, period); period_end = tv_now; } } cguest_time_last = cguest_time_now; usleep(10000); // 10ms } close(fd); return 0; } --------------050701030309050806020401--