Linux virtualization list

Linux virtualization list
 help / color / mirror / Atom feed

* [PATCH 04/13] Tools: hv: Prepare to expand  kvp_get_ip_address() functionality
From: K. Y. Srinivasan @ 2012-06-21 21:31 UTC (permalink / raw)
  To: gregkh, linux-kernel, devel, virtualization, ohering, apw
  Cc: K. Y. Srinivasan
In-Reply-To: <1340314305-27126-1-git-send-email-kys@microsoft.com>

kvp_get_ip_address() implements the functionality to retrieve IP address info.
Make this function more generic so that we can retrieve additional
per-interface information.


Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 tools/hv/hv_kvp_daemon.c |  129 ++++++++++++++++++++++++++++++----------------
 1 files changed, 84 insertions(+), 45 deletions(-)

diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c
index 4831997..4fa6f95 100644
--- a/tools/hv/hv_kvp_daemon.c
+++ b/tools/hv/hv_kvp_daemon.c
@@ -491,7 +491,8 @@ done:
 }
 
 static int
-kvp_get_ip_address(int family, char *buffer, int length)
+kvp_get_ip_address(int family, char *if_name, int op,
+		 void  *out_buffer, int length)
 {
 	struct ifaddrs *ifap;
 	struct ifaddrs *curp;
@@ -501,10 +502,19 @@ kvp_get_ip_address(int family, char *buffer, int length)
 	const char *str;
 	char tmp[50];
 	int error = 0;
-
+	char *buffer;
+	struct hv_kvp_ipaddr_value *ip_buffer;
+
+	if (op == KVP_OP_ENUMERATE) {
+		buffer = out_buffer;
+	} else {
+		ip_buffer = out_buffer;
+		buffer = (char *)ip_buffer->ip_addr;
+		ip_buffer->addr_family = 0;
+	}
 	/*
 	 * On entry into this function, the buffer is capable of holding the
-	 * maximum key value (2048 bytes).
+	 * maximum key value.
 	 */
 
 	if (getifaddrs(&ifap)) {
@@ -514,58 +524,87 @@ kvp_get_ip_address(int family, char *buffer, int length)
 
 	curp = ifap;
 	while (curp != NULL) {
-		if ((curp->ifa_addr != NULL) &&
-		   (curp->ifa_addr->sa_family == family)) {
-			if (family == AF_INET) {
-				struct sockaddr_in *addr =
-				(struct sockaddr_in *) curp->ifa_addr;
-
-				str = inet_ntop(family, &addr->sin_addr,
-						tmp, 50);
-				if (str == NULL) {
-					strcpy(buffer, "inet_ntop failed\n");
-					error = 1;
-					goto getaddr_done;
-				}
-				if (offset == 0)
-					strcpy(buffer, tmp);
-				else
-					strcat(buffer, tmp);
-				strcat(buffer, ";");
+		if (curp->ifa_addr == NULL) {
+			curp = curp->ifa_next;
+			continue;
+		}
 
-				offset += strlen(str) + 1;
-				if ((length - offset) < (ipv4_len + 1))
-					goto getaddr_done;
+		if ((if_name != NULL) &&
+			(strncmp(curp->ifa_name, if_name, strlen(if_name)))) {
+			/*
+			 * We want info about a specific interface;
+			 * just continue.
+			 */
+			curp = curp->ifa_next;
+			continue;
+		}
 
-			} else {
+		/*
+		 * We only support two address families: AF_INET and AF_INET6.
+		 * If a family value of 0 is specified, we collect both
+		 * supported address families; if not we gather info on
+		 * the specified address family.
+		 */
+		if ((family != 0) && (curp->ifa_addr->sa_family != family)) {
+			curp = curp->ifa_next;
+			continue;
+		}
+		if ((curp->ifa_addr->sa_family != AF_INET) &&
+			(curp->ifa_addr->sa_family != AF_INET6)) {
+			curp = curp->ifa_next;
+			continue;
+		}
+
+		if ((curp->ifa_addr->sa_family == AF_INET) &&
+			((family == AF_INET) || (family == 0))) {
+			struct sockaddr_in *addr =
+			(struct sockaddr_in *) curp->ifa_addr;
+
+			str = inet_ntop(AF_INET, &addr->sin_addr, tmp, 50);
+			if (str == NULL) {
+				strcpy(buffer, "inet_ntop failed\n");
+				error = 1;
+				goto getaddr_done;
+			}
+			if (offset == 0)
+				strcpy(buffer, tmp);
+			else
+				strcat(buffer, tmp);
+			strcat(buffer, ";");
+
+			offset += strlen(str) + 1;
+			if ((length - offset) < (ipv4_len + 1))
+				goto getaddr_done;
+
+		} else if ((family == AF_INET6) || (family == 0)) {
 
 			/*
 			 * We only support AF_INET and AF_INET6
 			 * and the list of addresses is separated by a ";".
 			 */
-				struct sockaddr_in6 *addr =
+			struct sockaddr_in6 *addr =
 				(struct sockaddr_in6 *) curp->ifa_addr;
 
-				str = inet_ntop(family,
+			str = inet_ntop(AF_INET6,
 					&addr->sin6_addr.s6_addr,
 					tmp, 50);
-				if (str == NULL) {
-					strcpy(buffer, "inet_ntop failed\n");
-					error = 1;
-					goto getaddr_done;
-				}
-				if (offset == 0)
-					strcpy(buffer, tmp);
-				else
-					strcat(buffer, tmp);
-				strcat(buffer, ";");
-				offset += strlen(str) + 1;
-				if ((length - offset) < (ipv6_len + 1))
-					goto getaddr_done;
-
+			if (str == NULL) {
+				strcpy(buffer, "inet_ntop failed\n");
+				error = 1;
+				goto getaddr_done;
 			}
+			if (offset == 0)
+				strcpy(buffer, tmp);
+			else
+				strcat(buffer, tmp);
+			strcat(buffer, ";");
+			offset += strlen(str) + 1;
+			if ((length - offset) < (ipv6_len + 1))
+				goto getaddr_done;
 
 		}
+
+
 		curp = curp->ifa_next;
 	}
 
@@ -812,13 +851,13 @@ int main(void)
 			strcpy(key_value, lic_version);
 			break;
 		case NetworkAddressIPv4:
-			kvp_get_ip_address(AF_INET, key_value,
-					HV_KVP_EXCHANGE_MAX_VALUE_SIZE);
+			kvp_get_ip_address(AF_INET, NULL, KVP_OP_ENUMERATE,
+				key_value, HV_KVP_EXCHANGE_MAX_VALUE_SIZE);
 			strcpy(key_name, "NetworkAddressIPv4");
 			break;
 		case NetworkAddressIPv6:
-			kvp_get_ip_address(AF_INET6, key_value,
-					HV_KVP_EXCHANGE_MAX_VALUE_SIZE);
+			kvp_get_ip_address(AF_INET6, NULL, KVP_OP_ENUMERATE,
+				key_value, HV_KVP_EXCHANGE_MAX_VALUE_SIZE);
 			strcpy(key_name, "NetworkAddressIPv6");
 			break;
 		case OSBuildNumber:
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 03/13] Drivers: hv: kvp: Support the new IP injection messages
From: K. Y. Srinivasan @ 2012-06-21 21:31 UTC (permalink / raw)
  To: gregkh, linux-kernel, devel, virtualization, ohering, apw
  Cc: K. Y. Srinivasan
In-Reply-To: <1340314305-27126-1-git-send-email-kys@microsoft.com>

Implement support for the new IP injection messages in the driver code.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/hv/hv_kvp.c |   70 +++++++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 63 insertions(+), 7 deletions(-)

diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index 6e6f0c2..4fd0932 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -51,7 +51,7 @@ static struct {
 static void kvp_send_key(struct work_struct *dummy);
 
 
-static void kvp_respond_to_host(char *key, char *value, int error);
+static void kvp_respond_to_host(struct hv_kvp_msg *msg, int error);
 static void kvp_work_func(struct work_struct *dummy);
 static void kvp_register(void);
 
@@ -96,7 +96,7 @@ kvp_work_func(struct work_struct *dummy)
 	 * If the timer fires, the user-mode component has not responded;
 	 * process the pending transaction.
 	 */
-	kvp_respond_to_host("Unknown key", "Guest timed out", HV_E_FAIL);
+	kvp_respond_to_host(NULL, HV_E_FAIL);
 }
 
 /*
@@ -107,7 +107,6 @@ static void
 kvp_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
 {
 	struct hv_kvp_msg *message;
-	struct hv_kvp_msg_enumerate *data;
 	int	error;
 
 	message = (struct hv_kvp_msg *)msg->data;
@@ -126,13 +125,59 @@ kvp_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
 	 * the user level daemon to transmit errors.
 	 */
 	error = *((int *)(&message->kvp_hdr.operation));
-	data = &message->body.kvp_enum_data;
 	/*
 	 * Complete the transaction by forwarding the key value
 	 * to the host. But first, cancel the timeout.
 	 */
 	if (cancel_delayed_work_sync(&kvp_work))
-		kvp_respond_to_host(data->data.key, data->data.value, error);
+		kvp_respond_to_host(message, error);
+}
+
+static void process_ipinfo(struct hv_kvp_msg *in_msg,
+		struct hv_kvp_msg *out_msg, int op)
+{
+	switch (op) {
+	case KVP_OP_SET_IP_INFO:
+		/*
+		 * Transform all parameters into utf8 encoding.
+		 */
+		utf16s_to_utf8s((wchar_t *)in_msg->body.kvp_ip_val.ip_addr,
+				MAX_IP_ADDR_SIZE,
+				UTF16_LITTLE_ENDIAN,
+				(__u8 *)out_msg->body.kvp_ip_val.ip_addr,
+				MAX_IP_ADDR_SIZE);
+
+		utf16s_to_utf8s((wchar_t *)in_msg->body.kvp_ip_val.sub_net,
+				MAX_IP_ADDR_SIZE,
+				UTF16_LITTLE_ENDIAN,
+				(__u8 *)out_msg->body.kvp_ip_val.sub_net,
+				MAX_IP_ADDR_SIZE);
+
+		utf16s_to_utf8s((wchar_t *)in_msg->body.kvp_ip_val.gate_way,
+				MAX_GATEWAY_SIZE,
+				UTF16_LITTLE_ENDIAN,
+				(__u8 *)out_msg->body.kvp_ip_val.gate_way,
+				MAX_GATEWAY_SIZE);
+
+		utf16s_to_utf8s((wchar_t *)in_msg->body.kvp_ip_val.dns_addr,
+				MAX_IP_ADDR_SIZE,
+				UTF16_LITTLE_ENDIAN,
+				(__u8 *)out_msg->body.kvp_ip_val.dns_addr,
+				MAX_IP_ADDR_SIZE);
+
+		out_msg->body.kvp_ip_val.dhcp_enabled =
+		in_msg->body.kvp_ip_val.dhcp_enabled;
+
+	default:
+		utf16s_to_utf8s((wchar_t *)in_msg->body.kvp_ip_val.adapter_id,
+				MAX_ADAPTER_ID_SIZE,
+				UTF16_LITTLE_ENDIAN,
+				(__u8 *)out_msg->body.kvp_ip_val.adapter_id,
+				MAX_ADAPTER_ID_SIZE);
+
+		out_msg->body.kvp_ip_val.addr_family =
+		in_msg->body.kvp_ip_val.addr_family;
+	}
 }
 
 static void
@@ -170,6 +215,12 @@ kvp_send_key(struct work_struct *dummy)
 	 */
 
 	switch (message->kvp_hdr.operation) {
+	case KVP_OP_SET_IP_INFO:
+		process_ipinfo(in_msg, message, KVP_OP_SET_IP_INFO);
+		break;
+	case KVP_OP_GET_IP_INFO:
+		process_ipinfo(in_msg, message, KVP_OP_GET_IP_INFO);
+		break;
 	case KVP_OP_SET:
 		switch (in_msg->body.kvp_set.data.value_type) {
 		case REG_SZ:
@@ -246,11 +297,12 @@ kvp_send_key(struct work_struct *dummy)
  */
 
 static void
-kvp_respond_to_host(char *key, char *value, int error)
+kvp_respond_to_host(struct hv_kvp_msg *msg_to_host, int error)
 {
 	struct hv_kvp_msg  *kvp_msg;
 	struct hv_kvp_exchg_msg_value  *kvp_data;
 	char	*key_name;
+	char	*value;
 	struct icmsg_hdr *icmsghdrp;
 	int	keylen = 0;
 	int	valuelen = 0;
@@ -309,6 +361,9 @@ kvp_respond_to_host(char *key, char *value, int error)
 			sizeof(struct icmsg_hdr)];
 
 	switch (kvp_transaction.kvp_msg->kvp_hdr.operation) {
+	case KVP_OP_GET_IP_INFO:
+		process_ipinfo(msg_to_host, kvp_msg, KVP_OP_SET_IP_INFO);
+		goto response_done;
 	case KVP_OP_GET:
 		kvp_data = &kvp_msg->body.kvp_get.data;
 		goto copy_value;
@@ -322,7 +377,7 @@ kvp_respond_to_host(char *key, char *value, int error)
 	}
 
 	kvp_data = &kvp_msg->body.kvp_enum_data.data;
-	key_name = key;
+	key_name = msg_to_host->body.kvp_enum_data.data.key;
 
 	/*
 	 * The windows host expects the key/value pair to be encoded
@@ -336,6 +391,7 @@ kvp_respond_to_host(char *key, char *value, int error)
 	kvp_data->key_size = 2*(keylen + 1); /* utf16 encoding */
 
 copy_value:
+	value = msg_to_host->body.kvp_enum_data.data.value;
 	valuelen = utf8s_to_utf16s(value, strlen(value), UTF16_HOST_ENDIAN,
 				(wchar_t *) kvp_data->value,
 				(HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2) - 2);
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 02/13] Drivers: hv: kvp: Cleanup error handling in KVP
From: K. Y. Srinivasan @ 2012-06-21 21:31 UTC (permalink / raw)
  To: gregkh, linux-kernel, devel, virtualization, ohering, apw
  Cc: K. Y. Srinivasan
In-Reply-To: <1340314305-27126-1-git-send-email-kys@microsoft.com>

In preparation for implementing IP injection, clean up the way we propagate
and handle errors in both the driver and the user-level daemon.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 drivers/hv/hv_kvp.c      |   43 +++++++++++++++++++------------------
 include/linux/hyperv.h   |   17 +++++++++-----
 tools/hv/hv_kvp_daemon.c |   53 +++++++++++++++++++++++----------------------
 3 files changed, 60 insertions(+), 53 deletions(-)

diff --git a/drivers/hv/hv_kvp.c b/drivers/hv/hv_kvp.c
index 0012eed..6e6f0c2 100644
--- a/drivers/hv/hv_kvp.c
+++ b/drivers/hv/hv_kvp.c
@@ -50,7 +50,6 @@ static struct {
 
 static void kvp_send_key(struct work_struct *dummy);
 
-#define TIMEOUT_FIRED 1
 
 static void kvp_respond_to_host(char *key, char *value, int error);
 static void kvp_work_func(struct work_struct *dummy);
@@ -97,7 +96,7 @@ kvp_work_func(struct work_struct *dummy)
 	 * If the timer fires, the user-mode component has not responded;
 	 * process the pending transaction.
 	 */
-	kvp_respond_to_host("Unknown key", "Guest timed out", TIMEOUT_FIRED);
+	kvp_respond_to_host("Unknown key", "Guest timed out", HV_E_FAIL);
 }
 
 /*
@@ -109,27 +108,31 @@ kvp_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp)
 {
 	struct hv_kvp_msg *message;
 	struct hv_kvp_msg_enumerate *data;
+	int	error;
 
 	message = (struct hv_kvp_msg *)msg->data;
-	switch (message->kvp_hdr.operation) {
-	case KVP_OP_REGISTER:
+
+	if (message->kvp_hdr.operation == KVP_OP_REGISTER) {
 		pr_info("KVP: user-mode registering done.\n");
 		kvp_register();
 		kvp_transaction.active = false;
-		hv_kvp_onchannelcallback(kvp_transaction.kvp_context);
-		break;
-
-	default:
-		data = &message->body.kvp_enum_data;
-		/*
-		 * Complete the transaction by forwarding the key value
-		 * to the host. But first, cancel the timeout.
-		 */
-		if (cancel_delayed_work_sync(&kvp_work))
-			kvp_respond_to_host(data->data.key,
-					 data->data.value,
-					!strlen(data->data.key));
+		if (kvp_transaction.kvp_context)
+			hv_kvp_onchannelcallback(kvp_transaction.kvp_context);
+		return;
 	}
+
+	/*
+	 * We use the message header information from
+	 * the user level daemon to transmit errors.
+	 */
+	error = *((int *)(&message->kvp_hdr.operation));
+	data = &message->body.kvp_enum_data;
+	/*
+	 * Complete the transaction by forwarding the key value
+	 * to the host. But first, cancel the timeout.
+	 */
+	if (cancel_delayed_work_sync(&kvp_work))
+		kvp_respond_to_host(data->data.key, data->data.value, error);
 }
 
 static void
@@ -287,6 +290,7 @@ kvp_respond_to_host(char *key, char *value, int error)
 		 */
 		return;
 
+	icmsghdrp->status = error;
 
 	/*
 	 * If the error parameter is set, terminate the host's enumeration
@@ -294,15 +298,12 @@ kvp_respond_to_host(char *key, char *value, int error)
 	 */
 	if (error) {
 		/*
-		 * Something failed or the we have timedout;
+		 * Something failed or  we have timedout;
 		 * terminate the current  host-side iteration.
 		 */
-		icmsghdrp->status = HV_S_CONT;
 		goto response_done;
 	}
 
-	icmsghdrp->status = HV_S_OK;
-
 	kvp_msg = (struct hv_kvp_msg *)
 			&recv_buffer[sizeof(struct vmbuspipe_hdr) +
 			sizeof(struct icmsg_hdr)];
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 0497764..9d75699 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -142,6 +142,17 @@ enum hv_kvp_exchg_pool {
 	KVP_POOL_COUNT /* Number of pools, must be last. */
 };
 
+/*
+ * Some Hyper-V status codes.
+ */
+
+#define HV_S_OK				0x00000000
+#define HV_E_FAIL			0x80004005
+#define HV_S_CONT			0x80070103
+#define HV_ERROR_NOT_SUPPORTED		0x80070032
+#define HV_ERROR_MACHINE_LOCKED		0x800704F7
+#define HV_ERROR_DEVICE_NOT_CONNECTED	0x8007048F
+
 #define ADDR_FAMILY_NONE	0x00
 #define ADDR_FAMILY_IPV4	0x01
 #define ADDR_FAMILY_IPV6	0x02
@@ -1000,12 +1011,6 @@ void vmbus_driver_unregister(struct hv_driver *hv_driver);
 #define ICMSGHDRFLAG_REQUEST		2
 #define ICMSGHDRFLAG_RESPONSE		4
 
-#define HV_S_OK				0x00000000
-#define HV_E_FAIL			0x80004005
-#define HV_S_CONT			0x80070103
-#define HV_ERROR_NOT_SUPPORTED		0x80070032
-#define HV_ERROR_MACHINE_LOCKED		0x800704F7
-#define HV_ERROR_DEVICE_NOT_CONNECTED	0x8007048F
 
 /*
  * While we want to handle util services as regular devices,
diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c
index d9834b3..4831997 100644
--- a/tools/hv/hv_kvp_daemon.c
+++ b/tools/hv/hv_kvp_daemon.c
@@ -394,7 +394,7 @@ static int kvp_get_value(int pool, __u8 *key, int key_size, __u8 *value,
 	return 1;
 }
 
-static void kvp_pool_enumerate(int pool, int index, __u8 *key, int key_size,
+static int kvp_pool_enumerate(int pool, int index, __u8 *key, int key_size,
 				__u8 *value, int value_size)
 {
 	struct kvp_record *record;
@@ -406,16 +406,12 @@ static void kvp_pool_enumerate(int pool, int index, __u8 *key, int key_size,
 	record = kvp_file_info[pool].records;
 
 	if (index >= kvp_file_info[pool].num_records) {
-		/*
-		 * This is an invalid index; terminate enumeration;
-		 * - a NULL value will do the trick.
-		 */
-		strcpy(value, "");
-		return;
+		return 1;
 	}
 
 	memcpy(key, record[index].key, key_size);
 	memcpy(value, record[index].value, value_size);
+	return 0;
 }
 
 
@@ -646,6 +642,8 @@ int main(void)
 	char	*p;
 	char	*key_value;
 	char	*key_name;
+	int	op;
+	int	pool;
 
 	daemon(1, 0);
 	openlog("KVP", 0, LOG_USER);
@@ -721,7 +719,16 @@ int main(void)
 		incoming_cn_msg = (struct cn_msg *)NLMSG_DATA(incoming_msg);
 		hv_msg = (struct hv_kvp_msg *)incoming_cn_msg->data;
 
-		switch (hv_msg->kvp_hdr.operation) {
+		/*
+		 * We will use the KVP header information to pass back
+		 * the error from this daemon. So, first copy the state
+		 * and set the error code to success.
+		 */
+		op = hv_msg->kvp_hdr.operation;
+		pool = hv_msg->kvp_hdr.pool;
+		*((int *)(&hv_msg->kvp_hdr.operation)) = HV_S_OK;
+
+		switch (op) {
 		case KVP_OP_REGISTER:
 			/*
 			 * Driver is registering with us; stash away the version
@@ -738,20 +745,14 @@ int main(void)
 			}
 			continue;
 
-		/*
-		 * The current protocol with the kernel component uses a
-		 * NULL key name to pass an error condition.
-		 * For the SET, GET and DELETE operations,
-		 * use the existing protocol to pass back error.
-		 */
-
 		case KVP_OP_SET:
 			if (kvp_key_add_or_modify(hv_msg->kvp_hdr.pool,
 					hv_msg->body.kvp_set.data.key,
 					hv_msg->body.kvp_set.data.key_size,
 					hv_msg->body.kvp_set.data.value,
 					hv_msg->body.kvp_set.data.value_size))
-				strcpy(hv_msg->body.kvp_set.data.key, "");
+				*((int *)(&hv_msg->kvp_hdr.operation)) =
+								HV_S_CONT;
 			break;
 
 		case KVP_OP_GET:
@@ -760,14 +761,16 @@ int main(void)
 					hv_msg->body.kvp_set.data.key_size,
 					hv_msg->body.kvp_set.data.value,
 					hv_msg->body.kvp_set.data.value_size))
-				strcpy(hv_msg->body.kvp_set.data.key, "");
+				*((int *)(&hv_msg->kvp_hdr.operation)) =
+								HV_S_CONT;
 			break;
 
 		case KVP_OP_DELETE:
 			if (kvp_key_delete(hv_msg->kvp_hdr.pool,
 					hv_msg->body.kvp_delete.key,
 					hv_msg->body.kvp_delete.key_size))
-				strcpy(hv_msg->body.kvp_delete.key, "");
+				*((int *)(&hv_msg->kvp_hdr.operation)) =
+								HV_S_CONT;
 			break;
 
 		default:
@@ -782,13 +785,15 @@ int main(void)
 		 * both the key and the value; if not read from the
 		 * appropriate pool.
 		 */
-		if (hv_msg->kvp_hdr.pool != KVP_POOL_AUTO) {
-			kvp_pool_enumerate(hv_msg->kvp_hdr.pool,
+		if (pool != KVP_POOL_AUTO) {
+			if (kvp_pool_enumerate(hv_msg->kvp_hdr.pool,
 					hv_msg->body.kvp_enum_data.index,
 					hv_msg->body.kvp_enum_data.data.key,
 					HV_KVP_EXCHANGE_MAX_KEY_SIZE,
 					hv_msg->body.kvp_enum_data.data.value,
-					HV_KVP_EXCHANGE_MAX_VALUE_SIZE);
+					HV_KVP_EXCHANGE_MAX_VALUE_SIZE))
+				*((int *)(&hv_msg->kvp_hdr.operation)) =
+								HV_S_CONT;
 			goto kvp_done;
 		}
 
@@ -841,11 +846,7 @@ int main(void)
 			strcpy(key_name, "ProcessorArchitecture");
 			break;
 		default:
-			strcpy(key_value, "Unknown Key");
-			/*
-			 * We use a null key name to terminate enumeration.
-			 */
-			strcpy(key_name, "");
+			*((int *)(&hv_msg->kvp_hdr.operation)) = HV_S_CONT;
 			break;
 		}
 		/*
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 01/13] Drivers: hv: Add KVP definitions for IP address injection
From: K. Y. Srinivasan @ 2012-06-21 21:31 UTC (permalink / raw)
  To: gregkh, linux-kernel, devel, virtualization, ohering, apw
In-Reply-To: <1340314200-27078-1-git-send-email-kys@microsoft.com>

Add the necessary definitions for supporting the IP injection functionality.

Signed-off-by: K. Y. Srinivasan <kys@microsoft.com>
Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
---
 include/linux/hyperv.h |   24 ++++++++++++++++++++++++
 1 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 68ed7f7..0497764 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -127,6 +127,8 @@ enum hv_kvp_exchg_op {
 	KVP_OP_SET,
 	KVP_OP_DELETE,
 	KVP_OP_ENUMERATE,
+	KVP_OP_GET_IP_INFO,
+	KVP_OP_SET_IP_INFO,
 	KVP_OP_REGISTER,
 	KVP_OP_COUNT /* Number of operations, must be last. */
 };
@@ -140,6 +142,26 @@ enum hv_kvp_exchg_pool {
 	KVP_POOL_COUNT /* Number of pools, must be last. */
 };
 
+#define ADDR_FAMILY_NONE	0x00
+#define ADDR_FAMILY_IPV4	0x01
+#define ADDR_FAMILY_IPV6	0x02
+
+#define MAX_ADAPTER_ID_SIZE	128
+#define MAX_IP_ADDR_SIZE	1024
+#define MAX_GATEWAY_SIZE	512
+
+
+struct hv_kvp_ipaddr_value {
+	__u16	adapter_id[MAX_ADAPTER_ID_SIZE];
+	__u8	addr_family;
+	__u8	dhcp_enabled;
+	__u16	ip_addr[MAX_IP_ADDR_SIZE];
+	__u16	sub_net[MAX_IP_ADDR_SIZE];
+	__u16	gate_way[MAX_GATEWAY_SIZE];
+	__u16	dns_addr[MAX_IP_ADDR_SIZE];
+} __attribute__((packed));
+
+
 struct hv_kvp_hdr {
 	__u8 operation;
 	__u8 pool;
@@ -187,6 +209,7 @@ struct hv_kvp_msg {
 		struct hv_kvp_msg_set		kvp_set;
 		struct hv_kvp_msg_delete	kvp_delete;
 		struct hv_kvp_msg_enumerate	kvp_enum_data;
+		struct hv_kvp_ipaddr_value	kvp_ip_val;
 		struct hv_kvp_register		kvp_register;
 	} body;
 } __attribute__((packed));
@@ -982,6 +1005,7 @@ void vmbus_driver_unregister(struct hv_driver *hv_driver);
 #define HV_S_CONT			0x80070103
 #define HV_ERROR_NOT_SUPPORTED		0x80070032
 #define HV_ERROR_MACHINE_LOCKED		0x800704F7
+#define HV_ERROR_DEVICE_NOT_CONNECTED	0x8007048F
 
 /*
  * While we want to handle util services as regular devices,
-- 
1.7.4.1

^ permalink raw reply related

* [PATCH 00/13] drivers: hv: kvp
From: K. Y. Srinivasan @ 2012-06-21 21:30 UTC (permalink / raw)
  To: gregkh, linux-kernel, devel, virtualization, ohering, apw

This patchset expands the KVP (Key Value Pair) functionality to
implement the mechanism to get/set IP addresses in the guest. This
functionality is used in Windows Server 2012 to implement VM
replication functionality. The way IP configuration information
is managed is distro-specific. The current implementation supports
the Red Hat way of doing things. We will expand support to other distros
incrementally.

K. Y. Srinivasan (13):
  Drivers: hv: Add KVP definitions for IP address injection
  Drivers: hv: kvp: Cleanup error handling in KVP
  Drivers: hv: kvp: Support the new IP injection messages
  Tools: hv: Prepare to expand  kvp_get_ip_address() functionality
  Tools: hv: Further refactor kvp_get_ip_address()
  Tools: hv: Gather address family information
  Tools: hv: Gather subnet information
  Tools: hv: Represent the ipv6 mask using CIDR notation
  Tools: hv: Gather DNS information
  Tools: hv: Gather ipv[4,6] gateway information
  Tools: hv: Gather dhcp information
  Tools: hv: Implement the KVP verb - KVP_OP_GET_IP_INFO
  Tools: hv: Implement the KVP verb - KVP_OP_SET_IP_INFO

 drivers/hv/hv_kvp.c      |  103 ++++++--
 include/linux/hyperv.h   |   39 +++-
 tools/hv/hv_kvp_daemon.c |  650 ++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 686 insertions(+), 106 deletions(-)

-- 
1.7.4.1

^ permalink raw reply

* Re: [Qemu-devel] [PATCH 2/2] virtio-scsi: Implement hotplug support for virtio-scsi
From: Stefan Hajnoczi @ 2012-06-21 10:56 UTC (permalink / raw)
  To: Cong Meng
  Cc: stefanha, zwanp, linuxram, qemu-devel, senwang, Anthony Liguori,
	Paolo Bonzini, virtualization, Andreas Färber
In-Reply-To: <69a1a349ba21144e66843ab335cb3a243926bba8.1340022196.git.mc@linux.vnet.ibm.com>

On Wed, Jun 20, 2012 at 7:47 AM, Cong Meng <mc@linux.vnet.ibm.com> wrote:
> Implement the hotplug() and hot_unplug() interfaces in virtio-scsi, by signal
> the virtio_scsi.ko in guest kernel via event virtual queue.
>
> The counterpart patch of virtio_scsi.ko will be sent soon in another thread.
>
> Signed-off-by: Cong Meng <mc@linux.vnet.ibm.com>
> Signed-off-by: Sen Wang <senwang@linux.vnet.ibm.com>
> ---
>  hw/virtio-scsi.c |   72 +++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 69 insertions(+), 3 deletions(-)

I compared against the virtio-scsi specification and this looks good:
http://ozlabs.org/~rusty/virtio-spec/virtio-0.9.5.pdf

Dropped events and event throttling are not implemented by this patch.
 This means that the guest can miss events if it runs out of event
queue elements.  A scenario that might be able to trigger this is if
multiple LUNs are hotplugged in a single QEMU monitor callback.

Implementing dropped events is easy in hw/virtio-scsi.c.  Keep a bool
or counter of dropped events and report them when the guest kicks us
with a free event element (virtio_scsi_handle_event).

Stefan

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Dor Laor @ 2012-06-21  9:49 UTC (permalink / raw)
  To: Asias He
  Cc: kvm, Michael S. Tsirkin, linux-kernel, virtualization,
	Sasha Levin, Christoph Hellwig
In-Reply-To: <4FE155B3.8020206@redhat.com>

On 06/20/2012 07:46 AM, Asias He wrote:
> On 06/19/2012 02:21 PM, Dor Laor wrote:
>> On 06/19/2012 05:51 AM, Asias He wrote:
>>> On 06/18/2012 07:39 PM, Sasha Levin wrote:
>>>> On Mon, 2012-06-18 at 14:14 +0300, Dor Laor wrote:
>>>>> On 06/18/2012 01:05 PM, Rusty Russell wrote:
>>>>>> On Mon, 18 Jun 2012 16:03:23 +0800, Asias He<asias@redhat.com> wrote:
>>>>>>> On 06/18/2012 03:46 PM, Rusty Russell wrote:
>>>>>>>> On Mon, 18 Jun 2012 14:53:10 +0800, Asias He<asias@redhat.com>
>>>>>>>> wrote:
>>>>>>>>> This patch introduces bio-based IO path for virtio-blk.
>>>>>>>>
>>>>>>>> Why make it optional?
>>>>>>>
>>>>>>> request-based IO path is useful for users who do not want to bypass
>>>>>>> the
>>>>>>> IO scheduler in guest kernel, e.g. users using spinning disk. For
>>>>>>> users
>>>>>>> using fast disk device, e.g. SSD device, they can use bio-based IO
>>>>>>> path.
>>>>>>
>>>>>> Users using a spinning disk still get IO scheduling in the host
>>>>>> though.
>>>>>> What benefit is there in doing it in the guest as well?
>>>>>
>>>>> The io scheduler waits for requests to merge and thus batch IOs
>>>>> together. It's not important w.r.t spinning disks since the host
>>>>> can do
>>>>> it but it causes much less vmexits which is the key issue for VMs.
>>>>
>>>> Is the amount of exits caused by virtio-blk significant at all with
>>>> EVENT_IDX?
>>>
>>> Yes. EVENT_IDX saves the number of notify and interrupt. Let's take the
>>> interrupt as an example, The guest fires 200K request to host, the
>>> number of interrupt is about 6K thanks to EVENT_IDX. The ratio is 200K /
>>> 6K = 33. The ratio of merging is 40000K / 200K = 20.
>>>
>>
>> In this case, why don't you always recommend bio over request based?
>
> This case shows that IO scheduler's merging in guest saves a lot of
> requests to host side. Why should I recommend bio over request based here?
>

Does it merge 20 requests _on top_ of what event idx does? Of course, if
that's the case, we should keep that.

Dor

^ permalink raw reply

* Re: [Qemu-devel] [PATCH 1/2 v2] scsi bus: introduce hotplug() and hot_unplug() interfaces for SCSI bus
From: Stefan Hajnoczi @ 2012-06-21  9:17 UTC (permalink / raw)
  To: Cong Meng
  Cc: stefanha, zwanp, linuxram, qemu-devel, senwang, Anthony Liguori,
	Paolo Bonzini, virtualization, Andreas Färber
In-Reply-To: <ad3714e594248d5a8d558b6983db3fb2da65ae9a.1340264611.git.mc@linux.vnet.ibm.com>

On Thu, Jun 21, 2012 at 8:54 AM, Cong Meng <mc@linux.vnet.ibm.com> wrote:
> +static int scsi_qdev_unplug(DeviceState *qdev)
> +{
> +    SCSIDevice *dev = SCSI_DEVICE(qdev);
> +    SCSIBus *bus = scsi_bus_from_device(dev);
> +
> +    if (bus->info->hot_unplug)
> +        bus->info->hot_unplug(bus, dev);

Please use scripts/checkpatch.pl to ensure that your patch follows
QEMU coding style.  if statements must use {}.

Stefan

^ permalink raw reply

* [PATCH 1/2 v2] scsi bus: introduce hotplug() and hot_unplug() interfaces for SCSI bus
From: Cong Meng @ 2012-06-21  7:54 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: stefanha, qemu-devel, zwanp, linuxram, senwang, virtualization,
	Cong Meng, Anthony Liguori, Andreas Färber
In-Reply-To: <cover.1340264611.git.mc@linux.vnet.ibm.com>

Add two interfaces hotplug() and hot_unplug() to scsi bus info.
A concrete SCSI bus can implement these two interfaces to signal the HBA driver
of the guest kernel to add/remove the SCSI device in question.

Signed-off-by: Cong Meng <mc@linux.vnet.ibm.com>
Signed-off-by: Sen Wang <senwang@linux.vnet.ibm.com>
---
 hw/scsi-bus.c |   16 +++++++++++++++-
 hw/scsi.h     |    2 ++
 2 files changed, 17 insertions(+), 1 deletions(-)

diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index dbdb99c..f08900e 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -177,6 +177,10 @@ static int scsi_qdev_init(DeviceState *qdev)
                                                          dev);
     }
 
+    if (bus->info->hotplug) {
+        bus->info->hotplug(bus, dev);
+    }
+
 err:
     return rc;
 }
@@ -1539,6 +1543,16 @@ static int get_scsi_requests(QEMUFile *f, void *pv, size_t size)
     return 0;
 }
 
+static int scsi_qdev_unplug(DeviceState *qdev)
+{
+    SCSIDevice *dev = SCSI_DEVICE(qdev);
+    SCSIBus *bus = scsi_bus_from_device(dev);
+
+    if (bus->info->hot_unplug)
+        bus->info->hot_unplug(bus, dev);
+    return qdev_simple_unplug_cb(qdev);
+}
+
 const VMStateInfo vmstate_info_scsi_requests = {
     .name = "scsi-requests",
     .get  = get_scsi_requests,
@@ -1575,7 +1589,7 @@ static void scsi_device_class_init(ObjectClass *klass, void *data)
     DeviceClass *k = DEVICE_CLASS(klass);
     k->bus_info = &scsi_bus_info;
     k->init     = scsi_qdev_init;
-    k->unplug   = qdev_simple_unplug_cb;
+    k->unplug   = scsi_qdev_unplug;
     k->exit     = scsi_qdev_exit;
 }
 
diff --git a/hw/scsi.h b/hw/scsi.h
index 2eb66f7..5768071 100644
--- a/hw/scsi.h
+++ b/hw/scsi.h
@@ -130,6 +130,8 @@ struct SCSIBusInfo {
     void (*transfer_data)(SCSIRequest *req, uint32_t arg);
     void (*complete)(SCSIRequest *req, uint32_t arg, size_t resid);
     void (*cancel)(SCSIRequest *req);
+    void (*hotplug)(SCSIBus *bus, SCSIDevice *dev);
+    void (*hot_unplug)(SCSIBus *bus, SCSIDevice *dev);
     QEMUSGList *(*get_sg_list)(SCSIRequest *req);
 
     void (*save_request)(QEMUFile *f, SCSIRequest *req);
-- 
1.7.7.6

^ permalink raw reply related

* Re: [PATCH 1/2] scsi bus: introduce hotplug() and hot_unplug() interfaces for SCSI bus
From: Andreas Färber @ 2012-06-20 13:10 UTC (permalink / raw)
  To: Cong Meng, Paolo Bonzini, Anthony Liguori
  Cc: stefanha, qemu-devel, zwanp, linuxram, senwang, virtualization
In-Reply-To: <287bf70a546a1ce51a6f3405ba620daaf284bdfe.1340022196.git.mc@linux.vnet.ibm.com>

Am 20.06.2012 08:47, schrieb Cong Meng:
> Add two interfaces hotplug() and hot_unplug() to scsi bus info.
> The embody scsi bus can implement these two interfaces to signal the HBA driver
> of guest kernel to add/remove the scsi device in question.
> 
> Signed-off-by: Cong Meng <mc@linux.vnet.ibm.com>
> Signed-off-by: Sen Wang <senwang@linux.vnet.ibm.com>
> ---
>  hw/scsi-bus.c |   16 +++++++++++++++-
>  hw/scsi.h     |    2 ++
>  2 files changed, 17 insertions(+), 1 deletions(-)
> 
> diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
> index dbdb99c..cc3ec75 100644
> --- a/hw/scsi-bus.c
> +++ b/hw/scsi-bus.c
> @@ -177,6 +177,10 @@ static int scsi_qdev_init(DeviceState *qdev)
>                                                           dev);
>      }
>  
> +    if (bus->info->hotplug) {
> +	bus->info->hotplug(bus, dev);

Tab.

> +    }
> +
>  err:
>      return rc;
>  }
> @@ -1539,6 +1543,16 @@ static int get_scsi_requests(QEMUFile *f, void *pv, size_t size)
>      return 0;
>  }
>  
> +static int scsi_qdev_unplug(DeviceState *qdev)
> +{
> +    SCSIDevice *dev = SCSI_DEVICE(qdev);
> +    SCSIBus *bus = DO_UPCAST(SCSIBus, qbus, dev->qdev.parent_bus);

Since the tabs need to be fixed anyway, I would suggest to avoid using
DO_UPCAST() with QOM types:

SCSIBus *bus = SCSI_BUS(qdev->parent_bus);

Also I'd like to raise the question towards Paolo and Anthony whether we
might want to start naming new functions like this one
scsi_device_unplug() to avoid the "qdev"? In this case sticking to
"qdev" provides symmetry so there are good reasons for both approaches.

> +
> +    if (bus->info->hot_unplug)
> +	bus->info->hot_unplug(bus, dev);

Tab. It seems your editor converts all 8-space indents, please check the
second patch.

Otherwise looks okay from my side.

Andreas

> +    return qdev_simple_unplug_cb(qdev);
> +}
> +
>  const VMStateInfo vmstate_info_scsi_requests = {
>      .name = "scsi-requests",
>      .get  = get_scsi_requests,
> @@ -1575,7 +1589,7 @@ static void scsi_device_class_init(ObjectClass *klass, void *data)
>      DeviceClass *k = DEVICE_CLASS(klass);
>      k->bus_info = &scsi_bus_info;
>      k->init     = scsi_qdev_init;
> -    k->unplug   = qdev_simple_unplug_cb;
> +    k->unplug   = scsi_qdev_unplug;
>      k->exit     = scsi_qdev_exit;
>  }
>  
> diff --git a/hw/scsi.h b/hw/scsi.h
> index 2eb66f7..5768071 100644
> --- a/hw/scsi.h
> +++ b/hw/scsi.h
> @@ -130,6 +130,8 @@ struct SCSIBusInfo {
>      void (*transfer_data)(SCSIRequest *req, uint32_t arg);
>      void (*complete)(SCSIRequest *req, uint32_t arg, size_t resid);
>      void (*cancel)(SCSIRequest *req);
> +    void (*hotplug)(SCSIBus *bus, SCSIDevice *dev);
> +    void (*hot_unplug)(SCSIBus *bus, SCSIDevice *dev);
>      QEMUSGList *(*get_sg_list)(SCSIRequest *req);
>  
>      void (*save_request)(QEMUFile *f, SCSIRequest *req);

-- 
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg

^ permalink raw reply

* [PATCH] virtio-scsi: hotplug support for virtio-scsi
From: Cong Meng @ 2012-06-20  6:55 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: stefanha, linux-scsi, linux-kernel, zwanp, linuxram, senwang,
	virtualization, James E.J. Bottomley, Cong Meng, Anthony Liguori

This patch implements hotplug support for virtio-scsi.
When a device is attached/detached, the virtio-scsi driver is signaled
via the event virtqueue and automatically adds/removes the SCSI device
in question.

Signed-off-by: Cong Meng <mc@linux.vnet.ibm.com>
Signed-off-by: Sen Wang <senwang@linux.vnet.ibm.com>
---
 drivers/scsi/virtio_scsi.c  |  109 ++++++++++++++++++++++++++++++++++++++++++-
 include/linux/virtio_scsi.h |    9 ++++
 2 files changed, 116 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 1b38431..6dad625 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -25,6 +25,7 @@
 #include <scsi/scsi_cmnd.h>
 
 #define VIRTIO_SCSI_MEMPOOL_SZ 64
+#define VIRTIO_SCSI_EVENT_LEN 8
 
 /* Command queue element */
 struct virtio_scsi_cmd {
@@ -43,9 +44,15 @@ struct virtio_scsi_cmd {
 	} resp;
 } ____cacheline_aligned_in_smp;
 
+struct virtio_scsi_event_node {
+	struct virtio_scsi *vscsi;
+	struct virtio_scsi_event event;
+	struct work_struct work;
+};
+
 /* Driver instance state */
 struct virtio_scsi {
-	/* Protects ctrl_vq, req_vq and sg[] */
+	/* Protects ctrl_vq, event_vq, req_vq and sg[] */
 	spinlock_t vq_lock;
 
 	struct virtio_device *vdev;
@@ -53,6 +60,9 @@ struct virtio_scsi {
 	struct virtqueue *event_vq;
 	struct virtqueue *req_vq;
 
+	/* Get some buffers ready for event vq */
+	struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
+
 	/* For sglist construction when adding commands to the virtqueue.  */
 	struct scatterlist sg[];
 };
@@ -184,9 +194,93 @@ static void virtscsi_ctrl_done(struct virtqueue *vq)
 	virtscsi_vq_done(vq, virtscsi_complete_free);
 };
 
+static int virtscsi_kick_event(struct virtio_scsi *vscsi,
+			       struct virtio_scsi_event_node *event_node)
+{
+	int ret;
+	struct scatterlist sg;
+	unsigned long flags;
+
+	sg_set_buf(&sg, &event_node->event, sizeof(struct virtio_scsi_event));
+
+	spin_lock_irqsave(&vscsi->vq_lock, flags);
+
+	ret = virtqueue_add_buf(vscsi->event_vq, &sg, 0, 1, event_node, GFP_ATOMIC);
+	if (ret >= 0)
+		virtqueue_kick(vscsi->event_vq);
+
+	spin_unlock_irqrestore(&vscsi->vq_lock, flags);
+
+	return ret;
+}
+
+static int virtscsi_kick_event_all(struct virtio_scsi *vscsi)
+{
+	int i;
+
+	for (i = 0; i < VIRTIO_SCSI_EVENT_LEN; i++) {
+		vscsi->event_list[i].vscsi = vscsi;
+		virtscsi_kick_event(vscsi, &vscsi->event_list[i]);
+	}
+
+	return 0;
+}
+
+static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi,
+                                            struct virtio_scsi_event *event)
+{
+	struct scsi_device *sdev;
+	struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
+	unsigned int target = event->lun[1];
+	unsigned int lun = (event->lun[2] << 8) | event->lun[3];
+
+	switch (event->reason) {
+	case VIRTIO_SCSI_EVT_RESET_RESCAN:
+		scsi_add_device(shost, 0, target, lun);
+		break;
+	case VIRTIO_SCSI_EVT_RESET_REMOVED:
+		sdev = scsi_device_lookup(shost, 0, target, lun);
+		if (sdev) {
+			scsi_remove_device(sdev);
+			scsi_device_put(sdev);
+		} else {
+			pr_err("SCSI device %d 0 %d %d not found\n",
+				shost->host_no, target, lun);
+		}
+		break;
+	default:
+		pr_info("Unsupport virtio scsi event reason %x\n", event->reason);
+	}
+}
+
+static void virtscsi_handle_event(struct work_struct *work)
+{
+	struct virtio_scsi_event_node *event_node = 
+		container_of(work, struct virtio_scsi_event_node, work);
+	struct virtio_scsi *vscsi = event_node->vscsi;
+	struct virtio_scsi_event *event = &event_node->event;
+
+	switch (event->event) {
+	case VIRTIO_SCSI_T_TRANSPORT_RESET:
+		virtscsi_handle_transport_reset(vscsi, event);
+		break;
+	default:
+		pr_err("Unsupport virtio scsi event %x\n", event->event);
+	}
+	virtscsi_kick_event(vscsi, event_node);
+}
+
+static void virtscsi_complete_event(void *buf)
+{
+	struct virtio_scsi_event_node *event_node = buf;
+
+	INIT_WORK(&event_node->work, virtscsi_handle_event);
+	schedule_work(&event_node->work);
+}
+
 static void virtscsi_event_done(struct virtqueue *vq)
 {
-	virtscsi_vq_done(vq, virtscsi_complete_free);
+	virtscsi_vq_done(vq, virtscsi_complete_event);
 };
 
 static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx,
@@ -435,6 +529,11 @@ static int virtscsi_init(struct virtio_device *vdev,
 
 	virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
 	virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
+
+	if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
+		virtscsi_kick_event_all(vscsi);
+	}
+
 	return 0;
 }
 
@@ -532,7 +631,13 @@ static struct virtio_device_id id_table[] = {
 	{ 0 },
 };
 
+static unsigned int features[] = {
+	VIRTIO_SCSI_F_HOTPLUG
+};
+
 static struct virtio_driver virtio_scsi_driver = {
+	.feature_table = features,
+	.feature_table_size = ARRAY_SIZE(features),
 	.driver.name = KBUILD_MODNAME,
 	.driver.owner = THIS_MODULE,
 	.id_table = id_table,
diff --git a/include/linux/virtio_scsi.h b/include/linux/virtio_scsi.h
index 8ddeafd..dc8d305 100644
--- a/include/linux/virtio_scsi.h
+++ b/include/linux/virtio_scsi.h
@@ -69,6 +69,10 @@ struct virtio_scsi_config {
 	u32 max_lun;
 } __packed;
 
+/* Feature Bits */
+#define VIRTIO_SCSI_F_INOUT                    0
+#define VIRTIO_SCSI_F_HOTPLUG                  1
+
 /* Response codes */
 #define VIRTIO_SCSI_S_OK                       0
 #define VIRTIO_SCSI_S_OVERRUN                  1
@@ -105,6 +109,11 @@ struct virtio_scsi_config {
 #define VIRTIO_SCSI_T_TRANSPORT_RESET          1
 #define VIRTIO_SCSI_T_ASYNC_NOTIFY             2
 
+/* Reasons of transport reset event */
+#define VIRTIO_SCSI_EVT_RESET_HARD             0
+#define VIRTIO_SCSI_EVT_RESET_RESCAN           1
+#define VIRTIO_SCSI_EVT_RESET_REMOVED          2
+
 #define VIRTIO_SCSI_S_SIMPLE                   0
 #define VIRTIO_SCSI_S_ORDERED                  1
 #define VIRTIO_SCSI_S_HEAD                     2
-- 
1.7.7.6

^ permalink raw reply related

* [PATCH 2/2] virtio-scsi: Implement hotplug support for virtio-scsi
From: Cong Meng @ 2012-06-20  6:47 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: stefanha, qemu-devel, zwanp, linuxram, senwang, virtualization,
	Cong Meng, Anthony Liguori, Andreas Färber
In-Reply-To: <cover.1340022196.git.mc@linux.vnet.ibm.com>

Implement the hotplug() and hot_unplug() interfaces in virtio-scsi by signaling
virtio_scsi.ko in the guest kernel via the event virtqueue.

The counterpart patch of virtio_scsi.ko will be sent soon in another thread.

Signed-off-by: Cong Meng <mc@linux.vnet.ibm.com>
Signed-off-by: Sen Wang <senwang@linux.vnet.ibm.com>
---
 hw/virtio-scsi.c |   72 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/hw/virtio-scsi.c b/hw/virtio-scsi.c
index e8328f4..626ec5f 100644
--- a/hw/virtio-scsi.c
+++ b/hw/virtio-scsi.c
@@ -24,6 +24,10 @@
 #define VIRTIO_SCSI_MAX_TARGET  255
 #define VIRTIO_SCSI_MAX_LUN     16383
 
+/* Feature Bits */
+#define VIRTIO_SCSI_F_INOUT                    0
+#define VIRTIO_SCSI_F_HOTPLUG                  1
+
 /* Response codes */
 #define VIRTIO_SCSI_S_OK                       0
 #define VIRTIO_SCSI_S_OVERRUN                  1
@@ -60,6 +64,11 @@
 #define VIRTIO_SCSI_T_TRANSPORT_RESET          1
 #define VIRTIO_SCSI_T_ASYNC_NOTIFY             2
 
+/* Reasons of transport reset event */     
+#define VIRTIO_SCSI_EVT_RESET_HARD             0
+#define VIRTIO_SCSI_EVT_RESET_RESCAN           1
+#define VIRTIO_SCSI_EVT_RESET_REMOVED          2
+
 /* SCSI command request, followed by data-out */
 typedef struct {
     uint8_t lun[8];              /* Logical Unit Number */
@@ -206,11 +215,12 @@ static void qemu_sgl_init_external(QEMUSGList *qsgl, struct iovec *sg,
 static void virtio_scsi_parse_req(VirtIOSCSI *s, VirtQueue *vq,
                                   VirtIOSCSIReq *req)
 {
-    assert(req->elem.out_num && req->elem.in_num);
+    assert(req->elem.in_num);
     req->vq = vq;
     req->dev = s;
     req->sreq = NULL;
-    req->req.buf = req->elem.out_sg[0].iov_base;
+    if (req->elem.out_num)
+        req->req.buf = req->elem.out_sg[0].iov_base;
     req->resp.buf = req->elem.in_sg[0].iov_base;
 
     if (req->elem.out_num > 1) {
@@ -405,6 +415,10 @@ static void virtio_scsi_handle_ctrl(VirtIODevice *vdev, VirtQueue *vq)
     }
 }
 
+static void virtio_scsi_handle_event(VirtIODevice *vdev, VirtQueue *vq)
+{
+}
+
 static void virtio_scsi_command_complete(SCSIRequest *r, uint32_t status,
                                          size_t resid)
 {
@@ -541,6 +555,7 @@ static void virtio_scsi_set_config(VirtIODevice *vdev,
 static uint32_t virtio_scsi_get_features(VirtIODevice *vdev,
                                          uint32_t requested_features)
 {
+    requested_features |= (1UL << VIRTIO_SCSI_F_HOTPLUG);
     return requested_features;
 }
 
@@ -568,6 +583,55 @@ static int virtio_scsi_load(QEMUFile *f, void *opaque, int version_id)
     return 0;
 }
 
+static void virtio_scsi_push_event(VirtIOSCSI *s, uint32_t target, uint32_t lun,
+                                   uint32_t event, uint32_t reason)
+{
+    VirtIOSCSIReq *req;
+    VirtIOSCSIEvent *evt;
+
+    if ((req = virtio_scsi_pop_req(s, s->event_vq))) {
+        int in_size;
+        if (req->elem.out_num || req->elem.in_num != 1) {
+            virtio_scsi_bad_req();
+        }
+
+        in_size = req->elem.in_sg[0].iov_len;
+        if (in_size < sizeof(VirtIOSCSIEvent)) {
+            virtio_scsi_bad_req();
+        }
+
+        evt = req->resp.event;
+        evt->event = event;
+        evt->reason = reason;
+        evt->lun[0] = 0;
+        evt->lun[1] = target;
+        evt->lun[2] = (lun >> 8);
+        evt->lun[3] = lun & 0xFF;
+        virtio_scsi_complete_req(req);
+    }
+}
+
+static void virtio_scsi_hotplug(SCSIBus *bus, SCSIDevice *dev)
+{
+    VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus);
+
+    if (((s->vdev.guest_features >> VIRTIO_SCSI_F_HOTPLUG) & 1) &&
+        (s->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+        virtio_scsi_push_event(s, dev->id, dev->lun, 
+            VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_RESCAN);
+    }
+}
+
+static void virtio_scsi_hot_unplug(SCSIBus *bus, SCSIDevice *dev)
+{
+    VirtIOSCSI *s = container_of(bus, VirtIOSCSI, bus);
+
+    if ((s->vdev.guest_features >> VIRTIO_SCSI_F_HOTPLUG) & 1) {
+        virtio_scsi_push_event(s, dev->id, dev->lun, 
+            VIRTIO_SCSI_T_TRANSPORT_RESET, VIRTIO_SCSI_EVT_RESET_REMOVED);
+    }
+}
+
 static struct SCSIBusInfo virtio_scsi_scsi_info = {
     .tcq = true,
     .max_channel = VIRTIO_SCSI_MAX_CHANNEL,
@@ -576,6 +640,8 @@ static struct SCSIBusInfo virtio_scsi_scsi_info = {
 
     .complete = virtio_scsi_command_complete,
     .cancel = virtio_scsi_request_cancelled,
+    .hotplug = virtio_scsi_hotplug,
+    .hot_unplug = virtio_scsi_hot_unplug,
     .get_sg_list = virtio_scsi_get_sg_list,
     .save_request = virtio_scsi_save_request,
     .load_request = virtio_scsi_load_request,
@@ -604,7 +670,7 @@ VirtIODevice *virtio_scsi_init(DeviceState *dev, VirtIOSCSIConf *proxyconf)
     s->ctrl_vq = virtio_add_queue(&s->vdev, VIRTIO_SCSI_VQ_SIZE,
                                    virtio_scsi_handle_ctrl);
     s->event_vq = virtio_add_queue(&s->vdev, VIRTIO_SCSI_VQ_SIZE,
-                                   NULL);
+                                   virtio_scsi_handle_event);
     for (i = 0; i < s->conf->num_queues; i++) {
         s->cmd_vqs[i] = virtio_add_queue(&s->vdev, VIRTIO_SCSI_VQ_SIZE,
                                          virtio_scsi_handle_cmd);
-- 
1.7.7

^ permalink raw reply related

* [PATCH 1/2] scsi bus: introduce hotplug() and hot_unplug() interfaces for SCSI bus
From: Cong Meng @ 2012-06-20  6:47 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: stefanha, qemu-devel, zwanp, linuxram, senwang, virtualization,
	Cong Meng, Anthony Liguori, Andreas Färber
In-Reply-To: <cover.1340022196.git.mc@linux.vnet.ibm.com>

Add two interfaces hotplug() and hot_unplug() to scsi bus info.
A concrete SCSI bus implementation can implement these two interfaces to signal
the HBA driver in the guest kernel to add/remove the SCSI device in question.

Signed-off-by: Cong Meng <mc@linux.vnet.ibm.com>
Signed-off-by: Sen Wang <senwang@linux.vnet.ibm.com>
---
 hw/scsi-bus.c |   16 +++++++++++++++-
 hw/scsi.h     |    2 ++
 2 files changed, 17 insertions(+), 1 deletions(-)

diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index dbdb99c..cc3ec75 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -177,6 +177,10 @@ static int scsi_qdev_init(DeviceState *qdev)
                                                          dev);
     }
 
+    if (bus->info->hotplug) {
+	bus->info->hotplug(bus, dev);
+    }
+
 err:
     return rc;
 }
@@ -1539,6 +1543,16 @@ static int get_scsi_requests(QEMUFile *f, void *pv, size_t size)
     return 0;
 }
 
+static int scsi_qdev_unplug(DeviceState *qdev)
+{
+    SCSIDevice *dev = SCSI_DEVICE(qdev);
+    SCSIBus *bus = DO_UPCAST(SCSIBus, qbus, dev->qdev.parent_bus);
+
+    if (bus->info->hot_unplug)
+	bus->info->hot_unplug(bus, dev);
+    return qdev_simple_unplug_cb(qdev);
+}
+
 const VMStateInfo vmstate_info_scsi_requests = {
     .name = "scsi-requests",
     .get  = get_scsi_requests,
@@ -1575,7 +1589,7 @@ static void scsi_device_class_init(ObjectClass *klass, void *data)
     DeviceClass *k = DEVICE_CLASS(klass);
     k->bus_info = &scsi_bus_info;
     k->init     = scsi_qdev_init;
-    k->unplug   = qdev_simple_unplug_cb;
+    k->unplug   = scsi_qdev_unplug;
     k->exit     = scsi_qdev_exit;
 }
 
diff --git a/hw/scsi.h b/hw/scsi.h
index 2eb66f7..5768071 100644
--- a/hw/scsi.h
+++ b/hw/scsi.h
@@ -130,6 +130,8 @@ struct SCSIBusInfo {
     void (*transfer_data)(SCSIRequest *req, uint32_t arg);
     void (*complete)(SCSIRequest *req, uint32_t arg, size_t resid);
     void (*cancel)(SCSIRequest *req);
+    void (*hotplug)(SCSIBus *bus, SCSIDevice *dev);
+    void (*hot_unplug)(SCSIBus *bus, SCSIDevice *dev);
     QEMUSGList *(*get_sg_list)(SCSIRequest *req);
 
     void (*save_request)(QEMUFile *f, SCSIRequest *req);
-- 
1.7.7

^ permalink raw reply related

* [PATCH 0/2] Hotplug support for virtio-scsi
From: Cong Meng @ 2012-06-20  6:47 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: stefanha, qemu-devel, zwanp, linuxram, senwang, virtualization,
	Cong Meng, Anthony Liguori, Andreas Färber

These patches implement the hotplug support for virtio-scsi.
When a new device attaches/detaches to virtio-scsi bus via device_add/device_del
commands, the HBA driver in guest kernel will be signaled to add/remove the scsi
device.

Cong Meng (2):
  scsi bus: introduce hotplug() and hot_unplug() interfaces for SCSI bus
  virtio-scsi: Implement hotplug support for virtio-scsi

 hw/scsi-bus.c    |   16 +++++++++++-
 hw/scsi.h        |    2 +
 hw/virtio-scsi.c |   72 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 86 insertions(+), 4 deletions(-)

-- 
1.7.7

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Asias He @ 2012-06-20  4:46 UTC (permalink / raw)
  To: dlaor
  Cc: kvm, Michael S. Tsirkin, linux-kernel, virtualization,
	Sasha Levin, Christoph Hellwig
In-Reply-To: <4FE01A5F.6060201@redhat.com>

On 06/19/2012 02:21 PM, Dor Laor wrote:
> On 06/19/2012 05:51 AM, Asias He wrote:
>> On 06/18/2012 07:39 PM, Sasha Levin wrote:
>>> On Mon, 2012-06-18 at 14:14 +0300, Dor Laor wrote:
>>>> On 06/18/2012 01:05 PM, Rusty Russell wrote:
>>>>> On Mon, 18 Jun 2012 16:03:23 +0800, Asias He<asias@redhat.com> wrote:
>>>>>> On 06/18/2012 03:46 PM, Rusty Russell wrote:
>>>>>>> On Mon, 18 Jun 2012 14:53:10 +0800, Asias He<asias@redhat.com>
>>>>>>> wrote:
>>>>>>>> This patch introduces bio-based IO path for virtio-blk.
>>>>>>>
>>>>>>> Why make it optional?
>>>>>>
>>>>>> request-based IO path is useful for users who do not want to bypass
>>>>>> the
>>>>>> IO scheduler in guest kernel, e.g. users using spinning disk. For
>>>>>> users
>>>>>> using fast disk device, e.g. SSD device, they can use bio-based IO
>>>>>> path.
>>>>>
>>>>> Users using a spinning disk still get IO scheduling in the host
>>>>> though.
>>>>> What benefit is there in doing it in the guest as well?
>>>>
>>>> The io scheduler waits for requests to merge and thus batch IOs
>>>> together. It's not important w.r.t spinning disks since the host can do
>>>> it but it causes much less vmexits which is the key issue for VMs.
>>>
>>> Is the amount of exits caused by virtio-blk significant at all with
>>> EVENT_IDX?
>>
>> Yes. EVENT_IDX saves the number of notify and interrupt. Let's take the
>> interrupt as an example, The guest fires 200K request to host, the
>> number of interrupt is about 6K thanks to EVENT_IDX. The ratio is 200K /
>> 6K = 33. The ratio of merging is 40000K / 200K = 20.
>>
>
> In this case, why don't you always recommend bio over request based?

This case shows that IO scheduler's merging in guest saves a lot of 
requests to host side. Why should I recommend bio over request based here?

-- 
Asias

^ permalink raw reply

* Re: [PATCH 1/3] block: Introduce __blk_segment_map_sg() helper
From: Tejun Heo @ 2012-06-19 18:00 UTC (permalink / raw)
  To: Asias He; +Cc: Jens Axboe, Shaohua Li, linux-kernel, kvm, virtualization
In-Reply-To: <4FDFDDBE.3050506@redhat.com>

Hello,

On Mon, Jun 18, 2012 at 7:02 PM, Asias He <asias@redhat.com> wrote:
>> I *hope* this is a bit prettier.  e.g. Do we really need to pass in
>> @sglist and keep using "goto new_segment"?
>
> I think this deserves another patch on top of this splitting one. I'd like
> to clean this up later.

Yeap, doing it in a separate patch would be better. It would be great
if you can include such patch in this series. :)

Thanks.

-- 
tejun

^ permalink raw reply

* Re: [PATCH v2 0/3] Improve virtio-blk performance
From: Stefan Hajnoczi @ 2012-06-19 10:14 UTC (permalink / raw)
  To: Asias He; +Cc: linux-kernel, kvm, virtualization
In-Reply-To: <4FDFFEF8.9000609@redhat.com>

On Tue, Jun 19, 2012 at 5:24 AM, Asias He <asias@redhat.com> wrote:
> On 06/18/2012 06:58 PM, Stefan Hajnoczi wrote:
>>
>> As long as the latency is decreasing that's good.  But It's worth
>> keeping in mind that these percentages are probably wildly different
>> on real storage devices and/or qemu-kvm.  What we don't know here is
>> whether this bottleneck matters in real environments - results with
>> real storage and with qemu-kvm would be interesting.
>
>
> Yes. Here is the performance data on a Fusion-IO SSD device.
>
> Fio test is performed in a 8 vcpu guest with Fusion-IO based guest using kvm
> tool.
>
>    Short version:
>     With bio-based IO path, sequential read/write, random read/write
>     IOPS boost         : 11%, 11%, 13%, 10%
>     Latency improvement: 10%, 10%, 12%, 10%

Nice, I'm glad the improvement shows on real hardware.

Stefan

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Dor Laor @ 2012-06-19  6:21 UTC (permalink / raw)
  To: Asias He
  Cc: kvm, Michael S. Tsirkin, linux-kernel, virtualization,
	Sasha Levin, Christoph Hellwig
In-Reply-To: <4FDFE926.7030309@redhat.com>

On 06/19/2012 05:51 AM, Asias He wrote:
> On 06/18/2012 07:39 PM, Sasha Levin wrote:
>> On Mon, 2012-06-18 at 14:14 +0300, Dor Laor wrote:
>>> On 06/18/2012 01:05 PM, Rusty Russell wrote:
>>>> On Mon, 18 Jun 2012 16:03:23 +0800, Asias He<asias@redhat.com> wrote:
>>>>> On 06/18/2012 03:46 PM, Rusty Russell wrote:
>>>>>> On Mon, 18 Jun 2012 14:53:10 +0800, Asias He<asias@redhat.com> wrote:
>>>>>>> This patch introduces bio-based IO path for virtio-blk.
>>>>>>
>>>>>> Why make it optional?
>>>>>
>>>>> request-based IO path is useful for users who do not want to bypass
>>>>> the
>>>>> IO scheduler in guest kernel, e.g. users using spinning disk. For
>>>>> users
>>>>> using fast disk device, e.g. SSD device, they can use bio-based IO
>>>>> path.
>>>>
>>>> Users using a spinning disk still get IO scheduling in the host though.
>>>> What benefit is there in doing it in the guest as well?
>>>
>>> The io scheduler waits for requests to merge and thus batch IOs
>>> together. It's not important w.r.t spinning disks since the host can do
>>> it but it causes much less vmexits which is the key issue for VMs.
>>
>> Is the amount of exits caused by virtio-blk significant at all with
>> EVENT_IDX?
>
> Yes. EVENT_IDX saves the number of notify and interrupt. Let's take the
> interrupt as an example, The guest fires 200K request to host, the
> number of interrupt is about 6K thanks to EVENT_IDX. The ratio is 200K /
> 6K = 33. The ratio of merging is 40000K / 200K = 20.
>

In this case, why don't you always recommend bio over request based?

^ permalink raw reply

* Re: [RFC 2/2] virtio-ring: Allocate indirect buffers from cache when possible
From: Jean-Mickael Guerin @ 2012-06-19  5:51 UTC (permalink / raw)
  To: Sasha Levin; +Cc: virtualization, kvm, mst
In-Reply-To: <1340028598-10142-2-git-send-email-levinsasha928@gmail.com>

On 18/06/2012 16:09, Sasha Levin wrote:
> Currently if VIRTIO_RING_F_INDIRECT_DESC is enabled we will
> use indirect descriptors and allocate them using a simple
> kmalloc().
>
> This patch adds a cache which will allow indirect buffers under
> a configurable size to be allocated from that cache instead.
>
> Signed-off-by: Sasha Levin<levinsasha928@gmail.com>
> ---
>   drivers/block/virtio_blk.c          |    4 ++++
>   drivers/char/hw_random/virtio-rng.c |    4 ++++
>   drivers/char/virtio_console.c       |    4 ++++
>   drivers/net/virtio_net.c            |    4 ++++
>   drivers/virtio/virtio_balloon.c     |    4 ++++
>   drivers/virtio/virtio_ring.c        |   28 ++++++++++++++++++++++++----
>   include/linux/virtio.h              |    1 +
>   net/9p/trans_virtio.c               |    5 +++++
>   8 files changed, 50 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> index a2c8d97..3c6d1bd 100644
> --- a/drivers/block/virtio_blk.c
> +++ b/drivers/block/virtio_blk.c
> @@ -22,6 +22,9 @@ struct workqueue_struct *virtblk_wq;
>   static unsigned int indirect_thresh = 0;
>   module_param(indirect_thresh, uint, S_IRUGO);
>
> +static unsigned int indirect_alloc_thresh = 0;
> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
> +
>   struct virtio_blk
>   {
>   	spinlock_t lock;
> @@ -442,6 +445,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
>   	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
>   	vblk->config_enable = true;
>   	vdev->indirect_thresh = indirect_thresh;
> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
>
>   	err = init_vq(vblk);
>   	if (err)
> diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
> index bcaddb9..6a32f76 100644
> --- a/drivers/char/hw_random/virtio-rng.c
> +++ b/drivers/char/hw_random/virtio-rng.c
> @@ -28,6 +28,9 @@
>   static unsigned int indirect_thresh = 0;
>   module_param(indirect_thresh, uint, S_IRUGO);
>
> +static unsigned int indirect_alloc_thresh = 0;
> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
> +
>   static struct virtqueue *vq;
>   static unsigned int data_avail;
>   static DECLARE_COMPLETION(have_data);
> @@ -94,6 +97,7 @@ static int virtrng_probe(struct virtio_device *vdev)
>
>   	/* We expect a single virtqueue. */
>   	vdev->indirect_thresh = indirect_thresh;
> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
>   	vq = virtio_find_single_vq(vdev, random_recv_done, "input");
>   	if (IS_ERR(vq))
>   		return PTR_ERR(vq);
> diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
> index 60397a4..7c33714 100644
> --- a/drivers/char/virtio_console.c
> +++ b/drivers/char/virtio_console.c
> @@ -40,6 +40,9 @@
>   static unsigned int indirect_thresh = 0;
>   module_param(indirect_thresh, uint, S_IRUGO);
>
> +static unsigned int indirect_alloc_thresh = 0;
> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
> +
>   /*
>    * This is a global struct for storing common data for all the devices
>    * this driver handles.
> @@ -1733,6 +1736,7 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
>   			&portdev->config.max_nr_ports) == 0)
>   		multiport = true;
>   	vdev->indirect_thresh = indirect_thresh;
> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
>
>   	err = init_vqs(portdev);
>   	if (err<  0) {
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 5c1be92..441bd2f 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -37,6 +37,9 @@ module_param(gso, bool, 0444);
>   static unsigned int indirect_thresh = 0;
>   module_param(indirect_thresh, uint, S_IRUGO);
>
> +static unsigned int indirect_alloc_thresh = 0;
> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
> +
>   /* FIXME: MTU in config. */
>   #define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
>   #define GOOD_COPY_LEN	128
> @@ -1132,6 +1135,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>   	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
>   		vi->mergeable_rx_bufs = true;
>   	vdev->indirect_thresh = indirect_thresh;
> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
>
>   	err = init_vqs(vi);
>   	if (err)
> diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
> index 4fc11ba..6e6313b 100644
> --- a/drivers/virtio/virtio_balloon.c
> +++ b/drivers/virtio/virtio_balloon.c
> @@ -38,6 +38,9 @@
>   static unsigned int indirect_thresh = 0;
>   module_param(indirect_thresh, uint, S_IRUGO);
>
> +static unsigned int indirect_alloc_thresh = 0;
> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
> +
>   struct virtio_balloon
>   {
>   	struct virtio_device *vdev;
> @@ -364,6 +367,7 @@ static int virtballoon_probe(struct virtio_device *vdev)
>   	vb->vdev = vdev;
>   	vb->need_stats_update = 0;
>   	vdev->indirect_thresh = indirect_thresh;
> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
>
>   	err = init_vqs(vb);
>   	if (err)
> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> index 99a64a7..f0b0ce3 100644
> --- a/drivers/virtio/virtio_ring.c
> +++ b/drivers/virtio/virtio_ring.c
> @@ -93,6 +93,10 @@ struct vring_virtqueue
>   	 */
>   	unsigned int indirect_thresh;
>
> +	/* Buffers below this size will be allocated from cache */
> +	unsigned int indirect_alloc_thresh;
> +	struct kmem_cache *indirect_cache;
> +
>   	/* Host publishes avail event idx */
>   	bool event;
>
> @@ -135,7 +139,10 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
>   	unsigned head;
>   	int i;
>
> -	desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
> +	if ((out + in)<= vq->indirect_alloc_thresh)
> +		desc = kmem_cache_alloc(vq->indirect_cache, gfp);
> +	else
> +		desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
>   	if (!desc)
>   		return -ENOMEM;
>
> @@ -384,8 +391,13 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
>   	i = head;
>
>   	/* Free the indirect table */
> -	if (vq->vring.desc[i].flags&  VRING_DESC_F_INDIRECT)
> -		kfree(phys_to_virt(vq->vring.desc[i].addr));
> +	if (vq->vring.desc[i].flags&  VRING_DESC_F_INDIRECT) {
> +		if (vq->vring.desc[i].len<  vq->indirect_alloc_thresh)
This should be > instead of <,  no?

>
> +			kfree(phys_to_virt(vq->vring.desc[i].addr));
> +		else
> +			kmem_cache_free(vq->indirect_cache,
> +					phys_to_virt(vq->vring.desc[i].addr));
> +	}

>
>
>   	while (vq->vring.desc[i].flags&  VRING_DESC_F_NEXT) {
>   		i = vq->vring.desc[i].next;
> @@ -654,14 +666,20 @@ struct virtqueue *vring_new_virtqueue(unsigned int num,
>   	vq->last_used_idx = 0;
>   	vq->num_added = 0;
>   	vq->indirect_thresh = 0;
> +	vq->indirect_alloc_thresh = 0;
> +	vq->indirect_cache = NULL;
>   	list_add_tail(&vq->vq.list,&vdev->vqs);
>   #ifdef DEBUG
>   	vq->in_use = false;
>   	vq->last_add_time_valid = false;
>   #endif
>
> -	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
> +	if (virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) {
>   		vq->indirect_thresh = vdev->indirect_thresh;
> +		vq->indirect_alloc_thresh = vdev->indirect_alloc_thresh;
> +		if (vq->indirect_alloc_thresh)
> +			vq->indirect_cache = KMEM_CACHE(vring_desc[vq->indirect_alloc_thresh], 0);
> +	}
>
>   	vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
>
> @@ -685,6 +703,8 @@ EXPORT_SYMBOL_GPL(vring_new_virtqueue);
>   void vring_del_virtqueue(struct virtqueue *vq)
>   {
>   	list_del(&vq->list);
> +	if (to_vvq(vq)->indirect_cache)
> +		kmem_cache_destroy(to_vvq(vq)->indirect_cache);
>   	kfree(to_vvq(vq));
>   }
>   EXPORT_SYMBOL_GPL(vring_del_virtqueue);
> diff --git a/include/linux/virtio.h b/include/linux/virtio.h
> index 36019ec..f3f0d3f 100644
> --- a/include/linux/virtio.h
> +++ b/include/linux/virtio.h
> @@ -70,6 +70,7 @@ struct virtio_device {
>   	unsigned long features[1];
>   	void *priv;
>   	unsigned int indirect_thresh;
> +	unsigned int indirect_alloc_thresh;
>   };
>
>   #define dev_to_virtio(dev) container_of(dev, struct virtio_device, dev)
> diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
> index 357bfba..8424b36 100644
> --- a/net/9p/trans_virtio.c
> +++ b/net/9p/trans_virtio.c
> @@ -55,6 +55,9 @@
>   static unsigned int indirect_thresh = 0;
>   module_param(indirect_thresh, uint, S_IRUGO);
>
> +static unsigned int indirect_alloc_thresh = 0;
> +module_param(indirect_alloc_thresh, uint, S_IRUGO);
> +
>   /* a single mutex to manage channel initialization and attachment */
>   static DEFINE_MUTEX(virtio_9p_lock);
>   static DECLARE_WAIT_QUEUE_HEAD(vp_wq);
> @@ -505,6 +508,8 @@ static int p9_virtio_probe(struct virtio_device *vdev)
>
>   	/* We expect one virtqueue, for requests. */
>   	vdev->indirect_thresh = indirect_thresh;
> +	vdev->indirect_alloc_thresh = indirect_alloc_thresh;
> +
>   	chan->vq = virtio_find_single_vq(vdev, req_done, "requests");
>   	if (IS_ERR(chan->vq)) {
>   		err = PTR_ERR(chan->vq);
Jean-Mickael

^ permalink raw reply

* Re: [PATCH v2 0/3] Improve virtio-blk performance
From: Asias He @ 2012-06-19  4:24 UTC (permalink / raw)
  To: Stefan Hajnoczi; +Cc: linux-kernel, kvm, virtualization
In-Reply-To: <CAJSP0QXj0brxkTRGCMcjYU3AZwFbk+O5D2ATNz8GLECt9rNs7w@mail.gmail.com>

On 06/18/2012 06:58 PM, Stefan Hajnoczi wrote:
> As long as the latency is decreasing that's good.  But it's worth
> keeping in mind that these percentages are probably wildly different
> on real storage devices and/or qemu-kvm.  What we don't know here is
> whether this bottleneck matters in real environments - results with
> real storage and with qemu-kvm would be interesting.

Yes. Here is the performance data on a Fusion-IO SSD device.

The fio test is performed in an 8-vcpu, Fusion-IO based guest using the
kvm tool.

     Short version:
      With bio-based IO path, sequential read/write, random read/write
      IOPS boost         : 11%, 11%, 13%, 10%
      Latency improvement: 10%, 10%, 12%, 10%

     Long Version:
      With bio-based IO path:
       read : io=2048.0MB, bw=58920KB/s, iops=117840 , runt= 35593msec
       write: io=2048.0MB, bw=64308KB/s, iops=128616 , runt= 32611msec
       read : io=3095.7MB, bw=59633KB/s, iops=119266 , runt= 53157msec
       write: io=3095.7MB, bw=62993KB/s, iops=125985 , runt= 50322msec
         clat (usec): min=0 , max=1284.3K, avg=128109.01, stdev=71513.29
         clat (usec): min=94 , max=962339 , avg=116832.95, stdev=65836.80
         clat (usec): min=0 , max=1846.6K, avg=128509.99, stdev=89575.07
         clat (usec): min=0 , max=2256.4K, avg=121361.84, stdev=82747.25
cpu : usr=56.79%, sys=421.70%, ctx=147335118, majf=21080, minf=19852517
cpu : usr=61.81%, sys=455.53%, ctx=143269950, majf=16027, minf=24800604
cpu : usr=63.10%, sys=455.38%, ctx=178373538, majf=16958, minf=24822612
cpu : usr=62.04%, sys=453.58%, ctx=226902362, majf=16089, minf=23278105

      With request-based IO path:
       read : io=2048.0MB, bw=52896KB/s, iops=105791 , runt= 39647msec
       write: io=2048.0MB, bw=57856KB/s, iops=115711 , runt= 36248msec
       read : io=3095.7MB, bw=52387KB/s, iops=104773 , runt= 60510msec
       write: io=3095.7MB, bw=57310KB/s, iops=114619 , runt= 55312msec
         clat (usec): min=0 , max=1532.6K, avg=142085.62, stdev=109196.84
         clat (usec): min=0 , max=1487.4K, avg=129110.71, stdev=114973.64
         clat (usec): min=0 , max=1388.6K, avg=145049.22, stdev=107232.55
         clat (usec): min=0 , max=1465.9K, avg=133585.67, stdev=110322.95
cpu : usr=44.08%, sys=590.71%, ctx=451812322, majf=14841, minf=17648641
cpu : usr=48.73%, sys=610.78%, ctx=418953997, majf=22164, minf=26850689
cpu : usr=45.58%, sys=581.16%, ctx=714079216, majf=21497, minf=22558223
cpu : usr=48.40%, sys=599.65%, ctx=656089423, majf=16393, minf=23824409


-- 
Asias

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Asias He @ 2012-06-19  2:51 UTC (permalink / raw)
  To: Sasha Levin
  Cc: kvm, Michael S. Tsirkin, linux-kernel, virtualization,
	Christoph Hellwig
In-Reply-To: <1340019575.22848.2.camel@lappy>

On 06/18/2012 07:39 PM, Sasha Levin wrote:
> On Mon, 2012-06-18 at 14:14 +0300, Dor Laor wrote:
>> On 06/18/2012 01:05 PM, Rusty Russell wrote:
>>> On Mon, 18 Jun 2012 16:03:23 +0800, Asias He<asias@redhat.com>  wrote:
>>>> On 06/18/2012 03:46 PM, Rusty Russell wrote:
>>>>> On Mon, 18 Jun 2012 14:53:10 +0800, Asias He<asias@redhat.com>  wrote:
>>>>>> This patch introduces bio-based IO path for virtio-blk.
>>>>>
>>>>> Why make it optional?
>>>>
>>>> request-based IO path is useful for users who do not want to bypass the
>>>> IO scheduler in guest kernel, e.g. users using spinning disk. For users
>>>> using fast disk device, e.g. SSD device, they can use bio-based IO path.
>>>
>>> Users using a spinning disk still get IO scheduling in the host though.
>>> What benefit is there in doing it in the guest as well?
>>
>> The io scheduler waits for requests to merge and thus batch IOs
>> together. It's not important w.r.t spinning disks since the host can do
>> it but it causes much less vmexits which is the key issue for VMs.
>
> Is the amount of exits caused by virtio-blk significant at all with
> EVENT_IDX?

Yes. EVENT_IDX reduces the number of notifications and interrupts. Let's take
interrupts as an example: the guest fires 200K requests to the host, and the
number of interrupts is about 6K thanks to EVENT_IDX. The ratio is 200K /
6K = 33. The ratio of merging is 4000K / 200K = 20.

-- 
Asias

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Asias He @ 2012-06-19  2:39 UTC (permalink / raw)
  To: Rusty Russell
  Cc: kvm, Michael S. Tsirkin, linux-kernel, virtualization,
	Christoph Hellwig
In-Reply-To: <87zk81x7dp.fsf@rustcorp.com.au>

On 06/18/2012 06:05 PM, Rusty Russell wrote:
> On Mon, 18 Jun 2012 16:03:23 +0800, Asias He <asias@redhat.com> wrote:
>> On 06/18/2012 03:46 PM, Rusty Russell wrote:
>>> On Mon, 18 Jun 2012 14:53:10 +0800, Asias He <asias@redhat.com> wrote:
>>>> This patch introduces bio-based IO path for virtio-blk.
>>>
>>> Why make it optional?
>>
>> request-based IO path is useful for users who do not want to bypass the
>> IO scheduler in guest kernel, e.g. users using spinning disk. For users
>> using fast disk device, e.g. SSD device, they can use bio-based IO path.
>
> Users using a spinning disk still get IO scheduling in the host though.
> What benefit is there in doing it in the guest as well?

Merging in the guest kernel's IO scheduler can reduce the number of
requests the guest fires to the host side. For instance, with the same workload
on the guest side, the number of requests drops to ~200K from ~4000K with
the guest kernel's merging in qemu.

>
> Cheers,
> Rusty.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>


-- 
Asias

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Asias He @ 2012-06-19  2:30 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm, linux-kernel, virtualization, Christoph Hellwig
In-Reply-To: <20120618102108.GD23134@redhat.com>

On 06/18/2012 06:21 PM, Michael S. Tsirkin wrote:
> On Mon, Jun 18, 2012 at 02:53:10PM +0800, Asias He wrote:
>> +static void virtblk_make_request(struct request_queue *q, struct bio *bio)
>> +{
>> +	struct virtio_blk *vblk = q->queuedata;
>> +	unsigned int num, out = 0, in = 0;
>> +	struct virtblk_req *vbr;
>> +
>> +	BUG_ON(bio->bi_phys_segments + 2 > vblk->sg_elems);
>> +	BUG_ON(bio->bi_rw & (REQ_FLUSH | REQ_FUA));
>> +
>> +	vbr = virtblk_alloc_req(vblk, GFP_NOIO);
>> +	if (!vbr) {
>> +		bio_endio(bio, -ENOMEM);
>> +		return;
>> +	}
>> +
>> +	vbr->bio = bio;
>> +	vbr->req = NULL;
>> +	vbr->out_hdr.type = 0;
>> +	vbr->out_hdr.sector = bio->bi_sector;
>> +	vbr->out_hdr.ioprio = bio_prio(bio);
>> +
>> +	sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
>> +
>> +	num = blk_bio_map_sg(q, bio, vbr->sg + out);
>> +
>> +	sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
>> +		   sizeof(vbr->status));
>> +
>> +	if (num) {
>> +		if (bio->bi_rw & REQ_WRITE) {
>> +			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
>> +			out += num;
>> +		} else {
>> +			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
>> +			in += num;
>> +		}
>> +	}
>> +
>> +	spin_lock_irq(vblk->disk->queue->queue_lock);
>> +	if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
>> +			      GFP_ATOMIC) < 0) {
>> +		spin_unlock_irq(vblk->disk->queue->queue_lock);
>
> Any implications of dropping lock like that?
> E.g. for suspend. like we are still discussing with
> unlocked kick?
>
>> +		virtblk_add_buf_wait(vblk, vbr, out, in);
>> +	} else {
>> +		virtqueue_kick(vblk->vq);
>
> Why special case the first call? task state manipulation so expensive?

Hmm. Will switch them.

>
>> +		spin_unlock_irq(vblk->disk->queue->queue_lock);
>> +	}
>> +}
>> +
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>


-- 
Asias

^ permalink raw reply

* Re: [PATCH 3/3] virtio-blk: Add bio-based IO path for virtio-blk
From: Asias He @ 2012-06-19  2:21 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: kvm, linux-kernel, virtualization, Christoph Hellwig
In-Reply-To: <20120618101349.GC23134@redhat.com>

On 06/18/2012 06:13 PM, Michael S. Tsirkin wrote:
> On Mon, Jun 18, 2012 at 04:03:23PM +0800, Asias He wrote:
>> On 06/18/2012 03:46 PM, Rusty Russell wrote:
>>> On Mon, 18 Jun 2012 14:53:10 +0800, Asias He <asias@redhat.com> wrote:
>>>> This patch introduces bio-based IO path for virtio-blk.
>>>
>>> Why make it optional?
>>
>> request-based IO path is useful for users who do not want to bypass
>> the IO scheduler in guest kernel, e.g. users using spinning disk.
>> For users using fast disk device, e.g. SSD device, they can use
>> bio-based IO path.
>
> OK I guess but then it should be per-device. There could be
> a mix of slow and fast disks :)

Yes, per-device might be useful. There are issues which need solving.
- How do we tell the driver which IO path to use?
   - Device add some flag
     - Old qemu/lkvm can not turn this feature on
   - Through /sys filesystem attribute
     - How do we handle the switch from one path to another?

So, let's add the per-device feature later.

-- 
Asias

^ permalink raw reply

* Re: [PATCH 1/3] block: Introduce __blk_segment_map_sg() helper
From: Asias He @ 2012-06-19  2:02 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jens Axboe, Shaohua Li, linux-kernel, kvm, virtualization
In-Reply-To: <20120618213103.GC32733@google.com>

On 06/19/2012 05:31 AM, Tejun Heo wrote:
> Hello, Asias.
>
> On Mon, Jun 18, 2012 at 02:53:08PM +0800, Asias He wrote:
>> Split the mapping code in blk_rq_map_sg() to a helper
>> __blk_segment_map_sg(), so that other mapping function, e.g.
>> blk_bio_map_sg(), can share the code.
>>
>> Cc: Jens Axboe <axboe@kernel.dk>
>> Cc: Tejun Heo <tj@kernel.org>
>> Cc: Shaohua Li <shli@kernel.org>
>> Cc: linux-kernel@vger.kernel.org
>> Suggested-by: Tejun Heo <tj@kernel.org>
>> Suggested-by: Jens Axboe <axboe@kernel.dk>
>> Signed-off-by: Asias He <asias@redhat.com>
>> ---
>>   block/blk-merge.c |   80 ++++++++++++++++++++++++++++++-----------------------
>>   1 file changed, 45 insertions(+), 35 deletions(-)
>>
>> diff --git a/block/blk-merge.c b/block/blk-merge.c
>> index 160035f..576b68e 100644
>> --- a/block/blk-merge.c
>> +++ b/block/blk-merge.c
>> @@ -110,6 +110,49 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
>>   	return 0;
>>   }
>>
>> +static void
>> +__blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
>> +		     struct scatterlist *sglist, struct bio_vec **bvprv,
>> +		     struct scatterlist **sg, int *nsegs, int *cluster)
>> +{
>> +
>> +	int nbytes = bvec->bv_len;
>> +
>> +	if (*bvprv && *cluster) {
>> +		if ((*sg)->length + nbytes > queue_max_segment_size(q))
>> +			goto new_segment;
>> +
>> +		if (!BIOVEC_PHYS_MERGEABLE(*bvprv, bvec))
>> +			goto new_segment;
>> +		if (!BIOVEC_SEG_BOUNDARY(q, *bvprv, bvec))
>> +			goto new_segment;
>> +
>> +		(*sg)->length += nbytes;
>> +	} else {
>> +new_segment:
>> +		if (!*sg)
>> +			*sg = sglist;
>> +		else {
>> +			/*
>> +			 * If the driver previously mapped a shorter
>> +			 * list, we could see a termination bit
>> +			 * prematurely unless it fully inits the sg
>> +			 * table on each mapping. We KNOW that there
>> +			 * must be more entries here or the driver
>> +			 * would be buggy, so force clear the
>> +			 * termination bit to avoid doing a full
>> +			 * sg_init_table() in drivers for each command.
>> +			 */
>> +			(*sg)->page_link &= ~0x02;
>> +			*sg = sg_next(*sg);
>> +		}
>> +
>> +		sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
>> +		(*nsegs)++;
>> +	}
>> +	*bvprv = bvec;
>> +}
>
> I *hope* this is a bit prettier.  e.g. Do we really need to pass in
> @sglist and keep using "goto new_segment"?

I think this deserves another patch on top of this splitting one. I'd 
like to clean this up later.

-- 
Asias

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox