Netdev List
 help / color / mirror / Atom feed
* dev_queue_xmit and sockets
From: Marjan Schiller @ 2011-01-18  9:04 UTC (permalink / raw)
  To: netdev

Hi!

I am currently developing a netfilter module which passes incoming and outgoing ethernet packages from one interface to an another interface ( a tun device ). This works very well, the tcpdump shows correct packages on the destination device and PING and ARP works good.

But i get into trouble if i want to create a socket connection with these packages. It seams to me that the packages are only "useable" within the kernel space and not the user space. Did i miss something there? Do i have to prepare the SKB for user mode or something like that?

I can establish a connection between the client socket and the server socket, but the server socket does not read any data ( But the packet with the data has reached the interface ).

Here is an example how i pass the packaged to the tun device:

static unsigned int send_ethernet_package( struct sk_buff * in_skb, void * data, int len, struct net_device * dev )
{
  struct sk_buff *skb;

  skb = alloc_skb( len , GFP_ATOMIC);
  skb_put( skb, len );
  skb->pkt_type = PACKET_HOST;
  skb->dev = dev;
  memcpy(skb->head, data , len );
  dev_queue_xmit( skb );

  return 0;
}

The data parameter is a well formed ethernet package.

Someone how has a guess whats wrong with this?

Thanks for help
    
    Marjan

-- 
Neu: GMX De-Mail - Einfach wie E-Mail, sicher wie ein Brief!  
Jetzt De-Mail-Adresse reservieren: http://portal.gmx.net/de/go/demail

^ permalink raw reply

* [PATCH v5 2/4] bt hidp: Wait for ACK on Sent Reports
From: Alan Ott @ 2011-01-18  8:04 UTC (permalink / raw)
  To: Jiri Kosina, Marcel Holtmann, Gustavo F. Padovan, David S. Miller,
	Alan Ott
  Cc: Alan Ott
In-Reply-To: <1295337880-12452-1-git-send-email-alan@signal11.us>

Wait for an ACK from the device before returning from
hidp_output_raw_report(). This way, failures can be returned to the user
application. Also, it prevents ACK/NAK packets from an output packet from
being confused with ACK/NAK packets from an input request packet.

Signed-off-by: Alan Ott <alan@signal11.us>
---
 net/bluetooth/hidp/core.c |   54 ++++++++++++++++++++++++++++++++++++++++++--
 net/bluetooth/hidp/hidp.h |    4 +++
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 67cc4bc..5383e6c 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -316,6 +316,9 @@ static int hidp_send_report(struct hidp_session *session, struct hid_report *rep
 static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, size_t count,
 		unsigned char report_type)
 {
+	struct hidp_session *session = hid->driver_data;
+	int ret;
+
 	switch (report_type) {
 	case HID_FEATURE_REPORT:
 		report_type = HIDP_TRANS_SET_REPORT | HIDP_DATA_RTYPE_FEATURE;
@@ -327,10 +330,47 @@ static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, s
 		return -EINVAL;
 	}
 
+	if (mutex_lock_interruptible(&session->report_mutex))
+		return -ERESTARTSYS;
+
+	/* Set up our wait, and send the report request to the device. */
+	set_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags);
 	if (hidp_send_ctrl_message(hid->driver_data, report_type,
-			data, count))
-		return -ENOMEM;
-	return count;
+			data, count)) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	/* Wait for the ACK from the device. */
+	while (test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags)) {
+		int res;
+
+		res = wait_event_interruptible_timeout(session->report_queue,
+			!test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags),
+			10*HZ);
+		if (res == 0) {
+			/* timeout */
+			ret = -EIO;
+			goto err;
+		}
+		if (res < 0) {
+			/* signal */
+			ret = -ERESTARTSYS;
+			goto err;
+		}
+	}
+
+	if (!session->output_report_success) {
+		ret = -EIO;
+		goto err;
+	}
+
+	ret = count;
+
+err:
+	clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags);
+	mutex_unlock(&session->report_mutex);
+	return ret;
 }
 
 static void hidp_idle_timeout(unsigned long arg)
@@ -357,10 +397,12 @@ static void hidp_process_handshake(struct hidp_session *session,
 					unsigned char param)
 {
 	BT_DBG("session %p param 0x%02x", session, param);
+	session->output_report_success = 0; /* default condition */
 
 	switch (param) {
 	case HIDP_HSHK_SUCCESSFUL:
 		/* FIXME: Call into SET_ GET_ handlers here */
+		session->output_report_success = 1;
 		break;
 
 	case HIDP_HSHK_NOT_READY:
@@ -385,6 +427,12 @@ static void hidp_process_handshake(struct hidp_session *session,
 			HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0);
 		break;
 	}
+
+	/* Wake up the waiting thread. */
+	if (test_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags)) {
+		clear_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags);
+		wake_up_interruptible(&session->report_queue);
+	}
 }
 
 static void hidp_process_hid_control(struct hidp_session *session,
diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h
index 2cc35dc..92e093e 100644
--- a/net/bluetooth/hidp/hidp.h
+++ b/net/bluetooth/hidp/hidp.h
@@ -80,6 +80,7 @@
 #define HIDP_VIRTUAL_CABLE_UNPLUG	0
 #define HIDP_BOOT_PROTOCOL_MODE		1
 #define HIDP_BLUETOOTH_VENDOR_ID	9
+#define HIDP_WAITING_FOR_SEND_ACK	11
 
 struct hidp_connadd_req {
 	int   ctrl_sock;	// Connected control socket
@@ -154,6 +155,9 @@ struct hidp_session {
 	struct sk_buff_head ctrl_transmit;
 	struct sk_buff_head intr_transmit;
 
+	/* Used in hidp_output_raw_report() */
+	int output_report_success; /* boolean */
+
 	/* Report descriptor */
 	__u8 *rd_data;
 	uint rd_size;
-- 
1.7.0.4



^ permalink raw reply related

* [PATCH v5 3/4] hid: Add Support for Setting and Getting Feature Reports from hidraw
From: Alan Ott @ 2011-01-18  8:04 UTC (permalink / raw)
  To: Jiri Kosina, Marcel Holtmann, Gustavo F. Padovan, David S. Miller,
	Alan Ott
  Cc: Alan Ott, Antonio Ospite
In-Reply-To: <1295337880-12452-1-git-send-email-alan@signal11.us>

Per the HID Specification, Feature reports must be sent and received on
the Configuration endpoint (EP 0) through the Set_Report/Get_Report
interfaces.  This patch adds two ioctls to hidraw to set and get feature
reports to and from the device.  Modifications were made to hidraw and
usbhid.

New hidraw ioctls:
  HIDIOCSFEATURE - Perform a Set_Report transfer of a Feature report.
  HIDIOCGFEATURE - Perform a Get_Report transfer of a Feature report.

Signed-off-by: Alan Ott <alan@signal11.us>
Signed-off-by: Antonio Ospite <ospite@studenti.unina.it>
---
 drivers/hid/hidraw.c          |  106 ++++++++++++++++++++++++++++++++++++++--
 drivers/hid/usbhid/hid-core.c |   35 +++++++++++++
 include/linux/hid.h           |    3 +
 include/linux/hidraw.h        |    3 +
 4 files changed, 141 insertions(+), 6 deletions(-)

diff --git a/drivers/hid/hidraw.c b/drivers/hid/hidraw.c
index 468e87b..8f06044 100644
--- a/drivers/hid/hidraw.c
+++ b/drivers/hid/hidraw.c
@@ -102,15 +102,14 @@ out:
 }
 
 /* the first byte is expected to be a report number */
-static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos)
+/* This function is to be called with the minors_lock mutex held */
+static ssize_t hidraw_send_report(struct file *file, const char __user *buffer, size_t count, unsigned char report_type)
 {
 	unsigned int minor = iminor(file->f_path.dentry->d_inode);
 	struct hid_device *dev;
 	__u8 *buf;
 	int ret = 0;
 
-	mutex_lock(&minors_lock);
-
 	if (!hidraw_table[minor]) {
 		ret = -ENODEV;
 		goto out;
@@ -148,14 +147,92 @@ static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t
 		goto out_free;
 	}
 
-	ret = dev->hid_output_raw_report(dev, buf, count, HID_OUTPUT_REPORT);
+	ret = dev->hid_output_raw_report(dev, buf, count, report_type);
 out_free:
 	kfree(buf);
 out:
+	return ret;
+}
+
+/* the first byte is expected to be a report number */
+static ssize_t hidraw_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos)
+{
+	ssize_t ret;
+	mutex_lock(&minors_lock);
+	ret = hidraw_send_report(file, buffer, count, HID_OUTPUT_REPORT);
 	mutex_unlock(&minors_lock);
 	return ret;
 }
 
+
+/* This function performs a Get_Report transfer over the control endpoint
+   per section 7.2.1 of the HID specification, version 1.1.  The first byte
+   of buffer is the report number to request, or 0x0 if the defice does not
+   use numbered reports. The report_type parameter can be HID_FEATURE_REPORT
+   or HID_INPUT_REPORT.  This function is to be called with the minors_lock
+   mutex held.  */
+static ssize_t hidraw_get_report(struct file *file, char __user *buffer, size_t count, unsigned char report_type)
+{
+	unsigned int minor = iminor(file->f_path.dentry->d_inode);
+	struct hid_device *dev;
+	__u8 *buf;
+	int ret = 0, len;
+	unsigned char report_number;
+
+	dev = hidraw_table[minor]->hid;
+
+	if (!dev->hid_get_raw_report) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (count > HID_MAX_BUFFER_SIZE) {
+		printk(KERN_WARNING "hidraw: pid %d passed too large report\n",
+				task_pid_nr(current));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (count < 2) {
+		printk(KERN_WARNING "hidraw: pid %d passed too short report\n",
+				task_pid_nr(current));
+		ret = -EINVAL;
+		goto out;
+	}
+
+	buf = kmalloc(count * sizeof(__u8), GFP_KERNEL);
+	if (!buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Read the first byte from the user. This is the report number,
+	   which is passed to dev->hid_get_raw_report(). */
+	if (copy_from_user(&report_number, buffer, 1)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	ret = dev->hid_get_raw_report(dev, report_number, buf, count, report_type);
+
+	if (ret < 0)
+		goto out_free;
+
+	len = (ret < count) ? ret : count;
+
+	if (copy_to_user(buffer, buf, len)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
+	ret = len;
+
+out_free:
+	kfree(buf);
+out:
+	return ret;
+}
+
 static unsigned int hidraw_poll(struct file *file, poll_table *wait)
 {
 	struct hidraw_list *list = file->private_data;
@@ -295,7 +372,24 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd,
 		default:
 			{
 				struct hid_device *hid = dev->hid;
-				if (_IOC_TYPE(cmd) != 'H' || _IOC_DIR(cmd) != _IOC_READ) {
+				if (_IOC_TYPE(cmd) != 'H') {
+					ret = -EINVAL;
+					break;
+				}
+
+				if (_IOC_NR(cmd) == _IOC_NR(HIDIOCSFEATURE(0))) {
+					int len = _IOC_SIZE(cmd);
+					ret = hidraw_send_report(file, user_arg, len, HID_FEATURE_REPORT);
+					break;
+				}
+				if (_IOC_NR(cmd) == _IOC_NR(HIDIOCGFEATURE(0))) {
+					int len = _IOC_SIZE(cmd);
+					ret = hidraw_get_report(file, user_arg, len, HID_FEATURE_REPORT);
+					break;
+				}
+
+				/* Begin Read-only ioctls. */
+				if (_IOC_DIR(cmd) != _IOC_READ) {
 					ret = -EINVAL;
 					break;
 				}
@@ -327,7 +421,7 @@ static long hidraw_ioctl(struct file *file, unsigned int cmd,
 						-EFAULT : len;
 					break;
 				}
-		}
+			}
 
 		ret = -ENOTTY;
 	}
diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c
index b336dd8..38c261a 100644
--- a/drivers/hid/usbhid/hid-core.c
+++ b/drivers/hid/usbhid/hid-core.c
@@ -799,6 +799,40 @@ static int hid_alloc_buffers(struct usb_device *dev, struct hid_device *hid)
 	return 0;
 }
 
+static int usbhid_get_raw_report(struct hid_device *hid,
+		unsigned char report_number, __u8 *buf, size_t count,
+		unsigned char report_type)
+{
+	struct usbhid_device *usbhid = hid->driver_data;
+	struct usb_device *dev = hid_to_usb_dev(hid);
+	struct usb_interface *intf = usbhid->intf;
+	struct usb_host_interface *interface = intf->cur_altsetting;
+	int skipped_report_id = 0;
+	int ret;
+
+	/* Byte 0 is the report number. Report data starts at byte 1.*/
+	buf[0] = report_number;
+	if (report_number == 0x0) {
+		/* Offset the return buffer by 1, so that the report ID
+		   will remain in byte 0. */
+		buf++;
+		count--;
+		skipped_report_id = 1;
+	}
+	ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0),
+		HID_REQ_GET_REPORT,
+		USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_INTERFACE,
+		((report_type + 1) << 8) | report_number,
+		interface->desc.bInterfaceNumber, buf, count,
+		USB_CTRL_SET_TIMEOUT);
+
+	/* count also the report id */
+	if (ret > 0 && skipped_report_id)
+		ret++;
+
+	return ret;
+}
+
 static int usbhid_output_raw_report(struct hid_device *hid, __u8 *buf, size_t count,
 		unsigned char report_type)
 {
@@ -1139,6 +1173,7 @@ static int usbhid_probe(struct usb_interface *intf, const struct usb_device_id *
 
 	usb_set_intfdata(intf, hid);
 	hid->ll_driver = &usb_hid_driver;
+	hid->hid_get_raw_report = usbhid_get_raw_report;
 	hid->hid_output_raw_report = usbhid_output_raw_report;
 	hid->ff_init = hid_pidff_init;
 #ifdef CONFIG_USB_HIDDEV
diff --git a/include/linux/hid.h b/include/linux/hid.h
index d91c25e..e8ee0a9 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -504,6 +504,9 @@ struct hid_device {							/* device report descriptor */
 				  struct hid_usage *, __s32);
 	void (*hiddev_report_event) (struct hid_device *, struct hid_report *);
 
+	/* handler for raw input (Get_Report) data, used by hidraw */
+	int (*hid_get_raw_report) (struct hid_device *, unsigned char, __u8 *, size_t, unsigned char);
+
 	/* handler for raw output data, used by hidraw */
 	int (*hid_output_raw_report) (struct hid_device *, __u8 *, size_t, unsigned char);
 
diff --git a/include/linux/hidraw.h b/include/linux/hidraw.h
index dd8d692..4b88e69 100644
--- a/include/linux/hidraw.h
+++ b/include/linux/hidraw.h
@@ -35,6 +35,9 @@ struct hidraw_devinfo {
 #define HIDIOCGRAWINFO		_IOR('H', 0x03, struct hidraw_devinfo)
 #define HIDIOCGRAWNAME(len)     _IOC(_IOC_READ, 'H', 0x04, len)
 #define HIDIOCGRAWPHYS(len)     _IOC(_IOC_READ, 'H', 0x05, len)
+/* The first byte of SFEATURE and GFEATURE is the report number */
+#define HIDIOCSFEATURE(len)    _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x06, len)
+#define HIDIOCGFEATURE(len)    _IOC(_IOC_WRITE|_IOC_READ, 'H', 0x07, len)
 
 #define HIDRAW_FIRST_MINOR 0
 #define HIDRAW_MAX_DEVICES 64
-- 
1.7.0.4



^ permalink raw reply related

* [PATCH v5 4/4] bt hidp: Add support for hidraw HIDIOCGFEATURE and HIDIOCSFEATURE
From: Alan Ott @ 2011-01-18  8:04 UTC (permalink / raw)
  To: Jiri Kosina, Marcel Holtmann, Gustavo F. Padovan, David S. Miller,
	Alan Ott
  Cc: Alan Ott
In-Reply-To: <1295337880-12452-1-git-send-email-alan@signal11.us>

This patch adds support or getting and setting feature reports for bluetooth
HID devices from HIDRAW.

Signed-off-by: Alan Ott <alan@signal11.us>
---
 net/bluetooth/hidp/core.c |  113 +++++++++++++++++++++++++++++++++++++++++++--
 net/bluetooth/hidp/hidp.h |    8 +++
 2 files changed, 117 insertions(+), 4 deletions(-)

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 5383e6c..6df8ea1 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -36,6 +36,7 @@
 #include <linux/file.h>
 #include <linux/init.h>
 #include <linux/wait.h>
+#include <linux/mutex.h>
 #include <net/sock.h>
 
 #include <linux/input.h>
@@ -313,6 +314,86 @@ static int hidp_send_report(struct hidp_session *session, struct hid_report *rep
 	return hidp_queue_report(session, buf, rsize);
 }
 
+static int hidp_get_raw_report(struct hid_device *hid,
+		unsigned char report_number,
+		unsigned char *data, size_t count,
+		unsigned char report_type)
+{
+	struct hidp_session *session = hid->driver_data;
+	struct sk_buff *skb;
+	size_t len;
+	int numbered_reports = hid->report_enum[report_type].numbered;
+
+	switch (report_type) {
+	case HID_FEATURE_REPORT:
+		report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_FEATURE;
+		break;
+	case HID_INPUT_REPORT:
+		report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_INPUT;
+		break;
+	case HID_OUTPUT_REPORT:
+		report_type = HIDP_TRANS_GET_REPORT | HIDP_DATA_RTYPE_OUPUT;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (mutex_lock_interruptible(&session->report_mutex))
+		return -ERESTARTSYS;
+
+	/* Set up our wait, and send the report request to the device. */
+	session->waiting_report_type = report_type & HIDP_DATA_RTYPE_MASK;
+	session->waiting_report_number = numbered_reports ? report_number : -1;
+	set_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+	data[0] = report_number;
+	if (hidp_send_ctrl_message(hid->driver_data, report_type, data, 1))
+		goto err_eio;
+
+	/* Wait for the return of the report. The returned report
+	   gets put in session->report_return.  */
+	while (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags)) {
+		int res;
+
+		res = wait_event_interruptible_timeout(session->report_queue,
+			!test_bit(HIDP_WAITING_FOR_RETURN, &session->flags),
+			5*HZ);
+		if (res == 0) {
+			/* timeout */
+			goto err_eio;
+		}
+		if (res < 0) {
+			/* signal */
+			goto err_restartsys;
+		}
+	}
+
+	skb = session->report_return;
+	if (skb) {
+		len = skb->len < count ? skb->len : count;
+		memcpy(data, skb->data, len);
+
+		kfree_skb(skb);
+		session->report_return = NULL;
+	} else {
+		/* Device returned a HANDSHAKE, indicating  protocol error. */
+		len = -EIO;
+	}
+
+	clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+	mutex_unlock(&session->report_mutex);
+
+	return len;
+
+err_restartsys:
+	clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+	mutex_unlock(&session->report_mutex);
+	return -ERESTARTSYS;
+err_eio:
+	clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+	mutex_unlock(&session->report_mutex);
+	return -EIO;
+}
+
 static int hidp_output_raw_report(struct hid_device *hid, unsigned char *data, size_t count,
 		unsigned char report_type)
 {
@@ -409,6 +490,10 @@ static void hidp_process_handshake(struct hidp_session *session,
 	case HIDP_HSHK_ERR_INVALID_REPORT_ID:
 	case HIDP_HSHK_ERR_UNSUPPORTED_REQUEST:
 	case HIDP_HSHK_ERR_INVALID_PARAMETER:
+		if (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags)) {
+			clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+			wake_up_interruptible(&session->report_queue);
+		}
 		/* FIXME: Call into SET_ GET_ handlers here */
 		break;
 
@@ -451,9 +536,11 @@ static void hidp_process_hid_control(struct hidp_session *session,
 	}
 }
 
-static void hidp_process_data(struct hidp_session *session, struct sk_buff *skb,
+/* Returns true if the passed-in skb should be freed by the caller. */
+static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb,
 				unsigned char param)
 {
+	int done_with_skb = 1;
 	BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param);
 
 	switch (param) {
@@ -465,7 +552,6 @@ static void hidp_process_data(struct hidp_session *session, struct sk_buff *skb,
 
 		if (session->hid)
 			hid_input_report(session->hid, HID_INPUT_REPORT, skb->data, skb->len, 0);
-
 		break;
 
 	case HIDP_DATA_RTYPE_OTHER:
@@ -477,12 +563,27 @@ static void hidp_process_data(struct hidp_session *session, struct sk_buff *skb,
 		__hidp_send_ctrl_message(session,
 			HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0);
 	}
+
+	if (test_bit(HIDP_WAITING_FOR_RETURN, &session->flags) &&
+				param == session->waiting_report_type) {
+		if (session->waiting_report_number < 0 ||
+		    session->waiting_report_number == skb->data[0]) {
+			/* hidp_get_raw_report() is waiting on this report. */
+			session->report_return = skb;
+			done_with_skb = 0;
+			clear_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
+			wake_up_interruptible(&session->report_queue);
+		}
+	}
+
+	return done_with_skb;
 }
 
 static void hidp_recv_ctrl_frame(struct hidp_session *session,
 					struct sk_buff *skb)
 {
 	unsigned char hdr, type, param;
+	int free_skb = 1;
 
 	BT_DBG("session %p skb %p len %d", session, skb, skb->len);
 
@@ -502,7 +603,7 @@ static void hidp_recv_ctrl_frame(struct hidp_session *session,
 		break;
 
 	case HIDP_TRANS_DATA:
-		hidp_process_data(session, skb, param);
+		free_skb = hidp_process_data(session, skb, param);
 		break;
 
 	default:
@@ -511,7 +612,8 @@ static void hidp_recv_ctrl_frame(struct hidp_session *session,
 		break;
 	}
 
-	kfree_skb(skb);
+	if (free_skb)
+		kfree_skb(skb);
 }
 
 static void hidp_recv_intr_frame(struct hidp_session *session,
@@ -845,6 +947,7 @@ static int hidp_setup_hid(struct hidp_session *session,
 	hid->dev.parent = hidp_get_device(session);
 	hid->ll_driver = &hidp_hid_driver;
 
+	hid->hid_get_raw_report = hidp_get_raw_report;
 	hid->hid_output_raw_report = hidp_output_raw_report;
 
 	return 0;
@@ -897,6 +1000,8 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock,
 	skb_queue_head_init(&session->ctrl_transmit);
 	skb_queue_head_init(&session->intr_transmit);
 
+	mutex_init(&session->report_mutex);
+	init_waitqueue_head(&session->report_queue);
 	init_waitqueue_head(&session->startup_queue);
 	session->waiting_for_startup = 1;
 	session->flags   = req->flags & (1 << HIDP_BLUETOOTH_VENDOR_ID);
diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h
index 92e093e..13de5fa 100644
--- a/net/bluetooth/hidp/hidp.h
+++ b/net/bluetooth/hidp/hidp.h
@@ -80,6 +80,7 @@
 #define HIDP_VIRTUAL_CABLE_UNPLUG	0
 #define HIDP_BOOT_PROTOCOL_MODE		1
 #define HIDP_BLUETOOTH_VENDOR_ID	9
+#define	HIDP_WAITING_FOR_RETURN		10
 #define HIDP_WAITING_FOR_SEND_ACK	11
 
 struct hidp_connadd_req {
@@ -155,6 +156,13 @@ struct hidp_session {
 	struct sk_buff_head ctrl_transmit;
 	struct sk_buff_head intr_transmit;
 
+	/* Used in hidp_get_raw_report() */
+	int waiting_report_type; /* HIDP_DATA_RTYPE_* */
+	int waiting_report_number; /* -1 for not numbered */
+	struct mutex report_mutex;
+	struct sk_buff *report_return;
+	wait_queue_head_t report_queue;
+
 	/* Used in hidp_output_raw_report() */
 	int output_report_success; /* boolean */
 
-- 
1.7.0.4



^ permalink raw reply related

* [PATCH v5 0/4] Adding HID Feature Report Support to hidraw
From: Alan Ott @ 2011-01-18  8:04 UTC (permalink / raw)
  To: Jiri Kosina, Marcel Holtmann, Gustavo F. Padovan, David S. Miller,
	Alan Ott
  Cc: Alan Ott

This patch adds Feature Report support for USB and Bluetooth HID devices
through hidraw.

The first two patches prepare the bluetooth side for the change.
	a. Make sure the hidp_session() thread is started before
	   device's probe() functions are called.
	b. Wait for ACK/NAK on sent reports, and return proper
	   error codes.
The third patch is the hidraw core and USB changes.
The fourth patch is the Bluetooth changes.

Thanks to Antonio Ospite and Bill Good for providing testing and feedback.


Alan Ott (4):
  bt hidp: Move hid_add_device() call to after hidp_session() has
    started.
  bt hidp: Wait for ACK on Sent Reports
  HID: Add Support for Setting and Getting Feature Reports from hidraw
  Bluetooth hidp: Add support for hidraw HIDIOCGFEATURE and
    HIDIOCSFEATURE

 drivers/hid/hidraw.c          |  106 +++++++++++++++++++-
 drivers/hid/usbhid/hid-core.c |   35 +++++++
 include/linux/hid.h           |    3 +
 include/linux/hidraw.h        |    3 +
 net/bluetooth/hidp/core.c     |  214 ++++++++++++++++++++++++++++++++++++++---
 net/bluetooth/hidp/hidp.h     |   15 +++
 6 files changed, 355 insertions(+), 21 deletions(-)



^ permalink raw reply

* [PATCH v5 1/4] bt hidp: Move hid_add_device() call to after hidp_session() has started.
From: Alan Ott @ 2011-01-18  8:04 UTC (permalink / raw)
  To: Jiri Kosina, Marcel Holtmann, Gustavo F. Padovan, David S. Miller,
	Alan Ott
  Cc: Alan Ott
In-Reply-To: <1295337880-12452-1-git-send-email-alan@signal11.us>

Move the call to hid_add_device() (which calls a device's probe() function)
to after the kernel_thread() call which starts the hidp_session() thread.
This ensures the Bluetooth receive socket is fully running by the time a
device's probe() function is called. This way, a device can communicate
(send and receive) with the Bluetooth device from its probe() function.

Signed-off-by: Alan Ott <alan@signal11.us>
---
 net/bluetooth/hidp/core.c |   28 ++++++++++++++++++++--------
 net/bluetooth/hidp/hidp.h |    3 +++
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 29544c2..67cc4bc 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -563,6 +563,8 @@ static int hidp_session(void *arg)
 	init_waitqueue_entry(&intr_wait, current);
 	add_wait_queue(sk_sleep(ctrl_sk), &ctrl_wait);
 	add_wait_queue(sk_sleep(intr_sk), &intr_wait);
+	session->waiting_for_startup = 0;
+	wake_up_interruptible(&session->startup_queue);
 	while (!atomic_read(&session->terminate)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -754,6 +756,8 @@ static struct hid_ll_driver hidp_hid_driver = {
 	.hidinput_input_event = hidp_hidinput_event,
 };
 
+/* This function sets up the hid device. It does not add it
+   to the HID system. That is done in hidp_add_connection(). */
 static int hidp_setup_hid(struct hidp_session *session,
 				struct hidp_connadd_req *req)
 {
@@ -795,16 +799,8 @@ static int hidp_setup_hid(struct hidp_session *session,
 
 	hid->hid_output_raw_report = hidp_output_raw_report;
 
-	err = hid_add_device(hid);
-	if (err < 0)
-		goto failed;
-
 	return 0;
 
-failed:
-	hid_destroy_device(hid);
-	session->hid = NULL;
-
 fault:
 	kfree(session->rd_data);
 	session->rd_data = NULL;
@@ -853,6 +849,8 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock,
 	skb_queue_head_init(&session->ctrl_transmit);
 	skb_queue_head_init(&session->intr_transmit);
 
+	init_waitqueue_head(&session->startup_queue);
+	session->waiting_for_startup = 1;
 	session->flags   = req->flags & (1 << HIDP_BLUETOOTH_VENDOR_ID);
 	session->idle_to = req->idle_to;
 
@@ -875,6 +873,14 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock,
 	err = kernel_thread(hidp_session, session, CLONE_KERNEL);
 	if (err < 0)
 		goto unlink;
+	while (session->waiting_for_startup) {
+		wait_event_interruptible(session->startup_queue,
+			!session->waiting_for_startup);
+	}
+
+	err = hid_add_device(session->hid);
+	if (err < 0)
+		goto err_add_device;
 
 	if (session->input) {
 		hidp_send_ctrl_message(session,
@@ -888,6 +894,12 @@ int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock,
 	up_write(&hidp_session_sem);
 	return 0;
 
+err_add_device:
+	hid_destroy_device(session->hid);
+	session->hid = NULL;
+	atomic_inc(&session->terminate);
+	hidp_schedule(session);
+
 unlink:
 	hidp_del_timer(session);
 
diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h
index 8d934a1..2cc35dc 100644
--- a/net/bluetooth/hidp/hidp.h
+++ b/net/bluetooth/hidp/hidp.h
@@ -157,6 +157,9 @@ struct hidp_session {
 	/* Report descriptor */
 	__u8 *rd_data;
 	uint rd_size;
+
+	wait_queue_head_t startup_queue;
+	int waiting_for_startup;
 };
 
 static inline void hidp_schedule(struct hidp_session *session)
-- 
1.7.0.4



^ permalink raw reply related

* Re: rps testing questions
From: mi wake @ 2011-01-18  8:34 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev
In-Reply-To: <1295257994.3335.6.camel@edumazet-laptop>

2011/1/17 Eric Dumazet <eric.dumazet@gmail.com>:
> Le lundi 17 janvier 2011 à 17:43 +0800, mi wake a écrit :
>> I do a rps(Receive Packet Steering) testing on centos 5.5 with  kernel 2.6.37.
>> cpu: 8 core Intel.
>> ethernet adapter: bnx2x
>>
>> Problem statement:
>> enable rps with:
>> echo "ff" > /sys/class/net/eth2/queues/rx-0/rps_cpus.
>>
>
> bnx2x with one queue only ?
>
>> running 1 instances of netperf TCP_RR: netperf  -t TCP_RR -H 192.168.0.1 -c -C
>> without rps: 9963.48(Trans Rate per sec)
>> with rps:  9387.59(Trans Rate per sec)
>>
>> I do ab and tbench testing also find there is less tps with enable
>> rps.but,there is more cpu using when with enable rps.when with enable
>> rps ,softirqs is blanced  on cpus.
>
> Really ? that seems unlikely with your one flow test, unless you _also_
> have hardware IRQS hitting all your cpus. (That would be very bad)
>
>>
>> is there something wrong with my test?
>> --
>
> If you test with one flow, RPS brings nothing at all. Better handle the
> packet directly from the cpu handling the hardware IRQ (and NAPI)
>
> You better make sure hardware IRQ are on one cpu, instead of many cpus.
>
>
 I have checked that bnx2x with one queue only and hardware IRQ are on one cpu.
 I do testing again with more flows.when I use ip range from 192.x.x.1
to 192.x.x.200 to send syn packets.
Server can deal with :
 without rps + rfs : 18M/s
 with     rps +rfs : 21M/s.
Maybe  in previous tests,there are less flow. I will continue testing.
Thank you!

^ permalink raw reply

* Re: [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling
From: Jason Wang @ 2011-01-18  7:41 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel
In-Reply-To: <20110118043725.GB2233@redhat.com>

Michael S. Tsirkin writes:
 > On Tue, Jan 18, 2011 at 11:05:33AM +0800, Jason Wang wrote:
 > > Michael S. Tsirkin writes:
 > >  > On Mon, Jan 17, 2011 at 04:11:08PM +0800, Jason Wang wrote:
 > >  > > Codes duplication were found between the handling of mergeable and big
 > >  > > buffers, so this patch tries to unify them. This could be easily done
 > >  > > by adding a quota to the get_rx_bufs() which is used to limit the
 > >  > > number of buffers it returns (for mergeable buffer, the quota is
 > >  > > simply UIO_MAXIOV, for big buffers, the quota is just 1), and then the
 > >  > > previous handle_rx_mergeable() could be resued also for big buffers.
 > >  > > 
 > >  > > Signed-off-by: Jason Wang <jasowang@redhat.com>
 > >  > 
 > >  > We actually started this way. However the code turned out
 > >  > to have measureable overhead when handle_rx_mergeable
 > >  > is called with quota 1.
 > >  > So before applying this I'd like to see some data
 > >  > to show this is not the case anymore.
 > >  > 
 > > 
 > > I've run a round of test (Host->Guest) for these three patches on my desktop:
 > 
 > Yes but what if you only apply patch 3?
 > 

Result here, slightly better than without it but worse than applying all the three
patches except for big buffer size at 2048 but the differnece could be neglected.

mergeable buffers:
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.161 (10.66.91.161) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384     64    60.00       584.91   69.41    26.10    38.882  7.310  
 87380  16384    256    60.00      1194.05   72.81    34.82    19.980  4.778  
 87380  16384    512    60.00      1503.93   74.23    36.86    16.174  4.016  
 87380  16384   1024    60.00      2004.57   73.53    37.59    12.019  3.073  
 87380  16384   2048    60.00      3445.96   73.76    38.88    7.014   1.849  

big buffers:
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.129 (10.66.91.129) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384     64    60.00       646.95   72.25    27.05    36.595  6.850  
 87380  16384    256    60.00      1258.61   74.88    33.01    19.495  4.297  
 87380  16384    512    60.00      1655.95   74.14    33.96    14.671  3.360  
 87380  16384   1024    60.00      3220.32   74.24    33.31    7.555   1.695  
 87380  16384   2048    60.00      4516.40   73.88    42.10    5.360   1.527  

 > > Without these patches
 > > 
 > > mergeable buffers:
 > > TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.42 (10.66.91.42) port 0 AF_INET
 > > Recv   Send    Send                          Utilization       Service Demand
 > > Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
 > > Size   Size    Size     Time     Throughput  local    remote   local   remote
 > > bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
 > > 
 > >  87380  16384     64    60.00       575.87   69.20    26.36    39.375  7.499  
 > >  87380  16384    256    60.01      1123.57   73.16    34.73    21.335  5.064  
 > >  87380  16384    512    60.00      1351.12   75.26    35.80    18.251  4.341  
 > >  87380  16384   1024    60.00      1955.31   74.73    37.67    12.523  3.156  
 > >  87380  16384   2048    60.01      3411.92   74.82    39.49    7.186   1.896  
 > > 
 > > bug buffers:
 > > Netperf test results
 > > TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.109 (10.66.91.109) port 0 AF_INET
 > > Recv   Send    Send                          Utilization       Service Demand
 > > Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
 > > Size   Size    Size     Time     Throughput  local    remote   local   remote
 > > bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
 > > 
 > >  87380  16384     64    60.00       567.10   72.06    26.13    41.638  7.550  
 > >  87380  16384    256    60.00      1143.69   74.66    32.58    21.392  4.667  
 > >  87380  16384    512    60.00      1460.92   73.94    33.40    16.585  3.746  
 > >  87380  16384   1024    60.00      3454.85   77.49    33.89    7.349   1.607  
 > >  87380  16384   2048    60.00      3781.11   76.51    38.38    6.631   1.663  
 > > 
 > > With these patches:
 > > 
 > > mergeable buffers:
 > > TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.236 (10.66.91.236) port 0 AF_INET
 > > Recv   Send    Send                          Utilization       Service Demand
 > > Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
 > > Size   Size    Size     Time     Throughput  local    remote   local   remote
 > > bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
 > > 
 > >  87380  16384     64    60.00       657.53   71.27    26.42    35.517  6.583  
 > >  87380  16384    256    60.00      1217.73   74.34    34.67    20.004  4.665  
 > >  87380  16384    512    60.00      1575.25   75.27    37.12    15.658  3.861  
 > >  87380  16384   1024    60.00      2416.07   74.77    37.20    10.140  2.522  
 > >  87380  16384   2048    60.00      3702.29   77.31    36.29    6.842   1.606  
 > > 
 > > big buffers:
 > > Netperf test results
 > > TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.202 (10.66.91.202) port 0 AF_INET
 > > Recv   Send    Send                          Utilization       Service Demand
 > > Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
 > > Size   Size    Size     Time     Throughput  local    remote   local   remote
 > > bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
 > > 
 > >  87380  16384     64    60.00       647.67   71.86    27.26    36.356  6.895  
 > >  87380  16384    256    60.00      1265.82   76.19    36.54    19.724  4.729  
 > >  87380  16384    512    60.00      1796.64   76.06    39.48    13.872  3.601  
 > >  87380  16384   1024    60.00      4008.37   77.05    36.47    6.299   1.491  
 > >  87380  16384   2048    60.00      4468.56   75.18    41.79    5.513   1.532 
 > > 
 > > Looks like the unification does not hurt the performance, and with those patches
 > > we can get some improvement. BTW, the regression of mergeable buffer still
 > > exist.
 > > 
 > >  > > ---
 > >  > >  drivers/vhost/net.c |  128 +++------------------------------------------------
 > >  > >  1 files changed, 7 insertions(+), 121 deletions(-)
 > >  > > 
 > >  > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 > >  > > index 95e49de..c32a2e4 100644
 > >  > > --- a/drivers/vhost/net.c
 > >  > > +++ b/drivers/vhost/net.c
 > >  > > @@ -227,6 +227,7 @@ static int peek_head_len(struct sock *sk)
 > >  > >   * @iovcount	- returned count of io vectors we fill
 > >  > >   * @log		- vhost log
 > >  > >   * @log_num	- log offset
 > >  > > + * @quota       - headcount quota, 1 for big buffer
 > >  > >   *	returns number of buffer heads allocated, negative on error
 > >  > >   */
 > >  > >  static int get_rx_bufs(struct vhost_virtqueue *vq,
 > >  > > @@ -234,7 +235,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 > >  > >  		       int datalen,
 > >  > >  		       unsigned *iovcount,
 > >  > >  		       struct vhost_log *log,
 > >  > > -		       unsigned *log_num)
 > >  > > +		       unsigned *log_num,
 > >  > > +		       unsigned int quota)
 > >  > >  {
 > >  > >  	unsigned int out, in;
 > >  > >  	int seg = 0;
 > >  > > @@ -242,7 +244,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 > >  > >  	unsigned d;
 > >  > >  	int r, nlogs = 0;
 > >  > >  
 > >  > > -	while (datalen > 0) {
 > >  > > +	while (datalen > 0 && headcount < quota) {
 > >  > >  		if (unlikely(seg >= UIO_MAXIOV)) {
 > >  > >  			r = -ENOBUFS;
 > >  > >  			goto err;
 > >  > > @@ -282,116 +284,7 @@ err:
 > >  > >  
 > >  > >  /* Expects to be always run from workqueue - which acts as
 > >  > >   * read-size critical section for our kind of RCU. */
 > >  > > -static void handle_rx_big(struct vhost_net *net)
 > >  > > -{
 > >  > > -	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 > >  > > -	unsigned out, in, log, s;
 > >  > > -	int head;
 > >  > > -	struct vhost_log *vq_log;
 > >  > > -	struct msghdr msg = {
 > >  > > -		.msg_name = NULL,
 > >  > > -		.msg_namelen = 0,
 > >  > > -		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
 > >  > > -		.msg_controllen = 0,
 > >  > > -		.msg_iov = vq->iov,
 > >  > > -		.msg_flags = MSG_DONTWAIT,
 > >  > > -	};
 > >  > > -
 > >  > > -	struct virtio_net_hdr hdr = {
 > >  > > -		.flags = 0,
 > >  > > -		.gso_type = VIRTIO_NET_HDR_GSO_NONE
 > >  > > -	};
 > >  > > -
 > >  > > -	size_t len, total_len = 0;
 > >  > > -	int err;
 > >  > > -	size_t hdr_size;
 > >  > > -	struct socket *sock = rcu_dereference(vq->private_data);
 > >  > > -	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
 > >  > > -		return;
 > >  > > -
 > >  > > -	mutex_lock(&vq->mutex);
 > >  > > -	vhost_disable_notify(vq);
 > >  > > -	hdr_size = vq->vhost_hlen;
 > >  > > -
 > >  > > -	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 > >  > > -		vq->log : NULL;
 > >  > > -
 > >  > > -	for (;;) {
 > >  > > -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 > >  > > -					 ARRAY_SIZE(vq->iov),
 > >  > > -					 &out, &in,
 > >  > > -					 vq_log, &log);
 > >  > > -		/* On error, stop handling until the next kick. */
 > >  > > -		if (unlikely(head < 0))
 > >  > > -			break;
 > >  > > -		/* OK, now we need to know about added descriptors. */
 > >  > > -		if (head == vq->num) {
 > >  > > -			if (unlikely(vhost_enable_notify(vq))) {
 > >  > > -				/* They have slipped one in as we were
 > >  > > -				 * doing that: check again. */
 > >  > > -				vhost_disable_notify(vq);
 > >  > > -				continue;
 > >  > > -			}
 > >  > > -			/* Nothing new?  Wait for eventfd to tell us
 > >  > > -			 * they refilled. */
 > >  > > -			break;
 > >  > > -		}
 > >  > > -		/* We don't need to be notified again. */
 > >  > > -		if (out) {
 > >  > > -			vq_err(vq, "Unexpected descriptor format for RX: "
 > >  > > -			       "out %d, int %d\n",
 > >  > > -			       out, in);
 > >  > > -			break;
 > >  > > -		}
 > >  > > -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
 > >  > > -		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
 > >  > > -		msg.msg_iovlen = in;
 > >  > > -		len = iov_length(vq->iov, in);
 > >  > > -		/* Sanity check */
 > >  > > -		if (!len) {
 > >  > > -			vq_err(vq, "Unexpected header len for RX: "
 > >  > > -			       "%zd expected %zd\n",
 > >  > > -			       iov_length(vq->hdr, s), hdr_size);
 > >  > > -			break;
 > >  > > -		}
 > >  > > -		err = sock->ops->recvmsg(NULL, sock, &msg,
 > >  > > -					 len, MSG_DONTWAIT | MSG_TRUNC);
 > >  > > -		/* TODO: Check specific error and bomb out unless EAGAIN? */
 > >  > > -		if (err < 0) {
 > >  > > -			vhost_discard_vq_desc(vq, 1);
 > >  > > -			break;
 > >  > > -		}
 > >  > > -		/* TODO: Should check and handle checksum. */
 > >  > > -		if (err > len) {
 > >  > > -			pr_debug("Discarded truncated rx packet: "
 > >  > > -				 " len %d > %zd\n", err, len);
 > >  > > -			vhost_discard_vq_desc(vq, 1);
 > >  > > -			continue;
 > >  > > -		}
 > >  > > -		len = err;
 > >  > > -		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
 > >  > > -		if (err) {
 > >  > > -			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
 > >  > > -			       vq->iov->iov_base, err);
 > >  > > -			break;
 > >  > > -		}
 > >  > > -		len += hdr_size;
 > >  > > -		vhost_add_used_and_signal(&net->dev, vq, head, len);
 > >  > > -		if (unlikely(vq_log))
 > >  > > -			vhost_log_write(vq, vq_log, log, len);
 > >  > > -		total_len += len;
 > >  > > -		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 > >  > > -			vhost_poll_queue(&vq->poll);
 > >  > > -			break;
 > >  > > -		}
 > >  > > -	}
 > >  > > -
 > >  > > -	mutex_unlock(&vq->mutex);
 > >  > > -}
 > >  > > -
 > >  > > -/* Expects to be always run from workqueue - which acts as
 > >  > > - * read-size critical section for our kind of RCU. */
 > >  > > -static void handle_rx_mergeable(struct vhost_net *net)
 > >  > > +static void handle_rx(struct vhost_net *net)
 > >  > >  {
 > >  > >  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 > >  > >  	unsigned uninitialized_var(in), log;
 > >  > > @@ -431,7 +324,8 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  > >  		sock_len += sock_hlen;
 > >  > >  		vhost_len = sock_len + vhost_hlen;
 > >  > >  		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
 > >  > > -					&in, vq_log, &log);
 > >  > > +					&in, vq_log, &log,
 > >  > > +					likely(mergeable) ? UIO_MAXIOV : 1);
 > >  > >  		/* On error, stop handling until the next kick. */
 > >  > >  		if (unlikely(headcount < 0))
 > >  > >  			break;
 > >  > > @@ -497,14 +391,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  > >  	mutex_unlock(&vq->mutex);
 > >  > >  }
 > >  > >  
 > >  > > -static void handle_rx(struct vhost_net *net)
 > >  > > -{
 > >  > > -	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
 > >  > > -		handle_rx_mergeable(net);
 > >  > > -	else
 > >  > > -		handle_rx_big(net);
 > >  > > -}
 > >  > > -
 > >  > >  static void handle_tx_kick(struct vhost_work *work)
 > >  > >  {
 > >  > >  	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,

^ permalink raw reply

* Re: [PATCH] net offloading: Do not mask out NETIF_F_HW_VLAN_TX for vlan.
From: Jesse Gross @ 2011-01-18  7:12 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev, Michael Chan
In-Reply-To: <1295333714.3362.561.camel@edumazet-laptop>

On Tue, Jan 18, 2011 at 1:55 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le lundi 17 janvier 2011 à 22:46 -0800, Jesse Gross a écrit :
>> In netif_skb_features() we return only the features that are valid for vlans
>> if we have a vlan packet.  However, we should not mask out NETIF_F_HW_VLAN_TX
>> since it enables transmission of vlan tags and is obviously valid.
>>
>> Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
>> Signed-off-by: Jesse Gross <jesse@nicira.com>
>
> Thanks Jesse
>
> Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
>
> Now back to the "ethtool -K eth0 txvlan off" problem on bnx2
>
> Is it a driver/software problem or hardware/firmware one ?

CC'ing Michael Chan

It looks like bnx2 is storing the offsets of various headers so the
hardware can find them for TSO.  The parsing logic doesn't do anything
for vlan tags, so the hardware gets confused if one is present in the
packet itself.

Quick fix is to simply disallow disabling TX vlan offload or disable
TSO at the same time or some other Ethtool game.  However, if the
hardware supports it then it would be nicer to fix up the TSO setup
logic.  Maybe we can just add the size of the vlan tag to the offset
but I am not certain.  Michael, do you know if this is possible?

^ permalink raw reply

* Re: [PATCH] net offloading: Do not mask out NETIF_F_HW_VLAN_TX for vlan.
From: Eric Dumazet @ 2011-01-18  6:55 UTC (permalink / raw)
  To: Jesse Gross; +Cc: David Miller, netdev
In-Reply-To: <1295333160-1667-1-git-send-email-jesse@nicira.com>

Le lundi 17 janvier 2011 à 22:46 -0800, Jesse Gross a écrit :
> In netif_skb_features() we return only the features that are valid for vlans
> if we have a vlan packet.  However, we should not mask out NETIF_F_HW_VLAN_TX
> since it enables transmission of vlan tags and is obviously valid.
> 
> Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
> Signed-off-by: Jesse Gross <jesse@nicira.com>

Thanks Jesse

Acked-by: Eric Dumazet <eric.dumazet@gmail.com>

Now back to the "ethtool -K eth0 txvlan off" problem on bnx2

Is it a driver/software problem or hardware/firmware one ?




^ permalink raw reply

* [PATCH] net offloading: Do not mask out NETIF_F_HW_VLAN_TX for vlan.
From: Jesse Gross @ 2011-01-18  6:46 UTC (permalink / raw)
  To: David Miller; +Cc: Eric Dumazet, netdev

In netif_skb_features() we return only the features that are valid for vlans
if we have a vlan packet.  However, we should not mask out NETIF_F_HW_VLAN_TX
since it enables transmission of vlan tags and is obviously valid.

Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 net/core/dev.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 83507c2..4c58d11 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2023,13 +2023,13 @@ int netif_skb_features(struct sk_buff *skb)
 		return harmonize_features(skb, protocol, features);
 	}
 
-	features &= skb->dev->vlan_features;
+	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
 
 	if (protocol != htons(ETH_P_8021Q)) {
 		return harmonize_features(skb, protocol, features);
 	} else {
 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
-				NETIF_F_GEN_CSUM;
+				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
 		return harmonize_features(skb, protocol, features);
 	}
 }
-- 
1.7.1


^ permalink raw reply related

* Re: [BUG] bnx2 + vlan + TSO : doesnt work
From: Eric Dumazet @ 2011-01-18  6:38 UTC (permalink / raw)
  To: Jesse Gross; +Cc: David Miller, bhutchings, netdev
In-Reply-To: <AANLkTi=zvsmnemO+9OggD01i==3mZnb_-09CjAcRS94w@mail.gmail.com>

Le mardi 18 janvier 2011 à 01:32 -0500, Jesse Gross a écrit :
> On Tue, Jan 18, 2011 at 1:21 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > Le lundi 17 janvier 2011 à 22:09 -0800, David Miller a écrit :
> >> From: Jesse Gross <jesse@nicira.com>
> >> Date: Mon, 17 Jan 2011 16:13:18 -0800
> >>
> >> > I think it is better for netif_skb_features() to actually return the
> >> > correct features rather than bypass it here.  NETIF_F_HW_VLAN_TX
> >> > doesn't depend on any other offloads, so we can always include it if
> >> > it is in dev->features.
> >> >
> >> > Separately, this means there is a problem with bnx2 because it allows
> >> > vlan insertion to be turned off, which would have the same effect.
> >> > Maybe it is looking directly at skb->protocol or similar for TSO.
> >>
> >> Please, someone cons up an acceptable fix fast.
> >>
> >
> > I just woke up, and honestly dont understand why only bnx2 is affected
> > by this problem of masking NETIF_F_HW_VLAN_TX
> 
> If NETIF_F_HW_VLAN_TX is masked then the tag will be inserted in
> software, which is generally OK.  The problem is that some drivers
> assume that if they can do vlan tagging in hardware it will always be
> used and therefore don't expect vlan tags when setting up TSO, etc.
> 

OK, but then this driver gave us the hint core network was actually
always doing software vlan, tagging ;)

> >
> > And I dont understand all this netif_skb_features() stuff : if we want
> > to actually test dev->features & NETIF_F_HW_VLAN_TX, and this flag
> > doesnt depend on other offloads, why are we doing features &
> > vlan_features.
> 
> The idea is to put all of the logic in one place rather than having
> pieces that are really interdependent scattered around in the
> different offloads.  So we could test dev->features directly for vlans
> but I would rather just have netif_skb_features() return the right
> values to start off with.
> 
> >
> > Jesse, I dont understand why you say "bnx2 allows vlan insertion to be
> > turned off". Really.
> 
> You can disable it using Ethtool, which will turn off
> NETIF_F_HW_VLAN_TX the same as this bug.
> 
> I'm running a quick test on a patch to always allow NETIF_F_HW_VLAN_TX
> to be returned from netif_skb_features().

Thanks




^ permalink raw reply

* Re: [BUG] bnx2 + vlan + TSO : doesnt work
From: Jesse Gross @ 2011-01-18  6:32 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, bhutchings, netdev
In-Reply-To: <1295331690.3362.522.camel@edumazet-laptop>

On Tue, Jan 18, 2011 at 1:21 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Le lundi 17 janvier 2011 à 22:09 -0800, David Miller a écrit :
>> From: Jesse Gross <jesse@nicira.com>
>> Date: Mon, 17 Jan 2011 16:13:18 -0800
>>
>> > I think it is better for netif_skb_features() to actually return the
>> > correct features rather than bypass it here.  NETIF_F_HW_VLAN_TX
>> > doesn't depend on any other offloads, so we can always include it if
>> > it is in dev->features.
>> >
>> > Separately, this means there is a problem with bnx2 because it allows
>> > vlan insertion to be turned off, which would have the same effect.
>> > Maybe it is looking directly at skb->protocol or similar for TSO.
>>
>> Please, someone cons up an acceptable fix fast.
>>
>
> I just woke up, and honestly dont understand why only bnx2 is affected
> by this problem of masking NETIF_F_HW_VLAN_TX

If NETIF_F_HW_VLAN_TX is masked then the tag will be inserted in
software, which is generally OK.  The problem is that some drivers
assume that if they can do vlan tagging in hardware it will always be
used and therefore don't expect vlan tags when setting up TSO, etc.

>
> And I dont understand all this netif_skb_features() stuff : if we want
> to actually test dev->features & NETIF_F_HW_VLAN_TX, and this flag
> doesnt depend on other offloads, why are we doing features &
> vlan_features.

The idea is to put all of the logic in one place rather than having
pieces that are really interdependent scattered around in the
different offloads.  So we could test dev->features directly for vlans
but I would rather just have netif_skb_features() return the right
values to start off with.

>
> Jesse, I dont understand why you say "bnx2 allows vlan insertion to be
> turned off". Really.

You can disable it using Ethtool, which will turn off
NETIF_F_HW_VLAN_TX the same as this bug.

I'm running a quick test on a patch to always allow NETIF_F_HW_VLAN_TX
to be returned from netif_skb_features().

^ permalink raw reply

* Re: [BUG] bnx2 + vlan + TSO : doesnt work
From: Eric Dumazet @ 2011-01-18  6:21 UTC (permalink / raw)
  To: David Miller; +Cc: jesse, bhutchings, netdev
In-Reply-To: <20110117.220927.189715926.davem@davemloft.net>

Le lundi 17 janvier 2011 à 22:09 -0800, David Miller a écrit :
> From: Jesse Gross <jesse@nicira.com>
> Date: Mon, 17 Jan 2011 16:13:18 -0800
> 
> > I think it is better for netif_skb_features() to actually return the
> > correct features rather than bypass it here.  NETIF_F_HW_VLAN_TX
> > doesn't depend on any other offloads, so we can always include it if
> > it is in dev->features.
> > 
> > Separately, this means there is a problem with bnx2 because it allows
> > vlan insertion to be turned off, which would have the same effect.
> > Maybe it is looking directly at skb->protocol or similar for TSO.
> 
> Please, someone cons up an acceptable fix fast.
> 

I just woke up, and honestly dont understand why only bnx2 is affected
by this problem of masking NETIF_F_HW_VLAN_TX

And I dont understand all this netif_skb_features() stuff : if we want
to actually test dev->features & NETIF_F_HW_VLAN_TX, and this flag
doesnt depend on other offloads, why are we doing features &
vlan_features.

Jesse, I dont understand why you say "bnx2 allows vlan insertion to be
turned off". Really.




^ permalink raw reply

* Re: [BUG] bnx2 + vlan + TSO : doesnt work
From: David Miller @ 2011-01-18  6:09 UTC (permalink / raw)
  To: jesse; +Cc: bhutchings, eric.dumazet, netdev
In-Reply-To: <AANLkTincFt4fKLvtity=MgQ2+cuvNFrBHqu3+bMLsypc@mail.gmail.com>

From: Jesse Gross <jesse@nicira.com>
Date: Mon, 17 Jan 2011 16:13:18 -0800

> I think it is better for netif_skb_features() to actually return the
> correct features rather than bypass it here.  NETIF_F_HW_VLAN_TX
> doesn't depend on any other offloads, so we can always include it if
> it is in dev->features.
> 
> Separately, this means there is a problem with bnx2 because it allows
> vlan insertion to be turned off, which would have the same effect.
> Maybe it is looking directly at skb->protocol or similar for TSO.

Please, someone cons up an acceptable fix fast.

Thanks.

^ permalink raw reply

* Re: [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling
From: Michael S. Tsirkin @ 2011-01-18  4:37 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, kvm, linux-kernel
In-Reply-To: <19765.893.776528.869640@gargle.gargle.HOWL>

On Tue, Jan 18, 2011 at 11:05:33AM +0800, Jason Wang wrote:
> Michael S. Tsirkin writes:
>  > On Mon, Jan 17, 2011 at 04:11:08PM +0800, Jason Wang wrote:
>  > > Codes duplication were found between the handling of mergeable and big
>  > > buffers, so this patch tries to unify them. This could be easily done
>  > > by adding a quota to the get_rx_bufs() which is used to limit the
>  > > number of buffers it returns (for mergeable buffer, the quota is
>  > > simply UIO_MAXIOV, for big buffers, the quota is just 1), and then the
>  > > previous handle_rx_mergeable() could be resued also for big buffers.
>  > > 
>  > > Signed-off-by: Jason Wang <jasowang@redhat.com>
>  > 
>  > We actually started this way. However the code turned out
>  > to have measureable overhead when handle_rx_mergeable
>  > is called with quota 1.
>  > So before applying this I'd like to see some data
>  > to show this is not the case anymore.
>  > 
> 
> I've run a round of test (Host->Guest) for these three patches on my desktop:

Yes but what if you only apply patch 3?

> Without these patches
> 
> mergeable buffers:
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.42 (10.66.91.42) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
> Size   Size    Size     Time     Throughput  local    remote   local   remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
> 
>  87380  16384     64    60.00       575.87   69.20    26.36    39.375  7.499  
>  87380  16384    256    60.01      1123.57   73.16    34.73    21.335  5.064  
>  87380  16384    512    60.00      1351.12   75.26    35.80    18.251  4.341  
>  87380  16384   1024    60.00      1955.31   74.73    37.67    12.523  3.156  
>  87380  16384   2048    60.01      3411.92   74.82    39.49    7.186   1.896  
> 
> bug buffers:
> Netperf test results
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.109 (10.66.91.109) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
> Size   Size    Size     Time     Throughput  local    remote   local   remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
> 
>  87380  16384     64    60.00       567.10   72.06    26.13    41.638  7.550  
>  87380  16384    256    60.00      1143.69   74.66    32.58    21.392  4.667  
>  87380  16384    512    60.00      1460.92   73.94    33.40    16.585  3.746  
>  87380  16384   1024    60.00      3454.85   77.49    33.89    7.349   1.607  
>  87380  16384   2048    60.00      3781.11   76.51    38.38    6.631   1.663  
> 
> With these patches:
> 
> mergeable buffers:
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.236 (10.66.91.236) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
> Size   Size    Size     Time     Throughput  local    remote   local   remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
> 
>  87380  16384     64    60.00       657.53   71.27    26.42    35.517  6.583  
>  87380  16384    256    60.00      1217.73   74.34    34.67    20.004  4.665  
>  87380  16384    512    60.00      1575.25   75.27    37.12    15.658  3.861  
>  87380  16384   1024    60.00      2416.07   74.77    37.20    10.140  2.522  
>  87380  16384   2048    60.00      3702.29   77.31    36.29    6.842   1.606  
> 
> big buffers:
> Netperf test results
> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.202 (10.66.91.202) port 0 AF_INET
> Recv   Send    Send                          Utilization       Service Demand
> Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
> Size   Size    Size     Time     Throughput  local    remote   local   remote
> bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB
> 
>  87380  16384     64    60.00       647.67   71.86    27.26    36.356  6.895  
>  87380  16384    256    60.00      1265.82   76.19    36.54    19.724  4.729  
>  87380  16384    512    60.00      1796.64   76.06    39.48    13.872  3.601  
>  87380  16384   1024    60.00      4008.37   77.05    36.47    6.299   1.491  
>  87380  16384   2048    60.00      4468.56   75.18    41.79    5.513   1.532 
> 
> Looks like the unification does not hurt the performance, and with those patches
> we can get some improvement. BTW, the regression of mergeable buffer still
> exist.
> 
>  > > ---
>  > >  drivers/vhost/net.c |  128 +++------------------------------------------------
>  > >  1 files changed, 7 insertions(+), 121 deletions(-)
>  > > 
>  > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>  > > index 95e49de..c32a2e4 100644
>  > > --- a/drivers/vhost/net.c
>  > > +++ b/drivers/vhost/net.c
>  > > @@ -227,6 +227,7 @@ static int peek_head_len(struct sock *sk)
>  > >   * @iovcount	- returned count of io vectors we fill
>  > >   * @log		- vhost log
>  > >   * @log_num	- log offset
>  > > + * @quota       - headcount quota, 1 for big buffer
>  > >   *	returns number of buffer heads allocated, negative on error
>  > >   */
>  > >  static int get_rx_bufs(struct vhost_virtqueue *vq,
>  > > @@ -234,7 +235,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>  > >  		       int datalen,
>  > >  		       unsigned *iovcount,
>  > >  		       struct vhost_log *log,
>  > > -		       unsigned *log_num)
>  > > +		       unsigned *log_num,
>  > > +		       unsigned int quota)
>  > >  {
>  > >  	unsigned int out, in;
>  > >  	int seg = 0;
>  > > @@ -242,7 +244,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
>  > >  	unsigned d;
>  > >  	int r, nlogs = 0;
>  > >  
>  > > -	while (datalen > 0) {
>  > > +	while (datalen > 0 && headcount < quota) {
>  > >  		if (unlikely(seg >= UIO_MAXIOV)) {
>  > >  			r = -ENOBUFS;
>  > >  			goto err;
>  > > @@ -282,116 +284,7 @@ err:
>  > >  
>  > >  /* Expects to be always run from workqueue - which acts as
>  > >   * read-size critical section for our kind of RCU. */
>  > > -static void handle_rx_big(struct vhost_net *net)
>  > > -{
>  > > -	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
>  > > -	unsigned out, in, log, s;
>  > > -	int head;
>  > > -	struct vhost_log *vq_log;
>  > > -	struct msghdr msg = {
>  > > -		.msg_name = NULL,
>  > > -		.msg_namelen = 0,
>  > > -		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
>  > > -		.msg_controllen = 0,
>  > > -		.msg_iov = vq->iov,
>  > > -		.msg_flags = MSG_DONTWAIT,
>  > > -	};
>  > > -
>  > > -	struct virtio_net_hdr hdr = {
>  > > -		.flags = 0,
>  > > -		.gso_type = VIRTIO_NET_HDR_GSO_NONE
>  > > -	};
>  > > -
>  > > -	size_t len, total_len = 0;
>  > > -	int err;
>  > > -	size_t hdr_size;
>  > > -	struct socket *sock = rcu_dereference(vq->private_data);
>  > > -	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
>  > > -		return;
>  > > -
>  > > -	mutex_lock(&vq->mutex);
>  > > -	vhost_disable_notify(vq);
>  > > -	hdr_size = vq->vhost_hlen;
>  > > -
>  > > -	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
>  > > -		vq->log : NULL;
>  > > -
>  > > -	for (;;) {
>  > > -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
>  > > -					 ARRAY_SIZE(vq->iov),
>  > > -					 &out, &in,
>  > > -					 vq_log, &log);
>  > > -		/* On error, stop handling until the next kick. */
>  > > -		if (unlikely(head < 0))
>  > > -			break;
>  > > -		/* OK, now we need to know about added descriptors. */
>  > > -		if (head == vq->num) {
>  > > -			if (unlikely(vhost_enable_notify(vq))) {
>  > > -				/* They have slipped one in as we were
>  > > -				 * doing that: check again. */
>  > > -				vhost_disable_notify(vq);
>  > > -				continue;
>  > > -			}
>  > > -			/* Nothing new?  Wait for eventfd to tell us
>  > > -			 * they refilled. */
>  > > -			break;
>  > > -		}
>  > > -		/* We don't need to be notified again. */
>  > > -		if (out) {
>  > > -			vq_err(vq, "Unexpected descriptor format for RX: "
>  > > -			       "out %d, int %d\n",
>  > > -			       out, in);
>  > > -			break;
>  > > -		}
>  > > -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
>  > > -		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
>  > > -		msg.msg_iovlen = in;
>  > > -		len = iov_length(vq->iov, in);
>  > > -		/* Sanity check */
>  > > -		if (!len) {
>  > > -			vq_err(vq, "Unexpected header len for RX: "
>  > > -			       "%zd expected %zd\n",
>  > > -			       iov_length(vq->hdr, s), hdr_size);
>  > > -			break;
>  > > -		}
>  > > -		err = sock->ops->recvmsg(NULL, sock, &msg,
>  > > -					 len, MSG_DONTWAIT | MSG_TRUNC);
>  > > -		/* TODO: Check specific error and bomb out unless EAGAIN? */
>  > > -		if (err < 0) {
>  > > -			vhost_discard_vq_desc(vq, 1);
>  > > -			break;
>  > > -		}
>  > > -		/* TODO: Should check and handle checksum. */
>  > > -		if (err > len) {
>  > > -			pr_debug("Discarded truncated rx packet: "
>  > > -				 " len %d > %zd\n", err, len);
>  > > -			vhost_discard_vq_desc(vq, 1);
>  > > -			continue;
>  > > -		}
>  > > -		len = err;
>  > > -		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
>  > > -		if (err) {
>  > > -			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
>  > > -			       vq->iov->iov_base, err);
>  > > -			break;
>  > > -		}
>  > > -		len += hdr_size;
>  > > -		vhost_add_used_and_signal(&net->dev, vq, head, len);
>  > > -		if (unlikely(vq_log))
>  > > -			vhost_log_write(vq, vq_log, log, len);
>  > > -		total_len += len;
>  > > -		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
>  > > -			vhost_poll_queue(&vq->poll);
>  > > -			break;
>  > > -		}
>  > > -	}
>  > > -
>  > > -	mutex_unlock(&vq->mutex);
>  > > -}
>  > > -
>  > > -/* Expects to be always run from workqueue - which acts as
>  > > - * read-size critical section for our kind of RCU. */
>  > > -static void handle_rx_mergeable(struct vhost_net *net)
>  > > +static void handle_rx(struct vhost_net *net)
>  > >  {
>  > >  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
>  > >  	unsigned uninitialized_var(in), log;
>  > > @@ -431,7 +324,8 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  > >  		sock_len += sock_hlen;
>  > >  		vhost_len = sock_len + vhost_hlen;
>  > >  		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
>  > > -					&in, vq_log, &log);
>  > > +					&in, vq_log, &log,
>  > > +					likely(mergeable) ? UIO_MAXIOV : 1);
>  > >  		/* On error, stop handling until the next kick. */
>  > >  		if (unlikely(headcount < 0))
>  > >  			break;
>  > > @@ -497,14 +391,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  > >  	mutex_unlock(&vq->mutex);
>  > >  }
>  > >  
>  > > -static void handle_rx(struct vhost_net *net)
>  > > -{
>  > > -	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
>  > > -		handle_rx_mergeable(net);
>  > > -	else
>  > > -		handle_rx_big(net);
>  > > -}
>  > > -
>  > >  static void handle_tx_kick(struct vhost_work *work)
>  > >  {
>  > >  	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,

^ permalink raw reply

* Re: [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop
From: Michael S. Tsirkin @ 2011-01-18  4:36 UTC (permalink / raw)
  To: Jason Wang; +Cc: virtualization, netdev, kvm, linux-kernel
In-Reply-To: <19765.5737.352179.50100@gargle.gargle.HOWL>

On Tue, Jan 18, 2011 at 12:26:17PM +0800, Jason Wang wrote:
> Michael S. Tsirkin writes:
>  > On Mon, Jan 17, 2011 at 04:10:59PM +0800, Jason Wang wrote:
>  > > No need to check the support of mergeable buffer inside the recevie
>  > > loop as the whole handle_rx()_xx is in the read critical region.  So
>  > > this patch move it ahead of the receiving loop.
>  > > 
>  > > Signed-off-by: Jason Wang <jasowang@redhat.com>
>  > 
>  > Well feature check is mostly just features & bit
>  > so why would it be slower? Because of the rcu
>  > adding memory barriers? Do you observe a
>  > measureable speedup with this patch?
>  > 
> 
> I do not measure the performance for just this patch, maybe not obvious. And it
> can also help the code unification.
> 
>  > Apropos, I noticed that the check in vhost_has_feature
>  > is wrong: it uses the same kind of RCU as the
>  > private pointer. So we'll have to fix that properly
>  > by adding more lockdep classes, but for now
>  > we'll need to make
>  > the check 1 || lockdep_is_held(&dev->mutex);
>  > and add a TODO.
>  > 
> 
> Not sure, lockdep_is_head(&dev->mutex) maybe not accurate but sufficient, as it
> was always held in the read critical region.

Not really, when we call vhost_has_feature from the vq handling thread
it's not.

>  > > ---
>  > >  drivers/vhost/net.c |    5 +++--
>  > >  1 files changed, 3 insertions(+), 2 deletions(-)
>  > > 
>  > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
>  > > index 14fc189..95e49de 100644
>  > > --- a/drivers/vhost/net.c
>  > > +++ b/drivers/vhost/net.c
>  > > @@ -411,7 +411,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  > >  	};
>  > >  
>  > >  	size_t total_len = 0;
>  > > -	int err, headcount;
>  > > +	int err, headcount, mergeable;
>  > >  	size_t vhost_hlen, sock_hlen;
>  > >  	size_t vhost_len, sock_len;
>  > >  	struct socket *sock = rcu_dereference(vq->private_data);
>  > > @@ -425,6 +425,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  > >  
>  > >  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
>  > >  		vq->log : NULL;
>  > > +	mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);
>  > >  
>  > >  	while ((sock_len = peek_head_len(sock->sk))) {
>  > >  		sock_len += sock_hlen;
>  > > @@ -474,7 +475,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
>  > >  			break;
>  > >  		}
>  > >  		/* TODO: Should check and handle checksum. */
>  > > -		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
>  > > +		if (likely(mergeable) &&
>  > >  		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
>  > >  				      offsetof(typeof(hdr), num_buffers),
>  > >  				      sizeof hdr.num_buffers)) {

^ permalink raw reply

* Re: [PATCH 1/3] vhost-net: check the support of mergeable buffer outside the receive loop
From: Jason Wang @ 2011-01-18  4:26 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel
In-Reply-To: <20110117084644.GB23479@redhat.com>

Michael S. Tsirkin writes:
 > On Mon, Jan 17, 2011 at 04:10:59PM +0800, Jason Wang wrote:
 > > No need to check the support of mergeable buffer inside the recevie
 > > loop as the whole handle_rx()_xx is in the read critical region.  So
 > > this patch move it ahead of the receiving loop.
 > > 
 > > Signed-off-by: Jason Wang <jasowang@redhat.com>
 > 
 > Well feature check is mostly just features & bit
 > so why would it be slower? Because of the rcu
 > adding memory barriers? Do you observe a
 > measureable speedup with this patch?
 > 

I do not measure the performance for just this patch, maybe not obvious. And it
can also help the code unification.

 > Apropos, I noticed that the check in vhost_has_feature
 > is wrong: it uses the same kind of RCU as the
 > private pointer. So we'll have to fix that properly
 > by adding more lockdep classes, but for now
 > we'll need to make
 > the check 1 || lockdep_is_held(&dev->mutex);
 > and add a TODO.
 > 

Not sure, lockdep_is_head(&dev->mutex) maybe not accurate but sufficient, as it
was always held in the read critical region.

 > > ---
 > >  drivers/vhost/net.c |    5 +++--
 > >  1 files changed, 3 insertions(+), 2 deletions(-)
 > > 
 > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 > > index 14fc189..95e49de 100644
 > > --- a/drivers/vhost/net.c
 > > +++ b/drivers/vhost/net.c
 > > @@ -411,7 +411,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  	};
 > >  
 > >  	size_t total_len = 0;
 > > -	int err, headcount;
 > > +	int err, headcount, mergeable;
 > >  	size_t vhost_hlen, sock_hlen;
 > >  	size_t vhost_len, sock_len;
 > >  	struct socket *sock = rcu_dereference(vq->private_data);
 > > @@ -425,6 +425,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  
 > >  	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 > >  		vq->log : NULL;
 > > +	mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);
 > >  
 > >  	while ((sock_len = peek_head_len(sock->sk))) {
 > >  		sock_len += sock_hlen;
 > > @@ -474,7 +475,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  			break;
 > >  		}
 > >  		/* TODO: Should check and handle checksum. */
 > > -		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
 > > +		if (likely(mergeable) &&
 > >  		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
 > >  				      offsetof(typeof(hdr), num_buffers),
 > >  				      sizeof hdr.num_buffers)) {

^ permalink raw reply

* Re: [PATCH] bonding: added 802.3ad round-robin hashing policy for single TCP session balancing
From: John Fastabend @ 2011-01-18  3:16 UTC (permalink / raw)
  To: Jay Vosburgh; +Cc: Oleg V. Ukhno, netdev@vger.kernel.org, David S. Miller
In-Reply-To: <26330.1295049912@death>

On 1/14/2011 4:05 PM, Jay Vosburgh wrote:
> Oleg V. Ukhno <olegu@yandex-team.ru> wrote:
>> Jay Vosburgh wrote:
>>
>>> 	This is a violation of the 802.3ad (now 802.1ax) standard, 5.2.1
>>> (f), which requires that all frames of a given "conversation" are passed
>>> to a single port.
>>>
>>> 	The existing layer3+4 hash has a similar problem (that it may
>>> send packets from a conversation to multiple ports), but for that case
>>> it's an unlikely exception (only in the case of IP fragmentation), but
>>> here it's the norm.  At a minimum, this must be clearly documented.
>>>
>>> 	Also, what does a round robin in 802.3ad provide that the
>>> existing round robin does not?  My presumption is that you're looking to
>>> get the aggregator autoconfiguration that 802.3ad provides, but you
>>> don't say.
> 
> 	I'm still curious about this question.  Given the rather
> intricate setup of your particular network (described below), I'm not
> sure why 802.3ad is of benefit over traditional etherchannel
> (balance-rr / balance-xor).
> 
>>> 	I don't necessarily think this is a bad cheat (round robining on
>>> 802.3ad as an explicit non-standard extension), since everybody wants to
>>> stripe their traffic across multiple slaves.  I've given some thought to
>>> making round robin into just another hash mode, but this also does some
>>> magic to the MAC addresses of the outgoing frames (more on that below).
>> Yes, I am resetting MAC addresses when transmitting packets to have switch
>> to put packets into different ports of the receiving etherchannel.
> 
> 	By "etherchannel" do you really mean "Cisco switch with a
> port-channel group using LACP"?
> 
>> I am using this patch to provide full-mesh ISCSI connectivity between at
>> least 4 hosts (all hosts of course are in same ethernet segment) and every
>> host is connected with aggregate link with 4 slaves(usually).
>> Using round-robin I provide near-equal load striping when transmitting,
>> using MAC address magic I force switch to stripe packets over all slave
>> links in destination port-channel(when number of rx-ing slaves is equal to
>> number ot tx-ing slaves and is even).
> 
> 	By "MAC address magic" do you mean that you're assigning
> specifically chosen MAC addresses to the slaves so that the switch's
> hash is essentially "assigning" the bonding slaves to particular ports
> on the outgoing port-channel group?
> 
> 	Assuming that this is the case, it's an interesting idea, but
> I'm unconvinced that it's better on 802.3ad vs. balance-rr.  Unless I'm
> missing something, you can get everything you need from an option to
> have balance-rr / balance-xor utilize the slave's permanent address as
> the source address for outgoing traffic.
> 
>> [...] So I am able to utilize all slaves
>> for tx and for rx up to maximum capacity; besides I am getting L2 link
>> failure detection (and load rebalancing), which is (in my opinion) much
>> faster and robust than L3 or than dm-multipath provides.
>> It's my idea with the patch
> 
> 	Can somebody (John?) more knowledgable than I about dm-multipath
> comment on the above?

Here I'll give it a go.

I don't think detecting L2 link failure this way is very robust. If there
is a failure farther away then your immediate link your going to break
completely? Your bonding hash will continue to round robin the iscsi
packets and half them will get dropped on the floor. dm-multipath handles
this reasonably gracefully. Also in this bonding environment you seem to
be very sensitive to RTT times on the network. Maybe not bad out right but
I wouldn't consider this robust either.

You could tweak your scsi timeout values and fail_fast values, set the io
retry to 0 to cause the fail over to occur faster. I suspect you already
did this and still it is too slow? Maybe adding a checker in multipathd to
listen for link events would be fast enough. The checker could then fail
the path immediately.

I'll try to address your comments from the other thread here. In general I
wonder if it would be better to solve the problems in dm-multipath rather than
add another bonding mode?

OVU - it is slow(I am using ISCSI for Oracle , so I need to minimize latency)

The dm-multipath layer is adding latency? How much? If this is really true
maybe its best to the address the real issue here and not avoid it by
using the bonding layer.

OVU - it handles any link failures bad, because of it's command queue 
limitation(all queued commands above 32 are discarded in case of path 
failure, as I remember)

Maybe true but only link failures with the immediate peer are handled
with a bonding strategy. By working at the block layer we can detect
failures throughout the path. I would need to look into this again I
know when we were looking at this sometime ago there was some talk about
improving this behavior. I need to take some time to go back through the
error recovery stuff to remember how this works.

OVU - it performs very bad when there are many devices and maтy paths(I was 
unable to utilize more that 2Gbps of 4 even with 100 disks with 4 paths 
per each disk)

Hmm well that seems like something is broken. I'll try this setup when
I get some time next few days. This really shouldn't be the case dm-multipath
should not add a bunch of extra latency or effect throughput significantly.
By the way what are you seeing without mpio?

Thanks,
John

^ permalink raw reply

* Re: [PATCH v2] net: add Faraday FTMAC100 10/100 Ethernet driver
From: Po-Yu Chuang @ 2011-01-18  3:08 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, linux-kernel, ratbert, bhutchings, joe, dilinger
In-Reply-To: <1295288462.3335.55.camel@edumazet-laptop>

Dear Eric,

On Tue, Jan 18, 2011 at 2:21 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> BTW, shouldnt you use cpu_to_be32() or cpu_to_le32(), if this driver is
> multi platform ?

This reminds me another thing. Should I use u32 instead of unsigned int for all
hardware related variables (registers, descriptors) ?
Not quite sure about these cross-platform issues.

best regards,
Po-Yu Chuang

^ permalink raw reply

* Re: [PATCH 2/3] vhost-net: Unify the code of mergeable and big buffer handling
From: Jason Wang @ 2011-01-18  3:05 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Jason Wang, virtualization, netdev, kvm, linux-kernel
In-Reply-To: <20110117083644.GA23479@redhat.com>

Michael S. Tsirkin writes:
 > On Mon, Jan 17, 2011 at 04:11:08PM +0800, Jason Wang wrote:
 > > Codes duplication were found between the handling of mergeable and big
 > > buffers, so this patch tries to unify them. This could be easily done
 > > by adding a quota to the get_rx_bufs() which is used to limit the
 > > number of buffers it returns (for mergeable buffer, the quota is
 > > simply UIO_MAXIOV, for big buffers, the quota is just 1), and then the
 > > previous handle_rx_mergeable() could be resued also for big buffers.
 > > 
 > > Signed-off-by: Jason Wang <jasowang@redhat.com>
 > 
 > We actually started this way. However the code turned out
 > to have measureable overhead when handle_rx_mergeable
 > is called with quota 1.
 > So before applying this I'd like to see some data
 > to show this is not the case anymore.
 > 

I've run a round of test (Host->Guest) for these three patches on my desktop:

Without these patches

mergeable buffers:
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.42 (10.66.91.42) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384     64    60.00       575.87   69.20    26.36    39.375  7.499  
 87380  16384    256    60.01      1123.57   73.16    34.73    21.335  5.064  
 87380  16384    512    60.00      1351.12   75.26    35.80    18.251  4.341  
 87380  16384   1024    60.00      1955.31   74.73    37.67    12.523  3.156  
 87380  16384   2048    60.01      3411.92   74.82    39.49    7.186   1.896  

bug buffers:
Netperf test results
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.109 (10.66.91.109) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384     64    60.00       567.10   72.06    26.13    41.638  7.550  
 87380  16384    256    60.00      1143.69   74.66    32.58    21.392  4.667  
 87380  16384    512    60.00      1460.92   73.94    33.40    16.585  3.746  
 87380  16384   1024    60.00      3454.85   77.49    33.89    7.349   1.607  
 87380  16384   2048    60.00      3781.11   76.51    38.38    6.631   1.663  

With these patches:

mergeable buffers:
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.236 (10.66.91.236) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384     64    60.00       657.53   71.27    26.42    35.517  6.583  
 87380  16384    256    60.00      1217.73   74.34    34.67    20.004  4.665  
 87380  16384    512    60.00      1575.25   75.27    37.12    15.658  3.861  
 87380  16384   1024    60.00      2416.07   74.77    37.20    10.140  2.522  
 87380  16384   2048    60.00      3702.29   77.31    36.29    6.842   1.606  

big buffers:
Netperf test results
TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 10.66.91.202 (10.66.91.202) port 0 AF_INET
Recv   Send    Send                          Utilization       Service Demand
Socket Socket  Message  Elapsed              Send     Recv     Send    Recv
Size   Size    Size     Time     Throughput  local    remote   local   remote
bytes  bytes   bytes    secs.    10^6bits/s  % S      % S      us/KB   us/KB

 87380  16384     64    60.00       647.67   71.86    27.26    36.356  6.895  
 87380  16384    256    60.00      1265.82   76.19    36.54    19.724  4.729  
 87380  16384    512    60.00      1796.64   76.06    39.48    13.872  3.601  
 87380  16384   1024    60.00      4008.37   77.05    36.47    6.299   1.491  
 87380  16384   2048    60.00      4468.56   75.18    41.79    5.513   1.532 

Looks like the unification does not hurt the performance, and with those patches
we can get some improvement. BTW, the regression of mergeable buffer still
exist.

 > > ---
 > >  drivers/vhost/net.c |  128 +++------------------------------------------------
 > >  1 files changed, 7 insertions(+), 121 deletions(-)
 > > 
 > > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
 > > index 95e49de..c32a2e4 100644
 > > --- a/drivers/vhost/net.c
 > > +++ b/drivers/vhost/net.c
 > > @@ -227,6 +227,7 @@ static int peek_head_len(struct sock *sk)
 > >   * @iovcount	- returned count of io vectors we fill
 > >   * @log		- vhost log
 > >   * @log_num	- log offset
 > > + * @quota       - headcount quota, 1 for big buffer
 > >   *	returns number of buffer heads allocated, negative on error
 > >   */
 > >  static int get_rx_bufs(struct vhost_virtqueue *vq,
 > > @@ -234,7 +235,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 > >  		       int datalen,
 > >  		       unsigned *iovcount,
 > >  		       struct vhost_log *log,
 > > -		       unsigned *log_num)
 > > +		       unsigned *log_num,
 > > +		       unsigned int quota)
 > >  {
 > >  	unsigned int out, in;
 > >  	int seg = 0;
 > > @@ -242,7 +244,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 > >  	unsigned d;
 > >  	int r, nlogs = 0;
 > >  
 > > -	while (datalen > 0) {
 > > +	while (datalen > 0 && headcount < quota) {
 > >  		if (unlikely(seg >= UIO_MAXIOV)) {
 > >  			r = -ENOBUFS;
 > >  			goto err;
 > > @@ -282,116 +284,7 @@ err:
 > >  
 > >  /* Expects to be always run from workqueue - which acts as
 > >   * read-size critical section for our kind of RCU. */
 > > -static void handle_rx_big(struct vhost_net *net)
 > > -{
 > > -	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 > > -	unsigned out, in, log, s;
 > > -	int head;
 > > -	struct vhost_log *vq_log;
 > > -	struct msghdr msg = {
 > > -		.msg_name = NULL,
 > > -		.msg_namelen = 0,
 > > -		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
 > > -		.msg_controllen = 0,
 > > -		.msg_iov = vq->iov,
 > > -		.msg_flags = MSG_DONTWAIT,
 > > -	};
 > > -
 > > -	struct virtio_net_hdr hdr = {
 > > -		.flags = 0,
 > > -		.gso_type = VIRTIO_NET_HDR_GSO_NONE
 > > -	};
 > > -
 > > -	size_t len, total_len = 0;
 > > -	int err;
 > > -	size_t hdr_size;
 > > -	struct socket *sock = rcu_dereference(vq->private_data);
 > > -	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
 > > -		return;
 > > -
 > > -	mutex_lock(&vq->mutex);
 > > -	vhost_disable_notify(vq);
 > > -	hdr_size = vq->vhost_hlen;
 > > -
 > > -	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 > > -		vq->log : NULL;
 > > -
 > > -	for (;;) {
 > > -		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
 > > -					 ARRAY_SIZE(vq->iov),
 > > -					 &out, &in,
 > > -					 vq_log, &log);
 > > -		/* On error, stop handling until the next kick. */
 > > -		if (unlikely(head < 0))
 > > -			break;
 > > -		/* OK, now we need to know about added descriptors. */
 > > -		if (head == vq->num) {
 > > -			if (unlikely(vhost_enable_notify(vq))) {
 > > -				/* They have slipped one in as we were
 > > -				 * doing that: check again. */
 > > -				vhost_disable_notify(vq);
 > > -				continue;
 > > -			}
 > > -			/* Nothing new?  Wait for eventfd to tell us
 > > -			 * they refilled. */
 > > -			break;
 > > -		}
 > > -		/* We don't need to be notified again. */
 > > -		if (out) {
 > > -			vq_err(vq, "Unexpected descriptor format for RX: "
 > > -			       "out %d, int %d\n",
 > > -			       out, in);
 > > -			break;
 > > -		}
 > > -		/* Skip header. TODO: support TSO/mergeable rx buffers. */
 > > -		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
 > > -		msg.msg_iovlen = in;
 > > -		len = iov_length(vq->iov, in);
 > > -		/* Sanity check */
 > > -		if (!len) {
 > > -			vq_err(vq, "Unexpected header len for RX: "
 > > -			       "%zd expected %zd\n",
 > > -			       iov_length(vq->hdr, s), hdr_size);
 > > -			break;
 > > -		}
 > > -		err = sock->ops->recvmsg(NULL, sock, &msg,
 > > -					 len, MSG_DONTWAIT | MSG_TRUNC);
 > > -		/* TODO: Check specific error and bomb out unless EAGAIN? */
 > > -		if (err < 0) {
 > > -			vhost_discard_vq_desc(vq, 1);
 > > -			break;
 > > -		}
 > > -		/* TODO: Should check and handle checksum. */
 > > -		if (err > len) {
 > > -			pr_debug("Discarded truncated rx packet: "
 > > -				 " len %d > %zd\n", err, len);
 > > -			vhost_discard_vq_desc(vq, 1);
 > > -			continue;
 > > -		}
 > > -		len = err;
 > > -		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
 > > -		if (err) {
 > > -			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
 > > -			       vq->iov->iov_base, err);
 > > -			break;
 > > -		}
 > > -		len += hdr_size;
 > > -		vhost_add_used_and_signal(&net->dev, vq, head, len);
 > > -		if (unlikely(vq_log))
 > > -			vhost_log_write(vq, vq_log, log, len);
 > > -		total_len += len;
 > > -		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
 > > -			vhost_poll_queue(&vq->poll);
 > > -			break;
 > > -		}
 > > -	}
 > > -
 > > -	mutex_unlock(&vq->mutex);
 > > -}
 > > -
 > > -/* Expects to be always run from workqueue - which acts as
 > > - * read-size critical section for our kind of RCU. */
 > > -static void handle_rx_mergeable(struct vhost_net *net)
 > > +static void handle_rx(struct vhost_net *net)
 > >  {
 > >  	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 > >  	unsigned uninitialized_var(in), log;
 > > @@ -431,7 +324,8 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  		sock_len += sock_hlen;
 > >  		vhost_len = sock_len + vhost_hlen;
 > >  		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
 > > -					&in, vq_log, &log);
 > > +					&in, vq_log, &log,
 > > +					likely(mergeable) ? UIO_MAXIOV : 1);
 > >  		/* On error, stop handling until the next kick. */
 > >  		if (unlikely(headcount < 0))
 > >  			break;
 > > @@ -497,14 +391,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
 > >  	mutex_unlock(&vq->mutex);
 > >  }
 > >  
 > > -static void handle_rx(struct vhost_net *net)
 > > -{
 > > -	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
 > > -		handle_rx_mergeable(net);
 > > -	else
 > > -		handle_rx_big(net);
 > > -}
 > > -
 > >  static void handle_tx_kick(struct vhost_work *work)
 > >  {
 > >  	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,

^ permalink raw reply

* Re: [PATCH] ethtool : Add option -L | --set-common to set common flags.
From: Ben Hutchings @ 2011-01-18  2:59 UTC (permalink / raw)
  To: Mahesh Bandewar; +Cc: David Miller, Tom Herbert, Laurent Chavey, netdev
In-Reply-To: <AANLkTiko=12CLgG_MMPshfwSRt7mZA+3DW+GRtxhaxh=@mail.gmail.com>

On Mon, 2011-01-17 at 18:17 -0800, Mahesh Bandewar wrote:
> On Fri, Jan 14, 2011 at 1:19 PM, Ben Hutchings
> <bhutchings@solarflare.com> wrote:
> > On Thu, 2011-01-13 at 16:11 -0800, Mahesh Bandewar wrote:
[...]
> >> +static int do_scommon(int fd, struct ifreq *ifr)
> >> +{
> >> +     struct ethtool_value eval;
> >> +
> >> +     if (common_flags_mask) {
> >> +             eval.cmd = ETHTOOL_GFLAGS;
> >> +             eval.data = 0;
> >> +             ifr->ifr_data = (caddr_t)&eval;
> >> +             if (ioctl(fd, SIOCETHTOOL, ifr)) {
> >> +                     perror("Cannot get device common flags");
> >> +                     return 1;
> >> +             }
> >> +
> >> +             eval.cmd = ETHTOOL_SFLAGS;
> >> +             eval.data =
> >> +                 ((eval.data & ~(common_flags_mask | off_flags_mask)) |
> >> +                  (common_flags_wanted | off_flags_wanted));
> >
> > Why should this use off_flags_mask and off_flags_wanted?  They should
> > both be 0 if this function is called.
> >
> That is right! Actually the get (ETHTOOL_GFLAGS) operation confused
> me. I thought the values are fetched and *preserved* while setting the
> new value. But when looked at it carefully, that is not the case.
> Actually why that ioctl() with ETHTOOL_GFLAGS required?
[...]

This is a read-modify-write operation.  We have to:

1. Parse the options to find out which flags are to be changed
(common_flags_mask) and the wanted values (common_flags_wanted).
2. Read the current flags (ETHTOOL_GFLAGS reads them into eval.data).
3. Modify the flags (eval.data = ...).
4. Write the new flags (ETHTOOL_SFLAGS).

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH] ethtool : Add option -L | --set-common to set common flags.
From: Mahesh Bandewar @ 2011-01-18  2:17 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: David Miller, Tom Herbert, Laurent Chavey, netdev
In-Reply-To: <1295039984.5386.19.camel@bwh-desktop>

On Fri, Jan 14, 2011 at 1:19 PM, Ben Hutchings
<bhutchings@solarflare.com> wrote:
> On Thu, 2011-01-13 at 16:11 -0800, Mahesh Bandewar wrote:
>> This patch adds -L | --set-common option to add / remove common flags which
>> includes loopback flag. The -l | --show-common displays the current values
>> for these common flags.
>>
>> Signed-off-by: Mahesh Bandewar <maheshb@google.com>
>> ---
>>  ethtool-copy.h |    1 +
>>  ethtool.8      |   16 ++++++++++
>>  ethtool.c      |   90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  3 files changed, 105 insertions(+), 2 deletions(-)
>>
>> diff --git a/ethtool-copy.h b/ethtool-copy.h
>> index 75c3ae7..5fd18c7 100644
>> --- a/ethtool-copy.h
>> +++ b/ethtool-copy.h
>> @@ -309,6 +309,7 @@ struct ethtool_perm_addr {
>>   * flag differs from the read-only value.
>>   */
>>  enum ethtool_flags {
>> +     ETH_FLAG_LOOPBACK       = (1 << 2),     /* Loopback enable / disable */
>>       ETH_FLAG_TXVLAN         = (1 << 7),     /* TX VLAN offload enabled */
>>       ETH_FLAG_RXVLAN         = (1 << 8),     /* RX VLAN offload enabled */
>>       ETH_FLAG_LRO            = (1 << 15),    /* LRO is enabled */
>> diff --git a/ethtool.8 b/ethtool.8
>> index 1760924..cf7128f 100644
>> --- a/ethtool.8
>> +++ b/ethtool.8
>> @@ -174,6 +174,13 @@ ethtool \- Display or change ethernet card settings
>>  .B2 txvlan on off
>>  .B2 rxhash on off
>>
>> +.B ethtool \-l|\-\-show\-common
>> +.I ethX
>> +
>> +.B ethtool \-L|\-\-set\-common
>> +.I ethX
>> +.B2 loopback on off
>> +
>>  .B ethtool \-p|\-\-identify
>>  .I ethX
>>  .RI [ N ]
>> @@ -406,6 +413,15 @@ Specifies whether TX VLAN acceleration should be enabled
>>  .A2 rxhash on off
>>  Specifies whether receive hashing offload should be enabled
>>  .TP
>> +.B \-l \-\-show\-common
>> +Queries the specified ethernet device for common flag settings.
>> +.TP
>> +.B \-L \-\-set\-common
>> +Changes the common parameters of the specified ethernet device.
>> +.TP
>> +.A2 loopback on off
>> +Specifies whether loopback should be enabled.
>> +.TP
>
> I've just gone through the manual page and changed 'ethernet device' to
> 'network device' for all generic operations; please follow that.  The
> source for the manual page was also renamed to ethtool.8.in as it now
> goes through autoconf substitution.
>
OK. I'll make those changes.

>>  .B \-p \-\-identify
>>  Initiates adapter-specific action intended to enable an operator to
>>  easily identify the adapter by sight.  Typically this involves
>> diff --git a/ethtool.c b/ethtool.c
>> index 63e0ead..1a0c10c 100644
>> --- a/ethtool.c
>> +++ b/ethtool.c
> [...]
>> @@ -1905,6 +1932,13 @@ static int dump_offload(int rx, int tx, int sg, int tso, int ufo, int gso,
>>       return 0;
>>  }
>>
>> +static int dump_common_flags(int loopback)
>> +{
>> +     fprintf(stdout, "loopback: %s\n", loopback ? "on" : "off");
>> +
>> +     return 0;
>> +}
>> +
>>  static int dump_rxfhash(int fhash, u64 val)
>>  {
>>       switch (fhash) {
> [...]
>> @@ -2219,6 +2257,53 @@ static int do_scoalesce(int fd, struct ifreq *ifr)
>>       return 0;
>>  }
>>
>> +static int do_gcommon(int fd, struct ifreq *ifr)
>> +{
>> +     struct ethtool_value eval;
>> +     int loopback = 0;
>> +
>> +     fprintf(stdout, "Common flags for %s:\n", devname);
>> +
>> +     eval.cmd = ETHTOOL_GFLAGS;
>> +     ifr->ifr_data = (caddr_t)&eval;
>> +     if (ioctl(fd, SIOCETHTOOL, ifr)) {
>> +             perror("Cannot get device flags");
>> +     } else {
>> +             loopback = (eval.data & ETH_FLAG_LOOPBACK) != 0;
>> +     }
>> +
>> +     return dump_common_flags(loopback);
>
> Breaking up a bitmask into a list of flag parameters is fairly
> pointless.  I realise do_goffload() and dump_offload() do that but I am
> just waiting for Michał Mirosław's changes to offload flags to be
> settled before I fix them.
>
>> +}
>> +
>> +static int do_scommon(int fd, struct ifreq *ifr)
>> +{
>> +     struct ethtool_value eval;
>> +
>> +     if (common_flags_mask) {
>> +             eval.cmd = ETHTOOL_GFLAGS;
>> +             eval.data = 0;
>> +             ifr->ifr_data = (caddr_t)&eval;
>> +             if (ioctl(fd, SIOCETHTOOL, ifr)) {
>> +                     perror("Cannot get device common flags");
>> +                     return 1;
>> +             }
>> +
>> +             eval.cmd = ETHTOOL_SFLAGS;
>> +             eval.data =
>> +                 ((eval.data & ~(common_flags_mask | off_flags_mask)) |
>> +                  (common_flags_wanted | off_flags_wanted));
>
> Why should this use off_flags_mask and off_flags_wanted?  They should
> both be 0 if this function is called.
>
That is right! Actually the get (ETHTOOL_GFLAGS) operation confused
me. I thought the values are fetched and *preserved* while setting the
new value. But when looked at it carefully, that is not the case.
Actually why that ioctl() with ETHTOOL_GFLAGS required?

>> +             if (ioctl(fd, SIOCETHTOOL, ifr)) {
>> +                     perror("Cannot set device common flags");
>> +                     return 1;
>> +             }
>> +     } else {
>> +             fprintf(stdout, "No common settings changed\n");
>> +     }
>> +
>> +     return 0;
>> +}
>> +
>>  static int do_goffload(int fd, struct ifreq *ifr)
>>  {
>>       struct ethtool_value eval;
>> @@ -2407,8 +2492,9 @@ static int do_soffload(int fd, struct ifreq *ifr)
>>               }
>>
>>               eval.cmd = ETHTOOL_SFLAGS;
>> -             eval.data = ((eval.data & ~off_flags_mask) |
>> -                          off_flags_wanted);
>> +             eval.data =
>> +                 ((eval.data & ~(off_flags_mask | common_flags_mask)) |
>> +                  (off_flags_wanted | common_flags_wanted));
>
> Similarly, why should this use common_flags_mask and
> common_flags_wanted?

For the same (wrong) reason mentioned above. I'll correct it and post
the new patch.

Thanks,
--mahesh..
>
> Ben.
>
>>
>>               err = ioctl(fd, SIOCETHTOOL, ifr);
>>               if (err) {
>
> --
> Ben Hutchings, Senior Software Engineer, Solarflare Communications
> Not speaking for my employer; that's the marketing department's job.
> They asked us to note that Solarflare product names are trademarked.
>
>

^ permalink raw reply

* Re: Realtek r8168C / r8169 driver VLAN TAG stripping
From: Francois Romieu @ 2011-01-18  1:21 UTC (permalink / raw)
  To: Anand Raj Manickam; +Cc: netdev, Hayes
In-Reply-To: <AANLkTikoXfgHrJyMbe6A_HmvXT_QRo55w76st5Wo0hSv@mail.gmail.com>

Anand Raj Manickam <anandrm@gmail.com> :
> On Mon, Jan 17, 2011 at 11:52 AM, Anand Raj Manickam <anandrm@gmail.com> wrote:
[...]
> > This is the dmesg  for XID
> >
> > eth0: RTL8168c/8111c at 0xf9628000, 00:17:54:00:f6:62, XID 1c4000c0 IRQ 31
> > r8169: mac_version = 0x16

I do not have one of those (RTL_GIGA_MAC_VER_22) to check if it handles vlan
correctly yet.

> > r8169 Gigabit Ethernet driver 2.3LK-NAPI loaded
> >
> > Unfortunately , i m not able to upgrade my kernel now . If there is a
> > Fix for it , that would be great !!

I doubt there is a lot of glamour/fortune/fame in backporting the 89 r8169
patches between v2.6.30 and v2.6.37 but you may help yourself and give it
a try.

-- 
Ueimor

^ permalink raw reply

* [PATCH] scm: provide full privilege set via SCM_PRIVILEGE
From: Casey Schaufler @ 2011-01-18  0:34 UTC (permalink / raw)
  To: netdev; +Cc: Casey Schaufler


Subject: [PATCH] scm: provide full privilege set via SCM_PRIVILEGE

The SCM mechanism currently provides interfaces for delivering
the uid/gid and the "security context" (LSM information) of the
peer on a UDS socket. All of the security credential information
is available, but there is no interface available to obtain it.
Further, the existing interfaces require that the user chose
between the uid/gid and the context as the existing interfaces
are exclusive.

This patch introduces an additional interface that provides
a complete set of security information from the peer credential.
No additional work is required to provide the information
internally, it is all being passed, just not exposed.

Also sent to LKML and LSM lists. 

Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
---

 include/asm-generic/socket.h |    1 +
 include/linux/net.h          |    1 +
 include/linux/socket.h       |    1 +
 include/net/scm.h            |   80 +++++++++++++++++++++++++++++++++++++++++-
 net/core/sock.c              |   11 ++++++
 5 files changed, 93 insertions(+), 1 deletions(-)
diff --git a/include/asm-generic/socket.h b/include/asm-generic/socket.h
index 9a6115e..7aa8e84 100644
--- a/include/asm-generic/socket.h
+++ b/include/asm-generic/socket.h
@@ -64,4 +64,5 @@
 #define SO_DOMAIN		39
 
 #define SO_RXQ_OVFL             40
+#define SO_PASSPRIV		41
 #endif /* __ASM_GENERIC_SOCKET_H */
diff --git a/include/linux/net.h b/include/linux/net.h
index 16faa13..159a929 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -71,6 +71,7 @@ struct net;
 #define SOCK_NOSPACE		2
 #define SOCK_PASSCRED		3
 #define SOCK_PASSSEC		4
+#define SOCK_PASSPRIV		5
 
 #ifndef ARCH_HAS_SOCKET_TYPES
 /**
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 86b652f..e9cfd68 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -147,6 +147,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr
 #define	SCM_RIGHTS	0x01		/* rw: access rights (array of int) */
 #define SCM_CREDENTIALS 0x02		/* rw: struct ucred		*/
 #define SCM_SECURITY	0x03		/* rw: security label		*/
+#define SCM_PRIVILEGES  0x04		/* rw: privilege set		*/
 
 struct ucred {
 	__u32	pid;
diff --git a/include/net/scm.h b/include/net/scm.h
index 3165650..4b8db21 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -101,6 +101,83 @@ static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct sc
 { }
 #endif /* CONFIG_SECURITY_NETWORK */
 
+static __inline__ void scm_passpriv(struct socket *sock, struct msghdr *msg,
+				struct scm_cookie *scm)
+{
+	const struct cred *credp = scm->cred;
+	const struct group_info *gip;
+	char *result;
+	char *cp;
+	int i;
+#ifdef CONFIG_SECURITY_NETWORK
+	char *secdata;
+	u32 seclen;
+	int err;
+#endif /* CONFIG_SECURITY_NETWORK */
+
+	if (!test_bit(SOCK_PASSPRIV, &sock->flags))
+		return;
+
+	gip = credp->group_info;
+
+	/*
+	 * uid + euid + gid + egid + group-list + capabilities
+	 *     + "uid=" + "euid=" + "gid=" + "egid=" + "grps="
+	 *     + "cap-e=" + "cap-p=" + "cap-i="
+	 * 10  + 10   + 10  + 10   + (ngrps * 10) + ecap + pcap + icap
+	 *     + 4 + 5 + 4 + 5 + 5 + 6 + 6 + 6
+	 */
+	i = ((4 + gip->ngroups) * 11) + (3 * (_KERNEL_CAPABILITY_U32S * 8 + 1))
+		+ 41;
+
+#ifdef CONFIG_SECURITY_NETWORK
+	err = security_secid_to_secctx(scm->secid, &secdata, &seclen);
+	if (!err)
+		/*
+		 * " context="
+		 */
+		i += seclen + 10;
+#endif /* CONFIG_SECURITY_NETWORK */
+
+	result = kzalloc(i, GFP_KERNEL);
+	if (result == NULL)
+		return;
+
+	cp = result + sprintf(result, "euid=%d uid=%d egid=%d gid=%d",
+				credp->euid, credp->uid,
+				credp->egid, credp->gid);
+
+	if (gip != NULL && gip->ngroups > 0) {
+		cp += sprintf(cp, " grps=%d", GROUP_AT(gip, 0));
+		for (i = 1 ; i < gip->ngroups; i++)
+			cp += sprintf(cp, ",%d", GROUP_AT(gip, i));
+	}
+
+	cp += sprintf(cp, " cap-e=");
+	CAP_FOR_EACH_U32(i)
+		cp += sprintf(cp, "%08x", credp->cap_effective.cap[i]);
+	cp += sprintf(cp, " cap-p=");
+	CAP_FOR_EACH_U32(i)
+		cp += sprintf(cp, "%08x", credp->cap_permitted.cap[i]);
+	cp += sprintf(cp, " cap-i=");
+	CAP_FOR_EACH_U32(i)
+		cp += sprintf(cp, "%08x", credp->cap_inheritable.cap[i]);
+
+#ifdef CONFIG_SECURITY_NETWORK
+	cp += sprintf(cp, " context=");
+	strncpy(cp, secdata, seclen);
+	cp += seclen;
+	*cp = '\0';
+
+	security_release_secctx(secdata, seclen);
+#endif /* CONFIG_SECURITY_NETWORK */
+
+	put_cmsg(msg, SOL_SOCKET, SCM_PRIVILEGES, strlen(result)+1, result);
+
+	kfree(result);
+}
+
+
 static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
 				struct scm_cookie *scm, int flags)
 {
@@ -114,6 +191,8 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
 	if (test_bit(SOCK_PASSCRED, &sock->flags))
 		put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(scm->creds), &scm->creds);
 
+	scm_passpriv(sock, msg, scm);
+
 	scm_destroy_cred(scm);
 
 	scm_passec(sock, msg, scm);
@@ -124,6 +203,5 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
 	scm_detach_fds(msg, scm);
 }
 
-
 #endif /* __LINUX_NET_SCM_H */
 
diff --git a/net/core/sock.c b/net/core/sock.c
index fb60801..f134126 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -725,6 +725,13 @@ set_rcvbuf:
 		else
 			clear_bit(SOCK_PASSSEC, &sock->flags);
 		break;
+
+	case SO_PASSPRIV:
+		if (valbool)
+			set_bit(SOCK_PASSPRIV, &sock->flags);
+		else
+			clear_bit(SOCK_PASSPRIV, &sock->flags);
+		break;
 	case SO_MARK:
 		if (!capable(CAP_NET_ADMIN))
 			ret = -EPERM;
@@ -950,6 +957,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
 		break;
 
+	case SO_PASSPRIV:
+		v.val = test_bit(SOCK_PASSPRIV, &sock->flags) ? 1 : 0;
+		break;
+
 	case SO_PEERSEC:
 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
 




^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox